source: icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp @ 6135

Last change on this file since 6135 was 6135, checked in by xwa163, 6 months ago
  1. Implement twist_kernel and untwist_kernel by PEXT and PDEP
  2. Use twist form for multiplexing lz4 grep
File size: 12.1 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "s2p_kernel.h"
7#include <kernels/kernel_builder.h>
8#include <pablo/pabloAST.h>
9#include <pablo/builder.hpp>
10#include <pablo/pe_pack.h>
11
12#include <llvm/Support/raw_ostream.h>
13
14using namespace llvm;
15
16namespace kernel {
17
18const int PACK_LANES = 2;
19void s2p_step(const std::unique_ptr<KernelBuilder> & iBuilder, Value * s0, Value * s1, Value * hi_mask, unsigned shift, Value * &p0, Value * &p1) {
20    Value * t0 = nullptr;
21    Value * t1 = nullptr;
22    if ((iBuilder->getBitBlockWidth() == 256) && (PACK_LANES == 2)) {
23        Value * x0 = iBuilder->esimd_mergel(128, s0, s1);
24        Value * x1 = iBuilder->esimd_mergeh(128, s0, s1);
25
26        t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1); // TODO 4䞪bit streams时这里的16改䞺8?
27        t1 = iBuilder->hsimd_packl_in_lanes(PACK_LANES, 16, x0, x1);
28
29    } else {
30        t0 = iBuilder->hsimd_packh(16, s0, s1);
31        t1 = iBuilder->hsimd_packl(16, s0, s1);
32    }
33
34    p0 = iBuilder->simd_if(1, hi_mask, t0, iBuilder->simd_srli(16, t1, shift));
35    p1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, t0, shift), t1);
36}
37
38void s2p(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[], cc::BitNumbering basisNumbering) {
39    {
40        //input[0 - 3]
41        Value* bit3311[2];
42        Value* bit2200[2];
43        for (unsigned i = 0; i < 2; i++) {
44            s2p_step(iBuilder, input[2 * i], input[2 * i + 1], iBuilder->simd_himask(2), 1, bit3311[i], bit2200[i]);
45        }
46
47        Value* out[4];
48        s2p_step(iBuilder, bit3311[0], bit3311[1],
49                 iBuilder->simd_himask(4), 2, out[3], out[1]);
50
51        s2p_step(iBuilder, bit2200[0], bit2200[1],
52                 iBuilder->simd_himask(4), 2, out[2], out[0]);
53    }
54
55
56    // Little-endian bit number is used for variables.
57    Value * bit66442200[4];
58    Value * bit77553311[4];
59
60    for (unsigned i = 0; i < 4; i++) {
61        Value * s0 = input[2 * i];
62        Value * s1 = input[2 * i + 1];
63        s2p_step(iBuilder, s0, s1, iBuilder->simd_himask(2), 1, bit77553311[i], bit66442200[i]);
64    }
65    Value * bit44440000[2];
66    Value * bit66662222[2];
67    Value * bit55551111[2];
68    Value * bit77773333[2];
69    for (unsigned j = 0; j<2; j++) {
70        s2p_step(iBuilder, bit66442200[2*j], bit66442200[2*j+1],
71                 iBuilder->simd_himask(4), 2, bit66662222[j], bit44440000[j]);
72        s2p_step(iBuilder, bit77553311[2*j], bit77553311[2*j+1],
73                 iBuilder->simd_himask(4), 2, bit77773333[j], bit55551111[j]);
74    }
75    if (basisNumbering == cc::BitNumbering::LittleEndian) {
76        s2p_step(iBuilder, bit44440000[0], bit44440000[1], iBuilder->simd_himask(8), 4, output[4], output[0]);
77        s2p_step(iBuilder, bit55551111[0], bit55551111[1], iBuilder->simd_himask(8), 4, output[5], output[1]);
78        s2p_step(iBuilder, bit66662222[0], bit66662222[1], iBuilder->simd_himask(8), 4, output[6], output[2]);
79        s2p_step(iBuilder, bit77773333[0], bit77773333[1], iBuilder->simd_himask(8), 4, output[7], output[3]);
80    }
81    else {
82        s2p_step(iBuilder, bit44440000[0], bit44440000[1], iBuilder->simd_himask(8), 4, output[3], output[7]);
83        s2p_step(iBuilder, bit55551111[0], bit55551111[1], iBuilder->simd_himask(8), 4, output[2], output[6]);
84        s2p_step(iBuilder, bit66662222[0], bit66662222[1], iBuilder->simd_himask(8), 4, output[1], output[5]);
85        s2p_step(iBuilder, bit77773333[0], bit77773333[1], iBuilder->simd_himask(8), 4, output[0], output[4]);
86    }
87}
88
89/* Alternative transposition model, but small field width packs are problematic. */
90#if 0
91void s2p_ideal(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[], cc::BitNumbering basisNumbering) {
92    Value * hi_nybble[4];
93    Value * lo_nybble[4];
94    for (unsigned i = 0; i<4; i++) {
95        Value * s0 = input[2*i];
96        Value * s1 = input[2*i+1];
97        hi_nybble[i] = iBuilder->hsimd_packh(8, s0, s1);
98        lo_nybble[i] = iBuilder->hsimd_packl(8, s0, s1);
99    }
100    Value * pair76[2];
101    Value * pair54[2];
102    Value * pair32[2];
103    Value * pair10[2];
104    for (unsigned i = 0; i<2; i++) {
105        pair76[i] = iBuilder->hsimd_packh(4, hi_nybble[2*i], hi_nybble[2*i+1]);
106        pair54[i] = iBuilder->hsimd_packl(4, hi_nybble[2*i], hi_nybble[2*i+1]);
107        pair32[i] = iBuilder->hsimd_packh(4, lo_nybble[2*i], lo_nybble[2*i+1]);
108        pair10[i] = iBuilder->hsimd_packl(4, lo_nybble[2*i], lo_nybble[2*i+1]);
109    }
110    if (basisNumbering == cc::BitNumbering::LittleEndian) {
111        output[7] = iBuilder->hsimd_packh(2, pair76[0], pair76[1]);
112        output[6] = iBuilder->hsimd_packl(2, pair76[0], pair76[1]);
113        output[5] = iBuilder->hsimd_packh(2, pair54[0], pair54[1]);
114        output[4] = iBuilder->hsimd_packl(2, pair54[0], pair54[1]);
115        output[3] = iBuilder->hsimd_packh(2, pair32[0], pair32[1]);
116        output[2] = iBuilder->hsimd_packl(2, pair32[0], pair32[1]);
117        output[1] = iBuilder->hsimd_packh(2, pair10[0], pair10[1]);
118        output[0] = iBuilder->hsimd_packl(2, pair10[0], pair10[1]);
119    } else {
120        output[0] = iBuilder->hsimd_packh(2, pair76[0], pair76[1]);
121        output[1] = iBuilder->hsimd_packl(2, pair76[0], pair76[1]);
122        output[2] = iBuilder->hsimd_packh(2, pair54[0], pair54[1]);
123        output[3] = iBuilder->hsimd_packl(2, pair54[0], pair54[1]);
124        output[4] = iBuilder->hsimd_packh(2, pair32[0], pair32[1]);
125        output[5] = iBuilder->hsimd_packl(2, pair32[0], pair32[1]);
126        output[6] = iBuilder->hsimd_packh(2, pair10[0], pair10[1]);
127        output[7] = iBuilder->hsimd_packl(2, pair10[0], pair10[1]);
128    }
129}
130#endif
131
132void S2PKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfBlocks) {
133    BasicBlock * entry = kb->GetInsertBlock();
134    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
135    BasicBlock * s2pDone = kb->CreateBasicBlock("s2pDone");
136    Constant * const ZERO = kb->getSize(0);
137
138    kb->CreateBr(processBlock);
139   
140    kb->SetInsertPoint(processBlock);
141    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
142    blockOffsetPhi->addIncoming(ZERO, entry);
143
144    Value * bytepack[8];
145    for (unsigned i = 0; i < 8; i++) {
146        if (mAligned) {
147            bytepack[i] = kb->loadInputStreamPack("byteStream", ZERO, kb->getInt32(i), blockOffsetPhi);
148        } else {
149            Value * ptr = kb->getInputStreamPackPtr("byteStream", ZERO, kb->getInt32(i), blockOffsetPhi);
150            // CreateLoad defaults to aligned here, so we need to force the alignment to 1 byte.
151            bytepack[i] = kb->CreateAlignedLoad(ptr, 1);
152        }
153    }
154    Value * basisbits[8];
155    s2p(kb, bytepack, basisbits, mBasisSetNumbering);
156    for (unsigned i = 0; i < mNumOfStreams; ++i) {
157        kb->storeOutputStreamBlock("basisBits", kb->getInt32(i), blockOffsetPhi, basisbits[i]);
158    }
159    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
160    blockOffsetPhi->addIncoming(nextBlk, processBlock);
161    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
162    kb->CreateCondBr(moreToDo, processBlock, s2pDone);
163    kb->SetInsertPoint(s2pDone);
164}
165
166S2PKernel::S2PKernel(const std::unique_ptr<KernelBuilder> & b, cc::BitNumbering numbering, bool aligned, std::string prefix, unsigned numOfStreams)
167    : MultiBlockKernel(aligned ? prefix + "s2p" + cc::numberingSuffix(numbering): prefix + "s2p_unaligned" + cc::numberingSuffix(numbering),
168    {Binding{b->getStreamSetTy(1, 8), "byteStream", FixedRate(), Principal()}},
169    {Binding{b->getStreamSetTy(numOfStreams, 1), "basisBits"}}, {}, {}, {}),
170  mBasisSetNumbering(numbering),
171  mAligned(aligned),
172  mNumOfStreams(numOfStreams)
173{
174    if (!aligned) {
175        mStreamSetInputs[0].addAttribute(Misaligned());
176    }
177}
178   
179S2P_21Kernel::S2P_21Kernel(const std::unique_ptr<KernelBuilder> & b, cc::BitNumbering numbering)
180: MultiBlockKernel("s2p_21" + cc::numberingSuffix(numbering),
181                   {Binding{b->getStreamSetTy(1, 32), "codeUnitStream", FixedRate(), Principal()}},
182                   {Binding{b->getStreamSetTy(21, 1), "basisBits"}}, {}, {}, {}),
183    mBasisSetNumbering(numbering) {
184}
185
186void S2P_21Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfBlocks) {
187    BasicBlock * entry = kb->GetInsertBlock();
188    BasicBlock * processBlock = kb->CreateBasicBlock("s2p21_loop");
189    BasicBlock * s2pDone = kb->CreateBasicBlock("s2p21_done");
190    Constant * const ZERO = kb->getSize(0);
191   
192    kb->CreateBr(processBlock);
193    kb->SetInsertPoint(processBlock);
194    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
195    blockOffsetPhi->addIncoming(ZERO, entry);
196    Value * u32byte0[8];
197    Value * u32byte1[8];
198    Value * u32byte2[8];
199    for (unsigned i = 0; i < 8; i++) {
200        Value * UTF32units[4];
201        for (unsigned j = 0; j < 4; j++) {
202            UTF32units[j] = kb->loadInputStreamPack("codeUnitStream", ZERO, kb->getInt32(4*i + j), blockOffsetPhi);
203        }
204        Value * u32lo16_0 = kb->hsimd_packl(32, UTF32units[0], UTF32units[1]);
205        Value * u32lo16_1 = kb->hsimd_packl(32, UTF32units[2], UTF32units[3]);
206        Value * u32hi16_0 = kb->hsimd_packh(32, UTF32units[0], UTF32units[1]);
207        Value * u32hi16_1 = kb->hsimd_packh(32, UTF32units[2], UTF32units[3]);
208        u32byte0[i] = kb->hsimd_packl(16, u32lo16_0, u32lo16_1);
209        u32byte1[i] = kb->hsimd_packh(16, u32lo16_0, u32lo16_1);
210        u32byte2[i] = kb->hsimd_packl(16, u32hi16_0, u32hi16_1);
211    #ifdef VALIDATE_U32
212        //  Validation should ensure that none of the high 11 bits are
213        //  set for any UTF-32 code unit.   We simply combine the bits
214        //  of code units together with bitwise-or, and then perform a
215        //  single check at the end.
216        u32_check = simd_or(u32_check, simd_or(u32hi16_0, u32hi16_1));
217    #endif
218    }
219    Value * basisbits[24];
220    s2p(kb, u32byte0, basisbits, cc::BitNumbering::LittleEndian);
221    s2p(kb, u32byte1, &basisbits[8], cc::BitNumbering::LittleEndian);
222    s2p(kb, u32byte2, &basisbits[16], cc::BitNumbering::LittleEndian);
223    for (unsigned i = 0; i < 21; ++i) {
224        const unsigned bitIdx = mBasisSetNumbering == cc::BitNumbering::LittleEndian ? i : 21 - i;
225        kb->storeOutputStreamBlock("basisBits", kb->getInt32(i), blockOffsetPhi, basisbits[bitIdx]);
226    }
227    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
228    blockOffsetPhi->addIncoming(nextBlk, processBlock);
229    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
230    kb->CreateCondBr(moreToDo, processBlock, s2pDone);
231    kb->SetInsertPoint(s2pDone);
232}
233
234void S2P_PabloKernel::generatePabloMethod() {
235    pablo::PabloBlock * const pb = getEntryScope();
236    const unsigned steps = std::log2(mCodeUnitWidth);
237    std::vector<PabloAST *> streamSet[steps + 1];
238    for (unsigned i = 0; i <= steps; i++) {
239        streamSet[i].resize(1<<i);
240    }
241    streamSet[0][0] = pb->createExtract(getInputStreamVar("codeUnitStream"), pb->getInteger(0));
242    unsigned streamWidth = mCodeUnitWidth;
243    for (unsigned i = 1; i <= steps; i++) {
244        for (unsigned j = 0; j < streamSet[i-1].size(); j++) {
245            auto strm = streamSet[i-1][j];
246            streamSet[i][2*j] = pb->createPackL(pb->getInteger(streamWidth), strm);
247            streamSet[i][2*j+1] = pb->createPackH(pb->getInteger(streamWidth), strm);
248        }
249        streamWidth = streamWidth/2;
250    }
251    for (unsigned bit = 0; bit < mCodeUnitWidth; bit++) {
252        const unsigned bitIndex = mBasisSetNumbering == cc::BitNumbering::LittleEndian ? bit : mCodeUnitWidth-1-bit;
253        pb->createAssign(pb->createExtract(getOutputStreamVar("basisBits"), pb->getInteger(bitIndex)), streamSet[steps][bit]);
254    }
255}
256
257S2P_PabloKernel::S2P_PabloKernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned codeUnitWidth, cc::BitNumbering numbering)
258: PabloKernel(b, "s2p_pablo" + std::to_string(codeUnitWidth) + cc::numberingSuffix(numbering),
259    {Binding{b->getStreamSetTy(1, codeUnitWidth), "codeUnitStream"}},
260    {Binding{b->getStreamSetTy(codeUnitWidth, 1), "basisBits"}}),
261  mCodeUnitWidth(codeUnitWidth),
262  mBasisSetNumbering(numbering) {
263}
264
265
266}
Note: See TracBrowser for help on using the repository browser.