source: icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp @ 6145

Last change on this file since 6145 was 6145, checked in by xwa163, 3 months ago
  1. LZ4 Grep: complete utf8 character classes for multiplexing pipeline
  2. Implement multiple streams version of S2P and P2S
File size: 14.7 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "s2p_kernel.h"
7#include <kernels/kernel_builder.h>
8#include <pablo/pabloAST.h>
9#include <pablo/builder.hpp>
10#include <pablo/pe_pack.h>
11
12#include <llvm/Support/raw_ostream.h>
13
14using namespace llvm;
15
16namespace kernel {
17
18const int PACK_LANES = 2;
19void s2p_step(const std::unique_ptr<KernelBuilder> & iBuilder, Value * s0, Value * s1, Value * hi_mask, unsigned shift, Value * &p0, Value * &p1) {
20    Value * t0 = nullptr;
21    Value * t1 = nullptr;
22    if ((iBuilder->getBitBlockWidth() == 256) && (PACK_LANES == 2)) {
23        Value * x0 = iBuilder->esimd_mergel(128, s0, s1);
24        Value * x1 = iBuilder->esimd_mergeh(128, s0, s1);
25
26        t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1); // TODO 4䞪bit streams时这里的16改䞺8?
27        t1 = iBuilder->hsimd_packl_in_lanes(PACK_LANES, 16, x0, x1);
28
29    } else {
30        t0 = iBuilder->hsimd_packh(16, s0, s1);
31        t1 = iBuilder->hsimd_packl(16, s0, s1);
32    }
33
34    p0 = iBuilder->simd_if(1, hi_mask, t0, iBuilder->simd_srli(16, t1, shift));
35    p1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, t0, shift), t1);
36}
37
38void s2p(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[], cc::BitNumbering basisNumbering) {
39    {
40        //input[0 - 3]
41        Value* bit3311[2];
42        Value* bit2200[2];
43        for (unsigned i = 0; i < 2; i++) {
44            s2p_step(iBuilder, input[2 * i], input[2 * i + 1], iBuilder->simd_himask(2), 1, bit3311[i], bit2200[i]);
45        }
46
47        Value* out[4];
48        s2p_step(iBuilder, bit3311[0], bit3311[1],
49                 iBuilder->simd_himask(4), 2, out[3], out[1]);
50
51        s2p_step(iBuilder, bit2200[0], bit2200[1],
52                 iBuilder->simd_himask(4), 2, out[2], out[0]);
53    }
54
55
56    // Little-endian bit number is used for variables.
57    Value * bit66442200[4];
58    Value * bit77553311[4];
59
60    for (unsigned i = 0; i < 4; i++) {
61        Value * s0 = input[2 * i];
62        Value * s1 = input[2 * i + 1];
63        s2p_step(iBuilder, s0, s1, iBuilder->simd_himask(2), 1, bit77553311[i], bit66442200[i]);
64    }
65    Value * bit44440000[2];
66    Value * bit66662222[2];
67    Value * bit55551111[2];
68    Value * bit77773333[2];
69    for (unsigned j = 0; j<2; j++) {
70        s2p_step(iBuilder, bit66442200[2*j], bit66442200[2*j+1],
71                 iBuilder->simd_himask(4), 2, bit66662222[j], bit44440000[j]);
72        s2p_step(iBuilder, bit77553311[2*j], bit77553311[2*j+1],
73                 iBuilder->simd_himask(4), 2, bit77773333[j], bit55551111[j]);
74    }
75    if (basisNumbering == cc::BitNumbering::LittleEndian) {
76        s2p_step(iBuilder, bit44440000[0], bit44440000[1], iBuilder->simd_himask(8), 4, output[4], output[0]);
77        s2p_step(iBuilder, bit55551111[0], bit55551111[1], iBuilder->simd_himask(8), 4, output[5], output[1]);
78        s2p_step(iBuilder, bit66662222[0], bit66662222[1], iBuilder->simd_himask(8), 4, output[6], output[2]);
79        s2p_step(iBuilder, bit77773333[0], bit77773333[1], iBuilder->simd_himask(8), 4, output[7], output[3]);
80    }
81    else {
82        s2p_step(iBuilder, bit44440000[0], bit44440000[1], iBuilder->simd_himask(8), 4, output[3], output[7]);
83        s2p_step(iBuilder, bit55551111[0], bit55551111[1], iBuilder->simd_himask(8), 4, output[2], output[6]);
84        s2p_step(iBuilder, bit66662222[0], bit66662222[1], iBuilder->simd_himask(8), 4, output[1], output[5]);
85        s2p_step(iBuilder, bit77773333[0], bit77773333[1], iBuilder->simd_himask(8), 4, output[0], output[4]);
86    }
87}
88
89/* Alternative transposition model, but small field width packs are problematic. */
90#if 0
91void s2p_ideal(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[], cc::BitNumbering basisNumbering) {
92    Value * hi_nybble[4];
93    Value * lo_nybble[4];
94    for (unsigned i = 0; i<4; i++) {
95        Value * s0 = input[2*i];
96        Value * s1 = input[2*i+1];
97        hi_nybble[i] = iBuilder->hsimd_packh(8, s0, s1);
98        lo_nybble[i] = iBuilder->hsimd_packl(8, s0, s1);
99    }
100    Value * pair76[2];
101    Value * pair54[2];
102    Value * pair32[2];
103    Value * pair10[2];
104    for (unsigned i = 0; i<2; i++) {
105        pair76[i] = iBuilder->hsimd_packh(4, hi_nybble[2*i], hi_nybble[2*i+1]);
106        pair54[i] = iBuilder->hsimd_packl(4, hi_nybble[2*i], hi_nybble[2*i+1]);
107        pair32[i] = iBuilder->hsimd_packh(4, lo_nybble[2*i], lo_nybble[2*i+1]);
108        pair10[i] = iBuilder->hsimd_packl(4, lo_nybble[2*i], lo_nybble[2*i+1]);
109    }
110    if (basisNumbering == cc::BitNumbering::LittleEndian) {
111        output[7] = iBuilder->hsimd_packh(2, pair76[0], pair76[1]);
112        output[6] = iBuilder->hsimd_packl(2, pair76[0], pair76[1]);
113        output[5] = iBuilder->hsimd_packh(2, pair54[0], pair54[1]);
114        output[4] = iBuilder->hsimd_packl(2, pair54[0], pair54[1]);
115        output[3] = iBuilder->hsimd_packh(2, pair32[0], pair32[1]);
116        output[2] = iBuilder->hsimd_packl(2, pair32[0], pair32[1]);
117        output[1] = iBuilder->hsimd_packh(2, pair10[0], pair10[1]);
118        output[0] = iBuilder->hsimd_packl(2, pair10[0], pair10[1]);
119    } else {
120        output[0] = iBuilder->hsimd_packh(2, pair76[0], pair76[1]);
121        output[1] = iBuilder->hsimd_packl(2, pair76[0], pair76[1]);
122        output[2] = iBuilder->hsimd_packh(2, pair54[0], pair54[1]);
123        output[3] = iBuilder->hsimd_packl(2, pair54[0], pair54[1]);
124        output[4] = iBuilder->hsimd_packh(2, pair32[0], pair32[1]);
125        output[5] = iBuilder->hsimd_packl(2, pair32[0], pair32[1]);
126        output[6] = iBuilder->hsimd_packh(2, pair10[0], pair10[1]);
127        output[7] = iBuilder->hsimd_packl(2, pair10[0], pair10[1]);
128    }
129}
130#endif
131
132void S2PKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfBlocks) {
133    BasicBlock * entry = kb->GetInsertBlock();
134    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
135    BasicBlock * s2pDone = kb->CreateBasicBlock("s2pDone");
136    Constant * const ZERO = kb->getSize(0);
137
138    kb->CreateBr(processBlock);
139   
140    kb->SetInsertPoint(processBlock);
141    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
142    blockOffsetPhi->addIncoming(ZERO, entry);
143
144    Value * bytepack[8];
145    for (unsigned i = 0; i < 8; i++) {
146        if (mAligned) {
147            bytepack[i] = kb->loadInputStreamPack("byteStream", ZERO, kb->getInt32(i), blockOffsetPhi);
148        } else {
149            Value * ptr = kb->getInputStreamPackPtr("byteStream", ZERO, kb->getInt32(i), blockOffsetPhi);
150            // CreateLoad defaults to aligned here, so we need to force the alignment to 1 byte.
151            bytepack[i] = kb->CreateAlignedLoad(ptr, 1);
152        }
153    }
154    Value * basisbits[8];
155    s2p(kb, bytepack, basisbits, mBasisSetNumbering);
156    for (unsigned i = 0; i < mNumOfStreams; ++i) {
157        kb->storeOutputStreamBlock("basisBits", kb->getInt32(i), blockOffsetPhi, basisbits[i]);
158    }
159    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
160    blockOffsetPhi->addIncoming(nextBlk, processBlock);
161    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
162    kb->CreateCondBr(moreToDo, processBlock, s2pDone);
163    kb->SetInsertPoint(s2pDone);
164}
165
166S2PKernel::S2PKernel(const std::unique_ptr<KernelBuilder> & b, cc::BitNumbering numbering, bool aligned, std::string prefix, unsigned numOfStreams)
167    : MultiBlockKernel(aligned ? prefix + "s2p" + cc::numberingSuffix(numbering): prefix + "s2p_unaligned" + cc::numberingSuffix(numbering),
168    {Binding{b->getStreamSetTy(1, 8), "byteStream", FixedRate(), Principal()}},
169    {Binding{b->getStreamSetTy(numOfStreams, 1), "basisBits"}}, {}, {}, {}),
170  mBasisSetNumbering(numbering),
171  mAligned(aligned),
172  mNumOfStreams(numOfStreams)
173{
174    if (!aligned) {
175        mStreamSetInputs[0].addAttribute(Misaligned());
176    }
177}
178
179S2PMultipleStreamsKernel::S2PMultipleStreamsKernel(
180        const std::unique_ptr<kernel::KernelBuilder> & b,
181        cc::BitNumbering basisNumbering,
182        bool aligned,
183        std::vector<unsigned> numsOfStreams)
184: MultiBlockKernel(aligned ? "s2pMultipleStreams" + cc::numberingSuffix(basisNumbering): "s2p_unaligned" + cc::numberingSuffix(basisNumbering),
185                   {Binding{b->getStreamSetTy(1, 8), "byteStream", FixedRate(), Principal()}},
186                   {}, {}, {}, {}),
187  mBasisSetNumbering(basisNumbering),
188  mAligned(aligned),
189  mNumsOfStreams(numsOfStreams)
190{
191    for (unsigned i = 0; i < numsOfStreams.size(); i++) {
192        mStreamSetOutputs.push_back(Binding{b->getStreamSetTy(numsOfStreams[i], 1), "basisBits_" + std::to_string(i)});
193    }
194}
195
196void S2PMultipleStreamsKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &kb,
197                                                       llvm::Value *const numOfBlocks) {
198    BasicBlock * entry = kb->GetInsertBlock();
199    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
200    BasicBlock * s2pDone = kb->CreateBasicBlock("s2pDone");
201    Constant * const ZERO = kb->getSize(0);
202
203    kb->CreateBr(processBlock);
204
205    kb->SetInsertPoint(processBlock);
206    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
207    blockOffsetPhi->addIncoming(ZERO, entry);
208
209    Value * bytepack[8];
210    for (unsigned i = 0; i < 8; i++) {
211        if (mAligned) {
212            bytepack[i] = kb->loadInputStreamPack("byteStream", ZERO, kb->getInt32(i), blockOffsetPhi);
213        } else {
214            Value * ptr = kb->getInputStreamPackPtr("byteStream", ZERO, kb->getInt32(i), blockOffsetPhi);
215            // CreateLoad defaults to aligned here, so we need to force the alignment to 1 byte.
216            bytepack[i] = kb->CreateAlignedLoad(ptr, 1);
217        }
218    }
219    Value * basisbits[8];
220    s2p(kb, bytepack, basisbits, mBasisSetNumbering);
221    unsigned iStreamIndex = 0;
222    for (unsigned i = 0; i < mNumsOfStreams.size(); i++) {
223        for (unsigned j = 0; j < mNumsOfStreams[i]; j++) {
224            kb->storeOutputStreamBlock("basisBits_" + std::to_string(i), kb->getInt32(j), blockOffsetPhi, basisbits[iStreamIndex]);
225            iStreamIndex++;
226        }
227    }
228
229    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
230    blockOffsetPhi->addIncoming(nextBlk, processBlock);
231    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
232    kb->CreateCondBr(moreToDo, processBlock, s2pDone);
233    kb->SetInsertPoint(s2pDone);
234}
235
236
237S2P_21Kernel::S2P_21Kernel(const std::unique_ptr<KernelBuilder> & b, cc::BitNumbering numbering)
238: MultiBlockKernel("s2p_21" + cc::numberingSuffix(numbering),
239                   {Binding{b->getStreamSetTy(1, 32), "codeUnitStream", FixedRate(), Principal()}},
240                   {Binding{b->getStreamSetTy(21, 1), "basisBits"}}, {}, {}, {}),
241    mBasisSetNumbering(numbering) {
242}
243
244void S2P_21Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfBlocks) {
245    BasicBlock * entry = kb->GetInsertBlock();
246    BasicBlock * processBlock = kb->CreateBasicBlock("s2p21_loop");
247    BasicBlock * s2pDone = kb->CreateBasicBlock("s2p21_done");
248    Constant * const ZERO = kb->getSize(0);
249   
250    kb->CreateBr(processBlock);
251    kb->SetInsertPoint(processBlock);
252    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
253    blockOffsetPhi->addIncoming(ZERO, entry);
254    Value * u32byte0[8];
255    Value * u32byte1[8];
256    Value * u32byte2[8];
257    for (unsigned i = 0; i < 8; i++) {
258        Value * UTF32units[4];
259        for (unsigned j = 0; j < 4; j++) {
260            UTF32units[j] = kb->loadInputStreamPack("codeUnitStream", ZERO, kb->getInt32(4*i + j), blockOffsetPhi);
261        }
262        Value * u32lo16_0 = kb->hsimd_packl(32, UTF32units[0], UTF32units[1]);
263        Value * u32lo16_1 = kb->hsimd_packl(32, UTF32units[2], UTF32units[3]);
264        Value * u32hi16_0 = kb->hsimd_packh(32, UTF32units[0], UTF32units[1]);
265        Value * u32hi16_1 = kb->hsimd_packh(32, UTF32units[2], UTF32units[3]);
266        u32byte0[i] = kb->hsimd_packl(16, u32lo16_0, u32lo16_1);
267        u32byte1[i] = kb->hsimd_packh(16, u32lo16_0, u32lo16_1);
268        u32byte2[i] = kb->hsimd_packl(16, u32hi16_0, u32hi16_1);
269    #ifdef VALIDATE_U32
270        //  Validation should ensure that none of the high 11 bits are
271        //  set for any UTF-32 code unit.   We simply combine the bits
272        //  of code units together with bitwise-or, and then perform a
273        //  single check at the end.
274        u32_check = simd_or(u32_check, simd_or(u32hi16_0, u32hi16_1));
275    #endif
276    }
277    Value * basisbits[24];
278    s2p(kb, u32byte0, basisbits, cc::BitNumbering::LittleEndian);
279    s2p(kb, u32byte1, &basisbits[8], cc::BitNumbering::LittleEndian);
280    s2p(kb, u32byte2, &basisbits[16], cc::BitNumbering::LittleEndian);
281    for (unsigned i = 0; i < 21; ++i) {
282        const unsigned bitIdx = mBasisSetNumbering == cc::BitNumbering::LittleEndian ? i : 21 - i;
283        kb->storeOutputStreamBlock("basisBits", kb->getInt32(i), blockOffsetPhi, basisbits[bitIdx]);
284    }
285    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
286    blockOffsetPhi->addIncoming(nextBlk, processBlock);
287    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
288    kb->CreateCondBr(moreToDo, processBlock, s2pDone);
289    kb->SetInsertPoint(s2pDone);
290}
291
292void S2P_PabloKernel::generatePabloMethod() {
293    pablo::PabloBlock * const pb = getEntryScope();
294    const unsigned steps = std::log2(mCodeUnitWidth);
295    std::vector<PabloAST *> streamSet[steps + 1];
296    for (unsigned i = 0; i <= steps; i++) {
297        streamSet[i].resize(1<<i);
298    }
299    streamSet[0][0] = pb->createExtract(getInputStreamVar("codeUnitStream"), pb->getInteger(0));
300    unsigned streamWidth = mCodeUnitWidth;
301    for (unsigned i = 1; i <= steps; i++) {
302        for (unsigned j = 0; j < streamSet[i-1].size(); j++) {
303            auto strm = streamSet[i-1][j];
304            streamSet[i][2*j] = pb->createPackL(pb->getInteger(streamWidth), strm);
305            streamSet[i][2*j+1] = pb->createPackH(pb->getInteger(streamWidth), strm);
306        }
307        streamWidth = streamWidth/2;
308    }
309    for (unsigned bit = 0; bit < mCodeUnitWidth; bit++) {
310        const unsigned bitIndex = mBasisSetNumbering == cc::BitNumbering::LittleEndian ? bit : mCodeUnitWidth-1-bit;
311        pb->createAssign(pb->createExtract(getOutputStreamVar("basisBits"), pb->getInteger(bitIndex)), streamSet[steps][bit]);
312    }
313}
314
315S2P_PabloKernel::S2P_PabloKernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned codeUnitWidth, cc::BitNumbering numbering)
316: PabloKernel(b, "s2p_pablo" + std::to_string(codeUnitWidth) + cc::numberingSuffix(numbering),
317    {Binding{b->getStreamSetTy(1, codeUnitWidth), "codeUnitStream"}},
318    {Binding{b->getStreamSetTy(codeUnitWidth, 1), "basisBits"}}),
319  mCodeUnitWidth(codeUnitWidth),
320  mBasisSetNumbering(numbering) {
321}
322
323
324}
Note: See TracBrowser for help on using the repository browser.