source: icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp

Last change on this file was 6261, checked in by nmedfort, 5 months ago

Work on OptimizationBranch?; revisited pipeline termination

File size: 17.7 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "s2p_kernel.h"
7#include <kernels/callback.h>
8#include <kernels/kernel_builder.h>
9#include <pablo/pabloAST.h>
10#include <pablo/builder.hpp>
11#include <pablo/pe_pack.h>
12#include <llvm/IR/Module.h>
13#include <llvm/Support/raw_ostream.h>
14
15using namespace llvm;
16
17namespace kernel {
18
19const int PACK_LANES = 2;
20void s2p_step(const std::unique_ptr<KernelBuilder> & iBuilder, Value * s0, Value * s1, Value * hi_mask, unsigned shift, Value * &p0, Value * &p1) {
21    Value * t0 = nullptr;
22    Value * t1 = nullptr;
23    if ((iBuilder->getBitBlockWidth() == 256) && (PACK_LANES == 2)) {
24        Value * x0 = iBuilder->esimd_mergel(128, s0, s1);
25        Value * x1 = iBuilder->esimd_mergeh(128, s0, s1);
26
27        t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1);
28        t1 = iBuilder->hsimd_packl_in_lanes(PACK_LANES, 16, x0, x1);
29
30    } else {
31        t0 = iBuilder->hsimd_packh(16, s0, s1);
32        t1 = iBuilder->hsimd_packl(16, s0, s1);
33    }
34
35    p0 = iBuilder->simd_if(1, hi_mask, t0, iBuilder->simd_srli(16, t1, shift));
36    p1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, t0, shift), t1);
37}
38
39void s2p(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[], cc::BitNumbering basisNumbering) {
40    // Little-endian bit number is used for variables.
41    Value * bit66442200[4];
42    Value * bit77553311[4];
43
44    for (unsigned i = 0; i < 4; i++) {
45        Value * s0 = input[2 * i];
46        Value * s1 = input[2 * i + 1];
47        s2p_step(iBuilder, s0, s1, iBuilder->simd_himask(2), 1, bit77553311[i], bit66442200[i]);
48    }
49    Value * bit44440000[2];
50    Value * bit66662222[2];
51    Value * bit55551111[2];
52    Value * bit77773333[2];
53    for (unsigned j = 0; j<2; j++) {
54        s2p_step(iBuilder, bit66442200[2*j], bit66442200[2*j+1],
55                 iBuilder->simd_himask(4), 2, bit66662222[j], bit44440000[j]);
56        s2p_step(iBuilder, bit77553311[2*j], bit77553311[2*j+1],
57                 iBuilder->simd_himask(4), 2, bit77773333[j], bit55551111[j]);
58    }
59    if (basisNumbering == cc::BitNumbering::LittleEndian) {
60        s2p_step(iBuilder, bit44440000[0], bit44440000[1], iBuilder->simd_himask(8), 4, output[4], output[0]);
61        s2p_step(iBuilder, bit55551111[0], bit55551111[1], iBuilder->simd_himask(8), 4, output[5], output[1]);
62        s2p_step(iBuilder, bit66662222[0], bit66662222[1], iBuilder->simd_himask(8), 4, output[6], output[2]);
63        s2p_step(iBuilder, bit77773333[0], bit77773333[1], iBuilder->simd_himask(8), 4, output[7], output[3]);
64    }
65    else {
66        s2p_step(iBuilder, bit44440000[0], bit44440000[1], iBuilder->simd_himask(8), 4, output[3], output[7]);
67        s2p_step(iBuilder, bit55551111[0], bit55551111[1], iBuilder->simd_himask(8), 4, output[2], output[6]);
68        s2p_step(iBuilder, bit66662222[0], bit66662222[1], iBuilder->simd_himask(8), 4, output[1], output[5]);
69        s2p_step(iBuilder, bit77773333[0], bit77773333[1], iBuilder->simd_himask(8), 4, output[0], output[4]);
70    }
71}
72
73/* Alternative transposition model, but small field width packs are problematic. */
74#if 0
75void s2p_ideal(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[], cc::BitNumbering basisNumbering) {
76    Value * hi_nybble[4];
77    Value * lo_nybble[4];
78    for (unsigned i = 0; i<4; i++) {
79        Value * s0 = input[2*i];
80        Value * s1 = input[2*i+1];
81        hi_nybble[i] = iBuilder->hsimd_packh(8, s0, s1);
82        lo_nybble[i] = iBuilder->hsimd_packl(8, s0, s1);
83    }
84    Value * pair76[2];
85    Value * pair54[2];
86    Value * pair32[2];
87    Value * pair10[2];
88    for (unsigned i = 0; i<2; i++) {
89        pair76[i] = iBuilder->hsimd_packh(4, hi_nybble[2*i], hi_nybble[2*i+1]);
90        pair54[i] = iBuilder->hsimd_packl(4, hi_nybble[2*i], hi_nybble[2*i+1]);
91        pair32[i] = iBuilder->hsimd_packh(4, lo_nybble[2*i], lo_nybble[2*i+1]);
92        pair10[i] = iBuilder->hsimd_packl(4, lo_nybble[2*i], lo_nybble[2*i+1]);
93    }
94    if (basisNumbering == cc::BitNumbering::LittleEndian) {
95        output[7] = iBuilder->hsimd_packh(2, pair76[0], pair76[1]);
96        output[6] = iBuilder->hsimd_packl(2, pair76[0], pair76[1]);
97        output[5] = iBuilder->hsimd_packh(2, pair54[0], pair54[1]);
98        output[4] = iBuilder->hsimd_packl(2, pair54[0], pair54[1]);
99        output[3] = iBuilder->hsimd_packh(2, pair32[0], pair32[1]);
100        output[2] = iBuilder->hsimd_packl(2, pair32[0], pair32[1]);
101        output[1] = iBuilder->hsimd_packh(2, pair10[0], pair10[1]);
102        output[0] = iBuilder->hsimd_packl(2, pair10[0], pair10[1]);
103    } else {
104        output[0] = iBuilder->hsimd_packh(2, pair76[0], pair76[1]);
105        output[1] = iBuilder->hsimd_packl(2, pair76[0], pair76[1]);
106        output[2] = iBuilder->hsimd_packh(2, pair54[0], pair54[1]);
107        output[3] = iBuilder->hsimd_packl(2, pair54[0], pair54[1]);
108        output[4] = iBuilder->hsimd_packh(2, pair32[0], pair32[1]);
109        output[5] = iBuilder->hsimd_packl(2, pair32[0], pair32[1]);
110        output[6] = iBuilder->hsimd_packh(2, pair10[0], pair10[1]);
111        output[7] = iBuilder->hsimd_packl(2, pair10[0], pair10[1]);
112    }
113}
114#endif
115
116void S2PKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfBlocks) {
117    Module * m = kb->getModule();
118    DataLayout DL(m);
119    IntegerType * const intPtrTy = DL.getIntPtrType(kb->getContext());
120    PointerType * const voidPtrTy = kb->getVoidPtrTy();
121    BasicBlock * entry = kb->GetInsertBlock();
122    BasicBlock * s2pLoop = kb->CreateBasicBlock("s2pLoop");
123    BasicBlock * s2pFinalize = kb->CreateBasicBlock("s2pFinalize");
124    Constant * const ZERO = kb->getSize(0);
125    // Declarations for AbortOnNull mode:
126    PHINode * nullCheckPhi = nullptr;
127    Value * nonNullSoFar = nullptr;
128
129    kb->CreateBr(s2pLoop);
130
131    kb->SetInsertPoint(s2pLoop);
132    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
133    blockOffsetPhi->addIncoming(ZERO, entry);
134    if (mAbortOnNull) {
135        nullCheckPhi = kb->CreatePHI(kb->getBitBlockType(), 2);
136        nullCheckPhi->addIncoming(kb->allOnes(), entry);
137    }
138    Value * bytepack[8];
139    for (unsigned i = 0; i < 8; i++) {
140        bytepack[i] = kb->loadInputStreamPack("byteStream", ZERO, kb->getInt32(i), blockOffsetPhi);
141    }
142    Value * basisbits[8];
143    s2p(kb, bytepack, basisbits, mBasisSetNumbering);
144    for (unsigned i = 0; i < mNumOfStreams; ++i) {
145        kb->storeOutputStreamBlock("basisBits", kb->getInt32(i), blockOffsetPhi, basisbits[i]);
146    }
147    if (mAbortOnNull) {
148        Value * nonNull = kb->simd_or(kb->simd_or(kb->simd_or(basisbits[0], basisbits[1]),
149                                                  kb->simd_or(basisbits[2], basisbits[3])),
150                                      kb->simd_or(kb->simd_or(basisbits[4], basisbits[5]),
151                                                  kb->simd_or(basisbits[6], basisbits[7])));
152        nonNullSoFar = kb->simd_and(nonNull, nullCheckPhi);
153        nullCheckPhi->addIncoming(nonNullSoFar, s2pLoop);
154    }
155    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
156    blockOffsetPhi->addIncoming(nextBlk, s2pLoop);
157    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
158
159    kb->CreateCondBr(moreToDo, s2pLoop, s2pFinalize);
160
161    kb->SetInsertPoint(s2pFinalize);
162    //  s2p is complete, except for null byte check.
163    if (mAbortOnNull) {
164        BasicBlock * nullByteDetected = kb->CreateBasicBlock("nullByteDetected");
165        BasicBlock * nullInFileDetected = kb->CreateBasicBlock("nullInFileDetected");
166        BasicBlock * s2pExit = kb->CreateBasicBlock("s2pExit");
167        Value * itemsToDo = kb->getAccessibleItemCount("byteStream");
168        Value * anyNull = kb->bitblock_any(kb->simd_not(nonNullSoFar));
169        kb->CreateCondBr(anyNull, nullByteDetected, s2pExit);
170
171        kb->SetInsertPoint(nullByteDetected);
172        // A null byte has been detected, determine its position and whether it is past EOF.
173        Value * byteStreamBasePtr = kb->getInputStreamBlockPtr("byteStream", ZERO, ZERO);
174        Value * ptrToNull = kb->CreateMemChr(kb->CreatePointerCast(byteStreamBasePtr, voidPtrTy), kb->getInt32(0), itemsToDo);
175        Value * nullInFile = kb->CreateICmpNE(ptrToNull, ConstantPointerNull::get(cast<PointerType>(ptrToNull->getType())));
176        kb->CreateCondBr(nullInFile, nullInFileDetected, s2pExit);
177
178        kb->SetInsertPoint(nullInFileDetected);
179        // A null byte has been located within the file; set the termination code and call the signal handler.
180        Value * firstNull = kb->CreatePtrToInt(ptrToNull, intPtrTy);
181        Value * startOfStride = kb->CreatePtrToInt(byteStreamBasePtr, intPtrTy);
182        Value * itemsBeforeNull = kb->CreateZExtOrTrunc(kb->CreateSub(firstNull, startOfStride), itemsToDo->getType());
183        Value * producedCount = kb->CreateAdd(kb->getProducedItemCount("basisBits"), itemsBeforeNull);
184        kb->setProducedItemCount("basisBits", producedCount);
185        kb->setTerminationSignal();
186        Function * const dispatcher = m->getFunction("signal_dispatcher"); assert (dispatcher);
187        Value * handler = kb->getScalarField("handler_address");
188        kb->CreateCall(dispatcher, {handler, ConstantInt::get(kb->getInt32Ty(), NULL_SIGNAL)});
189        kb->CreateBr(s2pExit);
190
191        kb->SetInsertPoint(s2pExit);
192    }
193}
194
195Bindings S2PKernel::makeOutputBindings(StreamSet * const BasisBits, bool abortOnNull) {
196    if (abortOnNull) {
197        return {Binding("basisBits", BasisBits)};
198    } else {
199        return {Binding("basisBits", BasisBits)};
200    }
201}
202
203Bindings S2PKernel::makeInputScalarBindings(Scalar * signalNullObject) {
204    if (signalNullObject) {
205        return {Binding{"handler_address", signalNullObject}};
206    } else {
207        return {};
208    }
209}
210
211S2PKernel::S2PKernel(const std::unique_ptr<KernelBuilder> & b,
212                     StreamSet * const codeUnitStream,
213                     StreamSet * const BasisBits,
214                     const cc::BitNumbering numbering,
215                     Scalar * signalNullObject)
216: MultiBlockKernel(b, (signalNullObject ? "s2pa" : "s2p") + std::to_string(BasisBits->getNumElements()) + cc::numberingSuffix(numbering)
217, {Binding{"byteStream", codeUnitStream, FixedRate(), Principal()}}
218, makeOutputBindings(BasisBits, signalNullObject)
219, makeInputScalarBindings(signalNullObject), {}, {})
220, mBasisSetNumbering(numbering)
221, mAbortOnNull(signalNullObject != nullptr)
222, mNumOfStreams(BasisBits->getNumElements()) {
223    assert (codeUnitStream->getFieldWidth() == BasisBits->getNumElements());
224    if (mAbortOnNull) addAttribute(CanTerminateEarly());
225}
226
227inline std::string makeMultiS2PName(const StreamSets & outputStreams, const cc::BitNumbering basisNumbering, const bool aligned) {
228    std::string buffer;
229    raw_string_ostream out(buffer);
230    out << "s2p";
231    for (unsigned i = 0; i < outputStreams.size(); ++i) {
232        if (i) out << ".";
233        out << outputStreams[i]->getNumElements();
234    }
235    out << (aligned ? "a" : "u");
236    out << cc::numberingSuffix(basisNumbering);
237    out.flush();
238    return buffer;
239}
240
241S2PMultipleStreamsKernel::S2PMultipleStreamsKernel(const std::unique_ptr<kernel::KernelBuilder> & b,
242        StreamSet * codeUnitStream,
243        const StreamSets & outputStreams,
244        const cc::BitNumbering basisNumbering,
245        const bool aligned)
246: MultiBlockKernel(b, makeMultiS2PName(outputStreams, basisNumbering, aligned),
247// input
248{Binding{"byteStream", codeUnitStream}},
249{}, {}, {}, {}),
250mBasisSetNumbering(basisNumbering),
251mAligned(aligned) {
252    for (unsigned i = 0; i < outputStreams.size(); i++) {
253        mOutputStreamSets.emplace_back("basisBits_" + std::to_string(i), outputStreams[i]);
254    }
255}
256
257void S2PMultipleStreamsKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &b, Value *const numOfBlocks) {
258    BasicBlock * entry = b->GetInsertBlock();
259    BasicBlock * processBlock = b->CreateBasicBlock("processBlock");
260    BasicBlock * s2pDone = b->CreateBasicBlock("s2pDone");
261    Constant * const ZERO = b->getSize(0);
262
263    b->CreateBr(processBlock);
264
265    b->SetInsertPoint(processBlock);
266    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
267    blockOffsetPhi->addIncoming(ZERO, entry);
268
269    Value * bytepack[8];
270    for (unsigned i = 0; i < 8; i++) {
271        if (mAligned) {
272            bytepack[i] = b->loadInputStreamPack("byteStream", ZERO, b->getInt32(i), blockOffsetPhi);
273        } else {
274            Value * ptr = b->getInputStreamPackPtr("byteStream", ZERO, b->getInt32(i), blockOffsetPhi);
275            // CreateLoad defaults to aligned here, so we need to force the alignment to 1 byte.
276            bytepack[i] = b->CreateAlignedLoad(ptr, 1);
277        }
278    }
279    Value * basisbits[8];
280    s2p(b, bytepack, basisbits, mBasisSetNumbering);
281
282    unsigned k = 0;
283    for (unsigned i = 0; i < getNumOfStreamOutputs(); ++i) {
284        const auto m = getOutputStreamSet(i)->getNumElements();
285        for (unsigned j = 0; j < m; j++) {
286            b->storeOutputStreamBlock("basisBits_" + std::to_string(i), b->getInt32(j), blockOffsetPhi, basisbits[k++]);
287        }
288    }
289
290    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
291    blockOffsetPhi->addIncoming(nextBlk, processBlock);
292    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
293    b->CreateCondBr(moreToDo, processBlock, s2pDone);
294    b->SetInsertPoint(s2pDone);
295}
296
297
298S2P_21Kernel::S2P_21Kernel(const std::unique_ptr<KernelBuilder> & b, StreamSet * const codeUnitStream, StreamSet * const BasisBits, cc::BitNumbering numbering)
299: MultiBlockKernel(b, "s2p_21" + cc::numberingSuffix(numbering),
300{Binding{"codeUnitStream", codeUnitStream, FixedRate(), Principal()}},
301{Binding{"basisBits", BasisBits}}, {}, {}, {})
302, mBasisSetNumbering(numbering) {
303
304}
305
306void S2P_21Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfBlocks) {
307    BasicBlock * entry = kb->GetInsertBlock();
308    BasicBlock * processBlock = kb->CreateBasicBlock("s2p21_loop");
309    BasicBlock * s2pDone = kb->CreateBasicBlock("s2p21_done");
310    Constant * const ZERO = kb->getSize(0);
311
312    kb->CreateBr(processBlock);
313
314    kb->SetInsertPoint(processBlock);
315    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
316    blockOffsetPhi->addIncoming(ZERO, entry);
317
318    Value * u32byte0[8];
319    Value * u32byte1[8];
320    Value * u32byte2[8];
321    for (unsigned i = 0; i < 8; i++) {
322        Value * UTF32units[4];
323        for (unsigned j = 0; j < 4; j++) {
324            UTF32units[j] = kb->loadInputStreamPack("codeUnitStream", ZERO, kb->getInt32(4 * i + j), blockOffsetPhi);
325        }
326        Value * u32lo16_0 = kb->hsimd_packl(32, UTF32units[0], UTF32units[1]);
327        Value * u32lo16_1 = kb->hsimd_packl(32, UTF32units[2], UTF32units[3]);
328        Value * u32hi16_0 = kb->hsimd_packh(32, UTF32units[0], UTF32units[1]);
329        Value * u32hi16_1 = kb->hsimd_packh(32, UTF32units[2], UTF32units[3]);
330        u32byte0[i] = kb->hsimd_packl(16, u32lo16_0, u32lo16_1);
331        u32byte1[i] = kb->hsimd_packh(16, u32lo16_0, u32lo16_1);
332        u32byte2[i] = kb->hsimd_packl(16, u32hi16_0, u32hi16_1);
333    #ifdef VALIDATE_U32
334        //  Validation should ensure that none of the high 11 bits are
335        //  set for any UTF-32 code unit.   We simply combine the bits
336        //  of code units together with bitwise-or, and then perform a
337        //  single check at the end.
338        u32_check = simd_or(u32_check, simd_or(u32hi16_0, u32hi16_1));
339    #endif
340    }
341    Value * basisbits[24];
342    s2p(kb, u32byte0, basisbits, cc::BitNumbering::LittleEndian);
343    s2p(kb, u32byte1, &basisbits[8], cc::BitNumbering::LittleEndian);
344    s2p(kb, u32byte2, &basisbits[16], cc::BitNumbering::LittleEndian);
345    for (unsigned i = 0; i < 21; ++i) {
346        const unsigned bitIdx = mBasisSetNumbering == cc::BitNumbering::LittleEndian ? i : 21 - i;
347        kb->storeOutputStreamBlock("basisBits", kb->getInt32(i), blockOffsetPhi, basisbits[bitIdx]);
348    }
349    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
350    blockOffsetPhi->addIncoming(nextBlk, processBlock);
351    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
352    kb->CreateCondBr(moreToDo, processBlock, s2pDone);
353    kb->SetInsertPoint(s2pDone);
354}
355
356void S2P_PabloKernel::generatePabloMethod() {
357    pablo::PabloBlock * const pb = getEntryScope();
358    const unsigned steps = std::log2(mCodeUnitWidth);
359    std::vector<PabloAST *> streamSet[steps + 1];
360    for (unsigned i = 0; i <= steps; i++) {
361        streamSet[i].resize(1<<i);
362    }
363    streamSet[0][0] = pb->createExtract(getInputStreamVar("codeUnitStream"), pb->getInteger(0));
364    unsigned streamWidth = mCodeUnitWidth;
365    for (unsigned i = 1; i <= steps; i++) {
366        for (unsigned j = 0; j < streamSet[i-1].size(); j++) {
367            auto strm = streamSet[i-1][j];
368            streamSet[i][2*j] = pb->createPackL(pb->getInteger(streamWidth), strm);
369            streamSet[i][2*j+1] = pb->createPackH(pb->getInteger(streamWidth), strm);
370        }
371        streamWidth = streamWidth/2;
372    }
373    for (unsigned bit = 0; bit < mCodeUnitWidth; bit++) {
374        const unsigned bitIndex = mBasisSetNumbering == cc::BitNumbering::LittleEndian ? bit : mCodeUnitWidth-1-bit;
375        pb->createAssign(pb->createExtract(getOutputStreamVar("basisBits"), pb->getInteger(bitIndex)), streamSet[steps][bit]);
376    }
377}
378
379S2P_PabloKernel::S2P_PabloKernel(const std::unique_ptr<kernel::KernelBuilder> & b, StreamSet * const codeUnitStream, StreamSet * const BasisBits, cc::BitNumbering numbering)
380: PabloKernel(b, "s2p_pablo" + std::to_string(codeUnitStream->getFieldWidth()) + cc::numberingSuffix(numbering),
381// input
382{Binding{"codeUnitStream", codeUnitStream}},
383// output
384{Binding{"basisBits", BasisBits}}),
385mBasisSetNumbering(numbering),
386mCodeUnitWidth(codeUnitStream->getFieldWidth()) {
387    assert (codeUnitStream->getFieldWidth() == BasisBits->getNumElements());
388}
389
390
391}
392
Note: See TracBrowser for help on using the repository browser.