source: icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp @ 6214

Last change on this file since 6214 was 6214, checked in by cameron, 6 months ago

abort-on-null functionality for s2p

File size: 17.8 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "s2p_kernel.h"
7#include <kernels/callback.h>
8#include <kernels/kernel_builder.h>
9#include <pablo/pabloAST.h>
10#include <pablo/builder.hpp>
11#include <pablo/pe_pack.h>
12#include <llvm/IR/Module.h>
13#include <llvm/Support/raw_ostream.h>
14
15using namespace llvm;
16
17namespace kernel {
18
19const int PACK_LANES = 2;
20void s2p_step(const std::unique_ptr<KernelBuilder> & iBuilder, Value * s0, Value * s1, Value * hi_mask, unsigned shift, Value * &p0, Value * &p1) {
21    Value * t0 = nullptr;
22    Value * t1 = nullptr;
23    if ((iBuilder->getBitBlockWidth() == 256) && (PACK_LANES == 2)) {
24        Value * x0 = iBuilder->esimd_mergel(128, s0, s1);
25        Value * x1 = iBuilder->esimd_mergeh(128, s0, s1);
26
27        t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1);
28        t1 = iBuilder->hsimd_packl_in_lanes(PACK_LANES, 16, x0, x1);
29
30    } else {
31        t0 = iBuilder->hsimd_packh(16, s0, s1);
32        t1 = iBuilder->hsimd_packl(16, s0, s1);
33    }
34
35    p0 = iBuilder->simd_if(1, hi_mask, t0, iBuilder->simd_srli(16, t1, shift));
36    p1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, t0, shift), t1);
37}
38
39void s2p(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[], cc::BitNumbering basisNumbering) {
40    // Little-endian bit number is used for variables.
41    Value * bit66442200[4];
42    Value * bit77553311[4];
43
44    for (unsigned i = 0; i < 4; i++) {
45        Value * s0 = input[2 * i];
46        Value * s1 = input[2 * i + 1];
47        s2p_step(iBuilder, s0, s1, iBuilder->simd_himask(2), 1, bit77553311[i], bit66442200[i]);
48    }
49    Value * bit44440000[2];
50    Value * bit66662222[2];
51    Value * bit55551111[2];
52    Value * bit77773333[2];
53    for (unsigned j = 0; j<2; j++) {
54        s2p_step(iBuilder, bit66442200[2*j], bit66442200[2*j+1],
55                 iBuilder->simd_himask(4), 2, bit66662222[j], bit44440000[j]);
56        s2p_step(iBuilder, bit77553311[2*j], bit77553311[2*j+1],
57                 iBuilder->simd_himask(4), 2, bit77773333[j], bit55551111[j]);
58    }
59    if (basisNumbering == cc::BitNumbering::LittleEndian) {
60        s2p_step(iBuilder, bit44440000[0], bit44440000[1], iBuilder->simd_himask(8), 4, output[4], output[0]);
61        s2p_step(iBuilder, bit55551111[0], bit55551111[1], iBuilder->simd_himask(8), 4, output[5], output[1]);
62        s2p_step(iBuilder, bit66662222[0], bit66662222[1], iBuilder->simd_himask(8), 4, output[6], output[2]);
63        s2p_step(iBuilder, bit77773333[0], bit77773333[1], iBuilder->simd_himask(8), 4, output[7], output[3]);
64    }
65    else {
66        s2p_step(iBuilder, bit44440000[0], bit44440000[1], iBuilder->simd_himask(8), 4, output[3], output[7]);
67        s2p_step(iBuilder, bit55551111[0], bit55551111[1], iBuilder->simd_himask(8), 4, output[2], output[6]);
68        s2p_step(iBuilder, bit66662222[0], bit66662222[1], iBuilder->simd_himask(8), 4, output[1], output[5]);
69        s2p_step(iBuilder, bit77773333[0], bit77773333[1], iBuilder->simd_himask(8), 4, output[0], output[4]);
70    }
71}
72
/* Alternative transposition model, but small field width packs are problematic. */
#if 0
// Disabled: transposes by successive packs at field widths 8, 4 and 2.
void s2p_ideal(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[], cc::BitNumbering basisNumbering) {
    // Field width 8: separate high and low nybbles of each pair of inputs.
    Value * hi_nybble[4];
    Value * lo_nybble[4];
    for (unsigned i = 0; i < 4; i++) {
        Value * const first = input[2 * i];
        Value * const second = input[2 * i + 1];
        hi_nybble[i] = iBuilder->hsimd_packh(8, first, second);
        lo_nybble[i] = iBuilder->hsimd_packl(8, first, second);
    }

    // Field width 4: separate bit pairs.
    Value * pair76[2];
    Value * pair54[2];
    Value * pair32[2];
    Value * pair10[2];
    for (unsigned i = 0; i < 2; i++) {
        const unsigned a = 2 * i;
        const unsigned b = a + 1;
        pair76[i] = iBuilder->hsimd_packh(4, hi_nybble[a], hi_nybble[b]);
        pair54[i] = iBuilder->hsimd_packl(4, hi_nybble[a], hi_nybble[b]);
        pair32[i] = iBuilder->hsimd_packh(4, lo_nybble[a], lo_nybble[b]);
        pair10[i] = iBuilder->hsimd_packl(4, lo_nybble[a], lo_nybble[b]);
    }

    // Field width 2: separate individual bits.  Pair k holds little-endian
    // bits 7-2k (high pack) and 6-2k (low pack); the other numbering mirrors
    // the output index.
    Value ** const pairs[4] = { pair76, pair54, pair32, pair10 };
    const bool littleEndian = (basisNumbering == cc::BitNumbering::LittleEndian);
    for (unsigned k = 0; k < 4; k++) {
        const unsigned hiBit = 7 - 2 * k;
        const unsigned loBit = hiBit - 1;
        output[littleEndian ? hiBit : (7 - hiBit)] = iBuilder->hsimd_packh(2, pairs[k][0], pairs[k][1]);
        output[littleEndian ? loBit : (7 - loBit)] = iBuilder->hsimd_packl(2, pairs[k][0], pairs[k][1]);
    }
}
#endif
115
116void S2PKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfBlocks) {
117    Module * m = kb->getModule();
118    DataLayout DL(m);
119    IntegerType * const intPtrTy = DL.getIntPtrType(kb->getContext());
120    PointerType * const voidPtrTy = kb->getVoidPtrTy();
121    BasicBlock * entry = kb->GetInsertBlock();
122    BasicBlock * s2pLoop = kb->CreateBasicBlock("s2pLoop");
123    BasicBlock * s2pFinalize = kb->CreateBasicBlock("s2pFinalize");
124    Constant * const ZERO = kb->getSize(0);
125    // Declarations for AbortOnNull mode:
126    PHINode * nullCheckPhi = nullptr;
127    Value * nonNullSoFar = nullptr;
128   
129    kb->CreateBr(s2pLoop);
130   
131    kb->SetInsertPoint(s2pLoop);
132    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
133    blockOffsetPhi->addIncoming(ZERO, entry);
134    if (mAbortOnNull) {
135        nullCheckPhi = kb->CreatePHI(kb->getBitBlockType(), 2);
136        nullCheckPhi->addIncoming(kb->allOnes(), entry);
137    }
138    Value * bytepack[8];
139    for (unsigned i = 0; i < 8; i++) {
140        bytepack[i] = kb->loadInputStreamPack("byteStream", ZERO, kb->getInt32(i), blockOffsetPhi);
141    }
142    Value * basisbits[8];
143    s2p(kb, bytepack, basisbits, mBasisSetNumbering);
144    for (unsigned i = 0; i < mNumOfStreams; ++i) {
145        kb->storeOutputStreamBlock("basisBits", kb->getInt32(i), blockOffsetPhi, basisbits[i]);
146    }
147    if (mAbortOnNull) {
148        Value * nonNull = kb->simd_or(kb->simd_or(kb->simd_or(basisbits[0], basisbits[1]),
149                                                  kb->simd_or(basisbits[2], basisbits[3])),
150                                      kb->simd_or(kb->simd_or(basisbits[4], basisbits[5]),
151                                                  kb->simd_or(basisbits[6], basisbits[7])));
152        nonNullSoFar = kb->simd_and(nonNull, nullCheckPhi);
153        nullCheckPhi->addIncoming(nonNullSoFar, s2pLoop);
154    }
155    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
156    blockOffsetPhi->addIncoming(nextBlk, s2pLoop);
157    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
158   
159    kb->CreateCondBr(moreToDo, s2pLoop, s2pFinalize);
160   
161    kb->SetInsertPoint(s2pFinalize);
162    //  s2p is complete, except for null byte check.
163    if (mAbortOnNull) {
164        BasicBlock * nullByteDetected = kb->CreateBasicBlock("nullByteDetected");
165        BasicBlock * nullInFileDetected = kb->CreateBasicBlock("nullInFileDetected");
166        BasicBlock * s2pExit = kb->CreateBasicBlock("s2pExit");
167        Value * itemsToDo = kb->getAccessibleItemCount("byteStream");
168        Value * anyNull = kb->bitblock_any(kb->simd_not(nonNullSoFar));
169        kb->CreateCondBr(anyNull, nullByteDetected, s2pExit);
170       
171        kb->SetInsertPoint(nullByteDetected);
172        // A null byte has been detected, determine its position and whether it is past EOF.
173        Value * byteStreamBasePtr = kb->getInputStreamBlockPtr("byteStream", ZERO, ZERO);
174        Value * ptrToNull = kb->CreateMemChr(kb->CreatePointerCast(byteStreamBasePtr, voidPtrTy), kb->getInt32(0), itemsToDo);
175        Value * nullInFile = kb->CreateICmpNE(ptrToNull, ZERO);
176        kb->CreateCondBr(nullInFile, nullInFileDetected, s2pExit);
177        kb->SetInsertPoint(nullInFileDetected);
178        // A null byte has been located within the file; set the termination code and call the signal handler.
179        Value * nullPosn = kb->CreateSub(kb->CreatePtrToInt(ptrToNull, intPtrTy), kb->CreatePtrToInt(byteStreamBasePtr, intPtrTy));
180        kb->setTerminationSignal();
181        Function * const dispatcher = m->getFunction("signal_dispatcher"); assert (dispatcher);
182        Value * handler = kb->getScalarField("handler_address");
183        kb->CreateCall(dispatcher, {handler, ConstantInt::get(kb->getInt32Ty(), NULL_SIGNAL)});
184        kb->CreateBr(s2pExit);
185        kb->SetInsertPoint(s2pExit);
186        PHINode * const produced = kb->CreatePHI(kb->getSizeTy(), 3);
187        produced->addIncoming(itemsToDo, nullByteDetected);
188        produced->addIncoming(nullPosn, nullInFileDetected);
189        produced->addIncoming(itemsToDo, s2pFinalize);
190        Value * producedCount = kb->getProducedItemCount("basisBits");
191        producedCount = kb->CreateAdd(producedCount, produced);
192        kb->setProducedItemCount("basisBits", producedCount);
193    }
194}
195
196Bindings S2PKernel::makeOutputBindings(StreamSet * const BasisBits, bool abortOnNull) {
197    if (abortOnNull) {
198        return {Binding("basisBits", BasisBits, FixedRate(), Deferred())};
199    } else {
200        return {Binding("basisBits", BasisBits)};
201    }
202}
203
204Bindings S2PKernel::makeInputScalarBindings(bool abortOnNull, Scalar * signalNullObject) {
205    if (abortOnNull) {
206        return {Binding{"handler_address", signalNullObject}};
207    } else {
208        return {};
209    }
210}
211
212S2PKernel::S2PKernel(const std::unique_ptr<KernelBuilder> &, StreamSet * const codeUnitStream, StreamSet * const BasisBits, const cc::BitNumbering numbering,
213                     bool abortOnNull, Scalar * signalNullObject)
214    : MultiBlockKernel((abortOnNull ? "s2pa" : "s2p") + std::to_string(BasisBits->getNumElements()) + cc::numberingSuffix(numbering),
215{Binding{"byteStream", codeUnitStream, FixedRate(), Principal()}},
216makeOutputBindings(BasisBits, abortOnNull), makeInputScalarBindings(abortOnNull, signalNullObject), {}, {}),
217mBasisSetNumbering(numbering),
218mAbortOnNull(abortOnNull),
219mNumOfStreams(BasisBits->getNumElements()) {
220    assert (codeUnitStream->getFieldWidth() == BasisBits->getNumElements());
221    addAttribute(CanTerminateEarly());
222}
223
224inline std::string makeMultiS2PName(const StreamSets & outputStreams, const cc::BitNumbering basisNumbering, const bool aligned) {
225    std::string buffer;
226    raw_string_ostream out(buffer);
227    out << "s2p";
228    for (unsigned i = 0; i < outputStreams.size(); ++i) {
229        if (i) out << ".";
230        out << outputStreams[i]->getNumElements();
231    }
232    out << (aligned ? "a" : "u");
233    out << cc::numberingSuffix(basisNumbering);
234    out.flush();
235    return buffer;
236}
237
238S2PMultipleStreamsKernel::S2PMultipleStreamsKernel(const std::unique_ptr<kernel::KernelBuilder> & b,
239        StreamSet * codeUnitStream,
240        const StreamSets & outputStreams,
241        const cc::BitNumbering basisNumbering,
242        const bool aligned)
243: MultiBlockKernel(makeMultiS2PName(outputStreams, basisNumbering, aligned),
244// input
245{Binding{"byteStream", codeUnitStream}},
246{}, {}, {}, {}),
247mBasisSetNumbering(basisNumbering),
248mAligned(aligned) {
249    for (unsigned i = 0; i < outputStreams.size(); i++) {
250        mOutputStreamSets.emplace_back("basisBits_" + std::to_string(i), outputStreams[i]);
251    }
252}
253
254void S2PMultipleStreamsKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &b, Value *const numOfBlocks) {
255    BasicBlock * entry = b->GetInsertBlock();
256    BasicBlock * processBlock = b->CreateBasicBlock("processBlock");
257    BasicBlock * s2pDone = b->CreateBasicBlock("s2pDone");
258    Constant * const ZERO = b->getSize(0);
259
260    b->CreateBr(processBlock);
261
262    b->SetInsertPoint(processBlock);
263    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
264    blockOffsetPhi->addIncoming(ZERO, entry);
265
266    Value * bytepack[8];
267    for (unsigned i = 0; i < 8; i++) {
268        if (mAligned) {
269            bytepack[i] = b->loadInputStreamPack("byteStream", ZERO, b->getInt32(i), blockOffsetPhi);
270        } else {
271            Value * ptr = b->getInputStreamPackPtr("byteStream", ZERO, b->getInt32(i), blockOffsetPhi);
272            // CreateLoad defaults to aligned here, so we need to force the alignment to 1 byte.
273            bytepack[i] = b->CreateAlignedLoad(ptr, 1);
274        }
275    }
276    Value * basisbits[8];
277    s2p(b, bytepack, basisbits, mBasisSetNumbering);
278
279    unsigned k = 0;
280    for (unsigned i = 0; i < getNumOfStreamOutputs(); ++i) {
281        const auto m = getOutputStreamSet(i)->getNumElements();
282        for (unsigned j = 0; j < m; j++) {
283            b->storeOutputStreamBlock("basisBits_" + std::to_string(i), b->getInt32(j), blockOffsetPhi, basisbits[k++]);
284        }
285    }
286
287    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
288    blockOffsetPhi->addIncoming(nextBlk, processBlock);
289    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
290    b->CreateCondBr(moreToDo, processBlock, s2pDone);
291    b->SetInsertPoint(s2pDone);
292}
293
294
295S2P_21Kernel::S2P_21Kernel(const std::unique_ptr<KernelBuilder> &, StreamSet * const codeUnitStream, StreamSet * const BasisBits, cc::BitNumbering numbering)
296: MultiBlockKernel("s2p_21" + cc::numberingSuffix(numbering),
297{Binding{"codeUnitStream", codeUnitStream, FixedRate(), Principal()}},
298{Binding{"basisBits", BasisBits}}, {}, {}, {})
299, mBasisSetNumbering(numbering) {
300
301}
302
303void S2P_21Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfBlocks) {
304    BasicBlock * entry = kb->GetInsertBlock();
305    BasicBlock * processBlock = kb->CreateBasicBlock("s2p21_loop");
306    BasicBlock * s2pDone = kb->CreateBasicBlock("s2p21_done");
307    Constant * const ZERO = kb->getSize(0);
308   
309    kb->CreateBr(processBlock);
310
311    kb->SetInsertPoint(processBlock);
312    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
313    blockOffsetPhi->addIncoming(ZERO, entry);
314
315    Value * u32byte0[8];
316    Value * u32byte1[8];
317    Value * u32byte2[8];
318    for (unsigned i = 0; i < 8; i++) {
319        Value * UTF32units[4];
320        for (unsigned j = 0; j < 4; j++) {
321            UTF32units[j] = kb->loadInputStreamPack("codeUnitStream", ZERO, kb->getInt32(4 * i + j), blockOffsetPhi);
322        }
323        Value * u32lo16_0 = kb->hsimd_packl(32, UTF32units[0], UTF32units[1]);
324        Value * u32lo16_1 = kb->hsimd_packl(32, UTF32units[2], UTF32units[3]);
325        Value * u32hi16_0 = kb->hsimd_packh(32, UTF32units[0], UTF32units[1]);
326        Value * u32hi16_1 = kb->hsimd_packh(32, UTF32units[2], UTF32units[3]);
327        u32byte0[i] = kb->hsimd_packl(16, u32lo16_0, u32lo16_1);
328        u32byte1[i] = kb->hsimd_packh(16, u32lo16_0, u32lo16_1);
329        u32byte2[i] = kb->hsimd_packl(16, u32hi16_0, u32hi16_1);
330    #ifdef VALIDATE_U32
331        //  Validation should ensure that none of the high 11 bits are
332        //  set for any UTF-32 code unit.   We simply combine the bits
333        //  of code units together with bitwise-or, and then perform a
334        //  single check at the end.
335        u32_check = simd_or(u32_check, simd_or(u32hi16_0, u32hi16_1));
336    #endif
337    }
338    Value * basisbits[24];
339    s2p(kb, u32byte0, basisbits, cc::BitNumbering::LittleEndian);
340    s2p(kb, u32byte1, &basisbits[8], cc::BitNumbering::LittleEndian);
341    s2p(kb, u32byte2, &basisbits[16], cc::BitNumbering::LittleEndian);
342    for (unsigned i = 0; i < 21; ++i) {
343        const unsigned bitIdx = mBasisSetNumbering == cc::BitNumbering::LittleEndian ? i : 21 - i;
344        kb->storeOutputStreamBlock("basisBits", kb->getInt32(i), blockOffsetPhi, basisbits[bitIdx]);
345    }
346    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
347    blockOffsetPhi->addIncoming(nextBlk, processBlock);
348    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
349    kb->CreateCondBr(moreToDo, processBlock, s2pDone);
350    kb->SetInsertPoint(s2pDone);
351}
352
353void S2P_PabloKernel::generatePabloMethod() {
354    pablo::PabloBlock * const pb = getEntryScope();
355    const unsigned steps = std::log2(mCodeUnitWidth);
356    std::vector<PabloAST *> streamSet[steps + 1];
357    for (unsigned i = 0; i <= steps; i++) {
358        streamSet[i].resize(1<<i);
359    }
360    streamSet[0][0] = pb->createExtract(getInputStreamVar("codeUnitStream"), pb->getInteger(0));
361    unsigned streamWidth = mCodeUnitWidth;
362    for (unsigned i = 1; i <= steps; i++) {
363        for (unsigned j = 0; j < streamSet[i-1].size(); j++) {
364            auto strm = streamSet[i-1][j];
365            streamSet[i][2*j] = pb->createPackL(pb->getInteger(streamWidth), strm);
366            streamSet[i][2*j+1] = pb->createPackH(pb->getInteger(streamWidth), strm);
367        }
368        streamWidth = streamWidth/2;
369    }
370    for (unsigned bit = 0; bit < mCodeUnitWidth; bit++) {
371        const unsigned bitIndex = mBasisSetNumbering == cc::BitNumbering::LittleEndian ? bit : mCodeUnitWidth-1-bit;
372        pb->createAssign(pb->createExtract(getOutputStreamVar("basisBits"), pb->getInteger(bitIndex)), streamSet[steps][bit]);
373    }
374}
375
376S2P_PabloKernel::S2P_PabloKernel(const std::unique_ptr<kernel::KernelBuilder> & b, StreamSet * const codeUnitStream, StreamSet * const BasisBits, cc::BitNumbering numbering)
377: PabloKernel(b, "s2p_pablo" + std::to_string(codeUnitStream->getFieldWidth()) + cc::numberingSuffix(numbering),
378// input
379{Binding{"codeUnitStream", codeUnitStream}},
380// output
381{Binding{"basisBits", BasisBits}}),
382mBasisSetNumbering(numbering),
383mCodeUnitWidth(codeUnitStream->getFieldWidth()) {
384    assert (codeUnitStream->getFieldWidth() == BasisBits->getNumElements());
385}
386
387
388}
Note: See TracBrowser for help on using the repository browser.