source: icGREP/icgrep-devel/icgrep/kernels/symboltablepipeline.cpp @ 4992

Last change on this file since 4992 was 4992, checked in by nmedfort, 3 years ago

Continued work on symbol table.

File size: 48.4 KB
Line 
1#include "symboltablepipeline.h"
2
3/*
4 *  Copyright (c) 2016 International Characters.
5 *  This software is licensed to the public under the Open Software License 3.0.
6 */
7
8#include "pipeline.h"
9#include "toolchain.h"
10#include "utf_encoding.h"
11
12#include <kernels/s2p_kernel.h>
13#include <kernels/instance.h>
14
15#include <pablo/function.h>
16#include <pablo/pablo_compiler.h>
17#include <pablo/analysis/pabloverifier.hpp>
18
19#include <re/re_cc.h>
20#include <re/re_rep.h>
21#include <re/re_name.h>
22#include <re/re_compiler.h>
23#include <re/printer_re.h>
24
25#include <cc/cc_compiler.h>
26
27#include <pablo/printer_pablos.h>
28#include <iostream>
29
30#include <llvm/IR/Intrinsics.h>
31
32using namespace re;
33using namespace pablo;
34
35namespace kernel {
36
37SymbolTableBuilder::SymbolTableBuilder(Module * m, IDISA::IDISA_Builder * b)
38: mMod(m)
39, iBuilder(b)
40, mLongestLookahead(0)
41, mBitBlockType(b->getBitBlockType())
42, mBlockSize(b->getBitBlockWidth()) {
43
44}
45
46/** ------------------------------------------------------------------------------------------------------------- *
47 * @brief generateLeadingFunction
48 ** ------------------------------------------------------------------------------------------------------------- */
49PabloFunction * SymbolTableBuilder::generateLeadingFunction(const std::vector<unsigned> & endpoints) {
50    PabloFunction * const function = PabloFunction::Create("leading", 8, endpoints.size() + 2);
51    Encoding enc(Encoding::Type::ASCII, 8);
52    cc::CC_Compiler ccCompiler(*function, enc);
53    re::RE_Compiler reCompiler(*function, ccCompiler);
54    RE * cc = makeName(makeCC(makeCC(65, 90), makeCC(97, 122)));
55    reCompiler.compileUnicodeNames(cc);
56    PabloAST * const matches = reCompiler.compile(cc).stream;
57    PabloBlock * const entry = function->getEntryBlock();
58    PabloAST * const adv = entry->createAdvance(matches, 1);
59    PabloAST * const starts = entry->createAnd(matches, entry->createNot(adv));
60    PabloAST * const ends = entry->createAnd(adv, entry->createNot(matches));
61
62    function->setResult(0, entry->createAssign("S", starts));
63    function->setResult(1, entry->createAssign("E", ends));
64
65    PabloAST * M = ends;
66    unsigned step = 1;
67    unsigned i = 0;
68    for (unsigned endpoint : endpoints) {
69        assert (endpoint >= step);
70        unsigned span = endpoint - step;
71        while (span > step) {
72            M = entry->createOr(entry->createAdvance(M, step), M);
73            span = span - step;
74            step *= 2;
75        }
76        M = entry->createOr(entry->createAdvance(M, span), M);
77        function->setResult(i + 2, entry->createAssign("M" + std::to_string(i), M));
78        ++i;
79        step += span;
80    }
81
82    return function;
83}
84
85/** ------------------------------------------------------------------------------------------------------------- *
86 * @brief generateSortingFunction
87 ** ------------------------------------------------------------------------------------------------------------- */
88PabloFunction * SymbolTableBuilder::generateSortingFunction(const PabloFunction * const leading, const std::vector<unsigned> & endpoints) {
89    PabloFunction * const function = PabloFunction::Create("sorting", leading->getNumOfResults(), leading->getNumOfResults() * 2);
90    PabloBlock * entry = function->getEntryBlock();
91    function->setParameter(0, entry->createVar("S"));
92    function->setParameter(1, entry->createVar("E"));
93    for (unsigned i = 2; i < leading->getNumOfResults(); ++i) {
94        function->setParameter(i, entry->createVar("M" + std::to_string(i - 2)));
95    }
96    PabloAST * R = function->getParameter(0);
97    PabloAST * const E = entry->createNot(function->getParameter(1));
98    unsigned i = 1;
99    unsigned lowerbound = 0;
100    for (unsigned endpoint : endpoints) {
101        PabloAST * const M = function->getParameter(i + 1);
102        PabloAST * const L = entry->createLookahead(M, endpoint, "lookahead" + std::to_string(endpoint));
103        PabloAST * S = entry->createAnd(L, R);
104        Assign * Si = entry->createAssign("S_" + std::to_string(i), S);
105        PabloAST * F = entry->createScanThru(R, E);
106        Assign * Ei = entry->createAssign("E_" + std::to_string(i), F);
107        function->setResult(i * 2, Si);
108        function->setResult(i * 2 + 1, Ei);
109        R = entry->createXor(R, S);
110        ++i;
111        lowerbound = endpoint;
112    }
113    Assign * Si = entry->createAssign("S_n", R);
114    PabloAST * F = entry->createScanThru(R, E);
115    Assign * Ei = entry->createAssign("E_n", F);
116    function->setResult(i * 2, Si);
117    function->setResult(i * 2 + 1, Ei);
118    mLongestLookahead = lowerbound;
119
120    return function;
121}
122
123/** ------------------------------------------------------------------------------------------------------------- *
124 * @brief generateCountForwardZeroes
125 ** ------------------------------------------------------------------------------------------------------------- */
126inline Value * generateCountForwardZeroes(IDISA::IDISA_Builder * iBuilder, Value * bits) {
127    Value * cttzFunc = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::cttz, bits->getType());
128    return iBuilder->CreateCall(cttzFunc, std::vector<Value *>({bits, ConstantInt::get(iBuilder->getInt1Ty(), 0)}));
129}
130
131/** ------------------------------------------------------------------------------------------------------------- *
132 * @brief generateGather
133 ** ------------------------------------------------------------------------------------------------------------- */
134inline Value * SymbolTableBuilder::generateGather(Value * const base, Value * const vindex) {
135
136    /*
137        From Intel:
138
139        extern __m256i _mm256_i32gather_epi32(int const * base, __m256i vindex, const int scale)
140
141        From Clang avx2intrin.h:
142
143        #define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
144          (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
145                                               (int const *)(m),  \
146                                               (__v8si)(__m256i)(i), \
147                                               (__v8si)_mm256_set1_epi32(-1), (s)); })
148
149        From llvm IntrinsicsX86.td:
150
151        def llvm_ptr_ty        : LLVMPointerType<llvm_i8_ty>;             // i8*
152
153        def int_x86_avx2_gather_d_d_256 : GCCBuiltin<"__builtin_ia32_gatherd_d256">,
154           Intrinsic<[llvm_v8i32_ty],
155           [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
156           [IntrReadArgMem]>;
157
158     */
159
160    VectorType * const vecType = VectorType::get(iBuilder->getInt32Ty(), 8);
161    Function * const vgather = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_avx2_gather_d_d_256);
162    Constant * const ones = Constant::getAllOnesValue(vecType);
163    return iBuilder->CreateCall(vgather, {ones, base, iBuilder->CreateBitCast(vindex, vecType), ones, iBuilder->getInt8(1)});
164}
165
166/** ------------------------------------------------------------------------------------------------------------- *
167 * @brief generateMaskedGather
168 ** ------------------------------------------------------------------------------------------------------------- */
169inline Value * SymbolTableBuilder::generateMaskedGather(Value * const base, Value * const vindex, Value * const mask) {
170
171    /*
172        From Intel:
173
174        extern __m256i _mm256_mask_i32gather_epi32(__m256i def_vals, int const * base, __m256i vindex, __m256i vmask, const int scale);
175
176        From Clang avx2intrin.h:
177
178        #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
179           (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
180                                                (int const *)(m), \
181                                                (__v8si)(__m256i)(i), \
182                                                (__v8si)(__m256i)(mask), (s)); })
183        From llvm IntrinsicsX86.td:
184
185        def llvm_ptr_ty        : LLVMPointerType<llvm_i8_ty>;             // i8*
186
187        def int_x86_avx2_gather_d_d_256 : GCCBuiltin<"__builtin_ia32_gatherd_d256">,
188           Intrinsic<[llvm_v8i32_ty],
189           [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
190           [IntrReadArgMem]>;
191
192     */
193
194    VectorType * const vecType = VectorType::get(iBuilder->getInt32Ty(), 8);
195    Function * const vgather = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_avx2_gather_d_d_256);
196    return iBuilder->CreateCall(vgather, {Constant::getNullValue(vecType), base, iBuilder->CreateBitCast(vindex, vecType), iBuilder->CreateBitCast(mask, vecType), iBuilder->getInt8(1)});
197}
198
199/** ------------------------------------------------------------------------------------------------------------- *
200 * @brief generateResetLowestBit
201 ** ------------------------------------------------------------------------------------------------------------- */
202inline Value * generateResetLowestBit(IDISA::IDISA_Builder * iBuilder, Value * bits) {
203    Value * bits_minus1 = iBuilder->CreateSub(bits, ConstantInt::get(bits->getType(), 1));
204    return iBuilder->CreateAnd(bits_minus1, bits);
205}
206
207///** ------------------------------------------------------------------------------------------------------------- *
208// * @brief generateScanMatch
209// ** ------------------------------------------------------------------------------------------------------------- */
210//void SymbolTableBuilder::generateHashingKernel(KernelBuilder * kBuilder, const unsigned minKeyLength, const unsigned maxKeyLength, const unsigned scanWordBitWidth) {
211
212//    const unsigned minKeyBlockCount = (minKeyLength / 4);
213//    const unsigned maxKeyBlockCount = ((maxKeyLength + 3) / 4);
214
215//    Type * const intScanWordTy = iBuilder->getIntNTy(scanWordBitWidth);
216//    const unsigned fieldCount = iBuilder->getBitBlockWidth() / scanWordBitWidth;
217//    Type * const scanWordVectorType = VectorType::get(intScanWordTy, fieldCount);
218//    const unsigned vectorWidth = iBuilder->getBitBlockWidth() / 32;
219//    const unsigned gatherCount = vectorWidth * 4;
220//    Type * const gatherVectorType =  VectorType::get(iBuilder->getInt32Ty(), vectorWidth);
221
222//    const unsigned baseIdx = kBuilder->addInternalState(iBuilder->getInt8PtrTy(), "Base");
223//    const unsigned startIndexIdx = kBuilder->addInternalState(iBuilder->getInt32Ty(), "StartIndex");
224//    const unsigned startArrayIdx = kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), iBuilder->getBitBlockWidth() + gatherCount), "StartArray");
225//    const unsigned endIndexIdx = kBuilder->addInternalState(iBuilder->getInt32Ty(), "EndIndex");
226//    const unsigned endArrayIdx = kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), gatherCount), "EndArray");
227
228//    kBuilder->addInputStream(1, "startStream");
229//    kBuilder->addInputStream(1, "endStream");
230
231//    Function * function = kBuilder->prepareFunction();
232
233//    BasicBlock * const entry = iBuilder->GetInsertBlock();
234
235//    BasicBlock * startOuterCond = BasicBlock::Create(mMod->getContext(), "startOuterCond", function, 0);
236//    BasicBlock * startOuterBody = BasicBlock::Create(mMod->getContext(), "startOuterBody", function, 0);
237//    BasicBlock * startInnerCond = BasicBlock::Create(mMod->getContext(), "startInnerCond", function, 0);
238//    BasicBlock * startInnerBody = BasicBlock::Create(mMod->getContext(), "startInnerBody", function, 0);
239
240//    BasicBlock * endOuterCond = BasicBlock::Create(mMod->getContext(), "endOuterCond", function, 0);
241//    BasicBlock * endOuterBody = BasicBlock::Create(mMod->getContext(), "endOuterBody", function, 0);
242//    BasicBlock * endInnerCond = BasicBlock::Create(mMod->getContext(), "endInnerCond", function, 0);
243//    BasicBlock * endInnerBody = BasicBlock::Create(mMod->getContext(), "endInnerBody", function, 0);
244
245//    BasicBlock * gatherInit = BasicBlock::Create(mMod->getContext(), "gather", function, 0);
246
247//    BasicBlock * exit = BasicBlock::Create(mMod->getContext(), "exit", function, 0);
248
249//    //TODO: this won't work on files > 2^32 bytes yet; needs an intermediate flush then a recalculation of the base pointer.
250//    Value * const base = iBuilder->CreateLoad(kBuilder->getInternalState(baseIdx), "base");
251//    Value * blockPos = iBuilder->CreateLoad(kBuilder->getBlockNo());
252//    blockPos = iBuilder->CreateMul(blockPos, iBuilder->getInt64(iBuilder->getBitBlockWidth()));
253
254//    // if two positions cannot be in the same vector element, we could possibly do some work in parallel here.
255//    Value * startIndex = iBuilder->CreateLoad(kBuilder->getInternalState(startIndexIdx), "startIndex");
256//    Value * startArray = kBuilder->getInternalState(startArrayIdx);
257//    Value * startStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(0)), scanWordVectorType, "startStream");
258
259//    Value * endIndex = iBuilder->CreateLoad(kBuilder->getInternalState(endIndexIdx), "endIndex");
260//    Value * endArray = kBuilder->getInternalState(endArrayIdx);
261//    Value * endStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(1)), scanWordVectorType, "endStream");
262
263//    iBuilder->CreateBr(startOuterCond);
264
265//    // START OUTER COND
266//    iBuilder->SetInsertPoint(startOuterCond);
267//    PHINode * outerStartIndexPhi = iBuilder->CreatePHI(startIndex->getType(), 2);
268//    outerStartIndexPhi->addIncoming(startIndex, entry);
269//    PHINode * startIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
270//    startIV->addIncoming(iBuilder->getInt64(0), entry);
271//    Value * startOuterTest = iBuilder->CreateICmpNE(startIV, iBuilder->getInt64(fieldCount));
272//    iBuilder->CreateCondBr(startOuterTest, startOuterBody, endOuterCond);
273
274//    // START OUTER BODY
275//    iBuilder->SetInsertPoint(startOuterBody);
276//    Value * startField = iBuilder->CreateExtractElement(startStream, startIV);
277//    startIV->addIncoming(iBuilder->CreateAdd(startIV, iBuilder->getInt64(1)), startInnerCond);
278//    iBuilder->CreateBr(startInnerCond);
279
280//    // START INNER COND
281//    iBuilder->SetInsertPoint(startInnerCond);
282//    PHINode * innerStartIndexPhi = iBuilder->CreatePHI(startIndex->getType(), 2);
283//    innerStartIndexPhi->addIncoming(outerStartIndexPhi, startOuterBody);
284//    outerStartIndexPhi->addIncoming(innerStartIndexPhi, startInnerCond);
285//    PHINode * startFieldPhi = iBuilder->CreatePHI(intScanWordTy, 2);
286//    startFieldPhi->addIncoming(startField, startOuterBody);
287//    Value * test = iBuilder->CreateICmpNE(startFieldPhi, ConstantInt::getNullValue(intScanWordTy));
288//    iBuilder->CreateCondBr(test, startInnerBody, startOuterCond);
289
290//    // START INNER BODY
291//    iBuilder->SetInsertPoint(startInnerBody);
292//    Value * startPos = generateCountForwardZeroes(iBuilder, startFieldPhi);
293//    startFieldPhi->addIncoming(generateResetLowestBit(iBuilder, startFieldPhi), startInnerBody);
294//    startPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(startPos, blockPos), iBuilder->getInt32Ty());
295//    iBuilder->CreateStore(startPos, iBuilder->CreateGEP(startArray, {iBuilder->getInt32(0), innerStartIndexPhi}));
296//    innerStartIndexPhi->addIncoming(iBuilder->CreateAdd(innerStartIndexPhi, ConstantInt::get(innerStartIndexPhi->getType(), 1)), startInnerBody);
297//    iBuilder->CreateBr(startInnerCond);
298
299//    // END POINT OUTER COND
300//    iBuilder->SetInsertPoint(endOuterCond);
301//    PHINode * outerStartIndexPhi2 = iBuilder->CreatePHI(startIndex->getType(), 2);
302//    outerStartIndexPhi2->addIncoming(outerStartIndexPhi, startOuterCond);
303//    PHINode * endIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
304//    endIV->addIncoming(iBuilder->getInt64(0), startOuterCond);
305//    Value * endOuterTest = iBuilder->CreateICmpNE(endIV, iBuilder->getInt64(fieldCount));
306//    iBuilder->CreateCondBr(endOuterTest, endOuterBody, exit);
307
308//    // END POINT OUTER BODY
309//    iBuilder->SetInsertPoint(endOuterBody);
310//    Value * endField = iBuilder->CreateExtractElement(endStream, endIV);
311//    endIV->addIncoming(iBuilder->CreateAdd(endIV, iBuilder->getInt64(1)), endInnerCond);
312//    iBuilder->CreateBr(endInnerCond);
313
314//    // END POINT INNER COND
315//    iBuilder->SetInsertPoint(endInnerCond);
316//    innerStartIndexPhi = iBuilder->CreatePHI(startIndex->getType(), 3);
317//    innerStartIndexPhi->addIncoming(outerStartIndexPhi2, endOuterBody);
318//    innerStartIndexPhi->addIncoming(innerStartIndexPhi, endInnerBody);
319//    outerStartIndexPhi2->addIncoming(innerStartIndexPhi, endInnerCond);
320//    PHINode * endIndexPhi = iBuilder->CreatePHI(endIndex->getType(), 3);
321//    endIndexPhi->addIncoming(endIndex, endOuterBody);
322//    endIndexPhi->addIncoming(ConstantInt::getNullValue(endIndex->getType()), gatherInit);
323//    PHINode * endFieldPhi = iBuilder->CreatePHI(intScanWordTy, 3);
324//    endFieldPhi->addIncoming(endField, endOuterBody);
325//    Value * endInnerTest = iBuilder->CreateICmpNE(endFieldPhi, ConstantInt::getNullValue(intScanWordTy));
326//    iBuilder->CreateCondBr(endInnerTest, endInnerBody, endOuterCond);
327
328//    // END POINT INNER BODY
329//    iBuilder->SetInsertPoint(endInnerBody);
330//    Value * endPos = generateCountForwardZeroes(iBuilder, endFieldPhi);
331//    Value * updatedEndFieldPhi = generateResetLowestBit(iBuilder, endFieldPhi);
332//    endFieldPhi->addIncoming(updatedEndFieldPhi, endInnerBody);
333//    endFieldPhi->addIncoming(updatedEndFieldPhi, gatherInit);
334//    endPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(endPos, blockPos), iBuilder->getInt32Ty());
335//    iBuilder->CreateStore(endPos, iBuilder->CreateGEP(endArray, {iBuilder->getInt32(0), endIndexPhi}));
336//    Value * updatedEndIndexPhi = iBuilder->CreateAdd(endIndexPhi, ConstantInt::get(endIndexPhi->getType(), 1));
337//    endIndexPhi->addIncoming(updatedEndIndexPhi, endInnerBody);
338//    Value * filledEndPosBufferTest = iBuilder->CreateICmpEQ(updatedEndIndexPhi, ConstantInt::get(updatedEndIndexPhi->getType(), gatherCount));
339//    iBuilder->CreateCondBr(filledEndPosBufferTest, gatherInit, endInnerCond);
340
341//    // GATHER INIT
342//    iBuilder->SetInsertPoint(gatherInit);
343//    Value * startArrayPtr = iBuilder->CreatePointerCast(startArray, PointerType::get(gatherVectorType, 0));
344//    Value * endArrayPtr = iBuilder->CreatePointerCast(endArray, PointerType::get(gatherVectorType, 0));
345//    CallGatherFunction(base, startArrayPtr, endArrayPtr, iBuilder->getInt32(32), minKeyBlockCount, maxKeyBlockCount);
346//    // ... call hashing function ...
347//    Value * untouchedArrayPtr = iBuilder->CreatePointerCast(iBuilder->CreateGEP(startArray, iBuilder->getInt32(vectorWidth)), PointerType::get(gatherVectorType, 0));
348//    Value * untouchedCount = iBuilder->CreateSub(innerStartIndexPhi, ConstantInt::get(innerStartIndexPhi->getType(), gatherCount));
349//    iBuilder->CreateMemCpy(startArrayPtr, untouchedArrayPtr, untouchedCount, 4);
350//    innerStartIndexPhi->addIncoming(untouchedCount, gatherInit);
351//    iBuilder->CreateBr(endInnerCond);
352
353
354//    iBuilder->SetInsertPoint(exit);
355
356
357
358//    // need to save the start/end index still
359//    kBuilder->finalize();
360
361//    function->dump();
362//}
363
364/** ------------------------------------------------------------------------------------------------------------- *
365 * @brief generateGatherKernel
366 ** ------------------------------------------------------------------------------------------------------------- */
367void SymbolTableBuilder::generateGatherKernel(KernelBuilder * kBuilder, const std::vector<unsigned> & endpoints, const unsigned scanWordBitWidth) {
368
369    Type * const intScanWordTy = iBuilder->getIntNTy(scanWordBitWidth);
370    const unsigned fieldCount = iBuilder->getBitBlockWidth() / scanWordBitWidth;
371    Type * const scanWordVectorType = VectorType::get(intScanWordTy, fieldCount);
372    const unsigned vectorWidth = iBuilder->getBitBlockWidth() / 32;
373    const unsigned gatherCount = vectorWidth * 4;
374    Type * const gatherVectorType =  VectorType::get(iBuilder->getInt32Ty(), vectorWidth);
375    Type * const transposedVectorType = VectorType::get(iBuilder->getInt8Ty(), iBuilder->getBitBlockWidth() / 8);
376
377    unsigned minKeyLength = 0;
378
379    for (unsigned maxKeyLength : endpoints) {
380
381        kBuilder->addInternalState(iBuilder->getInt32Ty(), "StartIndex" + std::to_string(maxKeyLength));
382        kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), iBuilder->getBitBlockWidth() + gatherCount), "StartArray" + std::to_string(maxKeyLength));
383        kBuilder->addInternalState(iBuilder->getInt32Ty(), "EndIndex" + std::to_string(maxKeyLength));
384        kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), gatherCount), "EndArray" + std::to_string(maxKeyLength));
385
386        kBuilder->addInputStream(1, "startStream" + std::to_string(maxKeyLength));
387        kBuilder->addInputStream(1, "endStream" + std::to_string(maxKeyLength));
388
389        kBuilder->addOutputStream(maxKeyLength);
390    }
391
392    const unsigned baseIdx = kBuilder->addInternalState(iBuilder->getInt8PtrTy(), "Base");
393
394    Function * function = kBuilder->prepareFunction();
395
396    BasicBlock * const entry = iBuilder->GetInsertBlock();
397
398    BasicBlock * groupCond = BasicBlock::Create(mMod->getContext(), "groupCond", function, 0);
399    BasicBlock * groupBody = BasicBlock::Create(mMod->getContext(), "groupBody", function, 0);
400
401    BasicBlock * startOuterCond = BasicBlock::Create(mMod->getContext(), "startOuterCond", function, 0);
402    BasicBlock * startOuterBody = BasicBlock::Create(mMod->getContext(), "startOuterBody", function, 0);
403    BasicBlock * startInnerCond = BasicBlock::Create(mMod->getContext(), "startInnerCond", function, 0);
404    BasicBlock * startInnerBody = BasicBlock::Create(mMod->getContext(), "startInnerBody", function, 0);
405
406    BasicBlock * endOuterCond = BasicBlock::Create(mMod->getContext(), "endOuterCond", function, 0);
407    BasicBlock * endOuterBody = BasicBlock::Create(mMod->getContext(), "endOuterBody", function, 0);
408    BasicBlock * endInnerCond = BasicBlock::Create(mMod->getContext(), "endInnerCond", function, 0);
409    BasicBlock * endInnerBody = BasicBlock::Create(mMod->getContext(), "endInnerBody", function, 0);
410
411    BasicBlock * gather = BasicBlock::Create(mMod->getContext(), "gather", function, 0);
412
413    BasicBlock * nextGroup = BasicBlock::Create(mMod->getContext(), "nextGroup", function, 0);
414
415    BasicBlock * exit = BasicBlock::Create(mMod->getContext(), "exit", function, 0);
416
417    //TODO: this won't work on files > 2^32 bytes yet; needs an intermediate flush then a recalculation of the base pointer.
418    Value * const base = iBuilder->CreateLoad(kBuilder->getInternalState(baseIdx), "base");
419    Value * blockPos = iBuilder->CreateLoad(kBuilder->getBlockNo());
420    blockPos = iBuilder->CreateMul(blockPos, iBuilder->getInt64(iBuilder->getBitBlockWidth()));
421
422    FunctionType * const functionType = FunctionType::get(PointerType::get(transposedVectorType, 0), {iBuilder->getInt8PtrTy(), PointerType::get(gatherVectorType, 0), PointerType::get(gatherVectorType, 0), iBuilder->getInt32Ty(), PointerType::get(transposedVectorType, 0)}, false);
423    Value * gatherFunctionPtrArray = iBuilder->CreateAlloca(PointerType::get(functionType, 0), iBuilder->getInt32(endpoints.size()));
424    unsigned i = 0;
425    minKeyLength = 0;
426    for (unsigned maxKeyLength : endpoints) {
427        const unsigned minCount = (minKeyLength / 4);
428        const unsigned maxCount = ((maxKeyLength + 3) / 4);
429        Value * ptr = iBuilder->CreateGEP(gatherFunctionPtrArray, iBuilder->getInt32(i++));
430        iBuilder->CreateStore(generateGatherFunction(transposedVectorType, minCount, maxCount), ptr);
431        minKeyLength = maxKeyLength;
432    }
433    iBuilder->CreateBr(groupCond);
434
435    // GROUP COND
436    iBuilder->SetInsertPoint(groupCond);
437    PHINode * groupIV = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
438    groupIV->addIncoming(iBuilder->getInt32(0), entry);
439    Value * groupTest = iBuilder->CreateICmpNE(groupIV, iBuilder->getInt32(endpoints.size()));
440    iBuilder->CreateCondBr(groupTest, groupBody, exit);
441
442    // GROUP BODY
443    iBuilder->SetInsertPoint(groupBody);
444    // if two positions cannot be in the same vector element, we could possibly do some work in parallel here.
445    Value * input0 = iBuilder->CreateMul(groupIV, iBuilder->getInt32(2));
446    Value * startStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(input0)), scanWordVectorType, "startStream");
447    Value * input1 = iBuilder->CreateAdd(input0, iBuilder->getInt32(1));
448    Value * endStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(input1)), scanWordVectorType, "endStream");
449
450    Value * internal0 = iBuilder->CreateMul(groupIV, iBuilder->getInt32(4));
451    Value * startIndex = iBuilder->CreateLoad(kBuilder->getInternalState(internal0), "startIndex");
452    Value * internal1 = iBuilder->CreateAdd(internal0, iBuilder->getInt32(1));
453    Value * startArray = kBuilder->getInternalState(internal1);
454    Value * internal2 = iBuilder->CreateAdd(internal1, iBuilder->getInt32(1));
455    Value * endIndex = iBuilder->CreateLoad(kBuilder->getInternalState(internal2), "endIndex");
456    Value * internal3 = iBuilder->CreateAdd(internal2, iBuilder->getInt32(1));
457    Value * endArray = kBuilder->getInternalState(internal3);
458
459    Value * const buffer = kBuilder->getOutputStream(groupIV);
460
461    iBuilder->CreateBr(startOuterCond);
462
463    // START OUTER COND
464    iBuilder->SetInsertPoint(startOuterCond);
465    PHINode * startIndexPhi1 = iBuilder->CreatePHI(startIndex->getType(), 2);
466    startIndexPhi1->addIncoming(startIndex, groupBody);
467    PHINode * startIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
468    startIV->addIncoming(iBuilder->getInt64(0), groupBody);
469    Value * startOuterTest = iBuilder->CreateICmpNE(startIV, iBuilder->getInt64(fieldCount));
470    iBuilder->CreateCondBr(startOuterTest, startOuterBody, endOuterCond);
471
472    // START OUTER BODY
473    iBuilder->SetInsertPoint(startOuterBody);
474    Value * startField = iBuilder->CreateExtractElement(startStream, startIV);
475    startIV->addIncoming(iBuilder->CreateAdd(startIV, iBuilder->getInt64(1)), startInnerCond);
476    iBuilder->CreateBr(startInnerCond);
477
478    // START INNER COND
479    iBuilder->SetInsertPoint(startInnerCond);
480    PHINode * startIndexPhi3 = iBuilder->CreatePHI(startIndex->getType(), 2);
481    startIndexPhi3->addIncoming(startIndexPhi1, startOuterBody);
482    startIndexPhi1->addIncoming(startIndexPhi3, startInnerCond);
483    PHINode * startFieldPhi = iBuilder->CreatePHI(intScanWordTy, 2);
484    startFieldPhi->addIncoming(startField, startOuterBody);
485    Value * test = iBuilder->CreateICmpNE(startFieldPhi, ConstantInt::getNullValue(intScanWordTy));
486    iBuilder->CreateCondBr(test, startInnerBody, startOuterCond);
487
488    // START INNER BODY
489    iBuilder->SetInsertPoint(startInnerBody);
490    Value * startPos = generateCountForwardZeroes(iBuilder, startFieldPhi);
491    startFieldPhi->addIncoming(generateResetLowestBit(iBuilder, startFieldPhi), startInnerBody);
492    startPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(startPos, blockPos), iBuilder->getInt32Ty());
493    iBuilder->CreateStore(startPos, iBuilder->CreateGEP(startArray, {iBuilder->getInt32(0), startIndexPhi3}));
494    startIndexPhi3->addIncoming(iBuilder->CreateAdd(startIndexPhi3, ConstantInt::get(startIndexPhi3->getType(), 1)), startInnerBody);
495    iBuilder->CreateBr(startInnerCond);
496
497    // END POINT OUTER COND
498    iBuilder->SetInsertPoint(endOuterCond);
499    PHINode * endIndexPhi1 = iBuilder->CreatePHI(endIndex->getType(), 2);
500    endIndexPhi1->addIncoming(endIndex, startOuterCond);
501    PHINode * startIndexPhi2 = iBuilder->CreatePHI(startIndex->getType(), 2);
502    startIndexPhi2->addIncoming(startIndexPhi1, startOuterCond);
503    PHINode * endIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
504    endIV->addIncoming(iBuilder->getInt64(0), startOuterCond);
505    Value * endOuterTest = iBuilder->CreateICmpNE(endIV, iBuilder->getInt64(fieldCount));
506    iBuilder->CreateCondBr(endOuterTest, endOuterBody, nextGroup);
507
508    // END POINT OUTER BODY
509    iBuilder->SetInsertPoint(endOuterBody);
510    Value * endField = iBuilder->CreateExtractElement(endStream, endIV);
511    endIV->addIncoming(iBuilder->CreateAdd(endIV, iBuilder->getInt64(1)), endInnerCond);
512    iBuilder->CreateBr(endInnerCond);
513
514    // END POINT INNER COND
515    iBuilder->SetInsertPoint(endInnerCond);
516    startIndexPhi3 = iBuilder->CreatePHI(startIndexPhi2->getType(), 3);
517    startIndexPhi3->addIncoming(startIndexPhi2, endOuterBody);
518    startIndexPhi3->addIncoming(startIndexPhi3, endInnerBody);
519    startIndexPhi2->addIncoming(startIndexPhi3, endInnerCond);
520    PHINode * endIndexPhi2 = iBuilder->CreatePHI(endIndex->getType(), 3);
521    endIndexPhi2->addIncoming(endIndexPhi1, endOuterBody);
522    endIndexPhi1->addIncoming(endIndexPhi2, endInnerCond);
523    endIndexPhi2->addIncoming(ConstantInt::getNullValue(endIndex->getType()), gather);
524    PHINode * endFieldPhi = iBuilder->CreatePHI(intScanWordTy, 3);
525    endFieldPhi->addIncoming(endField, endOuterBody);
526    Value * endInnerTest = iBuilder->CreateICmpNE(endFieldPhi, ConstantInt::getNullValue(intScanWordTy));
527    iBuilder->CreateCondBr(endInnerTest, endInnerBody, endOuterCond);
528
529    // END POINT INNER BODY
530    iBuilder->SetInsertPoint(endInnerBody);
531    Value * endPos = generateCountForwardZeroes(iBuilder, endFieldPhi);
532    Value * updatedEndFieldPhi = generateResetLowestBit(iBuilder, endFieldPhi);
533    endFieldPhi->addIncoming(updatedEndFieldPhi, endInnerBody);
534    endFieldPhi->addIncoming(updatedEndFieldPhi, gather);
535    endPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(endPos, blockPos), iBuilder->getInt32Ty());
536    iBuilder->CreateStore(endPos, iBuilder->CreateGEP(endArray, {iBuilder->getInt32(0), endIndexPhi2}));
537    Value * updatedEndIndexPhi = iBuilder->CreateAdd(endIndexPhi2, ConstantInt::get(endIndexPhi2->getType(), 1));
538    endIndexPhi2->addIncoming(updatedEndIndexPhi, endInnerBody);
539    Value * filledEndPosBufferTest = iBuilder->CreateICmpEQ(updatedEndIndexPhi, ConstantInt::get(updatedEndIndexPhi->getType(), gatherCount));
540    iBuilder->CreateCondBr(filledEndPosBufferTest, gather, endInnerCond);
541
542    // GATHER
543    iBuilder->SetInsertPoint(gather);
544    Value * startArrayPtr = iBuilder->CreatePointerCast(startArray, PointerType::get(gatherVectorType, 0));
545    Value * endArrayPtr = iBuilder->CreatePointerCast(endArray, PointerType::get(gatherVectorType, 0));
546    Value * const bufferPtr = iBuilder->CreatePointerCast(buffer, PointerType::get(transposedVectorType, 0));
547    Value * gatherFunctionPtr = iBuilder->CreateLoad(iBuilder->CreateGEP(gatherFunctionPtrArray, groupIV));
548    iBuilder->CreateCall5(gatherFunctionPtr, base, startArrayPtr, endArrayPtr, iBuilder->getInt32(32), bufferPtr);
549
550    // ... call hashing function ...
551    Value * untouchedArrayPtr = iBuilder->CreatePointerCast(iBuilder->CreateGEP(startArray, iBuilder->getInt32(vectorWidth)), PointerType::get(gatherVectorType, 0));
552    Value * untouchedCount = iBuilder->CreateSub(startIndexPhi3, ConstantInt::get(startIndexPhi3->getType(), gatherCount));
553    iBuilder->CreateMemCpy(startArrayPtr, untouchedArrayPtr, untouchedCount, 4);
554    startIndexPhi3->addIncoming(untouchedCount, gather);
555    iBuilder->CreateBr(endInnerCond);
556
557    // NEXT GROUP
558    iBuilder->SetInsertPoint(nextGroup);
559    kBuilder->setInternalState(internal0, startIndexPhi2);
560    kBuilder->setInternalState(internal2, endIndexPhi1);
561    groupIV->addIncoming(iBuilder->CreateAdd(groupIV, ConstantInt::get(groupIV->getType(), 1)), nextGroup);
562    iBuilder->CreateBr(groupCond);
563
564    iBuilder->SetInsertPoint(exit);
565    kBuilder->finalize();
566}
567
568/** ------------------------------------------------------------------------------------------------------------- *
569 * @brief generateGatherFunction
570 ** ------------------------------------------------------------------------------------------------------------- */
571Function * SymbolTableBuilder::generateGatherFunction(Type * const resultType, const unsigned minCount, const unsigned maxCount) {
572
573    assert (maxCount > minCount);
574
575    const std::string functionName = "gather_" + std::to_string(minCount) + "_" + std::to_string(maxCount);
576    Function * function = mMod->getFunction(functionName);
577    if (function == nullptr) {
578
579        const auto ip = iBuilder->saveIP();
580
581        const unsigned vectorWidth = iBuilder->getBitBlockWidth() / 32;
582        Type * const gatherVectorType =  VectorType::get(iBuilder->getInt32Ty(), vectorWidth);
583        Type * const gatherVectorArrayType = ArrayType::get(gatherVectorType, maxCount);
584
585        FunctionType * const functionType = FunctionType::get(PointerType::get(resultType, 0), {iBuilder->getInt8PtrTy(), PointerType::get(gatherVectorType, 0), PointerType::get(gatherVectorType, 0), iBuilder->getInt32Ty(), PointerType::get(resultType, 0)}, false);
586        function = Function::Create(functionType, GlobalValue::ExternalLinkage, functionName, mMod);
587        function->setCallingConv(CallingConv::C);
588        function->setDoesNotCapture(1);
589        function->setDoesNotCapture(2);
590        function->setDoesNotCapture(3);
591        function->setDoesNotThrow();
592
593        Function::arg_iterator args = function->arg_begin();
594        Value * const base = args++;
595        base->setName("base");
596        Value * const startArray = args++;
597        startArray->setName("startArray");
598        Value * const endArray = args++;
599        endArray->setName("endArray");
600        Value * const count = args++;
601        count->setName("count");
602        Value * const transposedBuffer = args++;
603        transposedBuffer->setName("buffer");
604
605        BasicBlock * entry = BasicBlock::Create(mMod->getContext(), "entry", function, 0);
606        BasicBlock * gatherCond = BasicBlock::Create(mMod->getContext(), "gatherCond", function, 0);
607        BasicBlock * partialGatherCond = BasicBlock::Create(mMod->getContext(), "partialGatherCond", function, 0);
608        BasicBlock * partialGatherBody = BasicBlock::Create(mMod->getContext(), "partialGatherBody", function, 0);
609        BasicBlock * gatherBody = BasicBlock::Create(mMod->getContext(), "gatherBody", function, 0);
610        BasicBlock * transposeCond = BasicBlock::Create(mMod->getContext(), "transposeCond", function, 0);
611        BasicBlock * transposeBody = BasicBlock::Create(mMod->getContext(), "transposeBody", function, 0);
612        BasicBlock * exit = BasicBlock::Create(mMod->getContext(), "exit", function, 0);
613
614        Value * const four = iBuilder->CreateVectorSplat(vectorWidth, iBuilder->getInt32(4));
615
616        // ENTRY
617        iBuilder->SetInsertPoint(entry);
618        Value * const untransposedBuffer = iBuilder->CreateAlloca(gatherVectorArrayType, iBuilder->getInt32(4), "untransposedBuffer");
619        iBuilder->CreateBr(gatherCond);
620
621        // FULL GATHER COND
622        iBuilder->SetInsertPoint(gatherCond);
623        PHINode * remainingLanes = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
624        remainingLanes->addIncoming(count, entry);
625        PHINode * gatherIV = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
626        gatherIV->addIncoming(iBuilder->getInt32(0), entry);
627        Value * gatherLoopTest = iBuilder->CreateICmpNE(gatherIV, iBuilder->getInt32(4));
628        iBuilder->CreateCondBr(gatherLoopTest, partialGatherCond, transposeCond);
629
630        // PARTIAL GATHER COND
631        iBuilder->SetInsertPoint(partialGatherCond);
632        Value * partialGatherLoopTest = iBuilder->CreateICmpSGE(remainingLanes, iBuilder->getInt32(vectorWidth));
633        iBuilder->CreateCondBr(partialGatherLoopTest, gatherBody, partialGatherBody);
634
635        // PARTIAL GATHER BODY
636        iBuilder->SetInsertPoint(partialGatherBody);
637        Type * registerType = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
638        Value * maskedLanes = iBuilder->CreateSub(iBuilder->getInt32(vectorWidth), remainingLanes);
639        maskedLanes = iBuilder->CreateMul(maskedLanes, iBuilder->getInt32(32));
640        maskedLanes = iBuilder->CreateZExt(maskedLanes, registerType);
641        maskedLanes = iBuilder->CreateLShr(Constant::getAllOnesValue(registerType), maskedLanes);
642        maskedLanes = iBuilder->CreateBitCast(maskedLanes, gatherVectorType);
643
644        iBuilder->CreateBr(gatherBody);
645
646        // FULL GATHER BODY
647        iBuilder->SetInsertPoint(gatherBody);
648        PHINode * activeLanes = iBuilder->CreatePHI(gatherVectorType, 2, "activeLanes");
649        activeLanes->addIncoming(Constant::getAllOnesValue(gatherVectorType), partialGatherCond);
650        activeLanes->addIncoming(maskedLanes, partialGatherBody);
651
652        Value * startPos = iBuilder->CreateAlignedLoad(iBuilder->CreateGEP(startArray, gatherIV), 4);
653        for (unsigned blockCount = 0; blockCount < minCount; ++blockCount) {
654            Value * tokenData = generateMaskedGather(base, startPos, activeLanes);
655            startPos = iBuilder->CreateAdd(startPos, four);
656            iBuilder->CreateAlignedStore(tokenData, iBuilder->CreateGEP(untransposedBuffer, {iBuilder->getInt32(blockCount), gatherIV}), 4);
657        }
658
659        Value * const endPos = iBuilder->CreateAlignedLoad(iBuilder->CreateGEP(endArray, gatherIV), 4);
660        for (unsigned blockCount = minCount; blockCount < maxCount; ++blockCount) {
661            // if we have not fully gathered the data for this key
662            Value * atLeastOneByte = iBuilder->CreateAnd(iBuilder->CreateSExt(iBuilder->CreateICmpULT(startPos, endPos), startPos->getType()), activeLanes);
663            // gather it ...
664            Value * tokenData = generateMaskedGather(base, startPos, atLeastOneByte);
665            // and compute how much data is remaining.
666            Value * remaining = iBuilder->CreateSub(endPos, startPos);
667            // if this token only has 1 to 3 bytes remaining ...
668            Value * lessThanFourBytes = iBuilder->CreateSExt(iBuilder->CreateICmpSLT(remaining, four), remaining->getType());
669            Value * betweenOneAndThreeBytes = iBuilder->CreateAnd(atLeastOneByte, lessThanFourBytes);
670            // determine how many bytes (bits?) do *not* belong to the token
671            remaining = iBuilder->CreateSub(four, iBuilder->CreateAnd(remaining, betweenOneAndThreeBytes));
672            // remaining = iBuilder->CreateShl(remaining, ConstantInt::get(remaining->getType(), 3));
673            // then mask them out prior to storing the value
674            Value * partialTokenMask = iBuilder->CreateLShr(ConstantInt::getAllOnesValue(remaining->getType()), remaining);
675            tokenData = iBuilder->CreateAnd(partialTokenMask, tokenData);
676            iBuilder->CreateAlignedStore(tokenData, iBuilder->CreateGEP(untransposedBuffer, {iBuilder->getInt32(blockCount), gatherIV}), 4);
677            if (blockCount < (maxCount - 1)) {
678                startPos = iBuilder->CreateAdd(startPos, four);
679            }
680        }
681        gatherIV->addIncoming(iBuilder->CreateAdd(gatherIV, iBuilder->getInt32(1)), gatherBody);
682        remainingLanes->addIncoming(iBuilder->CreateSub(remainingLanes, iBuilder->getInt32(vectorWidth)), gatherBody);
683        iBuilder->CreateBr(gatherCond);
684
685        // TRANSPOSE COND
686        iBuilder->SetInsertPoint(transposeCond);
687        PHINode * transposeIV = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
688        transposeIV->addIncoming(iBuilder->getInt32(0), gatherCond);
689        Value * transposeLoopTest = iBuilder->CreateICmpNE(transposeIV, iBuilder->getInt32(maxCount));
690        iBuilder->CreateCondBr(transposeLoopTest, transposeBody, exit);
691
692        // TRANSPOSE BODY
693        iBuilder->SetInsertPoint(transposeBody);
694
695        Value * value[4];
696        Value * temporary[4];
697        for (unsigned i = 0; i < 4; ++i) {
698            Value * const ptr = iBuilder->CreateGEP(untransposedBuffer, {transposeIV, iBuilder->getInt32(i)});
699            value[i] = iBuilder->CreateAlignedLoad(ptr, 4);
700        }
701        for (unsigned fieldWidth = 16; fieldWidth != 4; fieldWidth /= 2) {
702            const unsigned fieldCount = iBuilder->getBitBlockWidth() / fieldWidth;
703            VectorType * const vecType = VectorType::get(IntegerType::get(mMod->getContext(), fieldWidth), fieldCount);
704            std::vector<Constant *> lowFields(fieldCount);
705            std::vector<Constant *> highFields(fieldCount);
706            for (unsigned j = 0; j < fieldCount; ++j) {
707                lowFields[j] = iBuilder->getInt32(j * 2);
708                highFields[j] = iBuilder->getInt32(j * 2 + 1);
709            }
710            Constant * const lowVector = ConstantVector::get(lowFields);
711            Constant * const highVector = ConstantVector::get(highFields);
712            for (unsigned i = 0; i < 4; i += 2) {
713                value[i] = iBuilder->CreateBitCast(value[i], vecType);
714                value[i + 1] = iBuilder->CreateBitCast(value[i + 1], vecType);
715                temporary[i / 2] = iBuilder->CreateShuffleVector(value[i], value[i + 1], lowVector);
716                temporary[(i / 2) + 2] = iBuilder->CreateShuffleVector(value[i], value[i + 1], highVector);
717            }
718            std::swap(value, temporary);
719        }
720        Value * offset = iBuilder->CreateShl(transposeIV, ConstantInt::get(transposeIV->getType(), 2));
721        for (unsigned i = 0; i < 4; ++i) {
722            Value * index = offset;
723            if (i) {
724                index = iBuilder->CreateOr(offset, iBuilder->getInt32(i));
725            }
726            Value * ptr = iBuilder->CreateGEP(transposedBuffer, index);
727            iBuilder->CreateAlignedStore(value[i], ptr, 4);
728        }
729        transposeIV->addIncoming(iBuilder->CreateAdd(transposeIV, iBuilder->getInt32(1)), transposeBody);
730        iBuilder->CreateBr(transposeCond);
731
732        // EXIT
733        iBuilder->SetInsertPoint(exit);
734        iBuilder->CreateRet(transposedBuffer);
735
736        iBuilder->restoreIP(ip);
737    }
738
739    return function;
740}
741
742
743/** ------------------------------------------------------------------------------------------------------------- *
744 * @brief createKernels
745 ** ------------------------------------------------------------------------------------------------------------- */
746void SymbolTableBuilder::createKernels() {
747
748    std::vector<unsigned> endpoints;
749    endpoints.push_back(1);
750    endpoints.push_back(2);
751    endpoints.push_back(4);
752    endpoints.push_back(8);
753    endpoints.push_back(16);
754
755    PabloCompiler pablo_compiler(mMod, iBuilder);
756    PabloFunction * const leading = generateLeadingFunction(endpoints);
757    PabloFunction * const sorting = generateSortingFunction(leading, endpoints);
758
759    const auto bufferSize = ((mLongestLookahead + iBuilder->getBitBlockWidth() - 1) / iBuilder->getBitBlockWidth()) + 1;
760
761    mS2PKernel = new KernelBuilder("s2p", mMod, iBuilder, 1);
762    mLeadingKernel = new KernelBuilder("leading", mMod, iBuilder, bufferSize);
763    mSortingKernel = new KernelBuilder("sorting", mMod, iBuilder, bufferSize);
764    mGatherKernel = new KernelBuilder("gathering", mMod, iBuilder, 1);
765
766    generateS2PKernel(mMod, iBuilder, mS2PKernel);
767
768    pablo_compiler.setKernel(mLeadingKernel);
769    pablo_compiler.compile(leading);
770    pablo_compiler.setKernel(mSortingKernel);
771    pablo_compiler.compile(sorting);
772
773    delete leading;
774    delete sorting;
775
776    releaseSlabAllocatorMemory();
777
778    generateGatherKernel(mGatherKernel, endpoints, 64);
779
780}
781
782Function * SymbolTableBuilder::ExecuteKernels(){
783
784    Type * intType = iBuilder->getInt64Ty();
785
786    Type * inputType = PointerType::get(ArrayType::get(StructType::get(mMod->getContext(), std::vector<Type *>({ArrayType::get(mBitBlockType, 8)})), 1), 0);
787    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", Type::getVoidTy(mMod->getContext()), inputType, intType, nullptr));
788    main->setCallingConv(CallingConv::C);
789    Function::arg_iterator args = main->arg_begin();
790
791    Value * const inputStream = args++;
792    inputStream->setName("inputStream");
793
794    Value * const bufferSize = args++;
795    bufferSize->setName("bufferSize");
796
797    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
798
799    BasicBlock * entryBlock = iBuilder->GetInsertBlock();
800
801    BasicBlock * leadingTestBlock = BasicBlock::Create(mMod->getContext(), "leadingCond", main, 0);
802    BasicBlock * safetyCheckBlock = BasicBlock::Create(mMod->getContext(), "safetyCheck", main, 0);
803    BasicBlock * leadingBodyBlock = BasicBlock::Create(mMod->getContext(), "leadingBody", main, 0);
804
805    BasicBlock * regularTestBlock = BasicBlock::Create(mMod->getContext(), "fullCond", main, 0);
806    BasicBlock * regularBodyBlock = BasicBlock::Create(mMod->getContext(), "fullBody", main, 0);
807    BasicBlock * regularExitBlock = BasicBlock::Create(mMod->getContext(), "fullExit", main, 0);
808
809    BasicBlock * partialBlock = BasicBlock::Create(mMod->getContext(),  "partialBlock", main, 0);
810
811    BasicBlock * finalTestBlock = BasicBlock::Create(mMod->getContext(),  "finalCond", main, 0);
812    BasicBlock * finalBodyBlock = BasicBlock::Create(mMod->getContext(),  "finalBody", main, 0);
813
814    BasicBlock * exitBlock = BasicBlock::Create(mMod->getContext(), "exit", main, 0);
815
816    Instance * s2pInstance = mS2PKernel->instantiate(inputStream);
817    Instance * leadingInstance = mLeadingKernel->instantiate(s2pInstance->getOutputStreamSet());
818    Instance * sortingInstance = mSortingKernel->instantiate(leadingInstance->getOutputStreamSet());
819
820    const unsigned leadingBlocks = (mLongestLookahead + iBuilder->getBitBlockWidth() - 1) / iBuilder->getBitBlockWidth();
821
822    Value * const requiredBytes = iBuilder->getInt64(mBlockSize * leadingBlocks);
823    Value * const blockSize = iBuilder->getInt64(mBlockSize);
824
825    // If the buffer size is smaller than our largest length group, only check up to the buffer size.
826    Value * safetyCheck = iBuilder->CreateICmpUGE(bufferSize, blockSize);
827    if (blockSize == requiredBytes) {
828        iBuilder->CreateCondBr(safetyCheck, leadingTestBlock, exitBlock); // fix this to be a special case
829    } else {
830        throw std::runtime_error("Not supported yet!");
831    }
832
833    // First compute any necessary leading blocks to allow the sorting kernel access to the "future" data produced by
834    // the leading kernel ...
835    iBuilder->SetInsertPoint(leadingTestBlock);
836    PHINode * blockNo = iBuilder->CreatePHI(intType, 2);
837    blockNo->addIncoming(iBuilder->getInt64(0), entryBlock);
838    PHINode * remainingBytes = iBuilder->CreatePHI(intType, 2);
839    remainingBytes->addIncoming(bufferSize, entryBlock);
840    Value * leadingBlocksCond = iBuilder->CreateICmpULT(blockNo, iBuilder->getInt64(leadingBlocks));
841    iBuilder->CreateCondBr(leadingBlocksCond, safetyCheckBlock, regularTestBlock);
842
843    iBuilder->SetInsertPoint(safetyCheckBlock);
844    Value * safetyCheckCond = iBuilder->CreateICmpULT(remainingBytes, blockSize);
845    iBuilder->CreateCondBr(safetyCheckCond, regularExitBlock, leadingBodyBlock);
846
847    iBuilder->SetInsertPoint(leadingBodyBlock);
848    s2pInstance->CreateDoBlockCall();
849    leadingInstance->CreateDoBlockCall();
850    blockNo->addIncoming(iBuilder->CreateAdd(blockNo, iBuilder->getInt64(1)), leadingBodyBlock);
851    remainingBytes->addIncoming(iBuilder->CreateSub(remainingBytes, blockSize), leadingBodyBlock);
852    iBuilder->CreateBr(leadingTestBlock);
853
854    // Now all the data for which we can produce and consume a full leading block...
855    iBuilder->SetInsertPoint(regularTestBlock);
856    PHINode * remainingBytes2 = iBuilder->CreatePHI(intType, 2);
857    remainingBytes2->addIncoming(remainingBytes, leadingTestBlock);
858    Value * remainingBytesCond = iBuilder->CreateICmpULT(remainingBytes2, requiredBytes);
859    iBuilder->CreateCondBr(remainingBytesCond, regularExitBlock, regularBodyBlock);
860    iBuilder->SetInsertPoint(regularBodyBlock);
861    s2pInstance->CreateDoBlockCall();
862    leadingInstance->CreateDoBlockCall();
863    sortingInstance->CreateDoBlockCall();
864    remainingBytes2->addIncoming(iBuilder->CreateSub(remainingBytes2, blockSize), regularBodyBlock);
865    iBuilder->CreateBr(regularTestBlock);
866
867    // Check if we have a partial blocks worth of leading data remaining
868    iBuilder->SetInsertPoint(regularExitBlock);
869    PHINode * remainingBytes3 = iBuilder->CreatePHI(intType, 2);
870    remainingBytes3->addIncoming(remainingBytes, safetyCheckBlock);
871    remainingBytes3->addIncoming(remainingBytes2, regularTestBlock);
872    Value * partialBlockCond = iBuilder->CreateICmpNE(remainingBytes3, ConstantInt::getNullValue(intType));
873    iBuilder->CreateCondBr(partialBlockCond, finalTestBlock, partialBlock);
874
875    // If we do, process it and mask out the data
876    iBuilder->SetInsertPoint(partialBlock);
877    s2pInstance->CreateDoBlockCall();
878    leadingInstance->CreateDoBlockCall();
879    leadingInstance->clearOutputStreamSet();
880    sortingInstance->CreateDoBlockCall();
881    iBuilder->CreateBr(finalTestBlock);
882
883    // Now clear the leading data and test the final blocks
884    iBuilder->SetInsertPoint(finalTestBlock);
885    PHINode * remainingFullBlocks = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 3);
886    remainingFullBlocks->addIncoming(iBuilder->getInt64(leadingBlocks), regularExitBlock);
887    remainingFullBlocks->addIncoming(iBuilder->getInt64(leadingBlocks), partialBlock);
888    Value * remainingFullBlocksCond = iBuilder->CreateICmpUGT(remainingFullBlocks, ConstantInt::getNullValue(intType));
889    iBuilder->CreateCondBr(remainingFullBlocksCond, finalBodyBlock, exitBlock);
890
891    iBuilder->SetInsertPoint(finalBodyBlock);
892    leadingInstance->clearOutputStreamSet();
893    sortingInstance->CreateDoBlockCall();
894    remainingFullBlocks->addIncoming(iBuilder->CreateSub(remainingFullBlocks, iBuilder->getInt64(1)), finalBodyBlock);
895
896    iBuilder->CreateBr(finalTestBlock);
897    iBuilder->SetInsertPoint(exitBlock);
898    iBuilder->CreateRetVoid();
899
900    return main;
901}
902
903SymbolTableBuilder::~SymbolTableBuilder() {
904    delete mS2PKernel;
905    delete mLeadingKernel;
906    delete mSortingKernel;
907    delete mGatherKernel;
908}
909
910
911}
Note: See TracBrowser for help on using the repository browser.