source: icGREP/icgrep-devel/icgrep/kernels/symboltablepipeline.cpp @ 4995

Last change on this file since 4995 was 4995, checked in by nmedfort, 3 years ago

More work on symbol table; unexpected bug with 4KiB-one page tests observed.

File size: 41.1 KB
Line 
1#include "symboltablepipeline.h"
2
3/*
4 *  Copyright (c) 2016 International Characters.
5 *  This software is licensed to the public under the Open Software License 3.0.
6 */
7
8#include "pipeline.h"
9#include "toolchain.h"
10#include "utf_encoding.h"
11
12#include <kernels/s2p_kernel.h>
13#include <kernels/instance.h>
14#include <kernels/stdout_kernel.h>
15
16#include <pablo/function.h>
17#include <pablo/pablo_compiler.h>
18#include <pablo/analysis/pabloverifier.hpp>
19
20#include <re/re_cc.h>
21#include <re/re_rep.h>
22#include <re/re_name.h>
23#include <re/re_compiler.h>
24#include <re/printer_re.h>
25
26#include <cc/cc_compiler.h>
27
28#include <pablo/printer_pablos.h>
29#include <iostream>
30
31#include <llvm/IR/Intrinsics.h>
32
33using namespace re;
34using namespace pablo;
35
36namespace kernel {
37
38SymbolTableBuilder::SymbolTableBuilder(Module * m, IDISA::IDISA_Builder * b)
39: mMod(m)
40, iBuilder(b)
41, mLongestLookahead(0)
42, mBitBlockType(b->getBitBlockType())
43, mBlockSize(b->getBitBlockWidth()) {
44
45}
46
47/** ------------------------------------------------------------------------------------------------------------- *
48 * @brief generateLeadingFunction
49 ** ------------------------------------------------------------------------------------------------------------- */
50PabloFunction * SymbolTableBuilder::generateLeadingFunction(const std::vector<unsigned> & endpoints) {
51    PabloFunction * const function = PabloFunction::Create("leading", 8, endpoints.size() + 2);
52    Encoding enc(Encoding::Type::ASCII, 8);
53    cc::CC_Compiler ccCompiler(*function, enc);
54    re::RE_Compiler reCompiler(*function, ccCompiler);
55    RE * cc = makeName(makeCC(makeCC(makeCC('a', 'z'), makeCC('A', 'Z')), makeCC('0', '9')));
56    reCompiler.compileUnicodeNames(cc);
57    PabloAST * const matches = reCompiler.compile(cc).stream;
58    PabloBlock * const entry = function->getEntryBlock();
59    PabloAST * const adv = entry->createAdvance(matches, 1);
60    PabloAST * const starts = entry->createAnd(matches, entry->createNot(adv));
61    PabloAST * const ends = entry->createAnd(adv, entry->createNot(matches));
62
63    function->setResult(0, entry->createAssign("l.S", starts));
64    function->setResult(1, entry->createAssign("l.E", ends));
65
66    PabloAST * M = ends;
67    unsigned step = 1;
68    unsigned i = 0;
69    for (unsigned endpoint : endpoints) {
70        assert (endpoint >= step);
71        unsigned span = endpoint - step;
72        while (span > step) {
73            M = entry->createOr(entry->createAdvance(M, step), M);
74            span = span - step;
75            step *= 2;
76        }
77        M = entry->createOr(entry->createAdvance(M, span), M);
78        function->setResult(i + 2, entry->createAssign("l.M" + std::to_string(i), M));
79        ++i;
80        step += span;
81    }
82
83    return function;
84}
85
86/** ------------------------------------------------------------------------------------------------------------- *
87 * @brief generateSortingFunction
88 ** ------------------------------------------------------------------------------------------------------------- */
89PabloFunction * SymbolTableBuilder::generateSortingFunction(const PabloFunction * const leading, const std::vector<unsigned> & endpoints) {
90    PabloFunction * const function = PabloFunction::Create("sorting", leading->getNumOfResults(), (leading->getNumOfResults() - 1) * 2);
91    PabloBlock * entry = function->getEntryBlock();
92    function->setParameter(0, entry->createVar("l.S"));
93    function->setParameter(1, entry->createVar("l.E"));
94    for (unsigned i = 2; i < leading->getNumOfResults(); ++i) {
95        function->setParameter(i, entry->createVar("l.M" + std::to_string(i - 2)));
96    }
97    PabloAST * R = function->getParameter(0);
98    PabloAST * const E = entry->createNot(function->getParameter(1));
99    unsigned i = 0;
100    unsigned lowerbound = 0;
101    for (unsigned endpoint : endpoints) {
102        PabloAST * const M = function->getParameter(i + 2);
103        PabloAST * const L = entry->createLookahead(M, endpoint, "lookahead" + std::to_string(endpoint));
104        PabloAST * S = entry->createAnd(L, R);
105        Assign * Si = entry->createAssign("s.S_" + std::to_string(i + 1), S);
106        PabloAST * F = entry->createScanThru(S, E);
107        Assign * Ei = entry->createAssign("s.E_" + std::to_string(i + 1), F);
108        function->setResult(i * 2, Si);
109        function->setResult(i * 2 + 1, Ei);
110        R = entry->createXor(R, S);
111        ++i;
112        lowerbound = endpoint;
113    }
114    Assign * Si = entry->createAssign("s.S_n", R);
115    PabloAST * F = entry->createScanThru(R, E);
116    Assign * Ei = entry->createAssign("s.E_n", F);
117    function->setResult(i * 2, Si);
118    function->setResult(i * 2 + 1, Ei);
119    mLongestLookahead = lowerbound;
120
121    return function;
122}
123
124/** ------------------------------------------------------------------------------------------------------------- *
125 * @brief generateCountForwardZeroes
126 ** ------------------------------------------------------------------------------------------------------------- */
127inline Value * generateCountForwardZeroes(IDISA::IDISA_Builder * iBuilder, Value * bits) {
128    Value * cttzFunc = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::cttz, bits->getType());
129    return iBuilder->CreateCall(cttzFunc, std::vector<Value *>({bits, ConstantInt::get(iBuilder->getInt1Ty(), 0)}));
130}
131
132/** ------------------------------------------------------------------------------------------------------------- *
133 * @brief generateMaskedGather
134 ** ------------------------------------------------------------------------------------------------------------- */
135inline Value * SymbolTableBuilder::generateMaskedGather(Value * const base, Value * const vindex, Value * const mask) {
136
137    /*
138        From Intel:
139
140        extern __m256i _mm256_mask_i32gather_epi32(__m256i def_vals, int const * base, __m256i vindex, __m256i vmask, const int scale);
141
142        From Clang avx2intrin.h:
143
144        #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
145           (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
146                                                (int const *)(m), \
147                                                (__v8si)(__m256i)(i), \
148                                                (__v8si)(__m256i)(mask), (s)); })
149        From llvm IntrinsicsX86.td:
150
151        def llvm_ptr_ty        : LLVMPointerType<llvm_i8_ty>;             // i8*
152
153        def int_x86_avx2_gather_d_d_256 : GCCBuiltin<"__builtin_ia32_gatherd_d256">,
154           Intrinsic<[llvm_v8i32_ty],
155           [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
156           [IntrReadArgMem]>;
157
158     */
159
160    VectorType * const vecType = VectorType::get(iBuilder->getInt32Ty(), 8);
161    Function * const vgather = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_avx2_gather_d_d_256);
162    return iBuilder->CreateCall(vgather, {Constant::getNullValue(vecType), base, iBuilder->CreateBitCast(vindex, vecType), iBuilder->CreateBitCast(mask, vecType), iBuilder->getInt8(1)});
163}
164
165/** ------------------------------------------------------------------------------------------------------------- *
166 * @brief generateResetLowestBit
167 ** ------------------------------------------------------------------------------------------------------------- */
168inline Value * generateResetLowestBit(IDISA::IDISA_Builder * iBuilder, Value * bits) {
169    Value * bits_minus1 = iBuilder->CreateSub(bits, ConstantInt::get(bits->getType(), 1));
170    return iBuilder->CreateAnd(bits_minus1, bits);
171}
172
173/** ------------------------------------------------------------------------------------------------------------- *
174 * @brief generateGatherKernel
175 ** ------------------------------------------------------------------------------------------------------------- */
176void SymbolTableBuilder::generateGatherKernel(KernelBuilder * kBuilder, const std::vector<unsigned> & endpoints, const unsigned scanWordBitWidth) {
177
178    Type * const intScanWordTy = iBuilder->getIntNTy(scanWordBitWidth);
179    const unsigned fieldCount = iBuilder->getBitBlockWidth() / scanWordBitWidth;
180    Type * const scanWordVectorType = VectorType::get(intScanWordTy, fieldCount);
181    const unsigned vectorWidth = iBuilder->getBitBlockWidth() / 32;
182    const unsigned gatherCount = vectorWidth * 4;
183    Type * const transposedVectorType = VectorType::get(iBuilder->getInt8Ty(), iBuilder->getBitBlockWidth() / 8);
184
185    unsigned minKeyLength = 0;
186
187    Type * startArrayType = ArrayType::get(iBuilder->getInt32Ty(), iBuilder->getBitBlockWidth() + gatherCount);
188    Type * endArrayType = ArrayType::get(iBuilder->getInt32Ty(), gatherCount);
189    Type * groupType = StructType::get(iBuilder->getInt32Ty(), startArrayType, iBuilder->getInt32Ty(), endArrayType, nullptr);
190    const unsigned baseIdx = kBuilder->addInternalState(iBuilder->getInt8PtrTy(), "Base");
191    const unsigned positionArrayIdx = kBuilder->addInternalState(ArrayType::get(groupType, endpoints.size()), "Positions");
192
193    for (unsigned maxKeyLength : endpoints) {
194        kBuilder->addInputStream(1, "startStream" + std::to_string(maxKeyLength));
195        kBuilder->addInputStream(1, "endStream" + std::to_string(maxKeyLength));
196        kBuilder->addOutputStream(((maxKeyLength + 3) / 4) * 4);
197    }
198    kBuilder->addInputStream(1, "startStreamN");
199    kBuilder->addInputStream(1, "endStreamN");
200
201    Function * const function = kBuilder->prepareFunction();
202
203    BasicBlock * const entry = iBuilder->GetInsertBlock();
204
205    BasicBlock * groupCond = BasicBlock::Create(mMod->getContext(), "groupCond", function, 0);
206    BasicBlock * groupBody = BasicBlock::Create(mMod->getContext(), "groupBody", function, 0);
207
208    BasicBlock * startOuterCond = BasicBlock::Create(mMod->getContext(), "startOuterCond", function, 0);
209    BasicBlock * startOuterBody = BasicBlock::Create(mMod->getContext(), "startOuterBody", function, 0);
210    BasicBlock * startInnerCond = BasicBlock::Create(mMod->getContext(), "startInnerCond", function, 0);
211    BasicBlock * startInnerBody = BasicBlock::Create(mMod->getContext(), "startInnerBody", function, 0);
212
213    BasicBlock * endOuterCond = BasicBlock::Create(mMod->getContext(), "endOuterCond", function, 0);
214    BasicBlock * endOuterBody = BasicBlock::Create(mMod->getContext(), "endOuterBody", function, 0);
215    BasicBlock * endInnerCond = BasicBlock::Create(mMod->getContext(), "endInnerCond", function, 0);
216    BasicBlock * endInnerBody = BasicBlock::Create(mMod->getContext(), "endInnerBody", function, 0);
217
218    BasicBlock * gather = BasicBlock::Create(mMod->getContext(), "gather", function, 0);
219
220    BasicBlock * nextGroup = BasicBlock::Create(mMod->getContext(), "nextGroup", function, 0);
221
222    BasicBlock * exit = BasicBlock::Create(mMod->getContext(), "exit", function, 0);
223
224    Type * const int32PtrTy = PointerType::get(iBuilder->getInt32Ty(), 0);
225    FunctionType * const functionType = FunctionType::get(iBuilder->getVoidTy(), {iBuilder->getInt8PtrTy(), int32PtrTy, int32PtrTy, iBuilder->getInt32Ty(), int32PtrTy}, false);
226    Value * const gatherFunctionPtrArray = iBuilder->CreateAlloca(PointerType::get(functionType, 0), iBuilder->getInt32(endpoints.size()));
227    unsigned i = 0;
228    minKeyLength = 0;
229    for (unsigned maxKeyLength : endpoints) {
230        const unsigned minCount = (minKeyLength / 4);
231        const unsigned maxCount = ((maxKeyLength + 3) / 4);
232        Value * ptr = iBuilder->CreateGEP(gatherFunctionPtrArray, iBuilder->getInt32(i++));
233        iBuilder->CreateStore(generateGatherFunction(transposedVectorType, minCount, maxCount), ptr);
234        minKeyLength = maxKeyLength;
235    }
236
237    //TODO: this won't work on files > 2^32 bytes yet; needs an intermediate flush then a recalculation of the base pointer.
238    Value * const base = iBuilder->CreateLoad(kBuilder->getInternalState(baseIdx), "base");
239    Value * const positionArray = kBuilder->getInternalState(positionArrayIdx);
240
241    Value * blockPos = iBuilder->CreateLoad(kBuilder->getBlockNo());
242    blockPos = iBuilder->CreateMul(blockPos, iBuilder->getInt64(iBuilder->getBitBlockWidth()));
243
244    iBuilder->CreateBr(groupCond);
245
246    // GROUP COND
247    iBuilder->SetInsertPoint(groupCond);
248    PHINode * groupIV = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
249    groupIV->addIncoming(iBuilder->getInt32(0), entry);
250    Value * groupTest = iBuilder->CreateICmpNE(groupIV, iBuilder->getInt32(endpoints.size()));
251    iBuilder->CreateCondBr(groupTest, groupBody, exit);
252
253    // GROUP BODY
254    iBuilder->SetInsertPoint(groupBody);
255    // if two positions cannot be in the same vector element, we could possibly do some work in parallel here.
256
257    iBuilder->CallPrintInt(" ---- groupIV ---- ", groupIV);
258
259    Value * index = iBuilder->CreateMul(groupIV, iBuilder->getInt32(2));
260    Value * startStreamPtr = kBuilder->getInputStream(index);
261    Value * startStream = iBuilder->CreateBlockAlignedLoad(startStreamPtr);
262    iBuilder->CallPrintRegister("startStream", startStream);
263    startStream = iBuilder->CreateBitCast(startStream, scanWordVectorType, "startStream");
264
265    index = iBuilder->CreateAdd(index, iBuilder->getInt32(1));
266    Value * endStreamPtr = kBuilder->getInputStream(index);
267    Value * endStream = iBuilder->CreateBlockAlignedLoad(endStreamPtr);
268    iBuilder->CallPrintRegister("endStream", endStream);
269    endStream = iBuilder->CreateBitCast(endStream, scanWordVectorType, "endStream");
270
271    Value * startIndexPtr = iBuilder->CreateGEP(positionArray, {iBuilder->getInt32(0), groupIV, iBuilder->getInt32(0)}, "startIndexPtr");
272    Value * startIndex = iBuilder->CreateLoad(startIndexPtr, "startIndex");
273    Value * startArray = iBuilder->CreateGEP(positionArray, {iBuilder->getInt32(0), groupIV, iBuilder->getInt32(1)}, "startArray");
274    Value * endIndexPtr = iBuilder->CreateGEP(positionArray, {iBuilder->getInt32(0), groupIV, iBuilder->getInt32(2)}, "endIndexPtr");
275    Value * endIndex = iBuilder->CreateLoad(endIndexPtr, "endIndex");
276    Value * endArray = iBuilder->CreateGEP(positionArray, {iBuilder->getInt32(0), groupIV, iBuilder->getInt32(3)}, "endArray");
277
278    Value * const buffer = kBuilder->getOutputStream(groupIV);
279
280    iBuilder->CreateBr(startOuterCond);
281
282    // START OUTER COND
283    iBuilder->SetInsertPoint(startOuterCond);
284    PHINode * startBlockOffset = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
285    startBlockOffset->addIncoming(blockPos, groupBody);
286    PHINode * startIndexPhi1 = iBuilder->CreatePHI(startIndex->getType(), 2);
287    startIndexPhi1->addIncoming(startIndex, groupBody);
288    PHINode * startIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
289    startIV->addIncoming(iBuilder->getInt64(0), groupBody);
290    Value * startOuterTest = iBuilder->CreateICmpNE(startIV, iBuilder->getInt64(fieldCount));
291    iBuilder->CreateCondBr(startOuterTest, startOuterBody, endOuterCond);
292
293    // START OUTER BODY
294    iBuilder->SetInsertPoint(startOuterBody);
295    Value * startField = iBuilder->CreateExtractElement(startStream, startIV);
296    startIV->addIncoming(iBuilder->CreateAdd(startIV, iBuilder->getInt64(1)), startInnerCond);
297    startBlockOffset->addIncoming(iBuilder->CreateAdd(startBlockOffset, iBuilder->getInt64(scanWordBitWidth)), startInnerCond);
298    iBuilder->CreateBr(startInnerCond);
299
300    // START INNER COND
301    iBuilder->SetInsertPoint(startInnerCond);
302    PHINode * startIndexPhi3 = iBuilder->CreatePHI(startIndex->getType(), 2);
303    startIndexPhi3->addIncoming(startIndexPhi1, startOuterBody);
304    startIndexPhi1->addIncoming(startIndexPhi3, startInnerCond);
305    PHINode * startFieldPhi = iBuilder->CreatePHI(intScanWordTy, 2);
306    startFieldPhi->addIncoming(startField, startOuterBody);
307    Value * test = iBuilder->CreateICmpNE(startFieldPhi, ConstantInt::getNullValue(intScanWordTy));
308    iBuilder->CreateCondBr(test, startInnerBody, startOuterCond);
309
310    // START INNER BODY
311    iBuilder->SetInsertPoint(startInnerBody);
312    Value * startPos = generateCountForwardZeroes(iBuilder, startFieldPhi);
313    startFieldPhi->addIncoming(generateResetLowestBit(iBuilder, startFieldPhi), startInnerBody);
314    startPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(startPos, startBlockOffset), iBuilder->getInt32Ty());
315    Value * startAddr = iBuilder->CreateGEP(startArray, {iBuilder->getInt32(0), startIndexPhi3});
316    iBuilder->CallPrintInt("> startIndex ", startIndexPhi3);
317    iBuilder->CallPrintInt("> startPos ", startPos);
318    iBuilder->CreateStore(startPos, startAddr);
319    startIndexPhi3->addIncoming(iBuilder->CreateAdd(startIndexPhi3, ConstantInt::get(startIndexPhi3->getType(), 1)), startInnerBody);
320    iBuilder->CreateBr(startInnerCond);
321
322    // END POINT OUTER COND
323    iBuilder->SetInsertPoint(endOuterCond);
324    PHINode * endBlockOffset = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
325    endBlockOffset->addIncoming(blockPos, startOuterCond);
326    PHINode * endIndexPhi1 = iBuilder->CreatePHI(endIndex->getType(), 2);
327    endIndexPhi1->addIncoming(endIndex, startOuterCond);
328    PHINode * startIndexPhi2 = iBuilder->CreatePHI(startIndex->getType(), 2);
329    startIndexPhi2->addIncoming(startIndexPhi1, startOuterCond);
330    PHINode * endIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
331    endIV->addIncoming(iBuilder->getInt64(0), startOuterCond);
332    Value * endOuterTest = iBuilder->CreateICmpNE(endIV, iBuilder->getInt64(fieldCount));
333    iBuilder->CreateCondBr(endOuterTest, endOuterBody, nextGroup);
334
335    // END POINT OUTER BODY
336    iBuilder->SetInsertPoint(endOuterBody);
337    Value * endField = iBuilder->CreateExtractElement(endStream, endIV);
338    endIV->addIncoming(iBuilder->CreateAdd(endIV, iBuilder->getInt64(1)), endInnerCond);
339    endBlockOffset->addIncoming(iBuilder->CreateAdd(endBlockOffset, iBuilder->getInt64(scanWordBitWidth)), endInnerCond);
340    iBuilder->CreateBr(endInnerCond);
341
342    // END POINT INNER COND
343    iBuilder->SetInsertPoint(endInnerCond);
344    startIndexPhi3 = iBuilder->CreatePHI(startIndexPhi2->getType(), 3);
345    startIndexPhi3->addIncoming(startIndexPhi2, endOuterBody);
346    startIndexPhi3->addIncoming(startIndexPhi3, endInnerBody);
347    startIndexPhi2->addIncoming(startIndexPhi3, endInnerCond);
348    PHINode * endIndexPhi2 = iBuilder->CreatePHI(endIndex->getType(), 3);
349    endIndexPhi2->addIncoming(endIndexPhi1, endOuterBody);
350    endIndexPhi1->addIncoming(endIndexPhi2, endInnerCond);
351    endIndexPhi2->addIncoming(ConstantInt::getNullValue(endIndex->getType()), gather);
352    PHINode * endFieldPhi = iBuilder->CreatePHI(intScanWordTy, 3);
353    endFieldPhi->addIncoming(endField, endOuterBody);
354    Value * endInnerTest = iBuilder->CreateICmpNE(endFieldPhi, ConstantInt::getNullValue(intScanWordTy));
355    iBuilder->CreateCondBr(endInnerTest, endInnerBody, endOuterCond);
356
357    // END POINT INNER BODY
358    iBuilder->SetInsertPoint(endInnerBody);
359    Value * endPos = generateCountForwardZeroes(iBuilder, endFieldPhi);
360    Value * updatedEndFieldPhi = generateResetLowestBit(iBuilder, endFieldPhi);
361    endFieldPhi->addIncoming(updatedEndFieldPhi, endInnerBody);
362    endFieldPhi->addIncoming(updatedEndFieldPhi, gather);
363    endPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(endPos, endBlockOffset), iBuilder->getInt32Ty());
364    Value * endAddr = iBuilder->CreateGEP(endArray, {iBuilder->getInt32(0), endIndexPhi2});
365    iBuilder->CallPrintInt("> endIndex ", endIndexPhi2);
366    iBuilder->CallPrintInt("> endPos ", endPos);
367    iBuilder->CreateStore(endPos, endAddr);
368    Value * updatedEndIndexPhi = iBuilder->CreateAdd(endIndexPhi2, ConstantInt::get(endIndexPhi2->getType(), 1));
369    endIndexPhi2->addIncoming(updatedEndIndexPhi, endInnerBody);
370    Value * filledEndPosBufferTest = iBuilder->CreateICmpEQ(updatedEndIndexPhi, ConstantInt::get(updatedEndIndexPhi->getType(), gatherCount));
371    iBuilder->CreateCondBr(filledEndPosBufferTest, gather, endInnerCond);
372
373    // GATHER
374    iBuilder->SetInsertPoint(gather);
375
376    iBuilder->CallPrintInt(" **** gathering **** ", groupIV);
377
378    Value * startArrayPtr = iBuilder->CreatePointerCast(startArray, PointerType::get(iBuilder->getInt32Ty(), 0));
379    Value * endArrayPtr = iBuilder->CreatePointerCast(endArray, PointerType::get(iBuilder->getInt32Ty(), 0));
380    Value * const bufferPtr = iBuilder->CreatePointerCast(buffer, PointerType::get(iBuilder->getInt32Ty(), 0));
381    Value * gatherFunctionPtr = iBuilder->CreateLoad(iBuilder->CreateGEP(gatherFunctionPtrArray, groupIV));
382
383    iBuilder->CreateCall5(gatherFunctionPtr, base, startArrayPtr, endArrayPtr, iBuilder->getInt32(32), bufferPtr);
384
385    // ... call hashing function ...
386    Value * remainingArrayPtr = iBuilder->CreateGEP(startArrayPtr, iBuilder->getInt32(gatherCount));
387    Value * remainingCount = iBuilder->CreateSub(startIndexPhi3, iBuilder->getInt32(gatherCount));
388    iBuilder->CreateMemCpy(startArrayPtr, remainingArrayPtr, remainingCount, 4);
389    startIndexPhi3->addIncoming(remainingCount, gather);
390    iBuilder->CreateBr(endInnerCond);
391
392    // NEXT GROUP
393    iBuilder->SetInsertPoint(nextGroup);
394    iBuilder->CreateStore(startIndexPhi2, startIndexPtr);
395    iBuilder->CreateStore(endIndexPhi1, endIndexPtr);
396    groupIV->addIncoming(iBuilder->CreateAdd(groupIV, ConstantInt::get(groupIV->getType(), 1)), nextGroup);
397    iBuilder->CreateBr(groupCond);
398
399    iBuilder->SetInsertPoint(exit);
400    kBuilder->finalize();
401}
402
403/** ------------------------------------------------------------------------------------------------------------- *
404 * @brief generateGatherFunction
405 ** ------------------------------------------------------------------------------------------------------------- */
406Function * SymbolTableBuilder::generateGatherFunction(Type * const resultType, const unsigned minCount, const unsigned maxCount) {
407
408    assert (maxCount > minCount);
409
410    const std::string functionName = "gather_" + std::to_string(minCount) + "_" + std::to_string(maxCount);
411    Function * function = mMod->getFunction(functionName);
412    if (function == nullptr) {
413
414        const auto ip = iBuilder->saveIP();
415
416        const unsigned vectorWidth = iBuilder->getBitBlockWidth() / 32;
417        Type * const gatherVectorType =  VectorType::get(iBuilder->getInt32Ty(), vectorWidth);
418        Type * const gatherVectorArrayType = ArrayType::get(gatherVectorType, maxCount);
419
420        Type * const int32PtrTy = PointerType::get(iBuilder->getInt32Ty(), 0);
421        FunctionType * const functionType = FunctionType::get(iBuilder->getVoidTy(), {iBuilder->getInt8PtrTy(), int32PtrTy, int32PtrTy, iBuilder->getInt32Ty(), int32PtrTy}, false);
422        function = Function::Create(functionType, GlobalValue::ExternalLinkage, functionName, mMod);
423        function->setCallingConv(CallingConv::C);
424        function->setDoesNotCapture(1);
425        function->setDoesNotCapture(2);
426        function->setDoesNotCapture(3);
427        function->setDoesNotThrow();
428
429        Function::arg_iterator args = function->arg_begin();
430        Value * const base = args++;
431        base->setName("base");
432        Value * startArray = args++;
433        startArray->setName("startArray");
434        Value * endArray = args++;
435        endArray->setName("endArray");
436        Value * const numOfKeys = args++;
437        numOfKeys->setName("numOfKeys");
438        Value * buffer = args++;
439        buffer->setName("buffer");
440
441        BasicBlock * entry = BasicBlock::Create(mMod->getContext(), "entry", function, 0);
442        BasicBlock * gatherCond = BasicBlock::Create(mMod->getContext(), "gatherCond", function, 0);
443        BasicBlock * partialGatherCond = BasicBlock::Create(mMod->getContext(), "partialGatherCond", function, 0);
444        BasicBlock * partialGatherBody = BasicBlock::Create(mMod->getContext(), "partialGatherBody", function, 0);
445        BasicBlock * gatherBody = BasicBlock::Create(mMod->getContext(), "gatherBody", function, 0);
446        BasicBlock * transposeCond = BasicBlock::Create(mMod->getContext(), "transposeCond", function, 0);
447        BasicBlock * transposeBody = BasicBlock::Create(mMod->getContext(), "transposeBody", function, 0);
448        BasicBlock * exit = BasicBlock::Create(mMod->getContext(), "exit", function, 0);
449
450        Value * const four = iBuilder->CreateVectorSplat(vectorWidth, iBuilder->getInt32(4));
451
452        // ENTRY
453        iBuilder->SetInsertPoint(entry);
454        Value * const untransposedBuffer = iBuilder->CreateAlloca(gatherVectorArrayType, iBuilder->getInt32(4), "untransposedBuffer");
455
456        iBuilder->CallPrintInt("base", base);
457        iBuilder->CallPrintInt("startArray", startArray);
458        iBuilder->CallPrintInt("endArray", endArray);
459        iBuilder->CallPrintInt("numOfKeys", numOfKeys);
460        iBuilder->CallPrintInt("buffer", buffer);
461
462        iBuilder->CreateBr(gatherCond);
463
464        // FULL GATHER COND
465        iBuilder->SetInsertPoint(gatherCond);
466        PHINode * remainingLanes = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
467        remainingLanes->addIncoming(numOfKeys, entry);
468        PHINode * gatherIV = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
469        gatherIV->addIncoming(iBuilder->getInt32(0), entry);
470        iBuilder->CallPrintInt(" --- gatherIV", gatherIV);
471        Value * gatherLoopTest = iBuilder->CreateICmpNE(gatherIV, iBuilder->getInt32(4));
472        iBuilder->CreateCondBr(gatherLoopTest, partialGatherCond, transposeCond);
473
474        // PARTIAL GATHER COND
475        iBuilder->SetInsertPoint(partialGatherCond);
476        iBuilder->CallPrintInt(" --- remainingLanes", remainingLanes);
477        Value * partialGatherLoopTest = iBuilder->CreateICmpSGE(remainingLanes, iBuilder->getInt32(vectorWidth));
478        iBuilder->CreateCondBr(partialGatherLoopTest, gatherBody, partialGatherBody);
479
480        // PARTIAL GATHER BODY
481        iBuilder->SetInsertPoint(partialGatherBody);
482        Type * registerType = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
483        Value * maskedLanes = iBuilder->CreateSub(iBuilder->getInt32(vectorWidth), remainingLanes);
484        maskedLanes = iBuilder->CreateMul(maskedLanes, iBuilder->getInt32(32));
485        maskedLanes = iBuilder->CreateZExt(maskedLanes, registerType);
486        maskedLanes = iBuilder->CreateLShr(Constant::getAllOnesValue(registerType), maskedLanes);       
487        maskedLanes = iBuilder->CreateBitCast(maskedLanes, gatherVectorType);
488        iBuilder->CreateBr(gatherBody);
489
490        // FULL GATHER BODY
491        iBuilder->SetInsertPoint(gatherBody);
492        PHINode * activeLanes = iBuilder->CreatePHI(gatherVectorType, 2, "activeLanes");
493        activeLanes->addIncoming(Constant::getAllOnesValue(gatherVectorType), partialGatherCond);
494        activeLanes->addIncoming(maskedLanes, partialGatherBody);
495        iBuilder->CallPrintRegister(" --- activeLanes", activeLanes);
496
497        startArray = iBuilder->CreateBitCast(startArray, PointerType::get(gatherVectorType, 0));
498        Value * startPos = iBuilder->CreateAlignedLoad(iBuilder->CreateGEP(startArray, gatherIV), 4);
499        for (unsigned blockCount = 0; blockCount < minCount; ++blockCount) {
500
501            iBuilder->CallPrintRegister(" --- startPosF" + std::to_string(blockCount), startPos);
502            Value * tokenData = generateMaskedGather(base, startPos, activeLanes);
503            startPos = iBuilder->CreateAdd(startPos, four);
504            iBuilder->CallPrintRegister(" --- tokenDataF" + std::to_string(blockCount), tokenData);
505            iBuilder->CreateAlignedStore(tokenData, iBuilder->CreateGEP(untransposedBuffer, {iBuilder->getInt32(blockCount), gatherIV}), 4);
506        }
507
508        endArray = iBuilder->CreateBitCast(endArray, PointerType::get(gatherVectorType, 0));
509        Value * const endPos = iBuilder->CreateAlignedLoad(iBuilder->CreateGEP(endArray, gatherIV), 4);
510        for (unsigned blockCount = minCount; blockCount < maxCount; ++blockCount) {
511
512            iBuilder->CallPrintRegister(" --- startPosP" + std::to_string(blockCount), startPos);
513
514            // if we have not fully gathered the data for this key
515            Value * atLeastOneByte = iBuilder->CreateSExt(iBuilder->CreateICmpULT(startPos, endPos), startPos->getType());
516            atLeastOneByte = iBuilder->CreateAnd(atLeastOneByte, activeLanes);
517            iBuilder->CallPrintRegister(" --- atLeastOneByte" + std::to_string(blockCount), atLeastOneByte);
518
519            // gather it ...
520            Value * tokenData = generateMaskedGather(base, startPos, atLeastOneByte);
521            iBuilder->CallPrintRegister(" --- tokenDataP" + std::to_string(blockCount), tokenData);
522            // and compute how much data is remaining.
523            Value * remaining = iBuilder->CreateSub(endPos, startPos);
524
525            iBuilder->CallPrintRegister(" --- remaining" + std::to_string(blockCount), remaining);
526
527            // if this token only has 1 to 3 bytes remaining ...
528            Value * atLeastFourBytes = iBuilder->CreateSExt(iBuilder->CreateICmpUGE(remaining, four), remaining->getType());
529
530            iBuilder->CallPrintRegister(" --- atLeastFourBytes" + std::to_string(blockCount), atLeastFourBytes);
531
532
533            // determine how many bits do *not* belong to the token
534            remaining = iBuilder->CreateSub(four, remaining);
535            remaining = iBuilder->CreateShl(remaining, ConstantInt::get(remaining->getType(), 3));
536
537            iBuilder->CallPrintRegister(" --- remaining" + std::to_string(blockCount), remaining);
538
539            // then mask them out prior to storing the value
540            Value * partialTokenMask = iBuilder->CreateLShr(ConstantInt::getAllOnesValue(remaining->getType()), remaining);
541            partialTokenMask = iBuilder->CreateOr(partialTokenMask, atLeastFourBytes);
542
543            iBuilder->CallPrintRegister(" --- partialTokenMask" + std::to_string(blockCount), partialTokenMask);
544
545            tokenData = iBuilder->CreateAnd(partialTokenMask, tokenData);
546
547            iBuilder->CallPrintRegister(" --- tokenDataM" + std::to_string(blockCount), tokenData);
548
549            Value * untransposedBufferPtr = iBuilder->CreateGEP(untransposedBuffer, {iBuilder->getInt32(blockCount), gatherIV});
550
551            iBuilder->CallPrintInt(" --- untransposedBufferPtr" + std::to_string(blockCount), untransposedBufferPtr);
552
553            iBuilder->CreateAlignedStore(tokenData, untransposedBufferPtr, 4);
554            if (blockCount < (maxCount - 1)) {
555                startPos = iBuilder->CreateAdd(startPos, four);
556            }
557        }
558        gatherIV->addIncoming(iBuilder->CreateAdd(gatherIV, iBuilder->getInt32(1)), gatherBody);
559        remainingLanes->addIncoming(iBuilder->CreateSub(remainingLanes, iBuilder->getInt32(vectorWidth)), gatherBody);
560        iBuilder->CreateBr(gatherCond);
561
562        // TRANSPOSE COND
563        iBuilder->SetInsertPoint(transposeCond);
564        PHINode * transposeIV = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
565        transposeIV->addIncoming(iBuilder->getInt32(0), gatherCond);
566        Value * transposeLoopTest = iBuilder->CreateICmpNE(transposeIV, iBuilder->getInt32(maxCount));
567        iBuilder->CreateCondBr(transposeLoopTest, transposeBody, exit);
568
569        // TRANSPOSE BODY
570        iBuilder->SetInsertPoint(transposeBody);
571
572        Value * value[4];
573        Value * temporary[4];
574        for (unsigned i = 0; i < 4; ++i) {
575            Value * const ptr = iBuilder->CreateGEP(untransposedBuffer, {transposeIV, iBuilder->getInt32(i)});
576            value[i] = iBuilder->CreateAlignedLoad(ptr, 4);
577        }
578        for (unsigned fieldWidth = 16; fieldWidth != 4; fieldWidth /= 2) {
579            const unsigned fieldCount = iBuilder->getBitBlockWidth() / fieldWidth;
580            VectorType * const vecType = VectorType::get(IntegerType::get(mMod->getContext(), fieldWidth), fieldCount);
581            std::vector<Constant *> lowFields(fieldCount);
582            std::vector<Constant *> highFields(fieldCount);
583            for (unsigned j = 0; j < fieldCount; ++j) {
584                lowFields[j] = iBuilder->getInt32(j * 2);
585                highFields[j] = iBuilder->getInt32(j * 2 + 1);
586            }
587            Constant * const lowVector = ConstantVector::get(lowFields);
588            Constant * const highVector = ConstantVector::get(highFields);
589            for (unsigned i = 0; i < 4; i += 2) {
590                value[i] = iBuilder->CreateBitCast(value[i], vecType);
591                value[i + 1] = iBuilder->CreateBitCast(value[i + 1], vecType);
592                temporary[i / 2] = iBuilder->CreateShuffleVector(value[i], value[i + 1], lowVector);
593                temporary[(i / 2) + 2] = iBuilder->CreateShuffleVector(value[i], value[i + 1], highVector);
594            }
595            std::swap(value, temporary);
596        }
597        Value * offset = iBuilder->CreateShl(transposeIV, ConstantInt::get(transposeIV->getType(), 2));
598        transposeIV->addIncoming(iBuilder->CreateAdd(transposeIV, iBuilder->getInt32(1)), transposeBody);
599        buffer = iBuilder->CreateBitCast(buffer, PointerType::get(resultType, 0));
600        for (unsigned i = 0; i < 4; ++i) {
601            Value * index = offset;
602            if (i) {
603                index = iBuilder->CreateAdd(offset, iBuilder->getInt32(i));
604            }
605            Value * ptr = iBuilder->CreateGEP(buffer, index);
606            iBuilder->CreateAlignedStore(value[i], ptr, 4);
607        }
608        iBuilder->CreateBr(transposeCond);
609
610        // EXIT
611        iBuilder->SetInsertPoint(exit);
612        iBuilder->CreateRetVoid();
613
614        iBuilder->restoreIP(ip);
615    }
616
617    return function;
618}
619
620
621/** ------------------------------------------------------------------------------------------------------------- *
622 * @brief createKernels
623 ** ------------------------------------------------------------------------------------------------------------- */
624void SymbolTableBuilder::createKernels() {
625
626    std::vector<unsigned> endpoints;
627    endpoints.push_back(1);
628    endpoints.push_back(2);
629    endpoints.push_back(4);
630    endpoints.push_back(8);
631    endpoints.push_back(16);
632
633    PabloCompiler pablo_compiler(mMod, iBuilder);
634    PabloFunction * const leading = generateLeadingFunction(endpoints);
635    PabloFunction * const sorting = generateSortingFunction(leading, endpoints);
636
637    const auto bufferSize = ((mLongestLookahead + iBuilder->getBitBlockWidth() - 1) / iBuilder->getBitBlockWidth()) + 1;
638
639    mS2PKernel = new KernelBuilder("s2p", mMod, iBuilder, 1);
640    mLeadingKernel = new KernelBuilder("leading", mMod, iBuilder, bufferSize);
641    mSortingKernel = new KernelBuilder("sorting", mMod, iBuilder, bufferSize);
642    mGatherKernel = new KernelBuilder("gathering", mMod, iBuilder, 1);
643    mStdOutKernel = new KernelBuilder("stddout", mMod, iBuilder, 1);
644
645    generateS2PKernel(mMod, iBuilder, mS2PKernel);
646
647    pablo_compiler.setKernel(mLeadingKernel);
648    pablo_compiler.compile(leading);
649    pablo_compiler.setKernel(mSortingKernel);
650    pablo_compiler.compile(sorting);
651
652    delete leading;
653    delete sorting;
654
655    releaseSlabAllocatorMemory();
656
657    generateGatherKernel(mGatherKernel, endpoints, 64);
658    generateStdOutKernel(mMod, iBuilder, mStdOutKernel);
659}
660
661Function * SymbolTableBuilder::ExecuteKernels(){
662
663    Type * intType = iBuilder->getInt64Ty();
664
665    Type * inputType = PointerType::get(ArrayType::get(StructType::get(mMod->getContext(), std::vector<Type *>({ArrayType::get(mBitBlockType, 8)})), 1), 0);
666    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", Type::getVoidTy(mMod->getContext()), inputType, intType, nullptr));
667    main->setCallingConv(CallingConv::C);
668    Function::arg_iterator args = main->arg_begin();
669
670    Value * const inputStream = args++;
671    inputStream->setName("inputStream");
672
673    Value * const bufferSize = args++;
674    bufferSize->setName("bufferSize");
675
676    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
677
678    BasicBlock * entryBlock = iBuilder->GetInsertBlock();
679
680    BasicBlock * leadingTestBlock = BasicBlock::Create(mMod->getContext(), "leadingCond", main, 0);
681    BasicBlock * safetyCheckBlock = BasicBlock::Create(mMod->getContext(), "safetyCheck", main, 0);
682    BasicBlock * leadingBodyBlock = BasicBlock::Create(mMod->getContext(), "leadingBody", main, 0);
683
684    BasicBlock * regularTestBlock = BasicBlock::Create(mMod->getContext(), "fullCond", main, 0);
685    BasicBlock * regularBodyBlock = BasicBlock::Create(mMod->getContext(), "fullBody", main, 0);
686    BasicBlock * regularExitBlock = BasicBlock::Create(mMod->getContext(), "fullExit", main, 0);
687
688    BasicBlock * partialBlock = BasicBlock::Create(mMod->getContext(),  "partialBlock", main, 0);
689
690    BasicBlock * finalTestBlock = BasicBlock::Create(mMod->getContext(),  "finalCond", main, 0);
691    BasicBlock * finalBodyBlock = BasicBlock::Create(mMod->getContext(),  "finalBody", main, 0);
692
693    BasicBlock * exitBlock = BasicBlock::Create(mMod->getContext(), "exit", main, 0);
694
695    Instance * s2pInstance = mS2PKernel->instantiate(inputStream);
696    Instance * leadingInstance = mLeadingKernel->instantiate(s2pInstance->getOutputStreamSet());
697    Instance * sortingInstance = mSortingKernel->instantiate(leadingInstance->getOutputStreamSet());
698    Instance * gatheringInstance = mGatherKernel->instantiate(sortingInstance->getOutputStreamSet());
699    Instance * stdOutInstance = mStdOutKernel->instantiate(gatheringInstance->getOutputStreamSet());
700
701    gatheringInstance->setInternalState("Base", iBuilder->CreateBitCast(inputStream, iBuilder->getInt8PtrTy()));
702
703    stdOutInstance->setInternalState("RemainingBytes", bufferSize);  // The total number of bytes to be sent to stdout.
704
705    const unsigned leadingBlocks = (mLongestLookahead + iBuilder->getBitBlockWidth() - 1) / iBuilder->getBitBlockWidth();
706
707    Value * const requiredBytes = iBuilder->getInt64(mBlockSize * leadingBlocks);
708    Value * const blockSize = iBuilder->getInt64(mBlockSize);
709
710    // If the buffer size is smaller than our largest length group, only check up to the buffer size.
711    Value * safetyCheck = iBuilder->CreateICmpUGE(bufferSize, blockSize);
712    if (blockSize == requiredBytes) {
713        iBuilder->CreateCondBr(safetyCheck, leadingTestBlock, exitBlock); // fix this to be a special case
714    } else {
715        throw std::runtime_error("Not supported yet!");
716    }
717
718    // First compute any necessary leading blocks to allow the sorting kernel access to the "future" data produced by
719    // the leading kernel ...
720    iBuilder->SetInsertPoint(leadingTestBlock);
721    PHINode * blockNo = iBuilder->CreatePHI(intType, 2);
722    blockNo->addIncoming(iBuilder->getInt64(0), entryBlock);
723    PHINode * remainingBytes = iBuilder->CreatePHI(intType, 2);
724    remainingBytes->addIncoming(bufferSize, entryBlock);
725    Value * leadingBlocksCond = iBuilder->CreateICmpULT(blockNo, iBuilder->getInt64(leadingBlocks));
726    iBuilder->CreateCondBr(leadingBlocksCond, safetyCheckBlock, regularTestBlock);
727
728    iBuilder->SetInsertPoint(safetyCheckBlock);
729    Value * safetyCheckCond = iBuilder->CreateICmpULT(remainingBytes, blockSize);
730    iBuilder->CreateCondBr(safetyCheckCond, regularExitBlock, leadingBodyBlock);
731
732    iBuilder->SetInsertPoint(leadingBodyBlock);
733    s2pInstance->CreateDoBlockCall();
734    leadingInstance->CreateDoBlockCall();
735    blockNo->addIncoming(iBuilder->CreateAdd(blockNo, iBuilder->getInt64(1)), leadingBodyBlock);
736    remainingBytes->addIncoming(iBuilder->CreateSub(remainingBytes, blockSize), leadingBodyBlock);
737    iBuilder->CreateBr(leadingTestBlock);
738
739    // Now all the data for which we can produce and consume a full leading block...
740    iBuilder->SetInsertPoint(regularTestBlock);
741    PHINode * remainingBytes2 = iBuilder->CreatePHI(intType, 2);
742    remainingBytes2->addIncoming(remainingBytes, leadingTestBlock);
743    Value * remainingBytesCond = iBuilder->CreateICmpULT(remainingBytes2, requiredBytes);
744    iBuilder->CreateCondBr(remainingBytesCond, regularExitBlock, regularBodyBlock);
745
746    iBuilder->SetInsertPoint(regularBodyBlock);
747    s2pInstance->CreateDoBlockCall();
748    leadingInstance->CreateDoBlockCall();
749    sortingInstance->CreateDoBlockCall();
750    gatheringInstance->CreateDoBlockCall();
751//    stdOutInstance->CreateDoBlockCall();
752    remainingBytes2->addIncoming(iBuilder->CreateSub(remainingBytes2, blockSize), regularBodyBlock);
753    iBuilder->CreateBr(regularTestBlock);
754
755    // Check if we have a partial blocks worth of leading data remaining
756    iBuilder->SetInsertPoint(regularExitBlock);
757    PHINode * remainingBytes3 = iBuilder->CreatePHI(intType, 2);
758    remainingBytes3->addIncoming(remainingBytes, safetyCheckBlock);
759    remainingBytes3->addIncoming(remainingBytes2, regularTestBlock);
760    Value * partialBlockCond = iBuilder->CreateICmpNE(remainingBytes3, ConstantInt::getNullValue(intType));
761    iBuilder->CreateCondBr(partialBlockCond, finalTestBlock, partialBlock);
762
763    // If we do, process it and mask out the data
764    iBuilder->SetInsertPoint(partialBlock);
765    s2pInstance->CreateDoBlockCall();
766    leadingInstance->CreateDoBlockCall();
767    leadingInstance->clearOutputStreamSet();
768    sortingInstance->CreateDoBlockCall();
769    gatheringInstance->CreateDoBlockCall();
770//    stdOutInstance->CreateDoBlockCall();
771    iBuilder->CreateBr(finalTestBlock);
772
773    // Now clear the leading data and test the final blocks
774    iBuilder->SetInsertPoint(finalTestBlock);
775    PHINode * remainingFullBlocks = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 3);
776    remainingFullBlocks->addIncoming(iBuilder->getInt64(leadingBlocks), regularExitBlock);
777    remainingFullBlocks->addIncoming(iBuilder->getInt64(leadingBlocks), partialBlock);
778    Value * remainingFullBlocksCond = iBuilder->CreateICmpUGT(remainingFullBlocks, ConstantInt::getNullValue(intType));
779    iBuilder->CreateCondBr(remainingFullBlocksCond, finalBodyBlock, exitBlock);
780
781    iBuilder->SetInsertPoint(finalBodyBlock);
782    leadingInstance->clearOutputStreamSet();
783    sortingInstance->CreateDoBlockCall();
784    gatheringInstance->CreateDoBlockCall();
785//    stdOutInstance->CreateDoBlockCall();
786    remainingFullBlocks->addIncoming(iBuilder->CreateSub(remainingFullBlocks, iBuilder->getInt64(1)), finalBodyBlock);
787
788
789
790
791    iBuilder->CreateBr(finalTestBlock);
792    iBuilder->SetInsertPoint(exitBlock);
793    iBuilder->CreateRetVoid();
794
795    return main;
796}
797
798SymbolTableBuilder::~SymbolTableBuilder() {
799    delete mS2PKernel;
800    delete mLeadingKernel;
801    delete mSortingKernel;
802    delete mGatherKernel;
803    delete mStdOutKernel;
804}
805
806
807}
Note: See TracBrowser for help on using the repository browser.