Changeset 4992


Ignore:
Timestamp:
Mar 28, 2016, 3:44:17 PM (20 months ago)
Author:
nmedfort
Message:

Continued work on symbol table.

Location:
icGREP/icgrep-devel/icgrep/kernels
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/instance.h

    r4991 r4992  
    2929    }
    3030
     31    llvm::Value * getInternalState(llvm::Value * const index) {
     32        return mDefinition->getInternalState(mMemory, index);
     33    }
     34
    3135    void setInternalState(const unsigned index, llvm::Value * value) {
    3236        mDefinition->setInternalState(mMemory, index, value);
    3337    }
    3438
     39    void setInternalState(llvm::Value * const index, llvm::Value * value) {
     40        mDefinition->setInternalState(mMemory, index, value);
     41    }
     42
    3543    llvm::Value * getInputStream(const unsigned index, const unsigned streamOffset = 0) {
     44        return mDefinition->getInputStream(mMemory, index, streamOffset);
     45    }
     46
     47    llvm::Value * getInputStream(llvm::Value * const index, const unsigned streamOffset = 0) {
    3648        return mDefinition->getInputStream(mMemory, index, streamOffset);
    3749    }
     
    4557    }
    4658
     59    llvm::Value * getInputScalar(llvm::Value * const index) {
     60        return mDefinition->getInputScalar(mMemory, index);
     61    }
     62
    4763    llvm::Value * getOutputStream(const unsigned index, const unsigned streamOffset = 0) {
     64        return mDefinition->getOutputStream(mMemory, index, streamOffset);
     65    }
     66
     67    llvm::Value * getOutputStream(llvm::Value * const index, const unsigned streamOffset = 0) {
    4868        return mDefinition->getOutputStream(mMemory, index, streamOffset);
    4969    }
     
    5878
    5979    llvm::Value * getOutputScalar(const unsigned index) {
     80        return mDefinition->getOutputScalar(mMemory, index);
     81    }
     82
     83    llvm::Value * getOutputScalar(llvm::Value * const index) {
    6084        return mDefinition->getOutputScalar(mMemory, index);
    6185    }
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r4991 r4992  
    6565 ** ------------------------------------------------------------------------------------------------------------- */
    6666Value * KernelBuilder::getInternalState(Value * const instance, const unsigned index) {
    67     Value* indices[] = {iBuilder->getInt64(0),
    68                         iBuilder->getInt32(INTERNAL_STATE),
    69                         iBuilder->getInt32(index)};
    70     return iBuilder->CreateGEP(instance, indices);
     67    assert (index < mInternalState.size());
     68    return getInternalState(instance, iBuilder->getInt32(index));
     69}
     70
     71Value * KernelBuilder::getInternalState(Value * const instance, disable_implicit_conversion<Value *> index) {
     72    assert (index->getType() == iBuilder->getInt32Ty());
     73    return iBuilder->CreateGEP(instance, {iBuilder->getInt64(0), iBuilder->getInt32(INTERNAL_STATE), index});
    7174}
    7275
     
    9396
    9497void KernelBuilder::setInternalState(Value * const instance, const unsigned index, Value * const value) {
     98    assert (index < mInternalState.size());
     99    return setInternalState(instance, iBuilder->getInt32(index), value);
     100}
     101
     102void KernelBuilder::setInternalState(Value * const instance, disable_implicit_conversion<Value *> index, Value * const value) {
    95103    Value * ptr = getInternalState(instance, index);
    96104    assert (ptr->getType()->getPointerElementType() == value->getType());
     
    123131 ** ------------------------------------------------------------------------------------------------------------- */
    124132Value * KernelBuilder::getInputStream(Value * const instance, const unsigned index, const unsigned streamOffset) {
    125     assert (instance);
    126133    assert (index < mInputStream.size());
     134    return getInputStream(instance, iBuilder->getInt32(index), streamOffset);
     135}
     136
     137Value * KernelBuilder::getInputStream(Value * const instance, disable_implicit_conversion<Value *> index, const unsigned streamOffset) {
     138    assert (instance && index);
    127139    Value * inputStream = iBuilder->CreateLoad(iBuilder->CreateGEP(instance,
    128140        {iBuilder->getInt32(0), iBuilder->getInt32(INPUT_STREAM_SET), iBuilder->getInt32(0)}));
     
    132144    if (streamOffset) {
    133145        offset = iBuilder->CreateAdd(offset, ConstantInt::get(offset->getType(), streamOffset));
    134     }   
    135     return iBuilder->CreateGEP(inputStream, { iBuilder->CreateCall(modFunction, offset), iBuilder->getInt32(index) });
     146    }
     147    assert (index->getType() == iBuilder->getInt32Ty());
     148    return iBuilder->CreateGEP(inputStream, { iBuilder->CreateCall(modFunction, offset), index });
    136149}
    137150
     
    153166 ** ------------------------------------------------------------------------------------------------------------- */
    154167Value * KernelBuilder::getInputScalar(Value * const instance, const unsigned) {
     168    assert (instance);
     169    throw std::runtime_error("currently not supported!");
     170}
     171
     172Value * KernelBuilder::getInputScalar(Value * const instance, disable_implicit_conversion<Value *>) {
     173    assert (instance);
    155174    throw std::runtime_error("currently not supported!");
    156175}
     
    180199 ** ------------------------------------------------------------------------------------------------------------- */
    181200Value * KernelBuilder::getOutputStream(Value * const instance, const unsigned index, const unsigned streamOffset) {
    182     assert (instance);
    183     Value * const offset = getOffset(instance, streamOffset);
    184     Value * const indices[] = {iBuilder->getInt32(0), iBuilder->getInt32(OUTPUT_STREAM_SET), offset, iBuilder->getInt32(index)};
    185     return iBuilder->CreateGEP(instance, indices);
     201    assert (index < mOutputStream.size());
     202    return getOutputStream(instance, iBuilder->getInt32(index), streamOffset);
     203}
     204
     205Value * KernelBuilder::getOutputStream(Value * const instance, disable_implicit_conversion<Value *> index, const unsigned streamOffset) {
     206    assert (instance && index);
     207    assert (index->getType() == iBuilder->getInt32Ty());
     208    return iBuilder->CreateGEP(instance, {iBuilder->getInt32(0), iBuilder->getInt32(OUTPUT_STREAM_SET), getStreamOffset(instance, streamOffset), index});
    186209}
    187210
     
    190213 ** ------------------------------------------------------------------------------------------------------------- */
    191214Value * KernelBuilder::getOutputScalar(Value * const instance, const unsigned) {
     215    throw std::runtime_error("currently not supported!");
     216}
     217
     218Value * KernelBuilder::getOutputScalar(Value * const instance, disable_implicit_conversion<Value *> ) {
    192219    throw std::runtime_error("currently not supported!");
    193220}
     
    369396 ** ------------------------------------------------------------------------------------------------------------- */
    370397void KernelBuilder::clearOutputStreamSet(Value * const instance, const unsigned streamOffset) {
    371     Value * const indices[] = {iBuilder->getInt32(0), iBuilder->getInt32(OUTPUT_STREAM_SET), getOffset(instance, streamOffset)};
     398    Value * const indices[] = {iBuilder->getInt32(0), iBuilder->getInt32(OUTPUT_STREAM_SET), getStreamOffset(instance, streamOffset)};
    372399    Value * ptr = iBuilder->CreateGEP(instance, indices);
    373400    unsigned size = 0;
     
    383410 * Compute the stream index of the given offset value.
    384411 ** ------------------------------------------------------------------------------------------------------------- */
    385 Value * KernelBuilder::getOffset(Value * const instance, const unsigned value) {
     412Value * KernelBuilder::getStreamOffset(Value * const instance, const unsigned index) {
    386413    Value * offset = nullptr;
    387414    if (mBufferSize > 1) {
    388415        offset = iBuilder->CreateLoad(getBlockNo(instance));
    389         if (value) {
    390             offset = iBuilder->CreateAdd(offset, iBuilder->getInt64(value));
     416        if (index) {
     417            offset = iBuilder->CreateAdd(offset, iBuilder->getInt64(index));
    391418        }
    392419        if (isPowerOfTwo(mBufferSize)) {
     
    396423        }
    397424    } else {
    398         offset = iBuilder->getInt64(value);
     425        offset = iBuilder->getInt64(index);
    399426    }
    400427    return offset;
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r4991 r4992  
    4444    KernelBuilder(std::string name, llvm::Module * m, IDISA::IDISA_Builder * b, const unsigned bufferSize = 1);
    4545
     46    template<typename T>
     47    struct disable_implicit_conversion {
     48        inline disable_implicit_conversion(T const value) : _value(value) {}
     49        inline disable_implicit_conversion(std::nullptr_t) = delete;
     50        inline disable_implicit_conversion(unsigned) = delete;
     51        operator T() { return _value; }
     52        T operator -> () { return _value; }
     53    private:
     54        T const  _value;
     55    };
     56
    4657    unsigned addInternalState(llvm::Type * const type);
    4758    unsigned addInternalState(llvm::Type * const type, std::string && name);
     
    6273    }
    6374
     75    inline llvm::Value * getInputStream(disable_implicit_conversion<llvm::Value *> index, const unsigned streamOffset = 0) {
     76        return getInputStream(mKernelState, index, streamOffset);
     77    }
     78
    6479    inline llvm::Value * getInputScalar(const unsigned index) {
    6580        return getInputScalar(mKernelState, index);
    6681    }
    6782
     83    inline llvm::Value * getInputScalar(disable_implicit_conversion<llvm::Value *> const index) {
     84        return getInputScalar(mKernelState, index);
     85    }
     86
    6887    llvm::Value * getInternalState(const std::string & name) {
    6988        return getInternalState(mKernelState, name);
     
    7897    }
    7998
     99    llvm::Value * getInternalState(disable_implicit_conversion<llvm::Value *> const index) {
     100        return getInternalState(mKernelState, index);
     101    }
     102
    80103    void setInternalState(const unsigned index, llvm::Value * value) {
    81104        setInternalState(mKernelState, index, value);
    82105    }
    83106
     107    void setInternalState(disable_implicit_conversion<llvm::Value *> const index, llvm::Value * value) {
     108        setInternalState(mKernelState, index, value);
     109    }
     110
    84111    llvm::Value * getOutputStream(const unsigned index, const unsigned streamOffset = 0) {
    85112        return getOutputStream(mKernelState, index, streamOffset);
    86113    }
    87114
     115    llvm::Value * getOutputStream(disable_implicit_conversion<llvm::Value *> const index, const unsigned streamOffset = 0) {
     116        return getOutputStream(mKernelState, index, streamOffset);
     117    }
     118
    88119    inline unsigned getNumOfOutputStreams() const {
    89120        return mOutputStream.size();
     
    94125    }
    95126
     127    llvm::Value * getOutputScalar(disable_implicit_conversion<llvm::Value *> const index) {
     128        return getOutputScalar(mKernelState, index);
     129    }
     130
    96131    inline unsigned getNumOfOutputScalars() const {
    97132        return mOutputScalar.size();
     
    130165    llvm::Value * getInputStream(llvm::Value * const instance, const unsigned index, const unsigned streamOffset);
    131166
     167    llvm::Value * getInputStream(llvm::Value * const instance, disable_implicit_conversion<llvm::Value *> index, const unsigned streamOffset);
     168
    132169    llvm::Value * getInputScalar(llvm::Value * const instance, const unsigned index);
    133170
     171    llvm::Value * getInputScalar(llvm::Value * const instance, disable_implicit_conversion<llvm::Value *> index);
     172
    134173    llvm::Value * getInternalState(llvm::Value * const instance, const std::string & name);
    135174
     
    138177    llvm::Value * getInternalState(llvm::Value * const instance, const unsigned index);
    139178
     179    llvm::Value * getInternalState(llvm::Value * const instance, disable_implicit_conversion<llvm::Value *> index);
     180
    140181    void setInternalState(llvm::Value * const instance, const unsigned index, llvm::Value * const value);
    141182
     183    void setInternalState(llvm::Value * const instance, disable_implicit_conversion<llvm::Value *> index, llvm::Value * const value);
     184
    142185    llvm::Value * getOutputStream(llvm::Value * const instance, const unsigned index, const unsigned streamOffset);
    143186
     187    llvm::Value * getOutputStream(llvm::Value * const instance, disable_implicit_conversion<llvm::Value *> index, const unsigned streamOffset);
     188
    144189    llvm::Value * getOutputScalar(llvm::Value * const instance, const unsigned index);
    145190
    146     llvm::Value * getOffset(llvm::Value * const instance, const unsigned value);
     191    llvm::Value * getOutputScalar(llvm::Value * const instance, disable_implicit_conversion<llvm::Value *> index);
     192
     193    llvm::Value * getStreamOffset(llvm::Value * const instance, const unsigned index);
    147194
    148195    llvm::Value * getBlockNo(llvm::Value * const instance);
  • icGREP/icgrep-devel/icgrep/kernels/symboltablepipeline.cpp

    r4991 r4992  
    159159
    160160    VectorType * const vecType = VectorType::get(iBuilder->getInt32Ty(), 8);
    161     Function * vgather = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_avx2_gather_d_d_256);
    162     return iBuilder->CreateCall(vgather, {Constant::getAllOnesValue(vecType), base, iBuilder->CreateBitCast(vindex, vecType), Constant::getAllOnesValue(vecType), iBuilder->getInt8(1)});
     161    Function * const vgather = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_avx2_gather_d_d_256);
     162    Constant * const ones = Constant::getAllOnesValue(vecType);
     163    return iBuilder->CreateCall(vgather, {ones, base, iBuilder->CreateBitCast(vindex, vecType), ones, iBuilder->getInt8(1)});
    163164}
    164165
     
    192193
    193194    VectorType * const vecType = VectorType::get(iBuilder->getInt32Ty(), 8);
    194     Function * vgather = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_avx2_gather_d_d_256);
     195    Function * const vgather = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_avx2_gather_d_d_256);
    195196    return iBuilder->CreateCall(vgather, {Constant::getNullValue(vecType), base, iBuilder->CreateBitCast(vindex, vecType), iBuilder->CreateBitCast(mask, vecType), iBuilder->getInt8(1)});
    196197}
     
    204205}
    205206
     207///** ------------------------------------------------------------------------------------------------------------- *
     208// * @brief generateScanMatch
     209// ** ------------------------------------------------------------------------------------------------------------- */
     210//void SymbolTableBuilder::generateHashingKernel(KernelBuilder * kBuilder, const unsigned minKeyLength, const unsigned maxKeyLength, const unsigned scanWordBitWidth) {
     211
     212//    const unsigned minKeyBlockCount = (minKeyLength / 4);
     213//    const unsigned maxKeyBlockCount = ((maxKeyLength + 3) / 4);
     214
     215//    Type * const intScanWordTy = iBuilder->getIntNTy(scanWordBitWidth);
     216//    const unsigned fieldCount = iBuilder->getBitBlockWidth() / scanWordBitWidth;
     217//    Type * const scanWordVectorType = VectorType::get(intScanWordTy, fieldCount);
     218//    const unsigned vectorWidth = iBuilder->getBitBlockWidth() / 32;
     219//    const unsigned gatherCount = vectorWidth * 4;
     220//    Type * const gatherVectorType =  VectorType::get(iBuilder->getInt32Ty(), vectorWidth);
     221
     222//    const unsigned baseIdx = kBuilder->addInternalState(iBuilder->getInt8PtrTy(), "Base");
     223//    const unsigned startIndexIdx = kBuilder->addInternalState(iBuilder->getInt32Ty(), "StartIndex");
     224//    const unsigned startArrayIdx = kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), iBuilder->getBitBlockWidth() + gatherCount), "StartArray");
     225//    const unsigned endIndexIdx = kBuilder->addInternalState(iBuilder->getInt32Ty(), "EndIndex");
     226//    const unsigned endArrayIdx = kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), gatherCount), "EndArray");
     227
     228//    kBuilder->addInputStream(1, "startStream");
     229//    kBuilder->addInputStream(1, "endStream");
     230
     231//    Function * function = kBuilder->prepareFunction();
     232
     233//    BasicBlock * const entry = iBuilder->GetInsertBlock();
     234
     235//    BasicBlock * startOuterCond = BasicBlock::Create(mMod->getContext(), "startOuterCond", function, 0);
     236//    BasicBlock * startOuterBody = BasicBlock::Create(mMod->getContext(), "startOuterBody", function, 0);
     237//    BasicBlock * startInnerCond = BasicBlock::Create(mMod->getContext(), "startInnerCond", function, 0);
     238//    BasicBlock * startInnerBody = BasicBlock::Create(mMod->getContext(), "startInnerBody", function, 0);
     239
     240//    BasicBlock * endOuterCond = BasicBlock::Create(mMod->getContext(), "endOuterCond", function, 0);
     241//    BasicBlock * endOuterBody = BasicBlock::Create(mMod->getContext(), "endOuterBody", function, 0);
     242//    BasicBlock * endInnerCond = BasicBlock::Create(mMod->getContext(), "endInnerCond", function, 0);
     243//    BasicBlock * endInnerBody = BasicBlock::Create(mMod->getContext(), "endInnerBody", function, 0);
     244
     245//    BasicBlock * gatherInit = BasicBlock::Create(mMod->getContext(), "gather", function, 0);
     246
     247//    BasicBlock * exit = BasicBlock::Create(mMod->getContext(), "exit", function, 0);
     248
     249//    //TODO: this won't work on files > 2^32 bytes yet; needs an intermediate flush then a recalculation of the base pointer.
     250//    Value * const base = iBuilder->CreateLoad(kBuilder->getInternalState(baseIdx), "base");
     251//    Value * blockPos = iBuilder->CreateLoad(kBuilder->getBlockNo());
     252//    blockPos = iBuilder->CreateMul(blockPos, iBuilder->getInt64(iBuilder->getBitBlockWidth()));
     253
     254//    // if two positions cannot be in the same vector element, we could possibly do some work in parallel here.
     255//    Value * startIndex = iBuilder->CreateLoad(kBuilder->getInternalState(startIndexIdx), "startIndex");
     256//    Value * startArray = kBuilder->getInternalState(startArrayIdx);
     257//    Value * startStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(0)), scanWordVectorType, "startStream");
     258
     259//    Value * endIndex = iBuilder->CreateLoad(kBuilder->getInternalState(endIndexIdx), "endIndex");
     260//    Value * endArray = kBuilder->getInternalState(endArrayIdx);
     261//    Value * endStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(1)), scanWordVectorType, "endStream");
     262
     263//    iBuilder->CreateBr(startOuterCond);
     264
     265//    // START OUTER COND
     266//    iBuilder->SetInsertPoint(startOuterCond);
     267//    PHINode * outerStartIndexPhi = iBuilder->CreatePHI(startIndex->getType(), 2);
     268//    outerStartIndexPhi->addIncoming(startIndex, entry);
     269//    PHINode * startIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
     270//    startIV->addIncoming(iBuilder->getInt64(0), entry);
     271//    Value * startOuterTest = iBuilder->CreateICmpNE(startIV, iBuilder->getInt64(fieldCount));
     272//    iBuilder->CreateCondBr(startOuterTest, startOuterBody, endOuterCond);
     273
     274//    // START OUTER BODY
     275//    iBuilder->SetInsertPoint(startOuterBody);
     276//    Value * startField = iBuilder->CreateExtractElement(startStream, startIV);
     277//    startIV->addIncoming(iBuilder->CreateAdd(startIV, iBuilder->getInt64(1)), startInnerCond);
     278//    iBuilder->CreateBr(startInnerCond);
     279
     280//    // START INNER COND
     281//    iBuilder->SetInsertPoint(startInnerCond);
     282//    PHINode * innerStartIndexPhi = iBuilder->CreatePHI(startIndex->getType(), 2);
     283//    innerStartIndexPhi->addIncoming(outerStartIndexPhi, startOuterBody);
     284//    outerStartIndexPhi->addIncoming(innerStartIndexPhi, startInnerCond);
     285//    PHINode * startFieldPhi = iBuilder->CreatePHI(intScanWordTy, 2);
     286//    startFieldPhi->addIncoming(startField, startOuterBody);
     287//    Value * test = iBuilder->CreateICmpNE(startFieldPhi, ConstantInt::getNullValue(intScanWordTy));
     288//    iBuilder->CreateCondBr(test, startInnerBody, startOuterCond);
     289
     290//    // START INNER BODY
     291//    iBuilder->SetInsertPoint(startInnerBody);
     292//    Value * startPos = generateCountForwardZeroes(iBuilder, startFieldPhi);
     293//    startFieldPhi->addIncoming(generateResetLowestBit(iBuilder, startFieldPhi), startInnerBody);
     294//    startPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(startPos, blockPos), iBuilder->getInt32Ty());
     295//    iBuilder->CreateStore(startPos, iBuilder->CreateGEP(startArray, {iBuilder->getInt32(0), innerStartIndexPhi}));
     296//    innerStartIndexPhi->addIncoming(iBuilder->CreateAdd(innerStartIndexPhi, ConstantInt::get(innerStartIndexPhi->getType(), 1)), startInnerBody);
     297//    iBuilder->CreateBr(startInnerCond);
     298
     299//    // END POINT OUTER COND
     300//    iBuilder->SetInsertPoint(endOuterCond);
     301//    PHINode * outerStartIndexPhi2 = iBuilder->CreatePHI(startIndex->getType(), 2);
     302//    outerStartIndexPhi2->addIncoming(outerStartIndexPhi, startOuterCond);
     303//    PHINode * endIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
     304//    endIV->addIncoming(iBuilder->getInt64(0), startOuterCond);
     305//    Value * endOuterTest = iBuilder->CreateICmpNE(endIV, iBuilder->getInt64(fieldCount));
     306//    iBuilder->CreateCondBr(endOuterTest, endOuterBody, exit);
     307
     308//    // END POINT OUTER BODY
     309//    iBuilder->SetInsertPoint(endOuterBody);
     310//    Value * endField = iBuilder->CreateExtractElement(endStream, endIV);
     311//    endIV->addIncoming(iBuilder->CreateAdd(endIV, iBuilder->getInt64(1)), endInnerCond);
     312//    iBuilder->CreateBr(endInnerCond);
     313
     314//    // END POINT INNER COND
     315//    iBuilder->SetInsertPoint(endInnerCond);
     316//    innerStartIndexPhi = iBuilder->CreatePHI(startIndex->getType(), 3);
     317//    innerStartIndexPhi->addIncoming(outerStartIndexPhi2, endOuterBody);
     318//    innerStartIndexPhi->addIncoming(innerStartIndexPhi, endInnerBody);
     319//    outerStartIndexPhi2->addIncoming(innerStartIndexPhi, endInnerCond);
     320//    PHINode * endIndexPhi = iBuilder->CreatePHI(endIndex->getType(), 3);
     321//    endIndexPhi->addIncoming(endIndex, endOuterBody);
     322//    endIndexPhi->addIncoming(ConstantInt::getNullValue(endIndex->getType()), gatherInit);
     323//    PHINode * endFieldPhi = iBuilder->CreatePHI(intScanWordTy, 3);
     324//    endFieldPhi->addIncoming(endField, endOuterBody);
     325//    Value * endInnerTest = iBuilder->CreateICmpNE(endFieldPhi, ConstantInt::getNullValue(intScanWordTy));
     326//    iBuilder->CreateCondBr(endInnerTest, endInnerBody, endOuterCond);
     327
     328//    // END POINT INNER BODY
     329//    iBuilder->SetInsertPoint(endInnerBody);
     330//    Value * endPos = generateCountForwardZeroes(iBuilder, endFieldPhi);
     331//    Value * updatedEndFieldPhi = generateResetLowestBit(iBuilder, endFieldPhi);
     332//    endFieldPhi->addIncoming(updatedEndFieldPhi, endInnerBody);
     333//    endFieldPhi->addIncoming(updatedEndFieldPhi, gatherInit);
     334//    endPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(endPos, blockPos), iBuilder->getInt32Ty());
     335//    iBuilder->CreateStore(endPos, iBuilder->CreateGEP(endArray, {iBuilder->getInt32(0), endIndexPhi}));
     336//    Value * updatedEndIndexPhi = iBuilder->CreateAdd(endIndexPhi, ConstantInt::get(endIndexPhi->getType(), 1));
     337//    endIndexPhi->addIncoming(updatedEndIndexPhi, endInnerBody);
     338//    Value * filledEndPosBufferTest = iBuilder->CreateICmpEQ(updatedEndIndexPhi, ConstantInt::get(updatedEndIndexPhi->getType(), gatherCount));
     339//    iBuilder->CreateCondBr(filledEndPosBufferTest, gatherInit, endInnerCond);
     340
     341//    // GATHER INIT
     342//    iBuilder->SetInsertPoint(gatherInit);
     343//    Value * startArrayPtr = iBuilder->CreatePointerCast(startArray, PointerType::get(gatherVectorType, 0));
     344//    Value * endArrayPtr = iBuilder->CreatePointerCast(endArray, PointerType::get(gatherVectorType, 0));
     345//    CallGatherFunction(base, startArrayPtr, endArrayPtr, iBuilder->getInt32(32), minKeyBlockCount, maxKeyBlockCount);
     346//    // ... call hashing function ...
     347//    Value * untouchedArrayPtr = iBuilder->CreatePointerCast(iBuilder->CreateGEP(startArray, iBuilder->getInt32(vectorWidth)), PointerType::get(gatherVectorType, 0));
     348//    Value * untouchedCount = iBuilder->CreateSub(innerStartIndexPhi, ConstantInt::get(innerStartIndexPhi->getType(), gatherCount));
     349//    iBuilder->CreateMemCpy(startArrayPtr, untouchedArrayPtr, untouchedCount, 4);
     350//    innerStartIndexPhi->addIncoming(untouchedCount, gatherInit);
     351//    iBuilder->CreateBr(endInnerCond);
     352
     353
     354//    iBuilder->SetInsertPoint(exit);
     355
     356
     357
     358//    // need to save the start/end index still
     359//    kBuilder->finalize();
     360
     361//    function->dump();
     362//}
     363
    206364/** ------------------------------------------------------------------------------------------------------------- *
    207  * @brief generateScanMatch
     365 * @brief generateGatherKernel
    208366 ** ------------------------------------------------------------------------------------------------------------- */
    209 void SymbolTableBuilder::generateScannerKernel(KernelBuilder * kBuilder, const unsigned minKeyLength, const unsigned maxKeyLength, const unsigned scanWordBitWidth) {
    210 
    211     Type * intScanWordTy = iBuilder->getIntNTy(scanWordBitWidth);
     367void SymbolTableBuilder::generateGatherKernel(KernelBuilder * kBuilder, const std::vector<unsigned> & endpoints, const unsigned scanWordBitWidth) {
     368
     369    Type * const intScanWordTy = iBuilder->getIntNTy(scanWordBitWidth);
    212370    const unsigned fieldCount = iBuilder->getBitBlockWidth() / scanWordBitWidth;
    213     Type * scanWordVectorType = VectorType::get(intScanWordTy, fieldCount);
     371    Type * const scanWordVectorType = VectorType::get(intScanWordTy, fieldCount);
    214372    const unsigned vectorWidth = iBuilder->getBitBlockWidth() / 32;
    215     Type * gatherVectorType =  VectorType::get(iBuilder->getInt32Ty(), vectorWidth);
     373    const unsigned gatherCount = vectorWidth * 4;
     374    Type * const gatherVectorType =  VectorType::get(iBuilder->getInt32Ty(), vectorWidth);
     375    Type * const transposedVectorType = VectorType::get(iBuilder->getInt8Ty(), iBuilder->getBitBlockWidth() / 8);
     376
     377    unsigned minKeyLength = 0;
     378
     379    for (unsigned maxKeyLength : endpoints) {
     380
     381        kBuilder->addInternalState(iBuilder->getInt32Ty(), "StartIndex" + std::to_string(maxKeyLength));
     382        kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), iBuilder->getBitBlockWidth() + gatherCount), "StartArray" + std::to_string(maxKeyLength));
     383        kBuilder->addInternalState(iBuilder->getInt32Ty(), "EndIndex" + std::to_string(maxKeyLength));
     384        kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), gatherCount), "EndArray" + std::to_string(maxKeyLength));
     385
     386        kBuilder->addInputStream(1, "startStream" + std::to_string(maxKeyLength));
     387        kBuilder->addInputStream(1, "endStream" + std::to_string(maxKeyLength));
     388
     389        kBuilder->addOutputStream(maxKeyLength);
     390    }
    216391
    217392    const unsigned baseIdx = kBuilder->addInternalState(iBuilder->getInt8PtrTy(), "Base");
    218     const unsigned startIndexIdx = kBuilder->addInternalState(iBuilder->getInt32Ty(), "StartIndex");
    219     const unsigned startArrayIdx = kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), iBuilder->getBitBlockWidth() + vectorWidth), "StartArray");
    220     const unsigned endIndexIdx = kBuilder->addInternalState(iBuilder->getInt32Ty(), "EndIndex");
    221     const unsigned endArrayIdx = kBuilder->addInternalState(gatherVectorType, "EndArray");
    222 
    223     kBuilder->addInputStream(1, "startStream");
    224     kBuilder->addInputStream(1, "endStream");
    225393
    226394    Function * function = kBuilder->prepareFunction();
    227395
    228396    BasicBlock * const entry = iBuilder->GetInsertBlock();
     397
     398    BasicBlock * groupCond = BasicBlock::Create(mMod->getContext(), "groupCond", function, 0);
     399    BasicBlock * groupBody = BasicBlock::Create(mMod->getContext(), "groupBody", function, 0);
    229400
    230401    BasicBlock * startOuterCond = BasicBlock::Create(mMod->getContext(), "startOuterCond", function, 0);
     
    238409    BasicBlock * endInnerBody = BasicBlock::Create(mMod->getContext(), "endInnerBody", function, 0);
    239410
    240     BasicBlock * gatherInit = BasicBlock::Create(mMod->getContext(), "gatherInit", function, 0);
    241 
    242     BasicBlock * gatherFullCond = BasicBlock::Create(mMod->getContext(), "gatherFullCond", function, 0);
    243     BasicBlock * gatherFullBody = BasicBlock::Create(mMod->getContext(), "gatherFullBody", function, 0);
    244 
    245 //    BasicBlock * gatherPartialCond = BasicBlock::Create(mMod->getContext(), "gatherPartialCond", function, 0);
    246 //    BasicBlock * gatherPartialBody = BasicBlock::Create(mMod->getContext(), "gatherPartialBody", function, 0);
     411    BasicBlock * gather = BasicBlock::Create(mMod->getContext(), "gather", function, 0);
     412
     413    BasicBlock * nextGroup = BasicBlock::Create(mMod->getContext(), "nextGroup", function, 0);
    247414
    248415    BasicBlock * exit = BasicBlock::Create(mMod->getContext(), "exit", function, 0);
     
    253420    blockPos = iBuilder->CreateMul(blockPos, iBuilder->getInt64(iBuilder->getBitBlockWidth()));
    254421
     422    FunctionType * const functionType = FunctionType::get(PointerType::get(transposedVectorType, 0), {iBuilder->getInt8PtrTy(), PointerType::get(gatherVectorType, 0), PointerType::get(gatherVectorType, 0), iBuilder->getInt32Ty(), PointerType::get(transposedVectorType, 0)}, false);
     423    Value * gatherFunctionPtrArray = iBuilder->CreateAlloca(PointerType::get(functionType, 0), iBuilder->getInt32(endpoints.size()));
     424    unsigned i = 0;
     425    minKeyLength = 0;
     426    for (unsigned maxKeyLength : endpoints) {
     427        const unsigned minCount = (minKeyLength / 4);
     428        const unsigned maxCount = ((maxKeyLength + 3) / 4);
     429        Value * ptr = iBuilder->CreateGEP(gatherFunctionPtrArray, iBuilder->getInt32(i++));
     430        iBuilder->CreateStore(generateGatherFunction(transposedVectorType, minCount, maxCount), ptr);
     431        minKeyLength = maxKeyLength;
     432    }
     433    iBuilder->CreateBr(groupCond);
     434
     435    // GROUP COND
     436    iBuilder->SetInsertPoint(groupCond);
     437    PHINode * groupIV = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
     438    groupIV->addIncoming(iBuilder->getInt32(0), entry);
     439    Value * groupTest = iBuilder->CreateICmpNE(groupIV, iBuilder->getInt32(endpoints.size()));
     440    iBuilder->CreateCondBr(groupTest, groupBody, exit);
     441
     442    // GROUP BODY
     443    iBuilder->SetInsertPoint(groupBody);
    255444    // if two positions cannot be in the same vector element, we could possibly do some work in parallel here.
    256     Value * startIndex = iBuilder->CreateLoad(kBuilder->getInternalState(startIndexIdx), "startIndex");
    257     Value * startArray = kBuilder->getInternalState(startArrayIdx);
    258     Value * startStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(0)), scanWordVectorType, "startStream");
    259 
    260     Value * endIndex = iBuilder->CreateLoad(kBuilder->getInternalState(endIndexIdx), "endIndex");
    261     Value * endArray = kBuilder->getInternalState(endArrayIdx);
    262     Value * endStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(1)), scanWordVectorType, "endStream");
     445    Value * input0 = iBuilder->CreateMul(groupIV, iBuilder->getInt32(2));
     446    Value * startStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(input0)), scanWordVectorType, "startStream");
     447    Value * input1 = iBuilder->CreateAdd(input0, iBuilder->getInt32(1));
     448    Value * endStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(input1)), scanWordVectorType, "endStream");
     449
     450    Value * internal0 = iBuilder->CreateMul(groupIV, iBuilder->getInt32(4));
     451    Value * startIndex = iBuilder->CreateLoad(kBuilder->getInternalState(internal0), "startIndex");
     452    Value * internal1 = iBuilder->CreateAdd(internal0, iBuilder->getInt32(1));
     453    Value * startArray = kBuilder->getInternalState(internal1);
     454    Value * internal2 = iBuilder->CreateAdd(internal1, iBuilder->getInt32(1));
     455    Value * endIndex = iBuilder->CreateLoad(kBuilder->getInternalState(internal2), "endIndex");
     456    Value * internal3 = iBuilder->CreateAdd(internal2, iBuilder->getInt32(1));
     457    Value * endArray = kBuilder->getInternalState(internal3);
     458
     459    Value * const buffer = kBuilder->getOutputStream(groupIV);
    263460
    264461    iBuilder->CreateBr(startOuterCond);
     462
     463    // START OUTER COND
    265464    iBuilder->SetInsertPoint(startOuterCond);
    266 
     465    PHINode * startIndexPhi1 = iBuilder->CreatePHI(startIndex->getType(), 2);
     466    startIndexPhi1->addIncoming(startIndex, groupBody);
    267467    PHINode * startIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
    268     startIV->addIncoming(iBuilder->getInt64(0), entry);
     468    startIV->addIncoming(iBuilder->getInt64(0), groupBody);
    269469    Value * startOuterTest = iBuilder->CreateICmpNE(startIV, iBuilder->getInt64(fieldCount));
    270470    iBuilder->CreateCondBr(startOuterTest, startOuterBody, endOuterCond);
    271471
     472    // START OUTER BODY
    272473    iBuilder->SetInsertPoint(startOuterBody);
    273474    Value * startField = iBuilder->CreateExtractElement(startStream, startIV);
     
    275476    iBuilder->CreateBr(startInnerCond);
    276477
     478    // START INNER COND
    277479    iBuilder->SetInsertPoint(startInnerCond);
    278     PHINode * startIndexPhi = iBuilder->CreatePHI(startIndex->getType(), 2);
    279     startIndexPhi->addIncoming(startIndex, startOuterBody);
     480    PHINode * startIndexPhi3 = iBuilder->CreatePHI(startIndex->getType(), 2);
     481    startIndexPhi3->addIncoming(startIndexPhi1, startOuterBody);
     482    startIndexPhi1->addIncoming(startIndexPhi3, startInnerCond);
    280483    PHINode * startFieldPhi = iBuilder->CreatePHI(intScanWordTy, 2);
    281484    startFieldPhi->addIncoming(startField, startOuterBody);
     
    283486    iBuilder->CreateCondBr(test, startInnerBody, startOuterCond);
    284487
     488    // START INNER BODY
    285489    iBuilder->SetInsertPoint(startInnerBody);
    286490    Value * startPos = generateCountForwardZeroes(iBuilder, startFieldPhi);
    287491    startFieldPhi->addIncoming(generateResetLowestBit(iBuilder, startFieldPhi), startInnerBody);
    288492    startPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(startPos, blockPos), iBuilder->getInt32Ty());
    289     iBuilder->CreateStore(startPos, iBuilder->CreateGEP(startArray, {iBuilder->getInt32(0), startIndexPhi}));
    290     startIndexPhi->addIncoming(iBuilder->CreateAdd(startIndexPhi, ConstantInt::get(startIndexPhi->getType(), 1)), startInnerBody);
     493    iBuilder->CreateStore(startPos, iBuilder->CreateGEP(startArray, {iBuilder->getInt32(0), startIndexPhi3}));
     494    startIndexPhi3->addIncoming(iBuilder->CreateAdd(startIndexPhi3, ConstantInt::get(startIndexPhi3->getType(), 1)), startInnerBody);
    291495    iBuilder->CreateBr(startInnerCond);
     496
    292497    // END POINT OUTER COND
    293498    iBuilder->SetInsertPoint(endOuterCond);
     499    PHINode * endIndexPhi1 = iBuilder->CreatePHI(endIndex->getType(), 2);
     500    endIndexPhi1->addIncoming(endIndex, startOuterCond);
     501    PHINode * startIndexPhi2 = iBuilder->CreatePHI(startIndex->getType(), 2);
     502    startIndexPhi2->addIncoming(startIndexPhi1, startOuterCond);
    294503    PHINode * endIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
    295504    endIV->addIncoming(iBuilder->getInt64(0), startOuterCond);
    296505    Value * endOuterTest = iBuilder->CreateICmpNE(endIV, iBuilder->getInt64(fieldCount));
    297     iBuilder->CreateCondBr(endOuterTest, endOuterBody, exit);
     506    iBuilder->CreateCondBr(endOuterTest, endOuterBody, nextGroup);
     507
    298508    // END POINT OUTER BODY
    299509    iBuilder->SetInsertPoint(endOuterBody);
     
    301511    endIV->addIncoming(iBuilder->CreateAdd(endIV, iBuilder->getInt64(1)), endInnerCond);
    302512    iBuilder->CreateBr(endInnerCond);
     513
    303514    // END POINT INNER COND
    304515    iBuilder->SetInsertPoint(endInnerCond);
    305     PHINode * endIndexPhi = iBuilder->CreatePHI(endIndex->getType(), 3);
    306     endIndexPhi->addIncoming(endIndex, endOuterBody);
     516    startIndexPhi3 = iBuilder->CreatePHI(startIndexPhi2->getType(), 3);
     517    startIndexPhi3->addIncoming(startIndexPhi2, endOuterBody);
     518    startIndexPhi3->addIncoming(startIndexPhi3, endInnerBody);
     519    startIndexPhi2->addIncoming(startIndexPhi3, endInnerCond);
     520    PHINode * endIndexPhi2 = iBuilder->CreatePHI(endIndex->getType(), 3);
     521    endIndexPhi2->addIncoming(endIndexPhi1, endOuterBody);
     522    endIndexPhi1->addIncoming(endIndexPhi2, endInnerCond);
     523    endIndexPhi2->addIncoming(ConstantInt::getNullValue(endIndex->getType()), gather);
    307524    PHINode * endFieldPhi = iBuilder->CreatePHI(intScanWordTy, 3);
    308525    endFieldPhi->addIncoming(endField, endOuterBody);
    309526    Value * endInnerTest = iBuilder->CreateICmpNE(endFieldPhi, ConstantInt::getNullValue(intScanWordTy));
    310527    iBuilder->CreateCondBr(endInnerTest, endInnerBody, endOuterCond);
     528
    311529    // END POINT INNER BODY
    312530    iBuilder->SetInsertPoint(endInnerBody);
     
    314532    Value * updatedEndFieldPhi = generateResetLowestBit(iBuilder, endFieldPhi);
    315533    endFieldPhi->addIncoming(updatedEndFieldPhi, endInnerBody);
     534    endFieldPhi->addIncoming(updatedEndFieldPhi, gather);
    316535    endPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(endPos, blockPos), iBuilder->getInt32Ty());
    317     iBuilder->CreateStore(endPos, iBuilder->CreateGEP(endArray, {iBuilder->getInt32(0), endIndexPhi}));
    318     Value * updatedEndIndexPhi = iBuilder->CreateAdd(endIndexPhi, ConstantInt::get(endIndexPhi->getType(), 1));
    319     endIndexPhi->addIncoming(updatedEndIndexPhi, endInnerBody);
    320     Value * filledEndPosBufferTest = iBuilder->CreateICmpEQ(updatedEndIndexPhi, ConstantInt::get(updatedEndIndexPhi->getType(), vectorWidth));
    321     iBuilder->CreateCondBr(filledEndPosBufferTest, gatherInit, endInnerCond);
    322     // GATHER INIT
    323     iBuilder->SetInsertPoint(gatherInit);
    324     Value * rawTokenBuffer = iBuilder->CreateAlloca(ArrayType::get(gatherVectorType, (maxKeyLength / 4) + (maxKeyLength % 4) != 0 ? 1 : 0));
    325     rawTokenBuffer = iBuilder->CreatePointerCast(rawTokenBuffer, PointerType::get(gatherVectorType, 0));
    326     Value * const startPositions = iBuilder->CreateAlignedLoad(iBuilder->CreatePointerCast(startArray, PointerType::get(gatherVectorType, 0)), 4);
    327     iBuilder->CreateBr(gatherFullCond);
    328     // GATHER FULL COND
    329     iBuilder->SetInsertPoint(gatherFullCond);
    330 
    331     endIndexPhi->addIncoming(iBuilder->getInt32(0), gatherFullCond);
    332     endFieldPhi->addIncoming(updatedEndFieldPhi, gatherFullCond);
    333 
    334     PHINode * fullGatherIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
    335     fullGatherIV->addIncoming(iBuilder->getInt64(0), gatherInit);
    336     PHINode * startPositionsPhi = iBuilder->CreatePHI(startPositions->getType(), 2);
    337     startPositionsPhi->addIncoming(startPositions, gatherInit);
    338 
    339     Value * fullGatherTest = iBuilder->CreateICmpNE(fullGatherIV, iBuilder->getInt64(minKeyLength / vectorWidth));
    340     iBuilder->CreateCondBr(fullGatherTest, gatherFullBody, endInnerCond);
    341     // GATHER FULL BODY
    342     iBuilder->SetInsertPoint(gatherFullBody);
    343     Value * gathered = generateGather(base, startPositionsPhi);
    344     startPositionsPhi->addIncoming(iBuilder->CreateAdd(startPositionsPhi, iBuilder->CreateVectorSplat(vectorWidth, iBuilder->getInt32(4))), gatherFullBody);
    345     iBuilder->CreateAlignedStore(gathered, iBuilder->CreateGEP(rawTokenBuffer, fullGatherIV), 4);
    346     fullGatherIV->addIncoming(iBuilder->CreateAdd(fullGatherIV, iBuilder->getInt64(1)), gatherFullBody);
    347     iBuilder->CreateBr(gatherFullCond);
     536    iBuilder->CreateStore(endPos, iBuilder->CreateGEP(endArray, {iBuilder->getInt32(0), endIndexPhi2}));
     537    Value * updatedEndIndexPhi = iBuilder->CreateAdd(endIndexPhi2, ConstantInt::get(endIndexPhi2->getType(), 1));
     538    endIndexPhi2->addIncoming(updatedEndIndexPhi, endInnerBody);
     539    Value * filledEndPosBufferTest = iBuilder->CreateICmpEQ(updatedEndIndexPhi, ConstantInt::get(updatedEndIndexPhi->getType(), gatherCount));
     540    iBuilder->CreateCondBr(filledEndPosBufferTest, gather, endInnerCond);
     541
     542    // GATHER
     543    iBuilder->SetInsertPoint(gather);
     544    Value * startArrayPtr = iBuilder->CreatePointerCast(startArray, PointerType::get(gatherVectorType, 0));
     545    Value * endArrayPtr = iBuilder->CreatePointerCast(endArray, PointerType::get(gatherVectorType, 0));
     546    Value * const bufferPtr = iBuilder->CreatePointerCast(buffer, PointerType::get(transposedVectorType, 0));
     547    Value * gatherFunctionPtr = iBuilder->CreateLoad(iBuilder->CreateGEP(gatherFunctionPtrArray, groupIV));
     548    iBuilder->CreateCall5(gatherFunctionPtr, base, startArrayPtr, endArrayPtr, iBuilder->getInt32(32), bufferPtr);
     549
     550    // ... call hashing function ...
     551    Value * untouchedArrayPtr = iBuilder->CreatePointerCast(iBuilder->CreateGEP(startArray, iBuilder->getInt32(vectorWidth)), PointerType::get(gatherVectorType, 0));
     552    Value * untouchedCount = iBuilder->CreateSub(startIndexPhi3, ConstantInt::get(startIndexPhi3->getType(), gatherCount));
     553    iBuilder->CreateMemCpy(startArrayPtr, untouchedArrayPtr, untouchedCount, 4);
     554    startIndexPhi3->addIncoming(untouchedCount, gather);
     555    iBuilder->CreateBr(endInnerCond);
     556
     557    // NEXT GROUP
     558    iBuilder->SetInsertPoint(nextGroup);
     559    kBuilder->setInternalState(internal0, startIndexPhi2);
     560    kBuilder->setInternalState(internal2, endIndexPhi1);
     561    groupIV->addIncoming(iBuilder->CreateAdd(groupIV, ConstantInt::get(groupIV->getType(), 1)), nextGroup);
     562    iBuilder->CreateBr(groupCond);
    348563
    349564    iBuilder->SetInsertPoint(exit);
    350     // need to save the start/end index still
    351565    kBuilder->finalize();
    352566}
     567
     568/** ------------------------------------------------------------------------------------------------------------- *
     569 * @brief generateGatherFunction
         *
         * Returns the module function named "gather_<minCount>_<maxCount>", emitting its IR first when the module does
         * not already contain a function of that name. The builder's insertion point is saved before emission and
         * restored afterwards.
         *
         * The emitted function has the signature
         *     resultType * (i8 * base, gatherVector * startArray, gatherVector * endArray, i32 count, resultType * buffer)
         * where gatherVector is <(bitBlockWidth / 32) x i32>. Its body gathers token data into a stack buffer, then
         * shuffles ("transposes") that data into 'buffer' and returns 'buffer'.
         *
         * @param resultType element type of the output buffer (also determines the returned pointer type)
         * @param minCount   number of leading gathers per key that are masked only by the active-lane mask
         * @param maxCount   total number of gathers per key; must be greater than minCount (asserted)
         * @return the existing or newly generated LLVM function
     570 ** ------------------------------------------------------------------------------------------------------------- */
     571Function * SymbolTableBuilder::generateGatherFunction(Type * const resultType, const unsigned minCount, const unsigned maxCount) {
     572
     573    assert (maxCount > minCount);
     574
            // The name encodes both counts, so each (minCount, maxCount) pair is generated at most once per module.
     575    const std::string functionName = "gather_" + std::to_string(minCount) + "_" + std::to_string(maxCount);
     576    Function * function = mMod->getFunction(functionName);
     577    if (function == nullptr) {
     578
                // Remember the caller's insertion point; it is restored once this function's body has been emitted.
     579        const auto ip = iBuilder->saveIP();
     580
     581        const unsigned vectorWidth = iBuilder->getBitBlockWidth() / 32;
     582        Type * const gatherVectorType =  VectorType::get(iBuilder->getInt32Ty(), vectorWidth);
     583        Type * const gatherVectorArrayType = ArrayType::get(gatherVectorType, maxCount);
     584
     585        FunctionType * const functionType = FunctionType::get(PointerType::get(resultType, 0), {iBuilder->getInt8PtrTy(), PointerType::get(gatherVectorType, 0), PointerType::get(gatherVectorType, 0), iBuilder->getInt32Ty(), PointerType::get(resultType, 0)}, false);
     586        function = Function::Create(functionType, GlobalValue::ExternalLinkage, functionName, mMod);
     587        function->setCallingConv(CallingConv::C);
                // NOTE(review): attribute indices 1-3 presumably denote base, startArray and endArray; confirm the
                // index convention of the LLVM version in use.
     588        function->setDoesNotCapture(1);
     589        function->setDoesNotCapture(2);
     590        function->setDoesNotCapture(3);
     591        function->setDoesNotThrow();
     592
     593        Function::arg_iterator args = function->arg_begin();
     594        Value * const base = args++;
     595        base->setName("base");
     596        Value * const startArray = args++;
     597        startArray->setName("startArray");
     598        Value * const endArray = args++;
     599        endArray->setName("endArray");
     600        Value * const count = args++;
     601        count->setName("count");
     602        Value * const transposedBuffer = args++;
     603        transposedBuffer->setName("buffer");
     604
     605        BasicBlock * entry = BasicBlock::Create(mMod->getContext(), "entry", function, 0);
     606        BasicBlock * gatherCond = BasicBlock::Create(mMod->getContext(), "gatherCond", function, 0);
     607        BasicBlock * partialGatherCond = BasicBlock::Create(mMod->getContext(), "partialGatherCond", function, 0);
     608        BasicBlock * partialGatherBody = BasicBlock::Create(mMod->getContext(), "partialGatherBody", function, 0);
     609        BasicBlock * gatherBody = BasicBlock::Create(mMod->getContext(), "gatherBody", function, 0);
     610        BasicBlock * transposeCond = BasicBlock::Create(mMod->getContext(), "transposeCond", function, 0);
     611        BasicBlock * transposeBody = BasicBlock::Create(mMod->getContext(), "transposeBody", function, 0);
     612        BasicBlock * exit = BasicBlock::Create(mMod->getContext(), "exit", function, 0);
     613
                // Constant vector of 4s: the per-gather advance of startPos and the "fewer than four bytes" threshold.
     614        Value * const four = iBuilder->CreateVectorSplat(vectorWidth, iBuilder->getInt32(4));
     615
     616        // ENTRY
     617        iBuilder->SetInsertPoint(entry);
                // NOTE(review): this allocates 4 elements of type [maxCount x gatherVector], yet the GEPs below index it
                // as {blockCount or transposeIV (< maxCount), gatherIV or i (< 4)}. The index order looks swapped
                // relative to the allocated shape unless maxCount == 4; confirm.
     618        Value * const untransposedBuffer = iBuilder->CreateAlloca(gatherVectorArrayType, iBuilder->getInt32(4), "untransposedBuffer");
     619        iBuilder->CreateBr(gatherCond);
     620
     621        // FULL GATHER COND
                // Loop header: gatherIV runs from 0 up to 4; remainingLanes starts at 'count' and drops by vectorWidth
                // on every iteration.
     622        iBuilder->SetInsertPoint(gatherCond);
     623        PHINode * remainingLanes = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
     624        remainingLanes->addIncoming(count, entry);
     625        PHINode * gatherIV = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
     626        gatherIV->addIncoming(iBuilder->getInt32(0), entry);
     627        Value * gatherLoopTest = iBuilder->CreateICmpNE(gatherIV, iBuilder->getInt32(4));
     628        iBuilder->CreateCondBr(gatherLoopTest, partialGatherCond, transposeCond);
     629
     630        // PARTIAL GATHER COND
                // Signed comparison: when at least vectorWidth lanes remain, skip the mask computation entirely.
     631        iBuilder->SetInsertPoint(partialGatherCond);
     632        Value * partialGatherLoopTest = iBuilder->CreateICmpSGE(remainingLanes, iBuilder->getInt32(vectorWidth));
     633        iBuilder->CreateCondBr(partialGatherLoopTest, gatherBody, partialGatherBody);
     634
     635        // PARTIAL GATHER BODY
                // Build a lane mask: an all-ones block-wide integer is logically shifted right by 32 bits for each lane
                // that is not remaining, then reinterpreted as a gather vector.
                // NOTE(review): if remainingLanes reaches zero or below, the shift amount is >= the register width;
                // confirm callers always pass a 'count' that prevents this.
     636        iBuilder->SetInsertPoint(partialGatherBody);
     637        Type * registerType = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
     638        Value * maskedLanes = iBuilder->CreateSub(iBuilder->getInt32(vectorWidth), remainingLanes);
     639        maskedLanes = iBuilder->CreateMul(maskedLanes, iBuilder->getInt32(32));
     640        maskedLanes = iBuilder->CreateZExt(maskedLanes, registerType);
     641        maskedLanes = iBuilder->CreateLShr(Constant::getAllOnesValue(registerType), maskedLanes);
     642        maskedLanes = iBuilder->CreateBitCast(maskedLanes, gatherVectorType);
     643
     644        iBuilder->CreateBr(gatherBody);
     645
     646        // FULL GATHER BODY
     647        iBuilder->SetInsertPoint(gatherBody);
                // All lanes are active when arriving straight from the cond block; otherwise use the computed mask.
     648        PHINode * activeLanes = iBuilder->CreatePHI(gatherVectorType, 2, "activeLanes");
     649        activeLanes->addIncoming(Constant::getAllOnesValue(gatherVectorType), partialGatherCond);
     650        activeLanes->addIncoming(maskedLanes, partialGatherBody);
     651
                // The first minCount gathers are masked only by activeLanes; startPos advances by 4 after each one.
                // (generateMaskedGather is defined elsewhere; presumably it loads one 32-bit word per enabled lane
                // from base + startPos.)
     652        Value * startPos = iBuilder->CreateAlignedLoad(iBuilder->CreateGEP(startArray, gatherIV), 4);
     653        for (unsigned blockCount = 0; blockCount < minCount; ++blockCount) {
     654            Value * tokenData = generateMaskedGather(base, startPos, activeLanes);
     655            startPos = iBuilder->CreateAdd(startPos, four);
     656            iBuilder->CreateAlignedStore(tokenData, iBuilder->CreateGEP(untransposedBuffer, {iBuilder->getInt32(blockCount), gatherIV}), 4);
     657        }
     658
                // The remaining gathers (minCount up to maxCount) are additionally masked per lane by startPos < endPos.
     659        Value * const endPos = iBuilder->CreateAlignedLoad(iBuilder->CreateGEP(endArray, gatherIV), 4);
     660        for (unsigned blockCount = minCount; blockCount < maxCount; ++blockCount) {
     661            // if we have not fully gathered the data for this key
     662            Value * atLeastOneByte = iBuilder->CreateAnd(iBuilder->CreateSExt(iBuilder->CreateICmpULT(startPos, endPos), startPos->getType()), activeLanes);
     663            // gather it ...
     664            Value * tokenData = generateMaskedGather(base, startPos, atLeastOneByte);
     665            // and compute how much data is remaining.
     666            Value * remaining = iBuilder->CreateSub(endPos, startPos);
     667            // if this token only has 1 to 3 bytes remaining ...
     668            Value * lessThanFourBytes = iBuilder->CreateSExt(iBuilder->CreateICmpSLT(remaining, four), remaining->getType());
     669            Value * betweenOneAndThreeBytes = iBuilder->CreateAnd(atLeastOneByte, lessThanFourBytes);
     670            // determine how many bytes (bits?) do *not* belong to the token
     671            remaining = iBuilder->CreateSub(four, iBuilder->CreateAnd(remaining, betweenOneAndThreeBytes));
                    // NOTE(review): with the scaling below disabled, the shift amount is a byte count (1..4) applied as
                    // a bit count, and lanes outside the 1-to-3-byte case are shifted by 4; confirm the intended mask.
     672            // remaining = iBuilder->CreateShl(remaining, ConstantInt::get(remaining->getType(), 3));
     673            // then mask them out prior to storing the value
     674            Value * partialTokenMask = iBuilder->CreateLShr(ConstantInt::getAllOnesValue(remaining->getType()), remaining);
     675            tokenData = iBuilder->CreateAnd(partialTokenMask, tokenData);
     676            iBuilder->CreateAlignedStore(tokenData, iBuilder->CreateGEP(untransposedBuffer, {iBuilder->getInt32(blockCount), gatherIV}), 4);
                    // No advance after the final gather: startPos is not read again in this iteration of the loop.
     677            if (blockCount < (maxCount - 1)) {
     678                startPos = iBuilder->CreateAdd(startPos, four);
     679            }
     680        }
                // Back edge: step the induction variable and consume vectorWidth lanes.
     681        gatherIV->addIncoming(iBuilder->CreateAdd(gatherIV, iBuilder->getInt32(1)), gatherBody);
     682        remainingLanes->addIncoming(iBuilder->CreateSub(remainingLanes, iBuilder->getInt32(vectorWidth)), gatherBody);
     683        iBuilder->CreateBr(gatherCond);
     684
     685        // TRANSPOSE COND
                // Loop header: transposeIV runs from 0 up to maxCount.
     686        iBuilder->SetInsertPoint(transposeCond);
     687        PHINode * transposeIV = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
     688        transposeIV->addIncoming(iBuilder->getInt32(0), gatherCond);
     689        Value * transposeLoopTest = iBuilder->CreateICmpNE(transposeIV, iBuilder->getInt32(maxCount));
     690        iBuilder->CreateCondBr(transposeLoopTest, transposeBody, exit);
     691
     692        // TRANSPOSE BODY
     693        iBuilder->SetInsertPoint(transposeBody);
     694
                // Load the four gathered vectors addressed by {transposeIV, 0..3}.
     695        Value * value[4];
     696        Value * temporary[4];
     697        for (unsigned i = 0; i < 4; ++i) {
     698            Value * const ptr = iBuilder->CreateGEP(untransposedBuffer, {transposeIV, iBuilder->getInt32(i)});
     699            value[i] = iBuilder->CreateAlignedLoad(ptr, 4);
     700        }
                // Two shuffle rounds (16-bit fields, then 8-bit fields). Each round views the vectors at the current
                // field width and, for each adjacent pair, selects the even-indexed ("low") and odd-indexed ("high")
                // fields of the pair's concatenation.
     701        for (unsigned fieldWidth = 16; fieldWidth != 4; fieldWidth /= 2) {
     702            const unsigned fieldCount = iBuilder->getBitBlockWidth() / fieldWidth;
     703            VectorType * const vecType = VectorType::get(IntegerType::get(mMod->getContext(), fieldWidth), fieldCount);
     704            std::vector<Constant *> lowFields(fieldCount);
     705            std::vector<Constant *> highFields(fieldCount);
     706            for (unsigned j = 0; j < fieldCount; ++j) {
     707                lowFields[j] = iBuilder->getInt32(j * 2);
     708                highFields[j] = iBuilder->getInt32(j * 2 + 1);
     709            }
     710            Constant * const lowVector = ConstantVector::get(lowFields);
     711            Constant * const highVector = ConstantVector::get(highFields);
                    // Low results fill temporary[0..1]; high results fill temporary[2..3].
     712            for (unsigned i = 0; i < 4; i += 2) {
     713                value[i] = iBuilder->CreateBitCast(value[i], vecType);
     714                value[i + 1] = iBuilder->CreateBitCast(value[i + 1], vecType);
     715                temporary[i / 2] = iBuilder->CreateShuffleVector(value[i], value[i + 1], lowVector);
     716                temporary[(i / 2) + 2] = iBuilder->CreateShuffleVector(value[i], value[i + 1], highVector);
     717            }
                    // The shuffled vectors become the input of the next round (and the final result).
     718            std::swap(value, temporary);
     719        }
                // Output slots are 4 * transposeIV + i; Or is used in place of Add because the low two bits of the
                // shifted offset are zero.
     720        Value * offset = iBuilder->CreateShl(transposeIV, ConstantInt::get(transposeIV->getType(), 2));
     721        for (unsigned i = 0; i < 4; ++i) {
     722            Value * index = offset;
     723            if (i) {
     724                index = iBuilder->CreateOr(offset, iBuilder->getInt32(i));
     725            }
     726            Value * ptr = iBuilder->CreateGEP(transposedBuffer, index);
     727            iBuilder->CreateAlignedStore(value[i], ptr, 4);
     728        }
     729        transposeIV->addIncoming(iBuilder->CreateAdd(transposeIV, iBuilder->getInt32(1)), transposeBody);
     730        iBuilder->CreateBr(transposeCond);
     731
     732        // EXIT
                // The generated function hands back the caller-supplied output buffer pointer.
     733        iBuilder->SetInsertPoint(exit);
     734        iBuilder->CreateRet(transposedBuffer);
     735
     736        iBuilder->restoreIP(ip);
     737    }
     738
     739    return function;
     740}
     741
    353742
    354743/** ------------------------------------------------------------------------------------------------------------- *
     
    373762    mLeadingKernel = new KernelBuilder("leading", mMod, iBuilder, bufferSize);
    374763    mSortingKernel = new KernelBuilder("sorting", mMod, iBuilder, bufferSize);
    375     mScannerKernel = new KernelBuilder("scanner", mMod, iBuilder, 1);
     764    mGatherKernel = new KernelBuilder("gathering", mMod, iBuilder, 1);
    376765
    377766    generateS2PKernel(mMod, iBuilder, mS2PKernel);
     
    387776    releaseSlabAllocatorMemory();
    388777
    389     generateScannerKernel(mScannerKernel, 1, 1, 64);
     778    generateGatherKernel(mGatherKernel, endpoints, 64);
    390779
    391780}
     
    516905    delete mLeadingKernel;
    517906    delete mSortingKernel;
    518     delete mScannerKernel;
    519 }
    520 
    521 
    522 }
     907    delete mGatherKernel;
     908}
     909
     910
     911}
  • icGREP/icgrep-devel/icgrep/kernels/symboltablepipeline.h

    r4991 r4992  
    3232    pablo::PabloFunction * generateSortingFunction(const pablo::PabloFunction * const leading, const std::vector<unsigned> & endpoints);
    3333
    34     void generateScannerKernel(KernelBuilder * kBuilder, const unsigned minKeyLength, const unsigned maxKeyLength, const unsigned scanWordBitWidth = 64);
    35     Function * generateScanWordRoutine(KernelBuilder * const kBuilder, const unsigned scanWordBitWidth);
     34    void generateGatherKernel(KernelBuilder * kBuilder, const std::vector<unsigned> & endpoints, const unsigned scanWordBitWidth = 64);
     35    Function * generateGatherFunction(Type * const transposedVectorType, const unsigned minCount, const unsigned maxCount);
    3636
    3737    Value * generateGather(Value * const base, Value * const vindex);
     
    4646    KernelBuilder *                     mLeadingKernel;
    4747    KernelBuilder *                     mSortingKernel;
    48     KernelBuilder *                     mScannerKernel;
     48    KernelBuilder *                     mGatherKernel;
    4949
    5050    unsigned                            mLongestLookahead;
Note: See TracChangeset for help on using the changeset viewer.