Changeset 4995


Ignore:
Timestamp:
Mar 30, 2016, 4:42:52 PM (20 months ago)
Author:
nmedfort
Message:

More work on symbol table; unexpected bug with 4KiB-one page tests observed.

Location:
icGREP/icgrep-devel/icgrep
Files:
10 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r4991 r4995  
    110110add_executable(icgrep icgrep.cpp toolchain.cpp grep_engine.cpp object_cache.cpp ${PRECOMPILED_FILES})
    111111add_executable(casefold casefold.cpp kernels/p2s_kernel.cpp kernels/stdout_kernel.cpp kernels/casefold_pipeline.cpp)
    112 add_executable(symtbl symboltable.cpp kernels/symboltablepipeline.cpp toolchain.cpp grep_engine.cpp object_cache.cpp ${PRECOMPILED_FILES})
     112add_executable(symtbl symboltable.cpp kernels/symboltablepipeline.cpp kernels/stdout_kernel.cpp toolchain.cpp grep_engine.cpp object_cache.cpp ${PRECOMPILED_FILES})
    113113
    114114IF(ENABLE_PREGENERATED_UCD_FUNCTIONS)
  • icGREP/icgrep-devel/icgrep/IDISA/idisa_builder.cpp

    r4986 r4995  
    7878        printRegister = function;
    7979    }
     80    assert (value->getType()->isVectorTy());
    8081    CreateCall2(printRegister, geti8StrVal(*mMod, name.c_str(), name), CreateBitCast(value, mBitBlockType));
    8182}
     
    8788        Function * function = Function::Create(FT, Function::InternalLinkage, "PrintInt", mMod);
    8889        auto arg = function->arg_begin();
    89         std::string out = "%-40s = %i\n";
     90        std::string out = "%-40s = %" PRIi64 "\n";
    9091        BasicBlock * entry = BasicBlock::Create(mMod->getContext(), "entry", function);
    9192        IRBuilder<> builder(entry);
     
    103104        printRegister = function;
    104105    }
    105     CreateCall2(printRegister, geti8StrVal(*mMod, name.c_str(), name), CreateBitCast(value, getInt64Ty()));
     106    Value * num = nullptr;
     107    if (value->getType()->isPointerTy()) {
     108        num = CreatePtrToInt(value, getInt64Ty());
     109    } else {
     110        num = CreateZExtOrBitCast(value, getInt64Ty());
     111    }
     112    assert (num->getType()->isIntegerTy());
     113    CreateCall2(printRegister, geti8StrVal(*mMod, name.c_str(), name), num);
    106114}
    107115
  • icGREP/icgrep-devel/icgrep/icgrep-devel.files

    r4991 r4995  
    890890kernels/casefold_pipeline.h
    891891kernels/casefold_pipeline.cpp
     892kernels/symboltablepipeline.cpp
     893kernels/symboltablepipeline.h
     894kernels/kernel.cpp
     895kernels/kernel.h
     896kernels/p2s_kernel.cpp
     897kernels/instance.h
     898kernels/casefold_pipeline.cpp
     899kernels/stdout_kernel.h
     900kernels/casefold_pipeline.h
     901kernels/p2s_kernel.h
     902kernels/stdout_kernel.cpp
     903kernels/pipeline.cpp
     904kernels/scanmatchgen.cpp
     905kernels/s2p_kernel.cpp
     906kernels/pipeline.h
     907kernels/deletion.h
     908kernels/deletion.cpp
     909kernels/streamset.h
     910kernels/s2p_kernel.h
     911kernels/scanmatchgen.h
     912kernels/lane_pipeline.cpp
     913kernels/lane_pipeline.h
     914kernels/lane_s2p_kernel.cpp
     915kernels/lane_s2p_kernel.h
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r4992 r4995  
    7070
    7171Value * KernelBuilder::getInternalState(Value * const instance, disable_implicit_conversion<Value *> index) {
    72     assert (index->getType() == iBuilder->getInt32Ty());
     72    assert (index->getType()->isIntegerTy());
    7373    return iBuilder->CreateGEP(instance, {iBuilder->getInt64(0), iBuilder->getInt32(INTERNAL_STATE), index});
    7474}
     
    137137Value * KernelBuilder::getInputStream(Value * const instance, disable_implicit_conversion<Value *> index, const unsigned streamOffset) {
    138138    assert (instance && index);
    139     Value * inputStream = iBuilder->CreateLoad(iBuilder->CreateGEP(instance,
     139    assert (index->getType()->isIntegerTy());
     140    Value * const inputStreamSet = iBuilder->CreateLoad(iBuilder->CreateGEP(instance,
    140141        {iBuilder->getInt32(0), iBuilder->getInt32(INPUT_STREAM_SET), iBuilder->getInt32(0)}));
    141142    Value * modFunction = iBuilder->CreateLoad(iBuilder->CreateGEP(instance,
     
    145146        offset = iBuilder->CreateAdd(offset, ConstantInt::get(offset->getType(), streamOffset));
    146147    }
    147     assert (index->getType() == iBuilder->getInt32Ty());
    148     return iBuilder->CreateGEP(inputStream, { iBuilder->CreateCall(modFunction, offset), index });
     148    if (LLVM_LIKELY(isa<ConstantInt>(index.get()) || inputStreamSet->getType()->getPointerElementType()->isArrayTy())) {
     149        return iBuilder->CreateGEP(inputStreamSet, { iBuilder->CreateCall(modFunction, offset), index });
     150    } else {
     151        throw std::runtime_error("Cannot access the input stream with a non-constant value unless all input stream types are identical!");
     152    }
    149153}
    150154
     
    205209Value * KernelBuilder::getOutputStream(Value * const instance, disable_implicit_conversion<Value *> index, const unsigned streamOffset) {
    206210    assert (instance && index);
    207     assert (index->getType() == iBuilder->getInt32Ty());
    208     return iBuilder->CreateGEP(instance, {iBuilder->getInt32(0), iBuilder->getInt32(OUTPUT_STREAM_SET), getStreamOffset(instance, streamOffset), index});
     211    assert (index->getType()->isIntegerTy());
     212    if (LLVM_LIKELY(isa<ConstantInt>(index.get()))) {
     213        return iBuilder->CreateGEP(instance, {iBuilder->getInt32(0), iBuilder->getInt32(OUTPUT_STREAM_SET), getStreamOffset(instance, streamOffset), index});
     214    } else {
     215        Value * const outputStreamSet = iBuilder->CreateGEP(instance, {iBuilder->getInt32(0), iBuilder->getInt32(OUTPUT_STREAM_SET)});
     216        if (LLVM_LIKELY(outputStreamSet->getType()->getPointerElementType()->isArrayTy())) {
     217            return iBuilder->CreateGEP(outputStreamSet, {getStreamOffset(instance, streamOffset), index});
     218        }
     219    }
     220    throw std::runtime_error("Cannot access the output stream with a non-constant value unless all output stream types are identical!");
    209221}
    210222
     
    221233
    222234/** ------------------------------------------------------------------------------------------------------------- *
     235 * @brief packDataTypes
     236 ** ------------------------------------------------------------------------------------------------------------- */
     237llvm::Type * KernelBuilder::packDataTypes(const std::vector<llvm::Type *> & types) {
     238    bool canPackIntoArray = !types.empty();
     239    for (Type * type : types) {
     240        if (type != types.front()) { // use canLosslesslyBitcastInto ?
     241            canPackIntoArray = false;
     242            break;
     243        }
     244    }
     245    if (canPackIntoArray) {
     246        return ArrayType::get(types.front(), types.size());
     247    } else {
     248        return StructType::get(mMod->getContext(), types);
     249    }
     250}
     251
     252/** ------------------------------------------------------------------------------------------------------------- *
    223253 * @brief prepareFunction
    224254 ** ------------------------------------------------------------------------------------------------------------- */
     
    226256
    227257    PointerType * modFunctionType = PointerType::get(FunctionType::get(iBuilder->getInt64Ty(), {iBuilder->getInt64Ty()}, false), 0);
    228     mInputStreamType = PointerType::get(StructType::get(mMod->getContext(), mInputStream), 0);
    229     mInputScalarType = PointerType::get(StructType::get(mMod->getContext(), mInputScalar), 0);
    230     mOutputStreamType = StructType::get(mMod->getContext(), mOutputStream);
    231     Type * outputScalarType = StructType::get(mMod->getContext(), mOutputScalar);
    232     Type * internalStateType = StructType::create(mMod->getContext(), mInternalState);
     258    mInputStreamType = PointerType::get(packDataTypes(mInputStream), 0);
     259    mInputScalarType = PointerType::get(packDataTypes(mInputScalar), 0);
     260    mOutputStreamType = packDataTypes(mOutputStream);
     261    Type * outputScalarType = packDataTypes(mOutputScalar);
     262    Type * internalStateType = packDataTypes(mInternalState);
    233263    Type * inputStateType = StructType::create(mMod->getContext(), { mInputStreamType, modFunctionType});
    234 
    235264    Type * outputBufferType = ArrayType::get(mOutputStreamType, mBufferSize);
    236265    mKernelStateType = StructType::create(mMod->getContext(), {internalStateType, inputStateType, outputBufferType, outputScalarType}, mKernelName);
     
    286315    for (unsigned i = 0; i < mInternalState.size(); ++i) {
    287316        Type * const type = mInternalState[i];
    288         if (type->isIntegerTy() || type->isArrayTy() || type->isVectorTy()) {
     317        if (type->isSized()) {
    289318            setInternalState(i, Constant::getNullValue(type));
    290319        } else {
    291320            Value * const ptr = getInternalState(i);
    292321            Value * const size = iBuilder->CreatePtrDiff(iBuilder->CreateGEP(ptr, iBuilder->getInt32(1)), ptr);
     322            iBuilder->CallPrintInt(mKernelName + "_zeroinit_" + std::to_string(i), size);
    293323            iBuilder->CreateMemSet(ptr, iBuilder->getInt8(0), size, 4);
    294324        }
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r4992 r4995  
    4949        inline disable_implicit_conversion(std::nullptr_t) = delete;
    5050        inline disable_implicit_conversion(unsigned) = delete;
    51         operator T() { return _value; }
    52         T operator -> () { return _value; }
     51        operator T() const { return _value; }
     52        T operator-> () const { return _value; }
     53        T get() const { return _value; }
    5354    private:
    5455        T const  _value;
     
    162163
    163164protected:
     165
     166    llvm::Type * packDataTypes(const std::vector<llvm::Type *> & types);
    164167
    165168    llvm::Value * getInputStream(llvm::Value * const instance, const unsigned index, const unsigned streamOffset);
  • icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.cpp

    r4988 r4995  
    1616            TypeBuilder<long(int, char *, long), false>::get(mod->getContext());
    1717            write = cast<Function>(mod->getOrInsertFunction("write", write_type,
    18                                                              AttributeSet().addAttribute(mod->getContext(), 1U, Attribute::NoAlias)));
     18                                                             AttributeSet().addAttribute(mod->getContext(), 2U, Attribute::NoAlias)));
    1919        }
    2020        return write;
  • icGREP/icgrep-devel/icgrep/kernels/symboltablepipeline.cpp

    r4992 r4995  
    1212#include <kernels/s2p_kernel.h>
    1313#include <kernels/instance.h>
     14#include <kernels/stdout_kernel.h>
    1415
    1516#include <pablo/function.h>
     
    5253    cc::CC_Compiler ccCompiler(*function, enc);
    5354    re::RE_Compiler reCompiler(*function, ccCompiler);
    54     RE * cc = makeName(makeCC(makeCC(65, 90), makeCC(97, 122)));
     55    RE * cc = makeName(makeCC(makeCC(makeCC('a', 'z'), makeCC('A', 'Z')), makeCC('0', '9')));
    5556    reCompiler.compileUnicodeNames(cc);
    5657    PabloAST * const matches = reCompiler.compile(cc).stream;
     
    6061    PabloAST * const ends = entry->createAnd(adv, entry->createNot(matches));
    6162
    62     function->setResult(0, entry->createAssign("S", starts));
    63     function->setResult(1, entry->createAssign("E", ends));
     63    function->setResult(0, entry->createAssign("l.S", starts));
     64    function->setResult(1, entry->createAssign("l.E", ends));
    6465
    6566    PabloAST * M = ends;
     
    7576        }
    7677        M = entry->createOr(entry->createAdvance(M, span), M);
    77         function->setResult(i + 2, entry->createAssign("M" + std::to_string(i), M));
     78        function->setResult(i + 2, entry->createAssign("l.M" + std::to_string(i), M));
    7879        ++i;
    7980        step += span;
     
    8788 ** ------------------------------------------------------------------------------------------------------------- */
    8889PabloFunction * SymbolTableBuilder::generateSortingFunction(const PabloFunction * const leading, const std::vector<unsigned> & endpoints) {
    89     PabloFunction * const function = PabloFunction::Create("sorting", leading->getNumOfResults(), leading->getNumOfResults() * 2);
     90    PabloFunction * const function = PabloFunction::Create("sorting", leading->getNumOfResults(), (leading->getNumOfResults() - 1) * 2);
    9091    PabloBlock * entry = function->getEntryBlock();
    91     function->setParameter(0, entry->createVar("S"));
    92     function->setParameter(1, entry->createVar("E"));
     92    function->setParameter(0, entry->createVar("l.S"));
     93    function->setParameter(1, entry->createVar("l.E"));
    9394    for (unsigned i = 2; i < leading->getNumOfResults(); ++i) {
    94         function->setParameter(i, entry->createVar("M" + std::to_string(i - 2)));
     95        function->setParameter(i, entry->createVar("l.M" + std::to_string(i - 2)));
    9596    }
    9697    PabloAST * R = function->getParameter(0);
    9798    PabloAST * const E = entry->createNot(function->getParameter(1));
    98     unsigned i = 1;
     99    unsigned i = 0;
    99100    unsigned lowerbound = 0;
    100101    for (unsigned endpoint : endpoints) {
    101         PabloAST * const M = function->getParameter(i + 1);
     102        PabloAST * const M = function->getParameter(i + 2);
    102103        PabloAST * const L = entry->createLookahead(M, endpoint, "lookahead" + std::to_string(endpoint));
    103104        PabloAST * S = entry->createAnd(L, R);
    104         Assign * Si = entry->createAssign("S_" + std::to_string(i), S);
    105         PabloAST * F = entry->createScanThru(R, E);
    106         Assign * Ei = entry->createAssign("E_" + std::to_string(i), F);
     105        Assign * Si = entry->createAssign("s.S_" + std::to_string(i + 1), S);
     106        PabloAST * F = entry->createScanThru(S, E);
     107        Assign * Ei = entry->createAssign("s.E_" + std::to_string(i + 1), F);
    107108        function->setResult(i * 2, Si);
    108109        function->setResult(i * 2 + 1, Ei);
     
    111112        lowerbound = endpoint;
    112113    }
    113     Assign * Si = entry->createAssign("S_n", R);
     114    Assign * Si = entry->createAssign("s.S_n", R);
    114115    PabloAST * F = entry->createScanThru(R, E);
    115     Assign * Ei = entry->createAssign("E_n", F);
     116    Assign * Ei = entry->createAssign("s.E_n", F);
    116117    function->setResult(i * 2, Si);
    117118    function->setResult(i * 2 + 1, Ei);
     
    130131
    131132/** ------------------------------------------------------------------------------------------------------------- *
    132  * @brief generateGather
    133  ** ------------------------------------------------------------------------------------------------------------- */
    134 inline Value * SymbolTableBuilder::generateGather(Value * const base, Value * const vindex) {
    135 
    136     /*
    137         From Intel:
    138 
    139         extern __m256i _mm256_i32gather_epi32(int const * base, __m256i vindex, const int scale)
    140 
    141         From Clang avx2intrin.h:
    142 
    143         #define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
    144           (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
    145                                                (int const *)(m),  \
    146                                                (__v8si)(__m256i)(i), \
    147                                                (__v8si)_mm256_set1_epi32(-1), (s)); })
    148 
    149         From llvm IntrinsicsX86.td:
    150 
    151         def llvm_ptr_ty        : LLVMPointerType<llvm_i8_ty>;             // i8*
    152 
    153         def int_x86_avx2_gather_d_d_256 : GCCBuiltin<"__builtin_ia32_gatherd_d256">,
    154            Intrinsic<[llvm_v8i32_ty],
    155            [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
    156            [IntrReadArgMem]>;
    157 
    158      */
    159 
    160     VectorType * const vecType = VectorType::get(iBuilder->getInt32Ty(), 8);
    161     Function * const vgather = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_avx2_gather_d_d_256);
    162     Constant * const ones = Constant::getAllOnesValue(vecType);
    163     return iBuilder->CreateCall(vgather, {ones, base, iBuilder->CreateBitCast(vindex, vecType), ones, iBuilder->getInt8(1)});
    164 }
    165 
    166 /** ------------------------------------------------------------------------------------------------------------- *
    167133 * @brief generateMaskedGather
    168134 ** ------------------------------------------------------------------------------------------------------------- */
     
    205171}
    206172
    207 ///** ------------------------------------------------------------------------------------------------------------- *
    208 // * @brief generateScanMatch
    209 // ** ------------------------------------------------------------------------------------------------------------- */
    210 //void SymbolTableBuilder::generateHashingKernel(KernelBuilder * kBuilder, const unsigned minKeyLength, const unsigned maxKeyLength, const unsigned scanWordBitWidth) {
    211 
    212 //    const unsigned minKeyBlockCount = (minKeyLength / 4);
    213 //    const unsigned maxKeyBlockCount = ((maxKeyLength + 3) / 4);
    214 
    215 //    Type * const intScanWordTy = iBuilder->getIntNTy(scanWordBitWidth);
    216 //    const unsigned fieldCount = iBuilder->getBitBlockWidth() / scanWordBitWidth;
    217 //    Type * const scanWordVectorType = VectorType::get(intScanWordTy, fieldCount);
    218 //    const unsigned vectorWidth = iBuilder->getBitBlockWidth() / 32;
    219 //    const unsigned gatherCount = vectorWidth * 4;
    220 //    Type * const gatherVectorType =  VectorType::get(iBuilder->getInt32Ty(), vectorWidth);
    221 
    222 //    const unsigned baseIdx = kBuilder->addInternalState(iBuilder->getInt8PtrTy(), "Base");
    223 //    const unsigned startIndexIdx = kBuilder->addInternalState(iBuilder->getInt32Ty(), "StartIndex");
    224 //    const unsigned startArrayIdx = kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), iBuilder->getBitBlockWidth() + gatherCount), "StartArray");
    225 //    const unsigned endIndexIdx = kBuilder->addInternalState(iBuilder->getInt32Ty(), "EndIndex");
    226 //    const unsigned endArrayIdx = kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), gatherCount), "EndArray");
    227 
    228 //    kBuilder->addInputStream(1, "startStream");
    229 //    kBuilder->addInputStream(1, "endStream");
    230 
    231 //    Function * function = kBuilder->prepareFunction();
    232 
    233 //    BasicBlock * const entry = iBuilder->GetInsertBlock();
    234 
    235 //    BasicBlock * startOuterCond = BasicBlock::Create(mMod->getContext(), "startOuterCond", function, 0);
    236 //    BasicBlock * startOuterBody = BasicBlock::Create(mMod->getContext(), "startOuterBody", function, 0);
    237 //    BasicBlock * startInnerCond = BasicBlock::Create(mMod->getContext(), "startInnerCond", function, 0);
    238 //    BasicBlock * startInnerBody = BasicBlock::Create(mMod->getContext(), "startInnerBody", function, 0);
    239 
    240 //    BasicBlock * endOuterCond = BasicBlock::Create(mMod->getContext(), "endOuterCond", function, 0);
    241 //    BasicBlock * endOuterBody = BasicBlock::Create(mMod->getContext(), "endOuterBody", function, 0);
    242 //    BasicBlock * endInnerCond = BasicBlock::Create(mMod->getContext(), "endInnerCond", function, 0);
    243 //    BasicBlock * endInnerBody = BasicBlock::Create(mMod->getContext(), "endInnerBody", function, 0);
    244 
    245 //    BasicBlock * gatherInit = BasicBlock::Create(mMod->getContext(), "gather", function, 0);
    246 
    247 //    BasicBlock * exit = BasicBlock::Create(mMod->getContext(), "exit", function, 0);
    248 
    249 //    //TODO: this won't work on files > 2^32 bytes yet; needs an intermediate flush then a recalculation of the base pointer.
    250 //    Value * const base = iBuilder->CreateLoad(kBuilder->getInternalState(baseIdx), "base");
    251 //    Value * blockPos = iBuilder->CreateLoad(kBuilder->getBlockNo());
    252 //    blockPos = iBuilder->CreateMul(blockPos, iBuilder->getInt64(iBuilder->getBitBlockWidth()));
    253 
    254 //    // if two positions cannot be in the same vector element, we could possibly do some work in parallel here.
    255 //    Value * startIndex = iBuilder->CreateLoad(kBuilder->getInternalState(startIndexIdx), "startIndex");
    256 //    Value * startArray = kBuilder->getInternalState(startArrayIdx);
    257 //    Value * startStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(0)), scanWordVectorType, "startStream");
    258 
    259 //    Value * endIndex = iBuilder->CreateLoad(kBuilder->getInternalState(endIndexIdx), "endIndex");
    260 //    Value * endArray = kBuilder->getInternalState(endArrayIdx);
    261 //    Value * endStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(1)), scanWordVectorType, "endStream");
    262 
    263 //    iBuilder->CreateBr(startOuterCond);
    264 
    265 //    // START OUTER COND
    266 //    iBuilder->SetInsertPoint(startOuterCond);
    267 //    PHINode * outerStartIndexPhi = iBuilder->CreatePHI(startIndex->getType(), 2);
    268 //    outerStartIndexPhi->addIncoming(startIndex, entry);
    269 //    PHINode * startIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
    270 //    startIV->addIncoming(iBuilder->getInt64(0), entry);
    271 //    Value * startOuterTest = iBuilder->CreateICmpNE(startIV, iBuilder->getInt64(fieldCount));
    272 //    iBuilder->CreateCondBr(startOuterTest, startOuterBody, endOuterCond);
    273 
    274 //    // START OUTER BODY
    275 //    iBuilder->SetInsertPoint(startOuterBody);
    276 //    Value * startField = iBuilder->CreateExtractElement(startStream, startIV);
    277 //    startIV->addIncoming(iBuilder->CreateAdd(startIV, iBuilder->getInt64(1)), startInnerCond);
    278 //    iBuilder->CreateBr(startInnerCond);
    279 
    280 //    // START INNER COND
    281 //    iBuilder->SetInsertPoint(startInnerCond);
    282 //    PHINode * innerStartIndexPhi = iBuilder->CreatePHI(startIndex->getType(), 2);
    283 //    innerStartIndexPhi->addIncoming(outerStartIndexPhi, startOuterBody);
    284 //    outerStartIndexPhi->addIncoming(innerStartIndexPhi, startInnerCond);
    285 //    PHINode * startFieldPhi = iBuilder->CreatePHI(intScanWordTy, 2);
    286 //    startFieldPhi->addIncoming(startField, startOuterBody);
    287 //    Value * test = iBuilder->CreateICmpNE(startFieldPhi, ConstantInt::getNullValue(intScanWordTy));
    288 //    iBuilder->CreateCondBr(test, startInnerBody, startOuterCond);
    289 
    290 //    // START INNER BODY
    291 //    iBuilder->SetInsertPoint(startInnerBody);
    292 //    Value * startPos = generateCountForwardZeroes(iBuilder, startFieldPhi);
    293 //    startFieldPhi->addIncoming(generateResetLowestBit(iBuilder, startFieldPhi), startInnerBody);
    294 //    startPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(startPos, blockPos), iBuilder->getInt32Ty());
    295 //    iBuilder->CreateStore(startPos, iBuilder->CreateGEP(startArray, {iBuilder->getInt32(0), innerStartIndexPhi}));
    296 //    innerStartIndexPhi->addIncoming(iBuilder->CreateAdd(innerStartIndexPhi, ConstantInt::get(innerStartIndexPhi->getType(), 1)), startInnerBody);
    297 //    iBuilder->CreateBr(startInnerCond);
    298 
    299 //    // END POINT OUTER COND
    300 //    iBuilder->SetInsertPoint(endOuterCond);
    301 //    PHINode * outerStartIndexPhi2 = iBuilder->CreatePHI(startIndex->getType(), 2);
    302 //    outerStartIndexPhi2->addIncoming(outerStartIndexPhi, startOuterCond);
    303 //    PHINode * endIV = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
    304 //    endIV->addIncoming(iBuilder->getInt64(0), startOuterCond);
    305 //    Value * endOuterTest = iBuilder->CreateICmpNE(endIV, iBuilder->getInt64(fieldCount));
    306 //    iBuilder->CreateCondBr(endOuterTest, endOuterBody, exit);
    307 
    308 //    // END POINT OUTER BODY
    309 //    iBuilder->SetInsertPoint(endOuterBody);
    310 //    Value * endField = iBuilder->CreateExtractElement(endStream, endIV);
    311 //    endIV->addIncoming(iBuilder->CreateAdd(endIV, iBuilder->getInt64(1)), endInnerCond);
    312 //    iBuilder->CreateBr(endInnerCond);
    313 
    314 //    // END POINT INNER COND
    315 //    iBuilder->SetInsertPoint(endInnerCond);
    316 //    innerStartIndexPhi = iBuilder->CreatePHI(startIndex->getType(), 3);
    317 //    innerStartIndexPhi->addIncoming(outerStartIndexPhi2, endOuterBody);
    318 //    innerStartIndexPhi->addIncoming(innerStartIndexPhi, endInnerBody);
    319 //    outerStartIndexPhi2->addIncoming(innerStartIndexPhi, endInnerCond);
    320 //    PHINode * endIndexPhi = iBuilder->CreatePHI(endIndex->getType(), 3);
    321 //    endIndexPhi->addIncoming(endIndex, endOuterBody);
    322 //    endIndexPhi->addIncoming(ConstantInt::getNullValue(endIndex->getType()), gatherInit);
    323 //    PHINode * endFieldPhi = iBuilder->CreatePHI(intScanWordTy, 3);
    324 //    endFieldPhi->addIncoming(endField, endOuterBody);
    325 //    Value * endInnerTest = iBuilder->CreateICmpNE(endFieldPhi, ConstantInt::getNullValue(intScanWordTy));
    326 //    iBuilder->CreateCondBr(endInnerTest, endInnerBody, endOuterCond);
    327 
    328 //    // END POINT INNER BODY
    329 //    iBuilder->SetInsertPoint(endInnerBody);
    330 //    Value * endPos = generateCountForwardZeroes(iBuilder, endFieldPhi);
    331 //    Value * updatedEndFieldPhi = generateResetLowestBit(iBuilder, endFieldPhi);
    332 //    endFieldPhi->addIncoming(updatedEndFieldPhi, endInnerBody);
    333 //    endFieldPhi->addIncoming(updatedEndFieldPhi, gatherInit);
    334 //    endPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(endPos, blockPos), iBuilder->getInt32Ty());
    335 //    iBuilder->CreateStore(endPos, iBuilder->CreateGEP(endArray, {iBuilder->getInt32(0), endIndexPhi}));
    336 //    Value * updatedEndIndexPhi = iBuilder->CreateAdd(endIndexPhi, ConstantInt::get(endIndexPhi->getType(), 1));
    337 //    endIndexPhi->addIncoming(updatedEndIndexPhi, endInnerBody);
    338 //    Value * filledEndPosBufferTest = iBuilder->CreateICmpEQ(updatedEndIndexPhi, ConstantInt::get(updatedEndIndexPhi->getType(), gatherCount));
    339 //    iBuilder->CreateCondBr(filledEndPosBufferTest, gatherInit, endInnerCond);
    340 
    341 //    // GATHER INIT
    342 //    iBuilder->SetInsertPoint(gatherInit);
    343 //    Value * startArrayPtr = iBuilder->CreatePointerCast(startArray, PointerType::get(gatherVectorType, 0));
    344 //    Value * endArrayPtr = iBuilder->CreatePointerCast(endArray, PointerType::get(gatherVectorType, 0));
    345 //    CallGatherFunction(base, startArrayPtr, endArrayPtr, iBuilder->getInt32(32), minKeyBlockCount, maxKeyBlockCount);
    346 //    // ... call hashing function ...
    347 //    Value * untouchedArrayPtr = iBuilder->CreatePointerCast(iBuilder->CreateGEP(startArray, iBuilder->getInt32(vectorWidth)), PointerType::get(gatherVectorType, 0));
    348 //    Value * untouchedCount = iBuilder->CreateSub(innerStartIndexPhi, ConstantInt::get(innerStartIndexPhi->getType(), gatherCount));
    349 //    iBuilder->CreateMemCpy(startArrayPtr, untouchedArrayPtr, untouchedCount, 4);
    350 //    innerStartIndexPhi->addIncoming(untouchedCount, gatherInit);
    351 //    iBuilder->CreateBr(endInnerCond);
    352 
    353 
    354 //    iBuilder->SetInsertPoint(exit);
    355 
    356 
    357 
    358 //    // need to save the start/end index still
    359 //    kBuilder->finalize();
    360 
    361 //    function->dump();
    362 //}
    363 
    364173/** ------------------------------------------------------------------------------------------------------------- *
    365174 * @brief generateGatherKernel
     
    372181    const unsigned vectorWidth = iBuilder->getBitBlockWidth() / 32;
    373182    const unsigned gatherCount = vectorWidth * 4;
    374     Type * const gatherVectorType =  VectorType::get(iBuilder->getInt32Ty(), vectorWidth);
    375183    Type * const transposedVectorType = VectorType::get(iBuilder->getInt8Ty(), iBuilder->getBitBlockWidth() / 8);
    376184
    377185    unsigned minKeyLength = 0;
    378186
     187    Type * startArrayType = ArrayType::get(iBuilder->getInt32Ty(), iBuilder->getBitBlockWidth() + gatherCount);
     188    Type * endArrayType = ArrayType::get(iBuilder->getInt32Ty(), gatherCount);
     189    Type * groupType = StructType::get(iBuilder->getInt32Ty(), startArrayType, iBuilder->getInt32Ty(), endArrayType, nullptr);
     190    const unsigned baseIdx = kBuilder->addInternalState(iBuilder->getInt8PtrTy(), "Base");
     191    const unsigned positionArrayIdx = kBuilder->addInternalState(ArrayType::get(groupType, endpoints.size()), "Positions");
     192
    379193    for (unsigned maxKeyLength : endpoints) {
    380 
    381         kBuilder->addInternalState(iBuilder->getInt32Ty(), "StartIndex" + std::to_string(maxKeyLength));
    382         kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), iBuilder->getBitBlockWidth() + gatherCount), "StartArray" + std::to_string(maxKeyLength));
    383         kBuilder->addInternalState(iBuilder->getInt32Ty(), "EndIndex" + std::to_string(maxKeyLength));
    384         kBuilder->addInternalState(ArrayType::get(iBuilder->getInt32Ty(), gatherCount), "EndArray" + std::to_string(maxKeyLength));
    385 
    386194        kBuilder->addInputStream(1, "startStream" + std::to_string(maxKeyLength));
    387195        kBuilder->addInputStream(1, "endStream" + std::to_string(maxKeyLength));
    388 
    389         kBuilder->addOutputStream(maxKeyLength);
     196        kBuilder->addOutputStream(((maxKeyLength + 3) / 4) * 4);
    390197    }
    391 
    392     const unsigned baseIdx = kBuilder->addInternalState(iBuilder->getInt8PtrTy(), "Base");
    393 
    394     Function * function = kBuilder->prepareFunction();
     198    kBuilder->addInputStream(1, "startStreamN");
     199    kBuilder->addInputStream(1, "endStreamN");
     200
     201    Function * const function = kBuilder->prepareFunction();
    395202
    396203    BasicBlock * const entry = iBuilder->GetInsertBlock();
     
    415222    BasicBlock * exit = BasicBlock::Create(mMod->getContext(), "exit", function, 0);
    416223
    417     //TODO: this won't work on files > 2^32 bytes yet; needs an intermediate flush then a recalculation of the base pointer.
    418     Value * const base = iBuilder->CreateLoad(kBuilder->getInternalState(baseIdx), "base");
    419     Value * blockPos = iBuilder->CreateLoad(kBuilder->getBlockNo());
    420     blockPos = iBuilder->CreateMul(blockPos, iBuilder->getInt64(iBuilder->getBitBlockWidth()));
    421 
    422     FunctionType * const functionType = FunctionType::get(PointerType::get(transposedVectorType, 0), {iBuilder->getInt8PtrTy(), PointerType::get(gatherVectorType, 0), PointerType::get(gatherVectorType, 0), iBuilder->getInt32Ty(), PointerType::get(transposedVectorType, 0)}, false);
    423     Value * gatherFunctionPtrArray = iBuilder->CreateAlloca(PointerType::get(functionType, 0), iBuilder->getInt32(endpoints.size()));
     224    Type * const int32PtrTy = PointerType::get(iBuilder->getInt32Ty(), 0);
     225    FunctionType * const functionType = FunctionType::get(iBuilder->getVoidTy(), {iBuilder->getInt8PtrTy(), int32PtrTy, int32PtrTy, iBuilder->getInt32Ty(), int32PtrTy}, false);
     226    Value * const gatherFunctionPtrArray = iBuilder->CreateAlloca(PointerType::get(functionType, 0), iBuilder->getInt32(endpoints.size()));
    424227    unsigned i = 0;
    425228    minKeyLength = 0;
     
    431234        minKeyLength = maxKeyLength;
    432235    }
     236
     237    //TODO: this won't work on files > 2^32 bytes yet; needs an intermediate flush then a recalculation of the base pointer.
     238    Value * const base = iBuilder->CreateLoad(kBuilder->getInternalState(baseIdx), "base");
     239    Value * const positionArray = kBuilder->getInternalState(positionArrayIdx);
     240
     241    Value * blockPos = iBuilder->CreateLoad(kBuilder->getBlockNo());
     242    blockPos = iBuilder->CreateMul(blockPos, iBuilder->getInt64(iBuilder->getBitBlockWidth()));
     243
    433244    iBuilder->CreateBr(groupCond);
    434245
     
    443254    iBuilder->SetInsertPoint(groupBody);
    444255    // if two positions cannot be in the same vector element, we could possibly do some work in parallel here.
    445     Value * input0 = iBuilder->CreateMul(groupIV, iBuilder->getInt32(2));
    446     Value * startStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(input0)), scanWordVectorType, "startStream");
    447     Value * input1 = iBuilder->CreateAdd(input0, iBuilder->getInt32(1));
    448     Value * endStream = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(input1)), scanWordVectorType, "endStream");
    449 
    450     Value * internal0 = iBuilder->CreateMul(groupIV, iBuilder->getInt32(4));
    451     Value * startIndex = iBuilder->CreateLoad(kBuilder->getInternalState(internal0), "startIndex");
    452     Value * internal1 = iBuilder->CreateAdd(internal0, iBuilder->getInt32(1));
    453     Value * startArray = kBuilder->getInternalState(internal1);
    454     Value * internal2 = iBuilder->CreateAdd(internal1, iBuilder->getInt32(1));
    455     Value * endIndex = iBuilder->CreateLoad(kBuilder->getInternalState(internal2), "endIndex");
    456     Value * internal3 = iBuilder->CreateAdd(internal2, iBuilder->getInt32(1));
    457     Value * endArray = kBuilder->getInternalState(internal3);
     256
     257    iBuilder->CallPrintInt(" ---- groupIV ---- ", groupIV);
     258
     259    Value * index = iBuilder->CreateMul(groupIV, iBuilder->getInt32(2));
     260    Value * startStreamPtr = kBuilder->getInputStream(index);
     261    Value * startStream = iBuilder->CreateBlockAlignedLoad(startStreamPtr);
     262    iBuilder->CallPrintRegister("startStream", startStream);
     263    startStream = iBuilder->CreateBitCast(startStream, scanWordVectorType, "startStream");
     264
     265    index = iBuilder->CreateAdd(index, iBuilder->getInt32(1));
     266    Value * endStreamPtr = kBuilder->getInputStream(index);
     267    Value * endStream = iBuilder->CreateBlockAlignedLoad(endStreamPtr);
     268    iBuilder->CallPrintRegister("endStream", endStream);
     269    endStream = iBuilder->CreateBitCast(endStream, scanWordVectorType, "endStream");
     270
     271    Value * startIndexPtr = iBuilder->CreateGEP(positionArray, {iBuilder->getInt32(0), groupIV, iBuilder->getInt32(0)}, "startIndexPtr");
     272    Value * startIndex = iBuilder->CreateLoad(startIndexPtr, "startIndex");
     273    Value * startArray = iBuilder->CreateGEP(positionArray, {iBuilder->getInt32(0), groupIV, iBuilder->getInt32(1)}, "startArray");
     274    Value * endIndexPtr = iBuilder->CreateGEP(positionArray, {iBuilder->getInt32(0), groupIV, iBuilder->getInt32(2)}, "endIndexPtr");
     275    Value * endIndex = iBuilder->CreateLoad(endIndexPtr, "endIndex");
     276    Value * endArray = iBuilder->CreateGEP(positionArray, {iBuilder->getInt32(0), groupIV, iBuilder->getInt32(3)}, "endArray");
    458277
    459278    Value * const buffer = kBuilder->getOutputStream(groupIV);
     
    463282    // START OUTER COND
    464283    iBuilder->SetInsertPoint(startOuterCond);
     284    PHINode * startBlockOffset = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
     285    startBlockOffset->addIncoming(blockPos, groupBody);
    465286    PHINode * startIndexPhi1 = iBuilder->CreatePHI(startIndex->getType(), 2);
    466287    startIndexPhi1->addIncoming(startIndex, groupBody);
     
    474295    Value * startField = iBuilder->CreateExtractElement(startStream, startIV);
    475296    startIV->addIncoming(iBuilder->CreateAdd(startIV, iBuilder->getInt64(1)), startInnerCond);
     297    startBlockOffset->addIncoming(iBuilder->CreateAdd(startBlockOffset, iBuilder->getInt64(scanWordBitWidth)), startInnerCond);
    476298    iBuilder->CreateBr(startInnerCond);
    477299
     
    490312    Value * startPos = generateCountForwardZeroes(iBuilder, startFieldPhi);
    491313    startFieldPhi->addIncoming(generateResetLowestBit(iBuilder, startFieldPhi), startInnerBody);
    492     startPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(startPos, blockPos), iBuilder->getInt32Ty());
    493     iBuilder->CreateStore(startPos, iBuilder->CreateGEP(startArray, {iBuilder->getInt32(0), startIndexPhi3}));
     314    startPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(startPos, startBlockOffset), iBuilder->getInt32Ty());
     315    Value * startAddr = iBuilder->CreateGEP(startArray, {iBuilder->getInt32(0), startIndexPhi3});
     316    iBuilder->CallPrintInt("> startIndex ", startIndexPhi3);
     317    iBuilder->CallPrintInt("> startPos ", startPos);
     318    iBuilder->CreateStore(startPos, startAddr);
    494319    startIndexPhi3->addIncoming(iBuilder->CreateAdd(startIndexPhi3, ConstantInt::get(startIndexPhi3->getType(), 1)), startInnerBody);
    495320    iBuilder->CreateBr(startInnerCond);
     
    497322    // END POINT OUTER COND
    498323    iBuilder->SetInsertPoint(endOuterCond);
     324    PHINode * endBlockOffset = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
     325    endBlockOffset->addIncoming(blockPos, startOuterCond);
    499326    PHINode * endIndexPhi1 = iBuilder->CreatePHI(endIndex->getType(), 2);
    500327    endIndexPhi1->addIncoming(endIndex, startOuterCond);
     
    510337    Value * endField = iBuilder->CreateExtractElement(endStream, endIV);
    511338    endIV->addIncoming(iBuilder->CreateAdd(endIV, iBuilder->getInt64(1)), endInnerCond);
     339    endBlockOffset->addIncoming(iBuilder->CreateAdd(endBlockOffset, iBuilder->getInt64(scanWordBitWidth)), endInnerCond);
    512340    iBuilder->CreateBr(endInnerCond);
    513341
     
    533361    endFieldPhi->addIncoming(updatedEndFieldPhi, endInnerBody);
    534362    endFieldPhi->addIncoming(updatedEndFieldPhi, gather);
    535     endPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(endPos, blockPos), iBuilder->getInt32Ty());
    536     iBuilder->CreateStore(endPos, iBuilder->CreateGEP(endArray, {iBuilder->getInt32(0), endIndexPhi2}));
     363    endPos = iBuilder->CreateTruncOrBitCast(iBuilder->CreateOr(endPos, endBlockOffset), iBuilder->getInt32Ty());
     364    Value * endAddr = iBuilder->CreateGEP(endArray, {iBuilder->getInt32(0), endIndexPhi2});
     365    iBuilder->CallPrintInt("> endIndex ", endIndexPhi2);
     366    iBuilder->CallPrintInt("> endPos ", endPos);
     367    iBuilder->CreateStore(endPos, endAddr);
    537368    Value * updatedEndIndexPhi = iBuilder->CreateAdd(endIndexPhi2, ConstantInt::get(endIndexPhi2->getType(), 1));
    538369    endIndexPhi2->addIncoming(updatedEndIndexPhi, endInnerBody);
     
    542373    // GATHER
    543374    iBuilder->SetInsertPoint(gather);
    544     Value * startArrayPtr = iBuilder->CreatePointerCast(startArray, PointerType::get(gatherVectorType, 0));
    545     Value * endArrayPtr = iBuilder->CreatePointerCast(endArray, PointerType::get(gatherVectorType, 0));
    546     Value * const bufferPtr = iBuilder->CreatePointerCast(buffer, PointerType::get(transposedVectorType, 0));
     375
     376    iBuilder->CallPrintInt(" **** gathering **** ", groupIV);
     377
     378    Value * startArrayPtr = iBuilder->CreatePointerCast(startArray, PointerType::get(iBuilder->getInt32Ty(), 0));
     379    Value * endArrayPtr = iBuilder->CreatePointerCast(endArray, PointerType::get(iBuilder->getInt32Ty(), 0));
     380    Value * const bufferPtr = iBuilder->CreatePointerCast(buffer, PointerType::get(iBuilder->getInt32Ty(), 0));
    547381    Value * gatherFunctionPtr = iBuilder->CreateLoad(iBuilder->CreateGEP(gatherFunctionPtrArray, groupIV));
     382
    548383    iBuilder->CreateCall5(gatherFunctionPtr, base, startArrayPtr, endArrayPtr, iBuilder->getInt32(32), bufferPtr);
    549384
    550385    // ... call hashing function ...
    551     Value * untouchedArrayPtr = iBuilder->CreatePointerCast(iBuilder->CreateGEP(startArray, iBuilder->getInt32(vectorWidth)), PointerType::get(gatherVectorType, 0));
    552     Value * untouchedCount = iBuilder->CreateSub(startIndexPhi3, ConstantInt::get(startIndexPhi3->getType(), gatherCount));
    553     iBuilder->CreateMemCpy(startArrayPtr, untouchedArrayPtr, untouchedCount, 4);
    554     startIndexPhi3->addIncoming(untouchedCount, gather);
     386    Value * remainingArrayPtr = iBuilder->CreateGEP(startArrayPtr, iBuilder->getInt32(gatherCount));
     387    Value * remainingCount = iBuilder->CreateSub(startIndexPhi3, iBuilder->getInt32(gatherCount));
     388    iBuilder->CreateMemCpy(startArrayPtr, remainingArrayPtr, remainingCount, 4);
     389    startIndexPhi3->addIncoming(remainingCount, gather);
    555390    iBuilder->CreateBr(endInnerCond);
    556391
    557392    // NEXT GROUP
    558393    iBuilder->SetInsertPoint(nextGroup);
    559     kBuilder->setInternalState(internal0, startIndexPhi2);
    560     kBuilder->setInternalState(internal2, endIndexPhi1);
     394    iBuilder->CreateStore(startIndexPhi2, startIndexPtr);
     395    iBuilder->CreateStore(endIndexPhi1, endIndexPtr);
    561396    groupIV->addIncoming(iBuilder->CreateAdd(groupIV, ConstantInt::get(groupIV->getType(), 1)), nextGroup);
    562397    iBuilder->CreateBr(groupCond);
     
    583418        Type * const gatherVectorArrayType = ArrayType::get(gatherVectorType, maxCount);
    584419
    585         FunctionType * const functionType = FunctionType::get(PointerType::get(resultType, 0), {iBuilder->getInt8PtrTy(), PointerType::get(gatherVectorType, 0), PointerType::get(gatherVectorType, 0), iBuilder->getInt32Ty(), PointerType::get(resultType, 0)}, false);
     420        Type * const int32PtrTy = PointerType::get(iBuilder->getInt32Ty(), 0);
     421        FunctionType * const functionType = FunctionType::get(iBuilder->getVoidTy(), {iBuilder->getInt8PtrTy(), int32PtrTy, int32PtrTy, iBuilder->getInt32Ty(), int32PtrTy}, false);
    586422        function = Function::Create(functionType, GlobalValue::ExternalLinkage, functionName, mMod);
    587423        function->setCallingConv(CallingConv::C);
     
    594430        Value * const base = args++;
    595431        base->setName("base");
    596         Value * const startArray = args++;
     432        Value * startArray = args++;
    597433        startArray->setName("startArray");
    598         Value * const endArray = args++;
     434        Value * endArray = args++;
    599435        endArray->setName("endArray");
    600         Value * const count = args++;
    601         count->setName("count");
    602         Value * const transposedBuffer = args++;
    603         transposedBuffer->setName("buffer");
     436        Value * const numOfKeys = args++;
     437        numOfKeys->setName("numOfKeys");
     438        Value * buffer = args++;
     439        buffer->setName("buffer");
    604440
    605441        BasicBlock * entry = BasicBlock::Create(mMod->getContext(), "entry", function, 0);
     
    617453        iBuilder->SetInsertPoint(entry);
    618454        Value * const untransposedBuffer = iBuilder->CreateAlloca(gatherVectorArrayType, iBuilder->getInt32(4), "untransposedBuffer");
     455
     456        iBuilder->CallPrintInt("base", base);
     457        iBuilder->CallPrintInt("startArray", startArray);
     458        iBuilder->CallPrintInt("endArray", endArray);
     459        iBuilder->CallPrintInt("numOfKeys", numOfKeys);
     460        iBuilder->CallPrintInt("buffer", buffer);
     461
    619462        iBuilder->CreateBr(gatherCond);
    620463
     
    622465        iBuilder->SetInsertPoint(gatherCond);
    623466        PHINode * remainingLanes = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
    624         remainingLanes->addIncoming(count, entry);
     467        remainingLanes->addIncoming(numOfKeys, entry);
    625468        PHINode * gatherIV = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
    626469        gatherIV->addIncoming(iBuilder->getInt32(0), entry);
     470        iBuilder->CallPrintInt(" --- gatherIV", gatherIV);
    627471        Value * gatherLoopTest = iBuilder->CreateICmpNE(gatherIV, iBuilder->getInt32(4));
    628472        iBuilder->CreateCondBr(gatherLoopTest, partialGatherCond, transposeCond);
     
    630474        // PARTIAL GATHER COND
    631475        iBuilder->SetInsertPoint(partialGatherCond);
     476        iBuilder->CallPrintInt(" --- remainingLanes", remainingLanes);
    632477        Value * partialGatherLoopTest = iBuilder->CreateICmpSGE(remainingLanes, iBuilder->getInt32(vectorWidth));
    633478        iBuilder->CreateCondBr(partialGatherLoopTest, gatherBody, partialGatherBody);
     
    639484        maskedLanes = iBuilder->CreateMul(maskedLanes, iBuilder->getInt32(32));
    640485        maskedLanes = iBuilder->CreateZExt(maskedLanes, registerType);
    641         maskedLanes = iBuilder->CreateLShr(Constant::getAllOnesValue(registerType), maskedLanes);
     486        maskedLanes = iBuilder->CreateLShr(Constant::getAllOnesValue(registerType), maskedLanes);       
    642487        maskedLanes = iBuilder->CreateBitCast(maskedLanes, gatherVectorType);
    643 
    644488        iBuilder->CreateBr(gatherBody);
    645489
     
    649493        activeLanes->addIncoming(Constant::getAllOnesValue(gatherVectorType), partialGatherCond);
    650494        activeLanes->addIncoming(maskedLanes, partialGatherBody);
    651 
     495        iBuilder->CallPrintRegister(" --- activeLanes", activeLanes);
     496
     497        startArray = iBuilder->CreateBitCast(startArray, PointerType::get(gatherVectorType, 0));
    652498        Value * startPos = iBuilder->CreateAlignedLoad(iBuilder->CreateGEP(startArray, gatherIV), 4);
    653499        for (unsigned blockCount = 0; blockCount < minCount; ++blockCount) {
     500
     501            iBuilder->CallPrintRegister(" --- startPosF" + std::to_string(blockCount), startPos);
    654502            Value * tokenData = generateMaskedGather(base, startPos, activeLanes);
    655503            startPos = iBuilder->CreateAdd(startPos, four);
     504            iBuilder->CallPrintRegister(" --- tokenDataF" + std::to_string(blockCount), tokenData);
    656505            iBuilder->CreateAlignedStore(tokenData, iBuilder->CreateGEP(untransposedBuffer, {iBuilder->getInt32(blockCount), gatherIV}), 4);
    657506        }
    658507
     508        endArray = iBuilder->CreateBitCast(endArray, PointerType::get(gatherVectorType, 0));
    659509        Value * const endPos = iBuilder->CreateAlignedLoad(iBuilder->CreateGEP(endArray, gatherIV), 4);
    660510        for (unsigned blockCount = minCount; blockCount < maxCount; ++blockCount) {
     511
     512            iBuilder->CallPrintRegister(" --- startPosP" + std::to_string(blockCount), startPos);
     513
    661514            // if we have not fully gathered the data for this key
    662             Value * atLeastOneByte = iBuilder->CreateAnd(iBuilder->CreateSExt(iBuilder->CreateICmpULT(startPos, endPos), startPos->getType()), activeLanes);
     515            Value * atLeastOneByte = iBuilder->CreateSExt(iBuilder->CreateICmpULT(startPos, endPos), startPos->getType());
     516            atLeastOneByte = iBuilder->CreateAnd(atLeastOneByte, activeLanes);
     517            iBuilder->CallPrintRegister(" --- atLeastOneByte" + std::to_string(blockCount), atLeastOneByte);
     518
    663519            // gather it ...
    664520            Value * tokenData = generateMaskedGather(base, startPos, atLeastOneByte);
     521            iBuilder->CallPrintRegister(" --- tokenDataP" + std::to_string(blockCount), tokenData);
    665522            // and compute how much data is remaining.
    666523            Value * remaining = iBuilder->CreateSub(endPos, startPos);
     524
     525            iBuilder->CallPrintRegister(" --- remaining" + std::to_string(blockCount), remaining);
     526
    667527            // if this token only has 1 to 3 bytes remaining ...
    668             Value * lessThanFourBytes = iBuilder->CreateSExt(iBuilder->CreateICmpSLT(remaining, four), remaining->getType());
    669             Value * betweenOneAndThreeBytes = iBuilder->CreateAnd(atLeastOneByte, lessThanFourBytes);
    670             // determine how many bytes (bits?) do *not* belong to the token
    671             remaining = iBuilder->CreateSub(four, iBuilder->CreateAnd(remaining, betweenOneAndThreeBytes));
    672             // remaining = iBuilder->CreateShl(remaining, ConstantInt::get(remaining->getType(), 3));
     528            Value * atLeastFourBytes = iBuilder->CreateSExt(iBuilder->CreateICmpUGE(remaining, four), remaining->getType());
     529
     530            iBuilder->CallPrintRegister(" --- atLeastFourBytes" + std::to_string(blockCount), atLeastFourBytes);
     531
     532
     533            // determine how many bits do *not* belong to the token
     534            remaining = iBuilder->CreateSub(four, remaining);
     535            remaining = iBuilder->CreateShl(remaining, ConstantInt::get(remaining->getType(), 3));
     536
     537            iBuilder->CallPrintRegister(" --- remaining" + std::to_string(blockCount), remaining);
     538
    673539            // then mask them out prior to storing the value
    674540            Value * partialTokenMask = iBuilder->CreateLShr(ConstantInt::getAllOnesValue(remaining->getType()), remaining);
     541            partialTokenMask = iBuilder->CreateOr(partialTokenMask, atLeastFourBytes);
     542
     543            iBuilder->CallPrintRegister(" --- partialTokenMask" + std::to_string(blockCount), partialTokenMask);
     544
    675545            tokenData = iBuilder->CreateAnd(partialTokenMask, tokenData);
    676             iBuilder->CreateAlignedStore(tokenData, iBuilder->CreateGEP(untransposedBuffer, {iBuilder->getInt32(blockCount), gatherIV}), 4);
     546
     547            iBuilder->CallPrintRegister(" --- tokenDataM" + std::to_string(blockCount), tokenData);
     548
     549            Value * untransposedBufferPtr = iBuilder->CreateGEP(untransposedBuffer, {iBuilder->getInt32(blockCount), gatherIV});
     550
     551            iBuilder->CallPrintInt(" --- untransposedBufferPtr" + std::to_string(blockCount), untransposedBufferPtr);
     552
     553            iBuilder->CreateAlignedStore(tokenData, untransposedBufferPtr, 4);
    677554            if (blockCount < (maxCount - 1)) {
    678555                startPos = iBuilder->CreateAdd(startPos, four);
     
    719596        }
    720597        Value * offset = iBuilder->CreateShl(transposeIV, ConstantInt::get(transposeIV->getType(), 2));
     598        transposeIV->addIncoming(iBuilder->CreateAdd(transposeIV, iBuilder->getInt32(1)), transposeBody);
     599        buffer = iBuilder->CreateBitCast(buffer, PointerType::get(resultType, 0));
    721600        for (unsigned i = 0; i < 4; ++i) {
    722601            Value * index = offset;
    723602            if (i) {
    724                 index = iBuilder->CreateOr(offset, iBuilder->getInt32(i));
     603                index = iBuilder->CreateAdd(offset, iBuilder->getInt32(i));
    725604            }
    726             Value * ptr = iBuilder->CreateGEP(transposedBuffer, index);
     605            Value * ptr = iBuilder->CreateGEP(buffer, index);
    727606            iBuilder->CreateAlignedStore(value[i], ptr, 4);
    728607        }
    729         transposeIV->addIncoming(iBuilder->CreateAdd(transposeIV, iBuilder->getInt32(1)), transposeBody);
    730608        iBuilder->CreateBr(transposeCond);
    731609
    732610        // EXIT
    733611        iBuilder->SetInsertPoint(exit);
    734         iBuilder->CreateRet(transposedBuffer);
     612        iBuilder->CreateRetVoid();
    735613
    736614        iBuilder->restoreIP(ip);
     
    763641    mSortingKernel = new KernelBuilder("sorting", mMod, iBuilder, bufferSize);
    764642    mGatherKernel = new KernelBuilder("gathering", mMod, iBuilder, 1);
     643    mStdOutKernel = new KernelBuilder("stddout", mMod, iBuilder, 1);
    765644
    766645    generateS2PKernel(mMod, iBuilder, mS2PKernel);
     
    777656
    778657    generateGatherKernel(mGatherKernel, endpoints, 64);
    779 
     658    generateStdOutKernel(mMod, iBuilder, mStdOutKernel);
    780659}
    781660
     
    817696    Instance * leadingInstance = mLeadingKernel->instantiate(s2pInstance->getOutputStreamSet());
    818697    Instance * sortingInstance = mSortingKernel->instantiate(leadingInstance->getOutputStreamSet());
     698    Instance * gatheringInstance = mGatherKernel->instantiate(sortingInstance->getOutputStreamSet());
     699    Instance * stdOutInstance = mStdOutKernel->instantiate(gatheringInstance->getOutputStreamSet());
     700
     701    gatheringInstance->setInternalState("Base", iBuilder->CreateBitCast(inputStream, iBuilder->getInt8PtrTy()));
     702
     703    stdOutInstance->setInternalState("RemainingBytes", bufferSize);  // The total number of bytes to be sent to stdout.
    819704
    820705    const unsigned leadingBlocks = (mLongestLookahead + iBuilder->getBitBlockWidth() - 1) / iBuilder->getBitBlockWidth();
     
    858743    Value * remainingBytesCond = iBuilder->CreateICmpULT(remainingBytes2, requiredBytes);
    859744    iBuilder->CreateCondBr(remainingBytesCond, regularExitBlock, regularBodyBlock);
     745
    860746    iBuilder->SetInsertPoint(regularBodyBlock);
    861747    s2pInstance->CreateDoBlockCall();
    862748    leadingInstance->CreateDoBlockCall();
    863749    sortingInstance->CreateDoBlockCall();
     750    gatheringInstance->CreateDoBlockCall();
     751//    stdOutInstance->CreateDoBlockCall();
    864752    remainingBytes2->addIncoming(iBuilder->CreateSub(remainingBytes2, blockSize), regularBodyBlock);
    865753    iBuilder->CreateBr(regularTestBlock);
     
    879767    leadingInstance->clearOutputStreamSet();
    880768    sortingInstance->CreateDoBlockCall();
     769    gatheringInstance->CreateDoBlockCall();
     770//    stdOutInstance->CreateDoBlockCall();
    881771    iBuilder->CreateBr(finalTestBlock);
    882772
     
    892782    leadingInstance->clearOutputStreamSet();
    893783    sortingInstance->CreateDoBlockCall();
     784    gatheringInstance->CreateDoBlockCall();
     785//    stdOutInstance->CreateDoBlockCall();
    894786    remainingFullBlocks->addIncoming(iBuilder->CreateSub(remainingFullBlocks, iBuilder->getInt64(1)), finalBodyBlock);
     787
     788
     789
    895790
    896791    iBuilder->CreateBr(finalTestBlock);
     
    906801    delete mSortingKernel;
    907802    delete mGatherKernel;
    908 }
    909 
    910 
    911 }
     803    delete mStdOutKernel;
     804}
     805
     806
     807}
  • icGREP/icgrep-devel/icgrep/kernels/symboltablepipeline.h

    r4992 r4995  
    3535    Function * generateGatherFunction(Type * const transposedVectorType, const unsigned minCount, const unsigned maxCount);
    3636
    37     Value * generateGather(Value * const base, Value * const vindex);
    3837    Value * generateMaskedGather(Value * const base, Value * const vindex, Value * const mask);
    39 
    40     void generateLLVMParser();
    4138
    4239private:
     
    4744    KernelBuilder *                     mSortingKernel;
    4845    KernelBuilder *                     mGatherKernel;
     46    KernelBuilder *                     mStdOutKernel;
    4947
    5048    unsigned                            mLongestLookahead;
  • icGREP/icgrep-devel/icgrep/pablo/pablo_compiler.cpp

    r4986 r4995  
    130130    for (unsigned j = 0; j < function->getNumOfResults(); ++j) {
    131131        const auto f = mMarkerMap.find(function->getResult(j));
    132         Value * result = nullptr;
    133132        if (LLVM_UNLIKELY(f == mMarkerMap.end())) {
    134             result = iBuilder->allZeroes();
    135         } else {
    136             result = f->second;
    137         }
    138         iBuilder->CreateBlockAlignedStore(result, mKernelBuilder->getOutputStream(j));
     133            throw std::runtime_error("PabloCompiler: result " + std::to_string(j) + " was not assigned a value!");
     134        }
     135        iBuilder->CreateBlockAlignedStore(f->second, mKernelBuilder->getOutputStream(j));
    139136    }
    140137
  • icGREP/icgrep-devel/icgrep/toolchain.cpp

    r4994 r4995  
    7777    builder.setErrorStr(&errMessage);
    7878    builder.setMCPU(sys::getHostCPUName());
     79    TargetOptions opts = InitTargetOptionsFromCodeGenFlags();
     80    #ifndef NDEBUG
     81    opts.JITEmitDebugInfo = 1;
     82    #endif
     83    builder.setTargetOptions(opts);
    7984    CodeGenOpt::Level optLevel = CodeGenOpt::Level::None;
    8085    switch (OptLevel) {
Note: See TracChangeset for help on using the changeset viewer.