Changeset 5905 for icGREP


Ignore:
Timestamp:
Mar 13, 2018, 12:08:22 PM (12 months ago)
Author:
xwa163
Message:
  1. Fix some bugs in the match copy kernel
  2. Remove some legacy code from the match copy kernel and the sequential kernel
Location:
icGREP/icgrep-devel/icgrep/kernels
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_extract_e_m0.cpp

    r5885 r5905  
    66#include <iostream>
    77
    8 //#define APPLY_64PACK_ACCELERATION
     8#define APPLY_64PACK_ACCELERATION
    99// TODO May be we can change it to 256 PACK Acceleration based on SIMD instruction
    1010
     
    687687        ) {
    688688    this->initBufferCursor(iBuilder, {"extender"});
    689     this->configIndexBits(iBuilder, inputIndexMap);
    690 //    this->configOutputBufferToBeClear({{"byteStream", "e1Marker"}});
    691 //    setNoTerminateAttribute(true);
    692689}
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_match_copy_kernel.cpp

    r5895 r5905  
    66#include <kernels/streamset.h>
    77#include <toolchain/toolchain.h>
    8 
    9 #define OUTPUT_BIT_STREAM_NAME "outputStream"
     8#include <llvm/Support/raw_ostream.h>
     9
     10#define OUTPUT_STREAM_NAME "outputStream"
    1011
    1112using namespace llvm;
     
    2021    Value *previousProcessed = iBuilder->getProcessedItemCount("decompressedStream");
    2122
    22 //    BasicBlock *entryBlock = iBuilder->GetInsertBlock();
    2323    Value *inputBasePtr = iBuilder->getInputStreamBlockPtr("decompressedStream", SIZE_ZERO);
    2424
    25     Value *outputBasePtr = iBuilder->getOutputStreamBlockPtr(OUTPUT_BIT_STREAM_NAME, SIZE_ZERO);
     25    Value *outputBasePtr = iBuilder->getOutputStreamBlockPtr(OUTPUT_STREAM_NAME, SIZE_ZERO);
    2626    Value *itemsToDo = mAvailableItemCount[0];
    27     Value *copySize = iBuilder->CreateUMin(
    28             itemsToDo,
    29             iBuilder->CreateMul(outputBlocks, SIZE_BIT_BLOCK_WIDTH)
    30     );
    31 //    iBuilder->CallPrintInt("itemsToDo", itemsToDo);
    32 //    iBuilder->CallPrintInt("itemsToDo1", mAvailableItemCount[1]);
    33 //    iBuilder->CallPrintInt("itemsToDo2", mAvailableItemCount[2]);
    34 //    iBuilder->CallPrintInt("itemsToDo3", mAvailableItemCount[3]);
    35 //    iBuilder->CallPrintInt("copySize", copySize);
     27    Value *copySize = iBuilder->CreateMul(outputBlocks, SIZE_BIT_BLOCK_WIDTH);
     28    Value* actualCopySize = iBuilder->CreateUMin(itemsToDo, copySize);
    3629
    3730    iBuilder->CreateMemCpy(
     
    3932            inputBasePtr,
    4033            copySize,
    41             1 // Not align guaranteed in final block
    42     );
    43 //    iBuilder->CallPrintInt("outputCpyPtr", outputBasePtr);
    44 //    iBuilder->CallPrintInt("outputBlocks", outputBlocks);
    45     Value *newProcessed = iBuilder->CreateAdd(previousProcessed, copySize);
     34            iBuilder->getBitBlockWidth()
     35    ); //It will be ok to always copy by full block
     36
     37    Value *newProcessed = iBuilder->CreateAdd(previousProcessed, actualCopySize);
    4638    iBuilder->setProcessedItemCount("decompressedStream", newProcessed);
    47     iBuilder->setProducedItemCount(OUTPUT_BIT_STREAM_NAME, newProcessed);
     39    iBuilder->setProducedItemCount(OUTPUT_STREAM_NAME, newProcessed);
    4840
    4941}
     
    6961    Value *lastDepositPosition = iBuilder->CreateAdd(lastM0, SIZE_ONE);
    7062
    71     // TODO maybe we can not use mIsFinal here
    7263    Value *currentMaxBlock = iBuilder->CreateSelect(
    7364            this->mIsFinalBlock,
     
    7869    // Produced Item Count will always be full bitblock except for final block
    7970    Value *previousProducedBlocks = iBuilder->CreateUDiv(
    80             iBuilder->getProducedItemCount(OUTPUT_BIT_STREAM_NAME),
     71            iBuilder->getProducedItemCount(OUTPUT_STREAM_NAME),
    8172            SIZE_BIT_BLOCK_WIDTH
    8273    );
     
    9485    Value *itemsToDo = mAvailableItemCount[0];
    9586
    96     Value *isFinalBlock =
    97             iBuilder->CreateOr(
    98                     iBuilder->CreateICmpULT(itemsToDo, iBuilder->CreateMul(numOfStrides, SIZE_BIT_BLOCK_WIDTH)),
    99                     iBuilder->CreateICmpEQ(itemsToDo, iBuilder->getSize(0))
    100             );
    101 
    102     this->mIsFinalBlock = isFinalBlock;
     87
    10388//    iBuilder->CallPrintInt("isFinalBlock", isFinalBlock);
    104     iBuilder->setTerminationSignal(isFinalBlock);
    105 
    106 
    107 
    108 
    109     Value *previousProducedItemCount = iBuilder->getProducedItemCount(OUTPUT_BIT_STREAM_NAME);
     89
     90//    iBuilder->CallPrintInt("matchCopy:isFinalBlock", isFinalBlock);
     91
     92    Value *previousProducedItemCount = iBuilder->getProducedItemCount(OUTPUT_STREAM_NAME);
    11093
    11194
    11295    // Space Calculation
    11396    Value *outputBufferBlocks = iBuilder->getSize(
    114             this->getAnyStreamSetBuffer(OUTPUT_BIT_STREAM_NAME)->getBufferBlocks());
    115     // TODO need to take previous produced size into account
    116 
    117 
     97            this->getAnyStreamSetBuffer(OUTPUT_STREAM_NAME)->getBufferBlocks());
     98
     99    Value *outputCurrentPtr = iBuilder->getOutputStreamBlockPtr(OUTPUT_STREAM_NAME, SIZE_ZERO); // [8 x <4 x i64>]*
    118100    Value *outputRawBeginPtr = iBuilder->CreatePointerCast(
    119             iBuilder->getRawOutputPointer(OUTPUT_BIT_STREAM_NAME, SIZE_ZERO),
    120             iBuilder->getBitBlockType()->getPointerTo());
    121     Value *outputCurrentPtr = iBuilder->getOutputStreamBlockPtr(OUTPUT_BIT_STREAM_NAME, SIZE_ZERO);
     101            iBuilder->getRawOutputPointer(OUTPUT_STREAM_NAME, SIZE_ZERO), outputCurrentPtr->getType());
    122102    Value *producedOffset = iBuilder->CreatePtrDiff(outputCurrentPtr, outputRawBeginPtr);
    123103    Value *remainSpace = iBuilder->CreateSub(outputBufferBlocks, producedOffset);
     
    134114    Value *outputBlocks = iBuilder->CreateUMin(writableBlocks, numOfStrides);
    135115    // outputBlock === min(writableBlocks, numOfStrides, (matchOffsetPosition + matchLength - producedItemCount) / bitBlockWidth )
    136 
    137     outputBlocks = iBuilder->CreateUMin(outputBlocks, this->getMaximumMatchCopyBlock(iBuilder));
    138 
     116//    iBuilder->CallPrintInt("outputBlocks1", outputBlocks);
     117
     118//    outputBlocks = iBuilder->CreateUMin(outputBlocks, this->getMaximumMatchCopyBlock(iBuilder)); //TODO need to handle final block, otherwise it may be deadloop when there is not match copy in final block
     119//    iBuilder->CallPrintInt("outputBlocks2", outputBlocks);
    139120
    140121//    BasicBlock * entryBlock = iBuilder->GetInsertBlock();
    141122
     123
     124    Value *isFinalBlock =
     125            iBuilder->CreateOr(
     126                    iBuilder->CreateICmpULT(itemsToDo, iBuilder->CreateMul(outputBlocks, SIZE_BIT_BLOCK_WIDTH)),
     127                    iBuilder->CreateICmpEQ(itemsToDo, iBuilder->getSize(0))
     128            );
     129
     130    this->mIsFinalBlock = isFinalBlock;
     131    iBuilder->setTerminationSignal(isFinalBlock);
    142132    // Output Copy
    143133    this->generateOutputCopy(iBuilder, outputBlocks);
    144 //    return;
    145 
    146     Value *newProducedItemCount = iBuilder->getProducedItemCount(OUTPUT_BIT_STREAM_NAME);
     134
     135    Value *newProducedItemCount = iBuilder->getProducedItemCount(OUTPUT_STREAM_NAME);
    147136
    148137    BasicBlock *copyEndBlock = iBuilder->CreateBasicBlock("copyEnd");
     
    150139    iBuilder->SetInsertPoint(copyEndBlock);
    151140
    152     // TODO match Copy
     141    // Match Copy
    153142    BasicBlock *exitBlock = iBuilder->CreateBasicBlock("exit_block");
    154143
     
    238227    iBuilder->SetInsertPoint(matchCopyBodyBlock);
    239228    Value* matchCopyFromPos = iBuilder->CreateSub(phiMatchPos, phiMatchOffset);
    240     Value* rawOutputBasePtr = iBuilder->CreatePointerCast(iBuilder->getRawOutputPointer(OUTPUT_BIT_STREAM_NAME, SIZE_ZERO), iBuilder->getInt8PtrTy());
    241 //    iBuilder->CallPrintInt("rawOutputBasePtr", rawOutputBasePtr);
    242 //    iBuilder->CallPrintInt("rawOutputBasePtr1", iBuilder->CreateGEP(
    243 //            rawOutputBasePtr,
    244 //            iBuilder->CreateURem(matchCopyFromPos, iBuilder->CreateMul(outputBufferBlocks, SIZE_BIT_BLOCK_WIDTH))
    245 //    ));
    246     Value* matchCopyFromValue = iBuilder->CreateLoad(
    247             iBuilder->CreateGEP(
    248                     rawOutputBasePtr,
    249                     iBuilder->CreateURem(matchCopyFromPos, iBuilder->CreateMul(outputBufferBlocks, SIZE_BIT_BLOCK_WIDTH))
    250             ));
     229    Value* rawOutputBasePtr = iBuilder->CreatePointerCast(iBuilder->getRawOutputPointer(OUTPUT_STREAM_NAME, SIZE_ZERO), iBuilder->getInt8PtrTy());
     230
     231    Value* outputBufferSize = iBuilder->CreateMul(outputBufferBlocks, SIZE_BIT_BLOCK_WIDTH);
     232    Value* matchCopyFromOffset = iBuilder->CreateURem(matchCopyFromPos, outputBufferSize);
     233    Value* matchCopyFromPtr = iBuilder->CreateGEP(rawOutputBasePtr, matchCopyFromOffset);
    251234
    252235    // Output is guranteed to be full bit block except for final block
    253     Value* outputBlockBasePtr = iBuilder->CreatePointerCast(iBuilder->getOutputStreamBlockPtr(OUTPUT_BIT_STREAM_NAME, SIZE_ZERO), iBuilder->getInt8PtrTy());
     236    Value* outputBlockBasePtr = iBuilder->CreatePointerCast(iBuilder->getOutputStreamBlockPtr(OUTPUT_STREAM_NAME, SIZE_ZERO), iBuilder->getInt8PtrTy());
    254237    Value* outputTargetPtr = iBuilder->CreateGEP(outputBlockBasePtr, iBuilder->CreateSub(phiMatchPos, previousProducedItemCount));
    255 //    iBuilder->CallPrintInt("matchCopyFromValue", matchCopyFromValue);
    256 //    iBuilder->CallPrintInt("phiMatchPos", phiMatchPos);
    257 //    iBuilder->CallPrintInt("aa", iBuilder->CreateSub(phiMatchPos, previousProducedItemCount));
    258     iBuilder->CreateStore(matchCopyFromValue, outputTargetPtr);
     238
     239    Value* matchCopyFromRemain = iBuilder->CreateSub(outputBufferSize, matchCopyFromOffset);
     240    // phiMatchOffset
     241    // phiMatchLength
     242    Value* currentCopySize = iBuilder->CreateUMin(matchCopyFromRemain, phiMatchOffset);
     243    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchLength);
     244    currentCopySize = iBuilder->CreateUMin(currentCopySize, iBuilder->CreateSub(newProducedItemCount, phiMatchPos));
     245
     246    currentCopySize = iBuilder->CreateSelect(iBuilder->CreateICmpEQ(currentCopySize, SIZE_ZERO), SIZE_ONE, currentCopySize); //Workaround for the last byte
     247
     248//    currentCopySize = SIZE_ONE;
     249    iBuilder->CreateMemCpy(outputTargetPtr, matchCopyFromPtr, currentCopySize, 0);
     250
     251//    iBuilder->CallPrintInt("outputTargetPtr", iBuilder->CreateGEP(iBuilder->CreateLoad(outputTargetPtr), iBuilder->CreateSub(currentCopySize, SIZE_ONE)));
     252//    iBuilder->CallPrintInt("matchCopyFromPtr", iBuilder->CreateGEP(iBuilder->CreateLoad(matchCopyFromPtr), iBuilder->CreateSub(currentCopySize, SIZE_ONE)));
    259253
    260254    phiProcessIndex->addIncoming(phiProcessIndex, iBuilder->GetInsertBlock());
    261255    phiMatchOffset->addIncoming(phiMatchOffset, iBuilder->GetInsertBlock());
    262     phiMatchPos->addIncoming(iBuilder->CreateAdd(phiMatchPos, SIZE_ONE), iBuilder->GetInsertBlock());
    263     phiMatchLength->addIncoming(iBuilder->CreateSub(phiMatchLength, SIZE_ONE), iBuilder->GetInsertBlock());
     256    phiMatchPos->addIncoming(iBuilder->CreateAdd(phiMatchPos, currentCopySize), iBuilder->GetInsertBlock());
     257    phiMatchLength->addIncoming(iBuilder->CreateSub(phiMatchLength, currentCopySize), iBuilder->GetInsertBlock());
    264258
    265259    iBuilder->CreateBr(matchCopyLoopCon);
     
    276270}
    277271
    278 
    279 void LZ4MatchCopyKernel::generateStoreCircularOutput(const unique_ptr<KernelBuilder> &iBuilder, string outputBufferName,
    280                                                      Value *offset, Type *pointerType, Value *value) {
    281     size_t inputSize = this->getOutputBufferSize(iBuilder, outputBufferName);
    282     Value *offsetMask = iBuilder->getSize(inputSize - 1);
    283     Value *maskedOffset = iBuilder->CreateAnd(offsetMask, offset);
    284 
    285     Value *outputBufferPtr = iBuilder->getRawOutputPointer(outputBufferName, iBuilder->getSize(0));
    286 
    287     outputBufferPtr = iBuilder->CreatePointerCast(outputBufferPtr, pointerType);
    288     iBuilder->CreateStore(value, iBuilder->CreateGEP(outputBufferPtr, maskedOffset));
    289 }
    290 
    291 Value *LZ4MatchCopyKernel::generateLoadCircularOutput(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName,
    292                                                       Value *offset, Type *pointerType) {
    293     size_t inputSize = this->getOutputBufferSize(iBuilder, inputBufferName);
    294     Value *offsetMask = iBuilder->getSize(inputSize - 1);
    295     Value *maskedOffset = iBuilder->CreateAnd(offsetMask, offset);
    296 
    297     Value *inputBufferPtr = iBuilder->getRawOutputPointer(inputBufferName, iBuilder->getSize(0));
    298 
    299     inputBufferPtr = iBuilder->CreatePointerCast(inputBufferPtr, pointerType);
    300     return iBuilder->CreateLoad(iBuilder->CreateGEP(inputBufferPtr, maskedOffset));
    301 }
    302 
    303 Value *LZ4MatchCopyKernel::generateLoadCircularInput(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName,
    304                                                      Value *offset, Type *pointerType) {
    305     size_t inputSize = this->getInputBufferSize(iBuilder, inputBufferName);
    306     Value *offsetMask = iBuilder->getSize(inputSize - 1);
    307     Value *maskedOffset = iBuilder->CreateAnd(offsetMask, offset);
    308 
    309     Value *inputBufferPtr = iBuilder->getRawInputPointer(inputBufferName, iBuilder->getSize(0));
    310 
    311     inputBufferPtr = iBuilder->CreatePointerCast(inputBufferPtr, pointerType);
    312     return iBuilder->CreateLoad(iBuilder->CreateGEP(inputBufferPtr, maskedOffset));
    313 }
    314 
    315 size_t LZ4MatchCopyKernel::getInputBufferSize(const unique_ptr<KernelBuilder> &iBuilder, string bufferName) {
    316     return this->getInputStreamSetBuffer(bufferName)->getBufferBlocks() * iBuilder->getStride();
    317 }
    318 
    319 size_t LZ4MatchCopyKernel::getOutputBufferSize(const unique_ptr<KernelBuilder> &iBuilder, string bufferName) {
    320     return this->getOutputStreamSetBuffer(bufferName)->getBufferBlocks() * iBuilder->getStride();
    321 }
    322 
    323272LZ4MatchCopyKernel::LZ4MatchCopyKernel(const std::unique_ptr<kernel::KernelBuilder> &iBuilder)
    324273        : MultiBlockKernel("lz4MatchCopyKernel",
     
    332281                           },
    333282        // Outputs
    334                            {Binding{iBuilder->getStreamSetTy(1, 8), OUTPUT_BIT_STREAM_NAME, BoundedRate(0, 1)}},
     283                           {Binding{iBuilder->getStreamSetTy(1, 8), OUTPUT_STREAM_NAME, BoundedRate(0, 1)}},
    335284        // Arguments
    336285                           {},
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_match_copy_kernel.h

    r5895 r5905  
    1515    protected:
    1616        void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides) override;
    17 //        void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
    1817    private:
    19         llvm::Value* generateLoadCircularInput(const std::unique_ptr<KernelBuilder> &iBuilder, std::string inputBufferName, llvm::Value* offset, llvm::Type* pointerType);
    20         llvm::Value* generateLoadCircularOutput(const std::unique_ptr<KernelBuilder> &iBuilder, std::string inputBufferName, llvm::Value* offset, llvm::Type* pointerType);
    21         void generateStoreCircularOutput(const std::unique_ptr<KernelBuilder> &iBuilder, std::string outputBufferName, llvm::Value* offset, llvm::Type* pointerType, llvm::Value* value);
    22 
    23         size_t getInputBufferSize(const std::unique_ptr<KernelBuilder> &iBuilder, std::string bufferName);
    24         size_t getOutputBufferSize(const std::unique_ptr<KernelBuilder> &iBuilder, std::string bufferName);
    2518        void generateOutputCopy(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value* outputBlocks);
    2619
  • icGREP/icgrep-devel/icgrep/kernels/sequential_kernel.cpp

    r5864 r5905  
    88
    99
    10 
    1110using namespace llvm;
    1211using namespace kernel;
     
    1514
    1615#define SequentialSegmentStateKey ("SequentialSegment_State")
    17 #define ModifyInputTempKey ("ModifyInput_Temp")
    18 #define MemCpyUntilZeroCopyOffsetTempKey ("MemCpyUntilZeroCopyOffsetTempKey")
    1916#define CountForwardMaxPosTempKey ("CountForwardMaxPosTempKey")
    20 
    2117
    2218
    2319namespace kernel {
    2420    SequentialKernel::SequentialKernel(
    25             const std::unique_ptr<kernel::KernelBuilder> & iBuilder,
    26             std::string && kernelName,
    27             std::vector<Binding> && stream_inputs,
    28             std::vector<Binding> && stream_outputs,
    29             std::vector<Binding> && scalar_parameters,
    30             std::vector<Binding> && scalar_outputs,
    31             std::vector<Binding> && internal_scalars):
    32             MultiBlockKernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
     21            const std::unique_ptr<kernel::KernelBuilder> &iBuilder,
     22            std::string &&kernelName,
     23            std::vector<Binding> &&stream_inputs,
     24            std::vector<Binding> &&stream_outputs,
     25            std::vector<Binding> &&scalar_parameters,
     26            std::vector<Binding> &&scalar_outputs,
     27            std::vector<Binding> &&internal_scalars) :
     28            MultiBlockKernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs),
     29                             std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
    3330        addScalar(iBuilder->getSizeTy(), SequentialSegmentStateKey);
    34         addScalar(iBuilder->getInt1Ty(), ModifyInputTempKey);
    35         addScalar(iBuilder->getSizeTy(), MemCpyUntilZeroCopyOffsetTempKey);
    3631        addScalar(iBuilder->getSizeTy(), CountForwardMaxPosTempKey);
    3732        addScalar(iBuilder->getSizeTy(), "tempClear");
     
    4035
    4136
    42     void SequentialKernel::recordCountForwardTempMaxPos(const std::unique_ptr<KernelBuilder> &iBuilder, Value* maxPos) {
     37    void SequentialKernel::recordCountForwardTempMaxPos(const std::unique_ptr<KernelBuilder> &iBuilder, Value *maxPos) {
    4338        if (maxPos) {
    4439            iBuilder->setScalarField(CountForwardMaxPosTempKey, maxPos);
    4540        }
    4641    }
    47     Value* SequentialKernel::restoreCountForwardTempMaxPos(const std::unique_ptr<KernelBuilder> &iBuilder, Value* currentMaxPos) {
     42
     43    Value *SequentialKernel::restoreCountForwardTempMaxPos(const std::unique_ptr<KernelBuilder> &iBuilder,
     44                                                           Value *currentMaxPos) {
    4845        if (currentMaxPos) {
    4946            return iBuilder->getScalarField(CountForwardMaxPosTempKey);
     
    5249    }
    5350
    54     void SequentialKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
    55         BasicBlock* entryBlock = iBuilder->GetInsertBlock();
     51    void SequentialKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder,
     52                                                   Value *const numOfStrides) {
     53        BasicBlock *entryBlock = iBuilder->GetInsertBlock();
    5654//        iBuilder->CallPrintInt("entry", iBuilder->getSize(1));
    5755//        iBuilder->CallPrintInt("available", iBuilder->getAvailableItemCount("byteStream"));
    5856
    5957        // AfterEntryBlock will be the entry block of subclass if it is initial state
    60         BasicBlock* afterEntryBlock = iBuilder->CreateBasicBlock("afterEntryBlock");
     58        BasicBlock *afterEntryBlock = iBuilder->CreateBasicBlock("afterEntryBlock");
    6159        this->exitBlock = iBuilder->CreateBasicBlock("exitBlock");
    6260
     
    7068
    7169        iBuilder->SetInsertPoint(entryBlock);
    72         this->generateBuildIndexBits(iBuilder);
    73         this->generateClearBuffer(iBuilder);
    7470
    7571
    7672        // Create Indirect Branch
    77         std::vector<Constant*> blockAddressVector = std::vector<Constant*>();
    78         for (BasicBlock* bb : this->stateBlocks) {
     73        std::vector<Constant *> blockAddressVector = std::vector<Constant *>();
     74        for (BasicBlock *bb : this->stateBlocks) {
    7975            blockAddressVector.push_back(BlockAddress::get(bb));
    8076        }
    81         Constant * labels = ConstantVector::get(blockAddressVector);
    82 
    83         Value * target = iBuilder->CreateExtractElement(labels, iBuilder->getScalarField(SequentialSegmentStateKey));
    84         IndirectBrInst * indirectBr = iBuilder->CreateIndirectBr(target);
    85         for (BasicBlock* bb : this->stateBlocks) {
     77        Constant *labels = ConstantVector::get(blockAddressVector);
     78
     79        Value *target = iBuilder->CreateExtractElement(labels, iBuilder->getScalarField(SequentialSegmentStateKey));
     80        IndirectBrInst *indirectBr = iBuilder->CreateIndirectBr(target);
     81        for (BasicBlock *bb : this->stateBlocks) {
    8682            indirectBr->addDestination(bb);
    8783        }
    8884
    8985        iBuilder->SetInsertPoint(this->exitBlock);
    90     }
    91 
    92     bool SequentialKernel::hasIndexBits(const std::string& streamName) {
    93         return inputStreamIndexMap.find(streamName) != inputStreamIndexMap.end();
    94     }
    95 
    96     void SequentialKernel::configOutputBufferToBeClear(const std::map<string, string>& clearMap) {
    97         this->clearBufferMap = clearMap;
    98     }
    99 
    100     void SequentialKernel::generateClearBuffer(const std::unique_ptr<KernelBuilder> &iBuilder) {
    101         BasicBlock* entryBlock = iBuilder->CreateBasicBlock("clear_buffer_entry");
    102         BasicBlock* exitBlock = iBuilder->CreateBasicBlock("clear_buffer_exit");
    103 
    104         iBuilder->CreateBr(entryBlock);
    105         iBuilder->SetInsertPoint(entryBlock);
    106 
    107         for (auto iter = this->clearBufferMap.begin(); iter != this->clearBufferMap.end(); iter++) {
    108             string inputName = iter->first;
    109             string outputName = iter->second;
    110 
    111             BasicBlock* clearEntry = iBuilder->CreateBasicBlock("clear_" + outputName + "_entry");
    112             BasicBlock* clearCon = iBuilder->CreateBasicBlock("clear_" + outputName + "_con");
    113             BasicBlock* clearBody = iBuilder->CreateBasicBlock("clear_" + outputName + "_body");
    114             BasicBlock* clearExit = iBuilder->CreateBasicBlock("clear_" + outputName + "_exit");
    115 
    116             iBuilder->CreateBr(clearEntry);
    117             iBuilder->SetInsertPoint(clearEntry);
    118 
    119             Value* itemProduced = iBuilder->getScalarField("tempClear");
    120             Value* itemsTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(inputName), iBuilder->getProcessedItemCount(inputName));
    121             iBuilder->setScalarField("tempClear", itemsTotal);
    122 
    123             size_t outputSize = this->getOutputBufferSize(iBuilder, outputName);
    124             size_t outputPackNum = outputSize / 64;
    125 
    126             Value* startPackIndex = iBuilder->CreateLShr(itemProduced, iBuilder->getSize(std::log2(64)));
    127             Value* endPackIndex = iBuilder->CreateLShr(itemsTotal, iBuilder->getSize(std::log2(64)));
    128 
    129             iBuilder->CreateBr(clearCon);
    130             iBuilder->SetInsertPoint(clearCon);
    131 
    132             PHINode* currentPackIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
    133             currentPackIndex->addIncoming(startPackIndex, clearEntry);
    134             iBuilder->CreateCondBr(iBuilder->CreateICmpULT(currentPackIndex, endPackIndex), clearBody, clearExit);
    135 
    136             iBuilder->SetInsertPoint(clearBody);
    137             Value* outputBasePtr = iBuilder->getRawOutputPointer(outputName, iBuilder->getSize(0));
    138             outputBasePtr = iBuilder->CreatePointerCast(outputBasePtr, iBuilder->getInt64Ty()->getPointerTo());
    139             Value* maskedPackIndex = iBuilder->CreateAnd(currentPackIndex, iBuilder->getSize(outputPackNum - 1));
    140             iBuilder->CreateStore(iBuilder->getInt64(0), iBuilder->CreateGEP(outputBasePtr, maskedPackIndex));
    141 
    142             currentPackIndex->addIncoming(iBuilder->CreateAdd(currentPackIndex, iBuilder->getSize(1)), clearBody);
    143             iBuilder->CreateBr(clearCon);
    144 
    145             iBuilder->SetInsertPoint(clearExit);
    146         }
    147         iBuilder->CreateBr(exitBlock);
    148         iBuilder->SetInsertPoint(exitBlock);
    149 
    150     }
    151 
    152     void SequentialKernel::generateBuildIndexBits(const std::unique_ptr<KernelBuilder> &iBuilder) {
    153 //        iBuilder->CallPrintInt("entry", iBuilder->getSize(0));
    154         BasicBlock* entryBlock = iBuilder->CreateBasicBlock("build_index_bits_entry");
    155         BasicBlock* exitBlock = iBuilder->CreateBasicBlock("build_index_bits_exit");
    156 
    157         iBuilder->CreateBr(entryBlock);
    158 
    159         // Entry Block
    160         iBuilder->SetInsertPoint(entryBlock);
    161 
    162         for (auto iter = inputStreamIndexMap.begin(); iter != inputStreamIndexMap.end(); iter++) {
    163             string streamName = iter->first;
    164 //            size_t indexArraySize = iter->second;
    165 
    166             BasicBlock* indexUpdateEntryBlock = iBuilder->CreateBasicBlock(streamName + "_index_update_entry");
    167             iBuilder->CreateBr(indexUpdateEntryBlock);
    168 
    169             iBuilder->SetInsertPoint(indexUpdateEntryBlock);
    170 
    171             Value* previousItemsAvailable = iBuilder->getScalarField(this->generateInputPreviousAvailableName(streamName));
    172             Value* itemsTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(streamName), iBuilder->getProcessedItemCount(streamName));
    173             iBuilder->setScalarField(this->generateInputPreviousAvailableName(streamName), itemsTotal);
    174 
    175             size_t bufferSize = this->getInputBufferSize(iBuilder, streamName);
    176             size_t indexBitsCount = bufferSize / 64;
    177 
    178             Value* indexBitToBeUpdateStart = iBuilder->CreateLShr(previousItemsAvailable, std::log2(64));
    179             Value* indexBitToBeUpdateEnd = iBuilder->CreateLShr(iBuilder->CreateAdd(itemsTotal, iBuilder->getSize(63)), std::log2(64));
    180 
    181 
    182             BasicBlock* updateLoopCon = iBuilder->CreateBasicBlock(streamName + "_index_update_loop_con");
    183             BasicBlock* updateLoopBody = iBuilder->CreateBasicBlock(streamName + "_index_update_loop_body");
    184             BasicBlock* updateLoopFinal = iBuilder->CreateBasicBlock(streamName + "_index_update_loop_final");
    185             BasicBlock* updateLoopExit = iBuilder->CreateBasicBlock(streamName + "_index_update_loop_exit");
    186 
    187             iBuilder->CreateBr(updateLoopCon);
    188 
    189 
    190             // Update Loop Con
    191             iBuilder->SetInsertPoint(updateLoopCon);
    192             PHINode* currentUpdateBitIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
    193             currentUpdateBitIndex->addIncoming(indexBitToBeUpdateStart, indexUpdateEntryBlock);
    194 
    195             iBuilder->CreateCondBr(
    196                     iBuilder->CreateICmpULT(currentUpdateBitIndex, indexBitToBeUpdateEnd),
    197                     updateLoopBody,
    198                     updateLoopExit
    199             );
    200 
    201             // Update Loop Body
    202             iBuilder->SetInsertPoint(updateLoopBody);
    203             Value* bitIndex = iBuilder->CreateURem(currentUpdateBitIndex, iBuilder->getSize(indexBitsCount)); // TODO replace with and
    204             Value* arrayIndex = iBuilder->CreateLShr(
    205                     bitIndex,
    206                     iBuilder->getSize(std::log2(64)));
    207             Value* indexIndex = iBuilder->CreateAnd(bitIndex, iBuilder->getSize(63));
    208 
    209             Value* inputStreamPtr = iBuilder->getRawInputPointer(streamName, iBuilder->getSize(0));
    210             inputStreamPtr = iBuilder->CreatePointerCast(inputStreamPtr, iBuilder->getInt64Ty()->getPointerTo());
    211 
    212             Value* targetInputValue = iBuilder->CreateLoad(iBuilder->CreateGEP(inputStreamPtr, bitIndex));
    213 
    214             // handle bit 0 index
    215             Value* index0OldValue = iBuilder->CreateExtractElement(
    216                     iBuilder->getScalarField(this->generateInputZeroIndexName(streamName)),
    217                     arrayIndex
    218             );
    219 
    220             Value* newBit0Value = iBuilder->CreateNot(
    221                     iBuilder->CreateICmpEQ(
    222                             targetInputValue,
    223                             iBuilder->CreateNot(
    224                                     iBuilder->getInt64(0x0)
    225                             )
    226                     )
    227             );
    228 
    229 
    230 
    231             newBit0Value = iBuilder->CreateZExt(newBit0Value, iBuilder->getInt64Ty());
    232 
    233             Value* index0NewValue = index0OldValue;
    234             index0NewValue = iBuilder->CreateAnd(
    235                     index0NewValue,
    236                     iBuilder->CreateNot(
    237                             iBuilder->CreateShl(
    238                                     iBuilder->getInt64(1),
    239                                     indexIndex
    240                             )
    241                     )
    242             );
    243             index0NewValue = iBuilder->CreateOr(
    244                     index0NewValue,
    245                     iBuilder->CreateShl(
    246                             newBit0Value,
    247                             indexIndex
    248                     )
    249             );
    250             iBuilder->setScalarField(
    251                     this->generateInputZeroIndexName(streamName),
    252                     iBuilder->CreateInsertElement(
    253                             iBuilder->getScalarField(this->generateInputZeroIndexName(streamName)),
    254                             index0NewValue,
    255                             arrayIndex
    256                     )
    257             );
    258 
    259 
    260             // handle bit 1 index
    261 
    262             Value* index1OldValue = iBuilder->CreateExtractElement(
    263                     iBuilder->getScalarField(this->generateInputOneIndexName(streamName)),
    264                     arrayIndex
    265             );
    266 
    267             Value* newBit1Value = iBuilder->CreateNot(iBuilder->CreateICmpEQ(targetInputValue, iBuilder->getInt64(0)));
    268             newBit1Value = iBuilder->CreateZExt(newBit1Value, iBuilder->getInt64Ty());
    269 
    270             Value* index1NewValue = index1OldValue;
    271             index1NewValue = iBuilder->CreateAnd(
    272                     index1NewValue,
    273                     iBuilder->CreateNot(
    274                             iBuilder->CreateShl(
    275                                     iBuilder->getInt64(1),
    276                                     indexIndex
    277                             )
    278                     )
    279             );
    280             index1NewValue = iBuilder->CreateOr(
    281                     index1NewValue,
    282                     iBuilder->CreateShl(
    283                             newBit1Value,
    284                             indexIndex
    285                     )
    286             );
    287 
    288             iBuilder->setScalarField(
    289                     this->generateInputOneIndexName(streamName),
    290                     iBuilder->CreateInsertElement(
    291                             iBuilder->getScalarField(this->generateInputOneIndexName(streamName)),
    292                             index1NewValue,
    293                             arrayIndex
    294                     )
    295             );
    296 
    297             iBuilder->CreateBr(updateLoopFinal);
    298 
    299 
    300             // Update Loop Final
    301             iBuilder->SetInsertPoint(updateLoopFinal);
    302             currentUpdateBitIndex->addIncoming(iBuilder->CreateAdd(currentUpdateBitIndex, iBuilder->getSize(1)), updateLoopFinal);
    303             iBuilder->CreateBr(updateLoopCon);
    304 
    305             //Update Loop Exit
    306             iBuilder->SetInsertPoint(updateLoopExit);
    307 
    308         }
    309 
    310         iBuilder->CreateBr(exitBlock);
    311         iBuilder->SetInsertPoint(exitBlock);
    312 
    31386    }
    31487
    31588    void SequentialKernel::generateDoSequentialSegmentMethod(const std::unique_ptr<KernelBuilder> &iBuilder) {
    31689        // Will be override by subclass
    317     }
    318 
    319     // Initialize
    320 
    321     // Index
    322     void SequentialKernel::configIndexBits(const std::unique_ptr<KernelBuilder> &iBuilder, const std::map<std::string, size_t>& inputIndexMap) {
    323         for (auto iter = inputIndexMap.begin(); iter != inputIndexMap.end(); iter++ ) {
    324             string inputBufferName = iter->first;
    325             size_t indexBitNum = iter->second; // blockSize = size / iBuilder->getStride()
    326             size_t indexArraySize = ((indexBitNum * iBuilder->getStride() / 64 ) + 63) / 64;
    327             inputStreamIndexMap.insert(make_pair(inputBufferName, indexArraySize));
    328 
    329             this->addScalar(VectorType::get(iBuilder->getInt64Ty(), indexArraySize), generateInputZeroIndexName(inputBufferName));
    330             this->addScalar(VectorType::get(iBuilder->getInt64Ty(), indexArraySize), generateInputOneIndexName(inputBufferName));
    331             this->addScalar(iBuilder->getSizeTy(), generateInputPreviousAvailableName(inputBufferName));
    332         }
    333 
    334     }
    335     inline string SequentialKernel::generateInputZeroIndexName(string inputStreamName) {
    336         return "index_" + inputStreamName + "_zero_index";
    337     }
    338     inline string SequentialKernel::generateInputOneIndexName(string inputStreamName) {
    339         return "index_" + inputStreamName + "_one_index";
    340     }
    341 
    342     inline string SequentialKernel::generateInputPreviousAvailableName(std::string inputStreamName) {
    343         return "index_" + inputStreamName + "_previous_item_available";
    34490    }
    34591
     
    34894        return "Cursor_" + cursorName;
    34995    }
    350     void SequentialKernel::initBufferCursor(const std::unique_ptr<KernelBuilder> &iBuilder, std::vector<std::string> cursorNames) {
     96
     97    void SequentialKernel::initBufferCursor(const std::unique_ptr<KernelBuilder> &iBuilder,
     98                                            std::vector<std::string> cursorNames) {
    35199        for (std::string name : cursorNames) {
    352100            addScalar(iBuilder->getSizeTy(), this->generateCursorFullname(name));
     
    354102    }
    355103
    356     Value* SequentialKernel::getCursorValue(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName) {
     104    Value *SequentialKernel::getCursorValue(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName) {
    357105        return iBuilder->getScalarField(this->generateCursorFullname(cursorName));
    358106    }
    359107
    360     void SequentialKernel::setCursorValue(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, Value* value) {
     108    void SequentialKernel::setCursorValue(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName,
     109                                          Value *value) {
    361110        iBuilder->setScalarField(this->generateCursorFullname(cursorName), value);
    362111    }
    363112
    364     void SequentialKernel::advanceCursor(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, llvm::Value* nums) {
     113    void SequentialKernel::advanceCursor(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName,
     114                                         llvm::Value *nums) {
    365115        std::string fullname = this->generateCursorFullname(cursorName);
    366         Value* cursorValue = iBuilder->getScalarField(fullname);
     116        Value *cursorValue = iBuilder->getScalarField(fullname);
    367117        cursorValue = iBuilder->CreateAdd(cursorValue, nums);
    368118        iBuilder->setScalarField(fullname, cursorValue);
    369119    }
    370120
    371     void SequentialKernel::advanceCursorUntilPos(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, llvm::Value* position) {
     121    void SequentialKernel::advanceCursorUntilPos(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName,
     122                                                 llvm::Value *position) {
    372123        std::string fullname = this->generateCursorFullname(cursorName);
    373         Value* cursorValue = iBuilder->getScalarField(fullname);
    374         iBuilder->CreateAssert(iBuilder->CreateICmpSLE(cursorValue, position), cursorName + " Cursor can only move forward");
     124        Value *cursorValue = iBuilder->getScalarField(fullname);
     125        iBuilder->CreateAssert(iBuilder->CreateICmpSLE(cursorValue, position),
     126                               cursorName + " Cursor can only move forward");
    375127        iBuilder->setScalarField(fullname, position);
    376128    }
     
    378130
    379131    // forwardBits, packEnd, exceedAvailable
    380     std::pair<llvm::Value*, std::pair<llvm::Value*, llvm::Value*>> SequentialKernel::genereateCountForwardBitsOnePack(
     132    std::pair<llvm::Value *, std::pair<llvm::Value *, llvm::Value *>>
     133    SequentialKernel::genereateCountForwardBitsOnePack(
    381134            const std::unique_ptr<KernelBuilder> &iBuilder,
    382135            std::string inputStreamBufferName,
    383             llvm::Value* cursorValue,
     136            llvm::Value *cursorValue,
    384137            bool isZero
    385     ){
     138    ) {
    386139        size_t bufferSize = this->getInputBufferSize(iBuilder, inputStreamBufferName);
    387         Value* bufferOffsetMask = iBuilder->getSize(bufferSize - 1);
    388 
    389         Value* actualBufferOffset = iBuilder->CreateAnd(bufferOffsetMask, cursorValue);
    390 
    391         Value* packIndex = iBuilder->CreateLShr(actualBufferOffset, iBuilder->getSize(std::log2(64)));
    392 
    393         Value* countStartBitIndex = iBuilder->CreateAnd(actualBufferOffset, iBuilder->getSize(64 - 1));
    394 
    395         Value* inputStreamPtr = iBuilder->getInputStreamBlockPtr(inputStreamBufferName, iBuilder->getInt32(0));
     140        Value *bufferOffsetMask = iBuilder->getSize(bufferSize - 1);
     141
     142        Value *actualBufferOffset = iBuilder->CreateAnd(bufferOffsetMask, cursorValue);
     143
     144        Value *packIndex = iBuilder->CreateLShr(actualBufferOffset, iBuilder->getSize(std::log2(64)));
     145
     146        Value *countStartBitIndex = iBuilder->CreateAnd(actualBufferOffset, iBuilder->getSize(64 - 1));
     147
     148        Value *inputStreamPtr = iBuilder->getInputStreamBlockPtr(inputStreamBufferName, iBuilder->getInt32(0));
    396149        inputStreamPtr = iBuilder->CreatePointerCast(inputStreamPtr, iBuilder->getInt64Ty()->getPointerTo());
    397         Value* packData = iBuilder->CreateLoad(iBuilder->CreateGEP(inputStreamPtr, packIndex));
    398 
     150        Value *packData = iBuilder->CreateLoad(iBuilder->CreateGEP(inputStreamPtr, packIndex));
    399151
    400152
     
    404156            packData = iBuilder->CreateNot(packData);
    405157        }
    406         Value* forwardZeroCount = iBuilder->CreateCountForwardZeroes(packData);
    407 
    408 
    409 
    410         Value* isEndOfPack = iBuilder->CreateICmpUGE(iBuilder->CreateAdd(countStartBitIndex, forwardZeroCount), iBuilder->getSize(64));
     158        Value *forwardZeroCount = iBuilder->CreateCountForwardZeroes(packData);
     159
     160
     161        Value *isEndOfPack = iBuilder->CreateICmpUGE(iBuilder->CreateAdd(countStartBitIndex, forwardZeroCount),
     162                                                    iBuilder->getSize(64));
    411163        forwardZeroCount = iBuilder->CreateSelect(
    412164                isEndOfPack,
     
    415167        );
    416168
    417         Value* newCursorValue = iBuilder->CreateAdd(cursorValue, forwardZeroCount);
    418         Value* itemTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(inputStreamBufferName), iBuilder->getProcessedItemCount(inputStreamBufferName));
    419 
    420         Value* isExceedAvailable = iBuilder->CreateICmpUGE(newCursorValue, itemTotal);
     169        Value *newCursorValue = iBuilder->CreateAdd(cursorValue, forwardZeroCount);
     170        Value *itemTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(inputStreamBufferName),
     171                                               iBuilder->getProcessedItemCount(inputStreamBufferName));
     172
     173        Value *isExceedAvailable = iBuilder->CreateICmpUGE(newCursorValue, itemTotal);
    421174
    422175        newCursorValue = iBuilder->CreateSelect(isExceedAvailable, itemTotal, newCursorValue);
     
    424177//        Value* isNotFinished = iBuilder->CreateOr(isEndOfPack, isExceedAvailable);
    425178//        Value* isFinished = iBuilder->CreateNot(isNotFinished);
    426         return std::make_pair(iBuilder->CreateSub(newCursorValue, cursorValue), make_pair(isEndOfPack, isExceedAvailable));
     179        return std::make_pair(iBuilder->CreateSub(newCursorValue, cursorValue),
     180                              make_pair(isEndOfPack, isExceedAvailable));
    427181    };
    428182
    429183    // pair<forwardZeros, isFinished>
    430     std::pair<llvm::Value*, llvm::Value*> SequentialKernel::generateCountForwardBits(
     184    std::pair<llvm::Value *, llvm::Value *> SequentialKernel::generateCountForwardBits(
    431185            const std::unique_ptr<KernelBuilder> &iBuilder,
    432186            std::string inputStreamBufferName,
    433             llvm::Value* cursorValue,
     187            llvm::Value *cursorValue,
    434188            bool isZero,
    435             llvm::Value* maxPos
     189            llvm::Value *maxPos
    436190    ) {
    437         BasicBlock* entryBlock = iBuilder->CreateBasicBlock("count_forward_bit_entry");
     191        BasicBlock *entryBlock = iBuilder->CreateBasicBlock("count_forward_bit_entry");
    438192        iBuilder->CreateBr(entryBlock);
    439193        iBuilder->SetInsertPoint(entryBlock);
    440194
    441         BasicBlock* exitBlock = iBuilder->CreateBasicBlock("count_forward_bit_exit");
     195        BasicBlock *exitBlock = iBuilder->CreateBasicBlock("count_forward_bit_exit");
    442196
    443197
    444198        auto onePackResult = genereateCountForwardBitsOnePack(iBuilder, inputStreamBufferName, cursorValue, isZero);
    445199
    446         Value* forwardCount = onePackResult.first;
    447         Value* isEndOfPack = onePackResult.second.first;
    448         Value* isExceedAvailable = onePackResult.second.second;
    449         Value* newCursorValue = iBuilder->CreateAdd(cursorValue, forwardCount);
    450 
    451         if (!hasIndexBits(inputStreamBufferName)) {
    452             Value* isNotFinished = iBuilder->CreateOr(isEndOfPack, isExceedAvailable);
    453             Value* isFinished = iBuilder->CreateNot(isNotFinished);
    454 
    455             if (maxPos) {
    456                 Value* reachMaxPos = iBuilder->CreateICmpUGE(newCursorValue, maxPos);
    457                 isFinished = iBuilder->CreateSelect(
    458                         reachMaxPos,
    459                         iBuilder->getInt1(true),
    460                         isFinished
    461                 );
    462                 newCursorValue = iBuilder->CreateSelect(
    463                         reachMaxPos,
    464                         maxPos,
    465                         newCursorValue
    466                 );
    467 
    468             }
    469 
    470             iBuilder->CreateBr(exitBlock);
    471             iBuilder->SetInsertPoint(exitBlock);
    472 
    473             return std::make_pair(iBuilder->CreateSub(newCursorValue, cursorValue), isFinished);
    474         } else {
    475             BasicBlock* countIndexBitConBlock = iBuilder->CreateBasicBlock("count_forward_bit_count_index_con");
    476             BasicBlock* countIndexBitBodyBlock = iBuilder->CreateBasicBlock("count_forward_bit_count_index_body");
    477             BasicBlock* countFinalPackBlock = iBuilder->CreateBasicBlock("count_forward_bit_count_final_pack");
    478 
    479             BasicBlock* beforeExitBlock = iBuilder->CreateBasicBlock("count_forward_bit_before_exit");
    480             iBuilder->CreateBr(countIndexBitConBlock);
    481 
    482             // beforeExitBlock
    483             iBuilder->SetInsertPoint(beforeExitBlock);
    484             PHINode* finalNewCursorValue = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
    485             PHINode* isFinish = iBuilder->CreatePHI(iBuilder->getInt1Ty(), 3);
    486 
    487 
    488             Value* retCursorValue = finalNewCursorValue;
    489             Value* retIsFinish = isFinish;
    490             if (maxPos) {
    491                 Value* exceedMaxPos = iBuilder->CreateICmpUGE(retCursorValue, maxPos);
    492                 retCursorValue = iBuilder->CreateSelect(exceedMaxPos, maxPos, retCursorValue);
    493                 retIsFinish = iBuilder->CreateSelect(exceedMaxPos, iBuilder->getInt1(true), retIsFinish);
    494             }
    495 
    496             iBuilder->CreateBr(exitBlock);
    497 
    498 
    499             // countIndexBitConBlock
    500             iBuilder->SetInsertPoint(countIndexBitConBlock);
    501 
    502             // isEndOfPack && !isExceedAvailable
    503             Value* shouldCountIndexBit = isEndOfPack;
    504 
    505             if (maxPos) {
    506                 Value* reachMaxPos = iBuilder->CreateICmpUGE(newCursorValue, maxPos);
    507                 shouldCountIndexBit = iBuilder->CreateSelect(reachMaxPos, iBuilder->getInt1(false), shouldCountIndexBit);
    508             }
    509 
    510             finalNewCursorValue->addIncoming(newCursorValue, countIndexBitConBlock);
    511             isFinish->addIncoming(iBuilder->CreateNot(shouldCountIndexBit), countIndexBitConBlock);
    512 
    513             iBuilder->CreateCondBr(shouldCountIndexBit, countIndexBitBodyBlock, beforeExitBlock);
    514 
    515             // countIndexBitBodyBlock
    516             iBuilder->SetInsertPoint(countIndexBitBodyBlock);
    517             Value* countBeginBitIndex = iBuilder->CreateLShr(newCursorValue, iBuilder->getSize(std::log2(64)));
    518 
    519 
    520             Value* indexCount = this->generateCountIndexBit(iBuilder, inputStreamBufferName, !isZero, countBeginBitIndex);
    521 
    522             newCursorValue = iBuilder->CreateAdd(
    523                     newCursorValue,
    524                     iBuilder->CreateShl(
    525                             indexCount,
    526                             std::log2(64)
    527                     )
     200        Value *forwardCount = onePackResult.first;
     201        Value *isEndOfPack = onePackResult.second.first;
     202        Value *isExceedAvailable = onePackResult.second.second;
     203        Value *newCursorValue = iBuilder->CreateAdd(cursorValue, forwardCount);
     204
     205        Value *isNotFinished = iBuilder->CreateOr(isEndOfPack, isExceedAvailable);
     206        Value *isFinished = iBuilder->CreateNot(isNotFinished);
     207
     208        if (maxPos) {
     209            Value *reachMaxPos = iBuilder->CreateICmpUGE(newCursorValue, maxPos);
     210            isFinished = iBuilder->CreateSelect(
     211                    reachMaxPos,
     212                    iBuilder->getInt1(true),
     213                    isFinished
    528214            );
    529 
    530             Value* itemsTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(inputStreamBufferName), iBuilder->getProcessedItemCount(inputStreamBufferName));
    531             isExceedAvailable = iBuilder->CreateICmpUGE(newCursorValue, itemsTotal);
    532             newCursorValue =  iBuilder->CreateSelect(
    533                     isExceedAvailable,
    534                     itemsTotal,
     215            newCursorValue = iBuilder->CreateSelect(
     216                    reachMaxPos,
     217                    maxPos,
    535218                    newCursorValue
    536219            );
    537             BasicBlock* countIndexBitBodyExitBlock = iBuilder->GetInsertBlock();
    538 
    539             finalNewCursorValue->addIncoming(newCursorValue, countIndexBitBodyExitBlock);
    540             isFinish->addIncoming(iBuilder->CreateNot(isExceedAvailable), countIndexBitBodyExitBlock);
    541 
    542             iBuilder->CreateCondBr(
    543                     isExceedAvailable,
    544                     beforeExitBlock,
    545                     countFinalPackBlock
    546             );
    547 
    548             // CountFinalPackBlock
    549             iBuilder->SetInsertPoint(countFinalPackBlock);
    550             auto onePackResult = genereateCountForwardBitsOnePack(iBuilder, inputStreamBufferName, newCursorValue, isZero);
    551 
    552             forwardCount = onePackResult.first;
    553             //isEndOfPack = onePackResult.second.first;  // should always be false
    554             //isExceedAvailable = onePackResult.second.second; // should always be false
    555             Value* finalCursorValue = iBuilder->CreateAdd(newCursorValue, forwardCount);
    556 
    557             finalNewCursorValue->addIncoming(finalCursorValue, countFinalPackBlock);
    558             isFinish->addIncoming(iBuilder->getInt1(true), countFinalPackBlock);
    559 
    560             iBuilder->CreateBr(beforeExitBlock);
    561 
    562             // exit block
    563             iBuilder->SetInsertPoint(exitBlock);
    564             return std::make_pair(iBuilder->CreateSub(retCursorValue, cursorValue), retIsFinish);
    565         }
     220
     221        }
     222
     223        iBuilder->CreateBr(exitBlock);
     224        iBuilder->SetInsertPoint(exitBlock);
     225
     226        return std::make_pair(iBuilder->CreateSub(newCursorValue, cursorValue), isFinished);
    566227
    567228    };
    568229
    569     Value* SequentialKernel::generateCountIndexBit(const std::unique_ptr<KernelBuilder> &iBuilder, std::string streamName, bool isZero, llvm::Value* beginBitIndex) {
    570         string indexBitScalarName = isZero? this->generateInputZeroIndexName(streamName) : this->generateInputOneIndexName(streamName);
    571         BasicBlock* countIndexBitEntryBlock = iBuilder->CreateBasicBlock("count_index_bit_entry_block");
    572 
    573         BasicBlock* countIndexBitConBlock = iBuilder->CreateBasicBlock("count_index_bit_con_block");
    574         BasicBlock* countIndexBitBodyBlock = iBuilder->CreateBasicBlock("count_index_bit_body_block");
    575 //        BasicBlock* countIndexBitFinalBlock = iBuilder->CreateBasicBlock("count_index_bit_final_block");
    576         BasicBlock* countIndexBitExitBlock = iBuilder->CreateBasicBlock("count_index_bit_exit_block");
    577 
    578 
    579         iBuilder->CreateBr(countIndexBitEntryBlock);
    580 
    581         // CountIndexBitEntry
    582         iBuilder->SetInsertPoint(countIndexBitEntryBlock);
    583         auto info = this->inputStreamIndexMap.find(streamName);
    584         //TODO
    585 //        assert(( "index bit of " + streamName + " not exists") && (info != this->inputStreamIndexMap.end()));
    586 
    587         Value* itemsTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(streamName), iBuilder->getProcessedItemCount(streamName));
    588         Value* maxIndexBitCount = iBuilder->CreateLShr(
    589                 iBuilder->CreateAdd(itemsTotal, iBuilder->getSize(63)),
    590                 std::log2(64)
    591         );
    592 
    593         iBuilder->CreateBr(countIndexBitConBlock);
    594         //Con Block
    595         iBuilder->SetInsertPoint(countIndexBitConBlock);
    596         PHINode* currentBitIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
    597         currentBitIndex->addIncoming(beginBitIndex, countIndexBitEntryBlock);
    598         iBuilder->CreateCondBr(
    599                 iBuilder->CreateICmpULT(
    600                         currentBitIndex,
    601                         maxIndexBitCount
    602                 ),
    603                 countIndexBitBodyBlock,
    604                 countIndexBitExitBlock
    605         );
    606 
    607         // Body Block
    608         iBuilder->SetInsertPoint(countIndexBitBodyBlock);
    609 
    610 
    611         Value* countArrayIndex = iBuilder->CreateLShr(
    612                 iBuilder->CreateAnd(
    613                         currentBitIndex,
    614                         iBuilder->getSize(this->getInputBufferSize(iBuilder, streamName) / 64 - 1)
    615                 ),
    616                 iBuilder->getSize(std::log2(64))
    617         );
    618         Value* countStartBitIndex = iBuilder->CreateAnd(currentBitIndex, iBuilder->getSize(63));
    619 
    620         Value* packData = iBuilder->CreateExtractElement(
    621                 iBuilder->getScalarField(indexBitScalarName),
    622                 countArrayIndex
    623         );
    624 
    625         packData = iBuilder->CreateSelect(
    626                 iBuilder->CreateICmpEQ(countStartBitIndex, iBuilder->getSize(0)),
    627                 packData,
    628                 iBuilder->CreateLShr(packData, countStartBitIndex)
    629         );
    630 
    631         Value* forwardZeroCount = iBuilder->CreateCountForwardZeroes(packData);
    632 
    633         Value* isEndOfPack = iBuilder->CreateICmpUGE(iBuilder->CreateAdd(countStartBitIndex, forwardZeroCount), iBuilder->getSize(64));
    634         forwardZeroCount = iBuilder->CreateSelect(
    635                 isEndOfPack,
    636                 iBuilder->CreateSub(iBuilder->getSize(64), countStartBitIndex),
    637                 forwardZeroCount
    638         );
    639 
    640 
    641         Value* newBitIndex = iBuilder->CreateAdd(currentBitIndex, forwardZeroCount);
    642         currentBitIndex->addIncoming(newBitIndex, countIndexBitBodyBlock);
    643 
    644         iBuilder->CreateCondBr(
    645                 isEndOfPack,
    646                 countIndexBitConBlock,
    647                 countIndexBitExitBlock
    648         );
    649 
    650 
    651         //Exit Block
    652         iBuilder->SetInsertPoint(countIndexBitExitBlock);
    653         PHINode* finalBitIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
    654         finalBitIndex->addIncoming(currentBitIndex, countIndexBitConBlock);
    655         finalBitIndex->addIncoming(newBitIndex, countIndexBitBodyBlock);
    656 
    657         return iBuilder->CreateSub(finalBitIndex, beginBitIndex);
    658     }
    659 
    660     std::pair<llvm::Value*, llvm::Value*> SequentialKernel::generateCountForwardOnes(const unique_ptr<KernelBuilder> &iBuilder, string inputStreamBufferName, Value* beginOffset, Value* maxPos) {
     230    std::pair<llvm::Value *, llvm::Value *>
     231    SequentialKernel::generateCountForwardOnes(const unique_ptr<KernelBuilder> &iBuilder, string inputStreamBufferName,
     232                                               Value *beginOffset, Value *maxPos) {
    661233        return this->generateCountForwardBits(iBuilder, inputStreamBufferName, beginOffset, false, maxPos);
    662234    };
    663235
    664     std::pair<llvm::Value*, llvm::Value*> SequentialKernel::generateCountForwardZeros(const unique_ptr<KernelBuilder> &iBuilder, string inputStreamBufferName, Value* beginOffset, Value* maxPos) {
     236    std::pair<llvm::Value *, llvm::Value *>
     237    SequentialKernel::generateCountForwardZeros(const unique_ptr<KernelBuilder> &iBuilder, string inputStreamBufferName,
     238                                                Value *beginOffset, Value *maxPos) {
    665239        return this->generateCountForwardBits(iBuilder, inputStreamBufferName, beginOffset, true, maxPos);
    666240    }
    667241
    668242
    669     BasicBlock* SequentialKernel::advanceCursorUntilNextOne(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, std::string inputStreamBufferName, Value* maxPos) {
    670         BasicBlock* entryBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_one_entry");
     243    BasicBlock *
     244    SequentialKernel::advanceCursorUntilNextOne(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName,
     245                                                std::string inputStreamBufferName, Value *maxPos) {
     246        BasicBlock *entryBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_one_entry");
    671247
    672248        this->recordCountForwardTempMaxPos(iBuilder, maxPos);
     
    679255        this->waitCursorUntilInputAvailable(iBuilder, cursorName, inputStreamBufferName);
    680256
    681         BasicBlock* countForwareZeroBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_one_count_block");
    682         BasicBlock* exitBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_one_exit_block");
     257        BasicBlock *countForwareZeroBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_one_count_block");
     258        BasicBlock *exitBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_one_exit_block");
    683259
    684260        iBuilder->CreateBr(countForwareZeroBlock);
    685261        iBuilder->SetInsertPoint(countForwareZeroBlock);
    686262
    687         Value* cursorValue = this->getCursorValue(iBuilder, cursorName);
     263        Value *cursorValue = this->getCursorValue(iBuilder, cursorName);
    688264
    689265        maxPos = this->restoreCountForwardTempMaxPos(iBuilder, maxPos);
     
    692268
    693269        cursorValue = iBuilder->CreateAdd(cursorValue, retValue.first);
    694         Value* isFinished = retValue.second;
     270        Value *isFinished = retValue.second;
    695271
    696272
     
    706282
    707283
    708     BasicBlock* SequentialKernel::advanceCursorUntilNextZero(
    709             const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, std::string inputStreamBufferName, Value* maxPos) {
    710         BasicBlock* entryBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_zero_entry");
     284    BasicBlock *SequentialKernel::advanceCursorUntilNextZero(
     285            const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, std::string inputStreamBufferName,
     286            Value *maxPos) {
     287        BasicBlock *entryBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_zero_entry");
    711288
    712289        this->recordCountForwardTempMaxPos(iBuilder, maxPos);
     
    717294        this->waitCursorUntilInputAvailable(iBuilder, cursorName, inputStreamBufferName);
    718295
    719         BasicBlock* countForwareOneBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_zero_count_block");
    720         BasicBlock* exitBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_zero_exit_block");
     296        BasicBlock *countForwareOneBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_zero_count_block");
     297        BasicBlock *exitBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_zero_exit_block");
    721298
    722299        iBuilder->CreateBr(countForwareOneBlock);
    723300        iBuilder->SetInsertPoint(countForwareOneBlock);
    724301
    725         Value* cursorValue = this->getCursorValue(iBuilder, cursorName);
     302        Value *cursorValue = this->getCursorValue(iBuilder, cursorName);
    726303
    727304        maxPos = this->restoreCountForwardTempMaxPos(iBuilder, maxPos);
     
    731308        this->advanceCursor(iBuilder, cursorName, retValue.first);
    732309
    733         Value* isFinished = retValue.second;
     310        Value *isFinished = retValue.second;
    734311
    735312        iBuilder->CreateCondBr(isFinished, exitBlock, entryBlock);
     
    739316    }
    740317
    741     void SequentialKernel::memcpyCircularBuffer(
    742             const std::unique_ptr<KernelBuilder> &iBuilder,
    743             string sourceBufferName,
    744             llvm::Value* sourceOffset,
    745             string dstBufferName,
    746             llvm::Value* outputOffset,
    747             llvm::Value* distance
    748     ) {
    749 
    750         size_t outputBufferSize = this->getOutputBufferSize(iBuilder, dstBufferName);
    751         Value* outputBufferSizeValue = iBuilder->getSize(outputBufferSize);
    752         Value* outputBufferSizeMask = iBuilder->getSize(outputBufferSize - 1);
    753         Value* maskedOutputOffset = iBuilder->CreateAnd(outputOffset, outputBufferSizeMask);
    754         Value* remainBuffer = iBuilder->CreateSub(outputBufferSizeValue, maskedOutputOffset);
    755         Value* copyLength1 = iBuilder->CreateSelect(iBuilder->CreateICmpUGE(remainBuffer, distance), distance, remainBuffer);
    756         Value* copyLength2 = iBuilder->CreateSub(distance, copyLength1);
    757 
    758 
    759         Value* inputBufferBasePtr = iBuilder->getRawInputPointer(sourceBufferName, iBuilder->getSize(0));
    760         Value* outputBufferBasePtr = iBuilder->getRawOutputPointer(dstBufferName, iBuilder->getSize(0));
    761 
    762         iBuilder->CreateMemCpy(
    763                 iBuilder->CreateGEP(outputBufferBasePtr, maskedOutputOffset),
    764                 iBuilder->CreateGEP(inputBufferBasePtr, sourceOffset),
    765                 copyLength1,
    766                 1); // no alignment guaranteed
    767         // Assumed output buffer is Circular buffer
    768         iBuilder->CreateMemCpy(
    769                 outputBufferBasePtr,
    770                 iBuilder->CreateGEP(inputBufferBasePtr, iBuilder->CreateAdd(sourceOffset, copyLength1)),
    771                 copyLength2,
    772                 8
    773         );
    774         iBuilder->setProducedItemCount(dstBufferName, iBuilder->CreateAdd(outputOffset, distance));
    775     }
    776 
    777     BasicBlock* SequentialKernel::memcpy2CursorsUntilNextZero(
    778             const std::unique_ptr<KernelBuilder> &iBuilder,
    779             string sourceBufferName,
    780             string sourceCursorName,
    781             string dstBufferName,
    782             string dstCursorName,
    783             string sourceMarkerName,
    784             Value* maxPos
    785     ) {
    786         BasicBlock* previousEntryBlock = iBuilder->GetInsertBlock();
    787 
    788         BasicBlock* entryBlock = iBuilder->CreateBasicBlock("memcpy_2_cursors_until_next_zero_entry");
    789         this->recordCountForwardTempMaxPos(iBuilder, maxPos);
    790 
    791         iBuilder->CreateBr(entryBlock);
    792         iBuilder->SetInsertPoint(entryBlock);
    793 
    794         this->waitCursorUntilInputAvailable(iBuilder, sourceCursorName, sourceMarkerName);
    795 
    796         BasicBlock* bodyBlock = iBuilder->CreateBasicBlock("memcpy_2_cursors_until_next_zero_body");
    797         BasicBlock* exitBlock = iBuilder->CreateBasicBlock("memcpy_2_cursors_until_next_zero_exit");
    798 
    799         iBuilder->CreateBr(bodyBlock);
    800         iBuilder->SetInsertPoint(bodyBlock);
    801 
    802         // Count Forward Zero in this pack
    803         Value* sourceCursorValue = this->getCursorValue(iBuilder, sourceCursorName);
    804 
    805         maxPos = this->restoreCountForwardTempMaxPos(iBuilder, maxPos);
    806         auto retValue = this->generateCountForwardOnes(iBuilder, sourceMarkerName, sourceCursorValue, maxPos);
    807         Value* distance = retValue.first;
    808 
    809         // Memcpy from sourceBuffer[sourceCursor : sourceCursor + distance] to dstBuffer[dstCursor : dstCursor + distance]
    810         Value* inputBufferBasePtr = iBuilder->getRawInputPointer(sourceBufferName, iBuilder->getSize(0));
    811         Value* outputBufferBasePtr = iBuilder->getRawOutputPointer(dstBufferName, iBuilder->getSize(0));
    812 
    813         Value* outputOffset = this->getCursorValue(iBuilder, dstCursorName);
    814         size_t outputBufferSize = this->getOutputBufferSize(iBuilder, dstBufferName);
    815         Value* outputBufferSizeValue = iBuilder->getSize(outputBufferSize);
    816         Value* outputBufferSizeMask = iBuilder->getSize(outputBufferSize - 1);
    817         Value* maskedOutputOffset = iBuilder->CreateAnd(outputOffset, outputBufferSizeMask);
    818         Value* remainBuffer = iBuilder->CreateSub(outputBufferSizeValue, maskedOutputOffset);
    819         Value* copyLength1 = iBuilder->CreateSelect(iBuilder->CreateICmpUGE(remainBuffer, distance), distance, remainBuffer);
    820         Value* copyLength2 = iBuilder->CreateSub(distance, copyLength1);
    821 
    822         iBuilder->CreateMemCpy(
    823                 iBuilder->CreateGEP(outputBufferBasePtr, maskedOutputOffset),
    824                 iBuilder->CreateGEP(inputBufferBasePtr, sourceCursorValue),
    825                 copyLength1,
    826                 1); // no alignment guaranteed
    827         // Assumed output buffer is Circular buffer
    828         iBuilder->CreateMemCpy(
    829                 outputBufferBasePtr,
    830                 iBuilder->CreateGEP(inputBufferBasePtr, iBuilder->CreateAdd(sourceCursorValue, copyLength1)),
    831                 copyLength2,
    832                 8
    833         );
    834 
    835         // Update cursor value and producedItemCount
    836         this->advanceCursor(iBuilder, sourceCursorName, distance);
    837         this->advanceCursor(iBuilder, dstCursorName, distance);
    838         iBuilder->setProducedItemCount(dstBufferName, this->getCursorValue(iBuilder, dstCursorName));
    839 
    840         // Finish
    841         Value* isFinished = retValue.second;
    842         iBuilder->CreateCondBr(isFinished, exitBlock, entryBlock);
    843         //TODO should not use index bits for count forward zeros in this case
    844         iBuilder->SetInsertPoint(exitBlock);
    845         return exitBlock;
    846     }
    847 
    848     BasicBlock* SequentialKernel::memcpyOutputDst(
    849             const unique_ptr<KernelBuilder> &iBuilder,
    850             string outputBufferName,
    851             Value* copyOffset,
    852             Value* copyLength
    853 
    854     ) {
    855         Value* distance = copyLength;
    856 
    857         BasicBlock* matchCopyEntryBlock = iBuilder->CreateBasicBlock("memcpy_output_dst_cursor_until_next_zero_matchcpy_entry");
    858         BasicBlock* matchCopyExitBlock = iBuilder->CreateBasicBlock("memcpy_output_dst_cursor_until_next_zero_matchcpy_exit");
    859 
    860         Value* outputOffset = iBuilder->getProducedItemCount(outputBufferName);
    861 
    862         iBuilder->CreateBr(matchCopyEntryBlock);
    863 
    864         iBuilder->SetInsertPoint(matchCopyEntryBlock);
    865         this->generateDstMatchCopy(iBuilder, matchCopyEntryBlock, matchCopyExitBlock, outputBufferName, copyOffset, distance, outputOffset);
    866 
    867         iBuilder->SetInsertPoint(matchCopyExitBlock);
    868         // Update Cursor Value and producedItemCount
    869         iBuilder->setProducedItemCount(outputBufferName, iBuilder->CreateAdd(outputOffset, copyLength));
    870 
    871         return matchCopyExitBlock;
    872     }
    873 
    874     llvm::BasicBlock* SequentialKernel::memcpyOutputDstCursorUntilNextZero(
    875             const std::unique_ptr<KernelBuilder> &iBuilder,
    876             std::string outputBufferName,
    877             llvm::Value* copyOffset,
    878             std::string dstCursorName,
    879             std::string dstMarkerName,
    880             llvm::Value* maxPos
    881     ) {
    882         iBuilder->setScalarField(MemCpyUntilZeroCopyOffsetTempKey, copyOffset);
    883         this->recordCountForwardTempMaxPos(iBuilder, maxPos);
    884 
    885         BasicBlock* entryBlock = iBuilder->CreateBasicBlock("memcpy_ooutput_dst_cursor_until_next_zero_entry");
    886         iBuilder->CreateBr(entryBlock);
    887         iBuilder->SetInsertPoint(entryBlock);
    888 
    889         this->waitCursorUntilInputAvailable(iBuilder, dstCursorName, dstMarkerName);
    890 
    891         BasicBlock* bodyBlock = iBuilder->CreateBasicBlock("memcpy_output_dst_cursor_until_next_zero_body");
    892         BasicBlock* exitBlock = iBuilder->CreateBasicBlock("memcpy_output_dst_cursor_until_next_zero_exit");
    893 
    894         iBuilder->CreateBr(bodyBlock);
    895         iBuilder->SetInsertPoint(bodyBlock);
    896 
    897         // Count Forward Zero in this pack
    898         Value* cursorValue = this->getCursorValue(iBuilder, dstCursorName);
    899         maxPos = this->restoreCountForwardTempMaxPos(iBuilder, maxPos);
    900         auto retValue = this->generateCountForwardOnes(iBuilder, dstMarkerName, cursorValue, maxPos);
    901         Value* distance = retValue.first;
    902 
    903         // Memcpy from outputBuffer[cursorValue - copyOffset : cursorValue - copyOffset + distance] to outputBuffer[cursorValue : cursorValue + distance]
    904         BasicBlock* matchCopyEntryBlock = iBuilder->CreateBasicBlock("memcpy_output_dst_cursor_until_next_zero_matchcpy_entry");
    905         BasicBlock* matchCopyExitBlock = iBuilder->CreateBasicBlock("memcpy_output_dst_cursor_until_next_zero_matchcpy_exit");
    906         Value* outputOffset = this->getCursorValue(iBuilder, dstCursorName);
    907 
    908         iBuilder->CreateBr(matchCopyEntryBlock);
    909 
    910         iBuilder->SetInsertPoint(matchCopyEntryBlock);
    911         copyOffset = iBuilder->getScalarField(MemCpyUntilZeroCopyOffsetTempKey);
    912         this->generateDstMatchCopy(iBuilder, matchCopyEntryBlock, matchCopyExitBlock, outputBufferName, copyOffset, distance, outputOffset);
    913 
    914         iBuilder->SetInsertPoint(matchCopyExitBlock);
    915         // Update Cursor Value and producedItemCount
    916         this->advanceCursor(iBuilder, dstCursorName, distance);
    917         iBuilder->setProducedItemCount(outputBufferName, this->getCursorValue(iBuilder, dstCursorName));
    918 
    919         // Finish
    920         Value* isFinished = retValue.second;
    921         iBuilder->CreateCondBr(isFinished, exitBlock, entryBlock);
    922 
    923 
    924         iBuilder->SetInsertPoint(exitBlock);
    925 
    926         return exitBlock;
    927     }
    928 
    929     void SequentialKernel::generateDstMatchCopy(const std::unique_ptr<KernelBuilder> & iBuilder, BasicBlock* entry, BasicBlock* exit, string outputBufferName, Value* matchOffset, Value* matchLength, Value* outputOffset) {
    930         iBuilder->SetInsertPoint(entry);
    931 
    932         Value * outputBufferBasePtr = iBuilder->getRawOutputPointer(outputBufferName, iBuilder->getSize(0));
    933 
    934         Value* bufferSize = iBuilder->getSize(this->getOutputBufferSize(iBuilder, outputBufferName));
    935         Value* bufferSizeMask = iBuilder->CreateSub(bufferSize, iBuilder->getSize(1));
    936 
    937 
    938         Value* matchStart = iBuilder->CreateSub(outputOffset, matchOffset);
    939         Value * baseSrcOffset = iBuilder->CreateAnd(matchStart, bufferSizeMask);
    940         Value * baseDstOffset = iBuilder->CreateAnd(outputOffset, bufferSizeMask);
    941 
    942 
    943         Value * copyStep = iBuilder->CreateSelect(
    944                 iBuilder->CreateICmpULT(matchOffset, iBuilder->getSize(4)),
    945                 iBuilder->getSize(1),
    946                 iBuilder->getSize(4)
    947         );
    948 
    949 
    950         BasicBlock * cpyLoopCond = iBuilder->CreateBasicBlock("matchcopy_loop_cond");
    951         BasicBlock * cpyLoopBody = iBuilder->CreateBasicBlock("matchcopy_loop_body");
    952         BasicBlock * cpyLoopExit = iBuilder->CreateBasicBlock("matchcopy_loop_exit");
    953 
    954 
    955         iBuilder->CreateBr(cpyLoopCond);
    956 
    957         iBuilder->SetInsertPoint(cpyLoopCond);
    958 
    959         PHINode * phiSrcOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
    960         PHINode * phiDstOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
    961         PHINode * phiIter = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
    962         phiSrcOffset->addIncoming(baseSrcOffset, entry);
    963         phiDstOffset->addIncoming(baseDstOffset, entry);
    964         phiIter->addIncoming(iBuilder->getSize(0), entry);
    965 
    966         iBuilder->CreateCondBr(
    967                 iBuilder->CreateICmpUGE(phiIter, matchLength),
    968                 cpyLoopExit,
    969                 cpyLoopBody
    970         );
    971 
    972         iBuilder->SetInsertPoint(cpyLoopBody);
    973         BasicBlock * reachingBufferEnd_then = iBuilder->CreateBasicBlock("matchcopy_reaching_buf_end_then");
    974         BasicBlock * reachingBufferEnd_else = iBuilder->CreateBasicBlock("matchcopy_reaching_buf_end_else");
    975 
    976 
    977         Value * distSrcEnd = iBuilder->CreateSub(bufferSize, phiSrcOffset);
    978         Value * distDstEnd = iBuilder->CreateSub(bufferSize, phiDstOffset);
    979         Value * minDist = iBuilder->CreateSelect(iBuilder->CreateICmpULT(distSrcEnd, distDstEnd), distSrcEnd, distDstEnd);
    980         iBuilder->CreateUnlikelyCondBr(
    981                 iBuilder->CreateICmpULE(minDist, iBuilder->getSize(4)),
    982                 reachingBufferEnd_then,
    983                 reachingBufferEnd_else
    984         );
    985 
    986         iBuilder->SetInsertPoint(reachingBufferEnd_then);
    987 
    988         Value * src8 = iBuilder->CreateGEP(outputBufferBasePtr, phiSrcOffset);
    989         Value * dst8 = iBuilder->CreateGEP(outputBufferBasePtr, phiDstOffset);
    990         iBuilder->CreateStore(iBuilder->CreateLoad(src8), dst8);
    991         Value * newSrcOffset = iBuilder->CreateAnd(
    992                 iBuilder->CreateAdd(phiSrcOffset, iBuilder->getSize(1)),
    993                 bufferSizeMask
    994         );
    995         Value * newDstOffset = iBuilder->CreateAnd(
    996                 iBuilder->CreateAdd(phiDstOffset, iBuilder->getSize(1)),
    997                 bufferSizeMask
    998         );
    999         phiSrcOffset->addIncoming(newSrcOffset, reachingBufferEnd_then);
    1000         phiDstOffset->addIncoming(newDstOffset, reachingBufferEnd_then);
    1001         phiIter->addIncoming(iBuilder->CreateAdd(phiIter, iBuilder->getSize(1)), reachingBufferEnd_then);
    1002         iBuilder->CreateBr(cpyLoopCond);
    1003 
    1004 
    1005         iBuilder->SetInsertPoint(reachingBufferEnd_else);
    1006         // Copy 4 bytes at a time (regardless of step length).
    1007         Value * src32 = iBuilder->CreatePointerCast(
    1008                 iBuilder->CreateGEP(outputBufferBasePtr, phiSrcOffset),
    1009                 iBuilder->getInt32Ty()->getPointerTo());
    1010         Value * dst32 = iBuilder->CreatePointerCast(
    1011                 iBuilder->CreateGEP(outputBufferBasePtr, phiDstOffset),
    1012                 iBuilder->getInt32Ty()->getPointerTo());
    1013         // Force unaligned load/store of an int32.
    1014         iBuilder->CreateAlignedStore(iBuilder->CreateAlignedLoad(src32, 1), dst32, 1);
    1015         newSrcOffset = iBuilder->CreateAnd(
    1016                 iBuilder->CreateAdd(phiSrcOffset, copyStep),
    1017                 bufferSizeMask
    1018         );
    1019         newDstOffset = iBuilder->CreateAnd(
    1020                 iBuilder->CreateAdd(phiDstOffset, copyStep),
    1021                 bufferSizeMask
    1022         );
    1023         phiSrcOffset->addIncoming(newSrcOffset, reachingBufferEnd_else);
    1024         phiDstOffset->addIncoming(newDstOffset, reachingBufferEnd_else);
    1025         phiIter->addIncoming(iBuilder->CreateAdd(phiIter, copyStep), reachingBufferEnd_else);
    1026         iBuilder->CreateBr(cpyLoopCond);
    1027 
    1028         iBuilder->SetInsertPoint(cpyLoopExit);
    1029         outputOffset = iBuilder->CreateAdd(outputOffset, matchLength);
    1030 
    1031         iBuilder->CreateBr(exit);
    1032     }
    1033 
    1034 
    1035     BasicBlock* SequentialKernel::waitCursorUntilInputAvailable(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, std::string inputStreamBufferName) {
     318
     319    BasicBlock *SequentialKernel::waitCursorUntilInputAvailable(const std::unique_ptr<KernelBuilder> &iBuilder,
     320                                                                std::string cursorName,
     321                                                                std::string inputStreamBufferName) {
    1036322//        BasicBlock* entryBlock = iBuilder->GetInsertBlock();
    1037         Value* nextStateValue = iBuilder->getSize(this->stateBlocks.size());
    1038 
    1039         BasicBlock* restoreBlock = iBuilder->CreateBasicBlock("wait_cursor_until_input_available_restore");
    1040         BasicBlock* continueBlock = iBuilder->CreateBasicBlock("wait_cursor_until_input_available_continue");
     323        Value *nextStateValue = iBuilder->getSize(this->stateBlocks.size());
     324
     325        BasicBlock *restoreBlock = iBuilder->CreateBasicBlock("wait_cursor_until_input_available_restore");
     326        BasicBlock *continueBlock = iBuilder->CreateBasicBlock("wait_cursor_until_input_available_continue");
    1041327
    1042328        this->stateBlocks.push_back(restoreBlock);
     
    1046332        iBuilder->SetInsertPoint(restoreBlock);
    1047333
    1048         Value* cursorValue = this->getCursorValue(iBuilder, cursorName);
    1049         Value* itemTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(inputStreamBufferName), iBuilder->getProcessedItemCount(inputStreamBufferName));
    1050         Value* isAvailable = iBuilder->CreateICmpULT(cursorValue, itemTotal);
    1051 
    1052         Value* nextState = iBuilder->CreateSelect(isAvailable, iBuilder->getSize(0), nextStateValue);
     334        Value *cursorValue = this->getCursorValue(iBuilder, cursorName);
     335        Value *itemTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(inputStreamBufferName),
     336                                               iBuilder->getProcessedItemCount(inputStreamBufferName));
     337        Value *isAvailable = iBuilder->CreateICmpULT(cursorValue, itemTotal);
     338
     339        Value *nextState = iBuilder->CreateSelect(isAvailable, iBuilder->getSize(0), nextStateValue);
    1053340        iBuilder->setScalarField(SequentialSegmentStateKey, nextState);
    1054341
     
    1070357    }
    1071358
    1072     Value* SequentialKernel::offsetToPackBaseOffset(const unique_ptr<KernelBuilder> &iBuilder, Value* offset) {
     359    Value *SequentialKernel::offsetToPackBaseOffset(const unique_ptr<KernelBuilder> &iBuilder, Value *offset) {
    1073360        return iBuilder->CreateShl(
    1074361                this->offsetToPackIndex(iBuilder, offset),
     
    1076363        );
    1077364    }
    1078     Value* SequentialKernel::offsetToPackIndex(const unique_ptr<KernelBuilder> &iBuilder, Value* offset) {
     365
     366    Value *SequentialKernel::offsetToPackIndex(const unique_ptr<KernelBuilder> &iBuilder, Value *offset) {
    1079367        return iBuilder->CreateLShr(offset, iBuilder->getSize(std::log2(64)));
    1080368    }
    1081369
    1082     Value* SequentialKernel::offsetToPackOffset(const unique_ptr<KernelBuilder> &iBuilder, Value* offset) {
     370    Value *SequentialKernel::offsetToPackOffset(const unique_ptr<KernelBuilder> &iBuilder, Value *offset) {
    1083371        return iBuilder->CreateAnd(offset, iBuilder->getSize(64 - 1));
    1084372    }
    1085373
    1086     Value* SequentialKernel::offsetToActualBufferOffset(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName, Value* offset) {
     374    Value *
     375    SequentialKernel::offsetToActualBufferOffset(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName,
     376                                                 Value *offset) {
    1087377        size_t bufferSize = this->getInputBufferSize(iBuilder, inputBufferName);
    1088         Value* bufferOffsetMask = iBuilder->getSize(bufferSize - 1);
     378        Value *bufferOffsetMask = iBuilder->getSize(bufferSize - 1);
    1089379        return iBuilder->CreateAnd(bufferOffsetMask, offset);
    1090380    }
    1091381
    1092     Value* SequentialKernel::generateLoadCircularInputPack(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName, Value* offset) {
    1093         Value* actualBufferOffset = this->offsetToActualBufferOffset(iBuilder, inputBufferName, offset);
    1094         Value* packIndex = this->offsetToPackIndex(iBuilder, actualBufferOffset);
     382    Value *
     383    SequentialKernel::generateLoadCircularInputPack(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName,
     384                                                    Value *offset) {
     385        Value *actualBufferOffset = this->offsetToActualBufferOffset(iBuilder, inputBufferName, offset);
     386        Value *packIndex = this->offsetToPackIndex(iBuilder, actualBufferOffset);
    1095387//        Value* countStartBitIndex = this->offsetToPackOffset(iBuilder, actualBufferOffset);
    1096388
    1097389
    1098         Value* inputStreamPtr = iBuilder->getInputStreamBlockPtr(inputBufferName, iBuilder->getInt32(0));
     390        Value *inputStreamPtr = iBuilder->getInputStreamBlockPtr(inputBufferName, iBuilder->getInt32(0));
    1099391        inputStreamPtr = iBuilder->CreatePointerCast(inputStreamPtr, iBuilder->getInt64Ty()->getPointerTo());
    1100392        return iBuilder->CreateLoad(iBuilder->CreateGEP(inputStreamPtr, packIndex));
     
    1104396    }
    1105397
    1106     Value* SequentialKernel::generateLoadCircularInput(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName, Value* offset, Type* pointerType) {
     398    Value *
     399    SequentialKernel::generateLoadCircularInput(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName,
     400                                                Value *offset, Type *pointerType) {
    1107401        size_t inputSize = this->getInputBufferSize(iBuilder, inputBufferName);
    1108         Value* offsetMask = iBuilder->getSize(inputSize - 1);
    1109         Value* maskedOffset = iBuilder->CreateAnd(offsetMask, offset);
    1110 
    1111         Value* inputBufferPtr = iBuilder->getRawInputPointer(inputBufferName, iBuilder->getSize(0));
     402        Value *offsetMask = iBuilder->getSize(inputSize - 1);
     403        Value *maskedOffset = iBuilder->CreateAnd(offsetMask, offset);
     404
     405        Value *inputBufferPtr = iBuilder->getRawInputPointer(inputBufferName, iBuilder->getSize(0));
    1112406
    1113407        inputBufferPtr = iBuilder->CreatePointerCast(inputBufferPtr, pointerType);
    1114408        return iBuilder->CreateLoad(iBuilder->CreateGEP(inputBufferPtr, maskedOffset));
    1115409    }
    1116     Value* SequentialKernel::generateLoadCircularOutput(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName, Value* offset, Type* pointerType) {
    1117         size_t inputSize = this->getOutputBufferSize(iBuilder, inputBufferName);
    1118         Value* offsetMask = iBuilder->getSize(inputSize - 1);
    1119         Value* maskedOffset = iBuilder->CreateAnd(offsetMask, offset);
    1120 
    1121         Value* inputBufferPtr = iBuilder->getRawOutputPointer(inputBufferName, iBuilder->getSize(0));
    1122 
    1123         inputBufferPtr = iBuilder->CreatePointerCast(inputBufferPtr, pointerType);
    1124         return iBuilder->CreateLoad(iBuilder->CreateGEP(inputBufferPtr, maskedOffset));
    1125     }
    1126 
    1127     Value* SequentialKernel::generateLoadSourceInputByte(const std::unique_ptr<KernelBuilder> &iBuilder, string sourceBufferName, Value* offset) {
    1128         Value * blockStartPtr = iBuilder->CreatePointerCast(
     410
     411    Value *SequentialKernel::generateLoadSourceInputByte(const std::unique_ptr<KernelBuilder> &iBuilder,
     412                                                         string sourceBufferName, Value *offset) {
     413        Value *blockStartPtr = iBuilder->CreatePointerCast(
    1129414                iBuilder->getInputStreamBlockPtr(sourceBufferName, iBuilder->getInt32(0)),
    1130415                iBuilder->getInt8PtrTy()
    1131416        );
    1132         Value * ptr = iBuilder->CreateGEP(blockStartPtr, offset);
     417        Value *ptr = iBuilder->CreateGEP(blockStartPtr, offset);
    1133418
    1134419
     
    1137422
    1138423
    1139     void SequentialKernel::generateStoreCircularOutput(const unique_ptr<KernelBuilder> &iBuilder, string outputBufferName, Type* pointerType, Value* value) {
    1140         Value* offset = iBuilder->getProducedItemCount(outputBufferName);
     424    void
     425    SequentialKernel::generateStoreCircularOutput(const unique_ptr<KernelBuilder> &iBuilder, string outputBufferName,
     426                                                  Type *pointerType, Value *value) {
     427        Value *offset = iBuilder->getProducedItemCount(outputBufferName);
    1141428
    1142429        size_t inputSize = this->getOutputBufferSize(iBuilder, outputBufferName);
    1143         Value* offsetMask = iBuilder->getSize(inputSize - 1);
    1144         Value* maskedOffset = iBuilder->CreateAnd(offsetMask, offset);
    1145 
    1146         Value* outputBufferPtr = iBuilder->getRawOutputPointer(outputBufferName, iBuilder->getSize(0));
     430        Value *offsetMask = iBuilder->getSize(inputSize - 1);
     431        Value *maskedOffset = iBuilder->CreateAnd(offsetMask, offset);
     432
     433        Value *outputBufferPtr = iBuilder->getRawOutputPointer(outputBufferName, iBuilder->getSize(0));
    1147434
    1148435        outputBufferPtr = iBuilder->CreatePointerCast(outputBufferPtr, pointerType);
     
    1153440    }
    1154441
    1155     void SequentialKernel::increaseScalarField(const std::unique_ptr<KernelBuilder> &iBuilder, const std::string& fieldName, llvm::Value* value) {
    1156         Value* fieldValue = iBuilder->getScalarField(fieldName);
     442    void
     443    SequentialKernel::increaseScalarField(const std::unique_ptr<KernelBuilder> &iBuilder, const std::string &fieldName,
     444                                          llvm::Value *value) {
     445        Value *fieldValue = iBuilder->getScalarField(fieldName);
    1157446        fieldValue = iBuilder->CreateAdd(fieldValue, value);
    1158447        iBuilder->setScalarField(fieldName, fieldValue);
     
    1160449
    1161450
    1162     void SequentialKernel::markCircularOutputBitstreamOnePack(const std::unique_ptr<KernelBuilder> &iBuilder, const std::string& bitstreamName, llvm::Value* start, llvm::Value* end, bool isOne) {
    1163         Value* outputBasePtr = iBuilder->getRawOutputPointer(bitstreamName, iBuilder->getSize(0));
     451    void SequentialKernel::markCircularOutputBitstreamOnePack(const std::unique_ptr<KernelBuilder> &iBuilder,
     452                                                              const std::string &bitstreamName, llvm::Value *start,
     453                                                              llvm::Value *end, bool isOne) {
     454        Value *outputBasePtr = iBuilder->getRawOutputPointer(bitstreamName, iBuilder->getSize(0));
    1164455
    1165456        outputBasePtr = iBuilder->CreatePointerCast(outputBasePtr, iBuilder->getInt64Ty()->getPointerTo());
    1166457
    1167458        size_t outputBufferSize = this->getOutputBufferSize(iBuilder, bitstreamName);
    1168         Value* outputMask = iBuilder->getSize(outputBufferSize / 64 - 1);
    1169 
    1170 
    1171         Value* startOffset = iBuilder->CreateLShr(start, iBuilder->getSize(std::log2(64)), "startOffset");
    1172         Value* curOffset = startOffset;
    1173 
    1174 
    1175         Value* outputLowestBitValue = iBuilder->CreateSelect(
     459        Value *outputMask = iBuilder->getSize(outputBufferSize / 64 - 1);
     460
     461
     462        Value *startOffset = iBuilder->CreateLShr(start, iBuilder->getSize(std::log2(64)), "startOffset");
     463        Value *curOffset = startOffset;
     464
     465
     466        Value *outputLowestBitValue = iBuilder->CreateSelect(
    1176467                iBuilder->CreateICmpULE(
    1177468                        iBuilder->CreateShl(curOffset, std::log2(64)),
     
    1182473        );
    1183474
    1184         Value* outputHighestBitValue = iBuilder->CreateShl(
     475        Value *outputHighestBitValue = iBuilder->CreateShl(
    1185476                iBuilder->getSize(1),
    1186477                iBuilder->CreateAnd(end, iBuilder->getSize(64 - 1))
     
    1188479
    1189480
    1190         Value* bitMask = iBuilder->CreateSub(
     481        Value *bitMask = iBuilder->CreateSub(
    1191482                outputHighestBitValue,
    1192483                outputLowestBitValue
     
    1199490
    1200491    // Assume we have enough output buffer
    1201     llvm::BasicBlock* SequentialKernel::markCircularOutputBitstream(const std::unique_ptr<KernelBuilder> &iBuilder, const std::string& bitstreamName, llvm::Value* start, llvm::Value* end, bool isOne, bool setProduced) {
    1202         BasicBlock* entryBlock = iBuilder->GetInsertBlock();
    1203 
    1204         Value* outputBasePtr = iBuilder->getRawOutputPointer(bitstreamName, iBuilder->getSize(0));
     492    llvm::BasicBlock *SequentialKernel::markCircularOutputBitstream(const std::unique_ptr<KernelBuilder> &iBuilder,
     493                                                                    const std::string &bitstreamName,
     494                                                                    llvm::Value *start, llvm::Value *end, bool isOne,
     495                                                                    bool setProduced) {
     496        BasicBlock *entryBlock = iBuilder->GetInsertBlock();
     497
     498        Value *outputBasePtr = iBuilder->getRawOutputPointer(bitstreamName, iBuilder->getSize(0));
    1205499
    1206500        outputBasePtr = iBuilder->CreatePointerCast(outputBasePtr, iBuilder->getInt64Ty()->getPointerTo());
    1207501
    1208502        size_t outputBufferSize = this->getOutputBufferSize(iBuilder, bitstreamName);
    1209         Value* outputMask = iBuilder->getSize(outputBufferSize / 64 - 1);
    1210 
    1211         BasicBlock* conBlock = iBuilder->CreateBasicBlock("mark_bit_one_con");
    1212         BasicBlock* bodyBlock =iBuilder->CreateBasicBlock("mark_bit_one_body");
    1213         BasicBlock* exitBlock =iBuilder->CreateBasicBlock("mark_bit_one_exit");
    1214 
    1215         Value* startOffset = iBuilder->CreateLShr(start, iBuilder->getSize(std::log2(64)), "startOffset");
     503        Value *outputMask = iBuilder->getSize(outputBufferSize / 64 - 1);
     504
     505        BasicBlock *conBlock = iBuilder->CreateBasicBlock("mark_bit_one_con");
     506        BasicBlock *bodyBlock = iBuilder->CreateBasicBlock("mark_bit_one_body");
     507        BasicBlock *exitBlock = iBuilder->CreateBasicBlock("mark_bit_one_exit");
     508
     509        Value *startOffset = iBuilder->CreateLShr(start, iBuilder->getSize(std::log2(64)), "startOffset");
    1216510
    1217511        iBuilder->CreateBr(conBlock);
     
    1221515
    1222516
    1223         PHINode* curOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
     517        PHINode *curOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
    1224518        curOffset->addIncoming(startOffset, entryBlock);
    1225519
     
    1232526        // Body
    1233527        iBuilder->SetInsertPoint(bodyBlock);
    1234         Value* maskedOffset = iBuilder->CreateAnd(curOffset, outputMask);
    1235 
    1236         Value* outputLowestBitValue = iBuilder->CreateSelect(
     528        Value *maskedOffset = iBuilder->CreateAnd(curOffset, outputMask);
     529
     530        Value *outputLowestBitValue = iBuilder->CreateSelect(
    1237531                iBuilder->CreateICmpULE(
    1238532                        iBuilder->CreateShl(curOffset, std::log2(64)),
     
    1243537        );
    1244538
    1245         Value* hasNotReachEnd = iBuilder->CreateICmpULE(
     539        Value *hasNotReachEnd = iBuilder->CreateICmpULE(
    1246540                iBuilder->CreateShl(iBuilder->CreateAdd(curOffset, iBuilder->getSize(1)), std::log2(64)),
    1247541                end
    1248542        );
    1249         Value* producedItemsCount = iBuilder->CreateSelect(
     543        Value *producedItemsCount = iBuilder->CreateSelect(
    1250544                hasNotReachEnd,
    1251545                iBuilder->CreateShl(iBuilder->CreateAdd(curOffset, iBuilder->getSize(1)), std::log2(64)),
     
    1253547        );
    1254548
    1255         Value* outputHighestBitValue = iBuilder->CreateSelect(
     549        Value *outputHighestBitValue = iBuilder->CreateSelect(
    1256550                hasNotReachEnd,
    1257551                iBuilder->getSize(0),
     
    1263557
    1264558
    1265         Value* bitMask = iBuilder->CreateSub(
     559        Value *bitMask = iBuilder->CreateSub(
    1266560                outputHighestBitValue,
    1267561                outputLowestBitValue
     
    1272566        }
    1273567
    1274         Value* targetPtr = iBuilder->CreateGEP(outputBasePtr, maskedOffset);
    1275         Value* oldValue = iBuilder->CreateLoad(targetPtr);
    1276         Value* newValue = NULL;
     568        Value *targetPtr = iBuilder->CreateGEP(outputBasePtr, maskedOffset);
     569        Value *oldValue = iBuilder->CreateLoad(targetPtr);
     570        Value *newValue = NULL;
    1277571        if (isOne) {
    1278572            newValue = iBuilder->CreateOr(oldValue, bitMask);
  • icGREP/icgrep-devel/icgrep/kernels/sequential_kernel.h

    r5864 r5905  
    2121    class SequentialKernel : public MultiBlockKernel {
    2222    protected:
    23         std::map<std::string, size_t> inputStreamIndexMap;
    24         std::map<std::string, std::string> clearBufferMap;
    25 
    2623        SequentialKernel(
    2724                const std::unique_ptr<kernel::KernelBuilder> & iBuilder,
     
    3431
    3532        virtual void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides) override;
    36 //        virtual void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> &iBuilder) override;
    37 
    3833
    3934        virtual void generateDoSequentialSegmentMethod(const std::unique_ptr<KernelBuilder> &iBuilder);
    40 
    41         //Index Bit
    42         void configIndexBits(const std::unique_ptr<KernelBuilder> &iBuilder, const std::map<std::string, size_t>& inputStreamMap);
    43 
    44         void configOutputBufferToBeClear(const std::map<std::string, std::string>& clearMap);
    45 
    4635
    4736        //Cursor
     
    5544                                                     std::string cursorName, std::string inputStreamBufferName, llvm::Value* maxPos);
    5645
    57         llvm::BasicBlock* memcpy2CursorsUntilNextZero(
    58                 const std::unique_ptr<KernelBuilder> &iBuilder,
    59                 std::string sourceBufferName,
    60                 std::string sourceCursorName,
    61                 std::string dstBufferName,
    62                 std::string dstCursorName,
    63                 std::string sourceMarkerName,
    64                 llvm::Value* maxPos);
    65         llvm::BasicBlock* memcpyOutputDstCursorUntilNextZero(
    66                 const std::unique_ptr<KernelBuilder> &iBuilder,
    67                 std::string outputBufferName,
    68                 llvm::Value* copyOffset,
    69                 std::string dstCursorName,
    70                 std::string dstMarkerName,
    71                 llvm::Value* maxPos
    72         );
    73 
    74         llvm::BasicBlock* memcpyOutputDst(
    75                 const std::unique_ptr<KernelBuilder> &iBuilder,
    76                 std::string outputBufferName,
    77                 llvm::Value* copyOffset,
    78                 llvm::Value* copyLength
    79 
    80         );
    81 
    82         void memcpyCircularBuffer(
    83                 const std::unique_ptr<KernelBuilder> &iBuilder,
    84                 std::string sourceBufferName,
    85                 llvm::Value* sourceOffset,
    86                 std::string dstBufferName,
    87                 llvm::Value* dstOffset,
    88                 llvm::Value* distance
    89         );
    90 
    9146        // Helper Functions
    9247        void markCircularOutputBitstreamOnePack(const std::unique_ptr<KernelBuilder> &iBuilder, const std::string& bitstreamName, llvm::Value* start, llvm::Value* end, bool isOne);
     
    9752        llvm::Value* generateLoadSourceInputByte(const std::unique_ptr<KernelBuilder> &iBuilder, std::string sourceBuffername, llvm::Value* offset);
    9853        void generateStoreCircularOutput(const std::unique_ptr<KernelBuilder> &iBuilder, std::string outputBufferName, llvm::Type* pointerType, llvm::Value* value);
    99         llvm::Value* generateLoadCircularOutput(const std::unique_ptr<KernelBuilder> &iBuilder, std::string inputBufferName, llvm::Value* offset, llvm::Type* pointerType);
    10054        void increaseScalarField(const std::unique_ptr<KernelBuilder> &iBuilder, const std::string& fieldName, llvm::Value* value);
    10155
    102 //        void generateStoreCircularOutput1(const std::unique_ptr<KernelBuilder> &iBuilder, std::string outputBufferName, llvm::Type* pointerType, llvm::Value* value);
    10356
    10457        llvm::Value* offsetToActualBufferOffset(const std::unique_ptr<KernelBuilder> &iBuilder, std::string inputBufferName, llvm::Value* offset);
     
    11467
    11568    private:
    116 
    11769        // forwardBits, packEnd, exceedAvailable
    11870        std::pair<llvm::Value*, std::pair<llvm::Value*, llvm::Value*>> genereateCountForwardBitsOnePack(const std::unique_ptr<KernelBuilder> &iBuilder, std::string inputStreamBufferName, llvm::Value* beginOffset, bool isZero = true);
    119         llvm::Value* generateCountIndexBit(const std::unique_ptr<KernelBuilder> &iBuilder, std::string bufferName, bool isZero, llvm::Value* beginBitIndex);
    12071
    12172        // forwardBits, isFinished
     
    12576        std::pair<llvm::Value*, llvm::Value*> generateCountForwardOnes(const std::unique_ptr<KernelBuilder> &iBuilder, std::string inputStreamBufferName, llvm::Value* beginOffset, llvm::Value* maxPos = NULL);
    12677
    127         void generateDstMatchCopy(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::BasicBlock* entry, llvm::BasicBlock* exit, std::string outputBufferName, llvm::Value* matchOffset, llvm::Value* matchLength, llvm::Value* outputOffset);
    128 
    129         void generateBuildIndexBits(const std::unique_ptr<KernelBuilder> &iBuilder);
    130         inline bool hasIndexBits(const std::string& streamName);
    131         void generateClearBuffer(const std::unique_ptr<KernelBuilder> &iBuilder);
    132 
    13378        std::vector<llvm::BasicBlock*> stateBlocks;
    13479
    13580        inline std::string generateCursorFullname(std::string cursorName);
    136         inline std::string generateInputZeroIndexName(std::string inputStreamName);
    137         inline std::string generateInputOneIndexName(std::string inputStreamName);
    138         inline std::string generateInputPreviousAvailableName(std::string inputStreamName);
    139 
    140 
    14181
    14282        // CursorValue should not be set directly in user implemented kernel
     
    14484
    14585        llvm::BasicBlock* exitBlock;
    146 
    14786
    14887        inline void recordCountForwardTempMaxPos(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value* maxPos);
Note: See TracChangeset for help on using the changeset viewer.