Ignore:
Timestamp:
Feb 28, 2018, 11:07:48 PM (16 months ago)
Author:
xwa163
Message:

Implement lz4_numbers_to_bitstream_kernel in the new kernel infrastructure; fix a bug in the extract and deposit processes of lz4_ext_dep on large data.

Location:
icGREP/icgrep-devel/icgrep/kernels/lz4
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_extract_e_m0.cpp

    r5864 r5885  
    77
    88//#define APPLY_64PACK_ACCELERATION
     9// TODO: Maybe we can change it to 256-pack acceleration based on SIMD instructions
     10
    911#define ACCELERATION_LOOP_COUNT (20)
    1012
     
    604606    iBuilder->SetInsertPoint(handleM0ElseBlock);
    605607    this->advanceCursorUntilPos(iBuilder, "extender", iBuilder->getScalarField("offsetPos"));
     608
     609    // Store final M0 pos to make sure the bit stream will be long enough
     610    Value* finalM0OutputPos = iBuilder->getScalarField("m0OutputPos");
     611    this->generateStoreCircularOutput(iBuilder, "m0Start", iBuilder->getInt64Ty()->getPointerTo(), finalM0OutputPos);
     612    this->generateStoreCircularOutput(iBuilder, "m0End", iBuilder->getInt64Ty()->getPointerTo(), finalM0OutputPos);
     613    this->generateStoreCircularOutput(iBuilder, "matchOffset", iBuilder->getInt64Ty()->getPointerTo(), iBuilder->getInt64(0));
     614
    606615    iBuilder->CreateBr(compressedBlockLoopFinal);
    607616
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_multiple_pdep_kernel.cpp

    r5873 r5885  
    4747
    4848        Value * sourceItemsAvail = mAvailableItemCount[1]; //TODO need to be calculated from numOfStrides
    49 //    kb->CallPrintInt("itemsToDo:", itemsToDo);
    50 //        kb->CallPrintInt("sourceItemsAvail:", sourceItemsAvail);
    51 //        kb->getProcessedItemCount("")
    52 //        kb->CallPrintInt("sourceItemsAvail2:", sourceItemsAvail2);
    53 
    5449
    5550        Value * PDEPStrmPtr = kb->getInputStreamBlockPtr("PDEPmarkerStream", kb->getInt32(0)); // mStreamBufferPtr[0];
     
    109104//    kb->CallPrintInt("total_count", total_count);
    110105//    kb->CallPrintInt("sourceItemsRemaining", sourceItemsRemaining);
    111         kb->CreateCondBr(kb->CreateICmpULE(total_count, sourceItemsRemaining), processBlock, terminate);
     106        // Do not check the popcount in the final block, since there may be some unused PDEP markers at the end
     107        kb->CreateCondBr(kb->CreateOr(kb->CreateICmpULE(total_count, sourceItemsRemaining), mIsFinal), processBlock, terminate);
    112108        kb->SetInsertPoint(processBlock);
    113109
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_numbers_to_bitstream_kernel.cpp

    r5864 r5885  
    1 //
    2 // Created by wxy325 on 2017/8/9.
    3 //
    41
    52#include "lz4_numbers_to_bitstream_kernel.h"
     
    107#include <kernels/streamset.h>
    118
    12 #define CURRENT_PROCESS_INDEX_KEY ("currentProcessIndex")
    13 #define CURRENT_PACK_INDEX_KEY ("currentPackIndex")
    14 #define CURRENT_PACK_START_VALUE_KEY ("currentPackStartValue")
    15 #define CURRENT_PACK_END_VALUE_KEY ("currentPackEndValue")
    16 #define CARRY_BIT_KEY ("carryBit")
    17 
    189#define START_NUM_STREAM_NAME ("startNumberStream")
    1910#define END_NUM_STREAM_NAME ("endNumberStream")
    20 #define OUTPUT_BIT_STREAM_NAME ("outputBitStream")
    21 
    22 
    23 #define WORD_WIDTH (64)
    24 #define LOG_2_WORD_WIDTH (std::log2(WORD_WIDTH))
     11#define OUTPUT_BIT_STREAM_NAME ("outputBitStream__")
     12
     13#define PENDING_START_DATA_KEY ("pendingStartData")
     14#define PENDING_END_DATA_KEY ("pendingEndData")
    2515
    2616using namespace llvm;
     
    2919
    3020namespace kernel {
    31     void LZ4NumbersToBitstreamKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
    32         //TODO
    33 //    void LZ4NumbersToBitstreamKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> &iBuilder) {
    34         BasicBlock* entryBlock = iBuilder->GetInsertBlock();
    35         BasicBlock* dataLoopCon = iBuilder->CreateBasicBlock("data_loop_con");
    36         BasicBlock* dataLoopBody = iBuilder->CreateBasicBlock("data_loop_body");
    37         BasicBlock* dataLoopExit = iBuilder->CreateBasicBlock("data_loop_exit");
    38 
    39         Value* initPackStartValue = iBuilder->getScalarField(CURRENT_PACK_START_VALUE_KEY);
    40         Value* initPackEndValue = iBuilder->getScalarField(CURRENT_PACK_END_VALUE_KEY);
    41 
    42         Value* initPackIndex = iBuilder->getScalarField(CURRENT_PACK_INDEX_KEY);
    43         Value* initProcessIndex = iBuilder->getScalarField(CURRENT_PROCESS_INDEX_KEY);
    44         Value* initCarryBit = iBuilder->getScalarField(CARRY_BIT_KEY);
    45 
    46         //EntryBlock
    47 //        Value* numItemAvailable = iBuilder->CreateAdd(
    48 //                iBuilder->getAvailableItemCount(START_NUM_STREAM_NAME),
    49 //                iBuilder->getProcessedItemCount(START_NUM_STREAM_NAME)
     21
     22    void LZ4NumbersToBitstreamKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder,
     23                                                               llvm::Value *const numOfStrides) {
     24
     25//        iBuilder->CallPrintInt("======Entry", iBuilder->getSize(0));
     26//        iBuilder->CallPrintInt("mIsFinal", mIsFinal);
     27//        iBuilder->CallPrintInt("numOfStrides", numOfStrides);
     28
     29        // Const
     30        Constant *SIZE_ZERO = iBuilder->getSize(0);
     31        Constant *SIZE_ONE = iBuilder->getSize(1);
     32        Constant *INT64_ZERO = iBuilder->getInt64(0);
     33        Constant *INT64_ONE = iBuilder->getInt64(1);
     34        Constant *BIT_BLOCK_ZERO = llvm::ConstantVector::get(
     35                {INT64_ZERO, INT64_ZERO, INT64_ZERO, INT64_ZERO}); // TODO Assumed bit block type is always <4 * i64>
     36        unsigned int BIT_BLOCK_WIDTH = iBuilder->getBitBlockWidth();
     37        Constant *SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(BIT_BLOCK_WIDTH);
     38
     39
     40        size_t outputBufferSize = this->getAnyBufferSize(iBuilder, OUTPUT_BIT_STREAM_NAME) / iBuilder->getStride();
     41        Value *outputRawBeginPtr = iBuilder->CreatePointerCast(
     42                iBuilder->getRawOutputPointer(OUTPUT_BIT_STREAM_NAME, SIZE_ZERO),
     43                iBuilder->getBitBlockType()->getPointerTo());
     44        Value *outputCurrentPtr = iBuilder->getOutputStreamBlockPtr(OUTPUT_BIT_STREAM_NAME, SIZE_ZERO);
     45//        outputRawBeginPtr->getType()->print(outs());
     46//        outputCurrentPtr->getType()->print(outs());
     47
     48        Value *offset = iBuilder->CreatePtrDiff(outputCurrentPtr, outputRawBeginPtr);
     49        Value *remainSpace = iBuilder->CreateSub(iBuilder->getSize(outputBufferSize), offset);
     50//        iBuilder->CallPrintInt("remainSpace",
     51//                               remainSpace); //TODO workaround here, kernel infrastructure should provide the information about how much data we can produce
     52
     53
     54        BasicBlock *entryBlock = iBuilder->GetInsertBlock();
     55
     56
     57        Value *itemsToDo = mAvailableItemCount[0];
     58//        iBuilder->CallPrintInt("itemsToDo", itemsToDo);
     59        Value *isFinalBlock = iBuilder->CreateICmpEQ(itemsToDo, iBuilder->getSize(0));
     60        iBuilder->setTerminationSignal(isFinalBlock);
     61
     62        Value *itemProcessed = iBuilder->getProcessedItemCount(START_NUM_STREAM_NAME);
     63        Value *oldProducedItemCount = iBuilder->getProducedItemCount(OUTPUT_BIT_STREAM_NAME);
     64        Value *oldProducedOutputBlockIndex = iBuilder->CreateUDiv(oldProducedItemCount,
     65                                                                  SIZE_BIT_BLOCK_WIDTH); // always produce full block except for final block
     66
     67
     68//        Value *initCurrentItemIndex = iBuilder->CreateSelect(
     69//                isFinalBlock,
     70//                SIZE_ZERO,
     71//                iBuilder->CreateURem(itemProcessed, SIZE_BIT_BLOCK_WIDTH)
    5072//        );
    5173
    52         Value* numItemTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(START_NUM_STREAM_NAME), iBuilder->getProcessedItemCount(START_NUM_STREAM_NAME));
    53 //        iBuilder->CallPrintInt("numItemAvailable", numItemAvailable);
    54 
    55 
    56         //dataLoopCon
    57         iBuilder->CreateBr(dataLoopCon);
    58         iBuilder->SetInsertPoint(dataLoopCon);
    59         PHINode* currentPackIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 5);
    60         PHINode* currentPackStartValue = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 5);
    61         PHINode* currentPackEndValue = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 5);
    62         PHINode* currentProcessIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 5);
    63         PHINode* isHandleStart = iBuilder->CreatePHI(iBuilder->getInt1Ty(), 5);
    64         PHINode* carryBit = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 5);
    65 
    66         currentPackIndex->addIncoming(initPackIndex, entryBlock);
    67         currentPackStartValue->addIncoming(initPackStartValue, entryBlock);
    68         currentPackEndValue->addIncoming(initPackEndValue, entryBlock);
    69         currentProcessIndex->addIncoming(initProcessIndex, entryBlock);
    70         isHandleStart->addIncoming(iBuilder->getInt1(true), entryBlock);
    71         carryBit->addIncoming(initCarryBit, entryBlock);
    72 
    73         Value* notReachMaxNum = iBuilder->CreateICmpULT(currentProcessIndex, numItemTotal);
    74         iBuilder->CreateCondBr(notReachMaxNum, dataLoopBody, dataLoopExit);
    75 
    76         //dataLoopBody
    77         iBuilder->SetInsertPoint(dataLoopBody);
    78 
    79         BasicBlock* dataStartLoopBlock = iBuilder->CreateBasicBlock("data_start_loop_block");
    80         BasicBlock* dataEndLoopBlock = iBuilder->CreateBasicBlock("data_end_loop_block");
    81 
    82 
    83         iBuilder->CreateCondBr(isHandleStart, dataStartLoopBlock, dataEndLoopBlock);
    84 
    85 
    86 ///////////Handle Index Start
    87         //dataStartIndexBlock
    88         iBuilder->SetInsertPoint(dataStartLoopBlock);
    89         Value* bitOneStartIndex = this->generateLoadCircularInput(iBuilder, START_NUM_STREAM_NAME, currentProcessIndex, iBuilder->getInt64Ty()->getPointerTo());
    90         Value* bitOneStartTargetPackIndex = iBuilder->CreateLShr(bitOneStartIndex, iBuilder->getSize(LOG_2_WORD_WIDTH));
    91 
    92         BasicBlock* markBitOneStartBlock = iBuilder->CreateBasicBlock("mark_bit_one_start_block");
    93         BasicBlock* increasePackIndexStartBlock = iBuilder->CreateBasicBlock("increase_pack_index_start_block");
    94 
    95         iBuilder->CreateCondBr(iBuilder->CreateICmpULT(currentPackIndex, bitOneStartTargetPackIndex), increasePackIndexStartBlock, markBitOneStartBlock);
    96 
    97         //IncreasePackIndexStartBlock
    98         iBuilder->SetInsertPoint(increasePackIndexStartBlock);
    99         Value* currentPackOutputPtr = this->getPackOutputPtr(iBuilder, currentPackIndex);
    100 
    101         iBuilder->CreateStore(iBuilder->CreateSub(
    102                 currentPackEndValue,
    103                 iBuilder->CreateAdd(currentPackStartValue, carryBit)
    104         ), currentPackOutputPtr);
    105 
    106 
    107         Value* newCarryBit = iBuilder->CreateSelect(
    108                 iBuilder->CreateICmpUGT(iBuilder->CreateAdd(currentPackStartValue, carryBit), currentPackEndValue),
    109                 iBuilder->getInt64(1),
    110                 iBuilder->getInt64(0)
    111         );
    112 
    113         iBuilder->setProducedItemCount(OUTPUT_BIT_STREAM_NAME, iBuilder->CreateShl(currentPackIndex, LOG_2_WORD_WIDTH));
    114 
    115         currentPackStartValue->addIncoming(iBuilder->getInt64(0), increasePackIndexStartBlock);
    116         currentPackEndValue->addIncoming(iBuilder->getInt64(0), increasePackIndexStartBlock);
    117         currentPackIndex->addIncoming(iBuilder->CreateAdd(currentPackIndex, iBuilder->getSize(1)), increasePackIndexStartBlock);
    118         currentProcessIndex->addIncoming(currentProcessIndex, increasePackIndexStartBlock);
    119         carryBit->addIncoming(newCarryBit, increasePackIndexStartBlock);
    120 
    121 
    122         isHandleStart->addIncoming(iBuilder->getInt1(true), increasePackIndexStartBlock);
    123         iBuilder->CreateBr(dataLoopCon);
    124 
    125         //markBitOneStartBlock
    126         iBuilder->SetInsertPoint(markBitOneStartBlock);
    127         Value* maskedBitOneIndex = iBuilder->CreateAnd(bitOneStartIndex, iBuilder->getSize(WORD_WIDTH - 1));
    128         Value* newPackValue = iBuilder->CreateOr(
    129                 currentPackStartValue,
    130                 iBuilder->CreateShl(iBuilder->getInt64(1), maskedBitOneIndex)
    131         );
    132         currentPackStartValue->addIncoming(newPackValue, markBitOneStartBlock);
    133         currentPackEndValue->addIncoming(currentPackEndValue, markBitOneStartBlock);
    134         currentPackIndex->addIncoming(currentPackIndex, markBitOneStartBlock);
    135         currentProcessIndex->addIncoming(currentProcessIndex, markBitOneStartBlock);
    136         isHandleStart->addIncoming(iBuilder->getInt1(false), markBitOneStartBlock);
    137         carryBit->addIncoming(carryBit, markBitOneStartBlock);
    138         iBuilder->CreateBr(dataLoopCon);
    139 
    140 
    141 ///////////Handle Index End
    142         iBuilder->SetInsertPoint(dataEndLoopBlock);
    143         Value* bitOneEndIndex = this->generateLoadCircularInput(iBuilder, END_NUM_STREAM_NAME, currentProcessIndex, iBuilder->getInt64Ty()->getPointerTo());
    144         Value* bitOneEndTargetPackIndex = iBuilder->CreateLShr(bitOneEndIndex, iBuilder->getSize(LOG_2_WORD_WIDTH));
    145 
    146 
    147         BasicBlock* markBitOneEndBlock = iBuilder->CreateBasicBlock("mark_bit_one_end_block");
    148         BasicBlock* increasePackIndexEndBlock = iBuilder->CreateBasicBlock("increase_pack_index_end_block");
    149 
    150         iBuilder->CreateCondBr(iBuilder->CreateICmpULT(currentPackIndex, bitOneEndTargetPackIndex), increasePackIndexEndBlock, markBitOneEndBlock);
    151 
    152         //IncreasePackIndexEndBlock
    153         iBuilder->SetInsertPoint(increasePackIndexEndBlock);
    154         currentPackOutputPtr = this->getPackOutputPtr(iBuilder, currentPackIndex);
    155         iBuilder->CreateStore(iBuilder->CreateSub(
    156                 currentPackEndValue,
    157                 iBuilder->CreateAdd(currentPackStartValue, carryBit)
    158         ), currentPackOutputPtr);
    159 
    160 
    161 
    162         newCarryBit = iBuilder->CreateSelect(
    163                 iBuilder->CreateICmpUGT(iBuilder->CreateAdd(currentPackStartValue, carryBit), currentPackEndValue),
    164                 iBuilder->getInt64(1),
    165                 iBuilder->getInt64(0)
    166         );
    167         iBuilder->setProducedItemCount(OUTPUT_BIT_STREAM_NAME, iBuilder->CreateShl(currentPackIndex, LOG_2_WORD_WIDTH));
    168 
    169         currentPackStartValue->addIncoming(iBuilder->getInt64(0), increasePackIndexEndBlock);
    170         currentPackEndValue->addIncoming(iBuilder->getInt64(0), increasePackIndexEndBlock);
    171         currentPackIndex->addIncoming(iBuilder->CreateAdd(currentPackIndex, iBuilder->getSize(1)), increasePackIndexEndBlock);
    172         currentProcessIndex->addIncoming(currentProcessIndex, increasePackIndexEndBlock);
    173         isHandleStart->addIncoming(iBuilder->getInt1(false), increasePackIndexEndBlock);
    174         carryBit->addIncoming(newCarryBit, increasePackIndexEndBlock);
    175         iBuilder->CreateBr(dataLoopCon);
    176 
    177         //markBitOneEndBlock
    178         iBuilder->SetInsertPoint(markBitOneEndBlock);
    179         Value* maskedBitOneEndIndex = iBuilder->CreateAnd(bitOneEndIndex, iBuilder->getSize(WORD_WIDTH - 1));
    180         Value* newPackEndValue = iBuilder->CreateOr(
    181                 currentPackEndValue,
    182                 iBuilder->CreateShl(iBuilder->getInt64(1), maskedBitOneEndIndex)
    183         );
    184         currentPackStartValue->addIncoming(currentPackStartValue, markBitOneEndBlock);
    185         currentPackEndValue->addIncoming(newPackEndValue, markBitOneEndBlock);
    186         currentPackIndex->addIncoming(currentPackIndex, markBitOneEndBlock);
    187         currentProcessIndex->addIncoming(iBuilder->CreateAdd(currentProcessIndex, iBuilder->getSize(1)), markBitOneEndBlock);
    188         isHandleStart->addIncoming(iBuilder->getInt1(true), markBitOneEndBlock);
    189         carryBit->addIncoming(carryBit, markBitOneEndBlock);
    190         iBuilder->CreateBr(dataLoopCon);
    191 
    192         //dataLoopExit
    193         iBuilder->SetInsertPoint(dataLoopExit);
    194         iBuilder->setScalarField(CURRENT_PROCESS_INDEX_KEY, currentProcessIndex);
    195         iBuilder->setScalarField(CURRENT_PACK_INDEX_KEY, currentPackIndex);
    196         iBuilder->setScalarField(CURRENT_PACK_START_VALUE_KEY, currentPackStartValue);
    197         iBuilder->setScalarField(CURRENT_PACK_END_VALUE_KEY, currentPackEndValue);
    198         iBuilder->setScalarField(CARRY_BIT_KEY, carryBit);
    199 
    200 
    201         iBuilder->CreateStore(iBuilder->CreateSub(
    202                 currentPackEndValue,
    203                 iBuilder->CreateAdd(currentPackStartValue, carryBit)
    204         ), this->getPackOutputPtr(iBuilder, currentPackIndex));
    205         Value* lastBitOneIndex = this->generateLoadCircularInput(iBuilder, START_NUM_STREAM_NAME, iBuilder->CreateSub(currentProcessIndex, iBuilder->getSize(1)), iBuilder->getInt64Ty()->getPointerTo());
    206         iBuilder->setProducedItemCount(OUTPUT_BIT_STREAM_NAME, lastBitOneIndex);
    207 
    208         BasicBlock* exitBlock = iBuilder->CreateBasicBlock("exit_block");
    209 //        iBuilder->setProcessedItemCount(START_NUM_STREAM_NAME, numItemAvailable);
    210         iBuilder->CreateBr(exitBlock);
    211 
    212         iBuilder->SetInsertPoint(exitBlock);
    213     }
    214 
    215     Value* LZ4NumbersToBitstreamKernel::generateLoadCircularInput(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName, Value* offset, Type* pointerType) {
    216         size_t inputSize = this->getInputBufferSize(iBuilder, inputBufferName);
    217         Value* offsetMask = iBuilder->getSize(inputSize - 1);
    218         Value* maskedOffset = iBuilder->CreateAnd(offsetMask, offset);
    219 
    220         Value* inputBufferPtr = iBuilder->getRawInputPointer(inputBufferName, iBuilder->getSize(0));
    221 
    222         inputBufferPtr = iBuilder->CreatePointerCast(inputBufferPtr, pointerType);
    223         return iBuilder->CreateLoad(iBuilder->CreateGEP(inputBufferPtr, maskedOffset));
    224     }
    225 
    226     size_t LZ4NumbersToBitstreamKernel::getInputBufferSize(const unique_ptr<KernelBuilder> &iBuilder, string bufferName) {
    227         return this->getInputStreamSetBuffer(bufferName)->getBufferBlocks() * iBuilder->getStride();
    228     }
    229 
    230     size_t LZ4NumbersToBitstreamKernel::getOutputBufferSize(const unique_ptr<KernelBuilder> &iBuilder, string bufferName) {
    231         return this->getOutputStreamSetBuffer(bufferName)->getBufferBlocks() * iBuilder->getStride();
    232     }
    233 
    234     inline Value* LZ4NumbersToBitstreamKernel::getPackOutputPtr(const std::unique_ptr<KernelBuilder> & iBuilder, Value* packIndex) {
    235         Value* outputBasePtr = iBuilder->getRawOutputPointer(OUTPUT_BIT_STREAM_NAME, iBuilder->getSize(0));
    236         outputBasePtr = iBuilder->CreatePointerCast(outputBasePtr, iBuilder->getInt64Ty()->getPointerTo());
    237 
    238         size_t outputBufferSize = this->getOutputBufferSize(iBuilder, OUTPUT_BIT_STREAM_NAME);
    239         size_t outputPackSize = outputBufferSize / 64;
    240 
    241         Value* maskedPackIndex = iBuilder->CreateAnd(packIndex, iBuilder->getSize(outputPackSize - 1));
    242         return iBuilder->CreateGEP(outputBasePtr, maskedPackIndex);
    243     }
    244 
    245     LZ4NumbersToBitstreamKernel::LZ4NumbersToBitstreamKernel(std::string kernelName, const std::unique_ptr<kernel::KernelBuilder> &iBuilder)
    246     : MultiBlockKernel(string(kernelName),
    247     // Inputs
    248     {
    249         Binding{iBuilder->getStreamSetTy(1, 64), START_NUM_STREAM_NAME},
    250         Binding{iBuilder->getStreamSetTy(1, 64), END_NUM_STREAM_NAME}
    251     },
    252     //Outputs
    253     {
    254         Binding{iBuilder->getStreamSetTy(1, 1), OUTPUT_BIT_STREAM_NAME, UnknownRate()}
    255     },
    256     //Arguments
    257     {
    258         //TODO may need total length
    259     },
    260     {},
    261     //Internal states:
    262     {
    263         Binding{iBuilder->getSizeTy(), CURRENT_PROCESS_INDEX_KEY},
    264         Binding{iBuilder->getSizeTy(), CURRENT_PACK_INDEX_KEY},
    265         Binding{iBuilder->getInt64Ty(), CURRENT_PACK_START_VALUE_KEY},
    266         Binding{iBuilder->getInt64Ty(), CURRENT_PACK_END_VALUE_KEY},
    267         Binding{iBuilder->getInt64Ty(), CARRY_BIT_KEY}
    268 
    269     }) {
     74        Value *initCurrentItemIndex = iBuilder->CreateURem(itemProcessed, SIZE_BIT_BLOCK_WIDTH);
     75
     76        Value *initOutputIndex = SIZE_ZERO;
     77
     78//        Value *availableOutputBlocks = iBuilder->CreateSelect(mIsFinal, iBuilder->getSize(32), numOfStrides); //TODO workaround here
     79//        Value *availableOutputBlocks = numOfStrides;
     80//        Value *availableOutputBlocks = remainSpace;
     81        Value *availableOutputBlocks = iBuilder->CreateUMin(remainSpace, numOfStrides);
     82
     83        // TODO handle input pointer
     84        Value *inputStartBasePtr = iBuilder->getInputStreamBlockPtr(START_NUM_STREAM_NAME, SIZE_ZERO);
     85        inputStartBasePtr = iBuilder->CreatePointerCast(inputStartBasePtr, iBuilder->getInt64Ty()->getPointerTo());
     86        Value *inputEndBasePtr = iBuilder->getInputStreamBlockPtr(END_NUM_STREAM_NAME, SIZE_ZERO);
     87        inputEndBasePtr = iBuilder->CreatePointerCast(inputEndBasePtr, iBuilder->getInt64Ty()->getPointerTo());
     88        Value *outputBasePtr = iBuilder->getOutputStreamBlockPtr(OUTPUT_BIT_STREAM_NAME, SIZE_ZERO);
     89        Value *initCarryBit = iBuilder->getScalarField("carryBit");
     90
     91//        iBuilder->CallPrintInt("itemProcessed", itemProcessed);
     92//        iBuilder->CallPrintInt("inputStartBasePtr", inputStartBasePtr);
     93
     94        Value *initCurrentBlockStartData = iBuilder->getScalarField(PENDING_START_DATA_KEY);
     95        Value *initCurrentBlockEndData = iBuilder->getScalarField(PENDING_END_DATA_KEY);
     96
     97
     98        BasicBlock *multiBlockLoopConBlock = iBuilder->CreateBasicBlock("multiBlockLoopConBlock");
     99        BasicBlock *multiBlockLoopBodyBlock = iBuilder->CreateBasicBlock("multiBlockLoopBodyBlock");
     100        BasicBlock *multiBlockLoopExitBlock = iBuilder->CreateBasicBlock("multiBlockLoopExitBlock");
     101
     102        iBuilder->CreateBr(multiBlockLoopConBlock);
     103
     104        // multiBlockLoopConBlock
     105        iBuilder->SetInsertPoint(multiBlockLoopConBlock);
     106        PHINode *phiCurrentItemIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
     107        phiCurrentItemIndex->addIncoming(initCurrentItemIndex, entryBlock);
     108
     109        PHINode *phiCurrentOutputIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
     110        phiCurrentOutputIndex->addIncoming(initOutputIndex, entryBlock);
     111
     112        PHINode *phiCurrentBlockStartData = iBuilder->CreatePHI(iBuilder->getBitBlockType(), 2);
     113        phiCurrentBlockStartData->addIncoming(initCurrentBlockStartData, entryBlock);
     114
     115        PHINode *phiCurrentBlockEndData = iBuilder->CreatePHI(iBuilder->getBitBlockType(), 2);
     116        phiCurrentBlockEndData->addIncoming(initCurrentBlockEndData, entryBlock);
     117
     118        PHINode *phiCarryBit = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
     119        phiCarryBit->addIncoming(initCarryBit, entryBlock);
     120
     121
     122        // TODO It is possible that in final block, not all items have been processed, while the output buffer is not enough. This situation need to be verified later
     123        // phiCurrentItemIndex < itemsToDo && currentOutputIndex < availableOutputBlocks
     124//        iBuilder->CallPrintInt("phiCurrentItemIndex", phiCurrentItemIndex);
     125//        iBuilder->CallPrintInt("aaa", iBuilder->CreateAdd(itemsToDo, initCurrentItemIndex));
     126        iBuilder->CreateCondBr(
     127                iBuilder->CreateAnd(
     128                        iBuilder->CreateICmpULT(phiCurrentItemIndex, iBuilder->CreateAdd(itemsToDo,
     129                                                                                         initCurrentItemIndex)), //TODO should not be itemsToDo here, may be itemsToDo + initCurrentItemIndex
     130                        iBuilder->CreateICmpULT(phiCurrentOutputIndex, availableOutputBlocks)
     131                ),
     132                multiBlockLoopBodyBlock,
     133                multiBlockLoopExitBlock
     134        );
     135
     136        // multiBlockLoopBodyBlock
     137        iBuilder->SetInsertPoint(multiBlockLoopBodyBlock);
     138
     139        Value *currentOutputGlobalIndex = iBuilder->CreateAdd(phiCurrentOutputIndex, oldProducedOutputBlockIndex);
     140
     141        // StartBits
     142        Value *currentStartPos = iBuilder->CreateLoad(iBuilder->CreateGEP(inputStartBasePtr, phiCurrentItemIndex));
     143        Value *currentStartGlobalBlockIndex = iBuilder->CreateUDiv(currentStartPos, SIZE_BIT_BLOCK_WIDTH);
     144//        Value *currentStartLocalBlockIndex = iBuilder->CreateSub(currentStartGlobalBlockIndex,
     145//                                                                 oldProducedOutputBlockIndex);
     146//        iBuilder->CallPrintInt("currentStartLocalBlockIndex", currentStartLocalBlockIndex); //TODO overflow here
     147
     148
     149        Value *currentStartLocalBlockOffset = iBuilder->CreateURem(currentStartPos,
     150                                                                   SIZE_BIT_BLOCK_WIDTH); // 0 ~ BIT_BLOCK_WIDTH
     151
     152        Value *newBlockStartData = this->setIntVectorBitOne(iBuilder, phiCurrentBlockStartData,
     153                                                            currentStartLocalBlockOffset,
     154                                                            iBuilder->CreateICmpEQ(currentStartGlobalBlockIndex,
     155                                                                                   currentOutputGlobalIndex));
     156//        iBuilder->CallPrintRegister("phiCurrentBlockStartData", phiCurrentBlockStartData);
     157//        iBuilder->CallPrintRegister("newBlockStartData", newBlockStartData);
     158//        iBuilder->CallPrintInt("currentStartPos", currentStartPos);
     159//        iBuilder->CallPrintInt("----", SIZE_ZERO);
     160
     161
     162        // EndBits
     163        Value *currentEndPos = iBuilder->CreateLoad(iBuilder->CreateGEP(inputEndBasePtr, phiCurrentItemIndex));
     164        Value *currentEndGlobalBlockIndex = iBuilder->CreateUDiv(currentEndPos, SIZE_BIT_BLOCK_WIDTH);
     165//        Value *currentEndLocalBlockIndex = iBuilder->CreateSub(currentEndGlobalBlockIndex, oldProducedOutputBlockIndex);
     166
     167        Value *currentEndLocalBlockOffset = iBuilder->CreateURem(currentEndPos,
     168                                                                 SIZE_BIT_BLOCK_WIDTH); // 0 ~ BIT_BLOCK_WIDTH
     169
     170
     171        Value *newBlockEndData = this->setIntVectorBitOne(iBuilder, phiCurrentBlockEndData, currentEndLocalBlockOffset,
     172                                                          iBuilder->CreateICmpEQ(currentEndGlobalBlockIndex,
     173                                                                                 currentOutputGlobalIndex));
     174//            iBuilder->CallPrintInt("%%%currentEndPos", currentEndPos);
     175//            iBuilder->CallPrintRegister("%%%newBlockEndData", newBlockEndData);
     176//        iBuilder->CallPrintInt("currentEndPos", currentEndPos);
     177
     178        Value *enterNewOutputBlock = iBuilder->CreateOr(
     179                iBuilder->CreateICmpUGT(currentStartGlobalBlockIndex, currentOutputGlobalIndex),
     180                iBuilder->CreateICmpUGT(currentEndGlobalBlockIndex, currentOutputGlobalIndex)
     181        );
     182
     183
     184        Value *carryBitIntVec = iBuilder->CreateInsertElement(BIT_BLOCK_ZERO, phiCarryBit, (uint64_t) 0);
     185        Value *newBlockStartWithCarry = iBuilder->simd_add(BIT_BLOCK_WIDTH, newBlockStartData, carryBitIntVec);
     186
     187
     188        // Avoid branch mis-prediction by always storing output block
     189        Value *outputData = iBuilder->simd_sub(BIT_BLOCK_WIDTH, newBlockEndData, newBlockStartWithCarry);
     190//        iBuilder->CallPrintInt("----store", iBuilder->getSize(0));
     191//        iBuilder->CallPrintInt("carry", phiCarryBit);
     192//        iBuilder->CallPrintRegister("newBlockEndData", newBlockEndData);
     193//        iBuilder->CallPrintRegister("newBlockStartWithCarry", newBlockStartWithCarry);
     194//        iBuilder->CallPrintInt("----outputPtr", iBuilder->CreateGEP(outputBasePtr, phiCurrentOutputIndex));
     195//        iBuilder->CallPrintRegister("outputData", outputData);
     196        iBuilder->CreateBlockAlignedStore(outputData, iBuilder->CreateGEP(outputBasePtr, phiCurrentOutputIndex));
     197
     198        // Handle PHINodes
     199
     200        // When currentStartLocalBlockIndex < phiCurrentOutputIndex && currentEndLocalBlockIndex < phiCurrentOutputIndex
     201        // this round of loop will do nothing, and currentItemIndex += 1
     202        phiCurrentItemIndex->addIncoming(
     203                iBuilder->CreateSelect(
     204                        enterNewOutputBlock,
     205                        phiCurrentItemIndex,
     206                        iBuilder->CreateAdd(phiCurrentItemIndex, SIZE_ONE)
     207                ),
     208                iBuilder->GetInsertBlock()
     209        );
     210
     211        phiCurrentOutputIndex->addIncoming(
     212                iBuilder->CreateSelect(
     213                        enterNewOutputBlock,
     214                        iBuilder->CreateAdd(phiCurrentOutputIndex, SIZE_ONE),
     215                        phiCurrentOutputIndex
     216                ),
     217                iBuilder->GetInsertBlock()
     218        );
     219
     220        phiCurrentBlockStartData->addIncoming(
     221                iBuilder->CreateSelect(
     222                        enterNewOutputBlock,
     223                        BIT_BLOCK_ZERO,
     224                        newBlockStartData
     225                ),
     226                iBuilder->GetInsertBlock()
     227        );
     228
     229        phiCurrentBlockEndData->addIncoming(
     230                iBuilder->CreateSelect(
     231                        enterNewOutputBlock,
     232                        BIT_BLOCK_ZERO,
     233                        newBlockEndData
     234                ),
     235                iBuilder->GetInsertBlock()
     236        );
     237
     238        Value *newCarryBit = iBuilder->CreateSelect(this->intVecGT(iBuilder, newBlockStartWithCarry, newBlockEndData),
     239                                                    INT64_ONE, INT64_ZERO);
     240
     241//        iBuilder->CallPrintInt("newCarryBit", newCarryBit );
     242
     243        phiCarryBit->addIncoming(
     244                iBuilder->CreateSelect(
     245                        enterNewOutputBlock,
     246                        newCarryBit,
     247                        phiCarryBit
     248                ),
     249                iBuilder->GetInsertBlock()
     250        );
     251
     252
     253        iBuilder->CreateBr(multiBlockLoopConBlock);
     254
     255        // multiBlockLoopExitBlock
     256        iBuilder->SetInsertPoint(multiBlockLoopExitBlock);
     257
     258        iBuilder->setScalarField(PENDING_START_DATA_KEY, phiCurrentBlockStartData);
     259        iBuilder->setScalarField(PENDING_END_DATA_KEY, phiCurrentBlockEndData);
     260        iBuilder->setScalarField("carryBit", phiCarryBit);
     261
     262        carryBitIntVec = iBuilder->CreateInsertElement(BIT_BLOCK_ZERO, phiCarryBit, (uint64_t) 0);
     263        Value *finalOutputData = iBuilder->simd_sub(
     264                BIT_BLOCK_WIDTH,
     265                phiCurrentBlockEndData,
     266                iBuilder->simd_add(BIT_BLOCK_WIDTH, phiCurrentBlockStartData, carryBitIntVec)
     267        );
     268//        iBuilder->CallPrintRegister("%%%phiCurrentBlockEndData", phiCurrentBlockEndData);
     269//            iBuilder->CallPrintInt("----outputPtrFinal", iBuilder->CreateGEP(outputBasePtr, phiCurrentOutputIndex));
     270
     271        BasicBlock *storeFinalBlock = iBuilder->CreateBasicBlock("storeFinalBlock");
     272        BasicBlock *storeFinalBlockEnd = iBuilder->CreateBasicBlock("storeFinalBlockEnd");
     273
     274        iBuilder->CreateUnlikelyCondBr(isFinalBlock, storeFinalBlock, storeFinalBlockEnd);
     275        iBuilder->SetInsertPoint(storeFinalBlock);
     276
     277//        iBuilder->CallPrintRegister("finalOutputData", finalOutputData);
     278        iBuilder->CreateBlockAlignedStore(finalOutputData, iBuilder->CreateGEP(outputBasePtr,
     279                                                                   phiCurrentOutputIndex)); //Possible overflow here if this store always happen
     280        iBuilder->CreateBr(storeFinalBlockEnd);
     281        iBuilder->SetInsertPoint(storeFinalBlockEnd);
     282
     283        // Processed Item Count and Produced Item Count
     284        Value *newProcessedItemCount = iBuilder->CreateAdd(iBuilder->getProcessedItemCount(START_NUM_STREAM_NAME),
     285                                                           iBuilder->CreateSub(phiCurrentItemIndex,
     286                                                                               initCurrentItemIndex));
     287
     288
     289        iBuilder->setProcessedItemCount(START_NUM_STREAM_NAME, newProcessedItemCount);
     290        iBuilder->setProcessedItemCount(END_NUM_STREAM_NAME, newProcessedItemCount);
     291
     292        Value *lastEndPos = iBuilder->CreateLoad(
     293                iBuilder->CreateGEP(inputEndBasePtr, iBuilder->CreateSub(phiCurrentItemIndex, SIZE_ONE)));
     294//        iBuilder->CallPrintInt("lastEndPos", lastEndPos);
     295
     296        iBuilder->setProducedItemCount(OUTPUT_BIT_STREAM_NAME,
     297                                       iBuilder->CreateSelect(
     298                                               isFinalBlock,
     299                                               lastEndPos,
     300                                               iBuilder->CreateAdd(
     301                                                       iBuilder->CreateMul(phiCurrentOutputIndex, SIZE_BIT_BLOCK_WIDTH),
     302                                                       iBuilder->getProducedItemCount(OUTPUT_BIT_STREAM_NAME)
     303                                               )
     304                                       )
     305        );
     306//        iBuilder->CallPrintInt("isFinalBlock", isFinalBlock);
     307//        iBuilder->CallPrintInt("producedItemCount", iBuilder->getProducedItemCount(OUTPUT_BIT_STREAM_NAME));
     308    }
     309
     310    size_t LZ4NumbersToBitstreamKernel::getAnyBufferSize(const std::unique_ptr<KernelBuilder> &iBuilder,
     311                                                          std::string bufferName) {
     312        return this->getAnyStreamSetBuffer(bufferName)->getBufferBlocks() * iBuilder->getStride();
     313    }
     314
     315    /*
     316     * iBuilder: kernel builder
     317     * intVec: BitBlockType, <4 * i64>
     318     * pos: size_t, 0 - 256, position of bit 1
     319     * isSet: i1, when isSet == true, bit 1 will be set, otherwise this function do nothing
     320     * */
     321    Value *LZ4NumbersToBitstreamKernel::setIntVectorBitOne(const std::unique_ptr<KernelBuilder> &iBuilder,
     322                                                            llvm::Value *intVec, llvm::Value *pos, llvm::Value *isSet) {
     323        Value *SIZE_64 = iBuilder->getSize(64); //TODO assume bit block type will always be <4 * i64>
     324        Value *blockIndex = iBuilder->CreateUDiv(pos, SIZE_64);
     325        Value *blockOffset = iBuilder->CreateURem(pos, SIZE_64);
     326
     327        Value *oldValue = iBuilder->CreateExtractElement(intVec, blockIndex);
     328        // Use select to avoid branch misprediction
     329        Value *bitOneValue = iBuilder->CreateShl(
     330                iBuilder->CreateSelect(isSet, iBuilder->getInt64(1), iBuilder->getInt64(0)),
     331                blockOffset
     332        );
     333        Value *newValue = iBuilder->CreateOr(oldValue, bitOneValue);
     334        return iBuilder->CreateInsertElement(intVec, newValue, blockIndex);
     335    }
     336
     337    Value *LZ4NumbersToBitstreamKernel::intVecGT(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value *intVec1,
     338                                                  llvm::Value *intVec2) {
     339        unsigned int BIT_BLOCK_WIDTH = iBuilder->getBitBlockWidth();
     340        Value *gt = iBuilder->simd_ugt(BIT_BLOCK_WIDTH, intVec1, intVec2);
     341        return iBuilder->CreateNot(iBuilder->CreateICmpEQ(iBuilder->CreateExtractElement(gt, (uint64_t) 0),
     342                                                          iBuilder->getIntN(BIT_BLOCK_WIDTH, 0)));
     343    }
     344
     345
     // Constructs the kernel on top of MultiBlockKernel under the given kernel name.
     // Bindings passed to the base class:
     //   - inputs: START_NUM_STREAM_NAME and END_NUM_STREAM_NAME, each of type
     //     getStreamSetTy(1, 64), rate BoundedRate(0, 1), with AlwaysConsume();
     //   - output: OUTPUT_BIT_STREAM_NAME of type getStreamSetTy(1, 1), rate BoundedRate(0, 1);
     //   - internal state: two bit-block scalars (PENDING_START_DATA_KEY, PENDING_END_DATA_KEY)
     //     and a 64-bit integer scalar named "carryBit".
     // The kernel is also given the MustExplicitlyTerminate() attribute.
     346    LZ4NumbersToBitstreamKernel::LZ4NumbersToBitstreamKernel(std::string kernelName,
     347                                                               const std::unique_ptr<kernel::KernelBuilder> &iBuilder)
     348            : MultiBlockKernel(string(kernelName),
     349            // Inputs
     350                               {
     351                                       Binding{iBuilder->getStreamSetTy(1, 64), START_NUM_STREAM_NAME,
     352                                               BoundedRate(0, 1), AlwaysConsume()},
     353                                       Binding{iBuilder->getStreamSetTy(1, 64), END_NUM_STREAM_NAME, BoundedRate(0, 1),
     354                                               AlwaysConsume()}
     355                               },
     356            //Outputs
     357                               {
     358//                                       Binding{iBuilder->getStreamSetTy(1, 1), OUTPUT_BIT_STREAM_NAME,
     359//                                           UnknownRate()}
     360                                       Binding{iBuilder->getStreamSetTy(1, 1), OUTPUT_BIT_STREAM_NAME,
     361                                                   BoundedRate(0, 1)}   //TODO BoundedRate is a workaround, it should be UnknownRate in the future
     362                               },
     363            //Arguments
     364                               {
     365                               },
     366                               {},
     367            //Internal states:
     368                               {
     369                    Binding(iBuilder->getBitBlockType(), PENDING_START_DATA_KEY),
     370                    Binding(iBuilder->getBitBlockType(), PENDING_END_DATA_KEY),
     371                    Binding(iBuilder->getIntNTy(64), "carryBit"),
     372            }) {
     373//        addAttribute(CanTerminateEarly());
    270374//        setNoTerminateAttribute(true);
    271     }
    272 
     375        addAttribute(MustExplicitlyTerminate());
     376    }
    273377}
    274 
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_numbers_to_bitstream_kernel.h

    r5864 r5885  
    1 //
    2 // Created by wxy325 on 2017/8/9.
    3 //
    41
    5 #ifndef ICGREP_LZ4_NUMBERS_TO_BITSTREAM_KERNEL_H
    6 #define ICGREP_LZ4_NUMBERS_TO_BITSTREAM_KERNEL_H
    7 #include <string>
    8 
     2#ifndef ICGREP_LZ4_NUMBERS_TO_BITSTREAM_KERNEL2_H
     3#define ICGREP_LZ4_NUMBERS_TO_BITSTREAM_KERNEL2_H
    94#include "kernels/kernel.h"
    105
     
    1712
    1813namespace IDISA { class IDISA_Builder; }
     14
    1915namespace kernel {
    20 
    // Multi-block kernel declared final and derived from MultiBlockKernel.
    // NOTE(review): judging by the names, it turns the start/end number streams
    // into an output bit stream -- confirm against generateMultiBlockLogic.
    21     class LZ4NumbersToBitstreamKernel final : public MultiBlockKernel {
     16class LZ4NumbersToBitstreamKernel final : public MultiBlockKernel {
    2217    public:
    2318        LZ4NumbersToBitstreamKernel(std::string kernelName, const std::unique_ptr<kernel::KernelBuilder> &iBuilder);
    2419    protected:
    2520        void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides) override;
    26 //        void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> &iBuilder) override;
    2721    private:
    28         inline llvm::Value* generateLoadCircularInput(const std::unique_ptr<KernelBuilder> &iBuilder, std::string inputBufferName, llvm::Value* offset, llvm::Type* pointerType);
    29         inline size_t getInputBufferSize(const std::unique_ptr<KernelBuilder> &iBuilder, std::string bufferName);
    30         inline size_t getOutputBufferSize(const std::unique_ptr<KernelBuilder> &iBuilder, std::string bufferName);
    31         inline llvm::Value* getPackOutputPtr(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value* packIndex);
        // Returns the named buffer's block count multiplied by the builder's stride.
     22        inline size_t getAnyBufferSize(const std::unique_ptr<KernelBuilder> &iBuilder, std::string bufferName);
        // Emits IR that ORs bit `pos` (lane pos / 64, bit pos % 64) into intVec when isSet is true.
     23        llvm::Value* setIntVectorBitOne(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value* intVec, llvm::Value* pos, llvm::Value* isSet);
        // Emits IR that is true when field 0 of a block-wide simd_ugt(intVec1, intVec2) is non-zero.
     24        inline llvm::Value* intVecGT(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value* intVec1, llvm::Value* intVec2);
    3225    };
    3326}
    3427
    3528
    36 
    37 #endif //ICGREP_LZ4_NUMBERS_TO_BITSTREAM_KERNEL_H
     29#endif //ICGREP_LZ4_NUMBERS_TO_BITSTREAM_KERNEL2_H
Note: See TracChangeset for help on using the changeset viewer.