Ignore:
Timestamp:
Apr 19, 2018, 4:06:04 PM (16 months ago)
Author:
xwa163
Message:

Improve performance of swizzled_match_copy_kernel by adjusting loop structure

Location:
icGREP/icgrep-devel/icgrep/kernels/lz4
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_swizzled_match_copy_kernel.cpp

    r5974 r5981  
    5959}
    6060
    61 Value* LZ4SwizzledMatchCopyKernel::loadNextMatchOffset(const unique_ptr<KernelBuilder> &iBuilder) {
     61pair<Value*, Value*> LZ4SwizzledMatchCopyKernel::loadNextMatchOffset(const unique_ptr<KernelBuilder> &iBuilder) {
    6262    Value* initCurrentPos = iBuilder->CreateAdd(iBuilder->getScalarField("currentOffsetMarkerPos"), iBuilder->getSize(1));
    6363    Value* newPosition = this->advanceUntilNextBit(iBuilder, "MatchOffsetMarker", initCurrentPos, true);
    6464
    6565    // Load Match Offset from newPosition
    66     iBuilder->setScalarField("currentOffsetMarkerPos", newPosition);
    67     iBuilder->setProcessedItemCount("MatchOffsetMarker", newPosition);
    68 
    6966    Value* matchOffsetPtr = iBuilder->getRawInputPointer("byteStream", newPosition);
    7067    // For now, it is safe to cast the matchOffset pointer to i16 since the input byte stream is always linearly available
     
    7269    Value* matchOffset = iBuilder->CreateZExt(iBuilder->CreateLoad(matchOffsetPtr), iBuilder->getSizeTy());
    7370
    74     return matchOffset;
     71    return std::make_pair(matchOffset, newPosition);
    7572}
    7673
     
    7976    Value* m0Start = this->advanceUntilNextBit(iBuilder, "M0Marker", initCurrentPos, true);
    8077    Value* m0End = this->advanceUntilNextBit(iBuilder, "M0Marker", m0Start, false);
    81     iBuilder->setScalarField("currentM0MarkerPos", m0End);
    8278    return std::make_pair(m0Start, m0End);
    8379};
    8480
    8581
    86 
    87 
    8882void LZ4SwizzledMatchCopyKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
     83    ConstantInt * const SIZE_4_MEGS = iBuilder->getSize(4 * 1024 * 1024);
     84
     85    BasicBlock * const entryBlock = iBuilder->GetInsertBlock();
     86
     87    Value * const available = iBuilder->getAvailableItemCount("sourceStreamSet0");
     88    Value * const processed = iBuilder->getProcessedItemCount("sourceStreamSet0");
     89
     90    Value * const itemsToDo = iBuilder->CreateUMin(iBuilder->CreateSub(available, processed), SIZE_4_MEGS);
     91    iBuilder->setTerminationSignal(iBuilder->CreateICmpULT(itemsToDo, SIZE_4_MEGS));
     92
     93
     94    // Output Copy
     95    generateOutputCopy(iBuilder);
     96
     97    Value * const toProcessItemCount = iBuilder->CreateAdd(processed, itemsToDo);
     98
     99    // Match Copy
     100    Value *initM0StartProcessIndex = iBuilder->getProcessedItemCount("M0CountMarker");
     101    Value *totalM0StartItemsCount = iBuilder->getAvailableItemCount("M0CountMarker");
     102
     103    BasicBlock * const matchCopyLoopCon = iBuilder->CreateBasicBlock("matchCopyLoopCon");
     104    BasicBlock * const processExitBlock = iBuilder->CreateBasicBlock("exit_block");
     105
     106    BasicBlock * const loadNextMatchInfoBodyBlock = iBuilder->CreateBasicBlock("loadNewMatchInfoBodyBlock");
     107    BasicBlock * const matchCopyConBlock = iBuilder->CreateBasicBlock("matchCopyConBlock");
     108    BasicBlock * const matchCopyBodyBlock = iBuilder->CreateBasicBlock("matchCopyBodyBlock");
     109
     110
     111    iBuilder->CreateBr(matchCopyLoopCon);
     112
     113    iBuilder->SetInsertPoint(matchCopyLoopCon);
     114    PHINode * const phiProcessIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
     115    phiProcessIndex->addIncoming(initM0StartProcessIndex, entryBlock);
     116
     117    Value * const hasMoreMatchInfo = iBuilder->CreateICmpULT(phiProcessIndex, totalM0StartItemsCount);
     118
     119    iBuilder->CreateCondBr(hasMoreMatchInfo, loadNextMatchInfoBodyBlock, processExitBlock);
     120
     121    iBuilder->SetInsertPoint(loadNextMatchInfoBodyBlock);
     122
     123    auto ret = this->loadNextM0StartEnd(iBuilder);
     124    Value *newM0Start = ret.first;
     125    Value *newM0End = ret.second;
     126
     127    auto matchOffsetRet = this->loadNextMatchOffset(iBuilder);
     128    Value *newMatchOffset = matchOffsetRet.first;
     129    Value* newMatchOffsetPos = matchOffsetRet.second;
     130
     131    Value * const newMatchLength = iBuilder->CreateAdd(iBuilder->CreateSub(newM0End, newM0Start), iBuilder->getInt64(1));
     132
     133    iBuilder->CreateBr(matchCopyConBlock);
     134    iBuilder->SetInsertPoint(matchCopyConBlock);
     135
     136    Value * const hasNotReachEnd = iBuilder->CreateICmpULT(newM0Start, toProcessItemCount);
     137    iBuilder->CreateLikelyCondBr(hasNotReachEnd, matchCopyBodyBlock, processExitBlock);
     138
     139    iBuilder->SetInsertPoint(matchCopyBodyBlock);
     140
     141    iBuilder->setScalarField("currentOffsetMarkerPos", newMatchOffsetPos);
     142    iBuilder->setProcessedItemCount("MatchOffsetMarker", newMatchOffsetPos);
     143    iBuilder->setScalarField("currentM0MarkerPos", newM0End);
     144    iBuilder->setProcessedItemCount("M0Marker", newM0End);
     145
     146
     147    BasicBlock* copyLoopCon = iBuilder->CreateBasicBlock("copyLoopCon");
     148    BasicBlock* copyLoopBody = iBuilder->CreateBasicBlock("copyLoopBody");
     149    iBuilder->CreateBr(copyLoopCon);
     150    iBuilder->SetInsertPoint(copyLoopCon);
     151    PHINode* phiMatchLength = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
     152    PHINode* phiMatchPos = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
     153
     154    phiMatchLength->addIncoming(newMatchLength, matchCopyBodyBlock);
     155    phiMatchPos->addIncoming(newM0Start, matchCopyBodyBlock);
     156
     157    phiProcessIndex->addIncoming(iBuilder->CreateAdd(phiProcessIndex, iBuilder->getSize(1)), iBuilder->GetInsertBlock());
     158
     159
     160    iBuilder->CreateLikelyCondBr(iBuilder->CreateICmpNE(phiMatchLength, iBuilder->getSize(0)), copyLoopBody, matchCopyLoopCon);
     161
     162    iBuilder->SetInsertPoint(copyLoopBody);
     163    Value* copySize = this->doMatchCopy(iBuilder, phiMatchPos, newMatchOffset, phiMatchLength);
     164    phiMatchLength->addIncoming(iBuilder->CreateSub(phiMatchLength, copySize), iBuilder->GetInsertBlock());
     165    phiMatchPos->addIncoming(iBuilder->CreateAdd(phiMatchPos, copySize), iBuilder->GetInsertBlock());
     166    iBuilder->CreateBr(copyLoopCon);
     167
     168    iBuilder->SetInsertPoint(processExitBlock);
     169    iBuilder->setProcessedItemCount("M0CountMarker", phiProcessIndex);
     170    iBuilder->setProcessedItemCount("M0Marker", toProcessItemCount);
     171    iBuilder->setProcessedItemCount("sourceStreamSet0", toProcessItemCount);
     172    iBuilder->setScalarField("currentM0MarkerPos", toProcessItemCount);
     173
     174}
     175
     176llvm::Value* LZ4SwizzledMatchCopyKernel::doMatchCopy(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value* phiMatchPos, llvm::Value* phiMatchOffset, llvm::Value* phiMatchLength) {
    89177    ConstantInt * const SIZE_ZERO = iBuilder->getSize(0);
    90178    ConstantInt * const SIZE_ONE = iBuilder->getSize(1);
    91179    ConstantInt * const SIZE_PDEP_WIDTH = iBuilder->getSize(mPDEPWidth);
    92     ConstantInt * const SIZE_4_MEGS = iBuilder->getSize(4 * 1024 * 1024);
    93180    ConstantInt * const SIZE_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
    94181
    95     BasicBlock * const entryBlock = iBuilder->GetInsertBlock();
    96 
    97     Value * const available = iBuilder->getAvailableItemCount("sourceStreamSet0");
    98     Value * const processed = iBuilder->getProcessedItemCount("sourceStreamSet0");
    99 
    100     Value * const itemsToDo = iBuilder->CreateUMin(iBuilder->CreateSub(available, processed), SIZE_4_MEGS);
    101     iBuilder->setTerminationSignal(iBuilder->CreateICmpULT(itemsToDo, SIZE_4_MEGS));
    102 
    103     Value * previousProducedItemCount = iBuilder->getProducedItemCount("outputStreamSet0");
    104 
    105     // Output Copy
    106     generateOutputCopy(iBuilder);
    107 
    108     Value * const toProcessItemCount = iBuilder->CreateAdd(processed, itemsToDo);
    109 
    110     // Match Copy
    111     Value *initM0StartProcessIndex = iBuilder->getProcessedItemCount("M0CountMarker");
    112     Value *totalM0StartItemsCount = iBuilder->getAvailableItemCount("M0CountMarker");
    113 
    114     Value * const initMatchOffset = iBuilder->getScalarField("pendingMatchOffset");
    115     Value * const initMatchLength = iBuilder->getScalarField("pendingMatchLength");
    116     Value * const initMatchPos = iBuilder->getScalarField("pendingMatchPos");
    117 
    118     BasicBlock * const matchCopyLoopCon = iBuilder->CreateBasicBlock("matchCopyLoopCon");
    119     iBuilder->CreateBr(matchCopyLoopCon);
    120 
    121     iBuilder->SetInsertPoint(matchCopyLoopCon);
    122     PHINode * const phiProcessIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
    123     phiProcessIndex->addIncoming(initM0StartProcessIndex, entryBlock);
    124     PHINode * const phiMatchOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
    125     phiMatchOffset->addIncoming(initMatchOffset, entryBlock);
    126     PHINode * const phiMatchLength = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
    127     phiMatchLength->addIncoming(initMatchLength, entryBlock);
    128     PHINode * const phiMatchPos = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
    129     phiMatchPos->addIncoming(initMatchPos, entryBlock);
    130 
    131     BasicBlock * const loadNextMatchInfoConBlock = iBuilder->CreateBasicBlock("loadNewMatchInfoConBlock");
    132     BasicBlock * const loadNextMatchInfoBodyBlock = iBuilder->CreateBasicBlock("loadNewMatchInfoBodyBlock");
    133 
    134     BasicBlock * const matchCopyConBlock = iBuilder->CreateBasicBlock("matchCopyConBlock");
    135     BasicBlock * const matchCopyBodyBlock = iBuilder->CreateBasicBlock("matchCopyBodyBlock");
    136 
    137     iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(phiMatchLength, SIZE_ZERO), loadNextMatchInfoConBlock, matchCopyConBlock);
    138 
    139     iBuilder->SetInsertPoint(loadNextMatchInfoConBlock);
    140     Value * const hasMoreMatchInfo = iBuilder->CreateICmpULT(phiProcessIndex, totalM0StartItemsCount);
    141     BasicBlock * const processExitBlock = iBuilder->CreateBasicBlock("exit_block");
    142     iBuilder->CreateCondBr(hasMoreMatchInfo, loadNextMatchInfoBodyBlock, processExitBlock);
    143 
    144     iBuilder->SetInsertPoint(loadNextMatchInfoBodyBlock);
    145 
    146     auto ret = this->loadNextM0StartEnd(iBuilder);
    147     Value *newM0Start = ret.first;
    148     Value *newM0End = ret.second;
    149     iBuilder->setProcessedItemCount("M0Marker", newM0End);
    150     Value *newMatchOffset = this->loadNextMatchOffset(iBuilder);
    151 
    152 
    153 
    154     Value * const newMatchLength = iBuilder->CreateAdd(iBuilder->CreateSub(newM0End, newM0Start), iBuilder->getInt64(1));
    155 
    156     phiProcessIndex->addIncoming(iBuilder->CreateAdd(phiProcessIndex, SIZE_ONE), iBuilder->GetInsertBlock());
    157 
    158     phiMatchPos->addIncoming(newM0Start, iBuilder->GetInsertBlock());
    159     phiMatchOffset->addIncoming(newMatchOffset, iBuilder->GetInsertBlock());
    160     phiMatchLength->addIncoming(newMatchLength, iBuilder->GetInsertBlock());
    161 
    162     iBuilder->CreateBr(matchCopyLoopCon);
    163 
    164     iBuilder->SetInsertPoint(matchCopyConBlock);
    165 
    166     Value * const hasNotReachEnd = iBuilder->CreateICmpULT(phiMatchPos, toProcessItemCount);
    167     iBuilder->CreateCondBr(hasNotReachEnd, matchCopyBodyBlock, processExitBlock);
    168 
    169     iBuilder->SetInsertPoint(matchCopyBodyBlock);
    170 
    171     Value * const matchCopyTargetPos = iBuilder->CreateSub(phiMatchPos, previousProducedItemCount);
    172     Value * const matchCopyTargetBlockIndex = iBuilder->CreateUDiv(matchCopyTargetPos, SIZE_BLOCK_WIDTH);
    173     Value * const matchCopyTargetStreamIndex = iBuilder->CreateUDiv(iBuilder->CreateURem(matchCopyTargetPos, SIZE_BLOCK_WIDTH), SIZE_PDEP_WIDTH); // should SIZE_PDEP_WIDTH be SIZE_STREAM_COUNT?
     182    ConstantInt * const outputBufferBlocks = iBuilder->getSize(this->getAnyStreamSetBuffer("outputStreamSet0")->getBufferBlocks());
     183
     184    Value* matchPosLocalBlockIndex = iBuilder->CreateURem(iBuilder->CreateUDiv(phiMatchPos, SIZE_BLOCK_WIDTH), outputBufferBlocks);
     185    Value * const matchCopyTargetStreamIndex = iBuilder->CreateURem(iBuilder->CreateUDiv(phiMatchPos, SIZE_PDEP_WIDTH), iBuilder->getSize(mStreamCount));
    174186    Value * const matchCopyTargetBlockOffset = iBuilder->CreateURem(phiMatchPos, SIZE_PDEP_WIDTH);
    175187
    176     Value * const matchCopyFromPos = iBuilder->CreateSub(matchCopyTargetPos, phiMatchOffset);
    177     Value * const matchCopyFromBlockIndex = iBuilder->CreateUDiv(matchCopyFromPos, SIZE_BLOCK_WIDTH);
    178     Value * const matchCopyFromStreamIndex = iBuilder->CreateUDiv(iBuilder->CreateURem(matchCopyFromPos, SIZE_BLOCK_WIDTH), SIZE_PDEP_WIDTH);
     188    Value * const matchCopyFromPos = iBuilder->CreateSub(phiMatchPos, phiMatchOffset);
     189
     190    Value* matchCopyFromLocalBlockIndex = iBuilder->CreateURem(iBuilder->CreateUDiv(matchCopyFromPos, SIZE_BLOCK_WIDTH), outputBufferBlocks);
     191    Value * const matchCopyFromStreamIndex = iBuilder->CreateURem(iBuilder->CreateUDiv(matchCopyFromPos, SIZE_PDEP_WIDTH), iBuilder->getSize(mStreamCount));
    179192    Value * const matchCopyFromBlockOffset = iBuilder->CreateURem(matchCopyFromPos, SIZE_PDEP_WIDTH);
    180193
     
    182195    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchOffset);
    183196    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchLength);
    184     currentCopySize = iBuilder->CreateUMin(currentCopySize, iBuilder->CreateSub(toProcessItemCount, phiMatchPos));
    185197    currentCopySize = iBuilder->CreateSelect(iBuilder->CreateICmpEQ(currentCopySize, SIZE_ZERO), SIZE_ONE, currentCopySize); //Workaround for the last byte
    186198
     
    194206
    195207    for (unsigned i = 0; i < mStreamSize; i++) {
    196         Value * const matchCopyFromBlockPtr = iBuilder->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), matchCopyFromStreamIndex, matchCopyFromBlockIndex);
     208        Value* basePtr = iBuilder->CreatePointerCast(iBuilder->getRawOutputPointer("outputStreamSet" + std::to_string(i), SIZE_ZERO), iBuilder->getBitBlockType()->getPointerTo());
     209
     210        Value * const matchCopyFromBlockPtr = iBuilder->CreateGEP(basePtr, iBuilder->CreateAdd(iBuilder->CreateMul(matchCopyFromLocalBlockIndex, iBuilder->getSize(mStreamCount)), matchCopyFromStreamIndex));
    197211        Value * const fromBlockValue = iBuilder->CreateBlockAlignedLoad(matchCopyFromBlockPtr);
    198212
    199         Value * const outputTargetBlockPtr = iBuilder->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), matchCopyTargetStreamIndex, matchCopyTargetBlockIndex);
     213        Value * const outputTargetBlockPtr = iBuilder->CreateGEP(basePtr, iBuilder->CreateAdd(iBuilder->CreateMul(matchPosLocalBlockIndex, iBuilder->getSize(mStreamCount)), matchCopyTargetStreamIndex));
    200214        Value * const targetOriginalValue = iBuilder->CreateBlockAlignedLoad(outputTargetBlockPtr);
    201215
     
    207221        iBuilder->CreateStore(finalValue, outputTargetBlockPtr);
    208222    }
    209 
    210     phiProcessIndex->addIncoming(phiProcessIndex, matchCopyBodyBlock);
    211     phiMatchOffset->addIncoming(phiMatchOffset, matchCopyBodyBlock);
    212     phiMatchPos->addIncoming(iBuilder->CreateAdd(phiMatchPos, currentCopySize), matchCopyBodyBlock);
    213     phiMatchLength->addIncoming(iBuilder->CreateSub(phiMatchLength, currentCopySize), matchCopyBodyBlock);
    214 
    215     iBuilder->CreateBr(matchCopyLoopCon);
    216 
    217     iBuilder->SetInsertPoint(processExitBlock);
    218     iBuilder->setScalarField("pendingMatchOffset", phiMatchOffset);
    219     iBuilder->setScalarField("pendingMatchLength", phiMatchLength);
    220     iBuilder->setScalarField("pendingMatchPos", phiMatchPos);
    221     iBuilder->setProcessedItemCount("M0CountMarker", phiProcessIndex);
    222     iBuilder->setProcessedItemCount("sourceStreamSet0", toProcessItemCount);
     223    return currentCopySize;
    223224}
    224225
     
    233234}
    234235
    235 Value* LZ4SwizzledMatchCopyKernel::loadOffset(const std::unique_ptr<KernelBuilder> & iBuilder, const std::string & bufferName, Value* offset) {
    236     return iBuilder->CreateLoad(iBuilder->getRawInputPointer(bufferName, offset));
    237 }
    238236
    239237LZ4SwizzledMatchCopyKernel::LZ4SwizzledMatchCopyKernel(const std::unique_ptr<kernel::KernelBuilder> &iBuilder, unsigned streamCount/*=4*/, unsigned streamSize/*=2*/, unsigned swizzleFactor/*=4*/, unsigned PDEP_width/*64*/)
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_swizzled_match_copy_kernel.h

    r5974 r5981  
    1616        LZ4SwizzledMatchCopyKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned streamCount, unsigned streamSize, unsigned swizzleFactor, unsigned PDEP_width = 64);
    1717    protected:
    18 
    1918        void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & b) override;
    20 
    2119        void generateOutputCopy(const std::unique_ptr<KernelBuilder> & iBuilder);
    22 
    23         llvm::Value * loadOffset(const std::unique_ptr<KernelBuilder> &iBuilder, const std::string & bufferName, llvm::Value* offset);
    2420
    2521    private:
     
    2925        const unsigned mStreamSize;
    3026        const unsigned mStreamCount;
    31         llvm::Value* loadNextMatchOffset(const std::unique_ptr<KernelBuilder> &iBuilder);
     27        std::pair<llvm::Value*, llvm::Value*> loadNextMatchOffset(const std::unique_ptr<KernelBuilder> &iBuilder);
    3228        std::pair<llvm::Value*, llvm::Value*> loadNextM0StartEnd(const std::unique_ptr<KernelBuilder> &iBuilder);
    3329        llvm::Value *advanceUntilNextBit(const std::unique_ptr<KernelBuilder> &iBuilder, std::string inputName,
    3430                                          llvm::Value *startPos, bool isNextOne);
     31
     32        llvm::Value* doMatchCopy(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value* matchPos, llvm::Value* matchOffset, llvm::Value* matchLength);
    3533
    3634    };
Note: See TracChangeset for help on using the changeset viewer.