Changeset 5961 for icGREP/icgrep-devel


Ignore:
Timestamp:
Apr 11, 2018, 12:33:46 AM (16 months ago)
Author:
xwa163
Message:

Improve performance of lz4_index_builder by using memset to produce output

Location:
icGREP/icgrep-devel/icgrep/kernels/lz4
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_index_builder.cpp

    r5959 r5961  
    149149                iBuilder->getSize(1));
    150150
    151         // TODO Clear Output Buffer at the beginning instead of marking 0
    152         this->markCircularOutputBitstream(iBuilder, "deletionMarker", iBuilder->getProducedItemCount("deletionMarker"), iBuilder->CreateAdd(phiCursorPosAfterLiteral, iBuilder->getSize(1)), true);
    153 //        this->markCircularOutputBitstream(iBuilder, "deletionMarker", iBuilder->CreateAdd(phiCursorPosAfterLiteral, iBuilder->getSize(1)), offsetPos, false);
     151        this->setCircularOutputBitstream(iBuilder, "deletionMarker", iBuilder->getProducedItemCount("deletionMarker"), iBuilder->CreateAdd(phiCursorPosAfterLiteral, iBuilder->getSize(1)));
     152
    154153        iBuilder->setProducedItemCount("deletionMarker", offsetPos);
    155154        this->increaseScalarField(iBuilder, "m0OutputPos", literalLength); //TODO m0OutputPos may be removed from scalar fields
     
    234233        this->generateStoreNumberOutput(iBuilder, "matchOffset", matchOffset);
    235234        this->increaseScalarField(iBuilder, "m0OutputPos", matchLength);
    236         this->markCircularOutputBitstream(iBuilder, "M0Marker", outputPos, outputEndPos, true, false);
     235        this->setCircularOutputBitstream(iBuilder, "M0Marker", outputPos, outputEndPos);
    237236
    238237        return iBuilder->CreateAdd(phiCursorPosAfterMatch, INT64_ONE);
     
    241240    void LZ4IndexBuilderKernel::generateProcessCompressedBlock(const std::unique_ptr<KernelBuilder> &iBuilder, Value* blockStart, Value* blockEnd) {
    242241        // Constant
    243         this->markCircularOutputBitstream(iBuilder, "deletionMarker", blockStart, blockEnd, false, false);
     242
     243        this->clearCircularOutputBitstream(iBuilder, "deletionMarker", blockStart, blockEnd);
    244244
    245245        BasicBlock* entryBlock = iBuilder->GetInsertBlock();
     
    388388    }
    389389
    390     size_t LZ4IndexBuilderKernel::getOutputBufferSize(const unique_ptr<KernelBuilder> &iBuilder, string bufferName) {
                // Old helper: these lines carry only r5959 line numbers in this changeset view.
                // Returns the block count of the named output stream-set buffer
                // (getBufferBlocks) multiplied by the builder's stride (getStride).
    391         return this->getOutputStreamSetBuffer(bufferName)->getBufferBlocks() * iBuilder->getStride();
    392     }
    393 
    394     // Assume we have enough output buffer
            //
            // Old implementation: these lines carry only r5959 line numbers in this changeset view.
            //
            // Emits an IR loop ("mark_bit_one_con" / "mark_bit_one_body" / "mark_bit_one_exit")
            // that visits every bit block overlapping item positions [start, end) of
            // `bitstreamName` and rewrites the bits of that range in place:
            //   - isOne == true  : the range mask is OR-ed into the block (bits set),
            //   - isOne == false : the inverted range mask is AND-ed in (bits cleared).
            // When `setProduced` is true, the stream's produced item count is updated on
            // every loop iteration. The exit block is returned and is also left as the
            // builder's insert point.
    395     llvm::BasicBlock *LZ4IndexBuilderKernel::markCircularOutputBitstream(const std::unique_ptr<KernelBuilder> &iBuilder,
    396                                                                          const std::string &bitstreamName,
    397                                                                          llvm::Value *start, llvm::Value *end, bool isOne,
    398                                                                          bool setProduced) {
    399         const unsigned int bitBlockWidth = iBuilder->getBitBlockWidth();
    400         Value* SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(bitBlockWidth);
    401         Value* SIZE_ONE = iBuilder->getSize(1);
                // Integer type as wide as one bit block, so a whole block can be masked with plain integer arithmetic.
    402         Type * const INT_BIT_BLOCK_TY = iBuilder->getIntNTy(bitBlockWidth);
    403         Type * const BIT_BLOCK_TY = iBuilder->getBitBlockType();
    404         Constant* INT_BIT_BLOCK_ONE = ConstantInt::get(INT_BIT_BLOCK_TY, 1);
    405         Constant* INT_BIT_BLOCK_ZERO = ConstantInt::get(INT_BIT_BLOCK_TY, 0);
    406 
    407         BasicBlock *entryBlock = iBuilder->GetInsertBlock();
    408         BasicBlock *conBlock = iBuilder->CreateBasicBlock("mark_bit_one_con");
    409         BasicBlock *bodyBlock = iBuilder->CreateBasicBlock("mark_bit_one_body");
    410         BasicBlock *exitBlock = iBuilder->CreateBasicBlock("mark_bit_one_exit");
    411 
                // Index of the bit block that contains `start`; the loop counter starts here.
    412         Value * startBlockLocalIndex = iBuilder->CreateUDiv(start, SIZE_BIT_BLOCK_WIDTH);
    413 
    414         iBuilder->CreateBr(conBlock);
    415 
    416         // Con
    417         iBuilder->SetInsertPoint(conBlock);
    418 
    419         PHINode *curBlockLocalIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
    420         curBlockLocalIndex->addIncoming(startBlockLocalIndex, entryBlock);
    421 
    422 
                // Keep looping while the current block begins before `end`.
    423         iBuilder->CreateCondBr(
    424                 iBuilder->CreateICmpULT(iBuilder->CreateMul(curBlockLocalIndex, SIZE_BIT_BLOCK_WIDTH), end),
    425                 bodyBlock,
    426                 exitBlock
    427         );
    428 
    429         // Body
    430         iBuilder->SetInsertPoint(bodyBlock);
    431 
                // Low edge of the mask: bit (start % width) while this block begins at or
                // before `start`, bit 0 for every later block.
    432         Value * const currentPosition = iBuilder->CreateMul(curBlockLocalIndex, SIZE_BIT_BLOCK_WIDTH);
    433         Value * lowestBitPosition = iBuilder->CreateURem(start, SIZE_BIT_BLOCK_WIDTH);
    434         lowestBitPosition = iBuilder->CreateZExt(lowestBitPosition, INT_BIT_BLOCK_TY);
    435         Value * outputLowestBitValue = iBuilder->CreateShl(INT_BIT_BLOCK_ONE, lowestBitPosition);
    436         Value * const hasNotReachedStart = iBuilder->CreateICmpULE(currentPosition, start);
    437         outputLowestBitValue = iBuilder->CreateSelect(hasNotReachedStart, outputLowestBitValue, INT_BIT_BLOCK_ONE);
    438 
                // High edge of the mask: 0 when the next block still begins at or before
                // `end` (the subtraction below then wraps and yields every bit from the low
                // edge upwards), otherwise bit (end % width).
    439         Value * const nextPosition = iBuilder->CreateMul(iBuilder->CreateAdd(curBlockLocalIndex, SIZE_ONE), SIZE_BIT_BLOCK_WIDTH);
    440         Value * const hasNotReachEnd = iBuilder->CreateICmpULE(nextPosition, end);
                // Produced count candidate: the next block boundary, capped at `end`.
    441         Value * producedItemsCount = iBuilder->CreateSelect(hasNotReachEnd, nextPosition, end);
    442         Value * highestBitPosition = iBuilder->CreateURem(end, SIZE_BIT_BLOCK_WIDTH);
    443         highestBitPosition = iBuilder->CreateZExt(highestBitPosition, INT_BIT_BLOCK_TY);
    444         Value * outputHighestBitValue = iBuilder->CreateShl(INT_BIT_BLOCK_ONE, highestBitPosition);
    445         outputHighestBitValue = iBuilder->CreateSelect(hasNotReachEnd, INT_BIT_BLOCK_ZERO, outputHighestBitValue);
                // high - low has exactly the bits in [low edge, high edge) set.
    446         Value * bitMask = iBuilder->CreateSub(outputHighestBitValue, outputLowestBitValue);
    447         bitMask = iBuilder->CreateBitCast(bitMask, BIT_BLOCK_TY);
    448 
                // Read-modify-write of the block addressed via getRawOutputPointer(currentPosition).
    449         Value * targetPtr = iBuilder->CreatePointerCast(iBuilder->getRawOutputPointer(bitstreamName, currentPosition), iBuilder->getBitBlockType()->getPointerTo());
    450         Value * oldValue = iBuilder->CreateBlockAlignedLoad(targetPtr);
    451         Value * newValue = nullptr;
                // `isOne` is a C++ bool tested while generating code, so only one of the
                // two operations ends up in the emitted IR.
    452         if (isOne) {
    453             newValue = iBuilder->CreateOr(oldValue, bitMask);
    454         } else {
    455             newValue = iBuilder->CreateAnd(oldValue, iBuilder->CreateNot(bitMask));
    456         }
    457         iBuilder->CreateStore(newValue, targetPtr);
    458 
    459         if (setProduced) {
    460             iBuilder->setProducedItemCount(bitstreamName, producedItemsCount);
    461         }
    462 
                // Advance to the next bit block and re-test the loop condition.
    463         curBlockLocalIndex->addIncoming(iBuilder->CreateAdd(curBlockLocalIndex, SIZE_ONE), bodyBlock);
    464         iBuilder->CreateBr(conBlock);
    465 
    466         // Exit
    467         iBuilder->SetInsertPoint(exitBlock);
    468         return exitBlock;
    469     }
    470 
    471 
    472390    void LZ4IndexBuilderKernel::generateStoreNumberOutput(const unique_ptr<KernelBuilder> &iBuilder,
    473391                                                          const string & outputBufferName,
     
    480398    }
    481399
     400
     401    void LZ4IndexBuilderKernel::clearCircularOutputBitstream(const std::unique_ptr<KernelBuilder> &iBuilder,
     402                                                             const std::string &bitstreamName,
     403                                                             llvm::Value *start, llvm::Value *end) {
                // Emits IR that zeroes item positions [start, end) of `bitstreamName`,
                // addressing the buffer byte-wise modulo its size in bytes:
                //   1. if `start` is not byte-aligned, the first byte keeps only its low
                //      (start % 8) bits;
                //   2. the byte containing `end` has its low (end % 8) bits cleared;
                //   3. the whole bytes in between are zeroed with two memsets, the second
                //      one covering whatever part wraps around to the buffer start.
                // No exit block is created: the builder's insert point is left in "memsetBlock".
     404        // TODO: currently we assume that the start and end positions are not in the same byte.
                // If they are, and `start` is not byte-aligned, memsetEndByte - memsetStartByte
                // below wraps around (CreateUDivCeil presumably rounds up) and the UMin clamp
                // turns the size into outputBufferBytes.
     405        Value* SIZE_0 = iBuilder->getSize(0);
     406        Value* SIZE_8 = iBuilder->getSize(8);
     407        Value* INT8_0 = iBuilder->getInt8(0);
     408        Type* INT8_PTR_TY = iBuilder->getInt8PtrTy();
     409
                // Buffer capacity in bytes: (buffer blocks * bit block width) / 8.
     410        Value* outputBufferBytes = iBuilder->CreateUDiv(iBuilder->getSize(this->getAnyStreamSetBuffer(bitstreamName)->getBufferBlocks() * iBuilder->getBitBlockWidth()), SIZE_8);
                // Byte pointer obtained for item position 0; every offset below is taken relative to it.
     411        Value* rawOutputPtr = iBuilder->CreatePointerCast(iBuilder->getRawOutputPointer(bitstreamName, SIZE_0), INT8_PTR_TY);
     412
                // Split both positions into a byte index and a bit offset inside that byte.
     413        Value* startRemain = iBuilder->CreateURem(start, SIZE_8);
     414        Value* startBytePos = iBuilder->CreateUDiv(start, SIZE_8);
     415        Value* endRemain = iBuilder->CreateURem(end, SIZE_8);
     416        Value* endBytePos = iBuilder->CreateUDiv(end, SIZE_8);
     417
     418        BasicBlock* startByteCpyBlock = iBuilder->CreateBasicBlock("startByteCpyBlock");
     419        BasicBlock* endByteCpyConBlock = iBuilder->CreateBasicBlock("endByteCpyConBlock");
     420        BasicBlock* endByteCpyBlock = iBuilder->CreateBasicBlock("endByteCpyBlock");
     421        BasicBlock* memsetBlock = iBuilder->CreateBasicBlock("memsetBlock");
     422
                // The first-byte fix-up is skipped when `start` is byte-aligned.
     423        iBuilder->CreateCondBr(iBuilder->CreateICmpNE(startRemain, SIZE_0), startByteCpyBlock, endByteCpyConBlock);
     424
     425        // Clear highest {startShiftAmount} bits
     426        iBuilder->SetInsertPoint(startByteCpyBlock);
     427        Value* startPtr = iBuilder->CreateGEP(rawOutputPtr, iBuilder->CreateURem(startBytePos, outputBufferBytes));
     428        Value* startValue = iBuilder->CreateLoad(startPtr);
     429
                // Shift left then logically right by (8 - start % 8): drops the high bits, keeps the low ones.
                // NOTE(review): startValue is loaded through an i8 pointer while the shift
                // amount is built from getSize() values - confirm both operands have the same width.
     430        Value* startShiftAmount = iBuilder->CreateSub(SIZE_8, startRemain);
     431        startValue = iBuilder->CreateLShr(iBuilder->CreateShl(startValue, startShiftAmount), startShiftAmount);
     432
     433        iBuilder->CreateStore(startValue, startPtr);
     434        iBuilder->CreateBr(endByteCpyConBlock);
     435
     436        iBuilder->SetInsertPoint(endByteCpyConBlock);
                // NOTE(review): this guard tests endBytePos, not endRemain. With endRemain == 0
                // the shifts in endByteCpyBlock leave the byte unchanged - confirm which was intended.
     437        iBuilder->CreateCondBr(iBuilder->CreateICmpNE(endBytePos, SIZE_0), endByteCpyBlock, memsetBlock);
     438
     439        // Clear lowest {endRemain} bits
     440        iBuilder->SetInsertPoint(endByteCpyBlock);
     441        Value* endPtr = iBuilder->CreateGEP(rawOutputPtr, iBuilder->CreateURem(endBytePos, outputBufferBytes));
     442        Value* endValue = iBuilder->CreateLoad(endPtr);
     443        endValue = iBuilder->CreateShl(iBuilder->CreateLShr(endValue, endRemain), endRemain);
     444        iBuilder->CreateStore(endValue, endPtr);
     445        iBuilder->CreateBr(memsetBlock);
     446
     447        iBuilder->SetInsertPoint(memsetBlock);
                // Whole bytes to zero: from the first byte boundary at or after `start`
                // up to, but excluding, the byte that contains `end`.
     448        Value* memsetStartByte = iBuilder->CreateUDivCeil(start, SIZE_8);
     449        Value* memsetEndByte = endBytePos;
     450
     451        Value* memsetSize = iBuilder->CreateSub(memsetEndByte, memsetStartByte);
     452
     453        memsetSize = iBuilder->CreateUMin(memsetSize, outputBufferBytes);
     454        // We always assume that (memsetEndByte - memsetStartByte) < outputBufferBytes
     455
     456        Value* memsetStartByteRem = iBuilder->CreateURem(memsetStartByte, outputBufferBytes);
     457
                // Split at the physical end of the buffer: memsetSize1 bytes starting at the
                // wrapped start offset, the remaining memsetSize2 bytes starting at offset 0.
     458        Value* memsetSize1 = iBuilder->CreateUMin(iBuilder->CreateSub(outputBufferBytes, memsetStartByteRem), memsetSize);
     459        Value* memsetSize2 = iBuilder->CreateSub(memsetSize, memsetSize1);
     460
                // NOTE(review): the meaning of the trailing `true` depends on the CreateMemSet
                // overload in use (alignment vs. volatile flag) - confirm against the builder.
     461        iBuilder->CreateMemSet(iBuilder->CreateGEP(rawOutputPtr, memsetStartByteRem), INT8_0, memsetSize1, true);
     462        iBuilder->CreateMemSet(rawOutputPtr, INT8_0, memsetSize2, true);
     463    }
     464
     465    void LZ4IndexBuilderKernel::setCircularOutputBitstream(const std::unique_ptr<KernelBuilder> &iBuilder,
     466                                                             const std::string &bitstreamName,
     467                                                             llvm::Value *start, llvm::Value *end) {
                // Emits IR that sets item positions [start, end) of `bitstreamName` to 1,
                // addressing the buffer byte-wise modulo its size in bytes:
                //   - "shortSetBlock": start and end share a byte, so one mask is OR-ed into it;
                //   - "longSetBlock" : otherwise the partial first byte and the byte containing
                //     `end` are patched individually and the whole bytes in between are filled
                //     with 0xff by two memsets (the second covers the wrapped-around part).
                // Both paths branch to "exitBlock", which is left as the builder's insert point.
     468        BasicBlock* exitBlock = iBuilder->CreateBasicBlock("exitBlock");
     469
     470        Value* SIZE_0 = iBuilder->getSize(0);
     471        Value* SIZE_8 = iBuilder->getSize(8);
                // INT8_0 is not referenced again in this function.
     472        Value* INT8_0 = iBuilder->getInt8(0);
     473        Value* INT8_1 = iBuilder->getInt8(1);
     474        Type* INT8_PTR_TY = iBuilder->getInt8PtrTy();
     475
                // Buffer capacity in bytes: (buffer blocks * bit block width) / 8.
     476        Value* outputBufferBytes = iBuilder->CreateUDiv(iBuilder->getSize(this->getAnyStreamSetBuffer(bitstreamName)->getBufferBlocks() * iBuilder->getBitBlockWidth()), SIZE_8);
                // Byte pointer obtained for item position 0; every offset below is taken relative to it.
     477        Value* rawOutputPtr = iBuilder->CreatePointerCast(iBuilder->getRawOutputPointer(bitstreamName, SIZE_0), INT8_PTR_TY);
     478
                // Split both positions into a byte index and a bit offset inside that byte.
     479        Value* startRemain = iBuilder->CreateURem(start, SIZE_8);
     480        Value* startBytePos = iBuilder->CreateUDiv(start, SIZE_8);
     481        Value* endRemain = iBuilder->CreateURem(end, SIZE_8);
     482        Value* endBytePos = iBuilder->CreateUDiv(end, SIZE_8);
     483        Value* startShiftAmount = iBuilder->CreateSub(SIZE_8, startRemain);
     484
     485        BasicBlock* shortSetBlock = iBuilder->CreateBasicBlock("shortSetBlock");
     486        BasicBlock* longSetBlock = iBuilder->CreateBasicBlock("longSetBlock");
     487
     488//        iBuilder->CreateBr(startByteCpyBlock);
     489        iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(startBytePos, endBytePos), shortSetBlock, longSetBlock);
     490
     491        // When startPos and endPos are in the same byte
     492        iBuilder->SetInsertPoint(shortSetBlock);
     493        Value* targetPtr = iBuilder->CreateGEP(rawOutputPtr, iBuilder->CreateURem(startBytePos, outputBufferBytes));
     494        Value* targetValue = iBuilder->CreateLoad(targetPtr);
                // (1 << endRemain) - (1 << startRemain) has exactly bits [startRemain, endRemain) set.
                // NOTE(review): INT8_1 is an 8-bit constant while the shift amounts are built
                // from getSize() values - confirm both operands have the same width.
     495        targetValue = iBuilder->CreateOr(iBuilder->CreateSub(
     496                iBuilder->CreateShl(INT8_1, endRemain),
     497                iBuilder->CreateShl(INT8_1, startRemain)
     498        ), targetValue);
     499//        targetValue = iBuilder->CreateNot(iBuilder->CreateLShr(iBuilder->CreateShl(iBuilder->CreateNot(targetValue), startShiftAmount), startShiftAmount));
     500//        targetValue = iBuilder->CreateShl(iBuilder->CreateLShr(targetValue, endRemain), endRemain);
     501        iBuilder->CreateStore(targetValue, targetPtr);
     502        iBuilder->CreateBr(exitBlock);
     503
     504        iBuilder->SetInsertPoint(longSetBlock);
     505
     506        BasicBlock* startByteCpyBlock = iBuilder->CreateBasicBlock("startByteCpyBlock");
     507        BasicBlock* endByteCpyConBlock = iBuilder->CreateBasicBlock("endByteCpyConBlock");
     508        BasicBlock* endByteCpyBlock = iBuilder->CreateBasicBlock("endByteCpyBlock");
     509        BasicBlock* memsetBlock = iBuilder->CreateBasicBlock("memsetBlock");
     510
                // The first-byte fix-up is skipped when `start` is byte-aligned.
     511        iBuilder->CreateCondBr(iBuilder->CreateICmpNE(startRemain, SIZE_0), startByteCpyBlock, endByteCpyConBlock);
     512        // Set highest {startShiftAmount} bits: invert, shift the high bits out, invert back
     513        iBuilder->SetInsertPoint(startByteCpyBlock);
     514        Value* startPtr = iBuilder->CreateGEP(rawOutputPtr, iBuilder->CreateURem(startBytePos, outputBufferBytes));
     515        Value* startValue = iBuilder->CreateLoad(startPtr);
     516        startValue = iBuilder->CreateNot(iBuilder->CreateLShr(iBuilder->CreateShl(iBuilder->CreateNot(startValue), startShiftAmount), startShiftAmount));
     517
     518        iBuilder->CreateStore(startValue, startPtr);
     519        iBuilder->CreateBr(endByteCpyConBlock);
     520
     521        iBuilder->SetInsertPoint(endByteCpyConBlock);
                // NOTE(review): this guard tests endBytePos, not endRemain. With endRemain == 0
                // the shifts in endByteCpyBlock leave the byte unchanged - confirm which was intended.
     522        iBuilder->CreateCondBr(iBuilder->CreateICmpNE(endBytePos, SIZE_0), endByteCpyBlock, memsetBlock);
     523
     524        // Set lowest {endRemain} bits: invert, shift the low bits out, invert back
     525        iBuilder->SetInsertPoint(endByteCpyBlock);
     526        Value* endPtr = iBuilder->CreateGEP(rawOutputPtr, iBuilder->CreateURem(endBytePos, outputBufferBytes));
     527        Value* endValue = iBuilder->CreateLoad(endPtr);
     528        endValue = iBuilder->CreateNot(iBuilder->CreateShl(iBuilder->CreateLShr(iBuilder->CreateNot(endValue), endRemain), endRemain));
     529        iBuilder->CreateStore(endValue, endPtr);
     530        iBuilder->CreateBr(memsetBlock);
     531
     532        iBuilder->SetInsertPoint(memsetBlock);
                // Whole bytes to fill: from the first byte boundary at or after `start`
                // (CreateUDivCeil presumably rounds up) up to, but excluding, the byte that contains `end`.
     533        Value* memsetStartByte = iBuilder->CreateUDivCeil(start, SIZE_8);
     534        Value* memsetEndByte = endBytePos;
     535
     536        Value* memsetSize = iBuilder->CreateSub(memsetEndByte, memsetStartByte);
     537        // TODO bug here when start end in the same byte
                // (that case is branched to shortSetBlock above, so this block is only
                // reached when startBytePos != endBytePos)
     538//        iBuilder->CallPrintInt("memsetEndByte", memsetEndByte);
     539//        iBuilder->CallPrintInt("memsetStartByte", memsetStartByte);
     540//        iBuilder->CallPrintInt("memsetSize1_1", memsetSize);
     541
     542
     543        memsetSize = iBuilder->CreateUMin(memsetSize, outputBufferBytes);
     544
     545        // We always assume that (memsetEndByte - memsetStartByte) < outputBufferBytes
     546
     547        Value* memsetStartByteRem = iBuilder->CreateURem(memsetStartByte, outputBufferBytes);
     548
                // Split at the physical end of the buffer: memsetSize1 bytes starting at the
                // wrapped start offset, the remaining memsetSize2 bytes starting at offset 0.
     549        Value* memsetSize1 = iBuilder->CreateUMin(iBuilder->CreateSub(outputBufferBytes, memsetStartByteRem), memsetSize);
     550        Value* memsetSize2 = iBuilder->CreateSub(memsetSize, memsetSize1);
     551//        iBuilder->CallPrintInt("memset1Ptr", iBuilder->CreateGEP(rawOutputPtr, memsetStartByteRem));
     552//        iBuilder->CallPrintInt("memsetSize1", memsetSize1);
     553
     554//        iBuilder->CallPrintInt("memset2Ptr", rawOutputPtr);
     555//        iBuilder->CallPrintInt("memsetSize2", memsetSize2);
                // NOTE(review): the meaning of the trailing `true` depends on the CreateMemSet
                // overload in use (alignment vs. volatile flag) - confirm against the builder.
     556        iBuilder->CreateMemSet(iBuilder->CreateGEP(rawOutputPtr, memsetStartByteRem), iBuilder->getInt8(0xff), memsetSize1, true);
     557        iBuilder->CreateMemSet(rawOutputPtr, iBuilder->getInt8(0xff), memsetSize2, true);
     558        iBuilder->CreateBr(exitBlock);
     559
     560        iBuilder->SetInsertPoint(exitBlock);
     561    }
     562
    482563}
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_index_builder.h

    r5958 r5961  
    5252                     llvm::Value *blockEnd);
    5353
    54 
    55         size_t getOutputBufferSize(const std::unique_ptr<KernelBuilder> &iBuilder, std::string bufferName);
    56 
    57         llvm::BasicBlock *markCircularOutputBitstream(const std::unique_ptr<KernelBuilder> &iBuilder,
    58                                                       const std::string &bitstreamName,
    59                                                       llvm::Value *start, llvm::Value *end, bool isOne,
    60                                                       bool setProduced = true);
    61 
    6254        void generateStoreNumberOutput(const std::unique_ptr<KernelBuilder> &iBuilder,
    6355                                       const std::string &outputBufferName,
    6456                                       llvm::Value *value);
    6557
     58        void clearCircularOutputBitstream(const std::unique_ptr<KernelBuilder> &iBuilder,
     59                                                                 const std::string &bitstreamName,
     60                                                                 llvm::Value *start, llvm::Value *end);
     61
     62        void setCircularOutputBitstream(const std::unique_ptr<KernelBuilder> &iBuilder,
     63                                                               const std::string &bitstreamName,
     64                                                               llvm::Value *start, llvm::Value *end);
    6665    };
    6766}
Note: See TracChangeset for help on using the changeset viewer.