Changeset 6061


Ignore:
Timestamp:
May 31, 2018, 1:35:52 AM (5 months ago)
Author:
xwa163
Message:

Improve the output performance of the LZ4ByteStreamAio kernel by using load/store instructions directly instead of memcpy. On large files, the LZ4_Grep pipeline with the LZ4ByteStreamAio kernel now performs slightly better than the standard LZ4 implementation.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_bytestream_aio.cpp

    r6059 r6061  
    528528    void LZ4ByteStreamAioKernel::handleLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart,
    529529                                         llvm::Value *literalLength) {
     530        unsigned fw = 64;
     531        Type* INT_FW_PTR = b->getIntNTy(fw)->getPointerTo();
     532
    530533        Value* inputBytePtr = b->getRawInputPointer("byteStream", literalStart);
     534        Value* inputPtr = b->CreatePointerCast(inputBytePtr, INT_FW_PTR);
    531535
    532536        Value* outputPos = b->getScalarField("outputPos");
    533537        Value* outputBufferSize = b->getCapacity("outputStream");
    534538        Value* outputPtr = b->getRawOutputPointer("outputStream", b->CreateURem(outputPos, outputBufferSize));
     539        outputPtr = b->CreatePointerCast(outputPtr, INT_FW_PTR);
    535540
    536541        // We can always assume that we have enough output buffer based on our output buffer allocation strategy (except in extract only case)
    537         b->CreateMemCpy(outputPtr, inputBytePtr, literalLength, 1);
    538 
     542
     543        BasicBlock* entryBlock = b->GetInsertBlock();
     544        BasicBlock* literalCopyCon = b->CreateBasicBlock("literalCopyCon");
     545        BasicBlock* literalCopyBody = b->CreateBasicBlock("literalCopyBody");
     546        BasicBlock* literalCopyExit = b->CreateBasicBlock("literalCopyExit");
     547
     548        b->CreateBr(literalCopyCon);
     549
     550        // ---- literalCopyCon
     551        b->SetInsertPoint(literalCopyCon);
     552        PHINode* phiOutputPtr = b->CreatePHI(outputPtr->getType(), 2);
     553        phiOutputPtr->addIncoming(outputPtr, entryBlock);
     554        PHINode* phiInputPtr = b->CreatePHI(inputPtr->getType(), 2);
     555        phiInputPtr->addIncoming(inputPtr, entryBlock);
     556        PHINode* phiCopiedLength = b->CreatePHI(literalLength->getType(), 2);
     557        phiCopiedLength->addIncoming(b->getSize(0), entryBlock);
     558        b->CreateCondBr(b->CreateICmpULT(phiCopiedLength, literalLength), literalCopyBody, literalCopyExit);
     559
     560        // ---- literalCopyBody
     561        b->SetInsertPoint(literalCopyBody);
     562        // Always copy fw bits to improve performance
     563        // TODO: this sometimes crashes because the last copy overruns the end of the buffer; we need to add 4 bytes of
     564        //       extra buffer to make sure it does not crash.
     565        b->CreateStore(b->CreateLoad(phiInputPtr), phiOutputPtr);
     566
     567        phiInputPtr->addIncoming(b->CreateGEP(phiInputPtr, b->getSize(1)), b->GetInsertBlock());
     568        phiOutputPtr->addIncoming(b->CreateGEP(phiOutputPtr, b->getSize(1)), b->GetInsertBlock());
     569        phiCopiedLength->addIncoming(b->CreateAdd(phiCopiedLength, b->getSize(fw / 8)), b->GetInsertBlock());
     570        b->CreateBr(literalCopyCon);
     571
     572        // ---- literalCopyExit
     573        b->SetInsertPoint(literalCopyExit);
    539574        b->setScalarField("outputPos", b->CreateAdd(outputPos, literalLength));
    540575    }
     
    542577    void LZ4ByteStreamAioKernel::handleMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset,
    543578                                       llvm::Value *matchLength) {
     579        unsigned fw = 64;
     580        Type* INT_FW_PTR = b->getIntNTy(fw)->getPointerTo();
     581
    544582        BasicBlock* entryBlock = b->GetInsertBlock();
    545583
    546584        Value* outputPos = b->getScalarField("outputPos");
     585        Value* outputBufferSize = b->getCapacity("outputStream");
     586
     587        Value* copyToPtr = b->getRawOutputPointer("outputStream", b->CreateURem(outputPos, outputBufferSize));
     588        Value* copyFromPtr = b->getRawOutputPointer("outputStream", b->CreateURem(b->CreateSub(outputPos, matchOffset), outputBufferSize));
     589
     590        BasicBlock* matchCopyCon = b->CreateBasicBlock("matchCopyCon");
    547591        BasicBlock* matchCopyBody = b->CreateBasicBlock("matchCopyBody");
    548592        BasicBlock* matchCopyExit = b->CreateBasicBlock("matchCopyExit");
    549593
    550         b->CreateBr(matchCopyBody);
     594        b->CreateBr(matchCopyCon);
     595
     596        // ---- matchCopyCon
     597        b->SetInsertPoint(matchCopyCon);
     598        PHINode* phiFromPtr = b->CreatePHI(b->getInt8PtrTy(), 2);
     599        phiFromPtr->addIncoming(copyFromPtr, entryBlock);
     600        PHINode* phiToPtr = b->CreatePHI(b->getInt8PtrTy(), 2);
     601        phiToPtr->addIncoming(copyToPtr, entryBlock);
     602        PHINode* phiCopiedSize = b->CreatePHI(b->getSizeTy(), 2);
     603        phiCopiedSize->addIncoming(b->getSize(0), entryBlock);
     604
     605        b->CreateCondBr(b->CreateICmpULT(phiCopiedSize, matchLength), matchCopyBody, matchCopyExit);
    551606
    552607        // ---- matchCopyBody
    553608        b->SetInsertPoint(matchCopyBody);
    554         PHINode* phiRemainingMatchLength = b->CreatePHI(b->getSizeTy(), 2);
    555         phiRemainingMatchLength->addIncoming(matchLength, entryBlock);
    556         PHINode* phiCurrentCopyToPos = b->CreatePHI(b->getSizeTy(), 2);
    557         phiCurrentCopyToPos->addIncoming(outputPos, entryBlock);
    558 
    559         Value* currentCopyLength = b->CreateUMin(phiRemainingMatchLength, matchOffset);
    560 //        b->CallPrintInt("currentCopyLength", currentCopyLength);
    561         Value* copyFromPos = b->CreateSub(phiCurrentCopyToPos, matchOffset);
    562 
    563         Value* outputBufferSize = b->getCapacity("outputStream");
    564         b->CreateMemCpy(
    565                 b->getRawOutputPointer("outputStream", b->CreateURem(phiCurrentCopyToPos, outputBufferSize)),
    566                 b->getRawOutputPointer("outputStream", b->CreateURem(copyFromPos, outputBufferSize)),
    567                 currentCopyLength,
    568                 1
    569         );
    570 
    571         Value* newMatchLength = b->CreateSub(phiRemainingMatchLength, currentCopyLength);
    572         Value* newOutputPos = b->CreateAdd(phiCurrentCopyToPos, currentCopyLength);
    573 
    574         phiRemainingMatchLength->addIncoming(newMatchLength, b->GetInsertBlock());
    575         phiCurrentCopyToPos->addIncoming(newOutputPos, b->GetInsertBlock());
    576 
    577         b->CreateCondBr(b->CreateICmpEQ(newMatchLength, b->getSize(0)), matchCopyExit, matchCopyBody);
    578 
     609        b->CreateStore(
     610                b->CreateLoad(b->CreatePointerCast(phiFromPtr, INT_FW_PTR)),
     611        b->CreatePointerCast(phiToPtr, INT_FW_PTR)
     612        );
     613
     614        Value* copySize = b->CreateUMin(matchOffset, b->getSize(fw / 8));
     615        phiFromPtr->addIncoming(b->CreateGEP(phiFromPtr, copySize), b->GetInsertBlock());
     616        phiToPtr->addIncoming(b->CreateGEP(phiToPtr, copySize), b->GetInsertBlock());
     617        phiCopiedSize->addIncoming(b->CreateAdd(phiCopiedSize, copySize), b->GetInsertBlock());
     618        b->CreateBr(matchCopyCon);
     619
     620        // ---- matchCopyExit
    579621        b->SetInsertPoint(matchCopyExit);
    580622        b->setScalarField("outputPos", b->CreateAdd(outputPos, matchLength));
Note: See TracChangeset for help on using the changeset viewer.