Changeset 6065


Ignore:
Timestamp:
Jun 6, 2018, 1:30:11 PM (2 weeks ago)
Author:
xwa163
Message:
  1. Fix some typos in the LZ4 Grep Extract and Deposit pipeline
  2. Small fix for LZ4ParallelByteStreamAIOKernel
Location:
icGREP/icgrep-devel/icgrep
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_parallel_bytestream_aio.cpp

    r6064 r6065  
    7171                        b->CreatePointerCast(b->getRawInputPointer("extender", b->getSize(0)), b->getInt8PtrTy()),
    7272                        b->CreateTrunc(maskBlockIndexVec, VectorType::get(b->getInt32Ty(), 4)),
    73                         Constant::getAllOnesValue(b->getBitBlockType()), //TODO test the mask
     73                        Constant::getAllOnesValue(b->getBitBlockType()),
    7474                        b->getInt8(8)
    7575                }
     
    8282                        b->CreatePointerCast(b->getRawInputPointer("extender", b->getSize(0)), b->getInt8PtrTy()),
    8383                        b->CreateTrunc(maskBlockIndexVec, VectorType::get(b->getInt32Ty(), 4)),
    84                         Constant::getAllOnesValue(b->getBitBlockType()), //TODO test the mask
     84                        Constant::getAllOnesValue(b->getBitBlockType()),
    8585                        b->getInt8(8)
    8686                }
     
    114114            const std::unique_ptr<KernelBuilder> &b, Value *beginTokenPosVec, Value *lz4BlockEndVec, Value* initOutputPosVec
    115115    ) {
    116         Function *gatherFunc = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_avx2_gather_d_q_256); // TODO find ret <4 * i32> version
     116        Function *gatherFunc = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_avx2_gather_d_q_256); // Maybe it will be better to use <4 * i32> version
    117117
    118118        // Constant
     
    139139
    140140
    141         Value* bytePtrBase = b->CreatePointerCast(b->getRawInputPointer("byteStream", b->getSize(0)), b->getInt8PtrTy());
     141        Value* byteRawInputPtr = b->CreatePointerCast(b->getRawInputPointer("byteStream", b->getSize(0)), b->getInt8PtrTy());
     142
    142143        Value* firstTokenPos = b->CreateExtractElement(beginTokenPosVec, (uint64_t)0);
    143 
    144         bytePtrBase = b->CreateGEP(bytePtrBase, firstTokenPos);
    145 
    146 
    147         // TODO use <4 * i32> gather instead of <4 * i64>, since actually we only need <4 * i8>
    148         Value* tokenValuesVec =  b->CreateCall(
    149                 gatherFunc,
    150                 {
    151                         UndefValue::get(b->getBitBlockType()),
    152                         bytePtrBase,
    153                         b->CreateTrunc(b->CreateSub(beginTokenPosVec, b->simd_fill(SIMD_WIDTH, firstTokenPos)), VectorType::get(b->getInt32Ty(), 4)),
    154                         b->CreateAnd(Constant::getAllOnesValue(b->getBitBlockType()), notFinishMask),
    155                         b->getInt8(1)
    156                 }
    157         );
    158         tokenValuesVec = b->CreateAnd(tokenValuesVec, notFinishMask);
    159         tokenValuesVec = b->CreateAnd(tokenValuesVec, BIT_BLOCK_FF);
     144        Value* bytePtrBase = b->CreateGEP(byteRawInputPtr, firstTokenPos);
     145
     146
     147
     148        Value* tokenValuesVec = this->simdFetchByteData(b, byteRawInputPtr, beginTokenPosVec, notFinishMask);
    160149
    161150        Value* shouldExtendLiteralVec = b->CreateICmpEQ(b->CreateAnd(BIT_BLOCK_F0, tokenValuesVec), BIT_BLOCK_F0);
     
    190179//        b->CallPrintInt("a", b->getSize(0));
    191180        // TODO maybe we can load i64 once and then consume 8 times
    192         // TODO use <4 * i32> gather instead of <4 * i64>, since actually we only need <4 * i8>
    193         Value* currentLiteralLengthVec =  b->CreateCall(
    194                 gatherFunc,
    195                 {
    196                         UndefValue::get(b->getBitBlockType()),
    197                         bytePtrBase,
    198                         b->CreateTrunc(b->CreateSub(phiCurrentExtendLiteralPosVec, b->simd_fill(SIMD_WIDTH, firstTokenPos)), VectorType::get(b->getInt32Ty(), 4)),
    199                         shouldExtendLiteralGatherMask,
    200                         b->getInt8(1)
    201                 }
    202         );
    203 
    204         currentLiteralLengthVec = b->CreateAnd(currentLiteralLengthVec, shouldExtendLiteralGatherMask);// TODO remove this line?
    205         currentLiteralLengthVec = b->CreateAnd(currentLiteralLengthVec, BIT_BLOCK_FF);
     181        Value* currentLiteralLengthVec = this->simdFetchByteData(b, byteRawInputPtr, phiCurrentExtendLiteralPosVec, shouldExtendLiteralGatherMask);
    206182
    207183        Value* newExtendLiteralLengthVec = b->CreateAdd(phiExtendLiteralLengthVec, currentLiteralLengthVec);
     
    232208        phiExtendLiteralEndPos->addIncoming(phiCurrentExtendLiteralPosVec, extendLiteralCond);
    233209
    234         Value* literalLengthVec = b->CreateAdd(literalExtendValueVec, b->CreateLShr(tokenValuesVec, b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 4))));
     210
     211        Value* literalLengthVec = b->simd_add(SIMD_WIDTH, literalExtendValueVec, b->simd_srlv(SIMD_WIDTH, tokenValuesVec, b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 4))));
     212//        Value* literalLengthVec = b->CreateAdd(literalExtendValueVec, b->CreateLShr(tokenValuesVec, b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 4))));
    235213
    236214        Value* literalStartPosVec = b->CreateAdd(phiExtendLiteralEndPos, BIT_BLOCK_1);
     
    278256        shouldExtendMatchGatherMask = b->CreateAnd(shouldExtendMatchGatherMask, notFinishMask);
    279257        // TODO maybe we can load i64 once and then consume 8 times
    280         // TODO use <4 * i32> gather instead of <4 * i64>, since actually we only need <4 * i8>
    281         Value* currentMatchLengthVec =  b->CreateCall(
    282                 gatherFunc,
    283                 {
    284                         UndefValue::get(b->getBitBlockType()),
    285                         bytePtrBase,
    286                         b->CreateTrunc(b->CreateSub(phiCurrentExtendMatchPosVec, b->simd_fill(SIMD_WIDTH, firstTokenPos)), VectorType::get(b->getInt32Ty(), 4)),
    287                         shouldExtendMatchGatherMask,
    288                         b->getInt8(1)
    289                 }
    290         );
    291 
    292         currentMatchLengthVec = b->CreateAnd(currentMatchLengthVec, shouldExtendMatchGatherMask);// TODO remove this line?
    293         currentMatchLengthVec = b->CreateAnd(currentMatchLengthVec, BIT_BLOCK_FF);
     258
     259        Value* currentMatchLengthVec = this->simdFetchByteData(b, byteRawInputPtr, phiCurrentExtendMatchPosVec, shouldExtendMatchGatherMask);
    294260
    295261        Value* newExtendMatchLengthVec = b->CreateAdd(phiExtendMatchLengthVec, currentMatchLengthVec);
     
    344310        matchOffsetVec = b->CreateAnd(matchOffsetVec, BIT_BLOCK_FFFF);
    345311
     312//        Value* matchOffsetVec = this->simdFetchByteData(b, byteRawInputPtr, matchOffsetBeginPosVec, b->CreateAnd(hasMatchPartMask, notFinishMask));
     313
     314
     315
    346316        this->handleSimdMatchCopy(b, matchOffsetVec, matchLength, outputPosAfterLiteralCpy);
    347317
     
    469439        // ---- notSimdProcessBlock
    470440        b->SetInsertPoint(notSimdProcessBlock);
    471         // TODO Use loop to process the remaining block in sequential approach (the number of the remaining block should be less than (b->getBitBlockWidth() / SIMD_WIDTH))
     441        // Use loop to process the remaining block in sequential approach (the number of the remaining block should be less than (b->getBitBlockWidth() / SIMD_WIDTH))
    472442        this->generateSequentialDecompression(b, blockDataIndex, totalNumber);
    473443        b->CreateBr(exitBlock);
     
    586556    std::pair<llvm::Value *, llvm::Value *> LZ4ParallelByteStreamAioKernel::processBlockBoundary(const std::unique_ptr<KernelBuilder> &b, llvm::Value *beginTokenPos,
    587557                                                              llvm::Value *lz4BlockEnd, llvm::Value* initOutputPos) {
    588         // TODO handle initOutputPos;
    589 // Constant
     558        // Constant
    590559        ConstantInt* SIZE_0 = b->getSize(0);
    591560        ConstantInt* SIZE_1 = b->getSize(1);
     
    775744    void LZ4ParallelByteStreamAioKernel::handleSimdLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value* literalStartVec, llvm::Value* literalLengthVec, llvm::Value* outputPosVec) {
    776745        Value* l = b->CreateExtractElement(literalLengthVec, (uint64_t)0);
    777         Value* shouldPrint = b->CreateICmpNE(l, b->getSize(0));
    778 //        b->CallPrintIntCond("literalStart", b->CreateExtractElement(literalStartVec, (uint64_t)0), shouldPrint);
    779 //        b->CallPrintIntCond("literalLength", l, shouldPrint);
     746
     747        Value* outputCapacity = b->getCapacity("outputStream");
     748        Value* outputPosRemVec = b->simd_and(outputPosVec, b->simd_fill(SIMD_WIDTH, b->simd_not(b->CreateNeg(outputCapacity))));
    780749        // TODO use memcpy first
    781750
    782751        BasicBlock* entryBlock = b->GetInsertBlock();
    783         Value* outputCapacity = b->getCapacity("outputStream");
     752
    784753
    785754        Value* inputBasePtr = b->CreatePointerCast(b->getRawInputPointer("byteStream", b->getSize(0)), b->getInt8PtrTy());
     
    789758            Value* literalStart = b->CreateExtractElement(literalStartVec, i);
    790759            Value* literalLength = b->CreateExtractElement(literalLengthVec, i);
    791             Value* outputPos = b->CreateExtractElement(outputPosVec, i);
    792             Value* outputPosRem = b->CreateURem(outputPos, outputCapacity);
     760            Value* outputPosRem = b->CreateExtractElement(outputPosRemVec, i);;
    793761            b->CreateMemCpy(
    794762                    b->CreateGEP(outputBasePtr, outputPosRem),
     
    894862    }
    895863
     864    llvm::Value* LZ4ParallelByteStreamAioKernel::simdFetchByteData(const std::unique_ptr<KernelBuilder> &b, llvm::Value* basePtr, llvm::Value* offsetVec, llvm::Value* mask) {
     865        return this->simdFetchByteDataByGather(b, basePtr, offsetVec, mask);
     866//        return this->simdFetchByteDataByLoop(b, basePtr, offsetVec, mask);
     867    }
     868
     869    llvm::Value* LZ4ParallelByteStreamAioKernel::simdFetchByteDataByGather(const std::unique_ptr<KernelBuilder> &b, llvm::Value* basePtr, llvm::Value* offsetVec, llvm::Value* mask) {
     870        Value* BIT_BLOCK_FF = b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 0xff));
     871        Function *gatherFunc = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_avx2_gather_d_q_256);
     872        Function *gatherFunc2 = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_avx2_gather_d_d);
     873
     874        Value* firstOffset = b->CreateExtractElement(offsetVec, (uint64_t)0);
     875
     876        Type* i32BitBlockTy = VectorType::get(b->getInt32Ty(), 4);
     877
     878//        Value* tokenValuesVec =  b->CreateCall(
     879//                gatherFunc,
     880//                {
     881//                        UndefValue::get(b->getBitBlockType()),
     882//                        b->CreateGEP(basePtr, firstOffset),
     883//                        b->CreateTrunc(b->CreateSub(offsetVec, b->simd_fill(SIMD_WIDTH, firstOffset)), VectorType::get(b->getInt32Ty(), 4)),
     884//                        mask,
     885//                        b->getInt8(1)
     886//                }
     887//        );
     888
     889        ////
     890        Value* tokenValuesVec =  b->CreateCall(
     891                gatherFunc2,
     892                {
     893                        UndefValue::get(i32BitBlockTy),
     894                        b->CreateGEP(basePtr, firstOffset),
     895                        b->CreateTrunc(b->CreateSub(offsetVec, b->simd_fill(SIMD_WIDTH, firstOffset)), VectorType::get(b->getInt32Ty(), 4)),
     896                        b->CreateTrunc(mask, i32BitBlockTy),
     897                        b->getInt8(1)
     898                }
     899        );
     900        tokenValuesVec = b->CreateZExt(tokenValuesVec, b->getBitBlockType());
     901        /////
     902       
     903        tokenValuesVec = b->CreateAnd(tokenValuesVec, mask);
     904        tokenValuesVec = b->CreateAnd(tokenValuesVec, BIT_BLOCK_FF);
     905        return tokenValuesVec;
     906    }
     907
     908    llvm::Value* LZ4ParallelByteStreamAioKernel::simdFetchByteDataByLoop(const std::unique_ptr<KernelBuilder> &b, llvm::Value* basePtr, llvm::Value* offsetVec, llvm::Value* maskVec) {
     909        Value* retVec = ConstantVector::getNullValue(b->getBitBlockType());
     910
     911        for (uint64_t i = 0; i < 4; i++){ //TODO 4 here is a hardcode for AVX2, it may need to be changed to (BitBlockWidth / 64)
     912            Value* mask = b->CreateExtractElement(maskVec, i);
     913            Value* shouldLoad = b->CreateICmpNE(mask, b->getInt64(0));
     914            Value* loadPtr = b->CreateSelect(shouldLoad, b->CreateGEP(basePtr, b->CreateExtractElement(offsetVec, i)), basePtr);
     915            Value* loadValue = b->CreateZExt(b->CreateLoad(loadPtr), b->getInt64Ty());
     916
     917            Value* finalValue = b->CreateSelect(shouldLoad, loadValue, b->getInt64(0));
     918            retVec = b->CreateInsertElement(retVec, finalValue, i);
     919        }
     920
     921        return retVec;
     922    }
    896923
    897924}
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_parallel_bytestream_aio.h

    r6064 r6065  
    6363
    6464        void generateSequentialDecompression(const std::unique_ptr<KernelBuilder> &b, llvm::Value* startBlockDataIndex, llvm::Value* endBlockDataIndex);
     65
     66
     67        llvm::Value* simdFetchByteData(const std::unique_ptr<KernelBuilder> &b, llvm::Value* basePtr, llvm::Value* offsetVec, llvm::Value* mask);
     68        llvm::Value* simdFetchByteDataByGather(const std::unique_ptr<KernelBuilder> &b, llvm::Value* basePtr, llvm::Value* offsetVec, llvm::Value* mask);
     69        llvm::Value* simdFetchByteDataByLoop(const std::unique_ptr<KernelBuilder> &b, llvm::Value* basePtr, llvm::Value* offsetVec, llvm::Value* mask);
     70
     71
    6572    };
    6673
  • icGREP/icgrep-devel/icgrep/lz4/LZ4GrepGenerator.cpp

    r6064 r6065  
    746746    this->generateExtractAndDepositMarkers(iBuilder);
    747747
    748     auto swizzle = this->generateSwizzleExtractData(iBuilder);
    749 
    750     StreamSetBuffer * depositedSwizzle0 = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
    751     StreamSetBuffer * depositedSwizzle1 = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
    752 
    753     Kernel * multiplePdepK = mPxDriver.addKernelInstance<SwizzledMultiplePDEPkernel>(iBuilder, 4, 2);
    754     mPxDriver.makeKernelCall(multiplePdepK, {mDepositMarker, swizzle.first, swizzle.second}, {depositedSwizzle0, depositedSwizzle1});
    755 
    756 
    757     // split PDEP into 2 kernel will be a little slower in single thread environment
     748
     749    StreamSetBuffer * LineBreakStream;
     750    StreamSetBuffer * Matches;
     751    std::vector<re::RE*> res = {regex};
     752    if (mEnableMultiplexing) {
     753        std::tie(LineBreakStream, Matches) = multiplexingGrepPipeline(res);
     754    } else {
     755        auto swizzle = this->generateSwizzleExtractData(iBuilder);
     756
     757        StreamSetBuffer * depositedSwizzle0 = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
     758        StreamSetBuffer * depositedSwizzle1 = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
     759
     760        Kernel * multiplePdepK = mPxDriver.addKernelInstance<SwizzledMultiplePDEPkernel>(iBuilder, 4, 2);
     761        mPxDriver.makeKernelCall(multiplePdepK, {mDepositMarker, swizzle.first, swizzle.second}, {depositedSwizzle0, depositedSwizzle1});
     762
     763
     764        // split PDEP into 2 kernel will be a little slower in single thread environment
    758765/*
    759766    Kernel * pdep1 = mPxDriver.addKernelInstance<PDEPkernel>(iBuilder, 4);
     
    764771*/
    765772
    766     StreamSetBuffer * matchCopiedSwizzle0 = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
    767     StreamSetBuffer * matchCopiedSwizzle1 = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
    768 
    769     Kernel * swizzledMatchCopyK = mPxDriver.addKernelInstance<LZ4SwizzledMatchCopyKernel>(iBuilder, 4, 2, 4);
    770     mPxDriver.makeKernelCall(swizzledMatchCopyK, {mMatchOffsetMarker, mM0Marker, mCompressedByteStream, depositedSwizzle0, depositedSwizzle1}, {matchCopiedSwizzle0, matchCopiedSwizzle1});
    771 
    772     // Produce unswizzled bit streams
    773     StreamSetBuffer * matchCopiedbits = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(8), this->getInputBufferBlocks());
    774     Kernel * unSwizzleK = mPxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 8, 1, 2);
    775     mPxDriver.makeKernelCall(unSwizzleK, {matchCopiedSwizzle0, matchCopiedSwizzle1}, {matchCopiedbits});
    776 
    777     StreamSetBuffer * LineBreakStream;
    778     StreamSetBuffer * Matches;
    779     std::vector<re::RE*> res = {regex};
    780 //    if (mEnableMultiplexing) {
    781 //        std::tie(LineBreakStream, Matches) = multiplexingGrepPipeline(res, matchCopiedbits);
    782 //    } else {
     773        StreamSetBuffer * matchCopiedSwizzle0 = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
     774        StreamSetBuffer * matchCopiedSwizzle1 = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
     775
     776        Kernel * swizzledMatchCopyK = mPxDriver.addKernelInstance<LZ4SwizzledMatchCopyKernel>(iBuilder, 4, 2, 4);
     777        mPxDriver.makeKernelCall(swizzledMatchCopyK, {mMatchOffsetMarker, mM0Marker, mCompressedByteStream, depositedSwizzle0, depositedSwizzle1}, {matchCopiedSwizzle0, matchCopiedSwizzle1});
     778
     779        // Produce unswizzled bit streams
     780        StreamSetBuffer * matchCopiedbits = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(8), this->getInputBufferBlocks());
     781        Kernel * unSwizzleK = mPxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 8, 1, 2);
     782        mPxDriver.makeKernelCall(unSwizzleK, {matchCopiedSwizzle0, matchCopiedSwizzle1}, {matchCopiedbits});
     783
     784
    783785        std::tie(LineBreakStream, Matches) = grepPipeline(res, matchCopiedbits);
    784 //    };
     786    };
    785787
    786788    kernel::Kernel * matchCountK = mPxDriver.addKernelInstance<kernel::PopcountKernel>(iBuilder);
  • icGREP/icgrep-devel/icgrep/lz4_grep.cpp

    r6064 r6065  
    9898        }
    9999
    100         auto main = g.getMainFunc();
    101         main(fileBuffer, lz4Frame.getBlocksStart(), lz4Frame.getBlocksStart() + lz4Frame.getBlocksLength(), lz4Frame.hasBlockChecksum());
     100        auto main = g.getCountOnlyGrepMainFunction();
     101        uint64_t countResult = main(fileBuffer, lz4Frame.getBlocksStart(), lz4Frame.getBlocksStart() + lz4Frame.getBlocksLength(), lz4Frame.hasBlockChecksum());
     102        llvm::outs() << countResult << "\n";
    102103    } else {
    103104        g.generateScanMatchGrepPipeline(re_ast);
Note: See TracChangeset for help on using the changeset viewer.