Changeset 6081


Ignore:
Timestamp:
Jun 12, 2018, 4:09:27 AM (9 days ago)
Author:
xwa163
Message:
  1. Add command-line parameters -enable-gather and -enable-scatter to the LZ4 parallel grep
  2. Fix some bugs on AVX-512 machines
Location:
icGREP/icgrep-devel/icgrep
Files:
9 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_block_decoder.cpp

    r6077 r6081  
    4646}
    4747
    48 void LZ4BlockDecoderNewKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
     48void LZ4BlockDecoderNewKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & b) {
    4949
    50     Constant* INT64_0 = iBuilder->getInt64(0);
     50    Constant* INT64_0 = b->getInt64(0);
    5151
    52     BasicBlock * entryBlock = iBuilder->GetInsertBlock();
     52    BasicBlock * entryBlock = b->GetInsertBlock();
    5353
    5454    // Skip Header
    55     Value* hasSkipHeader = iBuilder->getScalarField("hasSkipHeader");
    56     iBuilder->setScalarField("hasSkipHeader", iBuilder->getTrue());
    57     Value* skipLength = iBuilder->CreateSelect(hasSkipHeader, iBuilder->getSize(0), iBuilder->getScalarField("headerSize"));
    58     Value* previousOffset = iBuilder->getScalarField("previousOffset");
    59     previousOffset = iBuilder->CreateAdd(skipLength, previousOffset);
    60     Value* initBlockStart = iBuilder->getScalarField("pendingBlockStart");
    61     Value* initBlockEnd = iBuilder->getScalarField("pendingBlockEnd");
    62     Value* initIsCompressed = iBuilder->getScalarField("pendingIsCompressed");
    63     Value * availableItemCount = iBuilder->getAvailableItemCount("byteStream");
    64     BasicBlock * processCon = iBuilder->CreateBasicBlock("process_con");
    65     iBuilder->CreateBr(processCon);
     55    Value* hasSkipHeader = b->getScalarField("hasSkipHeader");
     56    b->setScalarField("hasSkipHeader", b->getTrue());
     57    Value* skipLength = b->CreateSelect(hasSkipHeader, b->getSize(0), b->getScalarField("headerSize"));
     58    Value* previousOffset = b->getScalarField("previousOffset");
     59    previousOffset = b->CreateAdd(skipLength, previousOffset);
     60    Value* initBlockStart = b->getScalarField("pendingBlockStart");
     61    Value* initBlockEnd = b->getScalarField("pendingBlockEnd");
     62    Value* initIsCompressed = b->getScalarField("pendingIsCompressed");
     63    Value * availableItemCount = b->getAvailableItemCount("byteStream");
     64    BasicBlock * processCon = b->CreateBasicBlock("process_con");
     65    b->CreateBr(processCon);
    6666
    67     iBuilder->SetInsertPoint(processCon);
     67    b->SetInsertPoint(processCon);
    6868
    69     PHINode* phiIsCompressed = iBuilder->CreatePHI(initIsCompressed->getType(), 3);
    70     PHINode* phiBlockStart = iBuilder->CreatePHI(initBlockStart->getType(), 3);
    71     PHINode* phiBlockEnd = iBuilder->CreatePHI(initBlockEnd->getType(), 3);
    72     PHINode* sOffset = iBuilder->CreatePHI(previousOffset->getType(), 3);
     69    PHINode* phiIsCompressed = b->CreatePHI(initIsCompressed->getType(), 3);
     70    PHINode* phiBlockStart = b->CreatePHI(initBlockStart->getType(), 3);
     71    PHINode* phiBlockEnd = b->CreatePHI(initBlockEnd->getType(), 3);
     72    PHINode* sOffset = b->CreatePHI(previousOffset->getType(), 3);
    7373
    7474    phiIsCompressed->addIncoming(initIsCompressed, entryBlock);
     
    7878
    7979    // Store Output
    80     BasicBlock* storeOutputBlock = iBuilder->CreateBasicBlock("storeOutputBlock");
    81     BasicBlock * block_decoder_con = iBuilder->CreateBasicBlock("block_decoder_con_block");
     80    BasicBlock* storeOutputBlock = b->CreateBasicBlock("storeOutputBlock");
     81    BasicBlock * block_decoder_con = b->CreateBasicBlock("block_decoder_con_block");
    8282
    83     iBuilder->CreateUnlikelyCondBr(
    84             iBuilder->CreateAnd(
    85                     iBuilder->CreateICmpULE(phiBlockEnd, availableItemCount),
    86                     iBuilder->CreateNot(iBuilder->CreateICmpEQ(phiBlockEnd, INT64_0))
     83    b->CreateUnlikelyCondBr(
     84            b->CreateAnd(
     85                    b->CreateICmpULE(phiBlockEnd, availableItemCount),
     86                    b->CreateNot(b->CreateICmpEQ(phiBlockEnd, INT64_0))
    8787            ),
    8888            storeOutputBlock,
     
    9090    );
    9191
    92     iBuilder->SetInsertPoint(storeOutputBlock);
     92    b->SetInsertPoint(storeOutputBlock);
    9393
    94     appendOutput(iBuilder, phiIsCompressed, phiBlockStart, phiBlockEnd);
     94    appendOutput(b, phiIsCompressed, phiBlockStart, phiBlockEnd);
    9595
    9696
    97     phiIsCompressed->addIncoming(iBuilder->getInt8(0), storeOutputBlock);
     97    phiIsCompressed->addIncoming(b->getInt8(0), storeOutputBlock);
    9898    phiBlockStart->addIncoming(INT64_0, storeOutputBlock);
    9999    phiBlockEnd->addIncoming(INT64_0, storeOutputBlock);
    100100    sOffset->addIncoming(sOffset, storeOutputBlock);
    101101
    102     iBuilder->CreateBr(processCon);
     102    b->CreateBr(processCon);
    103103
    104104
    105105    // block decoder entry
    106     iBuilder->SetInsertPoint(block_decoder_con);
     106    b->SetInsertPoint(block_decoder_con);
    107107
    108     BasicBlock * block_decoder_body = iBuilder->CreateBasicBlock("block_decoder_body_block");
    109     BasicBlock * block_decoder_exit = iBuilder->CreateBasicBlock("block_decoder_exit_block");
     108    BasicBlock * block_decoder_body = b->CreateBasicBlock("block_decoder_body_block");
     109    BasicBlock * block_decoder_exit = b->CreateBasicBlock("block_decoder_exit_block");
    110110
    111     Value * reachFinalBlock = iBuilder->getScalarField("reachFinalBlock");
    112     iBuilder->CreateCondBr(
    113         iBuilder->CreateAnd(
    114             iBuilder->CreateICmpULT(sOffset, availableItemCount),
    115             iBuilder->CreateNot(reachFinalBlock)
     111    Value * reachFinalBlock = b->getScalarField("reachFinalBlock");
     112    b->CreateCondBr(
     113        b->CreateAnd(
     114            b->CreateICmpULT(sOffset, availableItemCount),
     115            b->CreateNot(reachFinalBlock)
    116116        ),
    117117        block_decoder_body,
     
    119119
    120120    //block_decoder_body
    121     iBuilder->SetInsertPoint(block_decoder_body);
    122     Value* currentBlockSize = iBuilder->getSize(0);
     121    b->SetInsertPoint(block_decoder_body);
     122    Value* currentBlockSize = b->getSize(0);
    123123    for (size_t i = 0; i < 4; i++) {
    124         Value * offset = iBuilder->CreateAdd(sOffset, iBuilder->getSize(i));
    125         Value * rawOffset = iBuilder->CreateZExt(generateLoadInput(iBuilder, offset), iBuilder->getSizeTy());
    126         currentBlockSize = iBuilder->CreateOr(currentBlockSize, iBuilder->CreateShl(rawOffset, iBuilder->getSize(8 * i)));
     124        Value * offset = b->CreateAdd(sOffset, b->getSize(i));
     125        Value * rawOffset = b->CreateZExt(generateLoadInput(b, offset), b->getSizeTy());
     126        currentBlockSize = b->CreateOr(currentBlockSize, b->CreateShl(rawOffset, b->getSize(8 * i)));
    127127    }
    128128
    129     Value * realBlockSize = iBuilder->CreateAnd(currentBlockSize, 0x7fffffff);
     129    Value * realBlockSize = b->CreateAnd(currentBlockSize, 0x7fffffff);
    130130
    131     Value * isCompressed = iBuilder->CreateNot(currentBlockSize);
    132     isCompressed = iBuilder->CreateLShr(isCompressed, 31);
    133     isCompressed = iBuilder->CreateTrunc(isCompressed, iBuilder->getInt1Ty());
     131    Value * isCompressed = b->CreateNot(currentBlockSize);
     132    isCompressed = b->CreateLShr(isCompressed, 31);
     133    isCompressed = b->CreateTrunc(isCompressed, b->getInt1Ty());
    134134
    135     Value * isFinalBlock = iBuilder->CreateICmpEQ(realBlockSize, iBuilder->getSize(0));
    136     iBuilder->setScalarField("reachFinalBlock", isFinalBlock);
     135    Value * isFinalBlock = b->CreateICmpEQ(realBlockSize, b->getSize(0));
     136    b->setScalarField("reachFinalBlock", isFinalBlock);
    137137
    138     Value * blockStart = iBuilder->CreateAdd(sOffset, iBuilder->getSize(4));
    139     Value * blockEnd = iBuilder->CreateAdd(blockStart, realBlockSize);
     138    Value * blockStart = b->CreateAdd(sOffset, b->getSize(4));
     139    Value * blockEnd = b->CreateAdd(blockStart, realBlockSize);
    140140
    141141    Value * newOffset = sOffset;
    142     newOffset = iBuilder->CreateAdd(newOffset, iBuilder->getSize(4)); // Block Size
    143     newOffset = iBuilder->CreateAdd(newOffset, realBlockSize); // Block Content
    144     Value * const blockChecksumOffset = iBuilder->CreateSelect(iBuilder->getScalarField("hasBlockChecksum"), iBuilder->getSize(4), iBuilder->getSize(0));
    145     newOffset = iBuilder->CreateAdd(newOffset, blockChecksumOffset);
     142    newOffset = b->CreateAdd(newOffset, b->getSize(4)); // Block Size
     143    newOffset = b->CreateAdd(newOffset, realBlockSize); // Block Content
     144    Value * const blockChecksumOffset = b->CreateSelect(b->getScalarField("hasBlockChecksum"), b->getSize(4), b->getSize(0));
     145    newOffset = b->CreateAdd(newOffset, blockChecksumOffset);
    146146
    147147    sOffset->addIncoming(newOffset, block_decoder_body);
    148     phiIsCompressed->addIncoming(isCompressed, block_decoder_body);
     148    phiIsCompressed->addIncoming(b->CreateZExt(isCompressed, b->getInt8Ty()), block_decoder_body);
    149149    phiBlockStart->addIncoming(blockStart, block_decoder_body);
    150150    phiBlockEnd->addIncoming(blockEnd, block_decoder_body);
    151     iBuilder->CreateBr(processCon);
     151    b->CreateBr(processCon);
    152152
    153153    // block_decoder_exit_block
    154     iBuilder->SetInsertPoint(block_decoder_exit);
    155     iBuilder->setScalarField("pendingIsCompressed", phiIsCompressed);
    156     iBuilder->setScalarField("pendingBlockStart", phiBlockStart);
    157     iBuilder->setScalarField("pendingBlockEnd", phiBlockEnd);
    158     iBuilder->setScalarField("previousOffset", sOffset);
    159     iBuilder->setProcessedItemCount("byteStream", availableItemCount);
    160     iBuilder->setTerminationSignal(mIsFinal);
     154    b->SetInsertPoint(block_decoder_exit);
     155    b->setScalarField("pendingIsCompressed", phiIsCompressed);
     156    b->setScalarField("pendingBlockStart", phiBlockStart);
     157    b->setScalarField("pendingBlockEnd", phiBlockEnd);
     158    b->setScalarField("previousOffset", sOffset);
     159    b->setProcessedItemCount("byteStream", availableItemCount);
     160    b->setTerminationSignal(mIsFinal);
    161161}
    162162
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_block_decoder.h

    r6026 r6081  
    2424    LZ4BlockDecoderNewKernel(const std::unique_ptr<kernel::KernelBuilder> &iBuilder, std::string&& kernelName = "LZ4BlockDecoderKernel");
    2525protected:
    26     void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & iBuilder) override;
     26    void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & b) override;
    2727private:
    2828    llvm::Value *generateLoadInput(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value *offset);
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_parallel_bytestream_aio.cpp

    r6080 r6081  
    1919namespace kernel{
    2020
    21     LZ4ParallelByteStreamAioKernel::LZ4ParallelByteStreamAioKernel(const std::unique_ptr<kernel::KernelBuilder> &b, size_t outputBlockSize)
     21    LZ4ParallelByteStreamAioKernel::LZ4ParallelByteStreamAioKernel(const std::unique_ptr<kernel::KernelBuilder> &b, bool enableGather, bool enableScatter, size_t outputBlockSize)
    2222            :SegmentOrientedKernel("LZ4ParallelByteStreamAioKernel",
    2323            // Inputs
     
    4545                                           Binding{b->getIntNTy(SIMD_WIDTH), "outputPos"},
    4646
    47                                    }), mOutputBlockSize(outputBlockSize) {
     47                                   }), mEnableGather(enableGather), mEnableScatter(enableScatter), mOutputBlockSize(outputBlockSize) {
    4848        this->setStride(4 * 1024 * 1024 * 4);
    4949        addAttribute(MustExplicitlyTerminate());
     
    5454        /*
    5555        // TODO incomplete
    56 
    5756        // Constant
    5857        Function *gatherFunc = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_avx2_gather_d_q_256); // TODO find ret <4 * i32> version
     
    102101        PHINode* phiTokenMarkersVec = b->CreatePHI(initTokeMarkersVec->getType(), 2);
    103102        phiTokenMarkersVec->addIncoming(initTokeMarkersVec, entryBlock);
    104 
    105 
    106 
    107103*/
    108104        return beginTokenPosVec;    //TODO
     
    115111    ) {
    116112        // Constant
    117         Value* BIT_BLOCK_0 = ConstantVector::getNullValue(b->getBitBlockType());
    118         Value* BIT_BLOCK_1 = b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 0x1));
    119         Value* BIT_BLOCK_F0 = b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 0xf0));
    120         Value* BIT_BLOCK_0F = b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 0x0f));
    121         Value* BIT_BLOCK_FF = b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 0xff));
    122         Value* BIT_BLOCK_FFFF = b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 0xffff));
    123         Type* INT_BIT_BLOCK_TY = b->getIntNTy(b->getBitBlockWidth());
    124         Constant* INT_BIT_BLOCK_TY_0 = b->getIntN(b->getBitBlockWidth(), 0);
     113        Value* const BIT_BLOCK_0 = ConstantVector::getNullValue(b->getBitBlockType());
     114        Value* const BIT_BLOCK_1 = b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 0x1));
     115        Value* const BIT_BLOCK_F0 = b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 0xf0));
     116        Value* const BIT_BLOCK_0F = b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 0x0f));
     117        Value* const BIT_BLOCK_FF = b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 0xff));
     118        Value* const BIT_BLOCK_FFFF = b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 0xffff));
     119        Constant* const INT_BIT_BLOCK_TY_0 = b->getIntN(b->getBitBlockWidth(), 0);
     120
     121        // Type
     122        Type* const BIT_BLOCK_TY = b->getBitBlockType();
     123        Type* const INT_BIT_BLOCK_TY = b->getIntNTy(b->getBitBlockWidth());
    125124
    126125        // ---- EntryBlock
     
    128127        BasicBlock* exitBlock = b->CreateBasicBlock("exitBlock");
    129128
    130         Value* notFinishBlocksVec = b->CreateICmpULT(beginTokenPosVec, lz4BlockEndVec);
    131         Value* notFinishBitBlock = b->CreateZExt(notFinishBlocksVec, b->getBitBlockType());
    132         Value* notFinishMask = b->CreateNeg(notFinishBitBlock);
    133 
     129        Value* notFinishMask = b->simd_ult(SIMD_WIDTH, beginTokenPosVec, lz4BlockEndVec);
    134130
    135131        Value* byteRawInputPtr = b->CreatePointerCast(b->getRawInputPointer("byteStream", b->getSize(0)), b->getInt8PtrTy());
    136132
    137         Value* firstTokenPos = b->CreateExtractElement(beginTokenPosVec, (uint64_t)0);
    138         Value* bytePtrBase = b->CreateGEP(byteRawInputPtr, firstTokenPos);
    139 
    140 
    141 
    142133        Value* tokenValuesVec = this->simdFetchByteData(b, byteRawInputPtr, beginTokenPosVec, notFinishMask);
    143134
    144         Value* shouldExtendLiteralVec = b->CreateICmpEQ(b->CreateAnd(BIT_BLOCK_F0, tokenValuesVec), BIT_BLOCK_F0);
    145         Value* shouldExtendLiteralBitBlockVec = b->CreateZExt(shouldExtendLiteralVec, b->getBitBlockType());
     135        Value* shouldExtendLiteralBitBlockVec = b->CreateZExt(b->CreateICmpEQ(b->CreateAnd(BIT_BLOCK_F0, tokenValuesVec), BIT_BLOCK_F0), b->getBitBlockType());
    146136        Value* shouldExtendLiteral = b->CreateICmpNE(b->CreateBitCast(shouldExtendLiteralBitBlockVec, INT_BIT_BLOCK_TY), INT_BIT_BLOCK_TY_0);
    147137
    148         Value* shouldExtendMatchVec = b->CreateICmpEQ(b->CreateAnd(BIT_BLOCK_0F, tokenValuesVec), BIT_BLOCK_0F);
    149         Value* shouldExtendMatchBitBlockVec = b->CreateZExt(shouldExtendMatchVec, b->getBitBlockType());
     138        Value* shouldExtendMatchBitBlockVec = b->CreateZExt(b->CreateICmpEQ(b->CreateAnd(BIT_BLOCK_0F, tokenValuesVec), BIT_BLOCK_0F), b->getBitBlockType());
    150139        Value* shouldExtendMatch = b->CreateICmpNE(b->CreateBitCast(shouldExtendMatchBitBlockVec, INT_BIT_BLOCK_TY), INT_BIT_BLOCK_TY_0);
    151140
    152         Value* initExtendLiteralPos = b->CreateAdd(beginTokenPosVec, shouldExtendLiteralBitBlockVec);
    153 
     141        Value* initExtendLiteralPos = b->simd_add(SIMD_WIDTH, beginTokenPosVec, shouldExtendLiteralBitBlockVec);
    154142
    155143        BasicBlock* extendLiteralCond = b->CreateBasicBlock("extendLiteralCond");
     
    157145
    158146        b->CreateCondBr(shouldExtendLiteral, extendLiteralCond, extendLiteralEnd);
    159 
    160147
    161148        // ---- extendLiteralCond
     
    164151        phiCurrentExtendLiteralPosVec->addIncoming(initExtendLiteralPos, entryBlock);
    165152
    166         PHINode* phiExtendLiteralLengthVec = b->CreatePHI(b->getBitBlockType(), 2);
     153        PHINode* phiExtendLiteralLengthVec = b->CreatePHI(BIT_BLOCK_TY, 2);
    167154        phiExtendLiteralLengthVec->addIncoming(BIT_BLOCK_0, entryBlock);
    168155
     
    170157        phiShouldExtendLiteralBitBlockVec->addIncoming(shouldExtendLiteralBitBlockVec, entryBlock);
    171158        Value* shouldExtendLiteralGatherMask = b->CreateNeg(phiShouldExtendLiteralBitBlockVec);
    172         shouldExtendLiteralGatherMask = b->CreateAnd(shouldExtendLiteralGatherMask, notFinishMask);
    173 //        b->CallPrintInt("a", b->getSize(0));
     159        shouldExtendLiteralGatherMask = b->simd_and(shouldExtendLiteralGatherMask, notFinishMask);
     160
    174161        // TODO maybe we can load i64 once and then consume 8 times
    175162        Value* currentLiteralLengthVec = this->simdFetchByteData(b, byteRawInputPtr, phiCurrentExtendLiteralPosVec, shouldExtendLiteralGatherMask);
    176163
    177         Value* newExtendLiteralLengthVec = b->CreateAdd(phiExtendLiteralLengthVec, currentLiteralLengthVec);
    178 
    179         Value* shouldContinueExtendVec = b->CreateICmpEQ(currentLiteralLengthVec, BIT_BLOCK_FF);
    180         Value* shouldContinueExtendVecBitBlock = b->CreateZExt(shouldContinueExtendVec, b->getBitBlockType());
    181 
     164        Value* newExtendLiteralLengthVec = b->simd_add(SIMD_WIDTH, phiExtendLiteralLengthVec, currentLiteralLengthVec);
     165        Value* shouldContinueExtendVecBitBlock = b->CreateZExt(b->CreateICmpEQ(currentLiteralLengthVec, BIT_BLOCK_FF), BIT_BLOCK_TY);
    182166        Value* newExtendLiteralPosVec = b->CreateAdd(phiCurrentExtendLiteralPosVec, b->CreateAnd(shouldExtendLiteralBitBlockVec, shouldContinueExtendVecBitBlock));
    183 
    184167
    185168        phiCurrentExtendLiteralPosVec->addIncoming(newExtendLiteralPosVec, b->GetInsertBlock());
    186169        phiExtendLiteralLengthVec->addIncoming(newExtendLiteralLengthVec, b->GetInsertBlock());
    187 
    188 
    189170        phiShouldExtendLiteralBitBlockVec->addIncoming(shouldContinueExtendVecBitBlock, b->GetInsertBlock());
     171
    190172        Value* shouldContinueExtendLiteral = b->CreateICmpNE(b->CreateBitCast(shouldContinueExtendVecBitBlock, INT_BIT_BLOCK_TY), INT_BIT_BLOCK_TY_0);
    191173
     
    204186
    205187        Value* literalLengthVec = b->simd_add(SIMD_WIDTH, literalExtendValueVec, b->simd_srlv(SIMD_WIDTH, tokenValuesVec, b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 4))));
    206 //        Value* literalLengthVec = b->CreateAdd(literalExtendValueVec, b->CreateLShr(tokenValuesVec, b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 4))));
    207 
    208         Value* literalStartPosVec = b->CreateAdd(phiExtendLiteralEndPos, BIT_BLOCK_1);
    209         Value* literalEndPosVec = b->CreateAdd(literalStartPosVec, literalLengthVec);
    210 
     188        Value* literalStartPosVec = b->simd_add(SIMD_WIDTH, phiExtendLiteralEndPos, BIT_BLOCK_1);
     189        Value* literalEndPosVec = b->simd_add(SIMD_WIDTH, literalStartPosVec, literalLengthVec);
    211190
    212191        this->handleSimdLiteralCopy(b, literalStartPosVec, literalLengthVec, initOutputPosVec);
    213         Value* outputPosAfterLiteralCpy = b->CreateAdd(initOutputPosVec, literalLengthVec);
     192        Value* outputPosAfterLiteralCpy = b->simd_add(SIMD_WIDTH, initOutputPosVec, literalLengthVec);
    214193
    215194
    216195        Value* matchOffsetBeginPosVec = literalEndPosVec;
    217196
    218         Value* matchOffsetNextPosVec = b->CreateAdd(matchOffsetBeginPosVec, BIT_BLOCK_1);
     197        Value* matchOffsetNextPosVec = b->simd_add(SIMD_WIDTH, matchOffsetBeginPosVec, BIT_BLOCK_1);
    219198
    220199
     
    223202        BasicBlock* extendMatchExit = b->CreateBasicBlock("extendMatchExit");
    224203
    225 
    226204        BasicBlock* extendLiteralEndFinal = b->GetInsertBlock();
    227205
    228         Value* hasMatchPartVec = b->CreateICmpULT(matchOffsetBeginPosVec, lz4BlockEndVec);
    229         Value* hasMatchPartBitBlock = b->CreateZExt(hasMatchPartVec, b->getBitBlockType());
    230         Value* hasMatchPartMask = b->CreateNeg(hasMatchPartBitBlock);
    231 
    232         b->CreateLikelyCondBr(b->CreateICmpNE(b->CreateBitCast(hasMatchPartBitBlock, INT_BIT_BLOCK_TY), INT_BIT_BLOCK_TY_0), hasMatchPartBlock, exitBlock);
     206        Value* hasMatchPartMask = b->simd_ult(SIMD_WIDTH, matchOffsetBeginPosVec, lz4BlockEndVec);
     207        b->CreateLikelyCondBr(b->CreateICmpNE(b->CreateBitCast(hasMatchPartMask, INT_BIT_BLOCK_TY), INT_BIT_BLOCK_TY_0), hasMatchPartBlock, exitBlock);
    233208
    234209        // ---- hasMatchPartBlock
    235210        b->SetInsertPoint(hasMatchPartBlock);
    236         Value* initExtendMatchPosVec = b->CreateAdd(matchOffsetNextPosVec, shouldExtendMatchBitBlockVec);
    237 //        b->CallPrintRegister("initExtendMatchPosVec", initExtendMatchPosVec);
     211        Value* initExtendMatchPosVec = b->simd_add(SIMD_WIDTH, matchOffsetNextPosVec, shouldExtendMatchBitBlockVec);
    238212        b->CreateCondBr(shouldExtendMatch, extendMatchCon, extendMatchExit);
    239213
     
    248222        phiShouldExtendMatchBitBlockVec->addIncoming(shouldExtendMatchBitBlockVec, hasMatchPartBlock);
    249223        Value* shouldExtendMatchGatherMask = b->CreateNeg(phiShouldExtendMatchBitBlockVec);
    250         shouldExtendMatchGatherMask = b->CreateAnd(shouldExtendMatchGatherMask, notFinishMask);
     224        shouldExtendMatchGatherMask = b->simd_and(shouldExtendMatchGatherMask, notFinishMask);
    251225        // TODO maybe we can load i64 once and then consume 8 times
    252226
    253227        Value* currentMatchLengthVec = this->simdFetchByteData(b, byteRawInputPtr, phiCurrentExtendMatchPosVec, shouldExtendMatchGatherMask);
    254228
    255         Value* newExtendMatchLengthVec = b->CreateAdd(phiExtendMatchLengthVec, currentMatchLengthVec);
    256 
    257 
    258         Value* shouldContinueExtendMatchVec = b->CreateICmpEQ(currentMatchLengthVec, BIT_BLOCK_FF);
    259         Value* shouldContinueExtendMatchVecBitBlock = b->CreateZExt(shouldContinueExtendMatchVec, b->getBitBlockType());
    260 
    261         Value* newExtendMatchPosVec = b->CreateAdd(phiCurrentExtendMatchPosVec, b->CreateAnd(shouldExtendMatchBitBlockVec, shouldContinueExtendMatchVecBitBlock));
    262 
     229        Value* newExtendMatchLengthVec = b->simd_add(SIMD_WIDTH, phiExtendMatchLengthVec, currentMatchLengthVec);
     230
     231        Value* shouldContinueExtendMatchVecBitBlock = b->CreateZExt(b->CreateICmpEQ(currentMatchLengthVec, BIT_BLOCK_FF), b->getBitBlockType());
     232        Value* newExtendMatchPosVec = b->simd_add(SIMD_WIDTH, phiCurrentExtendMatchPosVec, b->simd_and(shouldExtendMatchBitBlockVec, shouldContinueExtendMatchVecBitBlock));
    263233
    264234        phiCurrentExtendMatchPosVec->addIncoming(newExtendMatchPosVec, b->GetInsertBlock());
    265235        phiExtendMatchLengthVec->addIncoming(newExtendMatchLengthVec, b->GetInsertBlock());
    266 
    267236
    268237        phiShouldExtendMatchBitBlockVec->addIncoming(shouldContinueExtendMatchVecBitBlock, b->GetInsertBlock());
     
    284253
    285254        // matchLength = (size_t)token & 0xf + 4 + matchExtendValue
    286         Value* matchLength = b->CreateAdd(
    287                 b->CreateAdd(matchExtendValueVec, b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 4))),
    288                 b->CreateAnd(tokenValuesVec, BIT_BLOCK_0F)
    289         );
    290         matchLength = b->CreateAnd(matchLength, hasMatchPartMask);
    291 
     255        Value* matchLength = b->simd_add(
     256                SIMD_WIDTH,
     257                b->simd_add(SIMD_WIDTH, matchExtendValueVec, b->simd_fill(SIMD_WIDTH, b->getIntN(SIMD_WIDTH, 4))),
     258                b->simd_and(tokenValuesVec, BIT_BLOCK_0F)
     259        );
     260        matchLength = b->simd_and(matchLength, hasMatchPartMask);
    292261
    293262        Value* matchOffsetVec = this->simdFetchData(
    294263                b,
    295                 bytePtrBase,
    296                 b->CreateSub(matchOffsetBeginPosVec, b->simd_fill(SIMD_WIDTH, firstTokenPos)),
     264                byteRawInputPtr,
     265                matchOffsetBeginPosVec,
    297266                hasMatchPartMask
    298267        );
    299         matchOffsetVec = b->CreateAnd(matchOffsetVec, BIT_BLOCK_FFFF);
     268        matchOffsetVec = b->simd_and(matchOffsetVec, BIT_BLOCK_FFFF);
    300269
    301270        this->handleSimdMatchCopy(b, matchOffsetVec, matchLength, outputPosAfterLiteralCpy);
    302271
    303         Value* outputPosAfterMatchCpy = b->CreateAdd(outputPosAfterLiteralCpy, matchLength);
     272        Value* outputPosAfterMatchCpy = b->simd_add(SIMD_WIDTH, outputPosAfterLiteralCpy, matchLength);
    304273
    305274        BasicBlock* extendMatchExitFinal = b->GetInsertBlock();
     
    317286        phiNewOutputPos->addIncoming(outputPosAfterLiteralCpy, extendLiteralEndFinal);
    318287        phiNewOutputPos->addIncoming(outputPosAfterMatchCpy, extendMatchExitFinal);
    319 //        b->CallPrintRegister("phiBeforeTokenPos", phiBeforeTokenPos);
    320         Value* nextTokenPos = b->CreateAdd(phiBeforeTokenPos, BIT_BLOCK_1);
    321 //        b->CallPrintRegister("nextTokenPos", nextTokenPos);
     288        Value* nextTokenPos = b->simd_add(SIMD_WIDTH, phiBeforeTokenPos, BIT_BLOCK_1);
    322289        return std::make_pair(nextTokenPos, phiNewOutputPos);
    323290    }
     
    335302        Value* outputPos = b->getProducedItemCount("outputStream");
    336303        Value* initOutputPosVec = b->simd_fill(SIMD_WIDTH, outputPos);
    337         std::vector<Constant*> initOutputPos;
     304        std::vector<Constant*> initOutputOffset;
    338305        for (unsigned i = 0; i < b->getBitBlockWidth() / SIMD_WIDTH; i++) {
    339             initOutputPos.push_back(b->getIntN(SIMD_WIDTH, i * 4 * 1024 * 1024));
     306            initOutputOffset.push_back(b->getIntN(SIMD_WIDTH, i * 4 * 1024 * 1024));
    340307        }
    341308
    342         initOutputPosVec = b->CreateAdd(initOutputPosVec, ConstantVector::get(initOutputPos));
     309        initOutputPosVec = b->simd_add(SIMD_WIDTH, initOutputPosVec, ConstantVector::get(initOutputOffset));
    343310
    344311        // TODO handle uncompression blocks
     
    364331        b->SetInsertPoint(processBody);
    365332//        Value* newCursorVec = this->generateSimdAcceleration(b, phiCursorVec, blockEndVec);
    366         auto ret = this->simdProcessBlockBoundary(b, phiCursorVec, blockEndVec, phiOutputPosVec);;
    367         Value* newCursorVec = ret.first;
    368         Value* newOutputPosVec = ret.second;
    369 //        b->CallPrintInt("newOutputPosVec", b->CreateExtractElement(newOutputPosVec, (uint64_t)0));
     333
     334        Value *newCursorVec = nullptr, *newOutputPosVec = nullptr;
     335        std::tie(newCursorVec, newOutputPosVec) = this->simdProcessBlockBoundary(b, phiCursorVec, blockEndVec, phiOutputPosVec);
    370336
    371337        phiCursorVec->addIncoming(newCursorVec, b->GetInsertBlock());
     
    380346        Value* lastBlockEnd = b->CreateExtractElement(blockEndVec, lastVecIndex);
    381347        b->setProcessedItemCount("byteStream", lastBlockEnd);
    382 
    383348        Value* lastOutputPos = b->CreateExtractElement(phiOutputPosVec, lastVecIndex);
    384349        b->setProducedItemCount("outputStream", lastOutputPos);
    385 //        b->CallPrintRegister("phiOutputPosVec", phiOutputPosVec);
    386 
    387350    }
    388351
     
    669632    }
    670633
    671     void LZ4ParallelByteStreamAioKernel::generateSimdMatchCopyByMemcpy(const std::unique_ptr<KernelBuilder> &b, llvm::Value* matchOffsetVec, llvm::Value* matchLengthVec, llvm::Value* outputPosVec) {
    672         Value* outputCapacity = b->getCapacity("outputStream");
    673         Value* outputBasePtr = b->CreatePointerCast(b->getRawOutputPointer("outputStream", b->getSize(0)), b->getInt8PtrTy());
    674 
    675         for (uint64_t i = 0; i < b->getBitBlockWidth() / SIMD_WIDTH; i++) {
    676             BasicBlock* matchCopyConBlock = b->CreateBasicBlock("matchCopyConBlock" + std::to_string(i));
    677             BasicBlock* matchCopyBodyBlock = b->CreateBasicBlock("matchCopyBodyBlock" + std::to_string(i));
    678             BasicBlock* matchCopyExitBlock = b->CreateBasicBlock("matchCopyExitBlock" + std::to_string(i));
    679 
    680             BasicBlock* beforeConBlock = b->GetInsertBlock();
    681 
    682             Value* matchOffset = b->CreateExtractElement(matchOffsetVec, i);
    683             Value* initMatchLength = b->CreateExtractElement(matchLengthVec, i);
    684             Value* initOutputPos = b->CreateExtractElement(outputPosVec, i);
    685             b->CreateBr(matchCopyConBlock);
    686 
    687             // ---- matchCopyConBlock
    688             b->SetInsertPoint(matchCopyConBlock);
    689             PHINode* phiMatchLength = b->CreatePHI(initMatchLength->getType(), 2);
    690             phiMatchLength->addIncoming(initMatchLength, beforeConBlock);
    691             PHINode* phiOutputPos = b->CreatePHI(initOutputPos->getType(), 2);
    692             phiOutputPos->addIncoming(initOutputPos, beforeConBlock);
    693 
    694             b->CreateCondBr(b->CreateICmpUGT(phiMatchLength, b->getSize(0)), matchCopyBodyBlock, matchCopyExitBlock);
    695 
    696             // ---- matchCopyBodyBlock
    697             b->SetInsertPoint(matchCopyBodyBlock);
    698             Value* copySize = b->CreateUMin(phiMatchLength, matchOffset);
    699             Value* copyFromPos = b->CreateSub(phiOutputPos, matchOffset);
    700 
    701             b->CreateMemCpy(
    702                     b->CreateGEP(outputBasePtr, b->CreateURem(phiOutputPos, outputCapacity)),
    703                     b->CreateGEP(outputBasePtr, b->CreateURem(copyFromPos, outputCapacity)),
    704                     copySize,
    705                     1
    706             );
    707 
    708             phiMatchLength->addIncoming(b->CreateSub(phiMatchLength, copySize), b->GetInsertBlock());
    709             phiOutputPos->addIncoming(b->CreateAdd(phiOutputPos, copySize), b->GetInsertBlock());
    710 
    711             b->CreateBr(matchCopyConBlock);
    712 
    713             // ---- matchCopyExitBlock
    714             b->SetInsertPoint(matchCopyExitBlock);
    715         }
    716     }
    717 
    718634    void LZ4ParallelByteStreamAioKernel::generateSimdSequentialMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value* matchOffsetVec, llvm::Value* matchLengthVec, llvm::Value* outputPosVec) {
    719635
     
    773689
    774690    void LZ4ParallelByteStreamAioKernel::handleSimdMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value* matchOffsetVec, llvm::Value* matchLengthVec, llvm::Value* outputPosVec) {
    775 //        this->generateSimdMatchCopyByMemcpy(b, matchOffsetVec, matchLengthVec, outputPosVec);
    776691        this->generateSimdSequentialMatchCopy(b, matchOffsetVec, matchLengthVec, outputPosVec);
    777692    }
    778693
    779694    void LZ4ParallelByteStreamAioKernel::handleSimdLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value* literalStartVec, llvm::Value* literalLengthVec, llvm::Value* outputPosVec) {
    780         this->generateSimdLiteralCopyByScatter(b, literalStartVec, literalLengthVec, outputPosVec);
    781 //        this->generateSimdSequentialLiteralCopy(b, literalStartVec, literalLengthVec, outputPosVec);
    782 //        this->generateSequentialLiteralCopyWithSimdCalculation(b, literalStartVec, literalLengthVec, outputPosVec);
    783 //        this->generateLiteralCopyByMemcpy(b, literalStartVec, literalLengthVec, outputPosVec);
    784     }
    785 
    786     void LZ4ParallelByteStreamAioKernel::generateSimdLiteralCopyByMemcpy(const std::unique_ptr<KernelBuilder> &b,
    787                                                                          llvm::Value *literalStartVec,
    788                                                                          llvm::Value *literalLengthVec,
    789                                                                          llvm::Value *outputPosVec) {
    790         // This function is slower than the other literal-copy functions; it exists only for performance testing.
    791         Value* outputCapacity = b->getCapacity("outputStream");
    792         Value* outputPosRemVec = b->simd_and(outputPosVec, b->simd_fill(SIMD_WIDTH, b->simd_not(b->CreateNeg(outputCapacity))));
    793 
    794         Value* inputBasePtr = b->CreatePointerCast(b->getRawInputPointer("byteStream", b->getSize(0)), b->getInt8PtrTy());
    795         Value* outputBasePtr = b->CreatePointerCast(b->getRawOutputPointer("outputStream", b->getSize(0)), b->getInt8PtrTy());
    796 
    797         for (uint64_t i = 0; i < b->getBitBlockWidth() / SIMD_WIDTH; i++) {
    798             Value* literalStart = b->CreateExtractElement(literalStartVec, i);
    799             Value* literalLength = b->CreateExtractElement(literalLengthVec, i);
    800             Value* outputPosRem = b->CreateExtractElement(outputPosRemVec, i);;
    801             b->CreateMemCpy(
    802                     b->CreateGEP(outputBasePtr, outputPosRem),
    803                     b->CreateGEP(inputBasePtr, literalStart),
    804                     literalLength,
    805                     1
    806             );
     695        if (AVX512BW_available() && mEnableScatter) {
     696            this->generateSimdLiteralCopyByScatter(b, literalStartVec, literalLengthVec, outputPosVec);
     697        } else {
     698            this->generateSimdSequentialLiteralCopy(b, literalStartVec, literalLengthVec, outputPosVec);
    807699        }
    808700    }
     
    989881        BasicBlock* i64LiteralCopyBlock = b->CreateBasicBlock("i64LiteralCopyBlock");
    990882        BasicBlock* i8LiteralCopyBlock = b->CreateBasicBlock("i8LiteralCopyBlock");
    991 
    992 
    993 
    994 
    995883
    996884        llvm::Value* initCopiedLength = ConstantVector::getNullValue(literalLengthVec->getType());
     
    11761064    llvm::Value* LZ4ParallelByteStreamAioKernel::simdFetchData(const std::unique_ptr<KernelBuilder> &b, llvm::Value* basePtr, llvm::Value* offsetVec, llvm::Value* mask) {
    11771065//        return this->simdFetchDataByLoop(b, basePtr, offsetVec, mask);
    1178         if (AVX2_available()) {
     1066        if (AVX2_available() && mEnableGather) {
    11791067            return this->simdFetchI32DataByGather(b, basePtr, offsetVec, mask);
    11801068        } else {
     
    12261114        if (AVX512BW_available()) {
    12271115            // AVX512 gather uses an i8 mask
    1228             //declare <8 x double> @llvm.x86.avx512.gather.dpd.512(<8 x double>, i8*, <8 x i32>, i8, i32) #1
     1116            //declare <8 x double> @llvm.x86.avx512.gather.dpq.512(<8 x i64>, i8*, <8 x i32>, i8, i32) #1
    12291117            Function *gatherFunc512 = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_avx512_gather_dpd_512);
    12301118            return b->CreateCall(
     
    12641152            Value* mask = b->CreateExtractElement(maskVec, i);
    12651153            Value* shouldLoad = b->CreateICmpNE(mask, b->getInt64(0));
    1266             Value* loadPtr = b->CreateSelect(shouldLoad, b->CreateGEP(basePtr, b->CreateExtractElement(offsetVec, i)), basePtr);
     1154//            Value* loadPtr = b->CreateSelect(shouldLoad, b->CreateGEP(basePtr, b->CreateExtractElement(offsetVec, i)), basePtr);
     1155            Value* loadPtr = b->CreateGEP(basePtr, b->CreateExtractElement(offsetVec, i));
    12671156            Value* loadValue = b->CreateZExt(b->CreateLoad(b->CreatePointerCast(loadPtr, b->getInt64Ty()->getPointerTo())), b->getInt64Ty());
    12681157            Value* finalValue = b->CreateSelect(shouldLoad, loadValue, b->getInt64(0));
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_parallel_bytestream_aio.h

    r6080 r6081  
    2525    public:
    2626        // By default, output block size in LZ4 is 4MB
    27         LZ4ParallelByteStreamAioKernel(const std::unique_ptr<kernel::KernelBuilder> &b, size_t outputBlockSize = 4 * 1024 * 1024 );
     27        LZ4ParallelByteStreamAioKernel(const std::unique_ptr<kernel::KernelBuilder> &b, bool enableGather = true, bool enableScatter = true, size_t outputBlockSize = 4 * 1024 * 1024 );
    2828
    2929    protected:
     
    6161        void generateSimdLiteralCopyByScatter(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStartVec,
    6262                                              llvm::Value *literalLengthVec, llvm::Value *outputPosVec);
    63         void generateSimdLiteralCopyByMemcpy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStartVec,
    64                                              llvm::Value *literalLengthVec, llvm::Value *outputPosVec);
    6563
    6664        void generateOverwritingMemcpy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *inputBasePtr,
     
    7270
    7371        void handleSimdMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value* matchOffsetVec, llvm::Value* matchLengthVec, llvm::Value* outputPosVec);
    74         void generateSimdMatchCopyByMemcpy(const std::unique_ptr<KernelBuilder> &b, llvm::Value* matchOffsetVec, llvm::Value* matchLengthVec, llvm::Value* outputPosVec);
    7572        void generateSimdSequentialMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value* matchOffsetVec, llvm::Value* matchLengthVec, llvm::Value* outputPosVec);
    7673
     
    9592
    9693        size_t mOutputBlockSize;
     94        bool mEnableGather;
     95        bool mEnableScatter;
    9796    };
    9897
  • icGREP/icgrep-devel/icgrep/lz4/LZ4Generator.cpp

    r6070 r6081  
    465465}
    466466
    467 parabix::StreamSetBuffer * LZ4Generator::generateParallelAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
     467parabix::StreamSetBuffer * LZ4Generator::generateParallelAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, bool enableGather, bool enableScatter) {
    468468    //// Decode Block Information
    469469    StreamSetBuffer * const BlockData_IsCompressed = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getInputBufferBlocks(iBuilder), 1);
     
    484484    StreamSetBuffer * const decompressionByteStream = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks(iBuilder), 1);
    485485
    486     Kernel* lz4AioK = mPxDriver.addKernelInstance<LZ4ParallelByteStreamAioKernel>(iBuilder);
     486    Kernel* lz4AioK = mPxDriver.addKernelInstance<LZ4ParallelByteStreamAioKernel>(iBuilder, enableGather, enableScatter);
    487487    lz4AioK->setInitialArguments({mFileSize});
    488488    mPxDriver.makeKernelCall(
  • icGREP/icgrep-devel/icgrep/lz4/LZ4Generator.h

    r6066 r6081  
    4141    virtual void generateLoadByteStreamAndBitStream(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    4242    virtual void generateExtractAndDepositMarkers(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    43     virtual parabix::StreamSetBuffer * generateParallelAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     43    virtual parabix::StreamSetBuffer * generateParallelAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, bool enableGather, bool enableScatter);
    4444    virtual parabix::StreamSetBuffer * generateAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    4545    virtual parabix::StreamSetBuffer * generateSwizzledAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
  • icGREP/icgrep-devel/icgrep/lz4/LZ4GrepGenerator.cpp

    r6080 r6081  
    606606}
    607607
    608 void LZ4GrepGenerator::generateParallelAioPipeline(re::RE* regex) {
     608void LZ4GrepGenerator::generateParallelAioPipeline(re::RE* regex, bool enableGather, bool enableScatter) {
    609609    auto & iBuilder = mPxDriver.getBuilder();
    610610    this->generateCountOnlyMainFunc(iBuilder);
    611611
    612612    this->generateLoadByteStream(iBuilder);
    613     parabix::StreamSetBuffer * decompressedByteStream = this->generateParallelAIODecompression(iBuilder);
     613    parabix::StreamSetBuffer * decompressedByteStream = this->generateParallelAIODecompression(iBuilder, enableGather, enableScatter);
    614614
    615615
  • icGREP/icgrep-devel/icgrep/lz4/LZ4GrepGenerator.h

    r6064 r6081  
    3939
    4040    void generateAioPipeline(re::RE* regex);
    41     void generateParallelAioPipeline(re::RE* regex);
     41    void generateParallelAioPipeline(re::RE* regex, bool enableGather, bool enableScatter);
    4242
    4343    ScanMatchGrepMainFunctionType getScanMatchGrepMainFunction();
  • icGREP/icgrep-devel/icgrep/lz4_grep.cpp

    r6065 r6081  
    5252static cl::opt<bool> parallelDecompression("parallel-decompression", cl::desc("Use parallel Approach for LZ4 Decompression"), cl::init(false), cl::cat(lz4GrepDebugFlags));
    5353static cl::opt<bool> swizzledDecompression("swizzled-decompression", cl::desc("Use swizzle approach for decompression"), cl::init(false), cl::cat(lz4GrepDebugFlags));
    54 static cl::opt<bool> enableGather("enable-gather", cl::desc("Enable gather intrinsics for bitstream PDEP"), cl::init(false), cl::cat(lz4GrepDebugFlags));
     54static cl::opt<bool> enableGather("enable-gather", cl::desc("Enable gather intrinsics"), cl::init(false), cl::cat(lz4GrepDebugFlags));
     55static cl::opt<bool> enableScatter("enable-scatter", cl::desc("Enable scatter intrinsics"), cl::init(false), cl::cat(lz4GrepDebugFlags));
    5556
    5657
     
    7980    if (aio) {
    8081        if (parallelDecompression) {
    81             g.generateParallelAioPipeline(re_ast);
     82            g.generateParallelAioPipeline(re_ast, enableGather, enableScatter);
    8283        } else if (enableMultiplexing) {
    8384            g.generateMultiplexingSwizzledAioPipeline2(re_ast);
Note: See TracChangeset for help on using the changeset viewer.