Changeset 6132


Ignore:
Timestamp:
Jul 23, 2018, 4:56:33 AM (3 months ago)
Author:
xwa163
Message:
  1. More experiments on lz4 grep
  2. Improve performance of lzparabix grep
Location:
icGREP/icgrep-devel/icgrep
Files:
2 added
20 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r6123 r6132  
    103103add_library(UCDlib UCD/CaseFolding.cpp utf8_encoder.cpp utf16_encoder.cpp UCD/ucd_compiler.cpp UCD/PropertyObjects.cpp UCD/resolve_properties.cpp)
    104104add_library(GrepEngine  ${GREP_CORE_SRC} grep/grep_engine.cpp kernels/cc_kernel.cpp kernels/cc_scan_kernel.cpp kernels/charclasses.cpp kernels/streams_merge.cpp kernels/until_n.cpp kernels/UCD_property_kernel.cpp kernels/grapheme_kernel.cpp)
    105 add_library(LZ4_Lib lz4FrameDecoder.cpp kernels/cc_kernel.cpp kernels/lz4/lz4_deposit_uncompressed.cpp kernels/lz4/lz4_generate_deposit_stream.cpp kernels/pdep_kernel.cpp lz4/LZ4Generator.cpp kernels/lz4/lz4_block_decoder.cpp kernels/lz4/lz4_index_builder.cpp kernels/lz4/lz4_swizzled_match_copy_kernel.cpp kernels/bitstream_pdep_kernel.cpp kernels/bitstream_gather_pdep_kernel.cpp kernels/swizzled_multiple_pdep_kernel.cpp kernels/lz4/lz4_bitstream_not_kernel.cpp kernels/lz4/lz4_bitstream_match_copy_kernel.cpp kernels/fake_stream_generating_kernel.cpp kernels/lz4/aio/lz4_bytestream_aio.cpp kernels/lz4/aio/lz4_swizzled_aio.cpp kernels/lz4/aio/lz4_parallel_bytestream_aio.cpp kernels/lz4/aio/lz4_sequential_aio_base.cpp kernels/lz4/aio/lz4_sequential_aio_base.h kernels/lz4/aio/lz4_bitstream_aio.cpp kernels/lz4/aio/lz4_bitstream_aio.h)
     105add_library(LZ4_Lib lz4FrameDecoder.cpp kernels/cc_kernel.cpp kernels/lz4/lz4_deposit_uncompressed.cpp kernels/lz4/lz4_generate_deposit_stream.cpp kernels/pdep_kernel.cpp lz4/LZ4Generator.cpp kernels/lz4/lz4_block_decoder.cpp kernels/lz4/lz4_index_builder.cpp kernels/lz4/lz4_swizzled_match_copy_kernel.cpp kernels/bitstream_pdep_kernel.cpp kernels/bitstream_gather_pdep_kernel.cpp kernels/swizzled_multiple_pdep_kernel.cpp kernels/lz4/lz4_bitstream_not_kernel.cpp kernels/lz4/lz4_bitstream_match_copy_kernel.cpp kernels/fake_stream_generating_kernel.cpp kernels/lz4/aio/lz4_bytestream_aio.cpp kernels/lz4/aio/lz4_swizzled_aio.cpp kernels/lz4/aio/lz4_parallel_bytestream_aio.cpp kernels/lz4/aio/lz4_sequential_aio_base.cpp kernels/lz4/aio/lz4_sequential_aio_base.h kernels/lz4/aio/lz4_bitstream_aio.cpp kernels/lz4/aio/lz4_bitstream_aio.h kernels/lz4/aio/lz4_i4_bytestream_aio.cpp kernels/lz4/aio/lz4_i4_bytestream_aio.h)
    106106add_library(LZParabix_Lib lzparabix/LZParabixGenerator.cpp kernels/lzparabix/decoder/LZParabixBlockDecoder.cpp kernels/lzparabix/decoder/LZParabixBlockDecoder.h kernels/lzparabix/decoder/LZParabixAioKernel.cpp kernels/lzparabix/decoder/LZParabixAioKernel.h lzparabix/LZParabixGrepGenerator.cpp lzparabix/LZParabixGrepGenerator.h kernels/fake_stream_generating_kernel.cpp kernels/lzparabix/encoder/LZParabixCompressionKernel.cpp kernels/lzparabix/encoder/LZParabixCompressionKernel.h kernels/lzparabix/decoder/LZParabixLiteralDecoderKernel.cpp kernels/lzparabix/decoder/LZParabixLiteralDecoderKernel.h)
    107107
  • icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_bitstream_aio.cpp

    r6118 r6132  
    1717                                                 std::vector<unsigned> numsOfBitStreams,
    1818                                                 unsigned blockSize)
    19     : LZ4SequentialAioBaseKernel(b, "LZ4ByteStreamAioKernel", blockSize),
     19    : LZ4SequentialAioBaseKernel(b, "LZ4BitStreamAioKernel", blockSize),
    2020      mNumsOfBitStreams(numsOfBitStreams)
    2121    {
     
    4040
    4141    void LZ4BitStreamAioKernel::doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart,
    42                                               llvm::Value *literalLength) {
     42                                              llvm::Value *literalLength, llvm::Value* blockStart) {
    4343        // Constant
    4444        ConstantInt* INT_64_0 = b->getInt64(0);
  • icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_bitstream_aio.h

    r6118 r6132  
    1212    protected:
    1313        virtual void doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart,
    14                                    llvm::Value *literalLength) override;
     14                                   llvm::Value *literalLength, llvm::Value* blockStart) override;
    1515        virtual void doMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset,
    1616                                 llvm::Value *matchLength) override;
  • icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_bytestream_aio.cpp

    r6111 r6132  
    1515
    1616namespace kernel{
     17    std::string LZ4ByteStreamAioKernel::getCopyByteStreamName() {
     18        return mCopyOtherByteStream ? "targetByteStream" : "byteStream";
     19    }
    1720
    18     LZ4ByteStreamAioKernel::LZ4ByteStreamAioKernel(const std::unique_ptr<kernel::KernelBuilder> &b, unsigned blockSize)
    19             : LZ4SequentialAioBaseKernel(b, "LZ4ByteStreamAioKernel", blockSize) {
     21    LZ4ByteStreamAioKernel::LZ4ByteStreamAioKernel(const std::unique_ptr<kernel::KernelBuilder> &b, bool copyOtherByteStream, unsigned blockSize)
     22            : LZ4SequentialAioBaseKernel(b, "LZ4ByteStreamAioKernel", blockSize),
     23              mCopyOtherByteStream(copyOtherByteStream) {
    2024        mStreamSetOutputs.push_back(Binding{b->getStreamSetTy(1, 8), "outputStream", BoundedRate(0, 1)});
     25        this->addScalar(b->getInt8PtrTy(), "temporaryInputPtr");
     26        if (copyOtherByteStream) {
     27            mStreamSetInputs.push_back(Binding{b->getStreamSetTy(1, 8), "targetByteStream", RateEqualTo("byteStream")});
     28        }
    2129    }
    2230
    2331    void LZ4ByteStreamAioKernel::doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart,
    24                                                llvm::Value *literalLength) {
     32                                               llvm::Value *literalLength, llvm::Value* blockStart) {
    2533        unsigned fw = 64;
    2634        Type* INT_FW_PTR = b->getIntNTy(fw)->getPointerTo();
    2735
    28         Value* inputBytePtr = b->getRawInputPointer("byteStream", literalStart);
     36        Value* inputBytePtr = b->getScalarField("temporaryInputPtr");
     37        inputBytePtr = b->CreateGEP(inputBytePtr, b->CreateSub(literalStart, blockStart));
     38
    2939        Value* inputPtr = b->CreatePointerCast(inputBytePtr, INT_FW_PTR);
    3040
     
    3343        Value* outputPtr = b->getRawOutputPointer("outputStream", b->CreateURem(outputPos, outputBufferSize));
    3444        outputPtr = b->CreatePointerCast(outputPtr, INT_FW_PTR);
    35 
    36         // We can always assume that we have enough output buffer based on our output buffer allocation strategy (except in extract only case)
    3745
    3846        BasicBlock* entryBlock = b->GetInsertBlock();
     
    120128    }
    121129
     130    void LZ4ByteStreamAioKernel::initializationMethod(const std::unique_ptr<KernelBuilder> &b) {
     131        b->setScalarField("temporaryInputPtr", b->CreateMalloc(b->getSize(mBlockSize)));
     132    }
     133
     134    void LZ4ByteStreamAioKernel::prepareProcessBlock(const std::unique_ptr<KernelBuilder> &b, llvm::Value* blockStart, llvm::Value* blockEnd) {
     135        Value* rawInputPtr = b->CreatePointerCast(b->getRawInputPointer(this->getCopyByteStreamName(), b->getSize(0)), b->getInt8PtrTy());
     136        Value* inputCapacity = b->getCapacity(this->getCopyByteStreamName());
     137
     138        Value* blockStartRem = b->CreateURem(blockStart, inputCapacity);
     139        Value* remSize = b->CreateSub(inputCapacity, blockStartRem);
     140
     141        Value* blockSize = b->CreateSub(blockEnd, blockStart);
     142
     143        Value* copySize1 = b->CreateUMin(remSize, blockSize);
     144        Value* copySize2 = b->CreateSub(blockSize, copySize1);
     145
     146        Value* temporayInputPtr = b->getScalarField("temporaryInputPtr");
     147
     148        b->CreateMemCpy(temporayInputPtr, b->CreateGEP(rawInputPtr, blockStartRem), copySize1, 1);
     149        b->CreateMemCpy(b->CreateGEP(temporayInputPtr, copySize1), rawInputPtr, copySize2, 1);
     150    }
     151
     152    void LZ4ByteStreamAioKernel::beforeTermination(const std::unique_ptr<KernelBuilder> &b) {
     153        b->CreateFree(b->getScalarField("temporaryInputPtr"));
     154//        b->CallPrintInt("beforeTermination", b->getSize(0));
     155    }
     156
    122157}
  • icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_bytestream_aio.h

    r6111 r6132  
    99    class LZ4ByteStreamAioKernel : public LZ4SequentialAioBaseKernel {
    1010    public:
    11         LZ4ByteStreamAioKernel(const std::unique_ptr<kernel::KernelBuilder> &b, unsigned blockSize = 4 * 1024 * 1024);
     11        LZ4ByteStreamAioKernel(const std::unique_ptr<kernel::KernelBuilder> &b, bool copyOtherByteStream = false, unsigned blockSize = 4 * 1024 * 1024);
     12
    1213
    1314    protected:
    1415        virtual void doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart,
    15                                    llvm::Value *literalLength) override;
     16                                   llvm::Value *literalLength, llvm::Value* blockStart) override;
    1617        virtual void doMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset,
    1718                                 llvm::Value *matchLength) override;
    1819        virtual void setProducedOutputItemCount(const std::unique_ptr<KernelBuilder> &b, llvm::Value* produced) override;
     20
     21        virtual void initializationMethod(const std::unique_ptr<KernelBuilder> &b) override;
     22        virtual void prepareProcessBlock(const std::unique_ptr<KernelBuilder> &b, llvm::Value* blockStart, llvm::Value* blockEnd) override;
     23        virtual void beforeTermination(const std::unique_ptr<KernelBuilder> &b) override;
     24
     25    private:
     26        inline std::string getCopyByteStreamName();
     27        bool mCopyOtherByteStream;
     28
    1929    };
    2030
  • icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_sequential_aio_base.cpp

    r6118 r6132  
    2727                    Binding{b->getStreamSetTy(1, 64), "blockStart", RateEqualTo("isCompressed"), AlwaysConsume()},
    2828                    Binding{b->getStreamSetTy(1, 64), "blockEnd", RateEqualTo("isCompressed"), AlwaysConsume()}
    29 
    3029            },
    3130            //Outputs
     
    4342                                           Binding{b->getInt64Ty(), "outputPos"},
    4443
    45 
    46                                    }){
     44                                           Binding{b->getInt1Ty(), "hasCallInitialization"}
     45
     46
     47                                   }),
     48             mBlockSize(blockSize) {
    4749        this->setStride(blockSize);
    4850        addAttribute(MustExplicitlyTerminate());
     
    5153    // ---- Kernel Methods
    5254    void LZ4SequentialAioBaseKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> &b) {
     55        Value* hasCallInitialization = b->getScalarField("hasCallInitialization");
     56
     57        BasicBlock* initializationBlock = b->CreateBasicBlock("initializationBlock");
     58        BasicBlock* entryBlock = b->CreateBasicBlock("entryBlock");
    5359        BasicBlock* exitBlock = b->CreateBasicBlock("exitBlock");
     60
     61        b->CreateLikelyCondBr(hasCallInitialization, entryBlock, initializationBlock);
     62
     63        // ---- initializationBlock
     64        b->SetInsertPoint(initializationBlock);
     65        b->setScalarField("hasCallInitialization", b->getInt1(true));
     66        this->initializationMethod(b);
     67        b->CreateBr(entryBlock);
     68
     69        // ---- entryBlock
     70        b->SetInsertPoint(entryBlock);
    5471        BasicBlock* blockEndConBlock = b->CreateBasicBlock("blockEndConBlock");
    5572
     
    7289
    7390        b->SetInsertPoint(processBlock);
    74 
    7591        //TODO handle uncompressed block
     92        this->prepareProcessBlock(b, blockStart, blockEnd);
     93
    7694        this->processCompressedLz4Block(b, blockStart, blockEnd);
     95
    7796        this->storePendingOutput(b);
    7897
     
    87106        b->CreateBr(exitBlock);
    88107
     108        // ---- exitBlock
    89109        b->SetInsertPoint(exitBlock);
     110
     111        BasicBlock* beforeTerminationBlock = b->CreateBasicBlock("beforeTerminationBlock");
     112        BasicBlock* terminationBlock = b->CreateBasicBlock("terminationBlock");
     113
     114        b->CreateUnlikelyCondBr(b->getTerminationSignal(), beforeTerminationBlock, terminationBlock);
     115
     116        // ---- beforeTerminationBlock
     117        b->SetInsertPoint(beforeTerminationBlock);
     118        this->beforeTermination(b);
     119        b->CreateBr(terminationBlock);
     120
     121        // ---- terminationBlock
     122        b->SetInsertPoint(terminationBlock);
    90123    }
    91124
     
    116149        b->SetInsertPoint(processBody);
    117150        /*
    118         auto accelerationRet = this->doAcceleration(b, phiCursorValue, lz4BlockEnd);
     151        auto accelerationRet = this->doAcceleration(b, phiCursorValue, lz4BlockStart, lz4BlockEnd);
    119152        Value* tokenMarkers = accelerationRet.first.first;
    120153
     
    125158        nextTokenGlobalPos = this->processLz4Sequence(b, nextTokenGlobalPos, lz4BlockEnd);
    126159        */
    127         Value* nextTokenGlobalPos = this->processLz4Sequence(b, phiCursorValue, lz4BlockEnd);
     160        Value* nextTokenGlobalPos = this->processLz4Sequence(b, phiCursorValue, lz4BlockStart, lz4BlockEnd);
    128161        phiCursorValue->addIncoming(nextTokenGlobalPos, b->GetInsertBlock());
    129162        b->CreateBr(processCon);
     
    133166
    134167    std::pair<std::pair<llvm::Value *, llvm::Value *>, llvm::Value *>
    135     LZ4SequentialAioBaseKernel::doAcceleration(const std::unique_ptr<KernelBuilder> &b, llvm::Value *beginTokenPos,
    136                                      llvm::Value *blockEnd) {
     168    LZ4SequentialAioBaseKernel::doAcceleration(
     169            const std::unique_ptr<KernelBuilder> &b,
     170            llvm::Value *beginTokenPos,
     171            llvm::Value *blockStart,
     172            llvm::Value *blockEnd) {
    137173        BasicBlock* entryBlock = b->GetInsertBlock();
    138174
     
    229265        // TODO all of the literal data here will always be in the same 64-bit literal block, it may be better if we provide
    230266        //      this information to the literal copy method, especially when we are working with swizzled form
    231         this->doAccelerationLiteralCopy(b, literalStartGlobalPos, literalLength);
     267        this->doAccelerationLiteralCopy(b, literalStartGlobalPos, literalLength, blockStart);
    232268        this->doAccelerationMatchCopy(b, matchOffset, matchLength);
    233269
     
    246282    }
    247283
    248     llvm::Value *LZ4SequentialAioBaseKernel::processLz4Sequence(const std::unique_ptr<KernelBuilder> &b,
    249                                                       llvm::Value *beginTokenPos,
    250                                                       llvm::Value *lz4BlockEnd) {
     284    llvm::Value *LZ4SequentialAioBaseKernel::processLz4Sequence(
     285            const std::unique_ptr<KernelBuilder> &b,
     286            llvm::Value *beginTokenPos,
     287            llvm::Value *lz4BlockStart,
     288            llvm::Value *lz4BlockEnd) {
    251289        // Constant
    252290        ConstantInt* SIZE_0 = b->getSize(0);
     
    307345
    308346        // This literal copy will always cross 64 bits literal boundary
    309         this->doLiteralCopy(b, literalStartPos, literalLength);
     347        this->doLiteralCopy(b, literalStartPos, literalLength, lz4BlockStart);
    310348        BasicBlock* extendLiteralEndFinal = b->GetInsertBlock();
    311349
  • icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_sequential_aio_base.h

    r6118 r6132  
    2828    // ---- Constant
    2929    const static unsigned int ACCELERATION_WIDTH = 64;
     30    const unsigned mBlockSize;
    3031
    3132    // ---- Kernel Methods
     
    3637                                   llvm::Value *lz4BlockEnd);
    3738
    38     std::pair<std::pair<llvm::Value *, llvm::Value *>, llvm::Value *>
    39     doAcceleration(const std::unique_ptr<KernelBuilder> &b, llvm::Value *beginTokenPos,
    40                    llvm::Value *blockEnd);
     39    std::pair<std::pair<llvm::Value *, llvm::Value *>, llvm::Value *> doAcceleration(
     40            const std::unique_ptr<KernelBuilder> &b,
     41            llvm::Value *beginTokenPos,
     42            llvm::Value *blockStart,
     43            llvm::Value *blockEnd);
    4144
    4245
    43     virtual llvm::Value *processLz4Sequence(const std::unique_ptr<KernelBuilder> &b,
    44                                     llvm::Value *beginTokenPos, llvm::Value *lz4BlockEnd);
     46    virtual llvm::Value *processLz4Sequence(
     47            const std::unique_ptr<KernelBuilder> &b,
     48            llvm::Value *beginTokenPos,
     49            llvm::Value *lz4BlockStart,
     50            llvm::Value *lz4BlockEnd
     51    );
    4552
    4653    std::pair<llvm::Value*, llvm::Value*> parseMatchInfo(const std::unique_ptr<KernelBuilder> &b, llvm::Value* matchOffsetBeginPos, llvm::Value* tokenValue);
     
    8693    // ---- Methods To Be Override
    8794
     95    virtual void initializationMethod(const std::unique_ptr<KernelBuilder> &b){};
     96    virtual void prepareProcessBlock(const std::unique_ptr<KernelBuilder> &b, llvm::Value* blockStart, llvm::Value* blockEnd){};
     97    virtual void beforeTermination(const std::unique_ptr<KernelBuilder> &b){};
     98
    8899
    89100    virtual void doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart,
    90                                llvm::Value *literalLength) = 0;
     101                               llvm::Value *literalLength, llvm::Value* blockStart) = 0;
    91102
    92103    virtual void doMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset,
     
    98109    virtual void prepareAcceleration(const std::unique_ptr<KernelBuilder> &b, llvm::Value* beginTokenPos) {};
    99110    virtual void doAccelerationLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart,
    100                                            llvm::Value *literalLength) {this->doLiteralCopy(b, literalStart, literalLength);}
     111                                           llvm::Value *literalLength, llvm::Value* blockStart) {this->doLiteralCopy(b, literalStart, literalLength, blockStart);}
    101112    virtual void doAccelerationMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset,
    102113                                         llvm::Value *matchLength) {this->doMatchCopy(b, matchOffset, matchLength);}
  • icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_swizzled_aio.cpp

    r6111 r6132  
    5757
    5858    void LZ4SwizzledAioKernel::doAccelerationLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart,
    59                                            llvm::Value *literalLength) {
     59                                           llvm::Value *literalLength, llvm::Value* blockStart) {
    6060//        this->handleAccelerationLiteralCopy(b, literalStart, literalLength, inputValuesVector);
    6161
     
    512512
    513513    void LZ4SwizzledAioKernel::doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart,
    514                                              llvm::Value *literalLength) {
     514                                             llvm::Value *literalLength, llvm::Value* blockStart) {
    515515        Value* SIZE_64  = b->getSize(64);
    516516        Value* SIZE_0 = b->getSize(0);
  • icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_swizzled_aio.h

    r6111 r6132  
    5555
    5656        virtual void doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart,
    57                                    llvm::Value *literalLength);
     57                                   llvm::Value *literalLength, llvm::Value* blockStart) override;
    5858        virtual void doMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset,
    5959                                 llvm::Value *matchLength);
     
    6363        virtual void prepareAcceleration(const std::unique_ptr<KernelBuilder> &b, llvm::Value* beginTokenPos) override;
    6464        virtual void doAccelerationLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart,
    65                                                              llvm::Value *literalLength) override;
     65                                                             llvm::Value *literalLength, llvm::Value* blockStart) override;
    6666        virtual void doAccelerationMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset,
    6767                                                           llvm::Value *matchLength) override;
  • icGREP/icgrep-devel/icgrep/kernels/lzparabix/decoder/LZParabixAioKernel.cpp

    r6131 r6132  
    4343                                   {
    4444                                           Binding{b->getSizeTy(), "blockDataIndex"},
    45                                            Binding{b->getInt64Ty(), "outputPos"},
    46 
     45                                           Binding{b->getInt64Ty(), "outputPos"}
    4746
    4847                                   }), mNumsOfBitStreams(numsOfBitStreams) {
     
    6463
    6564
     65    void LZParabixAioKernel::initScalarOutputPtr(const std::unique_ptr<KernelBuilder> &b) {
     66//        b->CallPrintInt("------------------", b->getSize(0));
     67        for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) {
     68            Value* ptr = b->CreatePointerCast(b->getOutputStreamBlockPtr("outputStream" + std::to_string(i), b->getSize(0)), b->getInt64Ty()->getPointerTo());
     69            b->setScalarField("currentOutputPtr_" + std::to_string(i), ptr);
     70
     71            for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) {
     72                b->CreateStore(b->getInt64(0), b->CreateGEP(ptr, b->getInt32(j * 4)));
     73            }
     74        }
     75    }
     76
    6677    void LZParabixAioKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> &b) {
     78        this->initScalarOutputPtr(b);
     79
    6780        BasicBlock* exitBlock = b->CreateBasicBlock("exitBlock");
    6881        BasicBlock* blockEndConBlock = b->CreateBasicBlock("blockEndConBlock");
     
    168181
    169182        b->SetInsertPoint(exitBlock);
    170         this->storePendingOutput(b);
    171183        b->setProcessedItemCount("inputBitStream0", b->CreateAdd(literalStartPos, totalLiteralLength));
    172184    }
    173 
    174185
    175186    std::pair<llvm::Value *, llvm::Value *>
     
    339350    }
    340351
    341 
    342352    // ---- Output
    343353    void LZParabixAioKernel::initPendingOutputScalar(const std::unique_ptr<KernelBuilder> &b) {
    344         this->initPendingOutputScalar_BitStream(b);
    345 //        this->initPendingOutputScalar_Swizzled(b);
     354        for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) {
     355            for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) {
     356                this->addScalar(b->getInt64Ty(), "pendingOutput" + std::to_string(i) + "_" + std::to_string(j));
     357            }
     358        }
     359        for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) {
     360            this->addScalar(b->getInt64Ty()->getPointerTo(), "currentOutputPtr_" + std::to_string(i));
     361        }
    346362    }
    347363
    348364    void LZParabixAioKernel::appendBitStreamOutput(const std::unique_ptr<KernelBuilder> &b, std::vector<llvm::Value*>& extractedValues, llvm::Value* valueLength) {
    349         this->appendBitStreamOutput_BitStream(b, extractedValues, valueLength);
    350 //        this->appendBitStreamOutput_Swizzled(b, extractedValues, valueLength);
    351     }
    352 
    353     void LZParabixAioKernel::storePendingOutput(const std::unique_ptr<KernelBuilder> &b) {
    354         BasicBlock* storePendingOutputBlock = b->CreateBasicBlock("storePendingOutputBlock");
    355         BasicBlock* storePendingOutputExitBlock = b->CreateBasicBlock("storePendingOutputExitBlock");
    356 
    357         Value* oldOutputPos = b->getScalarField("outputPos");
    358         b->CreateCondBr(
    359                 b->CreateICmpNE(b->CreateURem(oldOutputPos, b->getSize(64)), b->getSize(0)),
    360                 storePendingOutputBlock,
    361                 storePendingOutputExitBlock
    362         );
    363 
    364         b->SetInsertPoint(storePendingOutputBlock);
    365         this->storePendingOutput_BitStream(b);
    366 //        this->storePendingOutput_Swizzled(b);
    367         b->CreateBr(storePendingOutputExitBlock);
    368 
    369         b->SetInsertPoint(storePendingOutputExitBlock);
    370     }
    371 
    372 
    373     // ---- Output BitStream
    374     void LZParabixAioKernel::initPendingOutputScalar_BitStream(const std::unique_ptr<KernelBuilder> &b) {
    375         for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) {
    376             for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) {
    377                 this->addScalar(b->getInt64Ty(), "pendingOutput" + std::to_string(i) + "_" + std::to_string(j));
    378             }
    379         }
    380     }
    381 
    382     void LZParabixAioKernel::appendBitStreamOutput_BitStream(const std::unique_ptr<KernelBuilder> &b, std::vector<llvm::Value*>& extractedValues, llvm::Value* valueLength) {
    383365        BasicBlock* exitBlock = b->CreateBasicBlock("exitBlock");
    384366
     
    389371
    390372        unsigned iStreamIndex = 0;
    391         for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) {
    392             for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) {
    393                 Value* newValue = b->CreateOr(b->getScalarField("pendingOutput" + std::to_string(i) + "_" + std::to_string(j)), b->CreateShl(extractedValues[iStreamIndex], oldOutputPosRem64));
    394                 newOutputVec.push_back(newValue);
     373
     374        for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) {
     375            Value* outputPtr = b->getScalarField("currentOutputPtr_" + std::to_string(i));
     376            for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) {
     377                Value* ptr = b->CreateGEP(outputPtr, b->getSize(j * 4));
     378                Value* newValue = b->CreateOr(b->CreateLoad(ptr), b->CreateShl(extractedValues[iStreamIndex], oldOutputPosRem64));
     379                b->CreateStore(newValue, ptr);
    395380                ++iStreamIndex;
    396381            }
    397382        }
    398383
    399         BasicBlock* noStoreOutputBlock = b->CreateBasicBlock("noStoreOutputBlock");
    400384        BasicBlock* storeOutputBlock =b->CreateBasicBlock("storeOutputBlock");
    401 
    402         b->CreateCondBr(b->CreateICmpULT(b->CreateAdd(oldOutputPosRem64, valueLength), b->getSize(64)), noStoreOutputBlock, storeOutputBlock);
    403 
    404         // ---- noStoreOutputBlock
    405         b->SetInsertPoint(noStoreOutputBlock);
    406 
    407         iStreamIndex = 0;
    408         for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) {
    409             for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) {
    410                 b->setScalarField("pendingOutput" + std::to_string(i) + "_" + std::to_string(j), newOutputVec[iStreamIndex]);
    411                 ++iStreamIndex;
    412             }
    413         }
    414 
    415         b->CreateBr(exitBlock);
     385        b->CreateCondBr(b->CreateICmpULT(b->CreateAdd(oldOutputPosRem64, valueLength), b->getSize(64)), exitBlock, storeOutputBlock);
    416386
    417387        // ---- storeOutputBlock
    418388        b->SetInsertPoint(storeOutputBlock);
    419389
    420         Value* oldOutputPosRem = b->CreateURem(oldOutputPos, b->getCapacity("outputStream0"));
    421         Value* oldOutputPosBitBlockIndex = b->CreateUDiv(oldOutputPosRem, b->getSize(b->getBitBlockWidth()));
    422         Value* oldOutputPosBitBlockRem = b->CreateURem(oldOutputPosRem, b->getSize(b->getBitBlockWidth()));
    423 
    424         iStreamIndex = 0;
    425         for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) {
    426             Value* outputBasePtr = b->CreatePointerCast(b->getRawOutputPointer("outputStream" + std::to_string(i), b->getSize(0)), b->getBitBlockType()->getPointerTo());
    427             Value* outputBitBlockBasePtr = b->CreateGEP(outputBasePtr, b->CreateMul(oldOutputPosBitBlockIndex, b->getSize(mNumsOfBitStreams[i])));
    428             outputBitBlockBasePtr = b->CreatePointerCast(outputBitBlockBasePtr, b->getInt64Ty()->getPointerTo());
    429 
    430             Value* oldOutputPosI64Index = b->CreateUDiv(oldOutputPosBitBlockRem, b->getSize(64));
    431 
    432             for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) {
    433                 Value* targetPtr = b->CreateGEP(outputBitBlockBasePtr, b->CreateAdd(oldOutputPosI64Index, b->getSize(j * (b->getBitBlockWidth() / 64))));
    434                 b->CreateStore(newOutputVec[iStreamIndex], targetPtr);
    435                 ++iStreamIndex;
    436             }
    437         }
    438 
    439390        Value* shiftAmount = b->CreateSub(b->getSize(0x40), oldOutputPosRem64);
    440391        Value* fullyShift = b->CreateICmpEQ(shiftAmount, b->getSize(0x40));
    441392
     393        Value* exceedBlock = b->CreateICmpUGE(b->CreateAdd(b->CreateURem(oldOutputPos, b->getSize(b->getBitBlockWidth())), valueLength), b->getSize(b->getBitBlockWidth()));
    442394        iStreamIndex = 0;
    443395        for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) {
    444             for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) {
    445                 b->setScalarField("pendingOutput" + std::to_string(i) + "_" + std::to_string(j), b->CreateSelect(fullyShift, b->getInt64(0), b->CreateLShr(extractedValues[iStreamIndex], shiftAmount)));
     396            Value* oldOutputPtr = b->getScalarField("currentOutputPtr_" + std::to_string(i));
     397            Value* distance = b->CreateSelect(exceedBlock, b->getSize(1 + (mNumsOfBitStreams[i] - 1) * b->getBitBlockWidth() / 64), b->getSize(1));
     398            Value* newOutputPtr = b->CreateGEP(oldOutputPtr, distance);
     399            b->setScalarField("currentOutputPtr_" + std::to_string(i), newOutputPtr);
     400            for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) {
     401                Value* newValue = b->CreateSelect(fullyShift, b->getInt64(0), b->CreateLShr(extractedValues[iStreamIndex], shiftAmount));
     402                Value* ptr = b->CreateGEP(newOutputPtr, b->getSize(j * 4));
     403                b->CreateStore(newValue, ptr);
    446404                ++iStreamIndex;
    447405            }
     
    452410        b->SetInsertPoint(exitBlock);
    453411        b->setScalarField("outputPos", b->CreateAdd(oldOutputPos, valueLength));
    454     }
    455 
    456     void LZParabixAioKernel::storePendingOutput_BitStream(const std::unique_ptr<KernelBuilder> &b) {
    457         Value* oldOutputPos = b->getScalarField("outputPos");
    458         Value* oldOutputPosRem = b->CreateURem(oldOutputPos, b->getCapacity("outputStream0"));
    459         Value* oldOutputPosBitBlockIndex = b->CreateUDiv(oldOutputPosRem, b->getSize(b->getBitBlockWidth()));
    460         Value* oldOutputPosBitBlockRem = b->CreateURem(oldOutputPosRem, b->getSize(b->getBitBlockWidth()));
    461         Value* oldOutputPosI64Index = b->CreateUDiv(oldOutputPosBitBlockRem, b->getSize(64));
    462 
    463         unsigned iStreamIndex = 0;
    464         for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) {
    465             Value* outputBasePtr = b->CreatePointerCast(b->getRawOutputPointer("outputStream" + std::to_string(i), b->getSize(0)), b->getBitBlockType()->getPointerTo());
    466             Value* outputBitBlockBasePtr = b->CreateGEP(outputBasePtr, b->CreateMul(oldOutputPosBitBlockIndex, b->getSize(mNumsOfBitStreams[i])));
    467             outputBitBlockBasePtr = b->CreatePointerCast(outputBitBlockBasePtr, b->getInt64Ty()->getPointerTo());
    468             for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) {
    469                 Value* targetPtr = b->CreateGEP(outputBitBlockBasePtr, b->CreateAdd(oldOutputPosI64Index, b->getSize(j * (b->getBitBlockWidth() / 64))));
    470                 b->CreateStore(b->getScalarField("pendingOutput" + std::to_string(i) + "_" + std::to_string(j)), targetPtr);
    471                 ++iStreamIndex;
    472             }
    473         }
    474     }
    475 
    476     // ---- Output Swizzled
    477     void LZParabixAioKernel::initPendingOutputScalar_Swizzled(const std::unique_ptr<KernelBuilder> &b) {
    478         for (unsigned i = 0; i < (mNumsOfBitStreams[0] + 3) / 4; i++) {
    479             this->addScalar(b->getBitBlockType(), "pendingOutput" + std::to_string(0) + "_" + std::to_string(i));
    480         }
    481     }
    482     void LZParabixAioKernel::appendBitStreamOutput_Swizzled(const std::unique_ptr<KernelBuilder> &b, std::vector<llvm::Value*>& extractedValues, llvm::Value* valueLength) {
    483 
    484         std::vector<llvm::Value*> extractedValuesVec;
    485         for (unsigned i = 0; i < 2; i++) {
    486             Value* vec = ConstantVector::getNullValue(b->getBitBlockType());
    487             for (unsigned j = 0; j < 4; j++) {
    488                 vec = b->CreateInsertElement(vec, extractedValues[i * 4 + j], j);
    489             }
    490             extractedValuesVec.push_back(vec);
    491         }
    492 
    493         BasicBlock* exitBlock = b->CreateBasicBlock("exitBlock");
    494 
    495         Value* oldOutputPos = b->getScalarField("outputPos");
    496         Value* oldOutputPosRem64 = b->CreateURem(oldOutputPos, b->getSize(64));
    497 
    498         std::vector<llvm::Value*> newOutputVec;
    499         for (unsigned i = 0; i < 2; i++) {
    500             Value* newValue = b->CreateOr(b->getScalarField("pendingOutput" + std::to_string(0) + "_" + std::to_string(i)), b->CreateShl(extractedValuesVec[i], b->simd_fill(64, oldOutputPosRem64)));
    501             newOutputVec.push_back(newValue);
    502         }
    503 
    504 
    505         BasicBlock* noStoreOutputBlock = b->CreateBasicBlock("noStoreOutputBlock");
    506         BasicBlock* storeOutputBlock =b->CreateBasicBlock("storeOutputBlock");
    507 
    508         b->CreateCondBr(b->CreateICmpULT(b->CreateAdd(oldOutputPosRem64, valueLength), b->getSize(64)), noStoreOutputBlock, storeOutputBlock);
    509 
    510         // ---- noStoreOutputBlock
    511         b->SetInsertPoint(noStoreOutputBlock);
    512         for (unsigned i = 0; i < 2; i++) {
    513             b->setScalarField("pendingOutput" + std::to_string(0) + "_" + std::to_string(i), newOutputVec[i]);
    514         }
    515         b->CreateBr(exitBlock);
    516 
    517         // ---- storeOutputBlock
    518         b->SetInsertPoint(storeOutputBlock);
    519 
    520         Value* oldOutputPosRem = b->CreateURem(oldOutputPos, b->getCapacity("outputStream0"));
    521         Value* oldOutputPosBitBlockIndex = b->CreateUDiv(oldOutputPosRem, b->getSize(b->getBitBlockWidth()));
    522         Value* oldOutputPosBitBlockRem = b->CreateURem(oldOutputPosRem, b->getSize(b->getBitBlockWidth()));
    523 
    524         Value* outputBasePtr = b->CreatePointerCast(b->getRawOutputPointer("outputStream0", b->getSize(0)), b->getBitBlockType()->getPointerTo());
    525         Value* outputBitBlockBasePtr = b->CreateGEP(outputBasePtr, b->CreateMul(oldOutputPosBitBlockIndex, b->getSize(8)));
    526         outputBitBlockBasePtr = b->CreatePointerCast(outputBitBlockBasePtr, b->getInt64Ty()->getPointerTo());
    527 
    528         Value* oldOutputPosI64Index = b->CreateUDiv(oldOutputPosBitBlockRem, b->getSize(64));
    529 
    530         for (unsigned i = 0; i < 2; i++) {
    531             for (unsigned j = 0; j < 4; j++) {
    532                 Value* targetPtr = b->CreateGEP(outputBitBlockBasePtr, b->CreateAdd(oldOutputPosI64Index, b->getSize((i * 4 + j) * 4)));
    533                 b->CreateStore(b->CreateExtractElement(newOutputVec[i], j), targetPtr);
    534             }
    535 
    536         }
    537 
    538         Value* shiftAmount = b->CreateSub(b->getSize(0x40), oldOutputPosRem64);
    539         Value* fullyShift = b->CreateICmpEQ(shiftAmount, b->getSize(0x40));
    540 
    541         for (unsigned i = 0; i < 2; i++) {
    542 
    543             b->setScalarField("pendingOutput" + std::to_string(0) + "_" + std::to_string(i), b->CreateSelect(fullyShift, ConstantVector::getNullValue(b->getBitBlockType()), b->CreateLShr(extractedValuesVec[i], b->simd_fill(64, shiftAmount))));
    544         }
    545 
    546         b->CreateBr(exitBlock);
    547 
    548         b->SetInsertPoint(exitBlock);
    549         b->setScalarField("outputPos", b->CreateAdd(oldOutputPos, valueLength));
    550 
    551     }
    552 
    553     void LZParabixAioKernel::storePendingOutput_Swizzled(const std::unique_ptr<KernelBuilder> &b) {
    554         Value* oldOutputPos = b->getScalarField("outputPos");
    555         Value* oldOutputPosRem = b->CreateURem(oldOutputPos, b->getCapacity("outputStream0"));
    556         Value* oldOutputPosBitBlockIndex = b->CreateUDiv(oldOutputPosRem, b->getSize(b->getBitBlockWidth()));
    557         Value* oldOutputPosBitBlockRem = b->CreateURem(oldOutputPosRem, b->getSize(b->getBitBlockWidth()));
    558 
    559         Value* oldOutputPosI64Index = b->CreateUDiv(oldOutputPosBitBlockRem, b->getSize(64));
    560 
    561         Value* outputBasePtr = b->CreatePointerCast(b->getRawOutputPointer("outputStream0", b->getSize(0)), b->getBitBlockType()->getPointerTo());
    562         Value* outputBitBlockBasePtr = b->CreateGEP(outputBasePtr, b->CreateMul(oldOutputPosBitBlockIndex, b->getSize(8)));
    563         outputBitBlockBasePtr = b->CreatePointerCast(outputBitBlockBasePtr, b->getInt64Ty()->getPointerTo());
    564 
    565         vector<Value*> pendingOutputVec;
    566         for (unsigned i = 0; i < 2; i++) {
    567             pendingOutputVec.push_back(b->getScalarField("pendingOutput" + std::to_string(0) + "_" + std::to_string(i)));
    568         }
    569 
    570         for (unsigned i = 0; i < 2; i++) {
    571             for (unsigned j = 0; j < 2; j++) {
    572                 Value* targetPtr = b->CreateGEP(outputBitBlockBasePtr, b->CreateAdd(oldOutputPosI64Index, b->getSize((i * 4 + j) * 4)));
    573                 b->CreateStore(b->CreateExtractElement(pendingOutputVec[i], j), targetPtr);
    574             }
    575         }
    576     }
     412
     413    }
     414
    577415}
  • icGREP/icgrep-devel/icgrep/kernels/lzparabix/decoder/LZParabixAioKernel.h

    r6123 r6132  
    4141        std::vector<unsigned> mNumsOfBitStreams;
    4242
    43 
    4443        // ---- Output
    4544        void initPendingOutputScalar(const std::unique_ptr<KernelBuilder> &b);
    4645        void appendBitStreamOutput(const std::unique_ptr<KernelBuilder> &b, std::vector<llvm::Value*>& extractedValues, llvm::Value* valueLength);
    47         void storePendingOutput(const std::unique_ptr<KernelBuilder> &b);
    4846
    49 
    50         void initPendingOutputScalar_BitStream(const std::unique_ptr<KernelBuilder> &b);
    51         void appendBitStreamOutput_BitStream(const std::unique_ptr<KernelBuilder> &b, std::vector<llvm::Value*>& extractedValues, llvm::Value* valueLength);
    52         void storePendingOutput_BitStream(const std::unique_ptr<KernelBuilder> &b);
    53 
    54 
    55         void initPendingOutputScalar_Swizzled(const std::unique_ptr<KernelBuilder> &b);
    56         void appendBitStreamOutput_Swizzled(const std::unique_ptr<KernelBuilder> &b, std::vector<llvm::Value*>& extractedValues, llvm::Value* valueLength);
    57         void storePendingOutput_Swizzled(const std::unique_ptr<KernelBuilder> &b);
     47        void initScalarOutputPtr(const std::unique_ptr<KernelBuilder> &b);
    5848    };
    5949
  • icGREP/icgrep-devel/icgrep/kernels/p2s_kernel.cpp

    r6093 r6132  
    4545    }
    4646}
     47
     48
     49    P2S4StreamByPDEP::P2S4StreamByPDEP(const std::unique_ptr<kernel::KernelBuilder> & b)
     50            : BlockOrientedKernel("P2S4StreamByPDEP",
     51                                  {Binding{b->getStreamSetTy(4, 1), "basisBits"}},
     52                                  {Binding{b->getStreamSetTy(1, 4), "byteStream"}},
     53                                  {}, {}, {})
     54    {
     55    }
     56
     57
     58    void P2S4StreamByPDEP::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
     59        Function * PDEPFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pdep_64);
     60        uint64_t pdepBaseMask = 0x1111111111111111;
     61
     62        Value* inputBlocks[4];
     63
     64        for (unsigned i = 0; i < 4; i++) {
     65            inputBlocks[i] = b->loadInputStreamBlock("basisBits", b->getInt32(i));
     66        }
     67        Value* outputBasePtr = b->CreatePointerCast(b->getOutputStreamBlockPtr("byteStream", b->getSize(0)), b->getInt64Ty()->getPointerTo());
     68
     69        for (unsigned i = 0; i < b->getBitBlockWidth() / 64; i++) {
     70            Value* currentInput[4];
     71            for (unsigned iIndex = 0; iIndex < 4; iIndex++) {
     72                currentInput[iIndex] = b->CreateExtractElement(inputBlocks[iIndex], i);
     73            }
     74
     75            for (unsigned j = 0; j < 4; j++) {
     76                unsigned outputIndex = i * 4 + j;
     77                Value* retI64 = b->getInt64(0);
     78                for (unsigned k = 0; k < 4; k++) {
     79                    Value* newBits = b->CreateCall(
     80                            PDEPFunc,{
     81                                    b->CreateLShr(currentInput[k], b->getInt64(j * 16)),
     82                                    b->getInt64(pdepBaseMask << k)
     83                            }
     84                    );
     85                    retI64 = b->CreateOr(retI64, newBits);
     86                }
     87                b->CreateStore(retI64, b->CreateGEP(outputBasePtr, b->getInt32(outputIndex)));
     88            }
     89        }
     90
     91//        for (unsigned i = 0; i < 4; i++) {
     92//            b->CallPrintRegister("input" + std::to_string(i), inputBlocks[i]);
     93//        }
     94//
     95//        Value* outputBaseBlockPtr = b->CreatePointerCast(b->getOutputStreamBlockPtr("byteStream", b->getSize(0)), b->getBitBlockType()->getPointerTo());
     96//        for (unsigned i = 0; i < 4; i++) {
     97//            b->CallPrintRegister("output" + std::to_string(i), b->CreateLoad(b->CreateGEP(outputBaseBlockPtr, b->getInt32(i))));
     98//        }
     99
     100    }
     101
    47102               
    48103void P2SKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    49104    Value * p_bitblock[8];
    50105    for (unsigned i = 0; i < 8; i++) {
    51         p_bitblock[i] = b->loadInputStreamBlock("basisBits", b->getInt32(i));
     106        if (i < mNumOfStreams) {
     107            p_bitblock[i] = b->loadInputStreamBlock("basisBits", b->getInt32(i));
     108        } else {
     109            p_bitblock[i] = ConstantVector::getNullValue(b->getBitBlockType());
     110        }
     111
    52112    }
    53113    Value * s_bytepack[8];
     
    171231}
    172232
    173 P2SKernel::P2SKernel(const std::unique_ptr<kernel::KernelBuilder> & b, cc::BitNumbering numbering)
    174     : BlockOrientedKernel("p2s" + cc::numberingSuffix(numbering),
    175               {Binding{b->getStreamSetTy(8, 1), "basisBits"}},
     233
     234
     235
     236P2SKernel::P2SKernel(const std::unique_ptr<kernel::KernelBuilder> & b, cc::BitNumbering numbering, std::string prefix, unsigned numOfStreams)
     237    : BlockOrientedKernel(prefix + "p2s" + cc::numberingSuffix(numbering),
     238              {Binding{b->getStreamSetTy(numOfStreams, 1), "basisBits"}},
    176239              {Binding{b->getStreamSetTy(1, 8), "byteStream"}},
    177240              {}, {}, {}),
    178     mBasisSetNumbering(numbering) {
     241    mBasisSetNumbering(numbering),
     242      mNumOfStreams(numOfStreams) {
    179243}
    180244
  • icGREP/icgrep-devel/icgrep/kernels/p2s_kernel.h

    r6089 r6132  
    1313namespace kernel {
    1414
     15    class P2S4StreamByPDEP final : public BlockOrientedKernel{
     16    public:
     17        P2S4StreamByPDEP(const std::unique_ptr<kernel::KernelBuilder> & b);
     18    private:
     19        void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & b) override;
     20    };
     21
    1522class P2SKernel final : public BlockOrientedKernel {
    1623public:
    17     P2SKernel(const std::unique_ptr<kernel::KernelBuilder> & b, cc::BitNumbering basisNumbering = cc::BitNumbering::LittleEndian);
     24    P2SKernel(const std::unique_ptr<kernel::KernelBuilder> & b, cc::BitNumbering basisNumbering = cc::BitNumbering::LittleEndian, std::string prefix = "", unsigned numOfStreams = 8);
    1825    bool isCachable() const override { return true; }
    1926    bool hasSignature() const override { return false; }
     
    2128    cc::BitNumbering mBasisSetNumbering;
    2229    void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & b) override;
     30    unsigned mNumOfStreams;
    2331};
    2432
  • icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp

    r6112 r6132  
    2424        Value * x0 = iBuilder->esimd_mergel(128, s0, s1);
    2525        Value * x1 = iBuilder->esimd_mergeh(128, s0, s1);
    26         t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1);
     26
     27        t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1); // TODO 4个bit streams时这里的16改为8? (with 4 bit streams, should the 16 here be changed to 8?)
    2728        t1 = iBuilder->hsimd_packl_in_lanes(PACK_LANES, 16, x0, x1);
     29
    2830    } else {
    2931        t0 = iBuilder->hsimd_packh(16, s0, s1);
    3032        t1 = iBuilder->hsimd_packl(16, s0, s1);
    3133    }
     34    if (shift == 1) {
     35//        iBuilder->CallPrintRegister("t0", t0);
     36//        iBuilder->CallPrintRegister("t1", t1);
     37    }
     38
    3239    p0 = iBuilder->simd_if(1, hi_mask, t0, iBuilder->simd_srli(16, t1, shift));
    3340    p1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, t0, shift), t1);
     
    3542
    3643void s2p(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[], cc::BitNumbering basisNumbering) {
     44    {
     45        //input[0 - 3]
     46        Value* bit3311[2];
     47        Value* bit2200[2];
     48        for (unsigned i = 0; i < 2; i++) {
     49            s2p_step(iBuilder, input[2 * i], input[2 * i + 1], iBuilder->simd_himask(2), 1, bit3311[i], bit2200[i]);
     50        }
     51
     52        Value* out[4];
     53        s2p_step(iBuilder, bit3311[0], bit3311[1],
     54                 iBuilder->simd_himask(4), 2, out[3], out[1]);
     55
     56        s2p_step(iBuilder, bit2200[0], bit2200[1],
     57                 iBuilder->simd_himask(4), 2, out[2], out[0]);
     58        for (unsigned i = 0; i < 4; i++) {
     59//            iBuilder->CallPrintRegister("input" + std::to_string(i), input[i]);
     60        }
     61        for (unsigned i = 0; i < 4; i++) {
     62//            iBuilder->CallPrintRegister("out" + std::to_string(i), out[i]);
     63        }
     64    }
     65
     66
    3767    // Little-endian bit number is used for variables.
    3868    Value * bit66442200[4];
    3969    Value * bit77553311[4];
     70//    iBuilder->CallPrintRegister("himask2", iBuilder->simd_himask(2));
     71//    iBuilder->CallPrintRegister("himask4", iBuilder->simd_himask(4));
     72//    iBuilder->CallPrintRegister("himask8", iBuilder->simd_himask(8));
    4073
    4174    for (unsigned i = 0; i < 4; i++) {
    4275        Value * s0 = input[2 * i];
    4376        Value * s1 = input[2 * i + 1];
     77//        iBuilder->CallPrintRegister("s0_" + std::to_string(2 * i), s0);
     78//        iBuilder->CallPrintRegister("s1_" + std::to_string(2 * i + 1), s1);
    4479        s2p_step(iBuilder, s0, s1, iBuilder->simd_himask(2), 1, bit77553311[i], bit66442200[i]);
     80//        iBuilder->CallPrintRegister("bit77553311", bit77553311[i]);
     81//        iBuilder->CallPrintRegister("bit66442200", bit66442200[i]);
    4582    }
    4683    Value * bit44440000[2];
     
    65102        s2p_step(iBuilder, bit66662222[0], bit66662222[1], iBuilder->simd_himask(8), 4, output[1], output[5]);
    66103        s2p_step(iBuilder, bit77773333[0], bit77773333[1], iBuilder->simd_himask(8), 4, output[0], output[4]);
     104    }
     105
     106    for (unsigned i = 0; i < 8; i++) {
     107//        iBuilder->CallPrintRegister("input" + std::to_string(i), input[i]);
     108    }
     109    for (unsigned i = 0; i < 8; i++) {
     110//        iBuilder->CallPrintRegister("output" + std::to_string(i), output[i]);
    67111    }
    68112}
     
    110154}
    111155#endif
    112    
     156
     157
     158    S2P4StreamByPEXTKernel::S2P4StreamByPEXTKernel(const std::unique_ptr<kernel::KernelBuilder> & b)
     159            :BlockOrientedKernel("s2p4StreamByPEXT",
     160                                 {
     161                                         Binding{b->getStreamSetTy(1, 4), "byteStream", FixedRate(), Principal()}
     162                                 },
     163                                 {
     164                                         Binding{b->getStreamSetTy(4, 1), "basisBits"}
     165                                 }, {}, {}, {}) {
     166
     167    }
     168
     169    void S2P4StreamByPEXTKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
     170        Function* PEXT_func = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
     171        uint64_t pextBaseMask = 0x1111111111111111;
     172
     173        Value* inputBasePtr = b->CreatePointerCast(b->getInputStreamBlockPtr("byteStream", b->getSize(0)), b->getInt64Ty()->getPointerTo());
     174
     175        Value* outputBlocks[4];
     176        for (unsigned i = 0; i < 4; i++) {
     177            outputBlocks[i] = ConstantVector::getNullValue(b->getBitBlockType());
     178        }
     179
     180        for (unsigned i = 0; i < b->getBitBlockWidth() / 64; i++) {
     181            Value* currentOutput[4];
     182            for (unsigned iIndex = 0; iIndex < 4; iIndex++) {
     183                currentOutput[iIndex] = b->getInt64(0);
     184            }
     185
     186            for (unsigned j = 0; j < 4; j++) {
     187                unsigned inputIndex = i * 4 + j;
     188
     189                Value* currentInput = b->CreateLoad(b->CreateGEP(inputBasePtr, b->getInt32(inputIndex)));
     190                for (unsigned k = 0; k < 4; k++) {
     191
     192                    Value* newBits = b->CreateCall(
     193                            PEXT_func,{
     194                                    currentInput,
     195                                    b->getInt64(pextBaseMask << k)
     196                            }
     197                    );
     198
     199                    currentOutput[k] = b->CreateOr(currentOutput[k], b->CreateShl(newBits, 16 * j));
     200                }
     201            }
     202
     203            for (unsigned iIndex = 0; iIndex < 4; iIndex++) {
     204                outputBlocks[iIndex] = b->CreateInsertElement(outputBlocks[iIndex], currentOutput[iIndex], i);
     205            }
     206        }
     207
     208        for (unsigned i = 0; i < 4; i++) {
     209            b->storeOutputStreamBlock("basisBits", b->getInt32(i), outputBlocks[i]);
     210//            b->CallPrintRegister("outputBlocks" + std::to_string(i), outputBlocks[i]);
     211        }
     212    }
     213
    113214void S2PKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfBlocks) {
    114215    BasicBlock * entry = kb->GetInsertBlock();
     
    135236    Value * basisbits[8];
    136237    s2p(kb, bytepack, basisbits, mBasisSetNumbering);
    137     for (unsigned i = 0; i < 8; ++i) {
     238    for (unsigned i = 0; i < mNumOfStreams; ++i) {
    138239        kb->storeOutputStreamBlock("basisBits", kb->getInt32(i), blockOffsetPhi, basisbits[i]);
    139240    }
     
    145246}
    146247
    147 S2PKernel::S2PKernel(const std::unique_ptr<KernelBuilder> & b, cc::BitNumbering numbering, bool aligned, std::string prefix)
     248S2PKernel::S2PKernel(const std::unique_ptr<KernelBuilder> & b, cc::BitNumbering numbering, bool aligned, std::string prefix, unsigned numOfStreams)
    148249    : MultiBlockKernel(aligned ? prefix + "s2p" + cc::numberingSuffix(numbering): prefix + "s2p_unaligned" + cc::numberingSuffix(numbering),
    149250    {Binding{b->getStreamSetTy(1, 8), "byteStream", FixedRate(), Principal()}},
    150     {Binding{b->getStreamSetTy(8, 1), "basisBits"}}, {}, {}, {}),
     251    {Binding{b->getStreamSetTy(numOfStreams, 1), "basisBits"}}, {}, {}, {}),
    151252  mBasisSetNumbering(numbering),
    152   mAligned(aligned) {
     253  mAligned(aligned),
     254  mNumOfStreams(numOfStreams)
     255{
    153256    if (!aligned) {
    154257        mStreamSetInputs[0].addAttribute(Misaligned());
  • icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.h

    r6112 r6132  
    1616namespace kernel {
    1717
     18    class S2P4StreamByPEXTKernel final : public BlockOrientedKernel{
     19    public:
     20        S2P4StreamByPEXTKernel(const std::unique_ptr<kernel::KernelBuilder> & b);
     21    protected:
     22        void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) override;
     23    };
     24
    1825class S2PKernel final : public MultiBlockKernel {
    1926public:
    20     S2PKernel(const std::unique_ptr<kernel::KernelBuilder> & b, cc::BitNumbering basisNumbering = cc::BitNumbering::LittleEndian, bool aligned = true, std::string prefix = "");
     27    S2PKernel(const std::unique_ptr<kernel::KernelBuilder> & b, cc::BitNumbering basisNumbering = cc::BitNumbering::LittleEndian, bool aligned = true, std::string prefix = "", unsigned numOfStreams = 8);
    2128    bool isCachable() const override { return true; }
    2229    bool hasSignature() const override { return false; }
     
    2633    cc::BitNumbering mBasisSetNumbering;
    2734    bool mAligned;
     35    unsigned mNumOfStreams;
    2836};
    2937
  • icGREP/icgrep-devel/icgrep/lz4/LZ4Generator.cpp

    r6119 r6132  
    2727#include <kernels/lz4/aio/lz4_swizzled_aio.h>
    2828#include <kernels/lz4/aio/lz4_bitstream_aio.h>
     29#include <kernels/lz4/aio/lz4_i4_bytestream_aio.h>
    2930#include <kernels/bitstream_pdep_kernel.h>
    3031#include <kernels/lz4/lz4_bitstream_not_kernel.h>
     
    426427
    427428
     429
     430    if (compressedBitStreams[0]->getNumOfStreams() == 4) {
     431        StreamSetBuffer* twistedCharClasses = mGrepDriver->addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 4), this->getInputBufferBlocks(iBuilder));
     432        kernel::Kernel* twistK = mGrepDriver->addKernelInstance<kernel::P2S4StreamByPDEP>(iBuilder);
     433        mGrepDriver->makeKernelCall(twistK, {compressedBitStreams[0]}, {twistedCharClasses});
     434
     435
     436        StreamSetBuffer* uncompressedTwistedCharClasses = mGrepDriver->addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 4), this->getInputBufferBlocks(iBuilder));
     437        Kernel* lz4I4AioK = mPxDriver.addKernelInstance<LZ4I4ByteStreamAioKernel>(iBuilder);
     438        lz4I4AioK->setInitialArguments({mFileSize});
     439        mGrepDriver->makeKernelCall(lz4I4AioK, {
     440                mCompressedByteStream,
     441
     442                // Block Data
     443                BlockData_IsCompressed,
     444                BlockData_BlockStart,
     445                BlockData_BlockEnd,
     446
     447                twistedCharClasses
     448        }, {
     449                                            uncompressedTwistedCharClasses
     450                                    });
     451
     452        StreamSetBuffer* untwistedCharClasses = mGrepDriver->addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(iBuilder));
     453        kernel::Kernel* untwistK = mGrepDriver->addKernelInstance<kernel::S2P4StreamByPEXTKernel>(iBuilder);
     454        mGrepDriver->makeKernelCall(untwistK, {uncompressedTwistedCharClasses}, {untwistedCharClasses});
     455        return {untwistedCharClasses};
     456    }
     457
     458
     459
     460
    428461    std::vector<StreamSetBuffer *> inputStreams = {
    429462            mCompressedByteStream,
     
    554587
    555588StreamSetBuffer * LZ4Generator::generateAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
    556     //// Decode Block Information
    557     StreamSetBuffer * const BlockData_IsCompressed = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getInputBufferBlocks(iBuilder), 1);
    558     StreamSetBuffer * const BlockData_BlockStart = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks(iBuilder), 1);
    559     StreamSetBuffer * const BlockData_BlockEnd = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks(iBuilder), 1);
    560 
    561 
    562     //// Generate Helper Markers Extenders
    563 //    StreamSetBuffer * const Extenders = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder), 1);
    564 //    mMatchOffsetMarker = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder));
    565 //    Kernel * extenderK = mPxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "extenders", std::vector<re::CC *>{re::makeCC(0xFF)}, 8);
    566 //    mPxDriver.makeKernelCall(extenderK, {mCompressedBasisBits}, {Extenders});
    567 
    568 
    569     Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder);
    570     blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(mHasBlockChecksum, iBuilder->getInt1Ty()), mHeaderSize, mFileSize});
    571     mPxDriver.makeKernelCall(blockDecoderK, {mCompressedByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd});
    572 
     589    LZ4BlockInfo blockInfo = this->getBlockInfo(iBuilder);
    573590
    574591    StreamSetBuffer * const decompressionByteStream = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks(iBuilder), 1);
    575 
    576592    Kernel* lz4AioK = mPxDriver.addKernelInstance<LZ4ByteStreamAioKernel>(iBuilder);
    577593    lz4AioK->setInitialArguments({mFileSize});
     
    580596            {
    581597                    mCompressedByteStream,
    582 //                    Extenders,
    583598
    584599                    // Block Data
    585                     BlockData_IsCompressed,
    586                     BlockData_BlockStart,
    587                     BlockData_BlockEnd
     600                    blockInfo.isCompress,
     601                    blockInfo.blockStart,
     602                    blockInfo.blockEnd
    588603            }, {
    589604                    decompressionByteStream
     
    694709}
    695710
    696 
     711LZ4BlockInfo LZ4Generator::getBlockInfo(const std::unique_ptr<kernel::KernelBuilder> & b) {
     712    LZ4BlockInfo blockInfo;
     713    blockInfo.isCompress = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(1, 8), this->getInputBufferBlocks(b), 1);
     714    blockInfo.blockStart = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(1, 64), this->getInputBufferBlocks(b), 1);
     715    blockInfo.blockEnd = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(1, 64), this->getInputBufferBlocks(b), 1);
     716
     717    Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderKernel>(b);
     718    blockDecoderK->setInitialArguments({b->CreateTrunc(mHasBlockChecksum, b->getInt1Ty()), mHeaderSize, mFileSize});
     719    mPxDriver.makeKernelCall(blockDecoderK, {mCompressedByteStream}, {blockInfo.isCompress, blockInfo.blockStart, blockInfo.blockEnd});
     720
     721    return blockInfo;
     722}
    697723
    698724
  • icGREP/icgrep-devel/icgrep/lz4/LZ4Generator.h

    r6119 r6132  
    1818
    1919typedef void (*MainFunctionType)(char * byte_data, size_t headerSize, size_t filesize, bool hasBlockChecksum);
     20
     21struct LZ4BlockInfo {
     22    parabix::StreamSetBuffer* blockStart;
     23    parabix::StreamSetBuffer* blockEnd;
     24    parabix::StreamSetBuffer* isCompress;
     25};
    2026
    2127class LZ4Generator {
     
    8389
    8490    unsigned mLz4BlockSize;
     91
     92    LZ4BlockInfo getBlockInfo(const std::unique_ptr<kernel::KernelBuilder> & b);
    8593};
    8694
  • icGREP/icgrep-devel/icgrep/lz4/LZ4GrepGenerator.cpp

    r6124 r6132  
    2222#include <kernels/lz4/lz4_bitstream_match_copy_kernel.h>
    2323#include <kernels/lz4/lz4_bitstream_not_kernel.h>
     24#include <kernels/lz4/aio/lz4_i4_bytestream_aio.h>
    2425#include <kernels/fake_stream_generating_kernel.h>
    2526#include <kernels/bitstream_pdep_kernel.h>
     
    2930#include <re/collect_ccs.h>
    3031#include <re/replaceCC.h>
     32
     33#include <re/casing.h>
     34#include <re/exclude_CC.h>
     35#include <re/to_utf8.h>
     36#include <re/re_analysis.h>
     37#include <re/re_name_resolve.h>
     38#include <re/re_name_gather.h>
     39#include <re/re_multiplex.h>
     40#include <re/re_utility.h>
    3141
    3242#include <UCD/resolve_properties.h>
     
    5464#include <kernels/lz4/aio/lz4_swizzled_aio.h>
    5565#include <kernels/lz4/aio/lz4_bitstream_aio.h>
    56 
     66#include <re/re_seq.h>
     67#include <kernels/lz4/aio/lz4_bytestream_aio.h>
    5768
    5869namespace re { class CC; }
     
    108119}
    109120
    110 
     121parabix::StreamSetBuffer * LZ4GrepGenerator::convertCompressedBitsStreamWithByteStreamAioApproach(
     122        parabix::StreamSetBuffer *compressedBitStream, int numberOfStream, std::string prefix) {
     123    auto mGrepDriver = &mPxDriver;
     124    auto & b = mGrepDriver->getBuilder();
     125
     126    LZ4BlockInfo blockInfo = this->getBlockInfo(b);
     127
     128    StreamSetBuffer * const mtxByteStream = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks(b));
     129    Kernel * p2sK = mPxDriver.addKernelInstance<P2SKernel>(b, cc::BitNumbering::BigEndian, prefix, numberOfStream);
     130    mPxDriver.makeKernelCall(p2sK, {compressedBitStream}, {mtxByteStream});
     131
     132    StreamSetBuffer * const decompressionMtxByteStream = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks(b), 1);
     133    Kernel* lz4AioK = mPxDriver.addKernelInstance<LZ4ByteStreamAioKernel>(b, true);
     134    lz4AioK->setInitialArguments({mFileSize});
     135    mPxDriver.makeKernelCall(
     136            lz4AioK,
     137            {
     138                    mCompressedByteStream,
     139                    // Block Data
     140                    blockInfo.isCompress,
     141                    blockInfo.blockStart,
     142                    blockInfo.blockEnd,
     143                    mtxByteStream
     144            }, {
     145                    decompressionMtxByteStream
     146            });
     147
     148    StreamSetBuffer * const decompressedMtxBitStream = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(8), this->getDecompressedBufferBlocks(b));
     149
     150    Kernel * s2pk = mPxDriver.addKernelInstance<S2PKernel>(b, cc::BitNumbering::BigEndian, true, prefix, numberOfStream);
     151    mPxDriver.makeKernelCall(s2pk, {decompressionMtxByteStream}, {decompressedMtxBitStream});
     152
     153    return decompressedMtxBitStream;
     154}
    111155
    112156StreamSetBuffer * LZ4GrepGenerator::convertCompressedBitsStreamWithSwizzledAioApproach(
    113157        parabix::StreamSetBuffer *compressedBitStream, int numberOfStream, std::string prefix) {
    114158    auto mGrepDriver = &mPxDriver;
    115     auto & iBuilder = mGrepDriver->getBuilder();
    116 
    117     //// Decode Block Information
    118     StreamSetBuffer * const BlockData_IsCompressed = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getInputBufferBlocks(iBuilder), 1);
    119     StreamSetBuffer * const BlockData_BlockStart = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks(iBuilder), 1);
    120     StreamSetBuffer * const BlockData_BlockEnd = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks(iBuilder), 1);
    121 
    122     //// Generate Helper Markers Extenders, FX, XF
    123 //    StreamSetBuffer * const Extenders = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder), 1);
    124 //    mMatchOffsetMarker = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder));
    125 //    Kernel * extenderK = mPxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "extenders", std::vector<re::CC *>{re::makeCC(0xFF)}, 8);
    126 //    mPxDriver.makeKernelCall(extenderK, {mCompressedBasisBits}, {Extenders});
    127 
    128 
    129     Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder);
    130     blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(mHasBlockChecksum, iBuilder->getInt1Ty()), mHeaderSize, mFileSize});
    131     mPxDriver.makeKernelCall(blockDecoderK, {mCompressedByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd});
    132 
     159    auto & b = mGrepDriver->getBuilder();
     160
     161    LZ4BlockInfo blockInfo = this->getBlockInfo(b);
    133162
    134163    // Produce unswizzled bit streams
    135     StreamSetBuffer * u16Swizzle0 = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(iBuilder), 1);
    136     Kernel * unSwizzleK = mPxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 4, 1, 1, 64, "source");
     164    StreamSetBuffer * u16Swizzle0 = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(4), this->getInputBufferBlocks(b), 1);
     165    Kernel * unSwizzleK = mPxDriver.addKernelInstance<SwizzleGenerator>(b, 4, 1, 1, 64, "source");
    137166    mPxDriver.makeKernelCall(unSwizzleK, {compressedBitStream}, {u16Swizzle0});
    138167
    139     StreamSetBuffer * decompressedSwizzled0 = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(iBuilder), 1);
    140 
    141 
    142     Kernel* lz4AioK = mPxDriver.addKernelInstance<LZ4SwizzledAioKernel>(iBuilder, 4, 1, 4);
     168    StreamSetBuffer * decompressedSwizzled0 = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(4), this->getInputBufferBlocks(b), 1);
     169
     170
     171    Kernel* lz4AioK = mPxDriver.addKernelInstance<LZ4SwizzledAioKernel>(b, 4, 1, 4);
    143172    lz4AioK->setInitialArguments({mFileSize});
    144173    mPxDriver.makeKernelCall(
     
    149178
    150179                    // Block Data
    151                     BlockData_IsCompressed,
    152                     BlockData_BlockStart,
    153                     BlockData_BlockEnd,
     180                    blockInfo.isCompress,
     181                    blockInfo.blockStart,
     182                    blockInfo.blockEnd,
    154183
    155184                    u16Swizzle0,
     
    160189
    161190
    162     StreamSetBuffer * const decompressionBitStream = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), this->getDecompressedBufferBlocks(iBuilder));
    163     Kernel * unSwizzleK2 = mPxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 4, 1, 1, 64, "dst");
     191    StreamSetBuffer * const decompressionBitStream = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(8, 1), this->getDecompressedBufferBlocks(b));
     192    Kernel * unSwizzleK2 = mPxDriver.addKernelInstance<SwizzleGenerator>(b, 4, 1, 1, 64, "dst");
    164193    mPxDriver.makeKernelCall(unSwizzleK2, {decompressedSwizzled0}, {decompressionBitStream});
    165194
     
    317346
    318347};
    319 std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *> LZ4GrepGenerator::multiplexingGrepPipeline(std::vector<re::RE *> &REs, bool useAio, bool useSwizzled) {
     348std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *> LZ4GrepGenerator::multiplexingGrepPipeline(std::vector<re::RE *> &REs, bool useAio, bool useSwizzled, bool useByteStream) {
    320349
    321350    this->initREs(REs);
     
    338367    std::set<re::Name *> UnicodeProperties;
    339368
    340     const auto UnicodeSets = re::collectCCs(mREs[0], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
     369    re::CC* linefeedCC = re::makeCC(0x0A);
     370
     371    re::Seq* seq = re::makeSeq();
     372    seq->push_back(mREs[0]);
     373    seq->push_back(std::move(linefeedCC));
     374
     375
     376    const auto UnicodeSets = re::collectCCs(seq, &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
    341377    StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    342378
     
    350386    mGrepDriver->makeKernelCall(ccK, {mCompressedBasisBits}, {CharClasses});
    351387
    352     StreamSetBuffer * CompressedLineFeedStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    353     kernel::Kernel * linefeedK = mPxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()}, cc::BitNumbering::BigEndian);
    354     mPxDriver.makeKernelCall(linefeedK, {mCompressedBasisBits}, {CompressedLineFeedStream});
    355 
    356 
    357     StreamSetBuffer * LineBreakStream = nullptr;
    358388    StreamSetBuffer * decompressedCharClasses = nullptr;
    359389    if (useSwizzled) {
    360         StreamSetBuffer * combinedStream = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses + 1), baseBufferSize);
    361         kernel::Kernel* streamCombineKernel = mPxDriver.addKernelInstance<StreamsCombineKernel>(idb, std::vector<unsigned>({1, (unsigned)numOfCharacterClasses}));
    362         mPxDriver.makeKernelCall(streamCombineKernel, {CompressedLineFeedStream, CharClasses}, {combinedStream});
    363         StreamSetBuffer * decompressedCombinedStream = nullptr;
    364 
    365390        if (useAio) {
    366             decompressedCombinedStream = this->convertCompressedBitsStreamWithSwizzledAioApproach(combinedStream, 1 + numOfCharacterClasses, "combined");
     391            decompressedCharClasses = this->convertCompressedBitsStreamWithSwizzledAioApproach(CharClasses, numOfCharacterClasses, "combined");
    367392        } else {
    368             decompressedCombinedStream = this->convertCompressedBitsStream(combinedStream, 1 + numOfCharacterClasses, "combined");
     393            decompressedCharClasses = this->convertCompressedBitsStream(CharClasses, numOfCharacterClasses, "combined");
    369394        }
    370 
    371         LineBreakStream = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1), baseBufferSize);
    372         decompressedCharClasses = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
    373         kernel::Kernel* streamSplitKernel = mPxDriver.addKernelInstance<StreamsSplitKernel>(idb, std::vector<unsigned>({1, (unsigned)numOfCharacterClasses}));
    374         mPxDriver.makeKernelCall(streamSplitKernel, {decompressedCombinedStream}, {LineBreakStream, decompressedCharClasses});
     395    } else if (useByteStream){
     396        decompressedCharClasses = this->convertCompressedBitsStreamWithByteStreamAioApproach(CharClasses, numOfCharacterClasses, "combined");
    375397    } else {
    376         auto ret = this->convertCompressedBitsStreamWithBitStreamAioApproach({CharClasses, CompressedLineFeedStream}, "combined");
     398        auto ret = this->convertCompressedBitsStreamWithBitStreamAioApproach({CharClasses}, "combined");
    377399        decompressedCharClasses = ret[0];
    378         LineBreakStream = ret[1];
    379400    }
    380401
     
    383404    mPxDriver.makeKernelCall(fakeStreamGeneratorK, {decompressedCharClasses}, {fakeMatchCopiedBits});
    384405
    385     kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[0], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()}, cc::BitNumbering::BigEndian);
     406    StreamSetBuffer * LineBreakStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), this->getInputBufferBlocks(idb));
     407    kernel::Kernel * lineFeedGrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, transformCCs(mpx.get(), linefeedCC), externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()}, cc::BitNumbering::BigEndian, true);
     408    mGrepDriver->makeKernelCall(lineFeedGrepK, {fakeMatchCopiedBits, decompressedCharClasses}, {LineBreakStream});
     409
     410
     411    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[0], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()}, cc::BitNumbering::BigEndian, true);
    386412    mGrepDriver->makeKernelCall(icgrepK, {fakeMatchCopiedBits, decompressedCharClasses}, {MatchResults});
    387413    MatchResultsBufs[0] = MatchResults;
     
    437463        std::vector<StreamSetBuffer *> icgrepInputSets = {decompressedBasisBits};
    438464
    439         std::set<re::Name *> UnicodeProperties;
    440 
    441         StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    442         kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>(), cc::BitNumbering::BigEndian);
    443         mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
    444         MatchResultsBufs[i] = MatchResults;
     465        if (mEnableMultiplexing) {
     466            const auto UnicodeSets = re::collectCCs(mREs[i], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
     467            StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     468
     469            mpx = make_unique<cc::MultiplexedAlphabet>("mpx", UnicodeSets);
     470            mREs[i] = transformCCs(mpx.get(), mREs[i]);
     471            std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
     472            auto numOfCharacterClasses = mpx_basis.size();
     473            StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
     474            kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), false, cc::BitNumbering::BigEndian);
     475            mGrepDriver->makeKernelCall(ccK, {decompressedBasisBits}, {CharClasses});
     476
     477            kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()}, cc::BitNumbering::BigEndian, true);
     478            icgrepInputSets.push_back(CharClasses);
     479            mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
     480            MatchResultsBufs[i] = MatchResults;
     481        } else {
     482            std::set<re::Name *> UnicodeProperties;
     483
     484            StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     485            kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>(), cc::BitNumbering::BigEndian);
     486            mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
     487            MatchResultsBufs[i] = MatchResults;
     488        }
    445489    }
    446490
     
    559603    mPxDriver.finalizeObject();
    560604}
     605
     606void LZ4GrepGenerator::generateByteStreamMultiplexingAioPipeline(re::RE* regex) {
     607    auto & iBuilder = mPxDriver.getBuilder();
     608    this->generateCountOnlyMainFunc(iBuilder);
     609
     610    this->generateLoadByteStreamAndBitStream(iBuilder);
     611    StreamSetBuffer * LineBreakStream;
     612    StreamSetBuffer * Matches;
     613    std::vector<re::RE*> res = {regex};
     614    std::tie(LineBreakStream, Matches) = multiplexingGrepPipeline(res, true, false, true);
     615
     616    kernel::Kernel * matchCountK = mPxDriver.addKernelInstance<kernel::PopcountKernel>(iBuilder);
     617    mPxDriver.makeKernelCall(matchCountK, {Matches}, {});
     618    mPxDriver.generatePipelineIR();
     619
     620    iBuilder->setKernel(matchCountK);
     621    Value * matchedLineCount = iBuilder->getAccumulator("countResult");
     622    matchedLineCount = iBuilder->CreateZExt(matchedLineCount, iBuilder->getInt64Ty());
     623
     624    mPxDriver.deallocateBuffers();
     625
     626    iBuilder->CreateRet(matchedLineCount);
     627
     628    mPxDriver.finalizeObject();
     629}
     630
    561631
    562632void LZ4GrepGenerator::generateMultiplexingBitStreamAioPipeline(re::RE* regex) {
     
    696766    mPxDriver.finalizeObject();
    697767}
     768
     769
    698770
    699771void LZ4GrepGenerator::generateAioPipeline(re::RE *regex) {
  • icGREP/icgrep-devel/icgrep/lz4/LZ4GrepGenerator.h

    r6119 r6132  
    2828
    2929    void generateMultiplexingCompressedBitStream(std::vector<re::RE *> &REs);
    30     std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *> multiplexingGrepPipeline(std::vector<re::RE *> &REs, bool useAio = false, bool useSwizzled = true);
     30    std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *> multiplexingGrepPipeline(std::vector<re::RE *> &REs, bool useAio = false, bool useSwizzled = true, bool useByteStream = false);
    3131
    3232
     
    3838    void generateBitStreamAioPipeline(re::RE* regex);
    3939
     40    void generateByteStreamMultiplexingAioPipeline(re::RE* regex);
    4041    void generateAioPipeline(re::RE* regex);
    4142    void generateParallelAioPipeline(re::RE* regex, bool enableGather, bool enableScatter, int minParallelLevel);
     
    7677    parabix::StreamSetBuffer * convertCompressedBitsStreamWithSwizzledAioApproach(
    7778            parabix::StreamSetBuffer *compressedBitStream, int numberOfStream, std::string prefix);
     79    parabix::StreamSetBuffer * convertCompressedBitsStreamWithByteStreamAioApproach(
     80            parabix::StreamSetBuffer *compressedBitStream, int numberOfStream, std::string prefix);
    7881
    7982
  • icGREP/icgrep-devel/icgrep/lz4_grep.cpp

    r6119 r6132  
    9595                g.generateBitStreamAioPipeline(re_ast);
    9696            }
    97 
    9897        } else {
    99             g.generateAioPipeline(re_ast);
     98            if (enableMultiplexing) {
     99                g.generateByteStreamMultiplexingAioPipeline(re_ast);
     100            } else {
     101                g.generateAioPipeline(re_ast);
     102            }
    100103        }
    101104
Note: See TracChangeset for help on using the changeset viewer.