Ignore:
Timestamp:
Jun 24, 2018, 1:24:36 AM (12 months ago)
Author:
xwa163
Message:
  1. Clean up LZ4 AIO-related kernels
  2. Improve LZ4ParallelByteStreamAIOKernel
  3. Implement simd_cttz
Location:
icGREP/icgrep-devel/icgrep/lz4
Files:
2 deleted
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/lz4/LZ4Generator.cpp

    r6089 r6111  
    2323#include <kernels/lz4/lz4_block_decoder.h>
    2424#include <kernels/lz4/lz4_index_builder.h>
    25 #include <kernels/lz4/lz4_index_builder_new.h>
    26 #include <kernels/lz4/lz4_bytestream_aio.h>
    27 #include <kernels/lz4/lz4_parallel_bytestream_aio.h>
    28 #include <kernels/lz4/lz4_swizzled_aio.h>
     25#include <kernels/lz4/aio/lz4_bytestream_aio.h>
     26#include <kernels/lz4/aio/lz4_parallel_bytestream_aio.h>
     27#include <kernels/lz4/aio/lz4_swizzled_aio.h>
    2928#include <kernels/bitstream_pdep_kernel.h>
    3029#include <kernels/lz4/lz4_bitstream_not_kernel.h>
     
    3635using namespace kernel;
    3736
    38 LZ4Generator::LZ4Generator():mPxDriver("lz4d") {
     37LZ4Generator::LZ4Generator():mPxDriver("lz4d"), mLz4BlockSize(4 * 1024 * 1024) {
    3938    mCompressionMarker = NULL;
    4039}
     
    6564
    6665
    67     Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderNewKernel>(iBuilder);
     66    Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder);
    6867    blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(mHasBlockChecksum, iBuilder->getInt1Ty()), mHeaderSize, mFileSize});
    6968    mPxDriver.makeKernelCall(blockDecoderK, {mCompressedByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd});
     
    7978    mDepositMarker = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getDecompressedBufferBlocks(iBuilder));
    8079
    81     Kernel* Lz4IndexBuilderK = mPxDriver.addKernelInstance<LZ4IndexBuilderNewKernel>(iBuilder);
     80    Kernel* Lz4IndexBuilderK = mPxDriver.addKernelInstance<LZ4IndexBuilderKernel>(iBuilder);
    8281    Lz4IndexBuilderK->setInitialArguments({mFileSize});
    8382    mPxDriver.makeKernelCall(
     
    402401    sourceK->setInitialArguments({mInputStream, mFileSize});
    403402    mPxDriver.makeKernelCall(sourceK, {}, {mCompressedByteStream});
    404     Kernel * s2pk = mPxDriver.addKernelInstance<S2PKernel>(iBuilder, cc::BitNumbering::BigEndian);
     403    Kernel * s2pk = mPxDriver.addKernelInstance<S2PKernel>(iBuilder, cc::BitNumbering::LittleEndian);
    405404    mPxDriver.makeKernelCall(s2pk, {mCompressedByteStream}, {mCompressedBasisBits});
    406405}
     
    413412
    414413    //// Generate Helper Markers Extenders, FX, XF
    415     StreamSetBuffer * const Extenders = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder), 1);
    416     mMatchOffsetMarker = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder));
    417     Kernel * extenderK = mPxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "extenders", std::vector<re::CC *>{re::makeCC(0xFF)}, 8);
    418     mPxDriver.makeKernelCall(extenderK, {mCompressedBasisBits}, {Extenders});
    419 
    420 
    421     Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderNewKernel>(iBuilder);
     414//    StreamSetBuffer * const Extenders = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder), 1);
     415//    mMatchOffsetMarker = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder));
     416//    Kernel * extenderK = mPxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "extenders", std::vector<re::CC *>{re::makeCC(0xFF)}, 8);
     417//    mPxDriver.makeKernelCall(extenderK, {mCompressedBasisBits}, {Extenders});
     418
     419
     420    Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder);
    422421    blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(mHasBlockChecksum, iBuilder->getInt1Ty()), mHeaderSize, mFileSize});
    423422    mPxDriver.makeKernelCall(blockDecoderK, {mCompressedByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd});
     
    442441            {
    443442                    mCompressedByteStream,
    444                     Extenders,
     443
     444//                    Extenders,
    445445
    446446                    // Block Data
     
    465465}
    466466
    467 parabix::StreamSetBuffer * LZ4Generator::generateParallelAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, bool enableGather, bool enableScatter) {
     467parabix::StreamSetBuffer * LZ4Generator::generateParallelAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, bool enableGather, bool enableScatter, int minParallelLevel) {
    468468    //// Decode Block Information
    469469    StreamSetBuffer * const BlockData_IsCompressed = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getInputBufferBlocks(iBuilder), 1);
     
    477477//    mPxDriver.makeKernelCall(extenderK, {mCompressedBasisBits}, {Extenders});
    478478
    479     Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderNewKernel>(iBuilder);
     479    Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder);
    480480    blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(mHasBlockChecksum, iBuilder->getInt1Ty()), mHeaderSize, mFileSize});
    481481    mPxDriver.makeKernelCall(blockDecoderK, {mCompressedByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd});
     
    484484    StreamSetBuffer * const decompressionByteStream = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks(iBuilder), 1);
    485485
    486     Kernel* lz4AioK = mPxDriver.addKernelInstance<LZ4ParallelByteStreamAioKernel>(iBuilder, enableGather, enableScatter);
     486    Kernel* lz4AioK = mPxDriver.addKernelInstance<LZ4ParallelByteStreamAioKernel>(iBuilder, mLz4BlockSize, enableGather, enableScatter, minParallelLevel);
    487487    lz4AioK->setInitialArguments({mFileSize});
    488488    mPxDriver.makeKernelCall(
     
    490490            {
    491491                    mCompressedByteStream,
     492
     493//                    Extenders,
    492494
    493495                    // Block Data
     
    511513
    512514    //// Generate Helper Markers Extenders
    513     /*
    514     StreamSetBuffer * const Extenders = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(), 1);
    515     mMatchOffsetMarker = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
    516     Kernel * extenderK = mPxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "extenders", std::vector<re::CC *>{re::makeCC(0xFF)}, 8);
    517     mPxDriver.makeKernelCall(extenderK, {mCompressedBasisBits}, {Extenders});
    518     */
    519 
    520 
    521     Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderNewKernel>(iBuilder);
     515//    StreamSetBuffer * const Extenders = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder), 1);
     516//    mMatchOffsetMarker = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder));
     517//    Kernel * extenderK = mPxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "extenders", std::vector<re::CC *>{re::makeCC(0xFF)}, 8);
     518//    mPxDriver.makeKernelCall(extenderK, {mCompressedBasisBits}, {Extenders});
     519
     520
     521    Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder);
    522522    blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(mHasBlockChecksum, iBuilder->getInt1Ty()), mHeaderSize, mFileSize});
    523523    mPxDriver.makeKernelCall(blockDecoderK, {mCompressedByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd});
     
    558558
    559559
    560     Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderNewKernel>(iBuilder);
     560    Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder);
    561561    blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(mHasBlockChecksum, iBuilder->getInt1Ty()), mHeaderSize, mFileSize});
    562562    mPxDriver.makeKernelCall(blockDecoderK, {mCompressedByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd});
     
    636636
    637637int LZ4Generator::get4MbBufferBlocks() {
    638     return 4 * 1024 * 1024 / codegen::BlockSize;
     638    return mLz4BlockSize / codegen::BlockSize;
    639639}
    640640
  • icGREP/icgrep-devel/icgrep/lz4/LZ4Generator.h

    r6081 r6111  
    4141    virtual void generateLoadByteStreamAndBitStream(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    4242    virtual void generateExtractAndDepositMarkers(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    43     virtual parabix::StreamSetBuffer * generateParallelAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, bool enableGather, bool enableScatter);
     43    virtual parabix::StreamSetBuffer * generateParallelAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, bool enableGather, bool enableScatter, int minParallelLevel);
    4444    virtual parabix::StreamSetBuffer * generateAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    4545    virtual parabix::StreamSetBuffer * generateSwizzledAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     
    7676    // M0CountMarker will not contain anything, it will only be used to pass producedItemCount and manage processedItemCount between different kernel
    7777    parabix::StreamSetBuffer * mM0Marker;
     78
     79    unsigned mLz4BlockSize;
    7880};
    7981
  • icGREP/icgrep-devel/icgrep/lz4/LZ4GrepGenerator.cpp

    r6089 r6111  
    66#include <llvm/Support/PrettyStackTrace.h>
    77
    8 #include <cc/alphabet.h>
    98#include <cc/cc_compiler.h>
    109
     
    2322#include <kernels/lz4/lz4_bitstream_match_copy_kernel.h>
    2423#include <kernels/lz4/lz4_bitstream_not_kernel.h>
    25 #include <kernels/lz4/lz4_fake_stream_generating_kernel.h>
     24#include <kernels/fake_stream_generating_kernel.h>
    2625#include <kernels/bitstream_pdep_kernel.h>
    2726#include <kernels/bitstream_gather_pdep_kernel.h>
     
    5352#include <llvm/Support/Debug.h>
    5453#include <kernels/lz4/lz4_block_decoder.h>
    55 #include <kernels/lz4/lz4_swizzled_aio.h>
     54#include <kernels/lz4/aio/lz4_swizzled_aio.h>
    5655
    5756
     
    124123
    125124
    126     Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderNewKernel>(iBuilder);
     125    Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder);
    127126    blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(mHasBlockChecksum, iBuilder->getInt1Ty()), mHeaderSize, mFileSize});
    128127    mPxDriver.makeKernelCall(blockDecoderK, {mCompressedByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd});
     
    281280
    282281    StreamSetBuffer * fakeMatchCopiedBits = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(8), this->getInputBufferBlocks(idb));
    283     Kernel* fakeStreamGeneratorK = mPxDriver.addKernelInstance<LZ4FakeStreamGeneratingKernel>(idb, numOfCharacterClasses, 8);
     282    Kernel* fakeStreamGeneratorK = mPxDriver.addKernelInstance<FakeStreamGeneratingKernel>(idb, numOfCharacterClasses, 8);
    284283    mPxDriver.makeKernelCall(fakeStreamGeneratorK, {decompressedCharClasses}, {fakeMatchCopiedBits});
    285284
     
    372371
    373372    StreamSetBuffer * fakeMatchCopiedBits = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(8), this->getInputBufferBlocks(idb));
    374     Kernel* fakeStreamGeneratorK = mPxDriver.addKernelInstance<LZ4FakeStreamGeneratingKernel>(idb, numOfCharacterClasses, 8);
     373    Kernel* fakeStreamGeneratorK = mPxDriver.addKernelInstance<FakeStreamGeneratingKernel>(idb, numOfCharacterClasses, 8);
    375374    mPxDriver.makeKernelCall(fakeStreamGeneratorK, {decompressedCharClasses}, {fakeMatchCopiedBits});
    376375
     
    523522}
    524523
    525 void LZ4GrepGenerator::generateMultiplexingSwizzledAioPipeline(re::RE* regex) {
    526     auto & iBuilder = mPxDriver.getBuilder();
    527     this->generateMainFunc(iBuilder);
    528 
    529     // GeneratePipeline
    530     this->generateLoadByteStreamAndBitStream(iBuilder);
    531 
    532     std::vector<re::RE*> res = {regex};
    533     this->generateMultiplexingCompressedBitStream(res);
    534 
    535     mPxDriver.generatePipelineIR();
    536     mPxDriver.deallocateBuffers();
    537 
    538     iBuilder->CreateRetVoid();
    539 
    540     mPxDriver.finalizeObject();
    541 }
    542 
    543 void LZ4GrepGenerator::generateMultiplexingSwizzledAioPipeline2(re::RE* regex) {
     524void LZ4GrepGenerator::generateMultiplexingSwizzledAioPipeline(re::RE *regex) {
    544525    auto & iBuilder = mPxDriver.getBuilder();
    545526    this->generateCountOnlyMainFunc(iBuilder);
     
    607588}
    608589
    609 void LZ4GrepGenerator::generateParallelAioPipeline(re::RE* regex, bool enableGather, bool enableScatter) {
     590void LZ4GrepGenerator::generateParallelAioPipeline(re::RE* regex, bool enableGather, bool enableScatter, int minParallelLevel) {
    610591    auto & iBuilder = mPxDriver.getBuilder();
    611592    this->generateCountOnlyMainFunc(iBuilder);
    612593
    613594    this->generateLoadByteStream(iBuilder);
    614     parabix::StreamSetBuffer * decompressedByteStream = this->generateParallelAIODecompression(iBuilder, enableGather, enableScatter);
     595    parabix::StreamSetBuffer * decompressedByteStream = this->generateParallelAIODecompression(iBuilder, enableGather, enableScatter, minParallelLevel);
    615596
    616597
    617598    StreamSetBuffer * const decompressionBitStream = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), this->getDecompressedBufferBlocks(iBuilder));
    618     Kernel * s2pk = mPxDriver.addKernelInstance<S2PKernel>(iBuilder, cc::BitNumbering::BigEndian, /*aligned = */ true, "a");
    619 //    Kernel * s2pk = mPxDriver.addKernelInstance<S2PByPextKernel>(iBuilder, cc::BitNumbering::BigEndian, "a");
     599    Kernel * s2pk = mPxDriver.addKernelInstance<S2PKernel>(iBuilder, cc::BitNumbering::LittleEndian, /*aligned = */ true, "a");
     600//    Kernel * s2pk = mPxDriver.addKernelInstance<S2PByPextKernel>(iBuilder, cc::BitNumbering::LittleEndian, "a");
    620601    mPxDriver.makeKernelCall(s2pk, {decompressedByteStream}, {decompressionBitStream});
    621602
     
    651632
    652633    // GeneratePipeline
     634    this->generateLoadByteStream(iBuilder);
    653635//    this->generateLoadByteStreamAndBitStream(iBuilder);
    654     this->generateLoadByteStream(iBuilder);
     636
    655637    parabix::StreamSetBuffer * decompressedByteStream = this->generateAIODecompression(iBuilder);
    656638
    657639
    658640    StreamSetBuffer * const decompressionBitStream = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), this->getDecompressedBufferBlocks(iBuilder));
    659     Kernel * s2pk = mPxDriver.addKernelInstance<S2PKernel>(iBuilder, cc::BitNumbering::BigEndian, /*aligned = */ true, "a");
     641    Kernel * s2pk = mPxDriver.addKernelInstance<S2PKernel>(iBuilder, cc::BitNumbering::LittleEndian, /*aligned = */ true, "a");
    660642//    Kernel * s2pk = mPxDriver.addKernelInstance<S2PByPextKernel>(iBuilder, "a");
    661643    mPxDriver.makeKernelCall(s2pk, {decompressedByteStream}, {decompressionBitStream});
  • icGREP/icgrep-devel/icgrep/lz4/LZ4GrepGenerator.h

    r6081 r6111  
    3333    void invokeScanMatchGrep(char* fileBuffer, size_t blockStart, size_t blockEnd, bool hasBlockChecksum);
    3434
    35     void generateMultiplexingSwizzledAioPipeline(re::RE* regex);
    36     void generateMultiplexingSwizzledAioPipeline2(re::RE* regex);
     35    void generateMultiplexingSwizzledAioPipeline(re::RE *regex);
    3736
    3837    void generateSwizzledAioPipeline(re::RE* regex);
    3938
    4039    void generateAioPipeline(re::RE* regex);
    41     void generateParallelAioPipeline(re::RE* regex, bool enableGather, bool enableScatter);
     40    void generateParallelAioPipeline(re::RE* regex, bool enableGather, bool enableScatter, int minParallelLevel);
    4241
    4342    ScanMatchGrepMainFunctionType getScanMatchGrepMainFunction();
Note: See TracChangeset for help on using the changeset viewer.