Changeset 6132
- Timestamp:
- Jul 23, 2018, 4:56:33 AM (7 months ago)
- Location:
- icGREP/icgrep-devel/icgrep
- Files:
-
- 2 added
- 20 edited
Legend:
- Unmodified
- Added
- Removed
-
icGREP/icgrep-devel/icgrep/CMakeLists.txt
r6123 r6132 103 103 add_library(UCDlib UCD/CaseFolding.cpp utf8_encoder.cpp utf16_encoder.cpp UCD/ucd_compiler.cpp UCD/PropertyObjects.cpp UCD/resolve_properties.cpp) 104 104 add_library(GrepEngine ${GREP_CORE_SRC} grep/grep_engine.cpp kernels/cc_kernel.cpp kernels/cc_scan_kernel.cpp kernels/charclasses.cpp kernels/streams_merge.cpp kernels/until_n.cpp kernels/UCD_property_kernel.cpp kernels/grapheme_kernel.cpp) 105 add_library(LZ4_Lib lz4FrameDecoder.cpp kernels/cc_kernel.cpp kernels/lz4/lz4_deposit_uncompressed.cpp kernels/lz4/lz4_generate_deposit_stream.cpp kernels/pdep_kernel.cpp lz4/LZ4Generator.cpp kernels/lz4/lz4_block_decoder.cpp kernels/lz4/lz4_index_builder.cpp kernels/lz4/lz4_swizzled_match_copy_kernel.cpp kernels/bitstream_pdep_kernel.cpp kernels/bitstream_gather_pdep_kernel.cpp kernels/swizzled_multiple_pdep_kernel.cpp kernels/lz4/lz4_bitstream_not_kernel.cpp kernels/lz4/lz4_bitstream_match_copy_kernel.cpp kernels/fake_stream_generating_kernel.cpp kernels/lz4/aio/lz4_bytestream_aio.cpp kernels/lz4/aio/lz4_swizzled_aio.cpp kernels/lz4/aio/lz4_parallel_bytestream_aio.cpp kernels/lz4/aio/lz4_sequential_aio_base.cpp kernels/lz4/aio/lz4_sequential_aio_base.h kernels/lz4/aio/lz4_bitstream_aio.cpp kernels/lz4/aio/lz4_bitstream_aio.h )105 add_library(LZ4_Lib lz4FrameDecoder.cpp kernels/cc_kernel.cpp kernels/lz4/lz4_deposit_uncompressed.cpp kernels/lz4/lz4_generate_deposit_stream.cpp kernels/pdep_kernel.cpp lz4/LZ4Generator.cpp kernels/lz4/lz4_block_decoder.cpp kernels/lz4/lz4_index_builder.cpp kernels/lz4/lz4_swizzled_match_copy_kernel.cpp kernels/bitstream_pdep_kernel.cpp kernels/bitstream_gather_pdep_kernel.cpp kernels/swizzled_multiple_pdep_kernel.cpp kernels/lz4/lz4_bitstream_not_kernel.cpp kernels/lz4/lz4_bitstream_match_copy_kernel.cpp kernels/fake_stream_generating_kernel.cpp kernels/lz4/aio/lz4_bytestream_aio.cpp kernels/lz4/aio/lz4_swizzled_aio.cpp kernels/lz4/aio/lz4_parallel_bytestream_aio.cpp kernels/lz4/aio/lz4_sequential_aio_base.cpp 
kernels/lz4/aio/lz4_sequential_aio_base.h kernels/lz4/aio/lz4_bitstream_aio.cpp kernels/lz4/aio/lz4_bitstream_aio.h kernels/lz4/aio/lz4_i4_bytestream_aio.cpp kernels/lz4/aio/lz4_i4_bytestream_aio.h) 106 106 add_library(LZParabix_Lib lzparabix/LZParabixGenerator.cpp kernels/lzparabix/decoder/LZParabixBlockDecoder.cpp kernels/lzparabix/decoder/LZParabixBlockDecoder.h kernels/lzparabix/decoder/LZParabixAioKernel.cpp kernels/lzparabix/decoder/LZParabixAioKernel.h lzparabix/LZParabixGrepGenerator.cpp lzparabix/LZParabixGrepGenerator.h kernels/fake_stream_generating_kernel.cpp kernels/lzparabix/encoder/LZParabixCompressionKernel.cpp kernels/lzparabix/encoder/LZParabixCompressionKernel.h kernels/lzparabix/decoder/LZParabixLiteralDecoderKernel.cpp kernels/lzparabix/decoder/LZParabixLiteralDecoderKernel.h) 107 107 -
icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_bitstream_aio.cpp
r6118 r6132 17 17 std::vector<unsigned> numsOfBitStreams, 18 18 unsigned blockSize) 19 : LZ4SequentialAioBaseKernel(b, "LZ4ByteStreamAioKernel", blockSize), 19 : LZ4SequentialAioBaseKernel(b, "LZ4BitStreamAioKernel", blockSize), 20 20 mNumsOfBitStreams(numsOfBitStreams) 21 21 { … … 40 40 41 41 void LZ4BitStreamAioKernel::doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart, 42 llvm::Value *literalLength) { 42 llvm::Value *literalLength, llvm::Value* blockStart) { 43 43 // Constant 44 44 ConstantInt* INT_64_0 = b->getInt64(0); -
icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_bitstream_aio.h
r6118 r6132 12 12 protected: 13 13 virtual void doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart, 14 llvm::Value *literalLength) override; 14 llvm::Value *literalLength, llvm::Value* blockStart) override; 15 15 virtual void doMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset, 16 16 llvm::Value *matchLength) override; -
icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_bytestream_aio.cpp
r6111 r6132 15 15 16 16 namespace kernel{ 17 std::string LZ4ByteStreamAioKernel::getCopyByteStreamName() { 18 return mCopyOtherByteStream ? "targetByteStream" : "byteStream"; 19 } 17 20 18 LZ4ByteStreamAioKernel::LZ4ByteStreamAioKernel(const std::unique_ptr<kernel::KernelBuilder> &b, unsigned blockSize) 19 : LZ4SequentialAioBaseKernel(b, "LZ4ByteStreamAioKernel", blockSize) { 21 LZ4ByteStreamAioKernel::LZ4ByteStreamAioKernel(const std::unique_ptr<kernel::KernelBuilder> &b, bool copyOtherByteStream, unsigned blockSize) 22 : LZ4SequentialAioBaseKernel(b, "LZ4ByteStreamAioKernel", blockSize), 23 mCopyOtherByteStream(copyOtherByteStream) { 20 24 mStreamSetOutputs.push_back(Binding{b->getStreamSetTy(1, 8), "outputStream", BoundedRate(0, 1)}); 25 this->addScalar(b->getInt8PtrTy(), "temporaryInputPtr"); 26 if (copyOtherByteStream) { 27 mStreamSetInputs.push_back(Binding{b->getStreamSetTy(1, 8), "targetByteStream", RateEqualTo("byteStream")}); 28 } 21 29 } 22 30 23 31 void LZ4ByteStreamAioKernel::doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart, 24 llvm::Value *literalLength ) {32 llvm::Value *literalLength, llvm::Value* blockStart) { 25 33 unsigned fw = 64; 26 34 Type* INT_FW_PTR = b->getIntNTy(fw)->getPointerTo(); 27 35 28 Value* inputBytePtr = b->getRawInputPointer("byteStream", literalStart); 36 Value* inputBytePtr = b->getScalarField("temporaryInputPtr"); 37 inputBytePtr = b->CreateGEP(inputBytePtr, b->CreateSub(literalStart, blockStart)); 38 29 39 Value* inputPtr = b->CreatePointerCast(inputBytePtr, INT_FW_PTR); 30 40 … … 33 43 Value* outputPtr = b->getRawOutputPointer("outputStream", b->CreateURem(outputPos, outputBufferSize)); 34 44 outputPtr = b->CreatePointerCast(outputPtr, INT_FW_PTR); 35 36 // We can always assume that we have enough output buffer based on our output buffer allocation strategy (except in extract only case)37 45 38 46 BasicBlock* entryBlock = b->GetInsertBlock(); … … 120 128 } 121 129 130 void 
LZ4ByteStreamAioKernel::initializationMethod(const std::unique_ptr<KernelBuilder> &b) { 131 b->setScalarField("temporaryInputPtr", b->CreateMalloc(b->getSize(mBlockSize))); 132 } 133 134 void LZ4ByteStreamAioKernel::prepareProcessBlock(const std::unique_ptr<KernelBuilder> &b, llvm::Value* blockStart, llvm::Value* blockEnd) { 135 Value* rawInputPtr = b->CreatePointerCast(b->getRawInputPointer(this->getCopyByteStreamName(), b->getSize(0)), b->getInt8PtrTy()); 136 Value* inputCapacity = b->getCapacity(this->getCopyByteStreamName()); 137 138 Value* blockStartRem = b->CreateURem(blockStart, inputCapacity); 139 Value* remSize = b->CreateSub(inputCapacity, blockStartRem); 140 141 Value* blockSize = b->CreateSub(blockEnd, blockStart); 142 143 Value* copySize1 = b->CreateUMin(remSize, blockSize); 144 Value* copySize2 = b->CreateSub(blockSize, copySize1); 145 146 Value* temporayInputPtr = b->getScalarField("temporaryInputPtr"); 147 148 b->CreateMemCpy(temporayInputPtr, b->CreateGEP(rawInputPtr, blockStartRem), copySize1, 1); 149 b->CreateMemCpy(b->CreateGEP(temporayInputPtr, copySize1), rawInputPtr, copySize2, 1); 150 } 151 152 void LZ4ByteStreamAioKernel::beforeTermination(const std::unique_ptr<KernelBuilder> &b) { 153 b->CreateFree(b->getScalarField("temporaryInputPtr")); 154 // b->CallPrintInt("beforeTermination", b->getSize(0)); 155 } 156 122 157 } -
icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_bytestream_aio.h
r6111 r6132 9 9 class LZ4ByteStreamAioKernel : public LZ4SequentialAioBaseKernel { 10 10 public: 11 LZ4ByteStreamAioKernel(const std::unique_ptr<kernel::KernelBuilder> &b, unsigned blockSize = 4 * 1024 * 1024); 11 LZ4ByteStreamAioKernel(const std::unique_ptr<kernel::KernelBuilder> &b, bool copyOtherByteStream = false, unsigned blockSize = 4 * 1024 * 1024); 12 12 13 13 14 protected: 14 15 virtual void doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart, 15 llvm::Value *literalLength ) override;16 llvm::Value *literalLength, llvm::Value* blockStart) override; 16 17 virtual void doMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset, 17 18 llvm::Value *matchLength) override; 18 19 virtual void setProducedOutputItemCount(const std::unique_ptr<KernelBuilder> &b, llvm::Value* produced) override; 20 21 virtual void initializationMethod(const std::unique_ptr<KernelBuilder> &b) override; 22 virtual void prepareProcessBlock(const std::unique_ptr<KernelBuilder> &b, llvm::Value* blockStart, llvm::Value* blockEnd) override; 23 virtual void beforeTermination(const std::unique_ptr<KernelBuilder> &b) override; 24 25 private: 26 inline std::string getCopyByteStreamName(); 27 bool mCopyOtherByteStream; 28 19 29 }; 20 30 -
icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_sequential_aio_base.cpp
r6118 r6132 27 27 Binding{b->getStreamSetTy(1, 64), "blockStart", RateEqualTo("isCompressed"), AlwaysConsume()}, 28 28 Binding{b->getStreamSetTy(1, 64), "blockEnd", RateEqualTo("isCompressed"), AlwaysConsume()} 29 30 29 }, 31 30 //Outputs … … 43 42 Binding{b->getInt64Ty(), "outputPos"}, 44 43 45 46 }){ 44 Binding{b->getInt1Ty(), "hasCallInitialization"} 45 46 47 }), 48 mBlockSize(blockSize) { 47 49 this->setStride(blockSize); 48 50 addAttribute(MustExplicitlyTerminate()); … … 51 53 // ---- Kernel Methods 52 54 void LZ4SequentialAioBaseKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> &b) { 55 Value* hasCallInitialization = b->getScalarField("hasCallInitialization"); 56 57 BasicBlock* initializationBlock = b->CreateBasicBlock("initializationBlock"); 58 BasicBlock* entryBlock = b->CreateBasicBlock("entryBlock"); 53 59 BasicBlock* exitBlock = b->CreateBasicBlock("exitBlock"); 60 61 b->CreateLikelyCondBr(hasCallInitialization, entryBlock, initializationBlock); 62 63 // ---- initializationBlock 64 b->SetInsertPoint(initializationBlock); 65 b->setScalarField("hasCallInitialization", b->getInt1(true)); 66 this->initializationMethod(b); 67 b->CreateBr(entryBlock); 68 69 // ---- entryBlock 70 b->SetInsertPoint(entryBlock); 54 71 BasicBlock* blockEndConBlock = b->CreateBasicBlock("blockEndConBlock"); 55 72 … … 72 89 73 90 b->SetInsertPoint(processBlock); 74 75 91 //TODO handle uncompressed block 92 this->prepareProcessBlock(b, blockStart, blockEnd); 93 76 94 this->processCompressedLz4Block(b, blockStart, blockEnd); 95 77 96 this->storePendingOutput(b); 78 97 … … 87 106 b->CreateBr(exitBlock); 88 107 108 // ---- exitBlock 89 109 b->SetInsertPoint(exitBlock); 110 111 BasicBlock* beforeTerminationBlock = b->CreateBasicBlock("beforeTerminationBlock"); 112 BasicBlock* terminationBlock = b->CreateBasicBlock("terminationBlock"); 113 114 b->CreateUnlikelyCondBr(b->getTerminationSignal(), beforeTerminationBlock, terminationBlock); 115 116 // ---- 
beforeTerminationBlock 117 b->SetInsertPoint(beforeTerminationBlock); 118 this->beforeTermination(b); 119 b->CreateBr(terminationBlock); 120 121 // ---- terminationBlock 122 b->SetInsertPoint(terminationBlock); 90 123 } 91 124 … … 116 149 b->SetInsertPoint(processBody); 117 150 /* 118 auto accelerationRet = this->doAcceleration(b, phiCursorValue, lz4Block End);151 auto accelerationRet = this->doAcceleration(b, phiCursorValue, lz4BlockStart, lz4BlockEnd); 119 152 Value* tokenMarkers = accelerationRet.first.first; 120 153 … … 125 158 nextTokenGlobalPos = this->processLz4Sequence(b, nextTokenGlobalPos, lz4BlockEnd); 126 159 */ 127 Value* nextTokenGlobalPos = this->processLz4Sequence(b, phiCursorValue, lz4Block End);160 Value* nextTokenGlobalPos = this->processLz4Sequence(b, phiCursorValue, lz4BlockStart, lz4BlockEnd); 128 161 phiCursorValue->addIncoming(nextTokenGlobalPos, b->GetInsertBlock()); 129 162 b->CreateBr(processCon); … … 133 166 134 167 std::pair<std::pair<llvm::Value *, llvm::Value *>, llvm::Value *> 135 LZ4SequentialAioBaseKernel::doAcceleration(const std::unique_ptr<KernelBuilder> &b, llvm::Value *beginTokenPos, 136 llvm::Value *blockEnd) { 168 LZ4SequentialAioBaseKernel::doAcceleration( 169 const std::unique_ptr<KernelBuilder> &b, 170 llvm::Value *beginTokenPos, 171 llvm::Value *blockStart, 172 llvm::Value *blockEnd) { 137 173 BasicBlock* entryBlock = b->GetInsertBlock(); 138 174 … … 229 265 // TODO all of the literal data here will always be in the same 64-bit literal block, it may be better if we provide 230 266 // this information to the literal copy method, especially when we are working with swizzled form 231 this->doAccelerationLiteralCopy(b, literalStartGlobalPos, literalLength );267 this->doAccelerationLiteralCopy(b, literalStartGlobalPos, literalLength, blockStart); 232 268 this->doAccelerationMatchCopy(b, matchOffset, matchLength); 233 269 … … 246 282 } 247 283 248 llvm::Value *LZ4SequentialAioBaseKernel::processLz4Sequence(const 
std::unique_ptr<KernelBuilder> &b, 249 llvm::Value *beginTokenPos, 250 llvm::Value *lz4BlockEnd) { 284 llvm::Value *LZ4SequentialAioBaseKernel::processLz4Sequence( 285 const std::unique_ptr<KernelBuilder> &b, 286 llvm::Value *beginTokenPos, 287 llvm::Value *lz4BlockStart, 288 llvm::Value *lz4BlockEnd) { 251 289 // Constant 252 290 ConstantInt* SIZE_0 = b->getSize(0); … … 307 345 308 346 // This literal copy will always cross 64 bits literal boundary 309 this->doLiteralCopy(b, literalStartPos, literalLength );347 this->doLiteralCopy(b, literalStartPos, literalLength, lz4BlockStart); 310 348 BasicBlock* extendLiteralEndFinal = b->GetInsertBlock(); 311 349 -
icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_sequential_aio_base.h
r6118 r6132 28 28 // ---- Constant 29 29 const static unsigned int ACCELERATION_WIDTH = 64; 30 const unsigned mBlockSize; 30 31 31 32 // ---- Kernel Methods … … 36 37 llvm::Value *lz4BlockEnd); 37 38 38 std::pair<std::pair<llvm::Value *, llvm::Value *>, llvm::Value *> 39 doAcceleration(const std::unique_ptr<KernelBuilder> &b, llvm::Value *beginTokenPos, 40 llvm::Value *blockEnd); 39 std::pair<std::pair<llvm::Value *, llvm::Value *>, llvm::Value *> doAcceleration( 40 const std::unique_ptr<KernelBuilder> &b, 41 llvm::Value *beginTokenPos, 42 llvm::Value *blockStart, 43 llvm::Value *blockEnd); 41 44 42 45 43 virtual llvm::Value *processLz4Sequence(const std::unique_ptr<KernelBuilder> &b, 44 llvm::Value *beginTokenPos, llvm::Value *lz4BlockEnd); 46 virtual llvm::Value *processLz4Sequence( 47 const std::unique_ptr<KernelBuilder> &b, 48 llvm::Value *beginTokenPos, 49 llvm::Value *lz4BlockStart, 50 llvm::Value *lz4BlockEnd 51 ); 45 52 46 53 std::pair<llvm::Value*, llvm::Value*> parseMatchInfo(const std::unique_ptr<KernelBuilder> &b, llvm::Value* matchOffsetBeginPos, llvm::Value* tokenValue); … … 86 93 // ---- Methods To Be Override 87 94 95 virtual void initializationMethod(const std::unique_ptr<KernelBuilder> &b){}; 96 virtual void prepareProcessBlock(const std::unique_ptr<KernelBuilder> &b, llvm::Value* blockStart, llvm::Value* blockEnd){}; 97 virtual void beforeTermination(const std::unique_ptr<KernelBuilder> &b){}; 98 88 99 89 100 virtual void doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart, 90 llvm::Value *literalLength ) = 0;101 llvm::Value *literalLength, llvm::Value* blockStart) = 0; 91 102 92 103 virtual void doMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset, … … 98 109 virtual void prepareAcceleration(const std::unique_ptr<KernelBuilder> &b, llvm::Value* beginTokenPos) {}; 99 110 virtual void doAccelerationLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart, 100 llvm::Value 
*literalLength ) {this->doLiteralCopy(b, literalStart, literalLength);}111 llvm::Value *literalLength, llvm::Value* blockStart) {this->doLiteralCopy(b, literalStart, literalLength, blockStart);} 101 112 virtual void doAccelerationMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset, 102 113 llvm::Value *matchLength) {this->doMatchCopy(b, matchOffset, matchLength);} -
icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_swizzled_aio.cpp
r6111 r6132 57 57 58 58 void LZ4SwizzledAioKernel::doAccelerationLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart, 59 llvm::Value *literalLength ) {59 llvm::Value *literalLength, llvm::Value* blockStart) { 60 60 // this->handleAccelerationLiteralCopy(b, literalStart, literalLength, inputValuesVector); 61 61 … … 512 512 513 513 void LZ4SwizzledAioKernel::doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart, 514 llvm::Value *literalLength ) {514 llvm::Value *literalLength, llvm::Value* blockStart) { 515 515 Value* SIZE_64 = b->getSize(64); 516 516 Value* SIZE_0 = b->getSize(0); -
icGREP/icgrep-devel/icgrep/kernels/lz4/aio/lz4_swizzled_aio.h
r6111 r6132 55 55 56 56 virtual void doLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart, 57 llvm::Value *literalLength );57 llvm::Value *literalLength, llvm::Value* blockStart) override; 58 58 virtual void doMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset, 59 59 llvm::Value *matchLength); … … 63 63 virtual void prepareAcceleration(const std::unique_ptr<KernelBuilder> &b, llvm::Value* beginTokenPos) override; 64 64 virtual void doAccelerationLiteralCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *literalStart, 65 llvm::Value *literalLength ) override;65 llvm::Value *literalLength, llvm::Value* blockStart) override; 66 66 virtual void doAccelerationMatchCopy(const std::unique_ptr<KernelBuilder> &b, llvm::Value *matchOffset, 67 67 llvm::Value *matchLength) override; -
icGREP/icgrep-devel/icgrep/kernels/lzparabix/decoder/LZParabixAioKernel.cpp
r6131 r6132 43 43 { 44 44 Binding{b->getSizeTy(), "blockDataIndex"}, 45 Binding{b->getInt64Ty(), "outputPos"}, 46 45 Binding{b->getInt64Ty(), "outputPos"} 47 46 48 47 }), mNumsOfBitStreams(numsOfBitStreams) { … … 64 63 65 64 65 void LZParabixAioKernel::initScalarOutputPtr(const std::unique_ptr<KernelBuilder> &b) { 66 // b->CallPrintInt("------------------", b->getSize(0)); 67 for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) { 68 Value* ptr = b->CreatePointerCast(b->getOutputStreamBlockPtr("outputStream" + std::to_string(i), b->getSize(0)), b->getInt64Ty()->getPointerTo()); 69 b->setScalarField("currentOutputPtr_" + std::to_string(i), ptr); 70 71 for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) { 72 b->CreateStore(b->getInt64(0), b->CreateGEP(ptr, b->getInt32(j * 4))); 73 } 74 } 75 } 76 66 77 void LZParabixAioKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> &b) { 78 this->initScalarOutputPtr(b); 79 67 80 BasicBlock* exitBlock = b->CreateBasicBlock("exitBlock"); 68 81 BasicBlock* blockEndConBlock = b->CreateBasicBlock("blockEndConBlock"); … … 168 181 169 182 b->SetInsertPoint(exitBlock); 170 this->storePendingOutput(b);171 183 b->setProcessedItemCount("inputBitStream0", b->CreateAdd(literalStartPos, totalLiteralLength)); 172 184 } 173 174 185 175 186 std::pair<llvm::Value *, llvm::Value *> … … 339 350 } 340 351 341 342 352 // ---- Output 343 353 void LZParabixAioKernel::initPendingOutputScalar(const std::unique_ptr<KernelBuilder> &b) { 344 this->initPendingOutputScalar_BitStream(b); 345 // this->initPendingOutputScalar_Swizzled(b); 354 for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) { 355 for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) { 356 this->addScalar(b->getInt64Ty(), "pendingOutput" + std::to_string(i) + "_" + std::to_string(j)); 357 } 358 } 359 for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) { 360 this->addScalar(b->getInt64Ty()->getPointerTo(), "currentOutputPtr_" + std::to_string(i)); 361 } 346 362 } 347 363 
348 364 void LZParabixAioKernel::appendBitStreamOutput(const std::unique_ptr<KernelBuilder> &b, std::vector<llvm::Value*>& extractedValues, llvm::Value* valueLength) { 349 this->appendBitStreamOutput_BitStream(b, extractedValues, valueLength);350 // this->appendBitStreamOutput_Swizzled(b, extractedValues, valueLength);351 }352 353 void LZParabixAioKernel::storePendingOutput(const std::unique_ptr<KernelBuilder> &b) {354 BasicBlock* storePendingOutputBlock = b->CreateBasicBlock("storePendingOutputBlock");355 BasicBlock* storePendingOutputExitBlock = b->CreateBasicBlock("storePendingOutputExitBlock");356 357 Value* oldOutputPos = b->getScalarField("outputPos");358 b->CreateCondBr(359 b->CreateICmpNE(b->CreateURem(oldOutputPos, b->getSize(64)), b->getSize(0)),360 storePendingOutputBlock,361 storePendingOutputExitBlock362 );363 364 b->SetInsertPoint(storePendingOutputBlock);365 this->storePendingOutput_BitStream(b);366 // this->storePendingOutput_Swizzled(b);367 b->CreateBr(storePendingOutputExitBlock);368 369 b->SetInsertPoint(storePendingOutputExitBlock);370 }371 372 373 // ---- Output BitStream374 void LZParabixAioKernel::initPendingOutputScalar_BitStream(const std::unique_ptr<KernelBuilder> &b) {375 for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) {376 for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) {377 this->addScalar(b->getInt64Ty(), "pendingOutput" + std::to_string(i) + "_" + std::to_string(j));378 }379 }380 }381 382 void LZParabixAioKernel::appendBitStreamOutput_BitStream(const std::unique_ptr<KernelBuilder> &b, std::vector<llvm::Value*>& extractedValues, llvm::Value* valueLength) {383 365 BasicBlock* exitBlock = b->CreateBasicBlock("exitBlock"); 384 366 … … 389 371 390 372 unsigned iStreamIndex = 0; 391 for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) { 392 for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) { 393 Value* newValue = b->CreateOr(b->getScalarField("pendingOutput" + std::to_string(i) + "_" + std::to_string(j)), 
b->CreateShl(extractedValues[iStreamIndex], oldOutputPosRem64)); 394 newOutputVec.push_back(newValue); 373 374 for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) { 375 Value* outputPtr = b->getScalarField("currentOutputPtr_" + std::to_string(i)); 376 for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) { 377 Value* ptr = b->CreateGEP(outputPtr, b->getSize(j * 4)); 378 Value* newValue = b->CreateOr(b->CreateLoad(ptr), b->CreateShl(extractedValues[iStreamIndex], oldOutputPosRem64)); 379 b->CreateStore(newValue, ptr); 395 380 ++iStreamIndex; 396 381 } 397 382 } 398 383 399 BasicBlock* noStoreOutputBlock = b->CreateBasicBlock("noStoreOutputBlock");400 384 BasicBlock* storeOutputBlock =b->CreateBasicBlock("storeOutputBlock"); 401 402 b->CreateCondBr(b->CreateICmpULT(b->CreateAdd(oldOutputPosRem64, valueLength), b->getSize(64)), noStoreOutputBlock, storeOutputBlock); 403 404 // ---- noStoreOutputBlock 405 b->SetInsertPoint(noStoreOutputBlock); 406 407 iStreamIndex = 0; 408 for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) { 409 for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) { 410 b->setScalarField("pendingOutput" + std::to_string(i) + "_" + std::to_string(j), newOutputVec[iStreamIndex]); 411 ++iStreamIndex; 412 } 413 } 414 415 b->CreateBr(exitBlock); 385 b->CreateCondBr(b->CreateICmpULT(b->CreateAdd(oldOutputPosRem64, valueLength), b->getSize(64)), exitBlock, storeOutputBlock); 416 386 417 387 // ---- storeOutputBlock 418 388 b->SetInsertPoint(storeOutputBlock); 419 389 420 Value* oldOutputPosRem = b->CreateURem(oldOutputPos, b->getCapacity("outputStream0"));421 Value* oldOutputPosBitBlockIndex = b->CreateUDiv(oldOutputPosRem, b->getSize(b->getBitBlockWidth()));422 Value* oldOutputPosBitBlockRem = b->CreateURem(oldOutputPosRem, b->getSize(b->getBitBlockWidth()));423 424 iStreamIndex = 0;425 for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) {426 Value* outputBasePtr = b->CreatePointerCast(b->getRawOutputPointer("outputStream" + std::to_string(i), 
b->getSize(0)), b->getBitBlockType()->getPointerTo());427 Value* outputBitBlockBasePtr = b->CreateGEP(outputBasePtr, b->CreateMul(oldOutputPosBitBlockIndex, b->getSize(mNumsOfBitStreams[i])));428 outputBitBlockBasePtr = b->CreatePointerCast(outputBitBlockBasePtr, b->getInt64Ty()->getPointerTo());429 430 Value* oldOutputPosI64Index = b->CreateUDiv(oldOutputPosBitBlockRem, b->getSize(64));431 432 for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) {433 Value* targetPtr = b->CreateGEP(outputBitBlockBasePtr, b->CreateAdd(oldOutputPosI64Index, b->getSize(j * (b->getBitBlockWidth() / 64))));434 b->CreateStore(newOutputVec[iStreamIndex], targetPtr);435 ++iStreamIndex;436 }437 }438 439 390 Value* shiftAmount = b->CreateSub(b->getSize(0x40), oldOutputPosRem64); 440 391 Value* fullyShift = b->CreateICmpEQ(shiftAmount, b->getSize(0x40)); 441 392 393 Value* exceedBlock = b->CreateICmpUGE(b->CreateAdd(b->CreateURem(oldOutputPos, b->getSize(b->getBitBlockWidth())), valueLength), b->getSize(b->getBitBlockWidth())); 442 394 iStreamIndex = 0; 443 395 for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) { 444 for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) { 445 b->setScalarField("pendingOutput" + std::to_string(i) + "_" + std::to_string(j), b->CreateSelect(fullyShift, b->getInt64(0), b->CreateLShr(extractedValues[iStreamIndex], shiftAmount))); 396 Value* oldOutputPtr = b->getScalarField("currentOutputPtr_" + std::to_string(i)); 397 Value* distance = b->CreateSelect(exceedBlock, b->getSize(1 + (mNumsOfBitStreams[i] - 1) * b->getBitBlockWidth() / 64), b->getSize(1)); 398 Value* newOutputPtr = b->CreateGEP(oldOutputPtr, distance); 399 b->setScalarField("currentOutputPtr_" + std::to_string(i), newOutputPtr); 400 for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) { 401 Value* newValue = b->CreateSelect(fullyShift, b->getInt64(0), b->CreateLShr(extractedValues[iStreamIndex], shiftAmount)); 402 Value* ptr = b->CreateGEP(newOutputPtr, b->getSize(j * 4)); 403 
b->CreateStore(newValue, ptr); 446 404 ++iStreamIndex; 447 405 } … … 452 410 b->SetInsertPoint(exitBlock); 453 411 b->setScalarField("outputPos", b->CreateAdd(oldOutputPos, valueLength)); 454 } 455 456 void LZParabixAioKernel::storePendingOutput_BitStream(const std::unique_ptr<KernelBuilder> &b) { 457 Value* oldOutputPos = b->getScalarField("outputPos"); 458 Value* oldOutputPosRem = b->CreateURem(oldOutputPos, b->getCapacity("outputStream0")); 459 Value* oldOutputPosBitBlockIndex = b->CreateUDiv(oldOutputPosRem, b->getSize(b->getBitBlockWidth())); 460 Value* oldOutputPosBitBlockRem = b->CreateURem(oldOutputPosRem, b->getSize(b->getBitBlockWidth())); 461 Value* oldOutputPosI64Index = b->CreateUDiv(oldOutputPosBitBlockRem, b->getSize(64)); 462 463 unsigned iStreamIndex = 0; 464 for (unsigned i = 0; i < mNumsOfBitStreams.size(); i++) { 465 Value* outputBasePtr = b->CreatePointerCast(b->getRawOutputPointer("outputStream" + std::to_string(i), b->getSize(0)), b->getBitBlockType()->getPointerTo()); 466 Value* outputBitBlockBasePtr = b->CreateGEP(outputBasePtr, b->CreateMul(oldOutputPosBitBlockIndex, b->getSize(mNumsOfBitStreams[i]))); 467 outputBitBlockBasePtr = b->CreatePointerCast(outputBitBlockBasePtr, b->getInt64Ty()->getPointerTo()); 468 for (unsigned j = 0; j < mNumsOfBitStreams[i]; j++) { 469 Value* targetPtr = b->CreateGEP(outputBitBlockBasePtr, b->CreateAdd(oldOutputPosI64Index, b->getSize(j * (b->getBitBlockWidth() / 64)))); 470 b->CreateStore(b->getScalarField("pendingOutput" + std::to_string(i) + "_" + std::to_string(j)), targetPtr); 471 ++iStreamIndex; 472 } 473 } 474 } 475 476 // ---- Output Swizzled 477 void LZParabixAioKernel::initPendingOutputScalar_Swizzled(const std::unique_ptr<KernelBuilder> &b) { 478 for (unsigned i = 0; i < (mNumsOfBitStreams[0] + 3) / 4; i++) { 479 this->addScalar(b->getBitBlockType(), "pendingOutput" + std::to_string(0) + "_" + std::to_string(i)); 480 } 481 } 482 void LZParabixAioKernel::appendBitStreamOutput_Swizzled(const 
std::unique_ptr<KernelBuilder> &b, std::vector<llvm::Value*>& extractedValues, llvm::Value* valueLength) { 483 484 std::vector<llvm::Value*> extractedValuesVec; 485 for (unsigned i = 0; i < 2; i++) { 486 Value* vec = ConstantVector::getNullValue(b->getBitBlockType()); 487 for (unsigned j = 0; j < 4; j++) { 488 vec = b->CreateInsertElement(vec, extractedValues[i * 4 + j], j); 489 } 490 extractedValuesVec.push_back(vec); 491 } 492 493 BasicBlock* exitBlock = b->CreateBasicBlock("exitBlock"); 494 495 Value* oldOutputPos = b->getScalarField("outputPos"); 496 Value* oldOutputPosRem64 = b->CreateURem(oldOutputPos, b->getSize(64)); 497 498 std::vector<llvm::Value*> newOutputVec; 499 for (unsigned i = 0; i < 2; i++) { 500 Value* newValue = b->CreateOr(b->getScalarField("pendingOutput" + std::to_string(0) + "_" + std::to_string(i)), b->CreateShl(extractedValuesVec[i], b->simd_fill(64, oldOutputPosRem64))); 501 newOutputVec.push_back(newValue); 502 } 503 504 505 BasicBlock* noStoreOutputBlock = b->CreateBasicBlock("noStoreOutputBlock"); 506 BasicBlock* storeOutputBlock =b->CreateBasicBlock("storeOutputBlock"); 507 508 b->CreateCondBr(b->CreateICmpULT(b->CreateAdd(oldOutputPosRem64, valueLength), b->getSize(64)), noStoreOutputBlock, storeOutputBlock); 509 510 // ---- noStoreOutputBlock 511 b->SetInsertPoint(noStoreOutputBlock); 512 for (unsigned i = 0; i < 2; i++) { 513 b->setScalarField("pendingOutput" + std::to_string(0) + "_" + std::to_string(i), newOutputVec[i]); 514 } 515 b->CreateBr(exitBlock); 516 517 // ---- storeOutputBlock 518 b->SetInsertPoint(storeOutputBlock); 519 520 Value* oldOutputPosRem = b->CreateURem(oldOutputPos, b->getCapacity("outputStream0")); 521 Value* oldOutputPosBitBlockIndex = b->CreateUDiv(oldOutputPosRem, b->getSize(b->getBitBlockWidth())); 522 Value* oldOutputPosBitBlockRem = b->CreateURem(oldOutputPosRem, b->getSize(b->getBitBlockWidth())); 523 524 Value* outputBasePtr = b->CreatePointerCast(b->getRawOutputPointer("outputStream0", 
b->getSize(0)), b->getBitBlockType()->getPointerTo()); 525 Value* outputBitBlockBasePtr = b->CreateGEP(outputBasePtr, b->CreateMul(oldOutputPosBitBlockIndex, b->getSize(8))); 526 outputBitBlockBasePtr = b->CreatePointerCast(outputBitBlockBasePtr, b->getInt64Ty()->getPointerTo()); 527 528 Value* oldOutputPosI64Index = b->CreateUDiv(oldOutputPosBitBlockRem, b->getSize(64)); 529 530 for (unsigned i = 0; i < 2; i++) { 531 for (unsigned j = 0; j < 4; j++) { 532 Value* targetPtr = b->CreateGEP(outputBitBlockBasePtr, b->CreateAdd(oldOutputPosI64Index, b->getSize((i * 4 + j) * 4))); 533 b->CreateStore(b->CreateExtractElement(newOutputVec[i], j), targetPtr); 534 } 535 536 } 537 538 Value* shiftAmount = b->CreateSub(b->getSize(0x40), oldOutputPosRem64); 539 Value* fullyShift = b->CreateICmpEQ(shiftAmount, b->getSize(0x40)); 540 541 for (unsigned i = 0; i < 2; i++) { 542 543 b->setScalarField("pendingOutput" + std::to_string(0) + "_" + std::to_string(i), b->CreateSelect(fullyShift, ConstantVector::getNullValue(b->getBitBlockType()), b->CreateLShr(extractedValuesVec[i], b->simd_fill(64, shiftAmount)))); 544 } 545 546 b->CreateBr(exitBlock); 547 548 b->SetInsertPoint(exitBlock); 549 b->setScalarField("outputPos", b->CreateAdd(oldOutputPos, valueLength)); 550 551 } 552 553 void LZParabixAioKernel::storePendingOutput_Swizzled(const std::unique_ptr<KernelBuilder> &b) { 554 Value* oldOutputPos = b->getScalarField("outputPos"); 555 Value* oldOutputPosRem = b->CreateURem(oldOutputPos, b->getCapacity("outputStream0")); 556 Value* oldOutputPosBitBlockIndex = b->CreateUDiv(oldOutputPosRem, b->getSize(b->getBitBlockWidth())); 557 Value* oldOutputPosBitBlockRem = b->CreateURem(oldOutputPosRem, b->getSize(b->getBitBlockWidth())); 558 559 Value* oldOutputPosI64Index = b->CreateUDiv(oldOutputPosBitBlockRem, b->getSize(64)); 560 561 Value* outputBasePtr = b->CreatePointerCast(b->getRawOutputPointer("outputStream0", b->getSize(0)), b->getBitBlockType()->getPointerTo()); 562 Value* 
outputBitBlockBasePtr = b->CreateGEP(outputBasePtr, b->CreateMul(oldOutputPosBitBlockIndex, b->getSize(8))); 563 outputBitBlockBasePtr = b->CreatePointerCast(outputBitBlockBasePtr, b->getInt64Ty()->getPointerTo()); 564 565 vector<Value*> pendingOutputVec; 566 for (unsigned i = 0; i < 2; i++) { 567 pendingOutputVec.push_back(b->getScalarField("pendingOutput" + std::to_string(0) + "_" + std::to_string(i))); 568 } 569 570 for (unsigned i = 0; i < 2; i++) { 571 for (unsigned j = 0; j < 2; j++) { 572 Value* targetPtr = b->CreateGEP(outputBitBlockBasePtr, b->CreateAdd(oldOutputPosI64Index, b->getSize((i * 4 + j) * 4))); 573 b->CreateStore(b->CreateExtractElement(pendingOutputVec[i], j), targetPtr); 574 } 575 } 576 } 412 413 } 414 577 415 } -
icGREP/icgrep-devel/icgrep/kernels/lzparabix/decoder/LZParabixAioKernel.h
r6123 r6132 41 41 std::vector<unsigned> mNumsOfBitStreams; 42 42 43 44 43 // ---- Output 45 44 void initPendingOutputScalar(const std::unique_ptr<KernelBuilder> &b); 46 45 void appendBitStreamOutput(const std::unique_ptr<KernelBuilder> &b, std::vector<llvm::Value*>& extractedValues, llvm::Value* valueLength); 47 void storePendingOutput(const std::unique_ptr<KernelBuilder> &b);48 46 49 50 void initPendingOutputScalar_BitStream(const std::unique_ptr<KernelBuilder> &b); 51 void appendBitStreamOutput_BitStream(const std::unique_ptr<KernelBuilder> &b, std::vector<llvm::Value*>& extractedValues, llvm::Value* valueLength); 52 void storePendingOutput_BitStream(const std::unique_ptr<KernelBuilder> &b); 53 54 55 void initPendingOutputScalar_Swizzled(const std::unique_ptr<KernelBuilder> &b); 56 void appendBitStreamOutput_Swizzled(const std::unique_ptr<KernelBuilder> &b, std::vector<llvm::Value*>& extractedValues, llvm::Value* valueLength); 57 void storePendingOutput_Swizzled(const std::unique_ptr<KernelBuilder> &b); 47 void initScalarOutputPtr(const std::unique_ptr<KernelBuilder> &b); 58 48 }; 59 49 -
icGREP/icgrep-devel/icgrep/kernels/p2s_kernel.cpp
r6093 r6132 45 45 } 46 46 } 47 48 49 P2S4StreamByPDEP::P2S4StreamByPDEP(const std::unique_ptr<kernel::KernelBuilder> & b) 50 : BlockOrientedKernel("P2S4StreamByPDEP", 51 {Binding{b->getStreamSetTy(4, 1), "basisBits"}}, 52 {Binding{b->getStreamSetTy(1, 4), "byteStream"}}, 53 {}, {}, {}) 54 { 55 } 56 57 58 void P2S4StreamByPDEP::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) { 59 Function * PDEPFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pdep_64); 60 uint64_t pdepBaseMask = 0x1111111111111111; 61 62 Value* inputBlocks[4]; 63 64 for (unsigned i = 0; i < 4; i++) { 65 inputBlocks[i] = b->loadInputStreamBlock("basisBits", b->getInt32(i)); 66 } 67 Value* outputBasePtr = b->CreatePointerCast(b->getOutputStreamBlockPtr("byteStream", b->getSize(0)), b->getInt64Ty()->getPointerTo()); 68 69 for (unsigned i = 0; i < b->getBitBlockWidth() / 64; i++) { 70 Value* currentInput[4]; 71 for (unsigned iIndex = 0; iIndex < 4; iIndex++) { 72 currentInput[iIndex] = b->CreateExtractElement(inputBlocks[iIndex], i); 73 } 74 75 for (unsigned j = 0; j < 4; j++) { 76 unsigned outputIndex = i * 4 + j; 77 Value* retI64 = b->getInt64(0); 78 for (unsigned k = 0; k < 4; k++) { 79 Value* newBits = b->CreateCall( 80 PDEPFunc,{ 81 b->CreateLShr(currentInput[k], b->getInt64(j * 16)), 82 b->getInt64(pdepBaseMask << k) 83 } 84 ); 85 retI64 = b->CreateOr(retI64, newBits); 86 } 87 b->CreateStore(retI64, b->CreateGEP(outputBasePtr, b->getInt32(outputIndex))); 88 } 89 } 90 91 // for (unsigned i = 0; i < 4; i++) { 92 // b->CallPrintRegister("input" + std::to_string(i), inputBlocks[i]); 93 // } 94 // 95 // Value* outputBaseBlockPtr = b->CreatePointerCast(b->getOutputStreamBlockPtr("byteStream", b->getSize(0)), b->getBitBlockType()->getPointerTo()); 96 // for (unsigned i = 0; i < 4; i++) { 97 // b->CallPrintRegister("output" + std::to_string(i), b->CreateLoad(b->CreateGEP(outputBaseBlockPtr, b->getInt32(i)))); 98 // } 99 100 } 101 47 102 48 103 void 
P2SKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) { 49 104 Value * p_bitblock[8]; 50 105 for (unsigned i = 0; i < 8; i++) { 51 p_bitblock[i] = b->loadInputStreamBlock("basisBits", b->getInt32(i)); 106 if (i < mNumOfStreams) { 107 p_bitblock[i] = b->loadInputStreamBlock("basisBits", b->getInt32(i)); 108 } else { 109 p_bitblock[i] = ConstantVector::getNullValue(b->getBitBlockType()); 110 } 111 52 112 } 53 113 Value * s_bytepack[8]; … … 171 231 } 172 232 173 P2SKernel::P2SKernel(const std::unique_ptr<kernel::KernelBuilder> & b, cc::BitNumbering numbering) 174 : BlockOrientedKernel("p2s" + cc::numberingSuffix(numbering), 175 {Binding{b->getStreamSetTy(8, 1), "basisBits"}}, 233 234 235 236 P2SKernel::P2SKernel(const std::unique_ptr<kernel::KernelBuilder> & b, cc::BitNumbering numbering, std::string prefix, unsigned numOfStreams) 237 : BlockOrientedKernel(prefix + "p2s" + cc::numberingSuffix(numbering), 238 {Binding{b->getStreamSetTy(numOfStreams, 1), "basisBits"}}, 176 239 {Binding{b->getStreamSetTy(1, 8), "byteStream"}}, 177 240 {}, {}, {}), 178 mBasisSetNumbering(numbering) { 241 mBasisSetNumbering(numbering), 242 mNumOfStreams(numOfStreams) { 179 243 } 180 244 -
icGREP/icgrep-devel/icgrep/kernels/p2s_kernel.h
r6089 r6132 13 13 namespace kernel { 14 14 15 class P2S4StreamByPDEP final : public BlockOrientedKernel{ 16 public: 17 P2S4StreamByPDEP(const std::unique_ptr<kernel::KernelBuilder> & b); 18 private: 19 void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & b) override; 20 }; 21 15 22 class P2SKernel final : public BlockOrientedKernel { 16 23 public: 17 P2SKernel(const std::unique_ptr<kernel::KernelBuilder> & b, cc::BitNumbering basisNumbering = cc::BitNumbering::LittleEndian );24 P2SKernel(const std::unique_ptr<kernel::KernelBuilder> & b, cc::BitNumbering basisNumbering = cc::BitNumbering::LittleEndian, std::string prefix = "", unsigned numOfStreams = 8); 18 25 bool isCachable() const override { return true; } 19 26 bool hasSignature() const override { return false; } … … 21 28 cc::BitNumbering mBasisSetNumbering; 22 29 void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & b) override; 30 unsigned mNumOfStreams; 23 31 }; 24 32 -
icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp
r6112 r6132 24 24 Value * x0 = iBuilder->esimd_mergel(128, s0, s1); 25 25 Value * x1 = iBuilder->esimd_mergeh(128, s0, s1); 26 t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1); 26 27 t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1); // TODO: with 4 bit streams, should the 16 here be changed to 8? 27 28 t1 = iBuilder->hsimd_packl_in_lanes(PACK_LANES, 16, x0, x1); 29 28 30 } else { 29 31 t0 = iBuilder->hsimd_packh(16, s0, s1); 30 32 t1 = iBuilder->hsimd_packl(16, s0, s1); 31 33 } 34 if (shift == 1) { 35 // iBuilder->CallPrintRegister("t0", t0); 36 // iBuilder->CallPrintRegister("t1", t1); 37 } 38 32 39 p0 = iBuilder->simd_if(1, hi_mask, t0, iBuilder->simd_srli(16, t1, shift)); 33 40 p1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, t0, shift), t1); … … 35 42 36 43 void s2p(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[], cc::BitNumbering basisNumbering) { 44 { 45 //input[0 - 3] 46 Value* bit3311[2]; 47 Value* bit2200[2]; 48 for (unsigned i = 0; i < 2; i++) { 49 s2p_step(iBuilder, input[2 * i], input[2 * i + 1], iBuilder->simd_himask(2), 1, bit3311[i], bit2200[i]); 50 } 51 52 Value* out[4]; 53 s2p_step(iBuilder, bit3311[0], bit3311[1], 54 iBuilder->simd_himask(4), 2, out[3], out[1]); 55 56 s2p_step(iBuilder, bit2200[0], bit2200[1], 57 iBuilder->simd_himask(4), 2, out[2], out[0]); 58 for (unsigned i = 0; i < 4; i++) { 59 // iBuilder->CallPrintRegister("input" + std::to_string(i), input[i]); 60 } 61 for (unsigned i = 0; i < 4; i++) { 62 // iBuilder->CallPrintRegister("out" + std::to_string(i), out[i]); 63 } 64 } 65 66 37 67 // Little-endian bit number is used for variables. 
38 68 Value * bit66442200[4]; 39 69 Value * bit77553311[4]; 70 // iBuilder->CallPrintRegister("himask2", iBuilder->simd_himask(2)); 71 // iBuilder->CallPrintRegister("himask4", iBuilder->simd_himask(4)); 72 // iBuilder->CallPrintRegister("himask8", iBuilder->simd_himask(8)); 40 73 41 74 for (unsigned i = 0; i < 4; i++) { 42 75 Value * s0 = input[2 * i]; 43 76 Value * s1 = input[2 * i + 1]; 77 // iBuilder->CallPrintRegister("s0_" + std::to_string(2 * i), s0); 78 // iBuilder->CallPrintRegister("s1_" + std::to_string(2 * i + 1), s1); 44 79 s2p_step(iBuilder, s0, s1, iBuilder->simd_himask(2), 1, bit77553311[i], bit66442200[i]); 80 // iBuilder->CallPrintRegister("bit77553311", bit77553311[i]); 81 // iBuilder->CallPrintRegister("bit66442200", bit66442200[i]); 45 82 } 46 83 Value * bit44440000[2]; … … 65 102 s2p_step(iBuilder, bit66662222[0], bit66662222[1], iBuilder->simd_himask(8), 4, output[1], output[5]); 66 103 s2p_step(iBuilder, bit77773333[0], bit77773333[1], iBuilder->simd_himask(8), 4, output[0], output[4]); 104 } 105 106 for (unsigned i = 0; i < 8; i++) { 107 // iBuilder->CallPrintRegister("input" + std::to_string(i), input[i]); 108 } 109 for (unsigned i = 0; i < 8; i++) { 110 // iBuilder->CallPrintRegister("output" + std::to_string(i), output[i]); 67 111 } 68 112 } … … 110 154 } 111 155 #endif 112 156 157 158 S2P4StreamByPEXTKernel::S2P4StreamByPEXTKernel(const std::unique_ptr<kernel::KernelBuilder> & b) 159 :BlockOrientedKernel("s2p4StreamByPEXT", 160 { 161 Binding{b->getStreamSetTy(1, 4), "byteStream", FixedRate(), Principal()} 162 }, 163 { 164 Binding{b->getStreamSetTy(4, 1), "basisBits"} 165 }, {}, {}, {}) { 166 167 } 168 169 void S2P4StreamByPEXTKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) { 170 Function* PEXT_func = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64); 171 uint64_t pextBaseMask = 0x1111111111111111; 172 173 Value* inputBasePtr = b->CreatePointerCast(b->getInputStreamBlockPtr("byteStream", 
b->getSize(0)), b->getInt64Ty()->getPointerTo()); 174 175 Value* outputBlocks[4]; 176 for (unsigned i = 0; i < 4; i++) { 177 outputBlocks[i] = ConstantVector::getNullValue(b->getBitBlockType()); 178 } 179 180 for (unsigned i = 0; i < b->getBitBlockWidth() / 64; i++) { 181 Value* currentOutput[4]; 182 for (unsigned iIndex = 0; iIndex < 4; iIndex++) { 183 currentOutput[iIndex] = b->getInt64(0); 184 } 185 186 for (unsigned j = 0; j < 4; j++) { 187 unsigned inputIndex = i * 4 + j; 188 189 Value* currentInput = b->CreateLoad(b->CreateGEP(inputBasePtr, b->getInt32(inputIndex))); 190 for (unsigned k = 0; k < 4; k++) { 191 192 Value* newBits = b->CreateCall( 193 PEXT_func,{ 194 currentInput, 195 b->getInt64(pextBaseMask << k) 196 } 197 ); 198 199 currentOutput[k] = b->CreateOr(currentOutput[k], b->CreateShl(newBits, 16 * j)); 200 } 201 } 202 203 for (unsigned iIndex = 0; iIndex < 4; iIndex++) { 204 outputBlocks[iIndex] = b->CreateInsertElement(outputBlocks[iIndex], currentOutput[iIndex], i); 205 } 206 } 207 208 for (unsigned i = 0; i < 4; i++) { 209 b->storeOutputStreamBlock("basisBits", b->getInt32(i), outputBlocks[i]); 210 // b->CallPrintRegister("outputBlocks" + std::to_string(i), outputBlocks[i]); 211 } 212 } 213 113 214 void S2PKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfBlocks) { 114 215 BasicBlock * entry = kb->GetInsertBlock(); … … 135 236 Value * basisbits[8]; 136 237 s2p(kb, bytepack, basisbits, mBasisSetNumbering); 137 for (unsigned i = 0; i < 8; ++i) {238 for (unsigned i = 0; i < mNumOfStreams; ++i) { 138 239 kb->storeOutputStreamBlock("basisBits", kb->getInt32(i), blockOffsetPhi, basisbits[i]); 139 240 } … … 145 246 } 146 247 147 S2PKernel::S2PKernel(const std::unique_ptr<KernelBuilder> & b, cc::BitNumbering numbering, bool aligned, std::string prefix )248 S2PKernel::S2PKernel(const std::unique_ptr<KernelBuilder> & b, cc::BitNumbering numbering, bool aligned, std::string prefix, unsigned numOfStreams) 148 249 : 
MultiBlockKernel(aligned ? prefix + "s2p" + cc::numberingSuffix(numbering): prefix + "s2p_unaligned" + cc::numberingSuffix(numbering), 149 250 {Binding{b->getStreamSetTy(1, 8), "byteStream", FixedRate(), Principal()}}, 150 {Binding{b->getStreamSetTy( 8, 1), "basisBits"}}, {}, {}, {}),251 {Binding{b->getStreamSetTy(numOfStreams, 1), "basisBits"}}, {}, {}, {}), 151 252 mBasisSetNumbering(numbering), 152 mAligned(aligned) { 253 mAligned(aligned), 254 mNumOfStreams(numOfStreams) 255 { 153 256 if (!aligned) { 154 257 mStreamSetInputs[0].addAttribute(Misaligned()); -
icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.h
r6112 r6132 16 16 namespace kernel { 17 17 18 class S2P4StreamByPEXTKernel final : public BlockOrientedKernel{ 19 public: 20 S2P4StreamByPEXTKernel(const std::unique_ptr<kernel::KernelBuilder> & b); 21 protected: 22 void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) override; 23 }; 24 18 25 class S2PKernel final : public MultiBlockKernel { 19 26 public: 20 S2PKernel(const std::unique_ptr<kernel::KernelBuilder> & b, cc::BitNumbering basisNumbering = cc::BitNumbering::LittleEndian, bool aligned = true, std::string prefix = "" );27 S2PKernel(const std::unique_ptr<kernel::KernelBuilder> & b, cc::BitNumbering basisNumbering = cc::BitNumbering::LittleEndian, bool aligned = true, std::string prefix = "", unsigned numOfStreams = 8); 21 28 bool isCachable() const override { return true; } 22 29 bool hasSignature() const override { return false; } … … 26 33 cc::BitNumbering mBasisSetNumbering; 27 34 bool mAligned; 35 unsigned mNumOfStreams; 28 36 }; 29 37 -
icGREP/icgrep-devel/icgrep/lz4/LZ4Generator.cpp
r6119 r6132 27 27 #include <kernels/lz4/aio/lz4_swizzled_aio.h> 28 28 #include <kernels/lz4/aio/lz4_bitstream_aio.h> 29 #include <kernels/lz4/aio/lz4_i4_bytestream_aio.h> 29 30 #include <kernels/bitstream_pdep_kernel.h> 30 31 #include <kernels/lz4/lz4_bitstream_not_kernel.h> … … 426 427 427 428 429 430 if (compressedBitStreams[0]->getNumOfStreams() == 4) { 431 StreamSetBuffer* twistedCharClasses = mGrepDriver->addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 4), this->getInputBufferBlocks(iBuilder)); 432 kernel::Kernel* twistK = mGrepDriver->addKernelInstance<kernel::P2S4StreamByPDEP>(iBuilder); 433 mGrepDriver->makeKernelCall(twistK, {compressedBitStreams[0]}, {twistedCharClasses}); 434 435 436 StreamSetBuffer* uncompressedTwistedCharClasses = mGrepDriver->addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 4), this->getInputBufferBlocks(iBuilder)); 437 Kernel* lz4I4AioK = mPxDriver.addKernelInstance<LZ4I4ByteStreamAioKernel>(iBuilder); 438 lz4I4AioK->setInitialArguments({mFileSize}); 439 mGrepDriver->makeKernelCall(lz4I4AioK, { 440 mCompressedByteStream, 441 442 // Block Data 443 BlockData_IsCompressed, 444 BlockData_BlockStart, 445 BlockData_BlockEnd, 446 447 twistedCharClasses 448 }, { 449 uncompressedTwistedCharClasses 450 }); 451 452 StreamSetBuffer* untwistedCharClasses = mGrepDriver->addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(iBuilder)); 453 kernel::Kernel* untwistK = mGrepDriver->addKernelInstance<kernel::S2P4StreamByPEXTKernel>(iBuilder); 454 mGrepDriver->makeKernelCall(untwistK, {uncompressedTwistedCharClasses}, {untwistedCharClasses}); 455 return {untwistedCharClasses}; 456 } 457 458 459 460 428 461 std::vector<StreamSetBuffer *> inputStreams = { 429 462 mCompressedByteStream, … … 554 587 555 588 StreamSetBuffer * LZ4Generator::generateAIODecompression(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) { 556 //// Decode Block Information 557 StreamSetBuffer * const 
BlockData_IsCompressed = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getInputBufferBlocks(iBuilder), 1); 558 StreamSetBuffer * const BlockData_BlockStart = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks(iBuilder), 1); 559 StreamSetBuffer * const BlockData_BlockEnd = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks(iBuilder), 1); 560 561 562 //// Generate Helper Markers Extenders 563 // StreamSetBuffer * const Extenders = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder), 1); 564 // mMatchOffsetMarker = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder)); 565 // Kernel * extenderK = mPxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "extenders", std::vector<re::CC *>{re::makeCC(0xFF)}, 8); 566 // mPxDriver.makeKernelCall(extenderK, {mCompressedBasisBits}, {Extenders}); 567 568 569 Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder); 570 blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(mHasBlockChecksum, iBuilder->getInt1Ty()), mHeaderSize, mFileSize}); 571 mPxDriver.makeKernelCall(blockDecoderK, {mCompressedByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd}); 572 589 LZ4BlockInfo blockInfo = this->getBlockInfo(iBuilder); 573 590 574 591 StreamSetBuffer * const decompressionByteStream = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks(iBuilder), 1); 575 576 592 Kernel* lz4AioK = mPxDriver.addKernelInstance<LZ4ByteStreamAioKernel>(iBuilder); 577 593 lz4AioK->setInitialArguments({mFileSize}); … … 580 596 { 581 597 mCompressedByteStream, 582 // Extenders,583 598 584 599 // Block Data 585 BlockData_IsCompressed,586 BlockData_BlockStart,587 
BlockData_BlockEnd600 blockInfo.isCompress, 601 blockInfo.blockStart, 602 blockInfo.blockEnd 588 603 }, { 589 604 decompressionByteStream … … 694 709 } 695 710 696 711 LZ4BlockInfo LZ4Generator::getBlockInfo(const std::unique_ptr<kernel::KernelBuilder> & b) { 712 LZ4BlockInfo blockInfo; 713 blockInfo.isCompress = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(1, 8), this->getInputBufferBlocks(b), 1); 714 blockInfo.blockStart = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(1, 64), this->getInputBufferBlocks(b), 1); 715 blockInfo.blockEnd = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(1, 64), this->getInputBufferBlocks(b), 1); 716 717 Kernel * blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderKernel>(b); 718 blockDecoderK->setInitialArguments({b->CreateTrunc(mHasBlockChecksum, b->getInt1Ty()), mHeaderSize, mFileSize}); 719 mPxDriver.makeKernelCall(blockDecoderK, {mCompressedByteStream}, {blockInfo.isCompress, blockInfo.blockStart, blockInfo.blockEnd}); 720 721 return blockInfo; 722 } 697 723 698 724 -
icGREP/icgrep-devel/icgrep/lz4/LZ4Generator.h
r6119 r6132 18 18 19 19 typedef void (*MainFunctionType)(char * byte_data, size_t headerSize, size_t filesize, bool hasBlockChecksum); 20 21 struct LZ4BlockInfo { 22 parabix::StreamSetBuffer* blockStart; 23 parabix::StreamSetBuffer* blockEnd; 24 parabix::StreamSetBuffer* isCompress; 25 }; 20 26 21 27 class LZ4Generator { … … 83 89 84 90 unsigned mLz4BlockSize; 91 92 LZ4BlockInfo getBlockInfo(const std::unique_ptr<kernel::KernelBuilder> & b); 85 93 }; 86 94 -
icGREP/icgrep-devel/icgrep/lz4/LZ4GrepGenerator.cpp
r6124 r6132 22 22 #include <kernels/lz4/lz4_bitstream_match_copy_kernel.h> 23 23 #include <kernels/lz4/lz4_bitstream_not_kernel.h> 24 #include <kernels/lz4/aio/lz4_i4_bytestream_aio.h> 24 25 #include <kernels/fake_stream_generating_kernel.h> 25 26 #include <kernels/bitstream_pdep_kernel.h> … … 29 30 #include <re/collect_ccs.h> 30 31 #include <re/replaceCC.h> 32 33 #include <re/casing.h> 34 #include <re/exclude_CC.h> 35 #include <re/to_utf8.h> 36 #include <re/re_analysis.h> 37 #include <re/re_name_resolve.h> 38 #include <re/re_name_gather.h> 39 #include <re/re_multiplex.h> 40 #include <re/re_utility.h> 31 41 32 42 #include <UCD/resolve_properties.h> … … 54 64 #include <kernels/lz4/aio/lz4_swizzled_aio.h> 55 65 #include <kernels/lz4/aio/lz4_bitstream_aio.h> 56 66 #include <re/re_seq.h> 67 #include <kernels/lz4/aio/lz4_bytestream_aio.h> 57 68 58 69 namespace re { class CC; } … … 108 119 } 109 120 110 121 parabix::StreamSetBuffer * LZ4GrepGenerator::convertCompressedBitsStreamWithByteStreamAioApproach( 122 parabix::StreamSetBuffer *compressedBitStream, int numberOfStream, std::string prefix) { 123 auto mGrepDriver = &mPxDriver; 124 auto & b = mGrepDriver->getBuilder(); 125 126 LZ4BlockInfo blockInfo = this->getBlockInfo(b); 127 128 StreamSetBuffer * const mtxByteStream = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks(b)); 129 Kernel * p2sK = mPxDriver.addKernelInstance<P2SKernel>(b, cc::BitNumbering::BigEndian, prefix, numberOfStream); 130 mPxDriver.makeKernelCall(p2sK, {compressedBitStream}, {mtxByteStream}); 131 132 StreamSetBuffer * const decompressionMtxByteStream = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks(b), 1); 133 Kernel* lz4AioK = mPxDriver.addKernelInstance<LZ4ByteStreamAioKernel>(b, true); 134 lz4AioK->setInitialArguments({mFileSize}); 135 mPxDriver.makeKernelCall( 136 lz4AioK, 137 { 138 mCompressedByteStream, 139 // Block Data 140 blockInfo.isCompress, 
141 blockInfo.blockStart, 142 blockInfo.blockEnd, 143 mtxByteStream 144 }, { 145 decompressionMtxByteStream 146 }); 147 148 StreamSetBuffer * const decompressedMtxBitStream = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(8), this->getDecompressedBufferBlocks(b)); 149 150 Kernel * s2pk = mPxDriver.addKernelInstance<S2PKernel>(b, cc::BitNumbering::BigEndian, true, prefix, numberOfStream); 151 mPxDriver.makeKernelCall(s2pk, {decompressionMtxByteStream}, {decompressedMtxBitStream}); 152 153 return decompressedMtxBitStream; 154 } 111 155 112 156 StreamSetBuffer * LZ4GrepGenerator::convertCompressedBitsStreamWithSwizzledAioApproach( 113 157 parabix::StreamSetBuffer *compressedBitStream, int numberOfStream, std::string prefix) { 114 158 auto mGrepDriver = &mPxDriver; 115 auto & iBuilder = mGrepDriver->getBuilder(); 116 117 //// Decode Block Information 118 StreamSetBuffer * const BlockData_IsCompressed = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getInputBufferBlocks(iBuilder), 1); 119 StreamSetBuffer * const BlockData_BlockStart = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks(iBuilder), 1); 120 StreamSetBuffer * const BlockData_BlockEnd = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks(iBuilder), 1); 121 122 //// Generate Helper Markers Extenders, FX, XF 123 // StreamSetBuffer * const Extenders = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder), 1); 124 // mMatchOffsetMarker = mPxDriver.addBuffer<StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks(iBuilder)); 125 // Kernel * extenderK = mPxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "extenders", std::vector<re::CC *>{re::makeCC(0xFF)}, 8); 126 // mPxDriver.makeKernelCall(extenderK, {mCompressedBasisBits}, {Extenders}); 127 128 129 Kernel * 
blockDecoderK = mPxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder); 130 blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(mHasBlockChecksum, iBuilder->getInt1Ty()), mHeaderSize, mFileSize}); 131 mPxDriver.makeKernelCall(blockDecoderK, {mCompressedByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd}); 132 159 auto & b = mGrepDriver->getBuilder(); 160 161 LZ4BlockInfo blockInfo = this->getBlockInfo(b); 133 162 134 163 // Produce unswizzled bit streams 135 StreamSetBuffer * u16Swizzle0 = mPxDriver.addBuffer<StaticBuffer>( iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(iBuilder), 1);136 Kernel * unSwizzleK = mPxDriver.addKernelInstance<SwizzleGenerator>( iBuilder, 4, 1, 1, 64, "source");164 StreamSetBuffer * u16Swizzle0 = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(4), this->getInputBufferBlocks(b), 1); 165 Kernel * unSwizzleK = mPxDriver.addKernelInstance<SwizzleGenerator>(b, 4, 1, 1, 64, "source"); 137 166 mPxDriver.makeKernelCall(unSwizzleK, {compressedBitStream}, {u16Swizzle0}); 138 167 139 StreamSetBuffer * decompressedSwizzled0 = mPxDriver.addBuffer<StaticBuffer>( iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(iBuilder), 1);140 141 142 Kernel* lz4AioK = mPxDriver.addKernelInstance<LZ4SwizzledAioKernel>( iBuilder, 4, 1, 4);168 StreamSetBuffer * decompressedSwizzled0 = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(4), this->getInputBufferBlocks(b), 1); 169 170 171 Kernel* lz4AioK = mPxDriver.addKernelInstance<LZ4SwizzledAioKernel>(b, 4, 1, 4); 143 172 lz4AioK->setInitialArguments({mFileSize}); 144 173 mPxDriver.makeKernelCall( … … 149 178 150 179 // Block Data 151 BlockData_IsCompressed,152 BlockData_BlockStart,153 BlockData_BlockEnd,180 blockInfo.isCompress, 181 blockInfo.blockStart, 182 blockInfo.blockEnd, 154 183 155 184 u16Swizzle0, … … 160 189 161 190 162 StreamSetBuffer * const decompressionBitStream = mPxDriver.addBuffer<StaticBuffer>( iBuilder, 
iBuilder->getStreamSetTy(8, 1), this->getDecompressedBufferBlocks(iBuilder));163 Kernel * unSwizzleK2 = mPxDriver.addKernelInstance<SwizzleGenerator>( iBuilder, 4, 1, 1, 64, "dst");191 StreamSetBuffer * const decompressionBitStream = mPxDriver.addBuffer<StaticBuffer>(b, b->getStreamSetTy(8, 1), this->getDecompressedBufferBlocks(b)); 192 Kernel * unSwizzleK2 = mPxDriver.addKernelInstance<SwizzleGenerator>(b, 4, 1, 1, 64, "dst"); 164 193 mPxDriver.makeKernelCall(unSwizzleK2, {decompressedSwizzled0}, {decompressionBitStream}); 165 194 … … 317 346 318 347 }; 319 std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *> LZ4GrepGenerator::multiplexingGrepPipeline(std::vector<re::RE *> &REs, bool useAio, bool useSwizzled ) {348 std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *> LZ4GrepGenerator::multiplexingGrepPipeline(std::vector<re::RE *> &REs, bool useAio, bool useSwizzled, bool useByteStream) { 320 349 321 350 this->initREs(REs); … … 338 367 std::set<re::Name *> UnicodeProperties; 339 368 340 const auto UnicodeSets = re::collectCCs(mREs[0], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")})); 369 re::CC* linefeedCC = re::makeCC(0x0A); 370 371 re::Seq* seq = re::makeSeq(); 372 seq->push_back(mREs[0]); 373 seq->push_back(std::move(linefeedCC)); 374 375 376 const auto UnicodeSets = re::collectCCs(seq, &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")})); 341 377 StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 342 378 … … 350 386 mGrepDriver->makeKernelCall(ccK, {mCompressedBasisBits}, {CharClasses}); 351 387 352 StreamSetBuffer * CompressedLineFeedStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);353 kernel::Kernel * linefeedK = mPxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()}, cc::BitNumbering::BigEndian);354 
mPxDriver.makeKernelCall(linefeedK, {mCompressedBasisBits}, {CompressedLineFeedStream});355 356 357 StreamSetBuffer * LineBreakStream = nullptr;358 388 StreamSetBuffer * decompressedCharClasses = nullptr; 359 389 if (useSwizzled) { 360 StreamSetBuffer * combinedStream = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses + 1), baseBufferSize);361 kernel::Kernel* streamCombineKernel = mPxDriver.addKernelInstance<StreamsCombineKernel>(idb, std::vector<unsigned>({1, (unsigned)numOfCharacterClasses}));362 mPxDriver.makeKernelCall(streamCombineKernel, {CompressedLineFeedStream, CharClasses}, {combinedStream});363 StreamSetBuffer * decompressedCombinedStream = nullptr;364 365 390 if (useAio) { 366 decompressedC ombinedStream = this->convertCompressedBitsStreamWithSwizzledAioApproach(combinedStream, 1 +numOfCharacterClasses, "combined");391 decompressedCharClasses = this->convertCompressedBitsStreamWithSwizzledAioApproach(CharClasses, numOfCharacterClasses, "combined"); 367 392 } else { 368 decompressedC ombinedStream = this->convertCompressedBitsStream(combinedStream, 1 +numOfCharacterClasses, "combined");393 decompressedCharClasses = this->convertCompressedBitsStream(CharClasses, numOfCharacterClasses, "combined"); 369 394 } 370 371 LineBreakStream = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1), baseBufferSize); 372 decompressedCharClasses = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize); 373 kernel::Kernel* streamSplitKernel = mPxDriver.addKernelInstance<StreamsSplitKernel>(idb, std::vector<unsigned>({1, (unsigned)numOfCharacterClasses})); 374 mPxDriver.makeKernelCall(streamSplitKernel, {decompressedCombinedStream}, {LineBreakStream, decompressedCharClasses}); 395 } else if (useByteStream){ 396 decompressedCharClasses = this->convertCompressedBitsStreamWithByteStreamAioApproach(CharClasses, numOfCharacterClasses, "combined"); 375 397 } else { 376 auto ret = 
this->convertCompressedBitsStreamWithBitStreamAioApproach({CharClasses , CompressedLineFeedStream}, "combined");398 auto ret = this->convertCompressedBitsStreamWithBitStreamAioApproach({CharClasses}, "combined"); 377 399 decompressedCharClasses = ret[0]; 378 LineBreakStream = ret[1];379 400 } 380 401 … … 383 404 mPxDriver.makeKernelCall(fakeStreamGeneratorK, {decompressedCharClasses}, {fakeMatchCopiedBits}); 384 405 385 kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[0], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()}, cc::BitNumbering::BigEndian); 406 StreamSetBuffer * LineBreakStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), this->getInputBufferBlocks(idb)); 407 kernel::Kernel * lineFeedGrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, transformCCs(mpx.get(), linefeedCC), externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()}, cc::BitNumbering::BigEndian, true); 408 mGrepDriver->makeKernelCall(lineFeedGrepK, {fakeMatchCopiedBits, decompressedCharClasses}, {LineBreakStream}); 409 410 411 kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[0], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()}, cc::BitNumbering::BigEndian, true); 386 412 mGrepDriver->makeKernelCall(icgrepK, {fakeMatchCopiedBits, decompressedCharClasses}, {MatchResults}); 387 413 MatchResultsBufs[0] = MatchResults; … … 437 463 std::vector<StreamSetBuffer *> icgrepInputSets = {decompressedBasisBits}; 438 464 439 std::set<re::Name *> UnicodeProperties; 440 441 StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 442 kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>(), cc::BitNumbering::BigEndian); 443 mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults}); 444 MatchResultsBufs[i] = 
MatchResults; 465 if (mEnableMultiplexing) { 466 const auto UnicodeSets = re::collectCCs(mREs[i], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")})); 467 StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 468 469 mpx = make_unique<cc::MultiplexedAlphabet>("mpx", UnicodeSets); 470 mREs[i] = transformCCs(mpx.get(), mREs[i]); 471 std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs(); 472 auto numOfCharacterClasses = mpx_basis.size(); 473 StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize); 474 kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), false, cc::BitNumbering::BigEndian); 475 mGrepDriver->makeKernelCall(ccK, {decompressedBasisBits}, {CharClasses}); 476 477 kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()}, cc::BitNumbering::BigEndian, true); 478 icgrepInputSets.push_back(CharClasses); 479 mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults}); 480 MatchResultsBufs[i] = MatchResults; 481 } else { 482 std::set<re::Name *> UnicodeProperties; 483 484 StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 485 kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>(), cc::BitNumbering::BigEndian); 486 mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults}); 487 MatchResultsBufs[i] = MatchResults; 488 } 445 489 } 446 490 … … 559 603 mPxDriver.finalizeObject(); 560 604 } 605 606 void LZ4GrepGenerator::generateByteStreamMultiplexingAioPipeline(re::RE* regex) { 607 auto & iBuilder = mPxDriver.getBuilder(); 608 this->generateCountOnlyMainFunc(iBuilder); 609 610 
this->generateLoadByteStreamAndBitStream(iBuilder); 611 StreamSetBuffer * LineBreakStream; 612 StreamSetBuffer * Matches; 613 std::vector<re::RE*> res = {regex}; 614 std::tie(LineBreakStream, Matches) = multiplexingGrepPipeline(res, true, false, true); 615 616 kernel::Kernel * matchCountK = mPxDriver.addKernelInstance<kernel::PopcountKernel>(iBuilder); 617 mPxDriver.makeKernelCall(matchCountK, {Matches}, {}); 618 mPxDriver.generatePipelineIR(); 619 620 iBuilder->setKernel(matchCountK); 621 Value * matchedLineCount = iBuilder->getAccumulator("countResult"); 622 matchedLineCount = iBuilder->CreateZExt(matchedLineCount, iBuilder->getInt64Ty()); 623 624 mPxDriver.deallocateBuffers(); 625 626 iBuilder->CreateRet(matchedLineCount); 627 628 mPxDriver.finalizeObject(); 629 } 630 561 631 562 632 void LZ4GrepGenerator::generateMultiplexingBitStreamAioPipeline(re::RE* regex) { … … 696 766 mPxDriver.finalizeObject(); 697 767 } 768 769 698 770 699 771 void LZ4GrepGenerator::generateAioPipeline(re::RE *regex) { -
icGREP/icgrep-devel/icgrep/lz4/LZ4GrepGenerator.h
r6119 r6132 28 28 29 29 void generateMultiplexingCompressedBitStream(std::vector<re::RE *> &REs); 30 std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *> multiplexingGrepPipeline(std::vector<re::RE *> &REs, bool useAio = false, bool useSwizzled = true );30 std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *> multiplexingGrepPipeline(std::vector<re::RE *> &REs, bool useAio = false, bool useSwizzled = true, bool useByteStream = false); 31 31 32 32 … … 38 38 void generateBitStreamAioPipeline(re::RE* regex); 39 39 40 void generateByteStreamMultiplexingAioPipeline(re::RE* regex); 40 41 void generateAioPipeline(re::RE* regex); 41 42 void generateParallelAioPipeline(re::RE* regex, bool enableGather, bool enableScatter, int minParallelLevel); … … 76 77 parabix::StreamSetBuffer * convertCompressedBitsStreamWithSwizzledAioApproach( 77 78 parabix::StreamSetBuffer *compressedBitStream, int numberOfStream, std::string prefix); 79 parabix::StreamSetBuffer * convertCompressedBitsStreamWithByteStreamAioApproach( 80 parabix::StreamSetBuffer *compressedBitStream, int numberOfStream, std::string prefix); 78 81 79 82 -
icGREP/icgrep-devel/icgrep/lz4_grep.cpp
r6119 r6132 95 95 g.generateBitStreamAioPipeline(re_ast); 96 96 } 97 98 97 } else { 99 g.generateAioPipeline(re_ast); 98 if (enableMultiplexing) { 99 g.generateByteStreamMultiplexingAioPipeline(re_ast); 100 } else { 101 g.generateAioPipeline(re_ast); 102 } 100 103 } 101 104
Note: See TracChangeset
for help on using the changeset viewer.