source: icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_swizzled_match_copy_kernel.cpp @ 5974

Last change on this file since 5974 was 5974, checked in by xwa163, 17 months ago
  1. Use i1 bit stream instead of i64 number stream in M0 related streams and Match Offset related stream
  2. Improve the performance of lz4_index_builder
File size: 15.4 KB
Line 
1//
2// Created by wxy325 on 2018/3/9.
3//
4
5#include "lz4_swizzled_match_copy_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <kernels/streamset.h>
8#include <toolchain/toolchain.h>
9#include <vector>
10#include <llvm/Support/raw_ostream.h>
11
12using namespace llvm;
13using namespace std;
14namespace kernel {
15
16Value *LZ4SwizzledMatchCopyKernel::advanceUntilNextBit(const std::unique_ptr<KernelBuilder> &iBuilder, string inputName, Value *startPos, bool isNextOne) {
17    BasicBlock* entryBlock = iBuilder->GetInsertBlock();
18
19    Constant* SIZE_0 = iBuilder->getSize(0);
20    Constant* SIZE_1 = iBuilder->getSize(1);
21    Value* SIZE_64 = iBuilder->getSize(64); // maybe need to handle 32 bit machine
22    Value* SIZE_INPUT_64_COUNT = iBuilder->getSize(this->getInputStreamSetBuffer(inputName)->getBufferBlocks() * iBuilder->getBitBlockWidth() / 64);
23
24    Value* initCurrentPos = startPos;
25
26    Value* offsetMarkerRawPtr = iBuilder->CreatePointerCast(iBuilder->getRawInputPointer(inputName, SIZE_0), iBuilder->getInt64Ty()->getPointerTo());
27
28    BasicBlock* findNextMatchOffsetConBlock = iBuilder->CreateBasicBlock("findNextMatchOffsetConBlock");
29    BasicBlock* findNextMatchOffsetBodyBlock = iBuilder->CreateBasicBlock("findNextMatchOffsetBodyBlock");
30
31    iBuilder->CreateBr(findNextMatchOffsetConBlock);
32    iBuilder->SetInsertPoint(findNextMatchOffsetConBlock);
33    // Find position marker bit of next 1 bit
34
35    PHINode* phiCurrentPos = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
36    phiCurrentPos->addIncoming(initCurrentPos, entryBlock);
37
38    Value* currentPosGlobalBlockIndex = iBuilder->CreateUDiv(phiCurrentPos, SIZE_64);
39    Value* currentPosLocalBlockIndex = iBuilder->CreateURem(currentPosGlobalBlockIndex, SIZE_INPUT_64_COUNT);
40    Value* currentPosBlockOffset = iBuilder->CreateURem(phiCurrentPos, SIZE_64);
41    Value* currentValue = iBuilder->CreateLoad(iBuilder->CreateGEP(offsetMarkerRawPtr, currentPosLocalBlockIndex));
42
43    Value* countValue = iBuilder->CreateLShr(currentValue, currentPosBlockOffset);
44    if (!isNextOne) {
45        countValue = iBuilder->CreateNot(countValue);
46    }
47    Value* forwardZero = iBuilder->CreateCountForwardZeroes(countValue);
48    Value* realForwardZero = iBuilder->CreateAdd(currentPosBlockOffset, forwardZero);
49
50    // If targetMarker == 0, move to next block, otherwise count forward zero
51    phiCurrentPos->addIncoming(iBuilder->CreateMul(SIZE_64, iBuilder->CreateAdd(currentPosGlobalBlockIndex, SIZE_1)), iBuilder->GetInsertBlock());
52    iBuilder->CreateCondBr(iBuilder->CreateICmpUGE(realForwardZero, SIZE_64), findNextMatchOffsetConBlock, findNextMatchOffsetBodyBlock);
53
54    iBuilder->SetInsertPoint(findNextMatchOffsetBodyBlock);
55
56    Value* newPosition = iBuilder->CreateAdd(iBuilder->CreateMul(currentPosGlobalBlockIndex, SIZE_64), realForwardZero);
57
58    return newPosition;
59}
60
61Value* LZ4SwizzledMatchCopyKernel::loadNextMatchOffset(const unique_ptr<KernelBuilder> &iBuilder) {
62    Value* initCurrentPos = iBuilder->CreateAdd(iBuilder->getScalarField("currentOffsetMarkerPos"), iBuilder->getSize(1));
63    Value* newPosition = this->advanceUntilNextBit(iBuilder, "MatchOffsetMarker", initCurrentPos, true);
64
65    // Load Match Offset from newPosition
66    iBuilder->setScalarField("currentOffsetMarkerPos", newPosition);
67    iBuilder->setProcessedItemCount("MatchOffsetMarker", newPosition);
68
69    Value* matchOffsetPtr = iBuilder->getRawInputPointer("byteStream", newPosition);
70    // For now, it is safe to cast matchOffset pointer into i16 since the input byte stream is always linear available
71    matchOffsetPtr = iBuilder->CreatePointerCast(matchOffsetPtr, iBuilder->getInt16Ty()->getPointerTo());
72    Value* matchOffset = iBuilder->CreateZExt(iBuilder->CreateLoad(matchOffsetPtr), iBuilder->getSizeTy());
73
74    return matchOffset;
75}
76
77pair<Value*, Value*> LZ4SwizzledMatchCopyKernel::loadNextM0StartEnd(const unique_ptr<KernelBuilder> &iBuilder) {
78    Value* initCurrentPos = iBuilder->getScalarField("currentM0MarkerPos");
79    Value* m0Start = this->advanceUntilNextBit(iBuilder, "M0Marker", initCurrentPos, true);
80    Value* m0End = this->advanceUntilNextBit(iBuilder, "M0Marker", m0Start, false);
81    iBuilder->setScalarField("currentM0MarkerPos", m0End);
82    return std::make_pair(m0Start, m0End);
83};
84
85
86
87
88void LZ4SwizzledMatchCopyKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
89    ConstantInt * const SIZE_ZERO = iBuilder->getSize(0);
90    ConstantInt * const SIZE_ONE = iBuilder->getSize(1);
91    ConstantInt * const SIZE_PDEP_WIDTH = iBuilder->getSize(mPDEPWidth);
92    ConstantInt * const SIZE_4_MEGS = iBuilder->getSize(4 * 1024 * 1024);
93    ConstantInt * const SIZE_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
94
95    BasicBlock * const entryBlock = iBuilder->GetInsertBlock();
96
97    Value * const available = iBuilder->getAvailableItemCount("sourceStreamSet0");
98    Value * const processed = iBuilder->getProcessedItemCount("sourceStreamSet0");
99
100    Value * const itemsToDo = iBuilder->CreateUMin(iBuilder->CreateSub(available, processed), SIZE_4_MEGS);
101    iBuilder->setTerminationSignal(iBuilder->CreateICmpULT(itemsToDo, SIZE_4_MEGS));
102
103    Value * previousProducedItemCount = iBuilder->getProducedItemCount("outputStreamSet0");
104
105    // Output Copy
106    generateOutputCopy(iBuilder);
107
108    Value * const toProcessItemCount = iBuilder->CreateAdd(processed, itemsToDo);
109
110    // Match Copy
111    Value *initM0StartProcessIndex = iBuilder->getProcessedItemCount("M0CountMarker");
112    Value *totalM0StartItemsCount = iBuilder->getAvailableItemCount("M0CountMarker");
113
114    Value * const initMatchOffset = iBuilder->getScalarField("pendingMatchOffset");
115    Value * const initMatchLength = iBuilder->getScalarField("pendingMatchLength");
116    Value * const initMatchPos = iBuilder->getScalarField("pendingMatchPos");
117
118    BasicBlock * const matchCopyLoopCon = iBuilder->CreateBasicBlock("matchCopyLoopCon");
119    iBuilder->CreateBr(matchCopyLoopCon);
120
121    iBuilder->SetInsertPoint(matchCopyLoopCon);
122    PHINode * const phiProcessIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
123    phiProcessIndex->addIncoming(initM0StartProcessIndex, entryBlock);
124    PHINode * const phiMatchOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
125    phiMatchOffset->addIncoming(initMatchOffset, entryBlock);
126    PHINode * const phiMatchLength = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
127    phiMatchLength->addIncoming(initMatchLength, entryBlock);
128    PHINode * const phiMatchPos = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
129    phiMatchPos->addIncoming(initMatchPos, entryBlock);
130
131    BasicBlock * const loadNextMatchInfoConBlock = iBuilder->CreateBasicBlock("loadNewMatchInfoConBlock");
132    BasicBlock * const loadNextMatchInfoBodyBlock = iBuilder->CreateBasicBlock("loadNewMatchInfoBodyBlock");
133
134    BasicBlock * const matchCopyConBlock = iBuilder->CreateBasicBlock("matchCopyConBlock");
135    BasicBlock * const matchCopyBodyBlock = iBuilder->CreateBasicBlock("matchCopyBodyBlock");
136
137    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(phiMatchLength, SIZE_ZERO), loadNextMatchInfoConBlock, matchCopyConBlock);
138
139    iBuilder->SetInsertPoint(loadNextMatchInfoConBlock);
140    Value * const hasMoreMatchInfo = iBuilder->CreateICmpULT(phiProcessIndex, totalM0StartItemsCount);
141    BasicBlock * const processExitBlock = iBuilder->CreateBasicBlock("exit_block");
142    iBuilder->CreateCondBr(hasMoreMatchInfo, loadNextMatchInfoBodyBlock, processExitBlock);
143
144    iBuilder->SetInsertPoint(loadNextMatchInfoBodyBlock);
145
146    auto ret = this->loadNextM0StartEnd(iBuilder);
147    Value *newM0Start = ret.first;
148    Value *newM0End = ret.second;
149    iBuilder->setProcessedItemCount("M0Marker", newM0End);
150    Value *newMatchOffset = this->loadNextMatchOffset(iBuilder);
151
152
153
154    Value * const newMatchLength = iBuilder->CreateAdd(iBuilder->CreateSub(newM0End, newM0Start), iBuilder->getInt64(1));
155
156    phiProcessIndex->addIncoming(iBuilder->CreateAdd(phiProcessIndex, SIZE_ONE), iBuilder->GetInsertBlock());
157
158    phiMatchPos->addIncoming(newM0Start, iBuilder->GetInsertBlock());
159    phiMatchOffset->addIncoming(newMatchOffset, iBuilder->GetInsertBlock());
160    phiMatchLength->addIncoming(newMatchLength, iBuilder->GetInsertBlock());
161
162    iBuilder->CreateBr(matchCopyLoopCon);
163
164    iBuilder->SetInsertPoint(matchCopyConBlock);
165
166    Value * const hasNotReachEnd = iBuilder->CreateICmpULT(phiMatchPos, toProcessItemCount);
167    iBuilder->CreateCondBr(hasNotReachEnd, matchCopyBodyBlock, processExitBlock);
168
169    iBuilder->SetInsertPoint(matchCopyBodyBlock);
170
171    Value * const matchCopyTargetPos = iBuilder->CreateSub(phiMatchPos, previousProducedItemCount);
172    Value * const matchCopyTargetBlockIndex = iBuilder->CreateUDiv(matchCopyTargetPos, SIZE_BLOCK_WIDTH);
173    Value * const matchCopyTargetStreamIndex = iBuilder->CreateUDiv(iBuilder->CreateURem(matchCopyTargetPos, SIZE_BLOCK_WIDTH), SIZE_PDEP_WIDTH); // should SIZE_PDEP_WIDTH be SIZE_STREAM_COUNT?
174    Value * const matchCopyTargetBlockOffset = iBuilder->CreateURem(phiMatchPos, SIZE_PDEP_WIDTH);
175
176    Value * const matchCopyFromPos = iBuilder->CreateSub(matchCopyTargetPos, phiMatchOffset);
177    Value * const matchCopyFromBlockIndex = iBuilder->CreateUDiv(matchCopyFromPos, SIZE_BLOCK_WIDTH);
178    Value * const matchCopyFromStreamIndex = iBuilder->CreateUDiv(iBuilder->CreateURem(matchCopyFromPos, SIZE_BLOCK_WIDTH), SIZE_PDEP_WIDTH);
179    Value * const matchCopyFromBlockOffset = iBuilder->CreateURem(matchCopyFromPos, SIZE_PDEP_WIDTH);
180
181    Value * currentCopySize = iBuilder->CreateSub(SIZE_PDEP_WIDTH, iBuilder->CreateUMax(matchCopyFromBlockOffset, matchCopyTargetBlockOffset));
182    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchOffset);
183    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchLength);
184    currentCopySize = iBuilder->CreateUMin(currentCopySize, iBuilder->CreateSub(toProcessItemCount, phiMatchPos));
185    currentCopySize = iBuilder->CreateSelect(iBuilder->CreateICmpEQ(currentCopySize, SIZE_ZERO), SIZE_ONE, currentCopySize); //Workaround for the last byte
186
187    Value * const shiftOffset = iBuilder->CreateAdd(matchCopyFromBlockOffset, currentCopySize);
188    Value * highOffset = iBuilder->CreateShl(SIZE_ONE, shiftOffset);
189    highOffset = iBuilder->CreateSelect(iBuilder->CreateICmpEQ(currentCopySize, SIZE_PDEP_WIDTH), SIZE_ZERO, highOffset); // When currentCopySize == SIZE_PDEP_WIDTH, shl will overflow
190    Value * const lowOffset = iBuilder->CreateShl(SIZE_ONE, matchCopyFromBlockOffset);
191    Value * const maskVector = iBuilder->simd_fill(mPDEPWidth, iBuilder->CreateSub(highOffset, lowOffset));
192    Value * const fromBlockOffsetVector = iBuilder->simd_fill(mPDEPWidth, matchCopyFromBlockOffset);
193    Value * const targetBlockOffsetVector = iBuilder->simd_fill(mPDEPWidth, matchCopyTargetBlockOffset);
194
195    for (unsigned i = 0; i < mStreamSize; i++) {
196        Value * const matchCopyFromBlockPtr = iBuilder->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), matchCopyFromStreamIndex, matchCopyFromBlockIndex);
197        Value * const fromBlockValue = iBuilder->CreateBlockAlignedLoad(matchCopyFromBlockPtr);
198
199        Value * const outputTargetBlockPtr = iBuilder->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), matchCopyTargetStreamIndex, matchCopyTargetBlockIndex);
200        Value * const targetOriginalValue = iBuilder->CreateBlockAlignedLoad(outputTargetBlockPtr);
201
202        Value * copiedValue = iBuilder->simd_and(fromBlockValue, maskVector);
203        copiedValue = iBuilder->CreateLShr(copiedValue, fromBlockOffsetVector);
204        copiedValue = iBuilder->CreateShl(copiedValue, targetBlockOffsetVector);
205        Value * const finalValue = iBuilder->CreateOr(targetOriginalValue, copiedValue);
206
207        iBuilder->CreateStore(finalValue, outputTargetBlockPtr);
208    }
209
210    phiProcessIndex->addIncoming(phiProcessIndex, matchCopyBodyBlock);
211    phiMatchOffset->addIncoming(phiMatchOffset, matchCopyBodyBlock);
212    phiMatchPos->addIncoming(iBuilder->CreateAdd(phiMatchPos, currentCopySize), matchCopyBodyBlock);
213    phiMatchLength->addIncoming(iBuilder->CreateSub(phiMatchLength, currentCopySize), matchCopyBodyBlock);
214
215    iBuilder->CreateBr(matchCopyLoopCon);
216
217    iBuilder->SetInsertPoint(processExitBlock);
218    iBuilder->setScalarField("pendingMatchOffset", phiMatchOffset);
219    iBuilder->setScalarField("pendingMatchLength", phiMatchLength);
220    iBuilder->setScalarField("pendingMatchPos", phiMatchPos);
221    iBuilder->setProcessedItemCount("M0CountMarker", phiProcessIndex);
222    iBuilder->setProcessedItemCount("sourceStreamSet0", toProcessItemCount);
223}
224
225void LZ4SwizzledMatchCopyKernel::generateOutputCopy(const std::unique_ptr<KernelBuilder> & iBuilder) {
226    Constant * SIZE_ZERO = iBuilder->getSize(0);
227    Constant * COPY_BYTES = iBuilder->getSize(4 * 1024 * 1024 * mStreamCount / 8);
228    for (unsigned i = 0; i < mStreamSize; i++) {
229        Value * inputBasePtr = iBuilder->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(i), SIZE_ZERO);
230        Value * outputBasePtr = iBuilder->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), SIZE_ZERO);
231        iBuilder->CreateMemCpy(outputBasePtr, inputBasePtr, COPY_BYTES, 1); // Not align guaranteed in final block
232    }
233}
234
235Value* LZ4SwizzledMatchCopyKernel::loadOffset(const std::unique_ptr<KernelBuilder> & iBuilder, const std::string & bufferName, Value* offset) {
236    return iBuilder->CreateLoad(iBuilder->getRawInputPointer(bufferName, offset));
237}
238
239LZ4SwizzledMatchCopyKernel::LZ4SwizzledMatchCopyKernel(const std::unique_ptr<kernel::KernelBuilder> &iBuilder, unsigned streamCount/*=4*/, unsigned streamSize/*=2*/, unsigned swizzleFactor/*=4*/, unsigned PDEP_width/*64*/)
240: SegmentOrientedKernel("LZ4SwizzledMatchCopyKernel",
241// Inputs
242{
243                                   Binding{iBuilder->getStreamSetTy(1, 1), "MatchOffsetMarker", BoundedRate(0, 1), {DisableSufficientChecking()}},
244                                   Binding{iBuilder->getStreamSetTy(1, 1), "M0Marker", BoundedRate(0, 1), {DisableSufficientChecking()}},
245                                   Binding{iBuilder->getStreamSetTy(1, 1), "M0CountMarker", BoundedRate(0, 1), {DisableSufficientChecking()}},
246                                   Binding{iBuilder->getStreamSetTy(1, 8), "byteStream", BoundedRate(0, 1)}
247},
248// Outputs
249{},
250// Arguments
251{
252},
253{},
254{
255       Binding{iBuilder->getSizeTy(), "currentProcessIndex"},
256       Binding{iBuilder->getSizeTy(), "pendingMatchPos"},
257       Binding{iBuilder->getSizeTy(), "pendingMatchOffset"},
258       Binding{iBuilder->getSizeTy(), "pendingMatchLength"},
259       Binding(iBuilder->getSizeTy(), "currentOffsetMarkerPos"),
260       Binding(iBuilder->getSizeTy(), "currentM0MarkerPos")
261})
262, mSwizzleFactor(swizzleFactor)
263, mPDEPWidth(PDEP_width)
264, mStreamSize(streamSize)
265, mStreamCount(streamCount) {
266
267    assert((mSwizzleFactor == (iBuilder->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
268    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
269    setStride(4 * 1024 * 1024);
270    addAttribute(MustExplicitlyTerminate());
271
272    mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "sourceStreamSet0", BoundedRate(0, 1), Swizzled()});
273    mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet0", RateEqualTo("sourceStreamSet0")});
274
275    for (unsigned i = 1; i < streamSize; i++) {
276        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "sourceStreamSet" + std::to_string(i), RateEqualTo("sourceStreamSet0"), Swizzled()});
277        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet" + std::to_string(i), RateEqualTo("sourceStreamSet0")});
278    }
279}
280
281}
Note: See TracBrowser for help on using the repository browser.