source: icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_swizzled_match_copy_kernel.cpp @ 5981

Last change on this file since 5981 was 5981, checked in by xwa163, 13 months ago

Improve performance of swizzled_match_copy_kernel by adjusting loop structure

File size: 15.1 KB
Line 
1//
2// Created by wxy325 on 2018/3/9.
3//
4
5#include "lz4_swizzled_match_copy_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <kernels/streamset.h>
8#include <toolchain/toolchain.h>
9#include <vector>
10#include <llvm/Support/raw_ostream.h>
11
12using namespace llvm;
13using namespace std;
14namespace kernel {
15
16Value *LZ4SwizzledMatchCopyKernel::advanceUntilNextBit(const std::unique_ptr<KernelBuilder> &iBuilder, string inputName, Value *startPos, bool isNextOne) {
17    BasicBlock* entryBlock = iBuilder->GetInsertBlock();
18
19    Constant* SIZE_0 = iBuilder->getSize(0);
20    Constant* SIZE_1 = iBuilder->getSize(1);
21    Value* SIZE_64 = iBuilder->getSize(64); // maybe need to handle 32 bit machine
22    Value* SIZE_INPUT_64_COUNT = iBuilder->getSize(this->getInputStreamSetBuffer(inputName)->getBufferBlocks() * iBuilder->getBitBlockWidth() / 64);
23
24    Value* initCurrentPos = startPos;
25
26    Value* offsetMarkerRawPtr = iBuilder->CreatePointerCast(iBuilder->getRawInputPointer(inputName, SIZE_0), iBuilder->getInt64Ty()->getPointerTo());
27
28    BasicBlock* findNextMatchOffsetConBlock = iBuilder->CreateBasicBlock("findNextMatchOffsetConBlock");
29    BasicBlock* findNextMatchOffsetBodyBlock = iBuilder->CreateBasicBlock("findNextMatchOffsetBodyBlock");
30
31    iBuilder->CreateBr(findNextMatchOffsetConBlock);
32    iBuilder->SetInsertPoint(findNextMatchOffsetConBlock);
33    // Find position marker bit of next 1 bit
34
35    PHINode* phiCurrentPos = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
36    phiCurrentPos->addIncoming(initCurrentPos, entryBlock);
37
38    Value* currentPosGlobalBlockIndex = iBuilder->CreateUDiv(phiCurrentPos, SIZE_64);
39    Value* currentPosLocalBlockIndex = iBuilder->CreateURem(currentPosGlobalBlockIndex, SIZE_INPUT_64_COUNT);
40    Value* currentPosBlockOffset = iBuilder->CreateURem(phiCurrentPos, SIZE_64);
41    Value* currentValue = iBuilder->CreateLoad(iBuilder->CreateGEP(offsetMarkerRawPtr, currentPosLocalBlockIndex));
42
43    Value* countValue = iBuilder->CreateLShr(currentValue, currentPosBlockOffset);
44    if (!isNextOne) {
45        countValue = iBuilder->CreateNot(countValue);
46    }
47    Value* forwardZero = iBuilder->CreateCountForwardZeroes(countValue);
48    Value* realForwardZero = iBuilder->CreateAdd(currentPosBlockOffset, forwardZero);
49
50    // If targetMarker == 0, move to next block, otherwise count forward zero
51    phiCurrentPos->addIncoming(iBuilder->CreateMul(SIZE_64, iBuilder->CreateAdd(currentPosGlobalBlockIndex, SIZE_1)), iBuilder->GetInsertBlock());
52    iBuilder->CreateCondBr(iBuilder->CreateICmpUGE(realForwardZero, SIZE_64), findNextMatchOffsetConBlock, findNextMatchOffsetBodyBlock);
53
54    iBuilder->SetInsertPoint(findNextMatchOffsetBodyBlock);
55
56    Value* newPosition = iBuilder->CreateAdd(iBuilder->CreateMul(currentPosGlobalBlockIndex, SIZE_64), realForwardZero);
57
58    return newPosition;
59}
60
61pair<Value*, Value*> LZ4SwizzledMatchCopyKernel::loadNextMatchOffset(const unique_ptr<KernelBuilder> &iBuilder) {
62    Value* initCurrentPos = iBuilder->CreateAdd(iBuilder->getScalarField("currentOffsetMarkerPos"), iBuilder->getSize(1));
63    Value* newPosition = this->advanceUntilNextBit(iBuilder, "MatchOffsetMarker", initCurrentPos, true);
64
65    // Load Match Offset from newPosition
66    Value* matchOffsetPtr = iBuilder->getRawInputPointer("byteStream", newPosition);
67    // For now, it is safe to cast matchOffset pointer into i16 since the input byte stream is always linear available
68    matchOffsetPtr = iBuilder->CreatePointerCast(matchOffsetPtr, iBuilder->getInt16Ty()->getPointerTo());
69    Value* matchOffset = iBuilder->CreateZExt(iBuilder->CreateLoad(matchOffsetPtr), iBuilder->getSizeTy());
70
71    return std::make_pair(matchOffset, newPosition);
72}
73
74pair<Value*, Value*> LZ4SwizzledMatchCopyKernel::loadNextM0StartEnd(const unique_ptr<KernelBuilder> &iBuilder) {
75    Value* initCurrentPos = iBuilder->getScalarField("currentM0MarkerPos");
76    Value* m0Start = this->advanceUntilNextBit(iBuilder, "M0Marker", initCurrentPos, true);
77    Value* m0End = this->advanceUntilNextBit(iBuilder, "M0Marker", m0Start, false);
78    return std::make_pair(m0Start, m0End);
79};
80
81
82void LZ4SwizzledMatchCopyKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
83    ConstantInt * const SIZE_4_MEGS = iBuilder->getSize(4 * 1024 * 1024);
84
85    BasicBlock * const entryBlock = iBuilder->GetInsertBlock();
86
87    Value * const available = iBuilder->getAvailableItemCount("sourceStreamSet0");
88    Value * const processed = iBuilder->getProcessedItemCount("sourceStreamSet0");
89
90    Value * const itemsToDo = iBuilder->CreateUMin(iBuilder->CreateSub(available, processed), SIZE_4_MEGS);
91    iBuilder->setTerminationSignal(iBuilder->CreateICmpULT(itemsToDo, SIZE_4_MEGS));
92
93
94    // Output Copy
95    generateOutputCopy(iBuilder);
96
97    Value * const toProcessItemCount = iBuilder->CreateAdd(processed, itemsToDo);
98
99    // Match Copy
100    Value *initM0StartProcessIndex = iBuilder->getProcessedItemCount("M0CountMarker");
101    Value *totalM0StartItemsCount = iBuilder->getAvailableItemCount("M0CountMarker");
102
103    BasicBlock * const matchCopyLoopCon = iBuilder->CreateBasicBlock("matchCopyLoopCon");
104    BasicBlock * const processExitBlock = iBuilder->CreateBasicBlock("exit_block");
105
106    BasicBlock * const loadNextMatchInfoBodyBlock = iBuilder->CreateBasicBlock("loadNewMatchInfoBodyBlock");
107    BasicBlock * const matchCopyConBlock = iBuilder->CreateBasicBlock("matchCopyConBlock");
108    BasicBlock * const matchCopyBodyBlock = iBuilder->CreateBasicBlock("matchCopyBodyBlock");
109
110
111    iBuilder->CreateBr(matchCopyLoopCon);
112
113    iBuilder->SetInsertPoint(matchCopyLoopCon);
114    PHINode * const phiProcessIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
115    phiProcessIndex->addIncoming(initM0StartProcessIndex, entryBlock);
116
117    Value * const hasMoreMatchInfo = iBuilder->CreateICmpULT(phiProcessIndex, totalM0StartItemsCount);
118
119    iBuilder->CreateCondBr(hasMoreMatchInfo, loadNextMatchInfoBodyBlock, processExitBlock);
120
121    iBuilder->SetInsertPoint(loadNextMatchInfoBodyBlock);
122
123    auto ret = this->loadNextM0StartEnd(iBuilder);
124    Value *newM0Start = ret.first;
125    Value *newM0End = ret.second;
126
127    auto matchOffsetRet = this->loadNextMatchOffset(iBuilder);
128    Value *newMatchOffset = matchOffsetRet.first;
129    Value* newMatchOffsetPos = matchOffsetRet.second;
130
131    Value * const newMatchLength = iBuilder->CreateAdd(iBuilder->CreateSub(newM0End, newM0Start), iBuilder->getInt64(1));
132
133    iBuilder->CreateBr(matchCopyConBlock);
134    iBuilder->SetInsertPoint(matchCopyConBlock);
135
136    Value * const hasNotReachEnd = iBuilder->CreateICmpULT(newM0Start, toProcessItemCount);
137    iBuilder->CreateLikelyCondBr(hasNotReachEnd, matchCopyBodyBlock, processExitBlock);
138
139    iBuilder->SetInsertPoint(matchCopyBodyBlock);
140
141    iBuilder->setScalarField("currentOffsetMarkerPos", newMatchOffsetPos);
142    iBuilder->setProcessedItemCount("MatchOffsetMarker", newMatchOffsetPos);
143    iBuilder->setScalarField("currentM0MarkerPos", newM0End);
144    iBuilder->setProcessedItemCount("M0Marker", newM0End);
145
146
147    BasicBlock* copyLoopCon = iBuilder->CreateBasicBlock("copyLoopCon");
148    BasicBlock* copyLoopBody = iBuilder->CreateBasicBlock("copyLoopBody");
149    iBuilder->CreateBr(copyLoopCon);
150    iBuilder->SetInsertPoint(copyLoopCon);
151    PHINode* phiMatchLength = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
152    PHINode* phiMatchPos = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
153
154    phiMatchLength->addIncoming(newMatchLength, matchCopyBodyBlock);
155    phiMatchPos->addIncoming(newM0Start, matchCopyBodyBlock);
156
157    phiProcessIndex->addIncoming(iBuilder->CreateAdd(phiProcessIndex, iBuilder->getSize(1)), iBuilder->GetInsertBlock());
158
159
160    iBuilder->CreateLikelyCondBr(iBuilder->CreateICmpNE(phiMatchLength, iBuilder->getSize(0)), copyLoopBody, matchCopyLoopCon);
161
162    iBuilder->SetInsertPoint(copyLoopBody);
163    Value* copySize = this->doMatchCopy(iBuilder, phiMatchPos, newMatchOffset, phiMatchLength);
164    phiMatchLength->addIncoming(iBuilder->CreateSub(phiMatchLength, copySize), iBuilder->GetInsertBlock());
165    phiMatchPos->addIncoming(iBuilder->CreateAdd(phiMatchPos, copySize), iBuilder->GetInsertBlock());
166    iBuilder->CreateBr(copyLoopCon);
167
168    iBuilder->SetInsertPoint(processExitBlock);
169    iBuilder->setProcessedItemCount("M0CountMarker", phiProcessIndex);
170    iBuilder->setProcessedItemCount("M0Marker", toProcessItemCount);
171    iBuilder->setProcessedItemCount("sourceStreamSet0", toProcessItemCount);
172    iBuilder->setScalarField("currentM0MarkerPos", toProcessItemCount);
173
174}
175
176llvm::Value* LZ4SwizzledMatchCopyKernel::doMatchCopy(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value* phiMatchPos, llvm::Value* phiMatchOffset, llvm::Value* phiMatchLength) {
177    ConstantInt * const SIZE_ZERO = iBuilder->getSize(0);
178    ConstantInt * const SIZE_ONE = iBuilder->getSize(1);
179    ConstantInt * const SIZE_PDEP_WIDTH = iBuilder->getSize(mPDEPWidth);
180    ConstantInt * const SIZE_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
181
182    ConstantInt * const outputBufferBlocks = iBuilder->getSize(this->getAnyStreamSetBuffer("outputStreamSet0")->getBufferBlocks());
183
184    Value* matchPosLocalBlockIndex = iBuilder->CreateURem(iBuilder->CreateUDiv(phiMatchPos, SIZE_BLOCK_WIDTH), outputBufferBlocks);
185    Value * const matchCopyTargetStreamIndex = iBuilder->CreateURem(iBuilder->CreateUDiv(phiMatchPos, SIZE_PDEP_WIDTH), iBuilder->getSize(mStreamCount));
186    Value * const matchCopyTargetBlockOffset = iBuilder->CreateURem(phiMatchPos, SIZE_PDEP_WIDTH);
187
188    Value * const matchCopyFromPos = iBuilder->CreateSub(phiMatchPos, phiMatchOffset);
189
190    Value* matchCopyFromLocalBlockIndex = iBuilder->CreateURem(iBuilder->CreateUDiv(matchCopyFromPos, SIZE_BLOCK_WIDTH), outputBufferBlocks);
191    Value * const matchCopyFromStreamIndex = iBuilder->CreateURem(iBuilder->CreateUDiv(matchCopyFromPos, SIZE_PDEP_WIDTH), iBuilder->getSize(mStreamCount));
192    Value * const matchCopyFromBlockOffset = iBuilder->CreateURem(matchCopyFromPos, SIZE_PDEP_WIDTH);
193
194    Value * currentCopySize = iBuilder->CreateSub(SIZE_PDEP_WIDTH, iBuilder->CreateUMax(matchCopyFromBlockOffset, matchCopyTargetBlockOffset));
195    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchOffset);
196    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchLength);
197    currentCopySize = iBuilder->CreateSelect(iBuilder->CreateICmpEQ(currentCopySize, SIZE_ZERO), SIZE_ONE, currentCopySize); //Workaround for the last byte
198
199    Value * const shiftOffset = iBuilder->CreateAdd(matchCopyFromBlockOffset, currentCopySize);
200    Value * highOffset = iBuilder->CreateShl(SIZE_ONE, shiftOffset);
201    highOffset = iBuilder->CreateSelect(iBuilder->CreateICmpEQ(currentCopySize, SIZE_PDEP_WIDTH), SIZE_ZERO, highOffset); // When currentCopySize == SIZE_PDEP_WIDTH, shl will overflow
202    Value * const lowOffset = iBuilder->CreateShl(SIZE_ONE, matchCopyFromBlockOffset);
203    Value * const maskVector = iBuilder->simd_fill(mPDEPWidth, iBuilder->CreateSub(highOffset, lowOffset));
204    Value * const fromBlockOffsetVector = iBuilder->simd_fill(mPDEPWidth, matchCopyFromBlockOffset);
205    Value * const targetBlockOffsetVector = iBuilder->simd_fill(mPDEPWidth, matchCopyTargetBlockOffset);
206
207    for (unsigned i = 0; i < mStreamSize; i++) {
208        Value* basePtr = iBuilder->CreatePointerCast(iBuilder->getRawOutputPointer("outputStreamSet" + std::to_string(i), SIZE_ZERO), iBuilder->getBitBlockType()->getPointerTo());
209
210        Value * const matchCopyFromBlockPtr = iBuilder->CreateGEP(basePtr, iBuilder->CreateAdd(iBuilder->CreateMul(matchCopyFromLocalBlockIndex, iBuilder->getSize(mStreamCount)), matchCopyFromStreamIndex));
211        Value * const fromBlockValue = iBuilder->CreateBlockAlignedLoad(matchCopyFromBlockPtr);
212
213        Value * const outputTargetBlockPtr = iBuilder->CreateGEP(basePtr, iBuilder->CreateAdd(iBuilder->CreateMul(matchPosLocalBlockIndex, iBuilder->getSize(mStreamCount)), matchCopyTargetStreamIndex));
214        Value * const targetOriginalValue = iBuilder->CreateBlockAlignedLoad(outputTargetBlockPtr);
215
216        Value * copiedValue = iBuilder->simd_and(fromBlockValue, maskVector);
217        copiedValue = iBuilder->CreateLShr(copiedValue, fromBlockOffsetVector);
218        copiedValue = iBuilder->CreateShl(copiedValue, targetBlockOffsetVector);
219        Value * const finalValue = iBuilder->CreateOr(targetOriginalValue, copiedValue);
220
221        iBuilder->CreateStore(finalValue, outputTargetBlockPtr);
222    }
223    return currentCopySize;
224}
225
226void LZ4SwizzledMatchCopyKernel::generateOutputCopy(const std::unique_ptr<KernelBuilder> & iBuilder) {
227    Constant * SIZE_ZERO = iBuilder->getSize(0);
228    Constant * COPY_BYTES = iBuilder->getSize(4 * 1024 * 1024 * mStreamCount / 8);
229    for (unsigned i = 0; i < mStreamSize; i++) {
230        Value * inputBasePtr = iBuilder->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(i), SIZE_ZERO);
231        Value * outputBasePtr = iBuilder->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), SIZE_ZERO);
232        iBuilder->CreateMemCpy(outputBasePtr, inputBasePtr, COPY_BYTES, 1); // Not align guaranteed in final block
233    }
234}
235
236
237LZ4SwizzledMatchCopyKernel::LZ4SwizzledMatchCopyKernel(const std::unique_ptr<kernel::KernelBuilder> &iBuilder, unsigned streamCount/*=4*/, unsigned streamSize/*=2*/, unsigned swizzleFactor/*=4*/, unsigned PDEP_width/*64*/)
238: SegmentOrientedKernel("LZ4SwizzledMatchCopyKernel",
239// Inputs
240{
241                                   Binding{iBuilder->getStreamSetTy(1, 1), "MatchOffsetMarker", BoundedRate(0, 1), {DisableSufficientChecking()}},
242                                   Binding{iBuilder->getStreamSetTy(1, 1), "M0Marker", BoundedRate(0, 1), {DisableSufficientChecking()}},
243                                   Binding{iBuilder->getStreamSetTy(1, 1), "M0CountMarker", BoundedRate(0, 1), {DisableSufficientChecking()}},
244                                   Binding{iBuilder->getStreamSetTy(1, 8), "byteStream", BoundedRate(0, 1)}
245},
246// Outputs
247{},
248// Arguments
249{
250},
251{},
252{
253       Binding{iBuilder->getSizeTy(), "currentProcessIndex"},
254       Binding{iBuilder->getSizeTy(), "pendingMatchPos"},
255       Binding{iBuilder->getSizeTy(), "pendingMatchOffset"},
256       Binding{iBuilder->getSizeTy(), "pendingMatchLength"},
257       Binding(iBuilder->getSizeTy(), "currentOffsetMarkerPos"),
258       Binding(iBuilder->getSizeTy(), "currentM0MarkerPos")
259})
260, mSwizzleFactor(swizzleFactor)
261, mPDEPWidth(PDEP_width)
262, mStreamSize(streamSize)
263, mStreamCount(streamCount) {
264
265    assert((mSwizzleFactor == (iBuilder->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
266    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
267    setStride(4 * 1024 * 1024);
268    addAttribute(MustExplicitlyTerminate());
269
270    mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "sourceStreamSet0", BoundedRate(0, 1), Swizzled()});
271    mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet0", RateEqualTo("sourceStreamSet0")});
272
273    for (unsigned i = 1; i < streamSize; i++) {
274        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "sourceStreamSet" + std::to_string(i), RateEqualTo("sourceStreamSet0"), Swizzled()});
275        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet" + std::to_string(i), RateEqualTo("sourceStreamSet0")});
276    }
277}
278
279}
Note: See TracBrowser for help on using the repository browser.