source: icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_swizzled_match_copy_kernel.cpp @ 5941

Last change on this file since 5941 was 5941, checked in by xwa163, 17 months ago
  1. Add attributes to disable some features of multiblock kernel
  2. Fix bug for lz4d new approach in large data, pass all test cases
  3. Disable lz4d related test cases for old approach
File size: 16.5 KB
Line 
1//
2// Created by wxy325 on 2018/3/9.
3//
4
5#include "lz4_swizzled_match_copy_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <kernels/streamset.h>
8#include <toolchain/toolchain.h>
9
10
11using namespace llvm;
12using namespace kernel;
13using namespace std;
14
15Value* LZ4SwizzledMatchCopyKernel::loadInt64NumberInput(const unique_ptr<KernelBuilder> &iBuilder, string bufferName, Value* offset) {
16    // GEP here is safe
17    Value* SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
18    Value* inputLocalBlockIndex = iBuilder->CreateUDiv(offset, SIZE_BIT_BLOCK_WIDTH);
19    Value* inputLocalBlockOffset = iBuilder->CreateURem(offset, SIZE_BIT_BLOCK_WIDTH);
20
21    Value* blockBasePtr = iBuilder->getInputStreamBlockPtr(bufferName, iBuilder->getSize(0), inputLocalBlockIndex);
22    blockBasePtr = iBuilder->CreatePointerCast(blockBasePtr, iBuilder->getInt64Ty()->getPointerTo());
23    // GEP here is safe
24    return iBuilder->CreateLoad(iBuilder->CreateGEP(blockBasePtr, inputLocalBlockOffset));
25}
26
27void LZ4SwizzledMatchCopyKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides)  {
28    // Const
29    Constant *SIZE_ZERO = iBuilder->getSize(0);
30    Constant *SIZE_ONE = iBuilder->getSize(1);
31    Constant *SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
32    Constant *SIZE_PDEP_WIDTH = iBuilder->getSize(mPDEPWidth);
33
34    BasicBlock* exitBlock = iBuilder->CreateBasicBlock("exitBlock");
35
36    Value *itemsToDo = mAvailableItemCount[3];
37    Value *processedItemCount = mInitialProcessedItemCount[3];
38    Value *totalItemCount = iBuilder->CreateAdd(itemsToDo, processedItemCount);
39    Value *isFinalBlock = iBuilder->CreateICmpULT(itemsToDo, iBuilder->getSize(4 * 1024 * 1024));
40    this->mIsFinalBlock = isFinalBlock;
41    iBuilder->setTerminationSignal(isFinalBlock);
42
43    Value *previousProducedItemCount = iBuilder->getProducedItemCount("outputStreamSet0");
44
45    // Space Calculation
46    Value *outputBufferBlocks = iBuilder->getSize(
47            this->getAnyStreamSetBuffer("outputStreamSet0")->getBufferBlocks());
48
49    Value *outputBlocks = iBuilder->getSize(4 * 1024 * 1024 / iBuilder->getBitBlockWidth()); // Always be 4MB
50
51
52    BasicBlock* processBlock = iBuilder->CreateBasicBlock("processBlock");
53    Value* isInputEnough = iBuilder->CreateOr(isFinalBlock, iBuilder->CreateICmpUGE(itemsToDo, iBuilder->getSize(4 * 1024 * 1024)));
54
55    iBuilder->CreateCondBr(isInputEnough, processBlock, exitBlock);
56
57    iBuilder->SetInsertPoint(processBlock);
58    // Output Copy
59    this->generateOutputCopy(iBuilder, outputBlocks);
60
61    Value *newProducedItemCount = iBuilder->getProducedItemCount("outputStreamSet0");
62
63    BasicBlock *copyEndBlock = iBuilder->CreateBasicBlock("copyEnd");
64    iBuilder->CreateBr(copyEndBlock);
65    iBuilder->SetInsertPoint(copyEndBlock);
66
67    // Match Copy
68    BasicBlock *processExitBlock = iBuilder->CreateBasicBlock("exit_block");
69
70    Value *initM0StartProcessIndex = iBuilder->getProcessedItemCount("m0Start");
71    Value *totalM0StartItemsCount = iBuilder->CreateAdd(initM0StartProcessIndex, mAvailableItemCount[0]);
72
73    Value *initMatchOffset = iBuilder->getScalarField("pendingMatchOffset");
74    Value *initMatchLength = iBuilder->getScalarField("pendingMatchLength");
75    Value *initMatchPos = iBuilder->getScalarField("pendingMatchPos");
76
77    BasicBlock *matchCopyLoopCon = iBuilder->CreateBasicBlock("matchCopyLoopCon");
78    iBuilder->CreateBr(matchCopyLoopCon);
79
80    iBuilder->SetInsertPoint(matchCopyLoopCon);
81
82
83    PHINode *phiProcessIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
84    phiProcessIndex->addIncoming(initM0StartProcessIndex, copyEndBlock);
85
86    PHINode *phiMatchOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
87    phiMatchOffset->addIncoming(initMatchOffset, copyEndBlock);
88
89    PHINode *phiMatchLength = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
90    phiMatchLength->addIncoming(initMatchLength, copyEndBlock);
91
92    PHINode *phiMatchPos = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
93    phiMatchPos->addIncoming(initMatchPos, copyEndBlock);
94
95    BasicBlock *loadNextMatchInfoConBlock = iBuilder->CreateBasicBlock("loadNewMatchInfoConBlock");
96    BasicBlock *loadNextMatchInfoBodyBlock = iBuilder->CreateBasicBlock("loadNewMatchInfoBodyBlock");
97
98    BasicBlock *matchCopyConBlock = iBuilder->CreateBasicBlock("matchCopyConBlock");
99    BasicBlock *matchCopyBodyBlock = iBuilder->CreateBasicBlock("matchCopyBodyBlock");
100
101
102    iBuilder->CreateCondBr(
103            iBuilder->CreateICmpEQ(phiMatchLength, iBuilder->getSize(0)),
104            loadNextMatchInfoConBlock,
105            matchCopyConBlock
106    );
107
108
109    iBuilder->SetInsertPoint(loadNextMatchInfoConBlock);
110
111    Value *hasMoreMatchInfo = iBuilder->CreateICmpULT(phiProcessIndex, totalM0StartItemsCount);
112    iBuilder->CreateCondBr(hasMoreMatchInfo, loadNextMatchInfoBodyBlock, processExitBlock);
113
114    iBuilder->SetInsertPoint(loadNextMatchInfoBodyBlock);
115
116
117    Value *m0StartBaseOffset = iBuilder->CreateURem(initM0StartProcessIndex, SIZE_BIT_BLOCK_WIDTH);
118    Value *m0StartLoadOffset = iBuilder->CreateAdd(m0StartBaseOffset,
119                                                   iBuilder->CreateSub(phiProcessIndex, initM0StartProcessIndex));
120
121
122    Value *newM0Start = this->loadInt64NumberInput(iBuilder, "m0Start", m0StartLoadOffset);
123    Value *newM0End = this->loadInt64NumberInput(iBuilder, "m0End", m0StartLoadOffset);
124    Value *newMatchOffset = this->loadInt64NumberInput(iBuilder, "matchOffset", m0StartLoadOffset);
125
126    Value *depositStart = newM0Start;
127
128    Value *depositEnd = iBuilder->CreateAdd(newM0End, iBuilder->getInt64(1));
129    Value *newMatchLength = iBuilder->CreateSub(depositEnd, depositStart);
130    phiProcessIndex->addIncoming(iBuilder->CreateAdd(phiProcessIndex, SIZE_ONE), iBuilder->GetInsertBlock());
131
132    phiMatchPos->addIncoming(depositStart, iBuilder->GetInsertBlock());
133    phiMatchOffset->addIncoming(newMatchOffset, iBuilder->GetInsertBlock());
134    phiMatchLength->addIncoming(newMatchLength, iBuilder->GetInsertBlock());
135
136    iBuilder->CreateBr(matchCopyLoopCon);
137
138
139    iBuilder->SetInsertPoint(matchCopyConBlock);
140    Value *hasNotReachEnd = iBuilder->CreateICmpULT(phiMatchPos, newProducedItemCount);
141//    iBuilder->CallPrintInt("newProducedItemCount", newProducedItemCount);
142    iBuilder->CreateCondBr(hasNotReachEnd, matchCopyBodyBlock, processExitBlock);
143
144    iBuilder->SetInsertPoint(matchCopyBodyBlock);
145
146
147    Value* matchCopyFromPos = iBuilder->CreateSub(phiMatchPos, phiMatchOffset);
148    Value* outputBufferSize = iBuilder->CreateMul(outputBufferBlocks, SIZE_BIT_BLOCK_WIDTH);
149    Value* matchCopyFromOffset = iBuilder->CreateURem(matchCopyFromPos, outputBufferSize);
150    Value* matchCopyFromBlockIndex = iBuilder->CreateUDiv(matchCopyFromOffset, SIZE_PDEP_WIDTH);
151    Value* matchCopyFromBlockOffset = iBuilder->CreateURem(matchCopyFromOffset, SIZE_PDEP_WIDTH);
152
153    Value* matchCopyTargetBlockIndex = iBuilder->CreateUDiv(iBuilder->CreateSub(phiMatchPos, previousProducedItemCount), SIZE_PDEP_WIDTH);
154    Value* matchCopyTargetBlockOffset = iBuilder->CreateURem(phiMatchPos, SIZE_PDEP_WIDTH);
155
156
157    Value* matchCopyFromRemain = iBuilder->CreateSub(SIZE_PDEP_WIDTH, matchCopyFromBlockOffset);
158    Value* matchCopyTargetRemain = iBuilder->CreateSub(SIZE_PDEP_WIDTH, matchCopyTargetBlockOffset);
159
160    Value* currentCopySize = iBuilder->CreateUMin(matchCopyFromRemain, matchCopyTargetRemain);
161    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchOffset);
162    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchLength);
163    currentCopySize = iBuilder->CreateUMin(currentCopySize, iBuilder->CreateSub(newProducedItemCount, phiMatchPos));
164    currentCopySize = iBuilder->CreateSelect(iBuilder->CreateICmpEQ(currentCopySize, SIZE_ZERO), SIZE_ONE, currentCopySize); //Workaround for the last byte
165    Value* singleMask = iBuilder->CreateSub(
166            iBuilder->CreateSelect( // When currentCopySize == SIZE_PDEP_WIDTH, shl will cause overflow
167                    iBuilder->CreateICmpEQ(currentCopySize, SIZE_PDEP_WIDTH),
168                    SIZE_ZERO,
169                    iBuilder->CreateShl(SIZE_ONE, iBuilder->CreateAdd(matchCopyFromBlockOffset, currentCopySize))
170            ),
171            iBuilder->CreateShl(SIZE_ONE, matchCopyFromBlockOffset)
172    );
173    Value* fullMask = iBuilder->simd_fill(mPDEPWidth, singleMask);
174
175    for (int i = 0; i < mStreamSize; i++) {
176        Value* rawOutputBasePtr = iBuilder->getRawOutputPointer("outputStreamSet" + std::to_string(i), SIZE_ZERO);
177        rawOutputBasePtr = iBuilder->CreatePointerCast(rawOutputBasePtr, iBuilder->getBitBlockType()->getPointerTo());
178        Value* matchCopyFromBlockPtr = iBuilder->CreateGEP(rawOutputBasePtr, matchCopyFromBlockIndex);
179
180        Value* fromBlockValue = iBuilder->CreateLoad(matchCopyFromBlockPtr);
181
182        Value* copiedValue = iBuilder->simd_and(fromBlockValue, fullMask);
183
184        Value* outputBlockBasePtr = iBuilder->CreatePointerCast(iBuilder->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), SIZE_ZERO), iBuilder->getBitBlockType()->getPointerTo());
185        Value* outputTargetBlockPtr = iBuilder->CreateGEP(outputBlockBasePtr, matchCopyTargetBlockIndex);
186        Value* targetOriginalValue = iBuilder->CreateLoad(outputTargetBlockPtr);
187
188        Value* finalValue = iBuilder->simd_or(
189                targetOriginalValue,
190                iBuilder->CreateShl(
191                        iBuilder->CreateLShr(
192                                copiedValue,
193                                iBuilder->simd_fill(mPDEPWidth, matchCopyFromBlockOffset)
194                        ),
195                        iBuilder->simd_fill(mPDEPWidth, matchCopyTargetBlockOffset)
196                )
197        );
198
199
200//        iBuilder->CallPrintRegister("targetOriginalValue", targetOriginalValue);
201//        iBuilder->CallPrintRegister("finalValue", finalValue);
202//        iBuilder->CallPrintInt("matchCopyTargetBlockOffset", matchCopyTargetBlockOffset);
203//        iBuilder->CallPrintInt("currentCopySize", currentCopySize);
204        iBuilder->CreateStore(finalValue, outputTargetBlockPtr);
205    }
206
207    phiProcessIndex->addIncoming(phiProcessIndex, iBuilder->GetInsertBlock());
208    phiMatchOffset->addIncoming(phiMatchOffset, iBuilder->GetInsertBlock());
209    phiMatchPos->addIncoming(iBuilder->CreateAdd(phiMatchPos, currentCopySize), iBuilder->GetInsertBlock());
210    phiMatchLength->addIncoming(iBuilder->CreateSub(phiMatchLength, currentCopySize), iBuilder->GetInsertBlock());
211
212    iBuilder->CreateBr(matchCopyLoopCon);
213
214    iBuilder->SetInsertPoint(processExitBlock);
215    iBuilder->setScalarField("pendingMatchOffset", phiMatchOffset);
216    iBuilder->setScalarField("pendingMatchLength", phiMatchLength);
217    iBuilder->setScalarField("pendingMatchPos", phiMatchPos);
218    iBuilder->setProcessedItemCount("m0Start", phiProcessIndex);
219    iBuilder->setProcessedItemCount("m0End", phiProcessIndex);
220    iBuilder->setProcessedItemCount("matchOffset", phiProcessIndex);
221
222    iBuilder->CreateBr(exitBlock);
223    iBuilder->SetInsertPoint(exitBlock);
224}
225
226void LZ4SwizzledMatchCopyKernel::generateOutputCopy(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value* outputBlocks) {
227    Value *SIZE_ZERO = iBuilder->getSize(0);
228    Value *SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
229    Type* bytePtrType = iBuilder->getInt8PtrTy();
230
231    Value *previousProcessed = iBuilder->getProcessedItemCount("sourceStreamSet0");
232
233
234    Value *itemsToDo = mAvailableItemCount[3];
235    Value *copySize = iBuilder->CreateMul(outputBlocks, SIZE_BIT_BLOCK_WIDTH);
236    Value* actualCopySize = iBuilder->CreateUMin(itemsToDo, copySize);
237    Value* copyByte = iBuilder->CreateUDivCeil(iBuilder->CreateMul(copySize, iBuilder->getSize(mStreamCount)), iBuilder->getSize(8)); // i8
238
239
240    for (int i = 0; i < mStreamSize; i++) {
241        Value *inputBasePtr = iBuilder->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(i), SIZE_ZERO);
242        Value *outputBasePtr = iBuilder->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), SIZE_ZERO);
243        iBuilder->CreateMemCpy(
244                iBuilder->CreatePointerCast(outputBasePtr, bytePtrType),
245                iBuilder->CreatePointerCast(inputBasePtr, bytePtrType),
246                copyByte,
247                1 // Not align guaranteed in final block
248        );
249    }
250    Value *newProcessed = iBuilder->CreateAdd(previousProcessed, actualCopySize);
251    iBuilder->setProcessedItemCount("sourceStreamSet0", newProcessed);
252//    iBuilder->CallPrintInt("swizzledMatchCopy:newProcessed", newProcessed);
253    iBuilder->setProducedItemCount("outputStreamSet0", newProcessed);
254}
255
256Value* LZ4SwizzledMatchCopyKernel::getMaximumMatchCopyBlock(const std::unique_ptr<KernelBuilder> &iBuilder) {
257    Value *SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
258    Value *SIZE_ZERO = iBuilder->getSize(0);
259    Value *SIZE_ONE = iBuilder->getSize(1);
260    Value *m0EndInitOffset = iBuilder->CreateURem(iBuilder->getProcessedItemCount("m0End"), SIZE_BIT_BLOCK_WIDTH);
261    Value *m0EndItemsToDo = mAvailableItemCount[1];
262    Value *m0EndBasePtr = iBuilder->getInputStreamBlockPtr("m0End", SIZE_ZERO);
263    m0EndBasePtr = iBuilder->CreatePointerCast(m0EndBasePtr, iBuilder->getInt64Ty()->getPointerTo());
264    Value *lastM0 = iBuilder->CreateLoad(
265            iBuilder->CreateGEP(
266                    m0EndBasePtr,
267                    iBuilder->CreateSub(
268                            iBuilder->CreateAdd(m0EndInitOffset, m0EndItemsToDo),
269                            SIZE_ONE
270                    )
271
272            )
273    );
274    Value *lastDepositPosition = iBuilder->CreateAdd(lastM0, SIZE_ONE);
275
276    Value *currentMaxBlock = iBuilder->CreateSelect(
277            this->mIsFinalBlock,
278            iBuilder->CreateUDivCeil(lastDepositPosition, SIZE_BIT_BLOCK_WIDTH),
279            iBuilder->CreateUDiv(lastDepositPosition, SIZE_BIT_BLOCK_WIDTH)
280    );
281
282    // Produced Item Count will always be full bitblock except for final block
283    Value *previousProducedBlocks = iBuilder->CreateUDiv(
284            iBuilder->getProducedItemCount("outputStreamSet0"),
285            SIZE_BIT_BLOCK_WIDTH
286    );
287
288    // (m0 + 1) / BitBlockWidth - produceItemCount / BitBlockWidth
289    return iBuilder->CreateSub(currentMaxBlock, previousProducedBlocks);
290}
291
292LZ4SwizzledMatchCopyKernel::LZ4SwizzledMatchCopyKernel(const std::unique_ptr<kernel::KernelBuilder> &iBuilder, unsigned streamCount/*=4*/, unsigned streamSize/*=2*/, unsigned swizzleFactor/*=4*/, unsigned PDEP_width/*64*/)
293        : MultiBlockKernel("LZ4SwizzledMatchCopyKernel",
294        // Inputs
295                           {
296                                   Binding{iBuilder->getStreamSetTy(1, 64), "m0Start", BoundedRate(0, 1), AlwaysConsume()},
297                                   Binding{iBuilder->getStreamSetTy(1, 64), "m0End", BoundedRate(0, 1), AlwaysConsume()},
298                                   Binding{iBuilder->getStreamSetTy(1, 64), "matchOffset", BoundedRate(0, 1), AlwaysConsume()},
299
300                           },
301        // Outputs
302                           {},
303        // Arguments
304                           {
305                                   Binding{iBuilder->getSizeTy(), "fileSize"} //TODO remove
306                           },
307                           {},
308                           {
309                                   Binding{iBuilder->getSizeTy(), "currentProcessIndex"},
310                                   Binding{iBuilder->getSizeTy(), "pendingMatchPos"},
311                                   Binding{iBuilder->getSizeTy(), "pendingMatchOffset"},
312                                   Binding{iBuilder->getSizeTy(), "pendingMatchLength"},
313                           })
314        , mSwizzleFactor(swizzleFactor)
315        , mPDEPWidth(PDEP_width)
316        , mStreamSize(streamSize)
317        , mStreamCount(streamCount) {
318
319    assert((mSwizzleFactor == (iBuilder->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
320    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
321    this->setStride(4 * 1024 * 1024);
322    addAttribute(MustExplicitlyTerminate());
323
324    mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "sourceStreamSet0", BoundedRate(0, 1), {Swizzled(), DisableTemporaryBuffer()}});
325    mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet0", BoundedRate(0, 1), DisableTemporaryBuffer()});
326
327    for (int i = 1; i < streamSize; i++) {
328        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "sourceStreamSet" + std::to_string(i), RateEqualTo("sourceStreamSet0"), {Swizzled(), DisableTemporaryBuffer()}});
329        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet" + std::to_string(i), RateEqualTo("outputStreamSet0"), DisableTemporaryBuffer()});
330    }
331}
Note: See TracBrowser for help on using the repository browser.