source: icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_swizzled_match_copy_kernel.cpp @ 5906

Last change on this file since 5906 was 5906, checked in by xwa163, 12 months ago

Implement swizzled match copy kernel, which can do match copy in swizzled bitstream form

File size: 17.8 KB
Line 
1//
2// Created by wxy325 on 2018/3/9.
3//
4
5#include "lz4_swizzled_match_copy_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <kernels/streamset.h>
8#include <toolchain/toolchain.h>
9
10
11using namespace llvm;
12using namespace kernel;
13using namespace std;
14
15void LZ4SwizzledMatchCopyKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides)  {
16    // Const
17    Constant *SIZE_ZERO = iBuilder->getSize(0);
18    Constant *SIZE_ONE = iBuilder->getSize(1);
19    Constant *SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
20    Constant *SIZE_PDEP_WIDTH = iBuilder->getSize(mPDEPWidth);
21
22    Value *itemsToDo = mAvailableItemCount[3];
23
24
25
26    Value *previousProducedItemCount = iBuilder->getProducedItemCount("outputStreamSet0");
27
28    // Space Calculation
29    Value *outputBufferBlocks = iBuilder->getSize(
30            this->getAnyStreamSetBuffer("outputStreamSet0")->getBufferBlocks());
31    Value *outputRawBeginPtr = iBuilder->CreatePointerCast(
32            iBuilder->getRawOutputPointer("outputStreamSet0", SIZE_ZERO),
33            iBuilder->getBitBlockType()->getPointerTo()); // TODO it is possible the pointer cast here is not necessary
34    Value *outputCurrentPtr = iBuilder->getOutputStreamBlockPtr("outputStreamSet0", SIZE_ZERO);
35    Value *producedOffset = iBuilder->CreatePtrDiff(outputCurrentPtr, outputRawBeginPtr);
36    producedOffset = iBuilder->CreateUDiv(producedOffset, iBuilder->getSize(mStreamCount));
37
38    Value *remainSpace = iBuilder->CreateSub(outputBufferBlocks, producedOffset);
39    Value *matchCopyWindowBlock = iBuilder->getSize(256 * 256 / codegen::BlockSize);
40    Value *remainWindowBlock = iBuilder->CreateSelect(
41            iBuilder->CreateICmpUGE(producedOffset, matchCopyWindowBlock),
42            iBuilder->getSize(0),
43            iBuilder->CreateSub(matchCopyWindowBlock, producedOffset)
44    );
45    Value *writableBlocks = iBuilder->CreateSub(remainSpace,
46                                                remainWindowBlock); //TODO handle beginning, if producedItemCount / bitblockWidth < windowBlock, there is no need for the substraction here
47
48    Value *outputBlocks = iBuilder->CreateUMin(writableBlocks, numOfStrides);
49//    outputBlocks = iBuilder->CreateUMin(outputBlocks, this->getMaximumMatchCopyBlock(iBuilder));
50
51
52    Value *isFinalBlock =
53            iBuilder->CreateOr(
54                    iBuilder->CreateICmpULT(itemsToDo, iBuilder->CreateMul(outputBlocks, SIZE_BIT_BLOCK_WIDTH)),
55                    iBuilder->CreateICmpEQ(itemsToDo, iBuilder->getSize(0))
56            );
57
58    this->mIsFinalBlock = isFinalBlock;
59    iBuilder->setTerminationSignal(isFinalBlock);
60
61    // Output Copy
62    this->generateOutputCopy(iBuilder, outputBlocks);
63
64
65
66    Value *newProducedItemCount = iBuilder->getProducedItemCount("outputStreamSet0");
67
68    BasicBlock *copyEndBlock = iBuilder->CreateBasicBlock("copyEnd");
69    iBuilder->CreateBr(copyEndBlock);
70    iBuilder->SetInsertPoint(copyEndBlock);
71
72    // Match Copy
73    BasicBlock *exitBlock = iBuilder->CreateBasicBlock("exit_block");
74
75    Value *initM0StartProcessIndex = iBuilder->getProcessedItemCount("m0Start");
76    Value *totalM0StartItemsCount = iBuilder->CreateAdd(initM0StartProcessIndex, mAvailableItemCount[0]);
77
78    Value *initMatchOffset = iBuilder->getScalarField("pendingMatchOffset");
79    Value *initMatchLength = iBuilder->getScalarField("pendingMatchLength");
80    Value *initMatchPos = iBuilder->getScalarField("pendingMatchPos");
81
82    BasicBlock *matchCopyLoopCon = iBuilder->CreateBasicBlock("matchCopyLoopCon");
83    iBuilder->CreateBr(matchCopyLoopCon);
84
85    iBuilder->SetInsertPoint(matchCopyLoopCon);
86
87
88    PHINode *phiProcessIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
89    phiProcessIndex->addIncoming(initM0StartProcessIndex, copyEndBlock);
90
91    PHINode *phiMatchOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
92    phiMatchOffset->addIncoming(initMatchOffset, copyEndBlock);
93
94    PHINode *phiMatchLength = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
95    phiMatchLength->addIncoming(initMatchLength, copyEndBlock);
96
97    PHINode *phiMatchPos = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
98    phiMatchPos->addIncoming(initMatchPos, copyEndBlock);
99
100    BasicBlock *loadNextMatchInfoConBlock = iBuilder->CreateBasicBlock("loadNewMatchInfoConBlock");
101    BasicBlock *loadNextMatchInfoBodyBlock = iBuilder->CreateBasicBlock("loadNewMatchInfoBodyBlock");
102
103    BasicBlock *matchCopyConBlock = iBuilder->CreateBasicBlock("matchCopyConBlock");
104    BasicBlock *matchCopyBodyBlock = iBuilder->CreateBasicBlock("matchCopyBodyBlock");
105
106
107    iBuilder->CreateCondBr(
108            iBuilder->CreateICmpEQ(phiMatchLength, iBuilder->getSize(0)),
109            loadNextMatchInfoConBlock,
110            matchCopyConBlock
111    );
112
113
114    iBuilder->SetInsertPoint(loadNextMatchInfoConBlock);
115
116    Value *hasMoreMatchInfo = iBuilder->CreateICmpULT(phiProcessIndex, totalM0StartItemsCount);
117    iBuilder->CreateCondBr(hasMoreMatchInfo, loadNextMatchInfoBodyBlock, exitBlock);
118
119    iBuilder->SetInsertPoint(loadNextMatchInfoBodyBlock);
120
121    Value *m0StartBasePtr = iBuilder->CreatePointerCast(iBuilder->getInputStreamBlockPtr("m0Start", SIZE_ZERO), iBuilder->getInt64Ty()->getPointerTo());
122    Value *m0EndBasePtr = iBuilder->CreatePointerCast(iBuilder->getInputStreamBlockPtr("m0End", SIZE_ZERO), iBuilder->getInt64Ty()->getPointerTo());
123    Value *matchOffsetBasePtr = iBuilder->CreatePointerCast(iBuilder->getInputStreamBlockPtr("matchOffset", SIZE_ZERO), iBuilder->getInt64Ty()->getPointerTo());
124
125
126    Value *m0StartBaseOffset = iBuilder->CreateURem(initM0StartProcessIndex, SIZE_BIT_BLOCK_WIDTH);
127//    iBuilder->CallPrintInt("rawPtr", iBuilder->getRawInputPointer("m0Start", SIZE_ZERO));
128//    iBuilder->CallPrintInt("ptr", m0StartBasePtr);
129//    iBuilder->CallPrintInt("initM0StartProcessIndex", initM0StartProcessIndex);
130    Value *m0StartLoadOffset = iBuilder->CreateAdd(m0StartBaseOffset,
131                                                   iBuilder->CreateSub(phiProcessIndex, initM0StartProcessIndex));
132
133    Value *newM0Start = iBuilder->CreateLoad(iBuilder->CreateGEP(m0StartBasePtr, m0StartLoadOffset));
134    Value *newM0End = iBuilder->CreateLoad(iBuilder->CreateGEP(m0EndBasePtr, m0StartLoadOffset));
135    Value *newMatchOffset = iBuilder->CreateLoad(iBuilder->CreateGEP(matchOffsetBasePtr, m0StartLoadOffset));
136
137    Value *depositStart = newM0Start;
138//    iBuilder->CallPrintInt("depositStart", depositStart);
139//    iBuilder->CallPrintInt("newMatchLength", newMatchLength);
140
141    Value *depositEnd = iBuilder->CreateAdd(newM0End, iBuilder->getInt64(1));
142    Value *newMatchLength = iBuilder->CreateSub(depositEnd, depositStart);
143    phiProcessIndex->addIncoming(iBuilder->CreateAdd(phiProcessIndex, SIZE_ONE), iBuilder->GetInsertBlock());
144
145    phiMatchPos->addIncoming(depositStart, iBuilder->GetInsertBlock());
146    phiMatchOffset->addIncoming(newMatchOffset, iBuilder->GetInsertBlock());
147    phiMatchLength->addIncoming(newMatchLength, iBuilder->GetInsertBlock());
148
149    iBuilder->CreateBr(matchCopyLoopCon);
150
151
152    iBuilder->SetInsertPoint(matchCopyConBlock);
153    Value *hasNotReachEnd = iBuilder->CreateICmpULT(phiMatchPos, newProducedItemCount);
154//    iBuilder->CallPrintInt("newProducedItemCount", newProducedItemCount);
155    iBuilder->CreateCondBr(hasNotReachEnd, matchCopyBodyBlock, exitBlock);
156
157    iBuilder->SetInsertPoint(matchCopyBodyBlock);
158
159
160    Value* matchCopyFromPos = iBuilder->CreateSub(phiMatchPos, phiMatchOffset);
161    Value* outputBufferSize = iBuilder->CreateMul(outputBufferBlocks, SIZE_BIT_BLOCK_WIDTH);
162    Value* matchCopyFromOffset = iBuilder->CreateURem(matchCopyFromPos, outputBufferSize);
163    Value* matchCopyFromBlockIndex = iBuilder->CreateUDiv(matchCopyFromOffset, SIZE_PDEP_WIDTH);
164    Value* matchCopyFromBlockOffset = iBuilder->CreateURem(matchCopyFromOffset, SIZE_PDEP_WIDTH);
165
166    Value* matchCopyTargetBlockIndex = iBuilder->CreateUDiv(iBuilder->CreateSub(phiMatchPos, previousProducedItemCount), SIZE_PDEP_WIDTH);
167    Value* matchCopyTargetBlockOffset = iBuilder->CreateURem(phiMatchPos, SIZE_PDEP_WIDTH);
168
169
170    Value* matchCopyFromRemain = iBuilder->CreateSub(SIZE_PDEP_WIDTH, matchCopyFromBlockOffset);
171    Value* matchCopyTargetRemain = iBuilder->CreateSub(SIZE_PDEP_WIDTH, matchCopyTargetBlockOffset);
172
173    Value* currentCopySize = iBuilder->CreateUMin(matchCopyFromRemain, matchCopyTargetRemain);
174    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchOffset);
175    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchLength);
176    currentCopySize = iBuilder->CreateUMin(currentCopySize, iBuilder->CreateSub(newProducedItemCount, phiMatchPos));
177    currentCopySize = iBuilder->CreateSelect(iBuilder->CreateICmpEQ(currentCopySize, SIZE_ZERO), SIZE_ONE, currentCopySize); //Workaround for the last byte
178    Value* singleMask = iBuilder->CreateSub(
179            iBuilder->CreateSelect( // When currentCopySize == SIZE_PDEP_WIDTH, shl will cause overflow
180                    iBuilder->CreateICmpEQ(currentCopySize, SIZE_PDEP_WIDTH),
181                    SIZE_ZERO,
182                    iBuilder->CreateShl(SIZE_ONE, iBuilder->CreateAdd(matchCopyFromBlockOffset, currentCopySize))
183            ),
184            iBuilder->CreateShl(SIZE_ONE, matchCopyFromBlockOffset)
185    );
186    Value* fullMask = iBuilder->simd_fill(64, singleMask);
187
188//    iBuilder->CallPrintInt("phiMatchPos", phiMatchPos);
189//    iBuilder->CallPrintInt("currentCopySize", currentCopySize);
190//    iBuilder->CallPrintInt("aaa", iBuilder->CreateShl(SIZE_ONE, iBuilder->CreateAdd(matchCopyFromBlockOffset, currentCopySize)));
191//    iBuilder->CallPrintRegister("fullMask", fullMask);
192
193    for (int i = 0; i < mStreamSize; i++) {
194        Value* rawOutputBasePtr = iBuilder->getRawOutputPointer("outputStreamSet" + std::to_string(i), SIZE_ZERO);
195        rawOutputBasePtr = iBuilder->CreatePointerCast(rawOutputBasePtr, iBuilder->getBitBlockType()->getPointerTo());
196        Value* matchCopyFromBlockPtr = iBuilder->CreateGEP(rawOutputBasePtr, matchCopyFromBlockIndex);
197
198        Value* fromBlockValue = iBuilder->CreateLoad(matchCopyFromBlockPtr);
199
200        Value* copiedValue = iBuilder->simd_and(fromBlockValue, fullMask);
201
202        Value* outputBlockBasePtr = iBuilder->CreatePointerCast(iBuilder->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), SIZE_ZERO), iBuilder->getBitBlockType()->getPointerTo());
203        Value* outputTargetBlockPtr = iBuilder->CreateGEP(outputBlockBasePtr, matchCopyTargetBlockIndex);
204        Value* targetOriginalValue = iBuilder->CreateLoad(outputTargetBlockPtr);
205
206        Value* finalValue = iBuilder->simd_or(
207                targetOriginalValue,
208                iBuilder->CreateShl(
209                        iBuilder->CreateLShr(
210                                copiedValue,
211                                iBuilder->simd_fill(64, matchCopyFromBlockOffset)
212                        ),
213                        iBuilder->simd_fill(64, matchCopyTargetBlockOffset)
214                )
215        );
216
217
218//        iBuilder->CallPrintRegister("targetOriginalValue", targetOriginalValue);
219//        iBuilder->CallPrintRegister("finalValue", finalValue);
220//        iBuilder->CallPrintInt("matchCopyTargetBlockOffset", matchCopyTargetBlockOffset);
221//        iBuilder->CallPrintInt("currentCopySize", currentCopySize);
222        iBuilder->CreateStore(finalValue, outputTargetBlockPtr);
223    }
224
225    phiProcessIndex->addIncoming(phiProcessIndex, iBuilder->GetInsertBlock());
226    phiMatchOffset->addIncoming(phiMatchOffset, iBuilder->GetInsertBlock());
227    phiMatchPos->addIncoming(iBuilder->CreateAdd(phiMatchPos, currentCopySize), iBuilder->GetInsertBlock());
228    phiMatchLength->addIncoming(iBuilder->CreateSub(phiMatchLength, currentCopySize), iBuilder->GetInsertBlock());
229
230    iBuilder->CreateBr(matchCopyLoopCon);
231
232    iBuilder->SetInsertPoint(exitBlock);
233//    iBuilder->CallPrintInt("test", SIZE_ZERO);
234    iBuilder->setScalarField("pendingMatchOffset", phiMatchOffset);
235    iBuilder->setScalarField("pendingMatchLength", phiMatchLength);
236    iBuilder->setScalarField("pendingMatchPos", phiMatchPos);
237//    iBuilder->CallPrintInt("pendingMatchLength", phiMatchLength);
238    iBuilder->setProcessedItemCount("m0Start", phiProcessIndex);
239    iBuilder->setProcessedItemCount("m0End", phiProcessIndex);
240    iBuilder->setProcessedItemCount("matchOffset", phiProcessIndex);
241}
242
243void LZ4SwizzledMatchCopyKernel::generateOutputCopy(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value* outputBlocks) {
244    Value *SIZE_ZERO = iBuilder->getSize(0);
245    Value *SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
246    Type* bytePtrType = iBuilder->getInt8PtrTy();
247
248    Value *previousProcessed = iBuilder->getProcessedItemCount("sourceStreamSet0");
249
250
251    Value *itemsToDo = mAvailableItemCount[3];
252//    iBuilder->CallPrintInt("swizzledMatchCopy:itemsToDo", itemsToDo);
253    Value *copySize = iBuilder->CreateMul(outputBlocks, SIZE_BIT_BLOCK_WIDTH);
254//    iBuilder->CallPrintInt("swizzledMatchCopy:copySize", copySize);
255    Value* actualCopySize = iBuilder->CreateUMin(itemsToDo, copySize);
256    Value* copyByte = iBuilder->CreateUDivCeil(iBuilder->CreateMul(copySize, iBuilder->getSize(mStreamCount)), iBuilder->getSize(8)); // i8
257
258
259    for (int i = 0; i < mStreamSize; i++) {
260        Value *inputBasePtr = iBuilder->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(i), SIZE_ZERO);
261        Value *outputBasePtr = iBuilder->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), SIZE_ZERO);
262        iBuilder->CreateMemCpy(
263                iBuilder->CreatePointerCast(outputBasePtr, bytePtrType),
264                iBuilder->CreatePointerCast(inputBasePtr, bytePtrType),
265                copyByte,
266                1 // Not align guaranteed in final block
267        );
268    }
269    Value *newProcessed = iBuilder->CreateAdd(previousProcessed, actualCopySize);
270    iBuilder->setProcessedItemCount("sourceStreamSet0", newProcessed);
271//    iBuilder->CallPrintInt("swizzledMatchCopy:newProcessed", newProcessed);
272    iBuilder->setProducedItemCount("outputStreamSet0", newProcessed);
273}
274
275Value* LZ4SwizzledMatchCopyKernel::getMaximumMatchCopyBlock(const std::unique_ptr<KernelBuilder> &iBuilder) {
276    Value *SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
277    Value *SIZE_ZERO = iBuilder->getSize(0);
278    Value *SIZE_ONE = iBuilder->getSize(1);
279    Value *m0EndInitOffset = iBuilder->CreateURem(iBuilder->getProcessedItemCount("m0End"), SIZE_BIT_BLOCK_WIDTH);
280    Value *m0EndItemsToDo = mAvailableItemCount[1];
281    Value *m0EndBasePtr = iBuilder->getInputStreamBlockPtr("m0End", SIZE_ZERO);
282    m0EndBasePtr = iBuilder->CreatePointerCast(m0EndBasePtr, iBuilder->getInt64Ty()->getPointerTo());
283    Value *lastM0 = iBuilder->CreateLoad(
284            iBuilder->CreateGEP(
285                    m0EndBasePtr,
286                    iBuilder->CreateSub(
287                            iBuilder->CreateAdd(m0EndInitOffset, m0EndItemsToDo),
288                            SIZE_ONE
289                    )
290
291            )
292    );
293    Value *lastDepositPosition = iBuilder->CreateAdd(lastM0, SIZE_ONE);
294
295    Value *currentMaxBlock = iBuilder->CreateSelect(
296            this->mIsFinalBlock,
297            iBuilder->CreateUDivCeil(lastDepositPosition, SIZE_BIT_BLOCK_WIDTH),
298            iBuilder->CreateUDiv(lastDepositPosition, SIZE_BIT_BLOCK_WIDTH)
299    );
300
301    // Produced Item Count will always be full bitblock except for final block
302    Value *previousProducedBlocks = iBuilder->CreateUDiv(
303            iBuilder->getProducedItemCount("outputStreamSet0"),
304            SIZE_BIT_BLOCK_WIDTH
305    );
306
307    // (m0 + 1) / BitBlockWidth - produceItemCount / BitBlockWidth
308    return iBuilder->CreateSub(currentMaxBlock, previousProducedBlocks);
309}
310
311LZ4SwizzledMatchCopyKernel::LZ4SwizzledMatchCopyKernel(const std::unique_ptr<kernel::KernelBuilder> &iBuilder, unsigned streamCount/*=4*/, unsigned streamSize/*=2*/, unsigned swizzleFactor/*=4*/, unsigned PDEP_width/*64*/)
312        : MultiBlockKernel("LZ4SwizzledMatchCopyKernel",
313        // Inputs
314                           {
315                                   //TODO add swizzled attribute
316                                   Binding{iBuilder->getStreamSetTy(1, 64), "m0Start", BoundedRate(0, 1), AlwaysConsume()},
317                                   Binding{iBuilder->getStreamSetTy(1, 64), "m0End", BoundedRate(0, 1), AlwaysConsume()},
318                                   Binding{iBuilder->getStreamSetTy(1, 64), "matchOffset", BoundedRate(0, 1), AlwaysConsume()},
319
320                           },
321        // Outputs
322                           {},
323        // Arguments
324                           {},
325                           {},
326                           {
327                                   Binding{iBuilder->getSizeTy(), "currentProcessIndex"},
328                                   Binding{iBuilder->getSizeTy(), "pendingMatchPos"},
329                                   Binding{iBuilder->getSizeTy(), "pendingMatchOffset"},
330                                   Binding{iBuilder->getSizeTy(), "pendingMatchLength"},
331                           })
332        , mSwizzleFactor(swizzleFactor)
333        , mPDEPWidth(PDEP_width)
334        , mStreamSize(streamSize)
335        , mStreamCount(streamCount) {
336
337    assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
338    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
339
340    addAttribute(MustExplicitlyTerminate());
341
342
343    mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "sourceStreamSet0", BoundedRate(0, 1), {AlwaysConsume(), Swizzled()}});
344    mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet0", BoundedRate(0, 1)});
345
346    for (int i = 1; i < streamSize; i++) {
347        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "sourceStreamSet" + std::to_string(i), RateEqualTo("sourceStreamSet0"), {AlwaysConsume(), Swizzled()}});
348        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet" + std::to_string(i), RateEqualTo("outputStreamSet0")});
349    }
350}
Note: See TracBrowser for help on using the repository browser.