source: icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_swizzled_match_copy_kernel.cpp @ 5924

Last change on this file since 5924 was 5924, checked in by cameron, 13 months ago

Various cleanups

File size: 17.8 KB
Line 
1//
2// Created by wxy325 on 2018/3/9.
3//
4
5#include "lz4_swizzled_match_copy_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <kernels/streamset.h>
8#include <toolchain/toolchain.h>
9
10
11using namespace llvm;
12using namespace kernel;
13using namespace std;
14
15void LZ4SwizzledMatchCopyKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides)  {
16    // Const
17    Constant *SIZE_ZERO = iBuilder->getSize(0);
18    Constant *SIZE_ONE = iBuilder->getSize(1);
19    Constant *SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
20    Constant *SIZE_PDEP_WIDTH = iBuilder->getSize(mPDEPWidth);
21
22    Value *itemsToDo = mAvailableItemCount[3];
23
24
25
26    Value *previousProducedItemCount = iBuilder->getProducedItemCount("outputStreamSet0");
27
28    // Space Calculation
29    Value *outputBufferBlocks = iBuilder->getSize(
30            this->getAnyStreamSetBuffer("outputStreamSet0")->getBufferBlocks());
31    Value *outputRawBeginPtr = iBuilder->CreatePointerCast(
32            iBuilder->getRawOutputPointer("outputStreamSet0", SIZE_ZERO),
33            iBuilder->getBitBlockType()->getPointerTo()); // TODO it is possible the pointer cast here is not necessary
34    Value *outputCurrentPtr = iBuilder->getOutputStreamBlockPtr("outputStreamSet0", SIZE_ZERO);
35    Value *producedOffset = iBuilder->CreatePtrDiff(outputCurrentPtr, outputRawBeginPtr);
36    producedOffset = iBuilder->CreateUDiv(producedOffset, iBuilder->getSize(mStreamCount));
37
38    Value *remainSpace = iBuilder->CreateSub(outputBufferBlocks, producedOffset);
39    Value *matchCopyWindowBlock = iBuilder->getSize(256 * 256 / codegen::BlockSize);
40    Value *remainWindowBlock = iBuilder->CreateSelect(
41            iBuilder->CreateICmpUGE(producedOffset, matchCopyWindowBlock),
42            iBuilder->getSize(0),
43            iBuilder->CreateSub(matchCopyWindowBlock, producedOffset)
44    );
45    Value *writableBlocks = iBuilder->CreateSub(remainSpace,
46                                                remainWindowBlock); //TODO handle beginning, if producedItemCount / bitblockWidth < windowBlock, there is no need for the substraction here
47
48    Value *outputBlocks = iBuilder->CreateUMin(writableBlocks, numOfStrides);
49//    outputBlocks = iBuilder->CreateUMin(outputBlocks, this->getMaximumMatchCopyBlock(iBuilder));
50
51
52    Value *isFinalBlock =
53            iBuilder->CreateOr(
54                    iBuilder->CreateICmpULT(itemsToDo, iBuilder->CreateMul(outputBlocks, SIZE_BIT_BLOCK_WIDTH)),
55                    iBuilder->CreateICmpEQ(itemsToDo, iBuilder->getSize(0))
56            );
57
58    this->mIsFinalBlock = isFinalBlock;
59    iBuilder->setTerminationSignal(isFinalBlock);
60
61    // Output Copy
62    this->generateOutputCopy(iBuilder, outputBlocks);
63
64
65
66    Value *newProducedItemCount = iBuilder->getProducedItemCount("outputStreamSet0");
67
68    BasicBlock *copyEndBlock = iBuilder->CreateBasicBlock("copyEnd");
69    iBuilder->CreateBr(copyEndBlock);
70    iBuilder->SetInsertPoint(copyEndBlock);
71
72    // Match Copy
73    BasicBlock *exitBlock = iBuilder->CreateBasicBlock("exit_block");
74
75    Value *initM0StartProcessIndex = iBuilder->getProcessedItemCount("m0Start");
76    Value *totalM0StartItemsCount = iBuilder->CreateAdd(initM0StartProcessIndex, mAvailableItemCount[0]);
77
78    Value *initMatchOffset = iBuilder->getScalarField("pendingMatchOffset");
79    Value *initMatchLength = iBuilder->getScalarField("pendingMatchLength");
80    Value *initMatchPos = iBuilder->getScalarField("pendingMatchPos");
81
82    BasicBlock *matchCopyLoopCon = iBuilder->CreateBasicBlock("matchCopyLoopCon");
83    iBuilder->CreateBr(matchCopyLoopCon);
84
85    iBuilder->SetInsertPoint(matchCopyLoopCon);
86
87
88    PHINode *phiProcessIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
89    phiProcessIndex->addIncoming(initM0StartProcessIndex, copyEndBlock);
90
91    PHINode *phiMatchOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
92    phiMatchOffset->addIncoming(initMatchOffset, copyEndBlock);
93
94    PHINode *phiMatchLength = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
95    phiMatchLength->addIncoming(initMatchLength, copyEndBlock);
96
97    PHINode *phiMatchPos = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
98    phiMatchPos->addIncoming(initMatchPos, copyEndBlock);
99
100    BasicBlock *loadNextMatchInfoConBlock = iBuilder->CreateBasicBlock("loadNewMatchInfoConBlock");
101    BasicBlock *loadNextMatchInfoBodyBlock = iBuilder->CreateBasicBlock("loadNewMatchInfoBodyBlock");
102
103    BasicBlock *matchCopyConBlock = iBuilder->CreateBasicBlock("matchCopyConBlock");
104    BasicBlock *matchCopyBodyBlock = iBuilder->CreateBasicBlock("matchCopyBodyBlock");
105
106
107    iBuilder->CreateCondBr(
108            iBuilder->CreateICmpEQ(phiMatchLength, iBuilder->getSize(0)),
109            loadNextMatchInfoConBlock,
110            matchCopyConBlock
111    );
112
113
114    iBuilder->SetInsertPoint(loadNextMatchInfoConBlock);
115
116    Value *hasMoreMatchInfo = iBuilder->CreateICmpULT(phiProcessIndex, totalM0StartItemsCount);
117    iBuilder->CreateCondBr(hasMoreMatchInfo, loadNextMatchInfoBodyBlock, exitBlock);
118
119    iBuilder->SetInsertPoint(loadNextMatchInfoBodyBlock);
120
121    Value *m0StartBasePtr = iBuilder->CreatePointerCast(iBuilder->getInputStreamBlockPtr("m0Start", SIZE_ZERO), iBuilder->getInt64Ty()->getPointerTo());
122    Value *m0EndBasePtr = iBuilder->CreatePointerCast(iBuilder->getInputStreamBlockPtr("m0End", SIZE_ZERO), iBuilder->getInt64Ty()->getPointerTo());
123    Value *matchOffsetBasePtr = iBuilder->CreatePointerCast(iBuilder->getInputStreamBlockPtr("matchOffset", SIZE_ZERO), iBuilder->getInt64Ty()->getPointerTo());
124
125
126    Value *m0StartBaseOffset = iBuilder->CreateURem(initM0StartProcessIndex, SIZE_BIT_BLOCK_WIDTH);
127//    iBuilder->CallPrintInt("rawPtr", iBuilder->getRawInputPointer("m0Start", SIZE_ZERO));
128//    iBuilder->CallPrintInt("ptr", m0StartBasePtr);
129//    iBuilder->CallPrintInt("initM0StartProcessIndex", initM0StartProcessIndex);
130    Value *m0StartLoadOffset = iBuilder->CreateAdd(m0StartBaseOffset,
131                                                   iBuilder->CreateSub(phiProcessIndex, initM0StartProcessIndex));
132
133    Value *newM0Start = iBuilder->CreateLoad(iBuilder->CreateGEP(m0StartBasePtr, m0StartLoadOffset));
134    Value *newM0End = iBuilder->CreateLoad(iBuilder->CreateGEP(m0EndBasePtr, m0StartLoadOffset));
135    Value *newMatchOffset = iBuilder->CreateLoad(iBuilder->CreateGEP(matchOffsetBasePtr, m0StartLoadOffset));
136
137    Value *depositStart = newM0Start;
138//    iBuilder->CallPrintInt("depositStart", depositStart);
139//    iBuilder->CallPrintInt("newMatchLength", newMatchLength);
140
141    Value *depositEnd = iBuilder->CreateAdd(newM0End, iBuilder->getInt64(1));
142    Value *newMatchLength = iBuilder->CreateSub(depositEnd, depositStart);
143    phiProcessIndex->addIncoming(iBuilder->CreateAdd(phiProcessIndex, SIZE_ONE), iBuilder->GetInsertBlock());
144
145    phiMatchPos->addIncoming(depositStart, iBuilder->GetInsertBlock());
146    phiMatchOffset->addIncoming(newMatchOffset, iBuilder->GetInsertBlock());
147    phiMatchLength->addIncoming(newMatchLength, iBuilder->GetInsertBlock());
148
149    iBuilder->CreateBr(matchCopyLoopCon);
150
151
152    iBuilder->SetInsertPoint(matchCopyConBlock);
153    Value *hasNotReachEnd = iBuilder->CreateICmpULT(phiMatchPos, newProducedItemCount);
154//    iBuilder->CallPrintInt("newProducedItemCount", newProducedItemCount);
155    iBuilder->CreateCondBr(hasNotReachEnd, matchCopyBodyBlock, exitBlock);
156
157    iBuilder->SetInsertPoint(matchCopyBodyBlock);
158
159
160    Value* matchCopyFromPos = iBuilder->CreateSub(phiMatchPos, phiMatchOffset);
161    Value* outputBufferSize = iBuilder->CreateMul(outputBufferBlocks, SIZE_BIT_BLOCK_WIDTH);
162    Value* matchCopyFromOffset = iBuilder->CreateURem(matchCopyFromPos, outputBufferSize);
163    Value* matchCopyFromBlockIndex = iBuilder->CreateUDiv(matchCopyFromOffset, SIZE_PDEP_WIDTH);
164    Value* matchCopyFromBlockOffset = iBuilder->CreateURem(matchCopyFromOffset, SIZE_PDEP_WIDTH);
165
166    Value* matchCopyTargetBlockIndex = iBuilder->CreateUDiv(iBuilder->CreateSub(phiMatchPos, previousProducedItemCount), SIZE_PDEP_WIDTH);
167    Value* matchCopyTargetBlockOffset = iBuilder->CreateURem(phiMatchPos, SIZE_PDEP_WIDTH);
168
169
170    Value* matchCopyFromRemain = iBuilder->CreateSub(SIZE_PDEP_WIDTH, matchCopyFromBlockOffset);
171    Value* matchCopyTargetRemain = iBuilder->CreateSub(SIZE_PDEP_WIDTH, matchCopyTargetBlockOffset);
172
173    Value* currentCopySize = iBuilder->CreateUMin(matchCopyFromRemain, matchCopyTargetRemain);
174    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchOffset);
175    currentCopySize = iBuilder->CreateUMin(currentCopySize, phiMatchLength);
176    currentCopySize = iBuilder->CreateUMin(currentCopySize, iBuilder->CreateSub(newProducedItemCount, phiMatchPos));
177    currentCopySize = iBuilder->CreateSelect(iBuilder->CreateICmpEQ(currentCopySize, SIZE_ZERO), SIZE_ONE, currentCopySize); //Workaround for the last byte
178    Value* singleMask = iBuilder->CreateSub(
179            iBuilder->CreateSelect( // When currentCopySize == SIZE_PDEP_WIDTH, shl will cause overflow
180                    iBuilder->CreateICmpEQ(currentCopySize, SIZE_PDEP_WIDTH),
181                    SIZE_ZERO,
182                    iBuilder->CreateShl(SIZE_ONE, iBuilder->CreateAdd(matchCopyFromBlockOffset, currentCopySize))
183            ),
184            iBuilder->CreateShl(SIZE_ONE, matchCopyFromBlockOffset)
185    );
186    Value* fullMask = iBuilder->simd_fill(mPDEPWidth, singleMask);
187
188//    iBuilder->CallPrintInt("phiMatchPos", phiMatchPos);
189//    iBuilder->CallPrintInt("currentCopySize", currentCopySize);
190//    iBuilder->CallPrintInt("aaa", iBuilder->CreateShl(SIZE_ONE, iBuilder->CreateAdd(matchCopyFromBlockOffset, currentCopySize)));
191//    iBuilder->CallPrintRegister("fullMask", fullMask);
192
193    for (int i = 0; i < mStreamSize; i++) {
194        Value* rawOutputBasePtr = iBuilder->getRawOutputPointer("outputStreamSet" + std::to_string(i), SIZE_ZERO);
195        rawOutputBasePtr = iBuilder->CreatePointerCast(rawOutputBasePtr, iBuilder->getBitBlockType()->getPointerTo());
196        Value* matchCopyFromBlockPtr = iBuilder->CreateGEP(rawOutputBasePtr, matchCopyFromBlockIndex);
197
198        Value* fromBlockValue = iBuilder->CreateLoad(matchCopyFromBlockPtr);
199
200        Value* copiedValue = iBuilder->simd_and(fromBlockValue, fullMask);
201
202        Value* outputBlockBasePtr = iBuilder->CreatePointerCast(iBuilder->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), SIZE_ZERO), iBuilder->getBitBlockType()->getPointerTo());
203        Value* outputTargetBlockPtr = iBuilder->CreateGEP(outputBlockBasePtr, matchCopyTargetBlockIndex);
204        Value* targetOriginalValue = iBuilder->CreateLoad(outputTargetBlockPtr);
205
206        Value* finalValue = iBuilder->simd_or(
207                targetOriginalValue,
208                iBuilder->CreateShl(
209                        iBuilder->CreateLShr(
210                                copiedValue,
211                                iBuilder->simd_fill(mPDEPWidth, matchCopyFromBlockOffset)
212                        ),
213                        iBuilder->simd_fill(mPDEPWidth, matchCopyTargetBlockOffset)
214                )
215        );
216
217
218//        iBuilder->CallPrintRegister("targetOriginalValue", targetOriginalValue);
219//        iBuilder->CallPrintRegister("finalValue", finalValue);
220//        iBuilder->CallPrintInt("matchCopyTargetBlockOffset", matchCopyTargetBlockOffset);
221//        iBuilder->CallPrintInt("currentCopySize", currentCopySize);
222        iBuilder->CreateStore(finalValue, outputTargetBlockPtr);
223    }
224
225    phiProcessIndex->addIncoming(phiProcessIndex, iBuilder->GetInsertBlock());
226    phiMatchOffset->addIncoming(phiMatchOffset, iBuilder->GetInsertBlock());
227    phiMatchPos->addIncoming(iBuilder->CreateAdd(phiMatchPos, currentCopySize), iBuilder->GetInsertBlock());
228    phiMatchLength->addIncoming(iBuilder->CreateSub(phiMatchLength, currentCopySize), iBuilder->GetInsertBlock());
229
230    iBuilder->CreateBr(matchCopyLoopCon);
231
232    iBuilder->SetInsertPoint(exitBlock);
233//    iBuilder->CallPrintInt("test", SIZE_ZERO);
234    iBuilder->setScalarField("pendingMatchOffset", phiMatchOffset);
235    iBuilder->setScalarField("pendingMatchLength", phiMatchLength);
236    iBuilder->setScalarField("pendingMatchPos", phiMatchPos);
237//    iBuilder->CallPrintInt("pendingMatchLength", phiMatchLength);
238    iBuilder->setProcessedItemCount("m0Start", phiProcessIndex);
239    iBuilder->setProcessedItemCount("m0End", phiProcessIndex);
240    iBuilder->setProcessedItemCount("matchOffset", phiProcessIndex);
241}
242
243void LZ4SwizzledMatchCopyKernel::generateOutputCopy(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value* outputBlocks) {
244    Value *SIZE_ZERO = iBuilder->getSize(0);
245    Value *SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
246    Type* bytePtrType = iBuilder->getInt8PtrTy();
247
248    Value *previousProcessed = iBuilder->getProcessedItemCount("sourceStreamSet0");
249
250
251    Value *itemsToDo = mAvailableItemCount[3];
252//    iBuilder->CallPrintInt("swizzledMatchCopy:itemsToDo", itemsToDo);
253    Value *copySize = iBuilder->CreateMul(outputBlocks, SIZE_BIT_BLOCK_WIDTH);
254//    iBuilder->CallPrintInt("swizzledMatchCopy:copySize", copySize);
255    Value* actualCopySize = iBuilder->CreateUMin(itemsToDo, copySize);
256    Value* copyByte = iBuilder->CreateUDivCeil(iBuilder->CreateMul(copySize, iBuilder->getSize(mStreamCount)), iBuilder->getSize(8)); // i8
257
258
259    for (int i = 0; i < mStreamSize; i++) {
260        Value *inputBasePtr = iBuilder->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(i), SIZE_ZERO);
261        Value *outputBasePtr = iBuilder->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), SIZE_ZERO);
262        iBuilder->CreateMemCpy(
263                iBuilder->CreatePointerCast(outputBasePtr, bytePtrType),
264                iBuilder->CreatePointerCast(inputBasePtr, bytePtrType),
265                copyByte,
266                1 // Not align guaranteed in final block
267        );
268    }
269    Value *newProcessed = iBuilder->CreateAdd(previousProcessed, actualCopySize);
270    iBuilder->setProcessedItemCount("sourceStreamSet0", newProcessed);
271//    iBuilder->CallPrintInt("swizzledMatchCopy:newProcessed", newProcessed);
272    iBuilder->setProducedItemCount("outputStreamSet0", newProcessed);
273}
274
275Value* LZ4SwizzledMatchCopyKernel::getMaximumMatchCopyBlock(const std::unique_ptr<KernelBuilder> &iBuilder) {
276    Value *SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
277    Value *SIZE_ZERO = iBuilder->getSize(0);
278    Value *SIZE_ONE = iBuilder->getSize(1);
279    Value *m0EndInitOffset = iBuilder->CreateURem(iBuilder->getProcessedItemCount("m0End"), SIZE_BIT_BLOCK_WIDTH);
280    Value *m0EndItemsToDo = mAvailableItemCount[1];
281    Value *m0EndBasePtr = iBuilder->getInputStreamBlockPtr("m0End", SIZE_ZERO);
282    m0EndBasePtr = iBuilder->CreatePointerCast(m0EndBasePtr, iBuilder->getInt64Ty()->getPointerTo());
283    Value *lastM0 = iBuilder->CreateLoad(
284            iBuilder->CreateGEP(
285                    m0EndBasePtr,
286                    iBuilder->CreateSub(
287                            iBuilder->CreateAdd(m0EndInitOffset, m0EndItemsToDo),
288                            SIZE_ONE
289                    )
290
291            )
292    );
293    Value *lastDepositPosition = iBuilder->CreateAdd(lastM0, SIZE_ONE);
294
295    Value *currentMaxBlock = iBuilder->CreateSelect(
296            this->mIsFinalBlock,
297            iBuilder->CreateUDivCeil(lastDepositPosition, SIZE_BIT_BLOCK_WIDTH),
298            iBuilder->CreateUDiv(lastDepositPosition, SIZE_BIT_BLOCK_WIDTH)
299    );
300
301    // Produced Item Count will always be full bitblock except for final block
302    Value *previousProducedBlocks = iBuilder->CreateUDiv(
303            iBuilder->getProducedItemCount("outputStreamSet0"),
304            SIZE_BIT_BLOCK_WIDTH
305    );
306
307    // (m0 + 1) / BitBlockWidth - produceItemCount / BitBlockWidth
308    return iBuilder->CreateSub(currentMaxBlock, previousProducedBlocks);
309}
310
311LZ4SwizzledMatchCopyKernel::LZ4SwizzledMatchCopyKernel(const std::unique_ptr<kernel::KernelBuilder> &iBuilder, unsigned streamCount/*=4*/, unsigned streamSize/*=2*/, unsigned swizzleFactor/*=4*/, unsigned PDEP_width/*64*/)
312        : MultiBlockKernel("LZ4SwizzledMatchCopyKernel",
313        // Inputs
314                           {
315                                   //TODO add swizzled attribute
316                                   Binding{iBuilder->getStreamSetTy(1, 64), "m0Start", BoundedRate(0, 1), AlwaysConsume()},
317                                   Binding{iBuilder->getStreamSetTy(1, 64), "m0End", BoundedRate(0, 1), AlwaysConsume()},
318                                   Binding{iBuilder->getStreamSetTy(1, 64), "matchOffset", BoundedRate(0, 1), AlwaysConsume()},
319
320                           },
321        // Outputs
322                           {},
323        // Arguments
324                           {},
325                           {},
326                           {
327                                   Binding{iBuilder->getSizeTy(), "currentProcessIndex"},
328                                   Binding{iBuilder->getSizeTy(), "pendingMatchPos"},
329                                   Binding{iBuilder->getSizeTy(), "pendingMatchOffset"},
330                                   Binding{iBuilder->getSizeTy(), "pendingMatchLength"},
331                           })
332        , mSwizzleFactor(swizzleFactor)
333        , mPDEPWidth(PDEP_width)
334        , mStreamSize(streamSize)
335        , mStreamCount(streamCount) {
336
337    assert((mSwizzleFactor == (iBuilder->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
338    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
339
340    addAttribute(MustExplicitlyTerminate());
341
342
343    mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "sourceStreamSet0", BoundedRate(0, 1), {AlwaysConsume(), Swizzled()}});
344    mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet0", BoundedRate(0, 1)});
345
346    for (int i = 1; i < streamSize; i++) {
347        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "sourceStreamSet" + std::to_string(i), RateEqualTo("sourceStreamSet0"), {AlwaysConsume(), Swizzled()}});
348        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet" + std::to_string(i), RateEqualTo("outputStreamSet0")});
349    }
350}
Note: See TracBrowser for help on using the repository browser.