source: icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_index_builder.cpp @ 6022

Last change on this file since 6022 was 6022, checked in by xwa163, 12 months ago

Performance improvement for M0 marker bit output logic in LZ4IndexBuilderKernel

File size: 33.6 KB
Line 
1
2#include "lz4_index_builder.h"
3
4
5#include <kernels/kernel_builder.h>
6#include <iostream>
7#include <string>
8#include <llvm/Support/raw_ostream.h>
9#include <kernels/streamset.h>
10
11using namespace llvm;
12using namespace kernel;
13using namespace std;
14
15namespace kernel{
16
17    LZ4IndexBuilderKernel::LZ4IndexBuilderKernel(const std::unique_ptr<kernel::KernelBuilder> &b)
18    : SegmentOrientedKernel("LZ4IndexBuilderKernel",
19    // Inputs
20    {
21           Binding{b->getStreamSetTy(1, 8), "byteStream", BoundedRate(0, 1)},
22           Binding{b->getStreamSetTy(1, 1), "extender", RateEqualTo("byteStream")},
23
24           // block data
25           Binding{b->getStreamSetTy(1, 1), "isCompressed", BoundedRate(0, 1), AlwaysConsume()},
26           Binding{b->getStreamSetTy(1, 64), "blockStart", RateEqualTo("isCompressed"), AlwaysConsume()},
27           Binding{b->getStreamSetTy(1, 64), "blockEnd", RateEqualTo("isCompressed"), AlwaysConsume()}
28
29    },
30    //Outputs
31    {
32           // Uncompressed_data
33           Binding{b->getStreamSetTy(1, 64), "uncompressedStartPos",
34                   BoundedRate(0, 1)},
35           Binding{b->getStreamSetTy(1, 64), "uncompressedLength",
36                   BoundedRate(0, 1)},
37           Binding{b->getStreamSetTy(1, 64), "uncompressedOutputPos",
38                   BoundedRate(0, 1)},
39
40           Binding{b->getStreamSetTy(1, 1), "deletionMarker", BoundedRate(0, 1)},
41           Binding{b->getStreamSetTy(1, 1), "M0Marker", BoundedRate(0, 1)},
42           Binding{b->getStreamSetTy(1, 1), "MatchOffsetMarker", RateEqualTo("byteStream")}
43    },
44    //Arguments
45    {
46           Binding{b->getSizeTy(), "fileSize"}
47    },
48    {},
49    //Internal states:
50    {
51           Binding{b->getSizeTy(), "blockDataIndex"},
52           Binding{b->getInt64Ty(), "m0OutputPos"},
53           Binding{b->getInt64Ty(), "compressedSpaceClearPos"},
54
55           // For M0 output
56           Binding{b->getIntNTy(64), "pendingM0StartBits"},
57           Binding{b->getIntNTy(64), "pendingM0EndBits"},
58           Binding{b->getIntNTy(64), "pendingM0CarryBit"},
59           Binding{b->getInt64Ty(), "pendingM0Index"},
60
61
62    }) {
63        this->setStride(4 * 1024 * 1024);
64        addAttribute(MustExplicitlyTerminate());
65    }
66
67    void LZ4IndexBuilderKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> &b) {
68        BasicBlock* exitBlock = b->CreateBasicBlock("exitBlock");
69        BasicBlock* blockEndConBlock = b->CreateBasicBlock("blockEndConBlock");
70
71        Value * blockDataIndex = b->getScalarField("blockDataIndex");
72
73        // In MultiblockKernel, availableItemCount + processedItemCount == producedItemCount from previous kernel
74        // While in SegmentOrigentedKernel, availableItemCount == producedItemCount from previous kernel
75        Value * totalNumber = b->getAvailableItemCount("blockEnd");
76        Value * totalExtender = b->getAvailableItemCount("extender");
77
78        Value * blockEnd = this->generateLoadInt64NumberInput(b, "blockEnd", blockDataIndex);
79
80        b->CreateCondBr(b->CreateICmpULT(blockDataIndex, totalNumber), blockEndConBlock, exitBlock);
81
82        b->SetInsertPoint(blockEndConBlock);
83        Value * blockStart = this->generateLoadInt64NumberInput(b, "blockStart", blockDataIndex);
84        BasicBlock * processBlock = b->CreateBasicBlock("processBlock");
85        b->CreateCondBr(b->CreateICmpULE(blockEnd, totalExtender), processBlock, exitBlock);
86
87        b->SetInsertPoint(processBlock);
88
89        //TODO handle uncompressed block
90
91        this->generateProcessCompressedBlock(b, blockStart, blockEnd);
92        this->storePendingM0(b);
93        Value * newBlockDataIndex = b->CreateAdd(blockDataIndex, b->getInt64(1));
94        b->setScalarField("blockDataIndex", newBlockDataIndex);
95        b->setProcessedItemCount("isCompressed", newBlockDataIndex);
96//        b->setProcessedItemCount("blockEnd", newBlockDataIndex);
97//        b->setProcessedItemCount("blockStart", newBlockDataIndex);
98
99        b->setProcessedItemCount("byteStream", blockEnd);
100        b->CreateBr(exitBlock);
101
102        b->SetInsertPoint(exitBlock);
103    }
104
105    Value* LZ4IndexBuilderKernel::processLiteral(const std::unique_ptr<KernelBuilder> &b, Value* token, Value* tokenPos, Value* blockEnd) {
106//        b->CallPrintInt("blockEnd", blockEnd);
107        BasicBlock* entryBlock = b->GetInsertBlock();
108
109        Value * extendedLiteralValue = b->CreateICmpEQ(b->CreateAnd(token, b->getInt8(0xf0)), b->getInt8(0xf0));
110
111        BasicBlock* extendLiteralLengthCon = b->CreateBasicBlock("block_data_loop_handle_compressed_block_extend_literal_length_con");
112        BasicBlock* extendLiteralLengthBody = b->CreateBasicBlock("block_data_loop_handle_compressed_block_extend_literal_length_body");
113        BasicBlock* extendLiteralLengthExit = b->CreateBasicBlock("block_data_loop_handle_compressed_block_extend_literal_length_exit");
114
115        b->CreateCondBr(extendedLiteralValue, extendLiteralLengthCon, extendLiteralLengthExit);
116
117        b->SetInsertPoint(extendLiteralLengthCon);
118
119        Value * const nextTokenPos = b->CreateAdd(tokenPos, b->getInt64(1));
120        Value * const nextToken = b->CreateLoad(b->getRawInputPointer("byteStream", nextTokenPos));
121        Value * const isExitToken = b->CreateICmpNE(nextToken, b->getInt8(0xff));
122        b->CreateLikelyCondBr(isExitToken, extendLiteralLengthExit, extendLiteralLengthBody);
123
124
125        b->SetInsertPoint(extendLiteralLengthBody);
126        Value* newCursorPos2 = this->advanceUntilNextZero(b, "extender", b->CreateAdd(tokenPos, b->getInt64(1)), blockEnd);
127        BasicBlock* advanceFinishBlock = b->GetInsertBlock();
128
129
130        b->CreateBr(extendLiteralLengthExit);
131
132        b->SetInsertPoint(extendLiteralLengthExit);
133        PHINode* phiCursorPosAfterLiteral = b->CreatePHI(b->getInt64Ty(), 3);
134        phiCursorPosAfterLiteral->addIncoming(nextTokenPos, extendLiteralLengthCon);
135        phiCursorPosAfterLiteral->addIncoming(newCursorPos2, advanceFinishBlock);
136        phiCursorPosAfterLiteral->addIncoming(tokenPos, entryBlock);
137
138        Value * literalExtensionSize = b->CreateSub(phiCursorPosAfterLiteral, tokenPos);
139        Value * finalLengthByte = this->generateLoadSourceInputByte(b, phiCursorPosAfterLiteral);
140        finalLengthByte = b->CreateZExt(finalLengthByte, b->getInt64Ty());
141        Value * literalLengthExtendValue = b->CreateSelect(
142                b->CreateICmpUGT(literalExtensionSize, b->getSize(0)),
143                b->CreateAdd(
144                        b->CreateMul(
145                                b->CreateSub(literalExtensionSize, b->getSize(1)),
146                                b->getSize(255)
147                        ),
148                        finalLengthByte
149                ),
150                b->getSize(0)
151        );
152        literalLengthExtendValue = b->CreateZExt(literalLengthExtendValue, b->getInt64Ty());
153        Value* literalLengthBase = b->CreateLShr(b->CreateZExt(token, b->getInt64Ty()), b->getInt64(4));
154        Value* literalLength = b->CreateAdd(literalLengthBase, literalLengthExtendValue);
155
156        Value* offsetPos = b->CreateAdd(
157                b->CreateAdd(
158                        phiCursorPosAfterLiteral,
159                        literalLength),
160                b->getSize(1));
161
162        this->setCircularOutputBitstream(b, "deletionMarker", b->getProducedItemCount("deletionMarker"), b->CreateAdd(phiCursorPosAfterLiteral, b->getSize(1)));
163
164        b->setProducedItemCount("deletionMarker", offsetPos);
165        this->increaseScalarField(b, "m0OutputPos", literalLength); //TODO m0OutputPos may be removed from scalar fields
166        return offsetPos;
167    }
168
169    Value* LZ4IndexBuilderKernel::processMatch(const std::unique_ptr<KernelBuilder> &iBuilder, Value* offsetPos, Value* token, Value* blockEnd) {
170        Constant* INT64_ONE = iBuilder->getInt64(1);
171
172        BasicBlock* entryBlock = iBuilder->GetInsertBlock();
173
174        Value* extendMatchStartPos = iBuilder->CreateAdd(offsetPos, INT64_ONE);
175        Value* extendedMatchValue = iBuilder->CreateICmpEQ(iBuilder->CreateAnd(token, iBuilder->getInt8(0xf)), iBuilder->getInt8(0xf));
176
177        BasicBlock* extendMatchBodyBlock = iBuilder->CreateBasicBlock("block_data_loop_handle_compressed_block_loop_extend_match_body");
178        BasicBlock* extendMatchExitBlock = iBuilder->CreateBasicBlock("block_data_loop_handle_compressed_block_loop_extend_match_exit");
179
180        iBuilder->CreateCondBr(extendedMatchValue, extendMatchBodyBlock, extendMatchExitBlock);
181
182        iBuilder->SetInsertPoint(extendMatchBodyBlock);
183
184        //ExtendMatchBodyBlock
185        Value* newCursorPos = this->advanceUntilNextZero(iBuilder, "extender", iBuilder->CreateAdd(extendMatchStartPos, INT64_ONE), blockEnd);
186        BasicBlock* advanceFinishBlock = iBuilder->GetInsertBlock();
187
188        iBuilder->CreateBr(extendMatchExitBlock);
189
190        //ExtendMatchExitBlock
191        iBuilder->SetInsertPoint(extendMatchExitBlock);
192        PHINode* phiCursorPosAfterMatch = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
193        phiCursorPosAfterMatch->addIncoming(newCursorPos, advanceFinishBlock);
194        phiCursorPosAfterMatch->addIncoming(extendMatchStartPos, entryBlock);
195
196        Value* oldMatchExtensionSize = iBuilder->CreateSub(phiCursorPosAfterMatch, extendMatchStartPos);
197//        extendedMatchValue = iBuilder->CreateICmpEQ(iBuilder->CreateAnd(token, iBuilder->getInt8(0xf)), iBuilder->getInt8(0xf));
198        Value* matchExtensionSize = iBuilder->CreateSelect(extendedMatchValue, oldMatchExtensionSize, iBuilder->getSize(0));
199        Value* matchLengthBase = iBuilder->CreateZExt(iBuilder->CreateAnd(token, iBuilder->getInt8(0x0f)), iBuilder->getInt64Ty());
200        Value* matchLength = iBuilder->CreateAdd(matchLengthBase, iBuilder->getInt64(4));
201
202
203        Value* extensionLastBitPos = iBuilder->CreateAdd(offsetPos, iBuilder->getSize(1));
204        extensionLastBitPos = iBuilder->CreateAdd(extensionLastBitPos, matchExtensionSize);
205
206        Value* extensionLastBitValue = this->generateLoadSourceInputByte(iBuilder, extensionLastBitPos);
207        extensionLastBitValue = iBuilder->CreateZExt(extensionLastBitValue, iBuilder->getSizeTy());
208
209
210        Value* matchLengthAddValue = iBuilder->CreateSelect(
211                iBuilder->CreateICmpUGT(matchExtensionSize, iBuilder->getSize(0)),
212                iBuilder->CreateAdd(
213                        iBuilder->CreateMul(
214                                iBuilder->CreateSub(matchExtensionSize, iBuilder->getSize(1)),
215                                iBuilder->getSize(255)
216                        ),
217                        extensionLastBitValue
218                )
219                ,
220                iBuilder->getSize(0)
221        );
222        matchLengthAddValue = iBuilder->CreateZExt(matchLengthAddValue, iBuilder->getInt64Ty());
223
224        matchLength = iBuilder->CreateAdd(matchLength, matchLengthAddValue);
225
226        Value* outputPos = iBuilder->getScalarField("m0OutputPos");
227
228        Value* outputEndPos = iBuilder->CreateSub(
229                iBuilder->CreateAdd(outputPos, matchLength),
230                iBuilder->getInt64(1)
231        );
232
233
234
235        this->markCircularOutputBitstream(iBuilder, "MatchOffsetMarker", offsetPos);
236        this->increaseScalarField(iBuilder, "m0OutputPos", matchLength);
237//        this->setCircularOutputBitstream(iBuilder, "M0Marker", outputPos, outputEndPos);
238        this->appendM0Output(iBuilder, outputPos, outputEndPos);
239
240        return iBuilder->CreateAdd(phiCursorPosAfterMatch, INT64_ONE);
241    }
242
243    void LZ4IndexBuilderKernel::generateProcessCompressedBlock(const std::unique_ptr<KernelBuilder> &iBuilder, Value* blockStart, Value* blockEnd) {
244        Value* clearPos = iBuilder->getScalarField("compressedSpaceClearPos");
245        // We can not only clear [blockStart, blockEnd), since there are 4 bytes between blockEnd and nextBlockStart
246        this->clearCircularOutputBitstream(iBuilder, "deletionMarker", clearPos, blockEnd);
247        this->clearCircularOutputBitstream(iBuilder, "MatchOffsetMarker", clearPos, blockEnd);
248        iBuilder->setScalarField("compressedSpaceClearPos", blockEnd);
249
250        BasicBlock* entryBlock = iBuilder->GetInsertBlock();
251
252//        Value* m0OutputBlockPtr = iBuilder->getOutputStreamBlockPtr("M0Marker", iBuilder->getSize(0));
253//        iBuilder->CreateMemSet(m0OutputBlockPtr, iBuilder->getInt8(0), 4 * 1024 * 1024 / 8, true);
254
255
256        Value* isTerminal = iBuilder->CreateICmpEQ(blockEnd, iBuilder->getScalarField("fileSize"));
257        iBuilder->setTerminationSignal(isTerminal);
258
259        //TODO use memset to clear output buffer for extract marker
260
261        BasicBlock* exitBlock = iBuilder->CreateBasicBlock("processCompressedExitBlock");
262
263        BasicBlock* processCon = iBuilder->CreateBasicBlock("processCompressedConBlock");
264        BasicBlock* processBody = iBuilder->CreateBasicBlock("processCompressedBodyBlock");
265
266        iBuilder->CreateBr(processCon);
267        iBuilder->SetInsertPoint(processCon);
268
269        PHINode* phiCursorValue = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 3); // phiCursorValue should always be the position of next token except for the final sequence
270        phiCursorValue->addIncoming(blockStart, entryBlock);
271
272        iBuilder->CreateCondBr(iBuilder->CreateICmpULT(phiCursorValue, blockEnd), processBody, exitBlock);
273
274        // Process Body
275        iBuilder->SetInsertPoint(processBody);
276
277        //TODO add acceleration here
278        Value* token = this->generateLoadSourceInputByte(iBuilder, phiCursorValue);
279        // Process Literal
280        BasicBlock* processLiteralBlock = iBuilder->CreateBasicBlock("processLiteralBlock");
281        iBuilder->CreateBr(processLiteralBlock);
282        iBuilder->SetInsertPoint(processLiteralBlock);
283
284        Value* offsetPos = this->processLiteral(iBuilder, token, phiCursorValue, blockEnd);
285        // Process Match
286        BasicBlock* handleM0BodyBlock = iBuilder->CreateBasicBlock("block_data_loop_handle_compressed_block_loop_handle_m0_body");
287        BasicBlock* handleM0ElseBlock = iBuilder->CreateBasicBlock("block_data_loop_handle_compressed_block_loop_handle_m0_else");
288
289        iBuilder->CreateCondBr(
290                iBuilder->CreateICmpULT(offsetPos, blockEnd),
291                handleM0BodyBlock,
292                handleM0ElseBlock
293        );
294
295        // HandleM0Body
296        iBuilder->SetInsertPoint(handleM0BodyBlock);
297        Value* nextTokenPos = this->processMatch(iBuilder, offsetPos, token, blockEnd);
298        phiCursorValue->addIncoming(nextTokenPos, iBuilder->GetInsertBlock());
299
300        iBuilder->CreateBr(processCon);
301
302
303        // HandleM0Else
304        iBuilder->SetInsertPoint(handleM0ElseBlock);
305
306        phiCursorValue->addIncoming(offsetPos, handleM0ElseBlock);
307        // Store final M0 pos to make sure the bit stream will be long enough
308        Value* finalM0OutputPos = iBuilder->getScalarField("m0OutputPos");
309        iBuilder->setProducedItemCount("M0Marker", finalM0OutputPos);
310        // finalM0OutputPos should always be 4MB * n except for the final block
311
312        iBuilder->CreateBr(processCon);
313
314
315        iBuilder->SetInsertPoint(exitBlock);
316    }
317
318    Value * LZ4IndexBuilderKernel::advanceUntilNextZero(const unique_ptr<KernelBuilder> &iBuilder, string inputName, Value * startPos, Value * maxPos) {
319
320        Constant* SIZE_64 = iBuilder->getSize(64);
321
322        BasicBlock* entryBlock = iBuilder->GetInsertBlock();
323
324        BasicBlock* advanceConBlock = iBuilder->CreateBasicBlock("advanceConBlock");
325        BasicBlock* advanceBodyBlock = iBuilder->CreateBasicBlock("advanceBodyBlock");
326        BasicBlock* advanceExitBlock = iBuilder->CreateBasicBlock("advanceExitBlock");
327
328        iBuilder->CreateBr(advanceConBlock);
329        // TODO special handling for the first advance may have better performance
330        iBuilder->SetInsertPoint(advanceConBlock);
331
332        PHINode* phiCurrentPos = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 2);
333        phiCurrentPos->addIncoming(startPos, entryBlock);
334        PHINode* phiIsFinish = iBuilder->CreatePHI(iBuilder->getInt1Ty(), 2);
335        phiIsFinish->addIncoming(iBuilder->getInt1(false), entryBlock);
336        iBuilder->CreateCondBr(iBuilder->CreateNot(phiIsFinish), advanceBodyBlock, advanceExitBlock);
337
338        iBuilder->SetInsertPoint(advanceBodyBlock);
339
340        Value * currentBlockGlobalPos = iBuilder->CreateUDiv(phiCurrentPos, SIZE_64);
341        Value * currentBlockLocalPos = iBuilder->CreateURem(currentBlockGlobalPos, iBuilder->getSize(this->getAnyStreamSetBuffer(inputName)->getBufferBlocks() * iBuilder->getBitBlockWidth() / 64));
342        Value * currentPosBitBlockOffset = iBuilder->CreateURem(phiCurrentPos, SIZE_64);
343
344        Value * ptr = iBuilder->CreatePointerCast(iBuilder->getRawInputPointer(inputName, iBuilder->getSize(0)), iBuilder->getInt64Ty()->getPointerTo());
345        Value * currentBitValue = iBuilder->CreateLoad(iBuilder->CreateGEP(ptr, currentBlockLocalPos));
346
347        currentBitValue = iBuilder->CreateLShr(currentBitValue, currentPosBitBlockOffset);
348        currentBitValue = iBuilder->CreateNot(currentBitValue);
349
350        Value * forwardZeroCount = iBuilder->CreateTrunc(iBuilder->CreateCountForwardZeroes(currentBitValue), iBuilder->getInt64Ty());
351        Value * newOffset = iBuilder->CreateAdd(currentPosBitBlockOffset, forwardZeroCount);
352        newOffset = iBuilder->CreateUMin(newOffset, iBuilder->getSize(64));
353
354        Value * actualAdvanceValue = iBuilder->CreateSub(newOffset, currentPosBitBlockOffset);
355        Value * newPos = iBuilder->CreateAdd(phiCurrentPos, actualAdvanceValue);
356        if (maxPos) {
357            newPos = iBuilder->CreateUMin(maxPos, newPos);
358            actualAdvanceValue = iBuilder->CreateSub(newPos, phiCurrentPos);
359            newOffset = iBuilder->CreateAdd(currentPosBitBlockOffset, actualAdvanceValue);
360        }
361
362        phiIsFinish->addIncoming(iBuilder->CreateICmpNE(newOffset, iBuilder->getSize(64)), iBuilder->GetInsertBlock());
363        phiCurrentPos->addIncoming(newPos, iBuilder->GetInsertBlock());
364        iBuilder->CreateBr(advanceConBlock);
365
366        iBuilder->SetInsertPoint(advanceExitBlock);
367        return phiCurrentPos;
368    }
369
370    Value * LZ4IndexBuilderKernel::generateLoadInt64NumberInput(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName, Value * globalOffset) {
371//        Constant* SIZE_STRIDE_SIZE = iBuilder->getSize(getStride());
372        Constant* SIZE_STRIDE_SIZE = iBuilder->getSize(this->getInputStreamSetBuffer(inputBufferName)->getBufferBlocks() * iBuilder->getBitBlockWidth());
373        Value * processed = iBuilder->getProcessedItemCount(inputBufferName);
374        processed = iBuilder->CreateAnd(processed, ConstantExpr::getNeg(SIZE_STRIDE_SIZE));
375        Value * offset = iBuilder->CreateSub(globalOffset, processed);
376        Value * valuePtr = iBuilder->getRawInputPointer(inputBufferName, offset);
377        return iBuilder->CreateLoad(valuePtr);
378    }
379
380    Value *LZ4IndexBuilderKernel::generateLoadSourceInputByte(const std::unique_ptr<KernelBuilder> &iBuilder, Value * offset) {
381        Value * ptr = iBuilder->getRawInputPointer("byteStream", offset);
382        return iBuilder->CreateLoad(ptr);
383    }
384
385    void LZ4IndexBuilderKernel::increaseScalarField(const unique_ptr<KernelBuilder> &iBuilder, const string &fieldName, Value *value) {
386        Value *fieldValue = iBuilder->getScalarField(fieldName);
387        fieldValue = iBuilder->CreateAdd(fieldValue, value);
388        iBuilder->setScalarField(fieldName, fieldValue);
389    }
390
391
392    void LZ4IndexBuilderKernel::clearCircularOutputBitstream(const std::unique_ptr<KernelBuilder> &iBuilder,
393                                                             const std::string &bitstreamName,
394                                                             llvm::Value *start, llvm::Value *end) {
395        //TODO currently we assume that start/end pos is not in the same byte because of the requirement of the LZ4 format
396        Value* SIZE_0 = iBuilder->getSize(0);
397        Value* SIZE_8 = iBuilder->getSize(8);
398        Value* INT8_0 = iBuilder->getInt8(0);
399        Type* INT8_PTR_TY = iBuilder->getInt8PtrTy();
400
401        Value* outputBufferBytes = iBuilder->CreateUDiv(iBuilder->getSize(this->getAnyStreamSetBuffer(bitstreamName)->getBufferBlocks() * iBuilder->getBitBlockWidth()), SIZE_8);
402        Value* rawOutputPtr = iBuilder->CreatePointerCast(iBuilder->getRawOutputPointer(bitstreamName, SIZE_0), INT8_PTR_TY);
403
404        Value* startRemain = iBuilder->CreateURem(start, SIZE_8);
405        Value* startBytePos = iBuilder->CreateUDiv(start, SIZE_8);
406        Value* endRemain = iBuilder->CreateURem(end, SIZE_8);
407        Value* endBytePos = iBuilder->CreateUDiv(end, SIZE_8);
408
409        BasicBlock* startByteCpyBlock = iBuilder->CreateBasicBlock("startByteCpyBlock");
410        BasicBlock* endByteCpyConBlock = iBuilder->CreateBasicBlock("endByteCpyConBlock");
411        BasicBlock* endByteCpyBlock = iBuilder->CreateBasicBlock("endByteCpyBlock");
412        BasicBlock* memsetBlock = iBuilder->CreateBasicBlock("memsetBlock");
413
414        iBuilder->CreateCondBr(iBuilder->CreateICmpNE(startRemain, SIZE_0), startByteCpyBlock, endByteCpyConBlock);
415
416        // Clear highest {startShiftAmount} bits
417        iBuilder->SetInsertPoint(startByteCpyBlock);
418        Value* startPtr = iBuilder->CreateGEP(rawOutputPtr, iBuilder->CreateURem(startBytePos, outputBufferBytes));
419        Value* startValue = iBuilder->CreateLoad(startPtr);
420
421        Value* startShiftAmount = iBuilder->CreateSub(SIZE_8, startRemain);
422        startShiftAmount = iBuilder->CreateZExtOrTrunc(startShiftAmount, startValue->getType());
423        startValue = iBuilder->CreateLShr(iBuilder->CreateShl(startValue, startShiftAmount), startShiftAmount);
424
425        iBuilder->CreateStore(startValue, startPtr);
426        iBuilder->CreateBr(endByteCpyConBlock);
427
428        iBuilder->SetInsertPoint(endByteCpyConBlock);
429        iBuilder->CreateCondBr(iBuilder->CreateICmpNE(endBytePos, SIZE_0), endByteCpyBlock, memsetBlock);
430
431        // Clear lowest {endRemain} bits
432        iBuilder->SetInsertPoint(endByteCpyBlock);
433        Value* endPtr = iBuilder->CreateGEP(rawOutputPtr, iBuilder->CreateURem(endBytePos, outputBufferBytes));
434        Value* endValue = iBuilder->CreateLoad(endPtr);
435        endRemain = iBuilder->CreateZExtOrTrunc(endRemain, endValue->getType());
436        endValue = iBuilder->CreateShl(iBuilder->CreateLShr(endValue, endRemain), endRemain);
437        iBuilder->CreateStore(endValue, endPtr);
438        iBuilder->CreateBr(memsetBlock);
439
440        iBuilder->SetInsertPoint(memsetBlock);
441        Value* memsetStartByte = iBuilder->CreateUDivCeil(start, SIZE_8);
442        Value* memsetEndByte = endBytePos;
443
444        Value* memsetSize = iBuilder->CreateSub(memsetEndByte, memsetStartByte);
445
446        memsetSize = iBuilder->CreateUMin(memsetSize, outputBufferBytes);
447        // We always assume that  (memsetEndByte - memsetStartByte) < outputBufferBytes
448
449        Value* memsetStartByteRem = iBuilder->CreateURem(memsetStartByte, outputBufferBytes);
450
451        Value* memsetSize1 = iBuilder->CreateUMin(iBuilder->CreateSub(outputBufferBytes, memsetStartByteRem), memsetSize);
452        Value* memsetSize2 = iBuilder->CreateSub(memsetSize, memsetSize1);
453
454        iBuilder->CreateMemSet(iBuilder->CreateGEP(rawOutputPtr, memsetStartByteRem), INT8_0, memsetSize1, true);
455        iBuilder->CreateMemSet(rawOutputPtr, INT8_0, memsetSize2, true);
456    }
457
458    void LZ4IndexBuilderKernel::setCircularOutputBitstream(const std::unique_ptr<KernelBuilder> &iBuilder,
459                                                             const std::string &bitstreamName,
460                                                             llvm::Value *start, llvm::Value *end) {
461        BasicBlock* exitBlock = iBuilder->CreateBasicBlock("exitBlock");
462
463        Value* SIZE_0 = iBuilder->getSize(0);
464        Value* SIZE_1 = iBuilder->getSize(1);
465        Value* SIZE_8 = iBuilder->getSize(8);
466//        Value* INT8_0 = iBuilder->getInt8(0);
467//        Value* INT8_1 = iBuilder->getInt8(1);
468        Type* INT8_PTR_TY = iBuilder->getInt8PtrTy();
469
470        Value* outputBufferBytes = iBuilder->getSize(this->getAnyStreamSetBuffer(bitstreamName)->getBufferBlocks() * iBuilder->getBitBlockWidth() / 8);
471        Value* rawOutputPtr = iBuilder->CreatePointerCast(iBuilder->getRawOutputPointer(bitstreamName, SIZE_0), INT8_PTR_TY);
472
473        Value* startRemain = iBuilder->CreateURem(start, SIZE_8);
474        Value* startBytePos = iBuilder->CreateUDiv(start, SIZE_8);
475        Value* endRemain = iBuilder->CreateURem(end, SIZE_8);
476        Value* endBytePos = iBuilder->CreateUDiv(end, SIZE_8);
477        Value* startShiftAmount = iBuilder->CreateSub(SIZE_8, startRemain);
478
479        BasicBlock* shortSetBlock = iBuilder->CreateBasicBlock("shortSetBlock");
480        BasicBlock* longSetBlock = iBuilder->CreateBasicBlock("longSetBlock");
481
482//        iBuilder->CreateBr(startByteCpyBlock);
483        iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(startBytePos, endBytePos), shortSetBlock, longSetBlock);
484
485        // When startPos and endPos are in the same byte
486        iBuilder->SetInsertPoint(shortSetBlock);
487        Value* targetPtr = iBuilder->CreateGEP(rawOutputPtr, iBuilder->CreateURem(startBytePos, outputBufferBytes));
488        Value* targetValue = iBuilder->CreateLoad(targetPtr);
489        Value* rangeMask = iBuilder->CreateSub(iBuilder->CreateShl(SIZE_1, endRemain), iBuilder->CreateShl(SIZE_1, startRemain));
490        rangeMask = iBuilder->CreateZExtOrTrunc(rangeMask, targetValue->getType());
491        targetValue = iBuilder->CreateOr(rangeMask, targetValue);
492
493//        targetValue = iBuilder->CreateNot(iBuilder->CreateLShr(iBuilder->CreateShl(iBuilder->CreateNot(targetValue), startShiftAmount), startShiftAmount));
494//        targetValue = iBuilder->CreateShl(iBuilder->CreateLShr(targetValue, endRemain), endRemain);
495        iBuilder->CreateStore(targetValue, targetPtr);
496        iBuilder->CreateBr(exitBlock);
497
498        iBuilder->SetInsertPoint(longSetBlock);
499
500        BasicBlock* startByteCpyBlock = iBuilder->CreateBasicBlock("startByteCpyBlock");
501        BasicBlock* endByteCpyConBlock = iBuilder->CreateBasicBlock("endByteCpyConBlock");
502        BasicBlock* endByteCpyBlock = iBuilder->CreateBasicBlock("endByteCpyBlock");
503        BasicBlock* memsetBlock = iBuilder->CreateBasicBlock("memsetBlock");
504
505        iBuilder->CreateCondBr(iBuilder->CreateICmpNE(startRemain, SIZE_0), startByteCpyBlock, endByteCpyConBlock);
506        // Clear highest {startShiftAmount} bits
507        iBuilder->SetInsertPoint(startByteCpyBlock);
508        Value* startPtr = iBuilder->CreateGEP(rawOutputPtr, iBuilder->CreateURem(startBytePos, outputBufferBytes));
509        Value* startValue = iBuilder->CreateLoad(startPtr);
510
511        Value* startShiftAmount2 = iBuilder->CreateZExtOrTrunc(startShiftAmount, startValue->getType());
512        startValue = iBuilder->CreateNot(iBuilder->CreateLShr(iBuilder->CreateShl(iBuilder->CreateNot(startValue), startShiftAmount2), startShiftAmount2));
513
514        iBuilder->CreateStore(startValue, startPtr);
515        iBuilder->CreateBr(endByteCpyConBlock);
516
517        iBuilder->SetInsertPoint(endByteCpyConBlock);
518        iBuilder->CreateCondBr(iBuilder->CreateICmpNE(endBytePos, SIZE_0), endByteCpyBlock, memsetBlock);
519
520        // Clear lowest {endRemain} bits
521        iBuilder->SetInsertPoint(endByteCpyBlock);
522        Value* endPtr = iBuilder->CreateGEP(rawOutputPtr, iBuilder->CreateURem(endBytePos, outputBufferBytes));
523        Value* endValue = iBuilder->CreateLoad(endPtr);
524        Value* endRemain2 = iBuilder->CreateZExtOrTrunc(endRemain, endValue->getType());
525        endValue = iBuilder->CreateNot(iBuilder->CreateShl(iBuilder->CreateLShr(iBuilder->CreateNot(endValue), endRemain2), endRemain2));
526        iBuilder->CreateStore(endValue, endPtr);
527        iBuilder->CreateBr(memsetBlock);
528
529        iBuilder->SetInsertPoint(memsetBlock);
530        Value* memsetStartByte = iBuilder->CreateUDivCeil(start, SIZE_8);
531        Value* memsetEndByte = endBytePos;
532
533        Value* memsetSize = iBuilder->CreateSub(memsetEndByte, memsetStartByte);
534
535        memsetSize = iBuilder->CreateUMin(memsetSize, outputBufferBytes);
536
537        // We always assume that  (memsetEndByte - memsetStartByte) < outputBufferBytes
538
539        Value* memsetStartByteRem = iBuilder->CreateURem(memsetStartByte, outputBufferBytes);
540
541        Value* memsetSize1 = iBuilder->CreateUMin(iBuilder->CreateSub(outputBufferBytes, memsetStartByteRem), memsetSize);
542        Value* memsetSize2 = iBuilder->CreateSub(memsetSize, memsetSize1);
543
544        iBuilder->CreateMemSet(iBuilder->CreateGEP(rawOutputPtr, memsetStartByteRem), iBuilder->getInt8(0xff), memsetSize1, true);
545        iBuilder->CreateMemSet(rawOutputPtr, iBuilder->getInt8(0xff), memsetSize2, true);
546        iBuilder->CreateBr(exitBlock);
547
548        iBuilder->SetInsertPoint(exitBlock);
549    }
550
551    void LZ4IndexBuilderKernel::markCircularOutputBitstream(const std::unique_ptr<KernelBuilder> &iBuilder, const string &bitstreamName, Value *pos) {
552        Value* SIZE_0 = iBuilder->getSize(0);
553        Value* SIZE_8 = iBuilder->getSize(8);
554        Value* INT8_1 = iBuilder->getInt8(1);
555        Type* bytePtrType = iBuilder->getInt8PtrTy();
556
557        Value* outputBufferBytes = iBuilder->getSize(this->getOutputStreamSetBuffer(bitstreamName)->getBufferBlocks() * iBuilder->getBitBlockWidth() / 8);
558
559        Value* bytePos = iBuilder->CreateUDiv(pos, SIZE_8);
560        bytePos = iBuilder->CreateURem(bytePos, outputBufferBytes);
561        Value* byteOffset = iBuilder->CreateTrunc(iBuilder->CreateURem(pos, SIZE_8), iBuilder->getInt8Ty());
562
563        Value* outputRawPtr = iBuilder->CreatePointerCast(iBuilder->getRawOutputPointer(bitstreamName, SIZE_0), bytePtrType);
564        Value* outputTargetPtr = iBuilder->CreateGEP(outputRawPtr, bytePos);
565
566        Value* targetValue = iBuilder->CreateLoad(outputTargetPtr);
567        targetValue = iBuilder->CreateOr(targetValue, iBuilder->CreateShl(INT8_1, byteOffset));
568        iBuilder->CreateStore(targetValue, outputTargetPtr);
569    }
570
571    void LZ4IndexBuilderKernel::appendM0Output(const std::unique_ptr<KernelBuilder> &b, llvm::Value *start, llvm::Value *end) {
572        // ---- Entry
573        // Constant
574
575        int fw = 64;
576        BasicBlock* entryBlock = b->GetInsertBlock();
577        Value* SIZE_1 = b->getSize(1);
578        Value* SIZE_256 = b->getSize(fw);
579        Value* INT256_0 = b->getIntN(fw, 0);
580        Value* INT256_1 = b->getIntN(fw, 1);
581
582        Value* startBlockIndex = b->CreateUDiv(start, SIZE_256);
583        Value* startOffset = b->CreateZExt(b->CreateURem(start, SIZE_256), b->getIntNTy(fw));
584        Value* endBlockIndex = b->CreateUDiv(end, SIZE_256);
585        Value* endOffset = b->CreateZExt(b->CreateURem(end, SIZE_256), b->getIntNTy(fw));
586
587
588        BasicBlock* appendM0Con = b->CreateBasicBlock("appendM0Con");
589        BasicBlock* appendM0Body = b->CreateBasicBlock("appendM0Body");
590        BasicBlock* appendM0Exit = b->CreateBasicBlock("appendM0Exit");
591
592        Value* pendingM0Index = b->getScalarField("pendingM0Index");
593        Value* pendingM0StartBits = b->getScalarField("pendingM0StartBits");
594        Value* pendingM0EndBits = b->getScalarField("pendingM0EndBits");
595        Value* pendingM0CarryBit = b->getScalarField("pendingM0CarryBit");
596
597        b->CreateBr(appendM0Con);
598
599        // ---- AppendM0Con
600        b->SetInsertPoint(appendM0Con);
601        PHINode* phiCurrentIndex = b->CreatePHI(b->getSizeTy(), 2);
602        phiCurrentIndex->addIncoming(pendingM0Index, entryBlock);
603        PHINode* phiStartBits = b->CreatePHI(b->getIntNTy(fw), 2);
604        phiStartBits->addIncoming(pendingM0StartBits, entryBlock);
605        PHINode* phiEndBits = b->CreatePHI(b->getIntNTy(fw), 2);
606        phiEndBits->addIncoming(pendingM0EndBits, entryBlock);
607        PHINode* phiCarryBit = b->CreatePHI(b->getIntNTy(fw), 2);
608        phiCarryBit->addIncoming(pendingM0CarryBit, entryBlock);
609
610
611        b->CreateUnlikelyCondBr(b->CreateICmpULT(phiCurrentIndex, endBlockIndex), appendM0Body, appendM0Exit);
612        // ---- AppendM0Body
613        b->SetInsertPoint(appendM0Body);
614        Value* actualStartBits = b->CreateSelect(b->CreateICmpEQ(phiCurrentIndex, startBlockIndex), b->CreateOr(phiStartBits, b->CreateShl(INT256_1, startOffset)), phiStartBits);
615        Value* outputValue = b->CreateSub(b->CreateSub(phiEndBits, actualStartBits), phiCarryBit);
616        Value* newCarryBit = b->CreateZExt(b->CreateICmpUGT(b->CreateAdd(actualStartBits, phiCarryBit), phiEndBits), b->getIntNTy(fw));
617
618        this->storeM0(b, phiCurrentIndex, outputValue);
619
620        phiCurrentIndex->addIncoming(b->CreateAdd(phiCurrentIndex, SIZE_1), b->GetInsertBlock());
621        phiStartBits->addIncoming(INT256_0, b->GetInsertBlock());
622        phiEndBits->addIncoming(INT256_0, b->GetInsertBlock());
623        phiCarryBit->addIncoming(newCarryBit, b->GetInsertBlock());
624
625        b->CreateBr(appendM0Con);
626
627        // ---- AppendM0Exit
628        b->SetInsertPoint(appendM0Exit);
629        Value* finalStartBits = b->CreateSelect(b->CreateICmpEQ(phiCurrentIndex, startBlockIndex), b->CreateOr(phiStartBits, b->CreateShl(INT256_1, startOffset)), phiStartBits);
630        Value* finalEndBits = b->CreateOr(phiEndBits, b->CreateShl(INT256_1, endOffset));
631        b->setScalarField("pendingM0Index", phiCurrentIndex);
632        b->setScalarField("pendingM0StartBits", finalStartBits);
633        b->setScalarField("pendingM0EndBits", finalEndBits);
634        b->setScalarField("pendingM0CarryBit", phiCarryBit);
635    }
636
637    void LZ4IndexBuilderKernel::storeM0(const std::unique_ptr<KernelBuilder> &b, llvm::Value* blockIndex, llvm::Value* value) {
638        int fw = 64;
639        Value* m0BufferBlocks = b->getSize(this->getOutputStreamSetBuffer("M0Marker")->getBufferBlocks() * b->getBitBlockWidth() / fw);
640        Value* indexRem = b->CreateURem(blockIndex, m0BufferBlocks);
641        Value* outputBasePtr = b->CreatePointerCast(b->getRawOutputPointer("M0Marker", b->getSize(0)), b->getIntNTy(fw)->getPointerTo());
642        b->CreateStore(value, b->CreateGEP(outputBasePtr, indexRem));
643    }
644
645    void LZ4IndexBuilderKernel::storePendingM0(const std::unique_ptr<KernelBuilder> &b) {
646        Value* outputValue = b->CreateSub(
647                b->CreateSub(
648                        b->getScalarField("pendingM0EndBits"),
649                        b->getScalarField("pendingM0StartBits")
650                ),
651                b->getScalarField("pendingM0CarryBit")
652        );
653        this->storeM0(b, b->getScalarField("pendingM0Index"), outputValue);
654    }
655
656}
Note: See TracBrowser for help on using the repository browser.