source: icGREP/icgrep-devel/icgrep/kernels/lz4_index_decoder.cpp @ 5816

Last change on this file since 5816 was 5793, checked in by nmedfort, 21 months ago

Bug fix for pipeline: it was terminating too early when there was insufficient output space to process all of the input for a kernel.

File size: 29.8 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "lz4_index_decoder.h"
8#include <kernels/kernel_builder.h>
9 
10using namespace llvm;
11using namespace kernel;
12
13#ifndef NDEBUG
14#define DEBUG_RT_PRINT 0
15#else
16#define DEBUG_RT_PRINT 0
17#endif
18
19#define printRTDebugMsg(MSG) \
20    if (DEBUG_RT_PRINT) b->CallPrintMsgToStderr(MSG)
21
22#define printRTDebugInt(NAME, X) \
23    if (DEBUG_RT_PRINT) b->CallPrintIntToStderr(NAME, X)
24
25#define printGlobalPos() \
26    printRTDebugInt("GlobalPos", b->CreateAdd(blockStartPos, b->CreateLoad(sOffset)))
27
28namespace {
29
30Value * generateBitswap(const std::unique_ptr<KernelBuilder> & b, Value * v) {
31    Value * bswapFunc = Intrinsic::getDeclaration(b->getModule(),
32            Intrinsic::bswap, v->getType());
33    return b->CreateCall(bswapFunc, {v});
34}
35
36Value * createStackVar(const std::unique_ptr<KernelBuilder> & b, Type * type, StringRef name, Value * initializer = nullptr) {
37    Value * var = b->CreateAlloca(type, nullptr, name);
38    if (initializer) {
39        b->CreateStore(initializer, var);
40    } else {
41        b->CreateStore(ConstantInt::get(type, 0), var);
42    }
43    return var;
44}
45
46void incStackVar(const std::unique_ptr<KernelBuilder> & b, Value * svar, Value * increment = nullptr) {
47    Value * value = b->CreateLoad(svar);
48    if (increment) {
49        value = b->CreateAdd(value, increment);
50    } else {
51        value = b->CreateAdd(value, ConstantInt::get(value->getType(), 1));
52    }
53    b->CreateStore(value, svar);
54}
55
56Value * getOutputPtr(const std::unique_ptr<KernelBuilder> & b, Value * blockStartPtr, Value * offset) {
57    return b->CreateGEP(
58            b->CreatePointerCast(blockStartPtr, b->getInt32Ty()->getPointerTo()),
59            offset
60            );
61}
62
63}       // anonymouse namespace
64
65/**
66 * Get the offset within the current word.
67 */
68Value * LZ4IndexDecoderKernel::getWordOffset(const std::unique_ptr<kernel::KernelBuilder> & b) {
69    Value * offset = b->CreateLoad(sOffset);
70    IntegerType * type = cast<IntegerType>(offset->getType());
71    Constant * mask = ConstantInt::get(type, wordWidth - 1);
72    return b->CreateAnd(offset, mask);
73}
74
75/**
76 * Get the offset of the start of the current word.
77 */
78Value * LZ4IndexDecoderKernel::getWordStartOffset(const std::unique_ptr<KernelBuilder> & b) {
79    Value * offset = b->CreateLoad(sOffset);
80    IntegerType * type = cast<IntegerType>(offset->getType());
81    Constant * mask = ConstantExpr::getNeg(ConstantInt::get(type, wordWidth));
82    return b->CreateAnd(offset, mask);
83}
84
85/**
86 * Load a raw byte from byteStream.
87 * If offset is not provided, load the current byte by default.
88 */
89Value * LZ4IndexDecoderKernel::loadRawByte(const std::unique_ptr<KernelBuilder> & b, Value * offset) {
90    Value * blockStartPtr = b->CreatePointerCast(
91            b->getInputStreamBlockPtr("byteStream", b->getInt32(0)),
92            b->getInt8PtrTy()
93            );
94    if (offset == nullptr) {
95        offset = b->CreateLoad(sOffset);
96    }
97    Value * ptr = b->CreateGEP(blockStartPtr, offset);
98    return b->CreateLoad(ptr);
99}
100
101
102/**
103 * Set the current extender word up until before the offset position.
104 * extender = .......  (little-endian, LSB on the right)
105 * offset   =    ^
106 * cleared  = ....111
107 */
108void LZ4IndexDecoderKernel::setExtenderUntilOffset(const std::unique_ptr<KernelBuilder> & b) {
109    // Little-endian, offset counts from LSB
110    // extender = extender ^ ~((1 << offset) -1)
111    Value * extender = b->CreateLoad(sExtender);
112    Value * wordOffset = b->CreateZExt(
113            getWordOffset(b),
114            b->getSizeTy()
115            );
116    Value * one = b->getSize(1);
117    Value * mask = b->CreateSub(
118            b->CreateShl(one, wordOffset),
119            one);
120    extender = b->CreateOr(extender, mask);
121    b->CreateStore(extender, sExtender);
122}
123
124
125/**
126 * Load the extender word at the current offset.
127 * Called when we potentially reach a new word.  Usually followed by setExtenderUntilOffset.
128 */
129void LZ4IndexDecoderKernel::loadCurrentExtender(const std::unique_ptr<KernelBuilder> & b) {
130    Value * offset = b->CreateLoad(sOffset);
131    IntegerType * type = cast<IntegerType>(offset->getType());
132    ConstantInt * shift = ConstantInt::get(type, std::log2(wordWidth));
133    Value * shiftedOffset = b->CreateLShr(offset, shift);
134    Value * extender = b->CreateExtractElement(extenders, shiftedOffset);
135    b->CreateStore(extender, sExtender);
136}
137
138
139void LZ4IndexDecoderKernel::generateProduceOutput(const std::unique_ptr<KernelBuilder> &b) {
140    Value * producedItem = b->getProducedItemCount("literalIndexes");
141
142//#ifndef NDEBUG
143//    b->CallPrintInt("ProducedItem", producedItem);
144//    // LiteralStart is adjusted to be relative to the block start, so that
145//    // the output can be compared against that of the reference implementation.
146//    Value * literalStart = b->CreateSub(b->getScalarField("LiteralStart"), b->getScalarField("LZ4BlockStart"));
147//    b->CallPrintInt("LiteralStart", literalStart);
148//    b->CallPrintInt("LiteralLength", b->getScalarField("LiteralLength"));
149//    b->CallPrintInt("MatchOffset", b->getScalarField("MatchOffset"));
150//    b->CallPrintInt("MatchLength", b->getScalarField("MatchLength"));
151//#endif
152    printRTDebugMsg("--------------");
153
154    Value * outputOffset = b->CreateAnd(b->CreateTrunc(producedItem, b->getInt32Ty()), b->getInt32(b->getBitBlockWidth() - 1));  // producedItem % blockWidth (as blockWidth is always a power of 2)
155    Value * baseLiteralStartPtr = b->getOutputStreamBlockPtr("literalIndexes", b->getInt32(0));
156
157    Value * literalStartPtr = getOutputPtr(b, baseLiteralStartPtr, outputOffset);
158    Value * literalLengthPtr = getOutputPtr(b,
159            b->getOutputStreamBlockPtr("literalIndexes", b->getInt32(1)), outputOffset);
160    Value * matchOffsetPtr = getOutputPtr(b,
161            b->getOutputStreamBlockPtr("matchIndexes", b->getInt32(0)), outputOffset);
162    Value * matchLengthPtr = getOutputPtr(b,
163            b->getOutputStreamBlockPtr("matchIndexes", b->getInt32(1)), outputOffset);
164
165    b->CreateStore(b->getScalarField("LiteralStart"), literalStartPtr);
166    b->CreateStore(b->getScalarField("LiteralLength"), literalLengthPtr);
167    b->CreateStore(b->getScalarField("MatchOffset"), matchOffsetPtr);
168    b->CreateStore(b->getScalarField("MatchLength"), matchLengthPtr);
169    b->setProducedItemCount("literalIndexes", b->CreateAdd(producedItem, b->getSize(1)));
170    // matchIndexes has a fixed ratio of 1:1 w.r.t. literalIndexes.
171}
172
173
174void LZ4IndexDecoderKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
175    BasicBlock * entry_block = b->GetInsertBlock();
176    BasicBlock * exit_block = b->CreateBasicBlock("exit");
177
178    // %entry
179    b->SetInsertPoint(entry_block);
180    printRTDebugMsg("entry");
181    // Global positions in the byte stream.
182    Value * blockNo = b->getScalarField("BlockNo");
183    blockStartPos = b->CreateMul(blockNo, b->getInt32(b->getBitBlockWidth()), "blockStartPos");
184    extenders = b->CreateBitCast(
185            b->loadInputStreamBlock("extenders", b->getInt32(0)),
186            VectorType::get(b->getSizeTy(), b->getBitBlockWidth() / b->getSizeTy()->getBitWidth()),
187            "extenders");
188    // Create a series of stack variables which will be promoted by mem2reg.
189    sOffset = createStackVar(b, b->getInt32Ty(), "offset");
190    // tempLength has different meanings in different states.
191    sTempLength = createStackVar(b, b->getInt32Ty(), "tempLength", b->getScalarField("TempLength"));
192    sTempCount = createStackVar(b, b->getInt32Ty(), "tempCount", b->getScalarField("TempCount"));
193    sState = createStackVar(b, b->getInt8Ty(), "state", b->getScalarField("State"));
194    sExtender = createStackVar(b, b->getSizeTy(), "extender",
195            b->CreateExtractElement(extenders, b->getInt32(0)));
196
197    BasicBlock * skippingBytes = b->CreateBasicBlock("skipping_bytes");
198    BasicBlock * dispatch = b->CreateBasicBlock("dispatch");
199
200    b->CreateCondBr(
201            b->CreateICmpUGT(b->getScalarField("BytesToSkip"), b->getInt32(0)),
202            skippingBytes, dispatch
203            );
204
205    // %skipping_bytes
206    generateSkippingBytes(b, skippingBytes, exit_block);
207    // Insert point is at the end of skippingBytes.
208    b->CreateBr(dispatch);
209
210    // %dispatch
211    // Indirect branching will be added to %dispatch at last.
212
213    // %at_block_checksum
214    BasicBlock * atBlockChecksum = b->CreateBasicBlock("at_block_checksum");
215    generateAtBlockChecksum(b, atBlockChecksum, skippingBytes);
216 
217    // %at_block_size
218    BasicBlock * atBlockSize = b->CreateBasicBlock("at_block_size");
219    generateAtBlockSize(b, atBlockSize, skippingBytes, exit_block);
220
221    // %at_token
222    BasicBlock * atToken = b->CreateBasicBlock("at_token");
223    generateAtToken(b, atToken, exit_block);
224
225    // %extending_literal_length
226    BasicBlock * extendingLiteralLen = b->CreateBasicBlock("extending_literal_length");
227    generateExtendingLiteralLen(b, extendingLiteralLen, exit_block);
228
229    // %at_literals
230    BasicBlock * atLiterals = b->CreateBasicBlock("at_literals");
231    generateAtLiterals(b, atLiterals);
232    b->CreateBr(skippingBytes);
233
234    // %at_first_offset
235    // Note that the last sequence is incomplete and ends with literals.
236    // If the whole LZ4 block is done, process the (optional) checksum.
237    // Otherwise, go around to process the next sequence.
238    BasicBlock * atOffset1 = b->CreateBasicBlock("at_first_offset");
239    b->SetInsertPoint(atOffset1);
240    Value * nowGlobalPos = b->CreateAdd(blockStartPos, b->CreateLoad(sOffset));
241    BasicBlock * blockEnd_else = b->CreateBasicBlock("block_end_else");
242    // Conditional branch inserted at the end of the last block.
243    b->CreateUnlikelyCondBr(
244            b->CreateICmpEQ(nowGlobalPos, b->getScalarField("LZ4BlockEnd")),
245            atBlockChecksum, blockEnd_else
246            );
247    generateAtFirstOffset(b, blockEnd_else, exit_block);
248
249    // %at_second_offset
250    BasicBlock * atOffset2 = b->CreateBasicBlock("at_second_offset");
251    generateAtSecondOffset(b, atOffset2, exit_block);
252
253    // %extending_match_length
254    BasicBlock * extendingMatchLen = b->CreateBasicBlock("extending_match_length");
255    generateExtendingMatchLen(b, extendingMatchLen, exit_block);
256    b->CreateBr(atToken);
257
258    // Indirect branching.
259    b->SetInsertPoint(dispatch);
260    printRTDebugMsg("dispatch");
261    // The order must comply with enum State.
262    Constant * labels = ConstantVector::get(
263            {BlockAddress::get(atBlockSize), BlockAddress::get(atToken), BlockAddress::get(extendingLiteralLen), BlockAddress::get(atLiterals),
264             BlockAddress::get(atOffset1), BlockAddress::get(atOffset2), BlockAddress::get(extendingMatchLen), BlockAddress::get(atBlockChecksum)}
265            );
266    Value * target = b->CreateExtractElement(labels, b->CreateLoad(sState));
267    IndirectBrInst * indirectBr = b->CreateIndirectBr(target);
268    indirectBr->addDestination(atBlockSize);
269    indirectBr->addDestination(atToken);
270    indirectBr->addDestination(extendingLiteralLen);
271    indirectBr->addDestination(atLiterals);
272    indirectBr->addDestination(atOffset1);
273    indirectBr->addDestination(atOffset2);
274    indirectBr->addDestination(extendingMatchLen);
275    indirectBr->addDestination(atBlockChecksum);
276
277    // %exit
278    b->SetInsertPoint(exit_block);
279    printRTDebugMsg("exit");
280    b->setScalarField("State", b->CreateLoad(sState));
281    b->setScalarField("TempLength", b->CreateLoad(sTempLength));
282    b->setScalarField("TempCount", b->CreateLoad(sTempCount));
283    b->setScalarField("BlockNo", b->CreateAdd(blockNo, b->getInt32(1)));
284    // When the kernel builder uses indirectbr, doBlock is not a separate function.
285    // Hence, we branch to a new basic block and fall through instead of returning.
286    BasicBlock * end_block = b->CreateBasicBlock("end_of_block");
287    b->CreateBr(end_block);
288    b->SetInsertPoint(end_block);
289}
290
291
292void LZ4IndexDecoderKernel::generateBoundaryDetection(const std::unique_ptr<KernelBuilder> & b, State state, BasicBlock * exit_block, bool updateExtenderWord) {
293    if (updateExtenderWord) {
294        BasicBlock * wordBoundary_then = b->CreateBasicBlock("word_boundary_then-" + StateLabels.at(state));
295        BasicBlock * blockBoundary_else = b->CreateBasicBlock("block_boundary_else-" + StateLabels.at(state));
296        BasicBlock * wordBoundary_cont = b->CreateBasicBlock("word_boundary_cont-" + StateLabels.at(state));
297        b->CreateUnlikelyCondBr(
298                b->CreateICmpEQ(getWordOffset(b), b->getInt32(0)),
299                wordBoundary_then, wordBoundary_cont
300                );
301
302        b->SetInsertPoint(wordBoundary_then);
303        b->CreateUnlikelyCondBr(
304                b->CreateICmpEQ(b->CreateLoad(sOffset), b->getInt32(b->getBitBlockWidth())),
305                exit_block, blockBoundary_else
306                );
307
308        // Reaching word boundary but not block boundary.  Update the extender word as requested.
309        b->SetInsertPoint(blockBoundary_else);
310        loadCurrentExtender(b);
311        b->CreateBr(wordBoundary_cont);
312
313        // Leave the insert point at the end and return.
314        b->SetInsertPoint(wordBoundary_cont);
315    } else {
316        BasicBlock * blockBoundary_cont = b->CreateBasicBlock("block_boundary_cont-" + StateLabels.at(state));
317        b->CreateUnlikelyCondBr(
318                b->CreateICmpEQ(b->CreateLoad(sOffset), b->getInt32(b->getBitBlockWidth())),
319                exit_block, blockBoundary_cont
320                );
321        // Leave the insert point at the end and return.
322        b->SetInsertPoint(blockBoundary_cont);
323    }
324}
325
326
327void LZ4IndexDecoderKernel::generateSkippingBytes(const std::unique_ptr<kernel::KernelBuilder> & b, BasicBlock * bb, BasicBlock * exit_block) {
328    b->SetInsertPoint(bb);
329    printRTDebugMsg("skipping bytes");
330
331    Value * remainingBytesInBlock = b->CreateSub(
332            b->getInt32(b->getBitBlockWidth()), b->CreateLoad(sOffset)
333            );
334    Value * remainingBytesToSkip = b->getScalarField("BytesToSkip");
335    Value * advanceDist = b->CreateUMin(remainingBytesInBlock, remainingBytesToSkip);
336    remainingBytesToSkip = b->CreateSub(remainingBytesToSkip, advanceDist);
337    incStackVar(b, sOffset, advanceDist);
338    b->setScalarField("BytesToSkip", remainingBytesToSkip);
339
340    generateBoundaryDetection(b, State::SKIPPING_BYTES, exit_block);
341    // Falls through.
342}
343
344
345void LZ4IndexDecoderKernel::generateAtBlockSize(const std::unique_ptr<KernelBuilder> &b, BasicBlock * bb, BasicBlock * skippingBytes, BasicBlock * exit_block) {
346    b->CreateBr(bb);
347    b->SetInsertPoint(bb);
348    printRTDebugMsg("scanning block size");
349    printGlobalPos();
350
351    // Use tempLength to hold the block size temporarily.
352    // Note that it is initially stored as big-endian (for the ease of reading) and will be "swapped" later.
353    // Use tempCount as the loop counter (0..3).
354    // Both variables are initialized from kernel states at %entry.
355
356    // A do-while loop.
357    BasicBlock * loopBody = b->CreateBasicBlock("blocksize_loop_body");
358    BasicBlock * loopExit = b->CreateBasicBlock("blocksize_loop_exit");
359    b->CreateBr(loopBody);
360
361    b->SetInsertPoint(loopBody);
362    Value * byte = loadRawByte(b);
363    Value * newTempLength = b->CreateAdd(
364            b->CreateShl(b->CreateLoad(sTempLength), b->getInt32(8)),
365            b->CreateZExt(byte, b->getInt32Ty())
366            );
367    b->CreateStore(newTempLength, sTempLength);
368    incStackVar(b, sTempCount);
369    incStackVar(b, sOffset);
370    // Stop when we read all four bytes or reach the end of the block.
371    b->CreateCondBr(
372            b->CreateOr(
373                b->CreateICmpEQ(b->CreateLoad(sTempCount), b->getInt32(4)),
374                b->CreateICmpEQ(b->CreateLoad(sOffset), b->getInt32(b->getBitBlockWidth()))
375                ),
376            loopExit, loopBody
377            );
378
379    b->SetInsertPoint(loopExit);
380    BasicBlock * blockSizeCompleted_then = b->CreateBasicBlock("blocksize_completed_then");
381    BasicBlock * blockSizeCompleted_cont = b->CreateBasicBlock("blocksize_completed_cont");
382    b->CreateLikelyCondBr(
383            b->CreateICmpEQ(b->CreateLoad(sTempCount), b->getInt32(4)),
384            blockSizeCompleted_then, blockSizeCompleted_cont
385            );
386
387    // All four bytes of the block size are read in.
388    b->SetInsertPoint(blockSizeCompleted_then);
389    // Remember to swap the block size back to little-endian.
390    Value * blockSize = generateBitswap(b, b->CreateLoad(sTempLength));
391    Value * currentPos = b->CreateAdd(blockStartPos, b->CreateLoad(sOffset));
392    b->setScalarField("LZ4BlockStart", currentPos);
393    b->setScalarField("LZ4BlockEnd", b->CreateAdd(currentPos, blockSize));
394    printRTDebugInt("blockSize", blockSize);
395
396    BasicBlock * uncompressedBlock_then = b->CreateBasicBlock("uncompressed_block_then");
397    BasicBlock * uncompressedBlock_else = b->CreateBasicBlock("uncompressed_block_cont");
398    b->CreateUnlikelyCondBr(
399            b->CreateTrunc(
400                b->CreateLShr(blockSize, b->getInt32(31)),
401                b->getInt1Ty()
402                ),
403            uncompressedBlock_then,
404            uncompressedBlock_else
405            );
406
407    b->SetInsertPoint(uncompressedBlock_then);
408    Value * realBlockSize = b->CreateXor(blockSize, b->getInt32(1L << 31));
409    b->setScalarField("LZ4BlockEnd", b->CreateAdd(currentPos, realBlockSize));
410    b->setScalarField("BytesToSkip", realBlockSize);
411    b->setScalarField("LiteralStart", currentPos);
412    b->setScalarField("LiteralLength", realBlockSize);
413    // No need to set MatchLength/MatchOffset to 0, nor to produce output,
414    // because %atBlockChecksum will do so as the last sequence.
415    b->CreateStore(b->getInt8(State::AT_BLOCK_CHECKSUM), sState);
416    b->CreateBr(skippingBytes);
417
418    b->SetInsertPoint(uncompressedBlock_else);
419    // Reset these temporary values for later use.
420    b->CreateStore(b->getInt32(0), sTempLength);
421    b->CreateStore(b->getInt32(0), sTempCount);
422    b->CreateStore(b->getInt8(State::AT_TOKEN), sState);
423    // A block size of 0 is the end mark of the frame. Exit.
424    b->CreateUnlikelyCondBr(
425            b->CreateICmpEQ(blockSize, ConstantInt::getNullValue(blockSize->getType())),
426            exit_block,
427            blockSizeCompleted_cont
428            );
429
430    // We could be at the boundary no matter the block size is completed or not.
431    b->SetInsertPoint(blockSizeCompleted_cont);
432    generateBoundaryDetection(b, State::AT_BLOCK_SIZE, exit_block);
433    // Falls through to %at_token.
434}
435
436
437void LZ4IndexDecoderKernel::generateAtToken(const std::unique_ptr<kernel::KernelBuilder> & b, BasicBlock * bb, BasicBlock * exit_block) {
438    b->CreateBr(bb);
439    b->SetInsertPoint(bb);
440    printRTDebugMsg("reading token");
441
442    Value * token = loadRawByte(b);
443    Value * literalLen = b->CreateZExt(
444        b->CreateLShr(token, b->getInt8(4)),
445        b->getInt32Ty()
446        );
447    Value * matchLen = b->CreateZExt(
448        b->CreateAnd(token, b->getInt8(0xf)),
449        b->getInt32Ty()
450        );
451    incStackVar(b, sOffset);
452    // Prepare extender word for scanning.
453    loadCurrentExtender(b);
454    setExtenderUntilOffset(b);
455    // Store the (partial) match length to be extended later.
456    b->setScalarField("MatchLength", matchLen);
457    // Use tempLength to accumulate extended lengths (until at_literals).
458    b->CreateStore(literalLen, sTempLength);
459    b->CreateStore(b->getInt8(State::EXTENDING_LITERAL_LENGTH), sState);
460
461    generateBoundaryDetection(b, State::AT_TOKEN, exit_block);
462    // Falls through to %extending_literal_length.
463}
464
465
466void LZ4IndexDecoderKernel::generateExtendingLiteralLen(const std::unique_ptr<KernelBuilder> & b, BasicBlock * bb, BasicBlock * exit_block) {
467    b->CreateBr(bb);
468    b->SetInsertPoint(bb);
469    printRTDebugMsg("extending literal len");
470
471    Value * wordOffset = getWordOffset(b);
472    Value * blockOffset = getWordStartOffset(b);
473    Value * literalLen = b->CreateLoad(sTempLength);
474    Value * literalExtEnd = b->CreateTrunc(
475                b->CreateCountForwardZeroes(b->CreateNot(b->CreateLoad(sExtender))),
476                b->getInt32Ty());
477    printRTDebugInt("wordOffset", wordOffset);
478    printRTDebugInt("literalExtEnd", literalExtEnd);
479    // number of extender = literalExtEnd - wordOffset
480    Value * numExtenders = b->CreateSub(literalExtEnd, wordOffset);
481    Value * literalExtReachBoundary =
482            b->CreateICmpEQ(literalExtEnd, b->getInt32(wordWidth));
483    // There are literalExtEnd forward zeroes, we load bytes[literalExtEnd]
484    // which is the first non-extender.  If literalExtEnd == 64, we force the
485    // load index to be 0 to avoid out-of-bound access, and lastByte will be 0.
486    Value * loadOffset = b->CreateSelect(literalExtReachBoundary,
487            ConstantInt::getNullValue(literalExtEnd->getType()),
488            literalExtEnd);
489    Value * lastByte = b->CreateSelect(literalExtReachBoundary,
490            b->getInt8(0),
491            loadRawByte(b, b->CreateAdd(blockOffset, loadOffset)));
492    Value * literalLenExted = b->CreateICmpUGE(literalLen, b->getInt32(0xf));
493    literalLen = b->CreateSelect(literalLenExted,
494            b->CreateAdd(
495                literalLen,
496                b->CreateAdd(
497                    b->CreateMul(numExtenders, b->getInt32(0xff)),
498                    b->CreateZExt(lastByte, b->getInt32Ty())
499                    )
500                ),      // literalLen + numExtenders * 255
501            literalLen);
502    wordOffset = b->CreateSelect(literalLenExted,
503            literalExtEnd,
504            wordOffset);
505    // If lastByte is truly the last length byte, we need to advance the cursor by 1.
506    wordOffset = b->CreateSelect(
507            b->CreateAnd(literalLenExted, b->CreateNot(literalExtReachBoundary)),
508            b->CreateAdd(wordOffset, b->getInt32(1)),
509            wordOffset
510            );
511    b->CreateStore(literalLen, sTempLength);
512    b->CreateStore(b->CreateAdd(blockOffset, wordOffset), sOffset);
513    Value * unfinished = b->CreateAnd(literalExtReachBoundary, literalLenExted);
514    Value * newState = b->CreateSelect(unfinished,
515            b->getInt8(State::EXTENDING_LITERAL_LENGTH),
516            b->getInt8(State::AT_LITERALS));
517    b->CreateStore(newState, sState);
518
519    generateBoundaryDetection(b, State::EXTENDING_LITERAL_LENGTH, exit_block, true);
520    BasicBlock * cont_block = b->CreateBasicBlock("finished_" + StateLabels.at(State::EXTENDING_LITERAL_LENGTH));
521    // Insert point is still in wordBoundary block now.
522    // See if there are still more extenders.
523    b->CreateUnlikelyCondBr(unfinished, bb, cont_block);
524
525    b->SetInsertPoint(cont_block);
526    // Falls through to %at_literals.
527}
528
529
530void LZ4IndexDecoderKernel::generateAtLiterals(const std::unique_ptr<KernelBuilder> & b, BasicBlock * bb) {
531    b->CreateBr(bb);
532    b->SetInsertPoint(bb);
533    b->setScalarField("LiteralStart", b->CreateAdd(blockStartPos, b->CreateLoad(sOffset)));
534    b->setScalarField("LiteralLength", b->CreateLoad(sTempLength));
535    b->setScalarField("BytesToSkip", b->CreateLoad(sTempLength));
536    b->CreateStore(b->getInt8(State::AT_FIRST_OFFSET), sState);
537
538    // No boundary detection here as we do not advance the cursor.
539    // Control flow will be redirected to %skipping_bytes later.
540}
541
542
543void LZ4IndexDecoderKernel::generateAtFirstOffset(const std::unique_ptr<KernelBuilder> &b, BasicBlock * bb, BasicBlock * exit_block) {
544    b->SetInsertPoint(bb);
545    printRTDebugMsg("reading first offset");
546
547    Value * byte = b->CreateZExt(loadRawByte(b), b->getInt32Ty());
548    // Use tempLength to store partial offset.
549    b->CreateStore(byte, sTempLength);
550    incStackVar(b, sOffset);
551    b->CreateStore(b->getInt8(State::AT_SECOND_OFFSET), sState);
552
553    generateBoundaryDetection(b, State::AT_FIRST_OFFSET, exit_block);
554    // Falls through to %at_second_offset.
555}
556
557
558void LZ4IndexDecoderKernel::generateAtSecondOffset(const std::unique_ptr<KernelBuilder> & b, BasicBlock * bb, BasicBlock * exit_block) {
559    b->CreateBr(bb);
560    b->SetInsertPoint(bb);
561    printRTDebugMsg("reading second offset");
562
563    Value * byte1 = b->CreateLoad(sTempLength);
564    Value * byte2 = b->CreateZExt(loadRawByte(b), b->getInt32Ty());
565    Value * offset = b->CreateAdd(
566            b->CreateShl(byte2, b->getInt32(8)),
567            byte1
568            );
569    b->setScalarField("MatchOffset", offset);
570    incStackVar(b, sOffset);
571    // Prepare extender word and tempLength for extending.
572    loadCurrentExtender(b);
573    setExtenderUntilOffset(b);
574    b->CreateStore(b->getScalarField("MatchLength"), sTempLength);
575    b->CreateStore(b->getInt8(State::EXTENDING_MATCH_LENGTH), sState);
576
577    generateBoundaryDetection(b, State::AT_SECOND_OFFSET, exit_block);
578    // Falls through to %extending_match_length.
579}
580
581
582void LZ4IndexDecoderKernel::generateExtendingMatchLen(const std::unique_ptr<KernelBuilder> & b, BasicBlock * bb, BasicBlock * exit_block) {
583    b->CreateBr(bb);
584    b->SetInsertPoint(bb);
585    printRTDebugMsg("extending match length");
586    printGlobalPos();
587    printRTDebugInt("rawbyte", loadRawByte(b));
588    printRTDebugInt("extword", b->CreateLoad(sExtender));
589
590    Value * wordOffset = getWordOffset(b);
591    Value * blockOffset = getWordStartOffset(b);
592    Value * matchLen = b->CreateLoad(sTempLength);
593    Value * matchExtEnd = b->CreateTrunc(
594        b->CreateCountForwardZeroes(b->CreateNot(b->CreateLoad(sExtender))),
595        b->getInt32Ty()
596        );
597    printRTDebugInt("wordoffset", wordOffset);
598    printRTDebugInt("matchExtEnd", matchExtEnd);
599    // number of extender = matchExtEnd - wordOffset
600    Value * numExtenders = b->CreateSub(matchExtEnd, wordOffset);
601    Value * matchExtReachBoundary = 
602            b->CreateICmpEQ(matchExtEnd, b->getInt32(wordWidth));
603    // There are matchExtEnd forward zeroes, we load bytes[matchExtEnd]
604    // which is the first non-extender.  If matchExtEnd == 64, we force the
605    // load index to be 0 to avoid out-of-bound access, and lastByte will be 0.
606    Value * loadOffset = b->CreateSelect(matchExtReachBoundary,
607            ConstantInt::getNullValue(matchExtEnd->getType()),
608            matchExtEnd);
609    Value * lastByte = b->CreateSelect(matchExtReachBoundary,
610            b->getInt8(0),
611            loadRawByte(b, b->CreateAdd(blockOffset, loadOffset)));
612    Value * matchLenExted = b->CreateICmpUGE(matchLen, b->getInt32(0xf));
613    matchLen = b->CreateSelect(matchLenExted,
614            b->CreateAdd(
615                matchLen,
616                b->CreateAdd(
617                    b->CreateMul(numExtenders, b->getInt32(0xff)),
618                    b->CreateZExt(lastByte, b->getInt32Ty())
619                    )
620                ),      // matchLen + numExtenders * 255
621            matchLen);
622    wordOffset = b->CreateSelect(matchLenExted,
623            matchExtEnd,
624            wordOffset);
625    // If lastByte is truly the last length byte, we need to advance the cursor by 1.
626    wordOffset = b->CreateSelect(
627            b->CreateAnd(matchLenExted, b->CreateNot(matchExtReachBoundary)),
628            b->CreateAdd(wordOffset, b->getInt32(1)),
629            wordOffset
630            );
631    b->CreateStore(matchLen, sTempLength);
632    b->CreateStore(b->CreateAdd(blockOffset, wordOffset), sOffset);
633
634    Value * unfinished = b->CreateAnd(matchExtReachBoundary, matchLenExted);
635    BasicBlock * output_then = b->CreateBasicBlock("output_then");
636    BasicBlock * output_cont = b->CreateBasicBlock("output_cont");
637    b->CreateLikelyCondBr(
638            b->CreateNot(unfinished),
639            output_then, output_cont
640            );
641    b->SetInsertPoint(output_then);
642    b->CreateStore(b->getInt8(State::AT_TOKEN), sState);
643    matchLen = b->CreateAdd(matchLen, b->getInt32(4));    // Add the constant at the end.
644    b->setScalarField("MatchLength", matchLen);
645    generateProduceOutput(b);
646    b->CreateBr(output_cont);
647
648    b->SetInsertPoint(output_cont);
649    generateBoundaryDetection(b, State::EXTENDING_MATCH_LENGTH, exit_block, true);
650    BasicBlock * cont_block = b->CreateBasicBlock("finished_" + StateLabels.at(State::EXTENDING_MATCH_LENGTH));
651    // Insert point is still in wordBoundary block now.
652    // See if there are still more extenders.
653    b->CreateUnlikelyCondBr(unfinished, bb, cont_block);
654
655    b->SetInsertPoint(cont_block);
656}
657
658
659void LZ4IndexDecoderKernel::generateAtBlockChecksum(const std::unique_ptr<KernelBuilder> & b, BasicBlock * bb, BasicBlock * skippingBytes) {
660    // No branch here as we have made a conditional branch outside.
661    b->SetInsertPoint(bb);
662    printRTDebugMsg("processing block checksum");
663
664    // Produce the partial output (fill matchIndexes with 0).
665    b->setScalarField("MatchOffset", b->getInt32(0));
666    b->setScalarField("MatchLength", b->getInt32(0));
667    generateProduceOutput(b);
668
669    BasicBlock * hasChecksum_then = b->CreateBasicBlock("has_checksum_then");
670    BasicBlock * hasChecksum_cont = b->CreateBasicBlock("has_checksum_cont");
671
672    b->CreateStore(b->getInt8(State::AT_BLOCK_SIZE), sState);
673    b->CreateCondBr(b->getScalarField("hasBlockChecksum"), hasChecksum_then, hasChecksum_cont);
674
675    b->SetInsertPoint(hasChecksum_then);
676    b->setScalarField("BytesToSkip", b->getInt32(4));
677    b->CreateBr(skippingBytes);
678    // Boundary detection will be done in skipping_bytes.
679
680    b->SetInsertPoint(hasChecksum_cont);
681    // No checksum, offset not advanced.  Falls through to the next block (block_size).
682}
683
684LZ4IndexDecoderKernel::LZ4IndexDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & b)
685: BlockOrientedKernel("lz4IndexDecoder",
686// Inputs
687{Binding{b->getStreamSetTy(1, 8), "byteStream", FixedRate(), Misaligned()},
688 Binding{b->getStreamSetTy(1, 1), "extenders"}},
689// Outputs: literal start, literal length, match offset, match length
690{Binding{b->getStreamSetTy(2, 32), "literalIndexes", UnknownRate()},
691 Binding{b->getStreamSetTy(2, 32), "matchIndexes", RateEqualTo("literalIndexes")}},
692// Arguments
693{Binding{b->getInt1Ty(), "hasBlockChecksum"}},
694{},
695// Internal states:
696{Binding{b->getInt32Ty(), "BlockNo"},
697 Binding{b->getInt8Ty(), "State"},
698 Binding{b->getInt32Ty(), "LZ4BlockStart"},
699 Binding{b->getInt32Ty(), "LZ4BlockEnd"},
700 Binding{b->getInt32Ty(), "BytesToSkip"},
701 Binding{b->getInt32Ty(), "TempLength"},
702 Binding{b->getInt32Ty(), "TempCount"},
703 Binding{b->getInt32Ty(), "LiteralStart"},
704 Binding{b->getInt32Ty(), "LiteralLength"},
705 Binding{b->getInt32Ty(), "MatchOffset"},
706 Binding{b->getInt32Ty(), "MatchLength"}})
707, wordWidth{b->getSizeTy()->getBitWidth()} {
708
709}
Note: See TracBrowser for help on using the repository browser.