source: icGREP/icgrep-devel/icgrep/kernels/sequential_kernel.cpp @ 5902

Last change on this file since 5902 was 5864, checked in by xwa163, 18 months ago

Add LZ4D extract deposit related kernel, target and test cases

File size: 58.3 KB
Line 
1
2#include "sequential_kernel.h"
3#include <kernels/kernel_builder.h>
4#include <kernels/streamset.h>
5#include <iostream>
6#include <string>
7#include <llvm/Support/raw_ostream.h>
8
9
10
11using namespace llvm;
12using namespace kernel;
13using namespace parabix;
14using namespace std;
15
// Names of the internal scalars that the SequentialKernel constructor registers with addScalar.

// Size-typed scalar; generateMultiBlockLogic reads it to pick the state block to branch to.
#define SequentialSegmentStateKey "SequentialSegment_State"
// i1 scalar registered by the constructor (its readers are outside this part of the file).
#define ModifyInputTempKey "ModifyInput_Temp"
// Size-typed scalar registered by the constructor (its readers are outside this part of the file).
#define MemCpyUntilZeroCopyOffsetTempKey "MemCpyUntilZeroCopyOffsetTempKey"
// Size-typed scalar written by recordCountForwardTempMaxPos and read by restoreCountForwardTempMaxPos.
#define CountForwardMaxPosTempKey "CountForwardMaxPosTempKey"
20
21
22
23namespace kernel {
24    SequentialKernel::SequentialKernel(
25            const std::unique_ptr<kernel::KernelBuilder> & iBuilder,
26            std::string && kernelName,
27            std::vector<Binding> && stream_inputs,
28            std::vector<Binding> && stream_outputs,
29            std::vector<Binding> && scalar_parameters,
30            std::vector<Binding> && scalar_outputs,
31            std::vector<Binding> && internal_scalars):
32            MultiBlockKernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
33        addScalar(iBuilder->getSizeTy(), SequentialSegmentStateKey);
34        addScalar(iBuilder->getInt1Ty(), ModifyInputTempKey);
35        addScalar(iBuilder->getSizeTy(), MemCpyUntilZeroCopyOffsetTempKey);
36        addScalar(iBuilder->getSizeTy(), CountForwardMaxPosTempKey);
37        addScalar(iBuilder->getSizeTy(), "tempClear");
38
39    }
40
41
42    void SequentialKernel::recordCountForwardTempMaxPos(const std::unique_ptr<KernelBuilder> &iBuilder, Value* maxPos) {
43        if (maxPos) {
44            iBuilder->setScalarField(CountForwardMaxPosTempKey, maxPos);
45        }
46    }
47    Value* SequentialKernel::restoreCountForwardTempMaxPos(const std::unique_ptr<KernelBuilder> &iBuilder, Value* currentMaxPos) {
48        if (currentMaxPos) {
49            return iBuilder->getScalarField(CountForwardMaxPosTempKey);
50        }
51        return NULL;
52    }
53
54    void SequentialKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
55        BasicBlock* entryBlock = iBuilder->GetInsertBlock();
56//        iBuilder->CallPrintInt("entry", iBuilder->getSize(1));
57//        iBuilder->CallPrintInt("available", iBuilder->getAvailableItemCount("byteStream"));
58
59        // AfterEntryBlock will be the entry block of subclass if it is initial state
60        BasicBlock* afterEntryBlock = iBuilder->CreateBasicBlock("afterEntryBlock");
61        this->exitBlock = iBuilder->CreateBasicBlock("exitBlock");
62
63        this->stateBlocks.push_back(afterEntryBlock); // index 0 will be initial state
64        iBuilder->SetInsertPoint(afterEntryBlock);
65        this->generateDoSequentialSegmentMethod(iBuilder);
66
67        iBuilder->CreateBr(this->exitBlock);
68        iBuilder->SetInsertPoint(this->exitBlock);
69
70
71        iBuilder->SetInsertPoint(entryBlock);
72        this->generateBuildIndexBits(iBuilder);
73        this->generateClearBuffer(iBuilder);
74
75
76        // Create Indirect Branch
77        std::vector<Constant*> blockAddressVector = std::vector<Constant*>();
78        for (BasicBlock* bb : this->stateBlocks) {
79            blockAddressVector.push_back(BlockAddress::get(bb));
80        }
81        Constant * labels = ConstantVector::get(blockAddressVector);
82
83        Value * target = iBuilder->CreateExtractElement(labels, iBuilder->getScalarField(SequentialSegmentStateKey));
84        IndirectBrInst * indirectBr = iBuilder->CreateIndirectBr(target);
85        for (BasicBlock* bb : this->stateBlocks) {
86            indirectBr->addDestination(bb);
87        }
88
89        iBuilder->SetInsertPoint(this->exitBlock);
90    }
91
92    bool SequentialKernel::hasIndexBits(const std::string& streamName) {
93        return inputStreamIndexMap.find(streamName) != inputStreamIndexMap.end();
94    }
95
96    void SequentialKernel::configOutputBufferToBeClear(const std::map<string, string>& clearMap) {
97        this->clearBufferMap = clearMap;
98    }
99
100    void SequentialKernel::generateClearBuffer(const std::unique_ptr<KernelBuilder> &iBuilder) {
101        BasicBlock* entryBlock = iBuilder->CreateBasicBlock("clear_buffer_entry");
102        BasicBlock* exitBlock = iBuilder->CreateBasicBlock("clear_buffer_exit");
103
104        iBuilder->CreateBr(entryBlock);
105        iBuilder->SetInsertPoint(entryBlock);
106
107        for (auto iter = this->clearBufferMap.begin(); iter != this->clearBufferMap.end(); iter++) {
108            string inputName = iter->first;
109            string outputName = iter->second;
110
111            BasicBlock* clearEntry = iBuilder->CreateBasicBlock("clear_" + outputName + "_entry");
112            BasicBlock* clearCon = iBuilder->CreateBasicBlock("clear_" + outputName + "_con");
113            BasicBlock* clearBody = iBuilder->CreateBasicBlock("clear_" + outputName + "_body");
114            BasicBlock* clearExit = iBuilder->CreateBasicBlock("clear_" + outputName + "_exit");
115
116            iBuilder->CreateBr(clearEntry);
117            iBuilder->SetInsertPoint(clearEntry);
118
119            Value* itemProduced = iBuilder->getScalarField("tempClear");
120            Value* itemsTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(inputName), iBuilder->getProcessedItemCount(inputName));
121            iBuilder->setScalarField("tempClear", itemsTotal);
122
123            size_t outputSize = this->getOutputBufferSize(iBuilder, outputName);
124            size_t outputPackNum = outputSize / 64;
125
126            Value* startPackIndex = iBuilder->CreateLShr(itemProduced, iBuilder->getSize(std::log2(64)));
127            Value* endPackIndex = iBuilder->CreateLShr(itemsTotal, iBuilder->getSize(std::log2(64)));
128
129            iBuilder->CreateBr(clearCon);
130            iBuilder->SetInsertPoint(clearCon);
131
132            PHINode* currentPackIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
133            currentPackIndex->addIncoming(startPackIndex, clearEntry);
134            iBuilder->CreateCondBr(iBuilder->CreateICmpULT(currentPackIndex, endPackIndex), clearBody, clearExit);
135
136            iBuilder->SetInsertPoint(clearBody);
137            Value* outputBasePtr = iBuilder->getRawOutputPointer(outputName, iBuilder->getSize(0));
138            outputBasePtr = iBuilder->CreatePointerCast(outputBasePtr, iBuilder->getInt64Ty()->getPointerTo());
139            Value* maskedPackIndex = iBuilder->CreateAnd(currentPackIndex, iBuilder->getSize(outputPackNum - 1));
140            iBuilder->CreateStore(iBuilder->getInt64(0), iBuilder->CreateGEP(outputBasePtr, maskedPackIndex));
141
142            currentPackIndex->addIncoming(iBuilder->CreateAdd(currentPackIndex, iBuilder->getSize(1)), clearBody);
143            iBuilder->CreateBr(clearCon);
144
145            iBuilder->SetInsertPoint(clearExit);
146        }
147        iBuilder->CreateBr(exitBlock);
148        iBuilder->SetInsertPoint(exitBlock);
149
150    }
151
152    void SequentialKernel::generateBuildIndexBits(const std::unique_ptr<KernelBuilder> &iBuilder) {
153//        iBuilder->CallPrintInt("entry", iBuilder->getSize(0));
154        BasicBlock* entryBlock = iBuilder->CreateBasicBlock("build_index_bits_entry");
155        BasicBlock* exitBlock = iBuilder->CreateBasicBlock("build_index_bits_exit");
156
157        iBuilder->CreateBr(entryBlock);
158
159        // Entry Block
160        iBuilder->SetInsertPoint(entryBlock);
161
162        for (auto iter = inputStreamIndexMap.begin(); iter != inputStreamIndexMap.end(); iter++) {
163            string streamName = iter->first;
164//            size_t indexArraySize = iter->second;
165
166            BasicBlock* indexUpdateEntryBlock = iBuilder->CreateBasicBlock(streamName + "_index_update_entry");
167            iBuilder->CreateBr(indexUpdateEntryBlock);
168
169            iBuilder->SetInsertPoint(indexUpdateEntryBlock);
170
171            Value* previousItemsAvailable = iBuilder->getScalarField(this->generateInputPreviousAvailableName(streamName));
172            Value* itemsTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(streamName), iBuilder->getProcessedItemCount(streamName));
173            iBuilder->setScalarField(this->generateInputPreviousAvailableName(streamName), itemsTotal);
174
175            size_t bufferSize = this->getInputBufferSize(iBuilder, streamName);
176            size_t indexBitsCount = bufferSize / 64;
177
178            Value* indexBitToBeUpdateStart = iBuilder->CreateLShr(previousItemsAvailable, std::log2(64));
179            Value* indexBitToBeUpdateEnd = iBuilder->CreateLShr(iBuilder->CreateAdd(itemsTotal, iBuilder->getSize(63)), std::log2(64));
180
181
182            BasicBlock* updateLoopCon = iBuilder->CreateBasicBlock(streamName + "_index_update_loop_con");
183            BasicBlock* updateLoopBody = iBuilder->CreateBasicBlock(streamName + "_index_update_loop_body");
184            BasicBlock* updateLoopFinal = iBuilder->CreateBasicBlock(streamName + "_index_update_loop_final");
185            BasicBlock* updateLoopExit = iBuilder->CreateBasicBlock(streamName + "_index_update_loop_exit");
186
187            iBuilder->CreateBr(updateLoopCon);
188
189
190            // Update Loop Con
191            iBuilder->SetInsertPoint(updateLoopCon);
192            PHINode* currentUpdateBitIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
193            currentUpdateBitIndex->addIncoming(indexBitToBeUpdateStart, indexUpdateEntryBlock);
194
195            iBuilder->CreateCondBr(
196                    iBuilder->CreateICmpULT(currentUpdateBitIndex, indexBitToBeUpdateEnd),
197                    updateLoopBody,
198                    updateLoopExit
199            );
200
201            // Update Loop Body
202            iBuilder->SetInsertPoint(updateLoopBody);
203            Value* bitIndex = iBuilder->CreateURem(currentUpdateBitIndex, iBuilder->getSize(indexBitsCount)); // TODO replace with and
204            Value* arrayIndex = iBuilder->CreateLShr(
205                    bitIndex,
206                    iBuilder->getSize(std::log2(64)));
207            Value* indexIndex = iBuilder->CreateAnd(bitIndex, iBuilder->getSize(63));
208
209            Value* inputStreamPtr = iBuilder->getRawInputPointer(streamName, iBuilder->getSize(0));
210            inputStreamPtr = iBuilder->CreatePointerCast(inputStreamPtr, iBuilder->getInt64Ty()->getPointerTo());
211
212            Value* targetInputValue = iBuilder->CreateLoad(iBuilder->CreateGEP(inputStreamPtr, bitIndex));
213
214            // handle bit 0 index
215            Value* index0OldValue = iBuilder->CreateExtractElement(
216                    iBuilder->getScalarField(this->generateInputZeroIndexName(streamName)),
217                    arrayIndex
218            );
219
220            Value* newBit0Value = iBuilder->CreateNot(
221                    iBuilder->CreateICmpEQ(
222                            targetInputValue,
223                            iBuilder->CreateNot(
224                                    iBuilder->getInt64(0x0)
225                            )
226                    )
227            );
228
229
230
231            newBit0Value = iBuilder->CreateZExt(newBit0Value, iBuilder->getInt64Ty());
232
233            Value* index0NewValue = index0OldValue;
234            index0NewValue = iBuilder->CreateAnd(
235                    index0NewValue,
236                    iBuilder->CreateNot(
237                            iBuilder->CreateShl(
238                                    iBuilder->getInt64(1),
239                                    indexIndex
240                            )
241                    )
242            );
243            index0NewValue = iBuilder->CreateOr(
244                    index0NewValue,
245                    iBuilder->CreateShl(
246                            newBit0Value,
247                            indexIndex
248                    )
249            );
250            iBuilder->setScalarField(
251                    this->generateInputZeroIndexName(streamName),
252                    iBuilder->CreateInsertElement(
253                            iBuilder->getScalarField(this->generateInputZeroIndexName(streamName)),
254                            index0NewValue,
255                            arrayIndex
256                    )
257            );
258
259
260            // handle bit 1 index
261
262            Value* index1OldValue = iBuilder->CreateExtractElement(
263                    iBuilder->getScalarField(this->generateInputOneIndexName(streamName)),
264                    arrayIndex
265            );
266
267            Value* newBit1Value = iBuilder->CreateNot(iBuilder->CreateICmpEQ(targetInputValue, iBuilder->getInt64(0)));
268            newBit1Value = iBuilder->CreateZExt(newBit1Value, iBuilder->getInt64Ty());
269
270            Value* index1NewValue = index1OldValue;
271            index1NewValue = iBuilder->CreateAnd(
272                    index1NewValue,
273                    iBuilder->CreateNot(
274                            iBuilder->CreateShl(
275                                    iBuilder->getInt64(1),
276                                    indexIndex
277                            )
278                    )
279            );
280            index1NewValue = iBuilder->CreateOr(
281                    index1NewValue,
282                    iBuilder->CreateShl(
283                            newBit1Value,
284                            indexIndex
285                    )
286            );
287
288            iBuilder->setScalarField(
289                    this->generateInputOneIndexName(streamName),
290                    iBuilder->CreateInsertElement(
291                            iBuilder->getScalarField(this->generateInputOneIndexName(streamName)),
292                            index1NewValue,
293                            arrayIndex
294                    )
295            );
296
297            iBuilder->CreateBr(updateLoopFinal);
298
299
300            // Update Loop Final
301            iBuilder->SetInsertPoint(updateLoopFinal);
302            currentUpdateBitIndex->addIncoming(iBuilder->CreateAdd(currentUpdateBitIndex, iBuilder->getSize(1)), updateLoopFinal);
303            iBuilder->CreateBr(updateLoopCon);
304
305            //Update Loop Exit
306            iBuilder->SetInsertPoint(updateLoopExit);
307
308        }
309
310        iBuilder->CreateBr(exitBlock);
311        iBuilder->SetInsertPoint(exitBlock);
312
313    }
314
315    void SequentialKernel::generateDoSequentialSegmentMethod(const std::unique_ptr<KernelBuilder> &iBuilder) {
316        // Will be override by subclass
317    }
318
319    // Initialize
320
321    // Index
322    void SequentialKernel::configIndexBits(const std::unique_ptr<KernelBuilder> &iBuilder, const std::map<std::string, size_t>& inputIndexMap) {
323        for (auto iter = inputIndexMap.begin(); iter != inputIndexMap.end(); iter++ ) {
324            string inputBufferName = iter->first;
325            size_t indexBitNum = iter->second; // blockSize = size / iBuilder->getStride()
326            size_t indexArraySize = ((indexBitNum * iBuilder->getStride() / 64 ) + 63) / 64;
327            inputStreamIndexMap.insert(make_pair(inputBufferName, indexArraySize));
328
329            this->addScalar(VectorType::get(iBuilder->getInt64Ty(), indexArraySize), generateInputZeroIndexName(inputBufferName));
330            this->addScalar(VectorType::get(iBuilder->getInt64Ty(), indexArraySize), generateInputOneIndexName(inputBufferName));
331            this->addScalar(iBuilder->getSizeTy(), generateInputPreviousAvailableName(inputBufferName));
332        }
333
334    }
335    inline string SequentialKernel::generateInputZeroIndexName(string inputStreamName) {
336        return "index_" + inputStreamName + "_zero_index";
337    }
338    inline string SequentialKernel::generateInputOneIndexName(string inputStreamName) {
339        return "index_" + inputStreamName + "_one_index";
340    }
341
342    inline string SequentialKernel::generateInputPreviousAvailableName(std::string inputStreamName) {
343        return "index_" + inputStreamName + "_previous_item_available";
344    }
345
346    // Cursor
347    std::string SequentialKernel::generateCursorFullname(std::string cursorName) {
348        return "Cursor_" + cursorName;
349    }
350    void SequentialKernel::initBufferCursor(const std::unique_ptr<KernelBuilder> &iBuilder, std::vector<std::string> cursorNames) {
351        for (std::string name : cursorNames) {
352            addScalar(iBuilder->getSizeTy(), this->generateCursorFullname(name));
353        }
354    }
355
356    Value* SequentialKernel::getCursorValue(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName) {
357        return iBuilder->getScalarField(this->generateCursorFullname(cursorName));
358    }
359
360    void SequentialKernel::setCursorValue(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, Value* value) {
361        iBuilder->setScalarField(this->generateCursorFullname(cursorName), value);
362    }
363
364    void SequentialKernel::advanceCursor(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, llvm::Value* nums) {
365        std::string fullname = this->generateCursorFullname(cursorName);
366        Value* cursorValue = iBuilder->getScalarField(fullname);
367        cursorValue = iBuilder->CreateAdd(cursorValue, nums);
368        iBuilder->setScalarField(fullname, cursorValue);
369    }
370
371    void SequentialKernel::advanceCursorUntilPos(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, llvm::Value* position) {
372        std::string fullname = this->generateCursorFullname(cursorName);
373        Value* cursorValue = iBuilder->getScalarField(fullname);
374        iBuilder->CreateAssert(iBuilder->CreateICmpSLE(cursorValue, position), cursorName + " Cursor can only move forward");
375        iBuilder->setScalarField(fullname, position);
376    }
377
378
379    // forwardBits, packEnd, exceedAvailable
380    std::pair<llvm::Value*, std::pair<llvm::Value*, llvm::Value*>> SequentialKernel::genereateCountForwardBitsOnePack(
381            const std::unique_ptr<KernelBuilder> &iBuilder,
382            std::string inputStreamBufferName,
383            llvm::Value* cursorValue,
384            bool isZero
385    ){
386        size_t bufferSize = this->getInputBufferSize(iBuilder, inputStreamBufferName);
387        Value* bufferOffsetMask = iBuilder->getSize(bufferSize - 1);
388
389        Value* actualBufferOffset = iBuilder->CreateAnd(bufferOffsetMask, cursorValue);
390
391        Value* packIndex = iBuilder->CreateLShr(actualBufferOffset, iBuilder->getSize(std::log2(64)));
392
393        Value* countStartBitIndex = iBuilder->CreateAnd(actualBufferOffset, iBuilder->getSize(64 - 1));
394
395        Value* inputStreamPtr = iBuilder->getInputStreamBlockPtr(inputStreamBufferName, iBuilder->getInt32(0));
396        inputStreamPtr = iBuilder->CreatePointerCast(inputStreamPtr, iBuilder->getInt64Ty()->getPointerTo());
397        Value* packData = iBuilder->CreateLoad(iBuilder->CreateGEP(inputStreamPtr, packIndex));
398
399
400
401        packData = iBuilder->CreateLShr(packData, countStartBitIndex);
402
403        if (!isZero) {
404            packData = iBuilder->CreateNot(packData);
405        }
406        Value* forwardZeroCount = iBuilder->CreateCountForwardZeroes(packData);
407
408
409
410        Value* isEndOfPack = iBuilder->CreateICmpUGE(iBuilder->CreateAdd(countStartBitIndex, forwardZeroCount), iBuilder->getSize(64));
411        forwardZeroCount = iBuilder->CreateSelect(
412                isEndOfPack,
413                iBuilder->CreateSub(iBuilder->getSize(64), countStartBitIndex),
414                forwardZeroCount
415        );
416
417        Value* newCursorValue = iBuilder->CreateAdd(cursorValue, forwardZeroCount);
418        Value* itemTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(inputStreamBufferName), iBuilder->getProcessedItemCount(inputStreamBufferName));
419
420        Value* isExceedAvailable = iBuilder->CreateICmpUGE(newCursorValue, itemTotal);
421
422        newCursorValue = iBuilder->CreateSelect(isExceedAvailable, itemTotal, newCursorValue);
423
424//        Value* isNotFinished = iBuilder->CreateOr(isEndOfPack, isExceedAvailable);
425//        Value* isFinished = iBuilder->CreateNot(isNotFinished);
426        return std::make_pair(iBuilder->CreateSub(newCursorValue, cursorValue), make_pair(isEndOfPack, isExceedAvailable));
427    };
428
429    // pair<forwardZeros, isFinished>
430    std::pair<llvm::Value*, llvm::Value*> SequentialKernel::generateCountForwardBits(
431            const std::unique_ptr<KernelBuilder> &iBuilder,
432            std::string inputStreamBufferName,
433            llvm::Value* cursorValue,
434            bool isZero,
435            llvm::Value* maxPos
436    ) {
437        BasicBlock* entryBlock = iBuilder->CreateBasicBlock("count_forward_bit_entry");
438        iBuilder->CreateBr(entryBlock);
439        iBuilder->SetInsertPoint(entryBlock);
440
441        BasicBlock* exitBlock = iBuilder->CreateBasicBlock("count_forward_bit_exit");
442
443
444        auto onePackResult = genereateCountForwardBitsOnePack(iBuilder, inputStreamBufferName, cursorValue, isZero);
445
446        Value* forwardCount = onePackResult.first;
447        Value* isEndOfPack = onePackResult.second.first;
448        Value* isExceedAvailable = onePackResult.second.second;
449        Value* newCursorValue = iBuilder->CreateAdd(cursorValue, forwardCount);
450
451        if (!hasIndexBits(inputStreamBufferName)) {
452            Value* isNotFinished = iBuilder->CreateOr(isEndOfPack, isExceedAvailable);
453            Value* isFinished = iBuilder->CreateNot(isNotFinished);
454
455            if (maxPos) {
456                Value* reachMaxPos = iBuilder->CreateICmpUGE(newCursorValue, maxPos);
457                isFinished = iBuilder->CreateSelect(
458                        reachMaxPos,
459                        iBuilder->getInt1(true),
460                        isFinished
461                );
462                newCursorValue = iBuilder->CreateSelect(
463                        reachMaxPos,
464                        maxPos,
465                        newCursorValue
466                );
467
468            }
469
470            iBuilder->CreateBr(exitBlock);
471            iBuilder->SetInsertPoint(exitBlock);
472
473            return std::make_pair(iBuilder->CreateSub(newCursorValue, cursorValue), isFinished);
474        } else {
475            BasicBlock* countIndexBitConBlock = iBuilder->CreateBasicBlock("count_forward_bit_count_index_con");
476            BasicBlock* countIndexBitBodyBlock = iBuilder->CreateBasicBlock("count_forward_bit_count_index_body");
477            BasicBlock* countFinalPackBlock = iBuilder->CreateBasicBlock("count_forward_bit_count_final_pack");
478
479            BasicBlock* beforeExitBlock = iBuilder->CreateBasicBlock("count_forward_bit_before_exit");
480            iBuilder->CreateBr(countIndexBitConBlock);
481
482            // beforeExitBlock
483            iBuilder->SetInsertPoint(beforeExitBlock);
484            PHINode* finalNewCursorValue = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
485            PHINode* isFinish = iBuilder->CreatePHI(iBuilder->getInt1Ty(), 3);
486
487
488            Value* retCursorValue = finalNewCursorValue;
489            Value* retIsFinish = isFinish;
490            if (maxPos) {
491                Value* exceedMaxPos = iBuilder->CreateICmpUGE(retCursorValue, maxPos);
492                retCursorValue = iBuilder->CreateSelect(exceedMaxPos, maxPos, retCursorValue);
493                retIsFinish = iBuilder->CreateSelect(exceedMaxPos, iBuilder->getInt1(true), retIsFinish);
494            }
495
496            iBuilder->CreateBr(exitBlock);
497
498
499            // countIndexBitConBlock
500            iBuilder->SetInsertPoint(countIndexBitConBlock);
501
502            // isEndOfPack && !isExceedAvailable
503            Value* shouldCountIndexBit = isEndOfPack;
504
505            if (maxPos) {
506                Value* reachMaxPos = iBuilder->CreateICmpUGE(newCursorValue, maxPos);
507                shouldCountIndexBit = iBuilder->CreateSelect(reachMaxPos, iBuilder->getInt1(false), shouldCountIndexBit);
508            }
509
510            finalNewCursorValue->addIncoming(newCursorValue, countIndexBitConBlock);
511            isFinish->addIncoming(iBuilder->CreateNot(shouldCountIndexBit), countIndexBitConBlock);
512
513            iBuilder->CreateCondBr(shouldCountIndexBit, countIndexBitBodyBlock, beforeExitBlock);
514
515            // countIndexBitBodyBlock
516            iBuilder->SetInsertPoint(countIndexBitBodyBlock);
517            Value* countBeginBitIndex = iBuilder->CreateLShr(newCursorValue, iBuilder->getSize(std::log2(64)));
518
519
520            Value* indexCount = this->generateCountIndexBit(iBuilder, inputStreamBufferName, !isZero, countBeginBitIndex);
521
522            newCursorValue = iBuilder->CreateAdd(
523                    newCursorValue,
524                    iBuilder->CreateShl(
525                            indexCount,
526                            std::log2(64)
527                    )
528            );
529
530            Value* itemsTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(inputStreamBufferName), iBuilder->getProcessedItemCount(inputStreamBufferName));
531            isExceedAvailable = iBuilder->CreateICmpUGE(newCursorValue, itemsTotal);
532            newCursorValue =  iBuilder->CreateSelect(
533                    isExceedAvailable,
534                    itemsTotal,
535                    newCursorValue
536            );
537            BasicBlock* countIndexBitBodyExitBlock = iBuilder->GetInsertBlock();
538
539            finalNewCursorValue->addIncoming(newCursorValue, countIndexBitBodyExitBlock);
540            isFinish->addIncoming(iBuilder->CreateNot(isExceedAvailable), countIndexBitBodyExitBlock);
541
542            iBuilder->CreateCondBr(
543                    isExceedAvailable,
544                    beforeExitBlock,
545                    countFinalPackBlock
546            );
547
548            // CountFinalPackBlock
549            iBuilder->SetInsertPoint(countFinalPackBlock);
550            auto onePackResult = genereateCountForwardBitsOnePack(iBuilder, inputStreamBufferName, newCursorValue, isZero);
551
552            forwardCount = onePackResult.first;
553            //isEndOfPack = onePackResult.second.first;  // should always be false
554            //isExceedAvailable = onePackResult.second.second; // should always be false
555            Value* finalCursorValue = iBuilder->CreateAdd(newCursorValue, forwardCount);
556
557            finalNewCursorValue->addIncoming(finalCursorValue, countFinalPackBlock);
558            isFinish->addIncoming(iBuilder->getInt1(true), countFinalPackBlock);
559
560            iBuilder->CreateBr(beforeExitBlock);
561
562            // exit block
563            iBuilder->SetInsertPoint(exitBlock);
564            return std::make_pair(iBuilder->CreateSub(retCursorValue, cursorValue), retIsFinish);
565        }
566
567    };
568
569    Value* SequentialKernel::generateCountIndexBit(const std::unique_ptr<KernelBuilder> &iBuilder, std::string streamName, bool isZero, llvm::Value* beginBitIndex) {
570        string indexBitScalarName = isZero? this->generateInputZeroIndexName(streamName) : this->generateInputOneIndexName(streamName);
571        BasicBlock* countIndexBitEntryBlock = iBuilder->CreateBasicBlock("count_index_bit_entry_block");
572
573        BasicBlock* countIndexBitConBlock = iBuilder->CreateBasicBlock("count_index_bit_con_block");
574        BasicBlock* countIndexBitBodyBlock = iBuilder->CreateBasicBlock("count_index_bit_body_block");
575//        BasicBlock* countIndexBitFinalBlock = iBuilder->CreateBasicBlock("count_index_bit_final_block");
576        BasicBlock* countIndexBitExitBlock = iBuilder->CreateBasicBlock("count_index_bit_exit_block");
577
578
579        iBuilder->CreateBr(countIndexBitEntryBlock);
580
581        // CountIndexBitEntry
582        iBuilder->SetInsertPoint(countIndexBitEntryBlock);
583        auto info = this->inputStreamIndexMap.find(streamName);
584        //TODO
585//        assert(( "index bit of " + streamName + " not exists") && (info != this->inputStreamIndexMap.end()));
586
587        Value* itemsTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(streamName), iBuilder->getProcessedItemCount(streamName));
588        Value* maxIndexBitCount = iBuilder->CreateLShr(
589                iBuilder->CreateAdd(itemsTotal, iBuilder->getSize(63)),
590                std::log2(64)
591        );
592
593        iBuilder->CreateBr(countIndexBitConBlock);
594        //Con Block
595        iBuilder->SetInsertPoint(countIndexBitConBlock);
596        PHINode* currentBitIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
597        currentBitIndex->addIncoming(beginBitIndex, countIndexBitEntryBlock);
598        iBuilder->CreateCondBr(
599                iBuilder->CreateICmpULT(
600                        currentBitIndex,
601                        maxIndexBitCount
602                ),
603                countIndexBitBodyBlock,
604                countIndexBitExitBlock
605        );
606
607        // Body Block
608        iBuilder->SetInsertPoint(countIndexBitBodyBlock);
609
610
611        Value* countArrayIndex = iBuilder->CreateLShr(
612                iBuilder->CreateAnd(
613                        currentBitIndex,
614                        iBuilder->getSize(this->getInputBufferSize(iBuilder, streamName) / 64 - 1)
615                ),
616                iBuilder->getSize(std::log2(64))
617        );
618        Value* countStartBitIndex = iBuilder->CreateAnd(currentBitIndex, iBuilder->getSize(63));
619
620        Value* packData = iBuilder->CreateExtractElement(
621                iBuilder->getScalarField(indexBitScalarName),
622                countArrayIndex
623        );
624
625        packData = iBuilder->CreateSelect(
626                iBuilder->CreateICmpEQ(countStartBitIndex, iBuilder->getSize(0)),
627                packData,
628                iBuilder->CreateLShr(packData, countStartBitIndex)
629        );
630
631        Value* forwardZeroCount = iBuilder->CreateCountForwardZeroes(packData);
632
633        Value* isEndOfPack = iBuilder->CreateICmpUGE(iBuilder->CreateAdd(countStartBitIndex, forwardZeroCount), iBuilder->getSize(64));
634        forwardZeroCount = iBuilder->CreateSelect(
635                isEndOfPack,
636                iBuilder->CreateSub(iBuilder->getSize(64), countStartBitIndex),
637                forwardZeroCount
638        );
639
640
641        Value* newBitIndex = iBuilder->CreateAdd(currentBitIndex, forwardZeroCount);
642        currentBitIndex->addIncoming(newBitIndex, countIndexBitBodyBlock);
643
644        iBuilder->CreateCondBr(
645                isEndOfPack,
646                countIndexBitConBlock,
647                countIndexBitExitBlock
648        );
649
650
651        //Exit Block
652        iBuilder->SetInsertPoint(countIndexBitExitBlock);
653        PHINode* finalBitIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
654        finalBitIndex->addIncoming(currentBitIndex, countIndexBitConBlock);
655        finalBitIndex->addIncoming(newBitIndex, countIndexBitBodyBlock);
656
657        return iBuilder->CreateSub(finalBitIndex, beginBitIndex);
658    }
659
660    std::pair<llvm::Value*, llvm::Value*> SequentialKernel::generateCountForwardOnes(const unique_ptr<KernelBuilder> &iBuilder, string inputStreamBufferName, Value* beginOffset, Value* maxPos) {
661        return this->generateCountForwardBits(iBuilder, inputStreamBufferName, beginOffset, false, maxPos);
662    };
663
664    std::pair<llvm::Value*, llvm::Value*> SequentialKernel::generateCountForwardZeros(const unique_ptr<KernelBuilder> &iBuilder, string inputStreamBufferName, Value* beginOffset, Value* maxPos) {
665        return this->generateCountForwardBits(iBuilder, inputStreamBufferName, beginOffset, true, maxPos);
666    }
667
668
669    BasicBlock* SequentialKernel::advanceCursorUntilNextOne(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, std::string inputStreamBufferName, Value* maxPos) {
670        BasicBlock* entryBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_one_entry");
671
672        this->recordCountForwardTempMaxPos(iBuilder, maxPos);
673
674        iBuilder->CreateBr(entryBlock);
675        iBuilder->SetInsertPoint(entryBlock);
676
677
678        // StateIndex will be increased in waitCursorUntilInputAvailable
679        this->waitCursorUntilInputAvailable(iBuilder, cursorName, inputStreamBufferName);
680
681        BasicBlock* countForwareZeroBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_one_count_block");
682        BasicBlock* exitBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_one_exit_block");
683
684        iBuilder->CreateBr(countForwareZeroBlock);
685        iBuilder->SetInsertPoint(countForwareZeroBlock);
686
687        Value* cursorValue = this->getCursorValue(iBuilder, cursorName);
688
689        maxPos = this->restoreCountForwardTempMaxPos(iBuilder, maxPos);
690
691        auto retValue = this->generateCountForwardZeros(iBuilder, inputStreamBufferName, cursorValue, maxPos);
692
693        cursorValue = iBuilder->CreateAdd(cursorValue, retValue.first);
694        Value* isFinished = retValue.second;
695
696
697        //TODO Add additional handle for isFinish (is isFinish === false, the next pack will always start from index 0), avoid using waitCursorUntilInputAvailable in the second loop
698
699        this->setCursorValue(iBuilder, cursorName, cursorValue);
700
701        iBuilder->CreateCondBr(isFinished, exitBlock, entryBlock);
702        //TODO add index bits for count forward zeros and ones
703        iBuilder->SetInsertPoint(exitBlock);
704        return exitBlock;
705    }
706
707
708    BasicBlock* SequentialKernel::advanceCursorUntilNextZero(
709            const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, std::string inputStreamBufferName, Value* maxPos) {
710        BasicBlock* entryBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_zero_entry");
711
712        this->recordCountForwardTempMaxPos(iBuilder, maxPos);
713
714        iBuilder->CreateBr(entryBlock);
715        iBuilder->SetInsertPoint(entryBlock);
716
717        this->waitCursorUntilInputAvailable(iBuilder, cursorName, inputStreamBufferName);
718
719        BasicBlock* countForwareOneBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_zero_count_block");
720        BasicBlock* exitBlock = iBuilder->CreateBasicBlock("advance_cursor_until_next_zero_exit_block");
721
722        iBuilder->CreateBr(countForwareOneBlock);
723        iBuilder->SetInsertPoint(countForwareOneBlock);
724
725        Value* cursorValue = this->getCursorValue(iBuilder, cursorName);
726
727        maxPos = this->restoreCountForwardTempMaxPos(iBuilder, maxPos);
728
729        auto retValue = this->generateCountForwardOnes(iBuilder, inputStreamBufferName, cursorValue, maxPos);
730
731        this->advanceCursor(iBuilder, cursorName, retValue.first);
732
733        Value* isFinished = retValue.second;
734
735        iBuilder->CreateCondBr(isFinished, exitBlock, entryBlock);
736        //TODO add index bits for count forward zeros and ones
737        iBuilder->SetInsertPoint(exitBlock);
738        return exitBlock;
739    }
740
741    void SequentialKernel::memcpyCircularBuffer(
742            const std::unique_ptr<KernelBuilder> &iBuilder,
743            string sourceBufferName,
744            llvm::Value* sourceOffset,
745            string dstBufferName,
746            llvm::Value* outputOffset,
747            llvm::Value* distance
748    ) {
749
750        size_t outputBufferSize = this->getOutputBufferSize(iBuilder, dstBufferName);
751        Value* outputBufferSizeValue = iBuilder->getSize(outputBufferSize);
752        Value* outputBufferSizeMask = iBuilder->getSize(outputBufferSize - 1);
753        Value* maskedOutputOffset = iBuilder->CreateAnd(outputOffset, outputBufferSizeMask);
754        Value* remainBuffer = iBuilder->CreateSub(outputBufferSizeValue, maskedOutputOffset);
755        Value* copyLength1 = iBuilder->CreateSelect(iBuilder->CreateICmpUGE(remainBuffer, distance), distance, remainBuffer);
756        Value* copyLength2 = iBuilder->CreateSub(distance, copyLength1);
757
758
759        Value* inputBufferBasePtr = iBuilder->getRawInputPointer(sourceBufferName, iBuilder->getSize(0));
760        Value* outputBufferBasePtr = iBuilder->getRawOutputPointer(dstBufferName, iBuilder->getSize(0));
761
762        iBuilder->CreateMemCpy(
763                iBuilder->CreateGEP(outputBufferBasePtr, maskedOutputOffset),
764                iBuilder->CreateGEP(inputBufferBasePtr, sourceOffset),
765                copyLength1,
766                1); // no alignment guaranteed
767        // Assumed output buffer is Circular buffer
768        iBuilder->CreateMemCpy(
769                outputBufferBasePtr,
770                iBuilder->CreateGEP(inputBufferBasePtr, iBuilder->CreateAdd(sourceOffset, copyLength1)),
771                copyLength2,
772                8
773        );
774        iBuilder->setProducedItemCount(dstBufferName, iBuilder->CreateAdd(outputOffset, distance));
775    }
776
777    BasicBlock* SequentialKernel::memcpy2CursorsUntilNextZero(
778            const std::unique_ptr<KernelBuilder> &iBuilder,
779            string sourceBufferName,
780            string sourceCursorName,
781            string dstBufferName,
782            string dstCursorName,
783            string sourceMarkerName,
784            Value* maxPos
785    ) {
786        BasicBlock* previousEntryBlock = iBuilder->GetInsertBlock();
787
788        BasicBlock* entryBlock = iBuilder->CreateBasicBlock("memcpy_2_cursors_until_next_zero_entry");
789        this->recordCountForwardTempMaxPos(iBuilder, maxPos);
790
791        iBuilder->CreateBr(entryBlock);
792        iBuilder->SetInsertPoint(entryBlock);
793
794        this->waitCursorUntilInputAvailable(iBuilder, sourceCursorName, sourceMarkerName);
795
796        BasicBlock* bodyBlock = iBuilder->CreateBasicBlock("memcpy_2_cursors_until_next_zero_body");
797        BasicBlock* exitBlock = iBuilder->CreateBasicBlock("memcpy_2_cursors_until_next_zero_exit");
798
799        iBuilder->CreateBr(bodyBlock);
800        iBuilder->SetInsertPoint(bodyBlock);
801
802        // Count Forward Zero in this pack
803        Value* sourceCursorValue = this->getCursorValue(iBuilder, sourceCursorName);
804
805        maxPos = this->restoreCountForwardTempMaxPos(iBuilder, maxPos);
806        auto retValue = this->generateCountForwardOnes(iBuilder, sourceMarkerName, sourceCursorValue, maxPos);
807        Value* distance = retValue.first;
808
809        // Memcpy from sourceBuffer[sourceCursor : sourceCursor + distance] to dstBuffer[dstCursor : dstCursor + distance]
810        Value* inputBufferBasePtr = iBuilder->getRawInputPointer(sourceBufferName, iBuilder->getSize(0));
811        Value* outputBufferBasePtr = iBuilder->getRawOutputPointer(dstBufferName, iBuilder->getSize(0));
812
813        Value* outputOffset = this->getCursorValue(iBuilder, dstCursorName);
814        size_t outputBufferSize = this->getOutputBufferSize(iBuilder, dstBufferName);
815        Value* outputBufferSizeValue = iBuilder->getSize(outputBufferSize);
816        Value* outputBufferSizeMask = iBuilder->getSize(outputBufferSize - 1);
817        Value* maskedOutputOffset = iBuilder->CreateAnd(outputOffset, outputBufferSizeMask);
818        Value* remainBuffer = iBuilder->CreateSub(outputBufferSizeValue, maskedOutputOffset);
819        Value* copyLength1 = iBuilder->CreateSelect(iBuilder->CreateICmpUGE(remainBuffer, distance), distance, remainBuffer);
820        Value* copyLength2 = iBuilder->CreateSub(distance, copyLength1);
821
822        iBuilder->CreateMemCpy(
823                iBuilder->CreateGEP(outputBufferBasePtr, maskedOutputOffset),
824                iBuilder->CreateGEP(inputBufferBasePtr, sourceCursorValue),
825                copyLength1,
826                1); // no alignment guaranteed
827        // Assumed output buffer is Circular buffer
828        iBuilder->CreateMemCpy(
829                outputBufferBasePtr,
830                iBuilder->CreateGEP(inputBufferBasePtr, iBuilder->CreateAdd(sourceCursorValue, copyLength1)),
831                copyLength2,
832                8
833        );
834
835        // Update cursor value and producedItemCount
836        this->advanceCursor(iBuilder, sourceCursorName, distance);
837        this->advanceCursor(iBuilder, dstCursorName, distance);
838        iBuilder->setProducedItemCount(dstBufferName, this->getCursorValue(iBuilder, dstCursorName));
839
840        // Finish
841        Value* isFinished = retValue.second;
842        iBuilder->CreateCondBr(isFinished, exitBlock, entryBlock);
843        //TODO should not use index bits for count forward zeros in this case
844        iBuilder->SetInsertPoint(exitBlock);
845        return exitBlock;
846    }
847
848    BasicBlock* SequentialKernel::memcpyOutputDst(
849            const unique_ptr<KernelBuilder> &iBuilder,
850            string outputBufferName,
851            Value* copyOffset,
852            Value* copyLength
853
854    ) {
855        Value* distance = copyLength;
856
857        BasicBlock* matchCopyEntryBlock = iBuilder->CreateBasicBlock("memcpy_output_dst_cursor_until_next_zero_matchcpy_entry");
858        BasicBlock* matchCopyExitBlock = iBuilder->CreateBasicBlock("memcpy_output_dst_cursor_until_next_zero_matchcpy_exit");
859
860        Value* outputOffset = iBuilder->getProducedItemCount(outputBufferName);
861
862        iBuilder->CreateBr(matchCopyEntryBlock);
863
864        iBuilder->SetInsertPoint(matchCopyEntryBlock);
865        this->generateDstMatchCopy(iBuilder, matchCopyEntryBlock, matchCopyExitBlock, outputBufferName, copyOffset, distance, outputOffset);
866
867        iBuilder->SetInsertPoint(matchCopyExitBlock);
868        // Update Cursor Value and producedItemCount
869        iBuilder->setProducedItemCount(outputBufferName, iBuilder->CreateAdd(outputOffset, copyLength));
870
871        return matchCopyExitBlock;
872    }
873
874    llvm::BasicBlock* SequentialKernel::memcpyOutputDstCursorUntilNextZero(
875            const std::unique_ptr<KernelBuilder> &iBuilder,
876            std::string outputBufferName,
877            llvm::Value* copyOffset,
878            std::string dstCursorName,
879            std::string dstMarkerName,
880            llvm::Value* maxPos
881    ) {
882        iBuilder->setScalarField(MemCpyUntilZeroCopyOffsetTempKey, copyOffset);
883        this->recordCountForwardTempMaxPos(iBuilder, maxPos);
884
885        BasicBlock* entryBlock = iBuilder->CreateBasicBlock("memcpy_ooutput_dst_cursor_until_next_zero_entry");
886        iBuilder->CreateBr(entryBlock);
887        iBuilder->SetInsertPoint(entryBlock);
888
889        this->waitCursorUntilInputAvailable(iBuilder, dstCursorName, dstMarkerName);
890
891        BasicBlock* bodyBlock = iBuilder->CreateBasicBlock("memcpy_output_dst_cursor_until_next_zero_body");
892        BasicBlock* exitBlock = iBuilder->CreateBasicBlock("memcpy_output_dst_cursor_until_next_zero_exit");
893
894        iBuilder->CreateBr(bodyBlock);
895        iBuilder->SetInsertPoint(bodyBlock);
896
897        // Count Forward Zero in this pack
898        Value* cursorValue = this->getCursorValue(iBuilder, dstCursorName);
899        maxPos = this->restoreCountForwardTempMaxPos(iBuilder, maxPos);
900        auto retValue = this->generateCountForwardOnes(iBuilder, dstMarkerName, cursorValue, maxPos);
901        Value* distance = retValue.first;
902
903        // Memcpy from outputBuffer[cursorValue - copyOffset : cursorValue - copyOffset + distance] to outputBuffer[cursorValue : cursorValue + distance]
904        BasicBlock* matchCopyEntryBlock = iBuilder->CreateBasicBlock("memcpy_output_dst_cursor_until_next_zero_matchcpy_entry");
905        BasicBlock* matchCopyExitBlock = iBuilder->CreateBasicBlock("memcpy_output_dst_cursor_until_next_zero_matchcpy_exit");
906        Value* outputOffset = this->getCursorValue(iBuilder, dstCursorName);
907
908        iBuilder->CreateBr(matchCopyEntryBlock);
909
910        iBuilder->SetInsertPoint(matchCopyEntryBlock);
911        copyOffset = iBuilder->getScalarField(MemCpyUntilZeroCopyOffsetTempKey);
912        this->generateDstMatchCopy(iBuilder, matchCopyEntryBlock, matchCopyExitBlock, outputBufferName, copyOffset, distance, outputOffset);
913
914        iBuilder->SetInsertPoint(matchCopyExitBlock);
915        // Update Cursor Value and producedItemCount
916        this->advanceCursor(iBuilder, dstCursorName, distance);
917        iBuilder->setProducedItemCount(outputBufferName, this->getCursorValue(iBuilder, dstCursorName));
918
919        // Finish
920        Value* isFinished = retValue.second;
921        iBuilder->CreateCondBr(isFinished, exitBlock, entryBlock);
922
923
924        iBuilder->SetInsertPoint(exitBlock);
925
926        return exitBlock;
927    }
928
929    void SequentialKernel::generateDstMatchCopy(const std::unique_ptr<KernelBuilder> & iBuilder, BasicBlock* entry, BasicBlock* exit, string outputBufferName, Value* matchOffset, Value* matchLength, Value* outputOffset) {
930        iBuilder->SetInsertPoint(entry);
931
932        Value * outputBufferBasePtr = iBuilder->getRawOutputPointer(outputBufferName, iBuilder->getSize(0));
933
934        Value* bufferSize = iBuilder->getSize(this->getOutputBufferSize(iBuilder, outputBufferName));
935        Value* bufferSizeMask = iBuilder->CreateSub(bufferSize, iBuilder->getSize(1));
936
937
938        Value* matchStart = iBuilder->CreateSub(outputOffset, matchOffset);
939        Value * baseSrcOffset = iBuilder->CreateAnd(matchStart, bufferSizeMask);
940        Value * baseDstOffset = iBuilder->CreateAnd(outputOffset, bufferSizeMask);
941
942
943        Value * copyStep = iBuilder->CreateSelect(
944                iBuilder->CreateICmpULT(matchOffset, iBuilder->getSize(4)),
945                iBuilder->getSize(1),
946                iBuilder->getSize(4)
947        );
948
949
950        BasicBlock * cpyLoopCond = iBuilder->CreateBasicBlock("matchcopy_loop_cond");
951        BasicBlock * cpyLoopBody = iBuilder->CreateBasicBlock("matchcopy_loop_body");
952        BasicBlock * cpyLoopExit = iBuilder->CreateBasicBlock("matchcopy_loop_exit");
953
954
955        iBuilder->CreateBr(cpyLoopCond);
956
957        iBuilder->SetInsertPoint(cpyLoopCond);
958
959        PHINode * phiSrcOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
960        PHINode * phiDstOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
961        PHINode * phiIter = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3);
962        phiSrcOffset->addIncoming(baseSrcOffset, entry);
963        phiDstOffset->addIncoming(baseDstOffset, entry);
964        phiIter->addIncoming(iBuilder->getSize(0), entry);
965
966        iBuilder->CreateCondBr(
967                iBuilder->CreateICmpUGE(phiIter, matchLength),
968                cpyLoopExit,
969                cpyLoopBody
970        );
971
972        iBuilder->SetInsertPoint(cpyLoopBody);
973        BasicBlock * reachingBufferEnd_then = iBuilder->CreateBasicBlock("matchcopy_reaching_buf_end_then");
974        BasicBlock * reachingBufferEnd_else = iBuilder->CreateBasicBlock("matchcopy_reaching_buf_end_else");
975
976
977        Value * distSrcEnd = iBuilder->CreateSub(bufferSize, phiSrcOffset);
978        Value * distDstEnd = iBuilder->CreateSub(bufferSize, phiDstOffset);
979        Value * minDist = iBuilder->CreateSelect(iBuilder->CreateICmpULT(distSrcEnd, distDstEnd), distSrcEnd, distDstEnd);
980        iBuilder->CreateUnlikelyCondBr(
981                iBuilder->CreateICmpULE(minDist, iBuilder->getSize(4)),
982                reachingBufferEnd_then,
983                reachingBufferEnd_else
984        );
985
986        iBuilder->SetInsertPoint(reachingBufferEnd_then);
987
988        Value * src8 = iBuilder->CreateGEP(outputBufferBasePtr, phiSrcOffset);
989        Value * dst8 = iBuilder->CreateGEP(outputBufferBasePtr, phiDstOffset);
990        iBuilder->CreateStore(iBuilder->CreateLoad(src8), dst8);
991        Value * newSrcOffset = iBuilder->CreateAnd(
992                iBuilder->CreateAdd(phiSrcOffset, iBuilder->getSize(1)),
993                bufferSizeMask
994        );
995        Value * newDstOffset = iBuilder->CreateAnd(
996                iBuilder->CreateAdd(phiDstOffset, iBuilder->getSize(1)),
997                bufferSizeMask
998        );
999        phiSrcOffset->addIncoming(newSrcOffset, reachingBufferEnd_then);
1000        phiDstOffset->addIncoming(newDstOffset, reachingBufferEnd_then);
1001        phiIter->addIncoming(iBuilder->CreateAdd(phiIter, iBuilder->getSize(1)), reachingBufferEnd_then);
1002        iBuilder->CreateBr(cpyLoopCond);
1003
1004
1005        iBuilder->SetInsertPoint(reachingBufferEnd_else);
1006        // Copy 4 bytes at a time (regardless of step length).
1007        Value * src32 = iBuilder->CreatePointerCast(
1008                iBuilder->CreateGEP(outputBufferBasePtr, phiSrcOffset),
1009                iBuilder->getInt32Ty()->getPointerTo());
1010        Value * dst32 = iBuilder->CreatePointerCast(
1011                iBuilder->CreateGEP(outputBufferBasePtr, phiDstOffset),
1012                iBuilder->getInt32Ty()->getPointerTo());
1013        // Force unaligned load/store of an int32.
1014        iBuilder->CreateAlignedStore(iBuilder->CreateAlignedLoad(src32, 1), dst32, 1);
1015        newSrcOffset = iBuilder->CreateAnd(
1016                iBuilder->CreateAdd(phiSrcOffset, copyStep),
1017                bufferSizeMask
1018        );
1019        newDstOffset = iBuilder->CreateAnd(
1020                iBuilder->CreateAdd(phiDstOffset, copyStep),
1021                bufferSizeMask
1022        );
1023        phiSrcOffset->addIncoming(newSrcOffset, reachingBufferEnd_else);
1024        phiDstOffset->addIncoming(newDstOffset, reachingBufferEnd_else);
1025        phiIter->addIncoming(iBuilder->CreateAdd(phiIter, copyStep), reachingBufferEnd_else);
1026        iBuilder->CreateBr(cpyLoopCond);
1027
1028        iBuilder->SetInsertPoint(cpyLoopExit);
1029        outputOffset = iBuilder->CreateAdd(outputOffset, matchLength);
1030
1031        iBuilder->CreateBr(exit);
1032    }
1033
1034
1035    BasicBlock* SequentialKernel::waitCursorUntilInputAvailable(const std::unique_ptr<KernelBuilder> &iBuilder, std::string cursorName, std::string inputStreamBufferName) {
1036//        BasicBlock* entryBlock = iBuilder->GetInsertBlock();
1037        Value* nextStateValue = iBuilder->getSize(this->stateBlocks.size());
1038
1039        BasicBlock* restoreBlock = iBuilder->CreateBasicBlock("wait_cursor_until_input_available_restore");
1040        BasicBlock* continueBlock = iBuilder->CreateBasicBlock("wait_cursor_until_input_available_continue");
1041
1042        this->stateBlocks.push_back(restoreBlock);
1043
1044        iBuilder->CreateBr(restoreBlock);
1045
1046        iBuilder->SetInsertPoint(restoreBlock);
1047
1048        Value* cursorValue = this->getCursorValue(iBuilder, cursorName);
1049        Value* itemTotal = iBuilder->CreateAdd(iBuilder->getAvailableItemCount(inputStreamBufferName), iBuilder->getProcessedItemCount(inputStreamBufferName));
1050        Value* isAvailable = iBuilder->CreateICmpULT(cursorValue, itemTotal);
1051
1052        Value* nextState = iBuilder->CreateSelect(isAvailable, iBuilder->getSize(0), nextStateValue);
1053        iBuilder->setScalarField(SequentialSegmentStateKey, nextState);
1054
1055        iBuilder->CreateCondBr(isAvailable, continueBlock, this->exitBlock);
1056
1057        iBuilder->SetInsertPoint(continueBlock);
1058
1059        return continueBlock;
1060
1061    }
1062
1063    size_t SequentialKernel::getInputBufferSize(const unique_ptr<KernelBuilder> &iBuilder, string bufferName) {
1064        //TODO codegen::BlockSize == iBuilder->getStride() ?
1065        return this->getInputStreamSetBuffer(bufferName)->getBufferBlocks() * iBuilder->getStride();
1066    }
1067
1068    size_t SequentialKernel::getOutputBufferSize(const unique_ptr<KernelBuilder> &iBuilder, string bufferName) {
1069        return this->getOutputStreamSetBuffer(bufferName)->getBufferBlocks() * iBuilder->getStride();
1070    }
1071
1072    Value* SequentialKernel::offsetToPackBaseOffset(const unique_ptr<KernelBuilder> &iBuilder, Value* offset) {
1073        return iBuilder->CreateShl(
1074                this->offsetToPackIndex(iBuilder, offset),
1075                iBuilder->getSize(std::log2(64))
1076        );
1077    }
1078    Value* SequentialKernel::offsetToPackIndex(const unique_ptr<KernelBuilder> &iBuilder, Value* offset) {
1079        return iBuilder->CreateLShr(offset, iBuilder->getSize(std::log2(64)));
1080    }
1081
1082    Value* SequentialKernel::offsetToPackOffset(const unique_ptr<KernelBuilder> &iBuilder, Value* offset) {
1083        return iBuilder->CreateAnd(offset, iBuilder->getSize(64 - 1));
1084    }
1085
1086    Value* SequentialKernel::offsetToActualBufferOffset(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName, Value* offset) {
1087        size_t bufferSize = this->getInputBufferSize(iBuilder, inputBufferName);
1088        Value* bufferOffsetMask = iBuilder->getSize(bufferSize - 1);
1089        return iBuilder->CreateAnd(bufferOffsetMask, offset);
1090    }
1091
1092    Value* SequentialKernel::generateLoadCircularInputPack(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName, Value* offset) {
1093        Value* actualBufferOffset = this->offsetToActualBufferOffset(iBuilder, inputBufferName, offset);
1094        Value* packIndex = this->offsetToPackIndex(iBuilder, actualBufferOffset);
1095//        Value* countStartBitIndex = this->offsetToPackOffset(iBuilder, actualBufferOffset);
1096
1097
1098        Value* inputStreamPtr = iBuilder->getInputStreamBlockPtr(inputBufferName, iBuilder->getInt32(0));
1099        inputStreamPtr = iBuilder->CreatePointerCast(inputStreamPtr, iBuilder->getInt64Ty()->getPointerTo());
1100        return iBuilder->CreateLoad(iBuilder->CreateGEP(inputStreamPtr, packIndex));
1101
1102//        packData = iBuilder->CreateLShr(packData, countStartBitIndex);
1103
1104    }
1105
1106    Value* SequentialKernel::generateLoadCircularInput(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName, Value* offset, Type* pointerType) {
1107        size_t inputSize = this->getInputBufferSize(iBuilder, inputBufferName);
1108        Value* offsetMask = iBuilder->getSize(inputSize - 1);
1109        Value* maskedOffset = iBuilder->CreateAnd(offsetMask, offset);
1110
1111        Value* inputBufferPtr = iBuilder->getRawInputPointer(inputBufferName, iBuilder->getSize(0));
1112
1113        inputBufferPtr = iBuilder->CreatePointerCast(inputBufferPtr, pointerType);
1114        return iBuilder->CreateLoad(iBuilder->CreateGEP(inputBufferPtr, maskedOffset));
1115    }
1116    Value* SequentialKernel::generateLoadCircularOutput(const unique_ptr<KernelBuilder> &iBuilder, string inputBufferName, Value* offset, Type* pointerType) {
1117        size_t inputSize = this->getOutputBufferSize(iBuilder, inputBufferName);
1118        Value* offsetMask = iBuilder->getSize(inputSize - 1);
1119        Value* maskedOffset = iBuilder->CreateAnd(offsetMask, offset);
1120
1121        Value* inputBufferPtr = iBuilder->getRawOutputPointer(inputBufferName, iBuilder->getSize(0));
1122
1123        inputBufferPtr = iBuilder->CreatePointerCast(inputBufferPtr, pointerType);
1124        return iBuilder->CreateLoad(iBuilder->CreateGEP(inputBufferPtr, maskedOffset));
1125    }
1126
1127    Value* SequentialKernel::generateLoadSourceInputByte(const std::unique_ptr<KernelBuilder> &iBuilder, string sourceBufferName, Value* offset) {
1128        Value * blockStartPtr = iBuilder->CreatePointerCast(
1129                iBuilder->getInputStreamBlockPtr(sourceBufferName, iBuilder->getInt32(0)),
1130                iBuilder->getInt8PtrTy()
1131        );
1132        Value * ptr = iBuilder->CreateGEP(blockStartPtr, offset);
1133
1134
1135        return iBuilder->CreateLoad(ptr);
1136    }
1137
1138
1139    void SequentialKernel::generateStoreCircularOutput(const unique_ptr<KernelBuilder> &iBuilder, string outputBufferName, Type* pointerType, Value* value) {
1140        Value* offset = iBuilder->getProducedItemCount(outputBufferName);
1141
1142        size_t inputSize = this->getOutputBufferSize(iBuilder, outputBufferName);
1143        Value* offsetMask = iBuilder->getSize(inputSize - 1);
1144        Value* maskedOffset = iBuilder->CreateAnd(offsetMask, offset);
1145
1146        Value* outputBufferPtr = iBuilder->getRawOutputPointer(outputBufferName, iBuilder->getSize(0));
1147
1148        outputBufferPtr = iBuilder->CreatePointerCast(outputBufferPtr, pointerType);
1149        iBuilder->CreateStore(value, iBuilder->CreateGEP(outputBufferPtr, maskedOffset));
1150
1151        offset = iBuilder->CreateAdd(offset, iBuilder->getSize(1));
1152        iBuilder->setProducedItemCount(outputBufferName, offset);
1153    }
1154
1155    void SequentialKernel::increaseScalarField(const std::unique_ptr<KernelBuilder> &iBuilder, const std::string& fieldName, llvm::Value* value) {
1156        Value* fieldValue = iBuilder->getScalarField(fieldName);
1157        fieldValue = iBuilder->CreateAdd(fieldValue, value);
1158        iBuilder->setScalarField(fieldName, fieldValue);
1159    }
1160
1161
1162    void SequentialKernel::markCircularOutputBitstreamOnePack(const std::unique_ptr<KernelBuilder> &iBuilder, const std::string& bitstreamName, llvm::Value* start, llvm::Value* end, bool isOne) {
1163        Value* outputBasePtr = iBuilder->getRawOutputPointer(bitstreamName, iBuilder->getSize(0));
1164
1165        outputBasePtr = iBuilder->CreatePointerCast(outputBasePtr, iBuilder->getInt64Ty()->getPointerTo());
1166
1167        size_t outputBufferSize = this->getOutputBufferSize(iBuilder, bitstreamName);
1168        Value* outputMask = iBuilder->getSize(outputBufferSize / 64 - 1);
1169
1170
1171        Value* startOffset = iBuilder->CreateLShr(start, iBuilder->getSize(std::log2(64)), "startOffset");
1172        Value* curOffset = startOffset;
1173
1174
1175        Value* outputLowestBitValue = iBuilder->CreateSelect(
1176                iBuilder->CreateICmpULE(
1177                        iBuilder->CreateShl(curOffset, std::log2(64)),
1178                        start
1179                ),
1180                iBuilder->CreateShl(iBuilder->getSize(1), iBuilder->CreateAnd(start, iBuilder->getSize(64 - 1))),
1181                iBuilder->getSize(1)
1182        );
1183
1184        Value* outputHighestBitValue = iBuilder->CreateShl(
1185                iBuilder->getSize(1),
1186                iBuilder->CreateAnd(end, iBuilder->getSize(64 - 1))
1187        );
1188
1189
1190        Value* bitMask = iBuilder->CreateSub(
1191                outputHighestBitValue,
1192                outputLowestBitValue
1193        );
1194
1195        if (!isOne) {
1196            bitMask = iBuilder->CreateNot(bitMask);
1197        }
1198    }
1199
1200    // Assume we have enough output buffer
1201    llvm::BasicBlock* SequentialKernel::markCircularOutputBitstream(const std::unique_ptr<KernelBuilder> &iBuilder, const std::string& bitstreamName, llvm::Value* start, llvm::Value* end, bool isOne, bool setProduced) {
1202        BasicBlock* entryBlock = iBuilder->GetInsertBlock();
1203
1204        Value* outputBasePtr = iBuilder->getRawOutputPointer(bitstreamName, iBuilder->getSize(0));
1205
1206        outputBasePtr = iBuilder->CreatePointerCast(outputBasePtr, iBuilder->getInt64Ty()->getPointerTo());
1207
1208        size_t outputBufferSize = this->getOutputBufferSize(iBuilder, bitstreamName);
1209        Value* outputMask = iBuilder->getSize(outputBufferSize / 64 - 1);
1210
1211        BasicBlock* conBlock = iBuilder->CreateBasicBlock("mark_bit_one_con");
1212        BasicBlock* bodyBlock =iBuilder->CreateBasicBlock("mark_bit_one_body");
1213        BasicBlock* exitBlock =iBuilder->CreateBasicBlock("mark_bit_one_exit");
1214
1215        Value* startOffset = iBuilder->CreateLShr(start, iBuilder->getSize(std::log2(64)), "startOffset");
1216
1217        iBuilder->CreateBr(conBlock);
1218
1219        // Con
1220        iBuilder->SetInsertPoint(conBlock);
1221
1222
1223        PHINode* curOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
1224        curOffset->addIncoming(startOffset, entryBlock);
1225
1226        iBuilder->CreateCondBr(
1227                iBuilder->CreateICmpULT(iBuilder->CreateShl(curOffset, std::log2(64)), end),
1228                bodyBlock,
1229                exitBlock
1230        );
1231
1232        // Body
1233        iBuilder->SetInsertPoint(bodyBlock);
1234        Value* maskedOffset = iBuilder->CreateAnd(curOffset, outputMask);
1235
1236        Value* outputLowestBitValue = iBuilder->CreateSelect(
1237                iBuilder->CreateICmpULE(
1238                        iBuilder->CreateShl(curOffset, std::log2(64)),
1239                        start
1240                ),
1241                iBuilder->CreateShl(iBuilder->getSize(1), iBuilder->CreateAnd(start, iBuilder->getSize(64 - 1))),
1242                iBuilder->getSize(1)
1243        );
1244
1245        Value* hasNotReachEnd = iBuilder->CreateICmpULE(
1246                iBuilder->CreateShl(iBuilder->CreateAdd(curOffset, iBuilder->getSize(1)), std::log2(64)),
1247                end
1248        );
1249        Value* producedItemsCount = iBuilder->CreateSelect(
1250                hasNotReachEnd,
1251                iBuilder->CreateShl(iBuilder->CreateAdd(curOffset, iBuilder->getSize(1)), std::log2(64)),
1252                end
1253        );
1254
1255        Value* outputHighestBitValue = iBuilder->CreateSelect(
1256                hasNotReachEnd,
1257                iBuilder->getSize(0),
1258                iBuilder->CreateShl(
1259                        iBuilder->getSize(1),
1260                        iBuilder->CreateAnd(end, iBuilder->getSize(64 - 1))
1261                )
1262        );
1263
1264
1265        Value* bitMask = iBuilder->CreateSub(
1266                outputHighestBitValue,
1267                outputLowestBitValue
1268        );
1269
1270        if (!isOne) {
1271            bitMask = iBuilder->CreateNot(bitMask);
1272        }
1273
1274        Value* targetPtr = iBuilder->CreateGEP(outputBasePtr, maskedOffset);
1275        Value* oldValue = iBuilder->CreateLoad(targetPtr);
1276        Value* newValue = NULL;
1277        if (isOne) {
1278            newValue = iBuilder->CreateOr(oldValue, bitMask);
1279        } else {
1280            newValue = iBuilder->CreateAnd(oldValue, bitMask);
1281        }
1282        iBuilder->CreateStore(
1283                newValue,
1284                targetPtr
1285        );
1286        if (setProduced) {
1287            iBuilder->setProducedItemCount(bitstreamName, producedItemsCount);
1288        }
1289
1290        curOffset->addIncoming(iBuilder->CreateAdd(curOffset, iBuilder->getSize(1)), bodyBlock);
1291        iBuilder->CreateBr(conBlock);
1292
1293        // Exit
1294        iBuilder->SetInsertPoint(exitBlock);
1295        return exitBlock;
1296    }
1297}
Note: See TracBrowser for help on using the repository browser.