source: icGREP/icgrep-devel/icgrep/kernels/expand3_4.cpp @ 5216

Last change on this file since 5216 was 5216, checked in by cameron, 3 years ago

Radix 64 expansion kernel

File size: 17.9 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "expand3_4.h"
6#include <kernels/kernel.h>
7#include <IDISA/idisa_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10namespace kernel {
11using namespace llvm;
12
13// This kernel produces an expanded input stream by duplicating every third byte.
14// It is implemented using SIMD shufflevector operations.  With 16-byte registers,
15// a single shufflevector operation produces 16 bytes of output data from the
16// 12 bytes of input data.   With 32-byte registers, 32 bytes of output data are
17// produced from 24 bytes of input data.
18//
19// Using aligned SIMD loads, an inner loop processes three registers full of input
20// data (i.e., three BytePacks) to produce four registers full of output.   This is
21// a 3 step process.
22// Step 1:  Load input_pack0, apply the shuffle operation to produce output_pack0.
23//          At this point 3/4 of the data in input_pack0 has been processed.
24// Step 2:  Load input_pack1, apply a shuffle operation to use the remaining
25//          1/4 of input_pack0 and 1/2 of input_pack1 to produce output_pack1.
26//          At this point 1/2 of the data in input_pack1 has been processed.
27// Step 3:  Load input_pack2, apply a shuffle operation to use the remaining 1/2
28//          of input_pack1 and 1/4 of input_pack2 to produce output_pack2.
//          Then apply a further shuffle operation to use the remaining 3/4 of
30//          input_pack2 to produce output_pack3.
31
32// The doSegment method processes input in terms of tripleBlocks, 3 blocks of input,
33// producing 4 blocks of output.   Unless less than one tripleBlock remains, the
34// doSegment method always processes an integral number of tripleBlocks as a logical
35// segment.  Both input and output buffers are hence maintained at block boundaries,
36// with the input data completely processed for each tripleBlock.
37//
// The pipeline must guarantee that the doSegment method is called with
// a continuous buffer for the full segment (number of blocks).
40
41void expand3_4Kernel::generateDoSegmentMethod() {
42    IDISA::IDISA_Builder::InsertPoint savePoint = iBuilder->saveIP();
43    Module * m = iBuilder->getModule();
44    Function * doSegmentFunction = m->getFunction(mKernelName + doSegment_suffix);
45    BasicBlock * expand2_3entry = BasicBlock::Create(iBuilder->getContext(), "expand2_3entry", doSegmentFunction, 0);
46    iBuilder->SetInsertPoint(expand2_3entry);
47    BasicBlock * expand_3_4_loop = BasicBlock::Create(iBuilder->getContext(), "expand_3_4_loop", doSegmentFunction, 0);
48    BasicBlock * expand3_4_loop_exit = BasicBlock::Create(iBuilder->getContext(), "expand3_4_loop_exit", doSegmentFunction, 0);
49    BasicBlock * finalStep1 = BasicBlock::Create(iBuilder->getContext(), "finalStep1", doSegmentFunction, 0);
50    BasicBlock * finalStep2 = BasicBlock::Create(iBuilder->getContext(), "finalStep2", doSegmentFunction, 0);
51    BasicBlock * step2load = BasicBlock::Create(iBuilder->getContext(), "step2load", doSegmentFunction, 0);
52    BasicBlock * step2store = BasicBlock::Create(iBuilder->getContext(), "step2store", doSegmentFunction, 0);
53    BasicBlock * finalStep3 = BasicBlock::Create(iBuilder->getContext(), "finalStep3", doSegmentFunction, 0);
54    BasicBlock * step3load = BasicBlock::Create(iBuilder->getContext(), "step3load", doSegmentFunction, 0);
55    BasicBlock * step3store = BasicBlock::Create(iBuilder->getContext(), "step3store", doSegmentFunction, 0);
56    BasicBlock * step3store2 = BasicBlock::Create(iBuilder->getContext(), "step3store2", doSegmentFunction, 0);
57    BasicBlock * itemsDone = BasicBlock::Create(iBuilder->getContext(), "itemsDone", doSegmentFunction, 0);
58    BasicBlock * setTermination = BasicBlock::Create(iBuilder->getContext(), "setTermination", doSegmentFunction, 0);
59    BasicBlock * expand3_4_exit = BasicBlock::Create(iBuilder->getContext(), "expand3_4_exit", doSegmentFunction, 0);
60    BasicBlock * finalExit = BasicBlock::Create(iBuilder->getContext(), "finalExit", doSegmentFunction, 0);
61   
62    // Determine the require shufflevector constants.
63    const unsigned PACK_SIZE = iBuilder->getStride()/8;
64   
65    // Construct a list of indexes in  the form
66    // 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, ...
67    unsigned sourceByteIndex = 0;
68    unsigned expand3_4_index[PACK_SIZE];
69    for (unsigned i = 0; i < PACK_SIZE; i++) {
70        expand3_4_index[i] = sourceByteIndex;
71        if (i % 4 != 2) sourceByteIndex++;
72    }
73    unsigned const expand3_4_offset[4] = {PACK_SIZE, 3*PACK_SIZE/4, PACK_SIZE/2, PACK_SIZE/4};
74    Value * expand_3_4_shuffle[4];
75    for (unsigned j = 0; j < 4; j++) {
76        std::vector<Constant *> Idxs;
77        for (unsigned i = 0; i < PACK_SIZE; i++) {
78            Idxs.push_back(ConstantInt::get(iBuilder->getInt32Ty(), expand3_4_offset[j] + expand3_4_index[i]));
79        }
80        expand_3_4_shuffle[j] = ConstantVector::get(Idxs);
81    }   
82    Constant * Const3 = ConstantInt::get(iBuilder->getSizeTy(), 3);
83    Constant * Const4 = ConstantInt::get(iBuilder->getSizeTy(), 4);
84    Constant * tripleBlockSize = ConstantInt::get(iBuilder->getSizeTy(), iBuilder->getBitBlockWidth() * 3);
85    Constant * stride = ConstantInt::get(iBuilder->getSizeTy(), iBuilder->getStride());
86    Constant * packSize = ConstantInt::get(iBuilder->getSizeTy(), PACK_SIZE);
87    Constant * loopItemCount = ConstantInt::get(iBuilder->getSizeTy(), 3 * PACK_SIZE); // 3 packs per loop.
88    UndefValue * undefPack = UndefValue::get(iBuilder->fwVectorType(parabix::i8));
89   
90    const unsigned packAlign = iBuilder->getBitBlockWidth()/8;
91    Function::arg_iterator args = doSegmentFunction->arg_begin();
92    Value * self = &*(args++);
93    Value * blocksToDo = &*(args);
94    //iBuilder->CallPrintInt("blocksToDo", blocksToDo);
95    Value * segmentNo = getLogicalSegmentNo(self);
96    Value * streamStructPtr = getStreamSetStructPtr(self, "sourceStream");
97    //iBuilder->CallPrintInt("streamStructPtr", iBuilder->CreatePtrToInt(streamStructPtr, iBuilder->getInt64Ty()));
98   
99    LoadInst * producerPos = iBuilder->CreateAtomicLoadAcquire(mStreamSetInputBuffers[0]->getProducerPosPtr(streamStructPtr));
100    //iBuilder->CallPrintInt("producerPos", producerPos);
101    Value * processed = getProcessedItemCount(self);
102    Value * itemsAvail = iBuilder->CreateSub(producerPos, processed);
103   
104    // Except for the final segment, we always process an integral number of triple blocks.
105    Value * tripleBlocksToDo = iBuilder->CreateUDiv(blocksToDo, ConstantInt::get(iBuilder->getSizeTy(), 3));
106    Value * tripleBlocksAvail = iBuilder->CreateUDiv(itemsAvail, tripleBlockSize);
107    Value * lessThanFullSegment = iBuilder->CreateICmpULT(tripleBlocksAvail, tripleBlocksToDo);
108    Value * tripleBlockItems = iBuilder->CreateMul(iBuilder->CreateSelect(lessThanFullSegment, tripleBlocksAvail, tripleBlocksToDo), tripleBlockSize);
109    Value * endSignal = iBuilder->CreateLoad(mStreamSetInputBuffers[0]->hasEndOfInputPtr(streamStructPtr));
110    Value * inFinalSegment = iBuilder->CreateAnd(endSignal, lessThanFullSegment);
111    Value * itemsToDo = iBuilder->CreateSelect(inFinalSegment, itemsAvail, tripleBlockItems);
112    //iBuilder->CallPrintInt("itemsToDo", itemsToDo);
113
114    Value * blockNo = getScalarField(self, blockNoScalar);
115    Value * sourceBlockPtr = getStreamSetBlockPtr(self, "sourceStream", blockNo);
116   
117    Value * outputGenerated = getProducedItemCount(self); // bytes previously generated to output
118    Value * outputBlockNo = iBuilder->CreateUDiv(outputGenerated, stride);
119    Value * outputBlockPtr = getStreamSetBlockPtr(self, "expandedStream", outputBlockNo);
120   
121    // A block is made up of 8 packs.  Get the pointer to the first pack (changes the type of the pointer only).
122    Value * sourcePackPtr = iBuilder->CreateGEP(sourceBlockPtr, {iBuilder->getInt32(0), iBuilder->getInt32(0), iBuilder->getInt32(0)});
123    Value * outputPackPtr = iBuilder->CreateGEP(outputBlockPtr, {iBuilder->getInt32(0), iBuilder->getInt32(0), iBuilder->getInt32(0)});
124    Value * hasFullLoop = iBuilder->CreateICmpUGE(itemsToDo, loopItemCount);
125    iBuilder->CreateCondBr(hasFullLoop, expand_3_4_loop, expand3_4_loop_exit);
126    iBuilder->SetInsertPoint(expand_3_4_loop);
127    PHINode * loopInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
128    PHINode * loopOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
129    PHINode * loopItemsRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
130    loopInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
131    loopOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
132    loopItemsRemain->addIncoming(itemsToDo, expand2_3entry);
133    // Step 1 of the main loop.
134    Value * pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopInput_ptr, packAlign));
135    Value * expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
136    iBuilder->CreateAlignedStore(expand0, loopOutput_ptr, packAlign);
137    // Step 2 of the main loop.
138    Value * inPack1_ptr = iBuilder->CreateGEP(loopInput_ptr, {iBuilder->getInt32(1)});
139    Value * outPack1_ptr = iBuilder->CreateGEP(loopOutput_ptr, {iBuilder->getInt32(1)});
140    Value * pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
141    Value * expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1, expand_3_4_shuffle[1]));
142    iBuilder->CreateAlignedStore(expand1, outPack1_ptr, packAlign);
143    // Step 3 of the main loop.
144    Value * inPack2_ptr = iBuilder->CreateGEP(loopInput_ptr, {iBuilder->getInt32(2)});
145    Value * outPack2_ptr = iBuilder->CreateGEP(loopOutput_ptr, {iBuilder->getInt32(2)});
146    Value * pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
147    Value * expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1, pack2, expand_3_4_shuffle[2]));
148    iBuilder->CreateAlignedStore(expand2, outPack2_ptr, packAlign);
149    Value * outPack3_ptr = iBuilder->CreateGEP(loopOutput_ptr, {iBuilder->getInt32(3)});
150    Value * expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2, undefPack, expand_3_4_shuffle[3]));
151    iBuilder->CreateAlignedStore(expand3, outPack3_ptr, packAlign);
152   
153    Value * loopNextInputPack = iBuilder->CreateGEP(loopInput_ptr, {iBuilder->getInt32(3)});
154    Value * loopNextOutputPack = iBuilder->CreateGEP(loopOutput_ptr, {iBuilder->getInt32(4)});
155    Value * remainingItems = iBuilder->CreateSub(loopItemsRemain, loopItemCount);
156    loopInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
157    loopOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
158    loopItemsRemain->addIncoming(remainingItems, expand_3_4_loop);
159    //iBuilder->CallPrintInt("loopItemsRemain", remainingItems);
160    Value * continueLoop = iBuilder->CreateICmpUGE(remainingItems, loopItemCount);
161    iBuilder->CreateCondBr(continueLoop, expand_3_4_loop, expand3_4_loop_exit);
162
163    // Except for the final segment, the number of items remaining is now 0.
164    // For the final segment, less than loopItemCount items remain.
165    iBuilder->SetInsertPoint(expand3_4_loop_exit);
166    PHINode * loopExitInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
167    PHINode * loopExitOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
168    PHINode * loopExitItemsRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
169    loopExitInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
170    loopExitOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
171    loopExitItemsRemain->addIncoming(itemsToDo, expand2_3entry);
172    loopExitInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
173    loopExitOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
174    loopExitItemsRemain->addIncoming(remainingItems, expand_3_4_loop);
175
176    // There may be one or two remaining full packs and/or a partial pack. 
177    //
178    // We have several cases depending on the number of reumaing items.  Let N = packSize
179    // (a) 0 remaining items: all done
180    // (b) 1..3N/4 remaining items:  do Step1 only, no items or pending data will remain
181    // (c) 3N/4+1 .. N remaining items:  do Step 1, do Step 2 for pending data from Step 1 only, there is no more input.
182    // (d) N+1 .. 6N/4 remaining items:  do Step 1 and Step 2, no items or pending data will remain.
183    // (e) 6N/4+1 .. 2N remaining items: do Steps 1 and 2, do Step 3 for pending data only, there is no more input.
184    // (f) 2N+1 .. 9N/4 remaining items: do Steps 1 and 2, do Step 3 up to the first write only.
185    // (g) 9N/4+1 .. 3N - 1 remaining items: do Steps 1, 2 and 3.
186    Value * condition_a = iBuilder->CreateICmpEQ(loopExitItemsRemain, ConstantInt::getNullValue(iBuilder->getSizeTy()));
187    iBuilder->CreateCondBr(condition_a, itemsDone, finalStep1);
188    // Final Step1 processing
189    iBuilder->SetInsertPoint(finalStep1);
190    pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopExitInput_ptr, packAlign));
191    expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
192    iBuilder->CreateAlignedStore(expand0, loopExitOutput_ptr, packAlign);
193    Value * condition_b = iBuilder->CreateICmpULE(loopExitItemsRemain, ConstantInt::get(iBuilder->getSizeTy(), 3 * PACK_SIZE/4));
194    iBuilder->CreateCondBr(condition_b, itemsDone, finalStep2);
195    // Final Step 2 processing
196    iBuilder->SetInsertPoint(finalStep2);
197    Value * condition_c = iBuilder->CreateICmpULE(loopExitItemsRemain, packSize);
198    iBuilder->CreateCondBr(condition_c, step2store, step2load);
199    iBuilder->SetInsertPoint(step2load);
200    inPack1_ptr = iBuilder->CreateGEP(loopExitInput_ptr, {iBuilder->getInt32(1)});
201    pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
202    iBuilder->CreateBr(step2store);
203    iBuilder->SetInsertPoint(step2store);
204    PHINode * pack1phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
205    pack1phi->addIncoming(undefPack, finalStep2);
206    pack1phi->addIncoming(pack1, step2load);
207    outPack1_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, {iBuilder->getInt32(1)});
208    expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1phi, expand_3_4_shuffle[1]));
209    iBuilder->CreateAlignedStore(expand1, outPack1_ptr, packAlign);
210    Value * condition_d = iBuilder->CreateICmpULE(loopExitItemsRemain, ConstantInt::get(iBuilder->getSizeTy(), 6 * PACK_SIZE/4));
211    iBuilder->CreateCondBr(condition_d, itemsDone, finalStep3);
212    // Final Step 3
213    iBuilder->SetInsertPoint(finalStep3);
214    Value * condition_e = iBuilder->CreateICmpULE(loopExitItemsRemain, ConstantInt::get(iBuilder->getSizeTy(), 2 * PACK_SIZE));
215    iBuilder->CreateCondBr(condition_e, step3store, step3load);
216    iBuilder->SetInsertPoint(step3load);
217    inPack2_ptr = iBuilder->CreateGEP(loopExitInput_ptr, {iBuilder->getInt32(2)});
218    pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
219    iBuilder->CreateBr(step3store);
220    iBuilder->SetInsertPoint(step3store);
221    PHINode * pack2phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
222    pack2phi->addIncoming(undefPack, finalStep3);
223    pack2phi->addIncoming(pack2, step3load);
224    outPack2_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, {iBuilder->getInt32(2)});
225    expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1phi, pack2phi, expand_3_4_shuffle[2]));
226    iBuilder->CreateAlignedStore(expand2, outPack2_ptr, packAlign);
227    Value * condition_f = iBuilder->CreateICmpULE(loopExitItemsRemain, ConstantInt::get(iBuilder->getSizeTy(), 9 * PACK_SIZE/4));
228    iBuilder->CreateCondBr(condition_f, itemsDone, step3store2);
229    iBuilder->SetInsertPoint(step3store2);
230    outPack3_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, {iBuilder->getInt32(3)});
231    expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2phi, undefPack, expand_3_4_shuffle[3]));
232    iBuilder->CreateAlignedStore(expand3, outPack3_ptr, packAlign);
233    iBuilder->CreateBr(itemsDone);
234    //
235    iBuilder->SetInsertPoint(itemsDone);
236   
237    processed = iBuilder->CreateAdd(processed, itemsToDo);
238    setProcessedItemCount(self, processed);
239    setScalarField(self, blockNoScalar, iBuilder->CreateUDiv(processed, stride));
240    // We have produced 4 output bytes for every 3 input bytes.  If the number of input
241    // bytes is not a multiple of 3, then we have one more output byte for each excess
242    // input byte.
243    Value * totalProduced = iBuilder->CreateAdd(iBuilder->CreateMul(iBuilder->CreateUDiv(processed, Const3), Const4), iBuilder->CreateURem(processed, Const3));
244    setProducedItemCount(self, totalProduced);
245    Value * ssStructPtr = getStreamSetStructPtr(self, "expandedStream");
246   
247    Value * producerPosPtr = mStreamSetOutputBuffers[0]->getProducerPosPtr(ssStructPtr);
248    iBuilder->CreateAtomicStoreRelease(totalProduced, producerPosPtr);
249   
250    iBuilder->CreateCondBr(inFinalSegment, setTermination, expand3_4_exit);
251    iBuilder->SetInsertPoint(setTermination);
252#ifndef NDEBUG
253    iBuilder->CallPrintInt(mKernelName + " termination in segment ", segmentNo);
254#endif
255    setTerminationSignal(self);
256    mStreamSetOutputBuffers[0]->setEndOfInput(ssStructPtr);
257    iBuilder->CreateBr(expand3_4_exit);
258    iBuilder->SetInsertPoint(expand3_4_exit);
259    // Must be the last action, for synchronization.
260    setLogicalSegmentNo(self, iBuilder->CreateAdd(segmentNo, ConstantInt::get(iBuilder->getSizeTy(), 1)));
261    iBuilder->CreateBr(finalExit);
262   
263    iBuilder->SetInsertPoint(finalExit);
264    iBuilder->CreateRetVoid();
265    iBuilder->restoreIP(savePoint);
266}
267
268
269// The doBlock method is deprecated.   But in case it is used, just call doSegment with
270// 1 as the number of blocks to do.
271void expand3_4Kernel::generateDoBlockMethod() {
272    auto savePoint = iBuilder->saveIP();
273    Module * m = iBuilder->getModule();
274    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
275    Function * doSegmentFunction = m->getFunction(mKernelName + doSegment_suffix);
276    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
277    Value * self = getParameter(doBlockFunction, "self");
278    iBuilder->CreateCall(doSegmentFunction, {self, ConstantInt::get(iBuilder->getSizeTy(), 1)});
279    iBuilder->CreateRetVoid();
280    iBuilder->restoreIP(savePoint);
281}
282
283}
Note: See TracBrowser for help on using the repository browser.