source: icGREP/icgrep-devel/icgrep/kernels/radix64.cpp @ 5261

Last change on this file since 5261 was 5261, checked in by cameron, 9 months ago

Move responsibility for ProducedItemCount into doSegment unless overridden

File size: 36.9 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "radix64.h"
6//#include "expand3_4.h"
7#include <kernels/kernel.h>
8#include <IR_Gen/idisa_builder.h>
9#include <llvm/IR/Module.h>
10#include <llvm/Support/raw_ostream.h>
11
12using namespace llvm;
13
14namespace kernel {
15
// This kernel produces an expanded input stream by duplicating every third byte.
// It is implemented using SIMD shufflevector operations.  With 16-byte registers,
// a single shufflevector operation produces 16 bytes of output data from
// 12 bytes of input data.   With 32-byte registers, 32 bytes of output data are
// produced from 24 bytes of input data.
//
// Using aligned SIMD loads, an inner loop processes three registers full of input
// data (i.e., three BytePacks) to produce four registers full of output.   This is
// a 3 step process.
// Step 1:  Load input_pack0, apply the shuffle operation to produce output_pack0.
//          At this point 3/4 of the data in input_pack0 has been processed.
// Step 2:  Load input_pack1, apply a shuffle operation to use the remaining
//          1/4 of input_pack0 and 1/2 of input_pack1 to produce output_pack1.
//          At this point 1/2 of the data in input_pack1 has been processed.
// Step 3:  Load input_pack2, apply a shuffle operation to use the remaining 1/2
//          of input_pack1 and 1/4 of input_pack2 to produce output_pack2.
//          Then apply a further shuffle operation to use the remaining 3/4 of
//          input_pack2 to produce output_pack3.
34
// The doSegment method processes input in terms of tripleBlocks, 3 blocks of input,
// producing 4 blocks of output.   Unless less than one tripleBlock remains, the
// doSegment method always processes an integral number of tripleBlocks as a logical
// segment.  Both input and output buffers are hence maintained at block boundaries,
// with the input data completely processed for each tripleBlock.
//
// The pipeline must guarantee that the doSegment method is called with
// a contiguous buffer for the full segment (number of blocks).
43
44   
45expand3_4Kernel::expand3_4Kernel(IDISA::IDISA_Builder * iBuilder) :
46    KernelBuilder(iBuilder, "expand3_4",
47                  {Binding{iBuilder->getStreamSetTy(1, 8), "sourceStream"}},
48                  {Binding{iBuilder->getStreamSetTy(1, 8), "expandedStream"}},
49                  {}, {}, {}) {
50        setDoBlockUpdatesProducedItemCountsAttribute(true);
51    }
52   
53   
54void expand3_4Kernel::generateDoSegmentMethod() const {
55    IDISA::IDISA_Builder::InsertPoint savePoint = iBuilder->saveIP();
56    Module * m = iBuilder->getModule();
57    Function * doSegmentFunction = m->getFunction(mKernelName + doSegment_suffix);
58    BasicBlock * expand2_3entry = BasicBlock::Create(iBuilder->getContext(), "expand2_3entry", doSegmentFunction, 0);
59    iBuilder->SetInsertPoint(expand2_3entry);
60    BasicBlock * expand_3_4_loop = BasicBlock::Create(iBuilder->getContext(), "expand_3_4_loop", doSegmentFunction, 0);
61    BasicBlock * expand3_4_loop_exit = BasicBlock::Create(iBuilder->getContext(), "expand3_4_loop_exit", doSegmentFunction, 0);
62    BasicBlock * finalStep1 = BasicBlock::Create(iBuilder->getContext(), "finalStep1", doSegmentFunction, 0);
63    BasicBlock * finalStep2 = BasicBlock::Create(iBuilder->getContext(), "finalStep2", doSegmentFunction, 0);
64    BasicBlock * step2load = BasicBlock::Create(iBuilder->getContext(), "step2load", doSegmentFunction, 0);
65    BasicBlock * step2store = BasicBlock::Create(iBuilder->getContext(), "step2store", doSegmentFunction, 0);
66    BasicBlock * finalStep3 = BasicBlock::Create(iBuilder->getContext(), "finalStep3", doSegmentFunction, 0);
67    BasicBlock * step3load = BasicBlock::Create(iBuilder->getContext(), "step3load", doSegmentFunction, 0);
68    BasicBlock * step3store = BasicBlock::Create(iBuilder->getContext(), "step3store", doSegmentFunction, 0);
69    BasicBlock * step3store2 = BasicBlock::Create(iBuilder->getContext(), "step3store2", doSegmentFunction, 0);
70    BasicBlock * itemsDone = BasicBlock::Create(iBuilder->getContext(), "itemsDone", doSegmentFunction, 0);
71    BasicBlock * setTermination = BasicBlock::Create(iBuilder->getContext(), "setTermination", doSegmentFunction, 0);
72    BasicBlock * expand3_4_exit = BasicBlock::Create(iBuilder->getContext(), "expand3_4_exit", doSegmentFunction, 0);
73    BasicBlock * finalExit = BasicBlock::Create(iBuilder->getContext(), "finalExit", doSegmentFunction, 0);
74   
75    // Determine the require shufflevector constants.
76    const unsigned PACK_SIZE = iBuilder->getStride()/8;
77   
78    // Construct a list of indexes in  the form
79    // 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, ...
80    unsigned sourceByteIndex = 0;
81    unsigned expand3_4_index[PACK_SIZE];
82    for (unsigned i = 0; i < PACK_SIZE; i++) {
83        expand3_4_index[i] = sourceByteIndex;
84        if (i % 4 != 2) sourceByteIndex++;
85    }
86    unsigned const expand3_4_offset[4] = {PACK_SIZE, 3*PACK_SIZE/4, PACK_SIZE/2, PACK_SIZE/4};
87    Value * expand_3_4_shuffle[4];
88    for (unsigned j = 0; j < 4; j++) {
89        std::vector<Constant *> Idxs;
90        for (unsigned i = 0; i < PACK_SIZE; i++) {
91            Idxs.push_back(ConstantInt::get(iBuilder->getInt32Ty(), expand3_4_offset[j] + expand3_4_index[i]));
92        }
93        expand_3_4_shuffle[j] = ConstantVector::get(Idxs);
94    }
95    Constant * Const3 = iBuilder->getSize(3);
96    Constant * Const4 = iBuilder->getSize(4);
97    Constant * stride = iBuilder->getSize(iBuilder->getStride());
98    Constant * packSize = iBuilder->getSize(PACK_SIZE);
99    Constant * loopItemCount = iBuilder->getSize(3 * PACK_SIZE); // 3 packs per loop.
100    UndefValue * undefPack = UndefValue::get(iBuilder->fwVectorType(8));
101   
102    const unsigned packAlign = iBuilder->getBitBlockWidth()/8;
103    Function::arg_iterator args = doSegmentFunction->arg_begin();
104    Value * self = &*(args++);
105    Value * blocksToDo = &*(args);
106    Value * streamStructPtr = getStreamSetStructPtr(self, "sourceStream");
107
108    LoadInst * producerPos = iBuilder->CreateAtomicLoadAcquire(mStreamSetInputBuffers[0]->getProducerPosPtr(streamStructPtr));
109    Value * processed = getProcessedItemCount(self, "sourceStream");
110    Value * itemsAvail = iBuilder->CreateSub(producerPos, processed);
111   
112    // Except for the final segment, we always process an integral number of triple blocks.
113    Value * tripleBlocksToDo = iBuilder->CreateMul(blocksToDo, Const3);
114    Constant * blockItems = iBuilder->getSize(iBuilder->getBitBlockWidth());
115    Value * tripleItemMax = iBuilder->CreateMul(tripleBlocksToDo, blockItems);
116
117    Value * lessThanFullSegment = iBuilder->CreateICmpULT(itemsAvail, tripleItemMax);
118    Value * tripleBlockItems = iBuilder->CreateSelect(lessThanFullSegment, itemsAvail, tripleItemMax);
119
120    Value * endSignal = iBuilder->CreateLoad(mStreamSetInputBuffers[0]->getEndOfInputPtr(streamStructPtr));
121    Value * inFinalSegment = iBuilder->CreateAnd(endSignal, lessThanFullSegment);
122    Value * itemsToDo = iBuilder->CreateSelect(inFinalSegment, itemsAvail, tripleBlockItems);
123
124//    iBuilder->CallPrintInt("itemsToDo", itemsToDo);
125
126    Value * blockNo = getScalarField(self, blockNoScalar);
127
128    // A block is made up of 8 packs.  Get the pointer to the first pack (changes the type of the pointer only).
129    Value * sourcePackPtr = getStream(self, "sourceStream", blockNo, iBuilder->getInt32(0), iBuilder->getInt32(0));
130
131    Value * outputGenerated = getProducedItemCount(self, "expandedStream"); // bytes previously generated to output
132    Value * outputBlockNo = iBuilder->CreateUDiv(outputGenerated, stride);
133    Value * outputPackPtr = getStream(self, "expandedStream", outputBlockNo, iBuilder->getInt32(0), iBuilder->getInt32(0));
134
135    Value * hasFullLoop = iBuilder->CreateICmpUGE(itemsToDo, loopItemCount);
136
137
138    iBuilder->CreateCondBr(hasFullLoop, expand_3_4_loop, expand3_4_loop_exit);
139    iBuilder->SetInsertPoint(expand_3_4_loop);
140    PHINode * loopInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
141    PHINode * loopOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
142    PHINode * loopItemsRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
143
144    loopInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
145    loopOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
146    loopItemsRemain->addIncoming(itemsToDo, expand2_3entry);
147
148    // Step 1 of the main loop.
149    Value * pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopInput_ptr, packAlign));
150    Value * expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
151    iBuilder->CreateAlignedStore(expand0, loopOutput_ptr, packAlign);
152    // Step 2 of the main loop.
153    Value * inPack1_ptr = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(1));
154    Value * outPack1_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(1));
155    Value * pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
156    Value * expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1, expand_3_4_shuffle[1]));
157    iBuilder->CreateAlignedStore(expand1, outPack1_ptr, packAlign);
158    // Step 3 of the main loop.
159    Value * inPack2_ptr = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(2));
160    Value * outPack2_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(2));
161    Value * pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
162    Value * expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1, pack2, expand_3_4_shuffle[2]));
163    iBuilder->CreateAlignedStore(expand2, outPack2_ptr, packAlign);
164    Value * outPack3_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(3));
165    Value * expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2, undefPack, expand_3_4_shuffle[3]));
166    iBuilder->CreateAlignedStore(expand3, outPack3_ptr, packAlign);
167
168    Value * loopNextInputPack = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(3));
169
170
171
172    Value * remainingItems = iBuilder->CreateSub(loopItemsRemain, loopItemCount);
173
174    Value * loopProcessed = iBuilder->CreateSub(itemsToDo, remainingItems);
175    loopProcessed = iBuilder->CreateMul(iBuilder->CreateUDiv(loopProcessed, iBuilder->getInt64(3)), iBuilder->getInt64(4));
176
177    Value * loopNextOutputPack;
178    loopNextOutputPack = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(4));
179
180    loopInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
181    loopOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
182    loopItemsRemain->addIncoming(remainingItems, expand_3_4_loop);
183
184    Value * continueLoop = iBuilder->CreateICmpUGE(remainingItems, loopItemCount);
185    iBuilder->CreateCondBr(continueLoop, expand_3_4_loop, expand3_4_loop_exit);
186
187    // Except for the final segment, the number of items remaining is now 0.
188    // For the final segment, less than loopItemCount items remain.
189    iBuilder->SetInsertPoint(expand3_4_loop_exit);
190    PHINode * loopExitInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
191    PHINode * loopExitOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
192    PHINode * loopExitItemsRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
193    loopExitInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
194    loopExitOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
195    loopExitItemsRemain->addIncoming(itemsToDo, expand2_3entry);
196    loopExitInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
197    loopExitOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
198    loopExitItemsRemain->addIncoming(remainingItems, expand_3_4_loop);
199    // There may be one or two remaining full packs and/or a partial pack.
200    //
201    // We have several cases depending on the number of reumaing items.  Let N = packSize
202    // (a) 0 remaining items: all done
203    // (b) 1..3N/4 remaining items:  do Step1 only, no items or pending data will remain
204    // (c) 3N/4+1 .. N remaining items:  do Step 1, do Step 2 for pending data from Step 1 only, there is no more input.
205    // (d) N+1 .. 6N/4 remaining items:  do Step 1 and Step 2, no items or pending data will remain.
206    // (e) 6N/4+1 .. 2N remaining items: do Steps 1 and 2, do Step 3 for pending data only, there is no more input.
207    // (f) 2N+1 .. 9N/4 remaining items: do Steps 1 and 2, do Step 3 up to the first write only.
208    // (g) 9N/4+1 .. 3N - 1 remaining items: do Steps 1, 2 and 3.
209    Value * condition_a = iBuilder->CreateICmpEQ(loopExitItemsRemain, ConstantInt::getNullValue(iBuilder->getSizeTy()));
210    iBuilder->CreateCondBr(condition_a, itemsDone, finalStep1);
211    // Final Step1 processing
212    iBuilder->SetInsertPoint(finalStep1);
213    pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopExitInput_ptr, packAlign));
214    expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
215    iBuilder->CreateAlignedStore(expand0, loopExitOutput_ptr, packAlign);
216    Value * condition_b = iBuilder->CreateICmpULE(loopExitItemsRemain, iBuilder->getSize(3 * PACK_SIZE/4));
217    iBuilder->CreateCondBr(condition_b, itemsDone, finalStep2);
218    // Final Step 2 processing
219    iBuilder->SetInsertPoint(finalStep2);
220    Value * condition_c = iBuilder->CreateICmpULE(loopExitItemsRemain, packSize);
221    iBuilder->CreateCondBr(condition_c, step2store, step2load);
222    iBuilder->SetInsertPoint(step2load);
223    inPack1_ptr = iBuilder->CreateGEP(loopExitInput_ptr, iBuilder->getInt32(1));
224    pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
225    iBuilder->CreateBr(step2store);
226    iBuilder->SetInsertPoint(step2store);
227    PHINode * pack1phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
228    pack1phi->addIncoming(undefPack, finalStep2);
229    pack1phi->addIncoming(pack1, step2load);
230    outPack1_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(1));
231    expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1phi, expand_3_4_shuffle[1]));
232    iBuilder->CreateAlignedStore(expand1, outPack1_ptr, packAlign);
233    Value * condition_d = iBuilder->CreateICmpULE(loopExitItemsRemain, iBuilder->getSize(6 * PACK_SIZE/4));
234    iBuilder->CreateCondBr(condition_d, itemsDone, finalStep3);
235    // Final Step 3
236    iBuilder->SetInsertPoint(finalStep3);
237    Value * condition_e = iBuilder->CreateICmpULE(loopExitItemsRemain, iBuilder->getSize(2 * PACK_SIZE));
238    iBuilder->CreateCondBr(condition_e, step3store, step3load);
239    iBuilder->SetInsertPoint(step3load);
240    inPack2_ptr = iBuilder->CreateGEP(loopExitInput_ptr, iBuilder->getInt32(2));
241    pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
242    iBuilder->CreateBr(step3store);
243    iBuilder->SetInsertPoint(step3store);
244    PHINode * pack2phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
245    pack2phi->addIncoming(undefPack, finalStep3);
246    pack2phi->addIncoming(pack2, step3load);
247    outPack2_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(2));
248    expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1phi, pack2phi, expand_3_4_shuffle[2]));
249    iBuilder->CreateAlignedStore(expand2, outPack2_ptr, packAlign);
250    Value * condition_f = iBuilder->CreateICmpULE(loopExitItemsRemain, iBuilder->getSize(9 * PACK_SIZE/4));
251    iBuilder->CreateCondBr(condition_f, itemsDone, step3store2);
252    iBuilder->SetInsertPoint(step3store2);
253    outPack3_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(3));
254    expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2phi, undefPack, expand_3_4_shuffle[3]));
255    iBuilder->CreateAlignedStore(expand3, outPack3_ptr, packAlign);
256    iBuilder->CreateBr(itemsDone);
257    //
258    iBuilder->SetInsertPoint(itemsDone);
259
260    processed = iBuilder->CreateAdd(processed, itemsToDo);
261    setProcessedItemCount(self, "sourceStream", processed);
262
263    setScalarField(self, blockNoScalar, iBuilder->CreateUDiv(processed, stride));
264    // We have produced 4 output bytes for every 3 input bytes.  If the number of input
265    // bytes is not a multiple of 3, then we have one more output byte for each excess
266    // input byte.
267    Value * totalProduced = iBuilder->CreateAdd(iBuilder->CreateMul(iBuilder->CreateUDiv(processed, Const3), Const4), iBuilder->CreateURem(processed, Const3));
268    setProducedItemCount(self, "expandedStream", totalProduced);
269    Value * ssStructPtr = getStreamSetStructPtr(self, "expandedStream");
270
271    Value * producerPosPtr = mStreamSetOutputBuffers[0]->getProducerPosPtr(ssStructPtr);
272
273    iBuilder->CreateAtomicStoreRelease(totalProduced, producerPosPtr);
274   
275    iBuilder->CreateCondBr(inFinalSegment, setTermination, expand3_4_exit);
276    iBuilder->SetInsertPoint(setTermination);
277#ifndef NDEBUG
278//    iBuilder->CallPrintInt(mKernelName + " termination in segment ", segmentNo);
279#endif
280    setTerminationSignal(self);
281    mStreamSetOutputBuffers[0]->setEndOfInput(ssStructPtr);
282    iBuilder->CreateBr(expand3_4_exit);
283    iBuilder->SetInsertPoint(expand3_4_exit);
284    // Must be the last action, for synchronization.
285    iBuilder->CreateBr(finalExit);
286   
287    iBuilder->SetInsertPoint(finalExit);
288    iBuilder->CreateRetVoid();
289    iBuilder->restoreIP(savePoint);
290}
291
292
293// The doBlock method is deprecated.   But in case it is used, just call doSegment with
294// 1 as the number of blocks to do.
295void expand3_4Kernel::generateDoBlockMethod() const {
296    auto savePoint = iBuilder->saveIP();
297    Module * m = iBuilder->getModule();
298    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
299    Function * doSegmentFunction = m->getFunction(mKernelName + doSegment_suffix);
300    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
301    Value * self = getParameter(doBlockFunction, "self");
302    iBuilder->CreateCall(doSegmentFunction, {self, iBuilder->getSize(1)});
303    iBuilder->CreateRetVoid();
304    iBuilder->restoreIP(savePoint);
305}
306
307// Radix 64 determination, converting 3 bytes to 4 6-bit values.
308//
309//  00000000|zyxwvuts|rqpmnlkj|hgfedcba    Original
310//           zy                            bits to move 6 positions right
311//             xwvuts                      bits to move 8 positions left
312//                    rqpm                 bits to move 4 positions right
313//                        nlkj             bits to move 10 positions left
314//                             hqfedc      bits to move 2 positions right
315//                                   ba    bits to move 12 positions left
316//    xwvuts|  nlkjzy|  barqpm|  hgfedc    Target
317void radix64Kernel::generateDoBlockLogic(Value * self, Value * blockNo) const {
318
319    Value * step_right_6 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00C00000));
320    Value * step_left_8 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x003F0000));
321    Value * step_right_4 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x0000F000));
322    Value * step_left_10 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000F00));
323    Value * step_right_2 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x000000FC));
324    Value * step_left_12 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000003));
325   
326    for (unsigned i = 0; i < 8; i++) {
327        Value * expandedStream = getStream(self, "expandedStream", blockNo, iBuilder->getInt32(0), iBuilder->getInt32(i));
328        Value * bytepack = iBuilder->CreateBlockAlignedLoad(expandedStream);
329
330        Value * right_6_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_6), 6);
331        Value * right_4_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_4), 4);
332        Value * right_2_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_2), 2);
333        Value * left_8_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_8), 8);
334        Value * left_10_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_10), 10);
335        Value * left_12_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_12), 12);
336
337        Value * mid = right_6_result;
338        mid = iBuilder->simd_or(mid, right_4_result);
339        mid = iBuilder->simd_or(mid, right_2_result);
340        mid = iBuilder->simd_or(mid, left_8_result);
341        mid = iBuilder->simd_or(mid, left_10_result);
342        mid = iBuilder->simd_or(mid, left_12_result);
343        Value * radix64pack = iBuilder->bitCast(mid);
344
345        Value * radix64stream = getStream(self, "radix64stream",blockNo, iBuilder->getInt32(0), iBuilder->getInt32(i));
346        iBuilder->CreateBlockAlignedStore(radix64pack, radix64stream);
347    }
348    Value * produced = getProducedItemCount(self, "radix64stream");
349    produced = iBuilder->CreateAdd(produced, iBuilder->getSize(iBuilder->getStride()));
350    setProducedItemCount(self, "radix64stream", produced);   
351}
352
353void radix64Kernel::generateFinalBlockMethod() const {
354    auto savePoint = iBuilder->saveIP();
355    Module * m = iBuilder->getModule();
356    Function * finalBlockFunction = m->getFunction(mKernelName + finalBlock_suffix);
357    BasicBlock * radix64_fb_entry = BasicBlock::Create(iBuilder->getContext(), "radix64_fb_entry", finalBlockFunction, 0);
358    iBuilder->SetInsertPoint(radix64_fb_entry);
359    BasicBlock * radix64_loop = BasicBlock::Create(iBuilder->getContext(), "radix64_loop", finalBlockFunction, 0);
360    BasicBlock * loopExit = BasicBlock::Create(iBuilder->getContext(), "loopExit", finalBlockFunction, 0);
361    BasicBlock * handleRemainFirstByte = BasicBlock::Create(iBuilder->getContext(), "handleRemainFirstByte", finalBlockFunction, 0);
362    BasicBlock * handleRemainSecondByte = BasicBlock::Create(iBuilder->getContext(), "handleRemainSecondByte", finalBlockFunction, 0);
363    BasicBlock * handleNoRemainSecondByte = BasicBlock::Create(iBuilder->getContext(), "handleNoRemainSecondByte", finalBlockFunction, 0);
364    BasicBlock * fbExit = BasicBlock::Create(iBuilder->getContext(), "fbExit", finalBlockFunction, 0);
365    // Final Block arguments: self, remaining.
366    Function::arg_iterator args = finalBlockFunction->arg_begin();
367    Value * self = &*(args++);
368    Value * remainingBytes = &*(args++);
369    Value * remainMod4 = iBuilder->CreateAnd(remainingBytes, iBuilder->getSize(3));
370
371    const unsigned PACK_SIZE = iBuilder->getStride()/8;
372    Constant * packSize = iBuilder->getSize(PACK_SIZE);
373    Value * blockNo = getScalarField(self, blockNoScalar);
374
375    Value * step_right_6 = iBuilder->simd_fill(32, iBuilder->getInt32(0x00C00000));
376    Value * step_left_8 = iBuilder->simd_fill(32, iBuilder->getInt32(0x003F0000));
377    Value * step_right_4 = iBuilder->simd_fill(32, iBuilder->getInt32(0x0000F000));
378    Value * step_left_10 = iBuilder->simd_fill(32, iBuilder->getInt32(0x00000F00));
379    Value * step_right_2 = iBuilder->simd_fill(32, iBuilder->getInt32(0x000000FC));
380    Value * step_left_12 = iBuilder->simd_fill(32, iBuilder->getInt32(0x00000003));
381
382
383    // Enter the loop only if there is at least one byte remaining to process.
384    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, radix64_loop);
385
386    iBuilder->SetInsertPoint(radix64_loop);
387    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
388    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
389    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), radix64_fb_entry);
390    loopRemain->addIncoming(remainingBytes, radix64_fb_entry);
391
392    Value * expandedStreamLoopPtr = getStream(self, "expandedStream", blockNo, iBuilder->getInt32(0), idx);
393    Value * bytepack = iBuilder->CreateBlockAlignedLoad(expandedStreamLoopPtr);
394    Value * right_6_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_6), 6);
395    Value * right_4_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_4), 4);
396    Value * right_2_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_2), 2);
397    Value * left_8_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_8), 8);
398    Value * left_10_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_10), 10);
399    Value * left_12_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_12), 12);
400
401    Value * mid = right_6_result;
402    mid = iBuilder->simd_or(mid, right_4_result);
403    mid = iBuilder->simd_or(mid, right_2_result);
404    mid = iBuilder->simd_or(mid, left_8_result);
405    mid = iBuilder->simd_or(mid, left_10_result);
406    mid = iBuilder->simd_or(mid, left_12_result);
407    Value * radix64pack = iBuilder->bitCast(mid);
408
409    Value * radix64streamPtr = getStream(self, "radix64stream", blockNo, iBuilder->getInt32(0), idx);
410    iBuilder->CreateBlockAlignedStore(radix64pack, radix64streamPtr);
411
412    Value* nextIdx = iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1));
413    idx->addIncoming(nextIdx, radix64_loop);
414    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
415    loopRemain->addIncoming(remainAfterLoop, radix64_loop);
416
417    Value* continueLoop = iBuilder->CreateICmpULT(remainAfterLoop, packSize);
418    iBuilder->CreateCondBr(continueLoop, radix64_loop, loopExit);
419
420    iBuilder->SetInsertPoint(loopExit);
421    // All base64 data has been computed, but we may need to set one or two '=' padding bytes.
422    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, iBuilder->getSize(0)), fbExit, handleRemainFirstByte);
423    iBuilder->SetInsertPoint(handleRemainFirstByte);
424    // At least one padding byte required.
425    Value * i8input_ptr = getStreamView(iBuilder->getInt8PtrTy(), self, "expandedStream", blockNo, iBuilder->getInt32(0));
426    Value * remainOutputStart = iBuilder->CreateSub(remainingBytes, remainMod4);
427
428    Value * firstRemainByte = iBuilder->CreateLoad(i8input_ptr);
429
430    Value * first_move_right_2_mask = ConstantInt::get(iBuilder->getInt8Ty(), 0xFC);
431    Value * first_output_byte = iBuilder->CreateLShr(iBuilder->CreateAnd(firstRemainByte, first_move_right_2_mask), 2);
432
433    Value * first_move_left_4_mask = ConstantInt::get(iBuilder->getInt8Ty(), 0x03);
434    Value * first_move_left_4_byte = iBuilder->CreateShl(iBuilder->CreateAnd(firstRemainByte, first_move_left_4_mask), 4);
435
436
437    Value * i8OutPtr0 = getStreamView(iBuilder->getInt8PtrTy(), self, "radix64stream", blockNo, remainOutputStart);
438
439    iBuilder->CreateStore(first_output_byte, i8OutPtr0);
440
441    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, iBuilder->getSize(1)), handleNoRemainSecondByte, handleRemainSecondByte);
442    iBuilder->SetInsertPoint(handleRemainSecondByte);
443
444    Value * secondRemainByte = iBuilder->CreateLoad(iBuilder->CreateGEP(i8input_ptr, iBuilder->getInt32(1)));
445    Value * second_move_right_4_mask = ConstantInt::get(iBuilder->getInt8Ty(), 0xF0);
446    Value * second_move_right_4_byte = iBuilder->CreateLShr(iBuilder->CreateAnd(secondRemainByte, second_move_right_4_mask), 4);
447    Value * second_output_byte = iBuilder->CreateOr(first_move_left_4_byte, second_move_right_4_byte);
448
449    Value * i8OutPtr1 = getStreamView(iBuilder->getInt8PtrTy(), self, "radix64stream", blockNo, iBuilder->CreateAdd(remainOutputStart, iBuilder->getInt64(1)));
450
451    iBuilder->CreateStore(second_output_byte, i8OutPtr1);
452
453    Value * second_move_left_2_mask = ConstantInt::get(iBuilder->getInt8Ty(), 0x0F);
454    Value * second_move_left_2_byte = iBuilder->CreateShl(iBuilder->CreateAnd(secondRemainByte, second_move_left_2_mask), 2);
455
456    Value * i8OutPtr2 = getStreamView(iBuilder->getInt8PtrTy(), self, "radix64stream", blockNo, iBuilder->CreateAdd(remainOutputStart, iBuilder->getInt64(2)));
457
458    iBuilder->CreateStore(second_move_left_2_byte, i8OutPtr2);
459    iBuilder->CreateBr(fbExit);
460
461    iBuilder->SetInsertPoint(handleNoRemainSecondByte);
462
463    i8OutPtr1 = getStreamView(iBuilder->getInt8PtrTy(), self, "radix64stream", blockNo, iBuilder->CreateAdd(remainOutputStart, iBuilder->getInt64(1)));
464
465    iBuilder->CreateStore(first_move_left_4_byte, i8OutPtr1);
466    iBuilder->CreateBr(fbExit);
467
468    iBuilder->SetInsertPoint(fbExit);
469    Value * outputNumberAdd = iBuilder->CreateSelect(iBuilder->CreateICmpEQ(remainMod4, iBuilder->getSize(0)), iBuilder->getSize(0), iBuilder->getSize(1));
470    Value * produced = iBuilder->CreateAdd(getProducedItemCount(self, "radix64stream"), iBuilder->CreateAdd(remainingBytes, outputNumberAdd));
471    setProducedItemCount(self, "radix64stream", produced);
472
473    iBuilder->CreateRetVoid();
474    iBuilder->restoreIP(savePoint);
475}
476
477   
478radix64Kernel::radix64Kernel(IDISA::IDISA_Builder * iBuilder) :
479    KernelBuilder(iBuilder, "radix64",
480                  {Binding{iBuilder->getStreamSetTy(1, 8), "expandedStream"}},
481                  {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
482                  {}, {}, {}) {
483        setDoBlockUpdatesProducedItemCountsAttribute(true);
484}
485   
486void radix64Kernel::generateDoBlockMethod() const {
487    auto savePoint = iBuilder->saveIP();
488
489    Function * doBlockFunction = iBuilder->getModule()->getFunction(mKernelName + doBlock_suffix);
490
491    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
492
493    Value * self = getParameter(doBlockFunction, "self");
494    Value * blockNo = getScalarField(self, blockNoScalar);
495
496    generateDoBlockLogic(self, blockNo);
497
498    iBuilder->CreateRetVoid();
499    iBuilder->restoreIP(savePoint);
500}
501
502base64Kernel::base64Kernel(IDISA::IDISA_Builder * iBuilder) :
503    KernelBuilder(iBuilder, "base64",
504                  {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
505                  {Binding{iBuilder->getStreamSetTy(1, 8), "base64stream"}},
506                  {}, {}, {}) {
507        setDoBlockUpdatesProducedItemCountsAttribute(true);
508    }
509   
510
511void base64Kernel::generateDoBlockLogic(Value * self, Value * blockNo) const {       
512    for (unsigned i = 0; i < 8; i++) {
513        Value * radix64stream_ptr = getStream(self, "radix64stream", blockNo, iBuilder->getInt32(0), iBuilder->getInt32(i));
514        Value * bytepack = iBuilder->CreateBlockAlignedLoad(radix64stream_ptr);
515        Value * mask_gt_25 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(25)));
516        Value * mask_gt_51 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(51)));
517        Value * mask_eq_62 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(62)));
518        Value * mask_eq_63 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(63)));
519        // Strategy:
520        // 1. add ord('A') = 65 to all radix64 values, this sets the correct values for entries 0 to 25.
521        // 2. add ord('a') - ord('A') - (26 - 0) = 6 to all values >25, this sets the correct values for entries 0 to 51
522        // 3. subtract ord('a') - ord('0') + (52 - 26) = 75 to all values > 51, this sets the correct values for entries 0 to 61
523        // 4. subtract ord('0') - ord('+') + (62 - 52) = 15 for all values = 62
524        // 4. subtract ord('0') - ord('/') + (63 - 62) = 2 for all values = 63
525        Value * t0_25 = iBuilder->simd_add(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8('A')));
526        Value * t0_51 = iBuilder->simd_add(8, t0_25, iBuilder->simd_and(mask_gt_25, iBuilder->simd_fill(8, iBuilder->getInt8(6))));
527        Value * t0_61 = iBuilder->simd_sub(8, t0_51, iBuilder->simd_and(mask_gt_51, iBuilder->simd_fill(8, iBuilder->getInt8(75))));
528        Value * t0_62 = iBuilder->simd_sub(8, t0_61, iBuilder->simd_and(mask_eq_62, iBuilder->simd_fill(8, iBuilder->getInt8(15))));
529        Value * base64pack = iBuilder->simd_sub(8, t0_62, iBuilder->simd_and(mask_eq_63, iBuilder->simd_fill(8, iBuilder->getInt8(2))));
530        Value * base64stream_ptr = getStream(self, "base64stream", blockNo, iBuilder->getInt32(0), iBuilder->getInt32(i));
531        iBuilder->CreateBlockAlignedStore(iBuilder->bitCast(base64pack), base64stream_ptr);
532    }
533    Value * produced = getProducedItemCount(self, "base64stream");
534    produced = iBuilder->CreateAdd(produced, iBuilder->getSize(iBuilder->getStride()));
535    setProducedItemCount(self, "base64stream", produced);
536}
537
538
539// Special processing for the base 64 format.   The output must always contain a multiple
540// of 4 bytes.   When the number of radix 64 values is not a multiple of 4
541// number of radix 64 values
542void base64Kernel::generateFinalBlockMethod() const {
543    auto savePoint = iBuilder->saveIP();
544    Module * m = iBuilder->getModule();
545    Function * finalBlockFunction = m->getFunction(mKernelName + finalBlock_suffix);
546    BasicBlock * base64_fb_entry = BasicBlock::Create(iBuilder->getContext(), "base64_fb_entry", finalBlockFunction, 0);
547    iBuilder->SetInsertPoint(base64_fb_entry);
548    BasicBlock * base64_loop = BasicBlock::Create(iBuilder->getContext(), "base64_loop", finalBlockFunction, 0);
549    BasicBlock * loopExit = BasicBlock::Create(iBuilder->getContext(), "loopExit", finalBlockFunction, 0);
550    BasicBlock * doPadding = BasicBlock::Create(iBuilder->getContext(), "doPadding", finalBlockFunction, 0);
551    BasicBlock * doPadding2 = BasicBlock::Create(iBuilder->getContext(), "doPadding2", finalBlockFunction, 0);
552    BasicBlock * fbExit = BasicBlock::Create(iBuilder->getContext(), "fbExit", finalBlockFunction, 0);
553    // Final Block arguments: self, remaining.
554    Function::arg_iterator args = finalBlockFunction->arg_begin();
555    Value * self = &*(args++);
556    Value * remainingBytes = &*(args++);
557    Value * remainMod4 = iBuilder->CreateAnd(remainingBytes, iBuilder->getSize(3));
558    Value * padBytes = iBuilder->CreateSub(iBuilder->getSize(4), remainMod4);
559    padBytes = iBuilder->CreateAnd(padBytes, iBuilder->getSize(3));
560
561    Constant * packSize = iBuilder->getSize(iBuilder->getStride() / 8);
562    Value * blockNo = getScalarField(self, blockNoScalar);
563
564    // Enter the loop only if there is at least one byte remaining to process.
565    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, base64_loop);
566   
567    iBuilder->SetInsertPoint(base64_loop);
568    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
569    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
570    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), base64_fb_entry);
571    loopRemain->addIncoming(remainingBytes, base64_fb_entry);
572    Value * radix64streamPtr = getStream(self, "radix64stream", blockNo, iBuilder->getInt32(0), idx);
573    Value * bytepack = iBuilder->CreateBlockAlignedLoad(radix64streamPtr);
574    Value * mask_gt_25 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(25)));
575    Value * mask_gt_51 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(51)));
576    Value * mask_eq_62 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(62)));
577    Value * mask_eq_63 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(63)));
578    Value * t0_25 = iBuilder->simd_add(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8('A')));
579    Value * t0_51 = iBuilder->simd_add(8, t0_25, iBuilder->simd_and(mask_gt_25, iBuilder->simd_fill(8, iBuilder->getInt8(6))));
580    Value * t0_61 = iBuilder->simd_sub(8, t0_51, iBuilder->simd_and(mask_gt_51, iBuilder->simd_fill(8, iBuilder->getInt8(75))));
581    Value * t0_62 = iBuilder->simd_sub(8, t0_61, iBuilder->simd_and(mask_eq_62, iBuilder->simd_fill(8, iBuilder->getInt8(15))));
582    Value * base64pack = iBuilder->simd_sub(8, t0_62, iBuilder->simd_and(mask_eq_63, iBuilder->simd_fill(8, iBuilder->getInt8(2))));
583    Value * base64streamPtr = getStream(self, "base64stream", blockNo, iBuilder->getInt32(0), idx);
584    iBuilder->CreateBlockAlignedStore(iBuilder->bitCast(base64pack), base64streamPtr);
585    idx->addIncoming(iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1)), base64_loop);
586    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
587    loopRemain->addIncoming(remainAfterLoop, base64_loop);
588
589    Value* continueLoop = iBuilder->CreateICmpULT(remainAfterLoop, packSize);
590    iBuilder->CreateCondBr(continueLoop, base64_loop, loopExit);
591
592    iBuilder->SetInsertPoint(loopExit);
593    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(padBytes, iBuilder->getSize(0)), fbExit, doPadding);
594
595    iBuilder->SetInsertPoint(doPadding);
596    Value * i8output_ptr = getStreamView(iBuilder->getInt8PtrTy(), self, "base64stream", blockNo, iBuilder->getInt32(0));
597    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8output_ptr, remainingBytes));
598    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, iBuilder->getSize(3)), fbExit, doPadding2);
599    iBuilder->SetInsertPoint(doPadding2);
600    Value * finalPadPos = iBuilder->CreateAdd(remainingBytes, iBuilder->getSize(1));
601    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8output_ptr, finalPadPos));
602    iBuilder->CreateBr(fbExit);
603    iBuilder->SetInsertPoint(fbExit);
604    Value * produced = iBuilder->CreateAdd(getProducedItemCount(self, "base64stream"), iBuilder->CreateAdd(remainingBytes, padBytes));
605    setProducedItemCount(self, "base64stream", produced);
606    iBuilder->CreateRetVoid();
607    iBuilder->restoreIP(savePoint);
608}
609
610void base64Kernel::generateDoBlockMethod() const {
611    auto savePoint = iBuilder->saveIP();
612
613    Function * doBlockFunction = iBuilder->getModule()->getFunction(mKernelName + doBlock_suffix);
614
615    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
616
617    Value * self = getParameter(doBlockFunction, "self");
618    Value * blockNo = getScalarField(self, blockNoScalar);
619
620    generateDoBlockLogic(self, blockNo);
621
622    iBuilder->CreateRetVoid();
623    iBuilder->restoreIP(savePoint);
624}
625
626}
Note: See TracBrowser for help on using the repository browser.