source: icGREP/icgrep-devel/icgrep/kernels/radix64.cpp @ 5232

Last change on this file since 5232 was 5232, checked in by xwa163, 2 years ago

Add based64 related kernels

File size: 37.0 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "radix64.h"
6//#include "expand3_4.h"
7#include <kernels/kernel.h>
8#include <IDISA/idisa_builder.h>
9#include <llvm/Support/raw_ostream.h>
10
11namespace kernel {
12using namespace llvm;
13
14// This kernel produces an expanded input stream by duplicating every third byte.
15// It is implemented using SIMD shufflevector operations.  With 16-byte registers,
16// a single shufflevector operation produces 16 bytes of output data from the
17// 12 bytes of input data.   With 32-byte registers, 32 bytes of output data are
18// produced from 24 bytes of input data.
19//
20// Using aligned SIMD loads, an inner loop processes three registers full of input
21// data (i.e., three BytePacks) to produce four registers full of output.   This is
22// a 3 step process.
23// Step 1:  Load input_pack0, apply the shuffle operation to produce output_pack0.
24//          At this point 3/4 of the data in input_pack0 has been processed.
25// Step 2:  Load input_pack1, apply a shuffle operation to use the remaining
26//          1/4 of input_pack0 and 1/2 of input_pack1 to produce output_pack1.
27//          At this point 1/2 of the data in input_pack1 has been processed.
28// Step 3:  Load input_pack2, apply a shuffle operation to use the remaining 1/2
29//          of input_pack1 and 1/4 of input_pack2 to produce output_pack2.
30//          Then apply a further shuffle operation to use the remaining 3/4 of
31//          input_pack2 to produce output_pack3.
32
33// The doSegment method processes input in terms of tripleBlocks, 3 blocks of input,
34// producing 4 blocks of output.   Unless less than one tripleBlock remains, the
35// doSegment method always processes an integral number of tripleBlocks as a logical
36// segment.  Both input and output buffers are hence maintained at block boundaries,
37// with the input data completely processed for each tripleBlock.
38//
39// The pipeline must guarantee that the doSegment method is called with
40// a contiguous buffer for the full segment (number of blocks).
41
42void expand3_4Kernel::generateDoSegmentMethod() {
43    IDISA::IDISA_Builder::InsertPoint savePoint = iBuilder->saveIP();
44    Module * m = iBuilder->getModule();
45    Function * doSegmentFunction = m->getFunction(mKernelName + doSegment_suffix);
46    BasicBlock * expand2_3entry = BasicBlock::Create(iBuilder->getContext(), "expand2_3entry", doSegmentFunction, 0);
47    iBuilder->SetInsertPoint(expand2_3entry);
48    BasicBlock * expand_3_4_loop = BasicBlock::Create(iBuilder->getContext(), "expand_3_4_loop", doSegmentFunction, 0);
49    BasicBlock * expand3_4_loop_exit = BasicBlock::Create(iBuilder->getContext(), "expand3_4_loop_exit", doSegmentFunction, 0);
50    BasicBlock * finalStep1 = BasicBlock::Create(iBuilder->getContext(), "finalStep1", doSegmentFunction, 0);
51    BasicBlock * finalStep2 = BasicBlock::Create(iBuilder->getContext(), "finalStep2", doSegmentFunction, 0);
52    BasicBlock * step2load = BasicBlock::Create(iBuilder->getContext(), "step2load", doSegmentFunction, 0);
53    BasicBlock * step2store = BasicBlock::Create(iBuilder->getContext(), "step2store", doSegmentFunction, 0);
54    BasicBlock * finalStep3 = BasicBlock::Create(iBuilder->getContext(), "finalStep3", doSegmentFunction, 0);
55    BasicBlock * step3load = BasicBlock::Create(iBuilder->getContext(), "step3load", doSegmentFunction, 0);
56    BasicBlock * step3store = BasicBlock::Create(iBuilder->getContext(), "step3store", doSegmentFunction, 0);
57    BasicBlock * step3store2 = BasicBlock::Create(iBuilder->getContext(), "step3store2", doSegmentFunction, 0);
58    BasicBlock * itemsDone = BasicBlock::Create(iBuilder->getContext(), "itemsDone", doSegmentFunction, 0);
59    BasicBlock * setTermination = BasicBlock::Create(iBuilder->getContext(), "setTermination", doSegmentFunction, 0);
60    BasicBlock * expand3_4_exit = BasicBlock::Create(iBuilder->getContext(), "expand3_4_exit", doSegmentFunction, 0);
61    BasicBlock * finalExit = BasicBlock::Create(iBuilder->getContext(), "finalExit", doSegmentFunction, 0);
62   
63    // Determine the require shufflevector constants.
64    const unsigned PACK_SIZE = iBuilder->getStride()/8;
65   
66    // Construct a list of indexes in  the form
67    // 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, ...
68    unsigned sourceByteIndex = 0;
69    unsigned expand3_4_index[PACK_SIZE];
70    for (unsigned i = 0; i < PACK_SIZE; i++) {
71        expand3_4_index[i] = sourceByteIndex;
72        if (i % 4 != 2) sourceByteIndex++;
73    }
74    unsigned const expand3_4_offset[4] = {PACK_SIZE, 3*PACK_SIZE/4, PACK_SIZE/2, PACK_SIZE/4};
75    Value * expand_3_4_shuffle[4];
76    for (unsigned j = 0; j < 4; j++) {
77        std::vector<Constant *> Idxs;
78        for (unsigned i = 0; i < PACK_SIZE; i++) {
79            Idxs.push_back(ConstantInt::get(iBuilder->getInt32Ty(), expand3_4_offset[j] + expand3_4_index[i]));
80        }
81        expand_3_4_shuffle[j] = ConstantVector::get(Idxs);
82    }
83    Constant * Const3 = ConstantInt::get(iBuilder->getSizeTy(), 3);
84    Constant * Const4 = ConstantInt::get(iBuilder->getSizeTy(), 4);
85    Constant * stride = ConstantInt::get(iBuilder->getSizeTy(), iBuilder->getStride());
86    Constant * packSize = ConstantInt::get(iBuilder->getSizeTy(), PACK_SIZE);
87    Constant * loopItemCount = ConstantInt::get(iBuilder->getSizeTy(), 3 * PACK_SIZE); // 3 packs per loop.
88    UndefValue * undefPack = UndefValue::get(iBuilder->fwVectorType(parabix::i8));
89   
90    const unsigned packAlign = iBuilder->getBitBlockWidth()/8;
91    Function::arg_iterator args = doSegmentFunction->arg_begin();
92    Value * self = &*(args++);
93    Value * blocksToDo = &*(args);
94    Value * streamStructPtr = getStreamSetStructPtr(self, "sourceStream");
95
96    LoadInst * producerPos = iBuilder->CreateAtomicLoadAcquire(mStreamSetInputBuffers[0]->getProducerPosPtr(streamStructPtr));
97    Value * processed = getProcessedItemCount(self);
98    Value * itemsAvail = iBuilder->CreateSub(producerPos, processed);
99   
100    // Except for the final segment, we always process an integral number of triple blocks.
101    Value * tripleBlocksToDo = iBuilder->CreateMul(blocksToDo, Const3);
102    Constant * blockItems = ConstantInt::get(iBuilder->getSizeTy(), iBuilder->getBitBlockWidth());
103    Value * tripleItemMax = iBuilder->CreateMul(tripleBlocksToDo, blockItems);
104
105    Value * lessThanFullSegment = iBuilder->CreateICmpULT(itemsAvail, tripleItemMax);
106    Value * tripleBlockItems = iBuilder->CreateSelect(lessThanFullSegment, itemsAvail, tripleItemMax);
107
108    Value * endSignal = iBuilder->CreateLoad(mStreamSetInputBuffers[0]->getEndOfInputPtr(streamStructPtr));
109    Value * inFinalSegment = iBuilder->CreateAnd(endSignal, lessThanFullSegment);
110    Value * itemsToDo = iBuilder->CreateSelect(inFinalSegment, itemsAvail, tripleBlockItems);
111
112//    iBuilder->CallPrintInt("itemsToDo", itemsToDo);
113
114    Value * blockNo = getScalarField(self, blockNoScalar);
115
116    Value * sourceBlockPtr = getStreamSetBlockPtr(self, "sourceStream", blockNo);
117
118    Value * outputGenerated = getProducedItemCount(self); // bytes previously generated to output
119    Value * outputBlockNo = iBuilder->CreateUDiv(outputGenerated, stride);
120
121    Value * outputBlockPtr = getStreamSetBlockPtr(self, "expandedStream", outputBlockNo);
122
123    // A block is made up of 8 packs.  Get the pointer to the first pack (changes the type of the pointer only).
124    Value * sourcePackPtr = iBuilder->CreateGEP(sourceBlockPtr, {iBuilder->getInt32(0), iBuilder->getInt32(0), iBuilder->getInt32(0)});
125    Value * outputPackPtr = iBuilder->CreateGEP(outputBlockPtr, {iBuilder->getInt32(0), iBuilder->getInt32(0), iBuilder->getInt32(0)});
126    Value * hasFullLoop = iBuilder->CreateICmpUGE(itemsToDo, loopItemCount);
127
128
129    iBuilder->CreateCondBr(hasFullLoop, expand_3_4_loop, expand3_4_loop_exit);
130    iBuilder->SetInsertPoint(expand_3_4_loop);
131    PHINode * loopInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
132    PHINode * loopOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
133    PHINode * loopItemsRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
134
135    loopInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
136    loopOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
137    loopItemsRemain->addIncoming(itemsToDo, expand2_3entry);
138
139    // Step 1 of the main loop.
140    Value * pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopInput_ptr, packAlign));
141    Value * expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
142    iBuilder->CreateAlignedStore(expand0, loopOutput_ptr, packAlign);
143    // Step 2 of the main loop.
144    Value * inPack1_ptr = iBuilder->CreateGEP(loopInput_ptr, {iBuilder->getInt32(1)});
145    Value * outPack1_ptr = iBuilder->CreateGEP(loopOutput_ptr, {iBuilder->getInt32(1)});
146    Value * pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
147    Value * expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1, expand_3_4_shuffle[1]));
148    iBuilder->CreateAlignedStore(expand1, outPack1_ptr, packAlign);
149    // Step 3 of the main loop.
150    Value * inPack2_ptr = iBuilder->CreateGEP(loopInput_ptr, {iBuilder->getInt32(2)});
151    Value * outPack2_ptr = iBuilder->CreateGEP(loopOutput_ptr, {iBuilder->getInt32(2)});
152    Value * pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
153    Value * expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1, pack2, expand_3_4_shuffle[2]));
154    iBuilder->CreateAlignedStore(expand2, outPack2_ptr, packAlign);
155    Value * outPack3_ptr = iBuilder->CreateGEP(loopOutput_ptr, {iBuilder->getInt32(3)});
156    Value * expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2, undefPack, expand_3_4_shuffle[3]));
157    iBuilder->CreateAlignedStore(expand3, outPack3_ptr, packAlign);
158
159    Value * loopNextInputPack = iBuilder->CreateGEP(loopInput_ptr, {iBuilder->getInt32(3)});
160
161
162
163    Value * remainingItems = iBuilder->CreateSub(loopItemsRemain, loopItemCount);
164
165    Value * loopProcessed = iBuilder->CreateSub(itemsToDo, remainingItems);
166    loopProcessed = iBuilder->CreateMul(iBuilder->CreateUDiv(loopProcessed, iBuilder->getInt64(3)), iBuilder->getInt64(4));
167
168    Value * loopNextOutputPack;
169    loopNextOutputPack = iBuilder->CreateGEP(loopOutput_ptr, {iBuilder->getInt32(4)});
170
171    loopInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
172    loopOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
173    loopItemsRemain->addIncoming(remainingItems, expand_3_4_loop);
174
175    Value * continueLoop = iBuilder->CreateICmpUGE(remainingItems, loopItemCount);
176    iBuilder->CreateCondBr(continueLoop, expand_3_4_loop, expand3_4_loop_exit);
177
178    // Except for the final segment, the number of items remaining is now 0.
179    // For the final segment, less than loopItemCount items remain.
180    iBuilder->SetInsertPoint(expand3_4_loop_exit);
181    PHINode * loopExitInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
182    PHINode * loopExitOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
183    PHINode * loopExitItemsRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
184    loopExitInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
185    loopExitOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
186    loopExitItemsRemain->addIncoming(itemsToDo, expand2_3entry);
187    loopExitInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
188    loopExitOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
189    loopExitItemsRemain->addIncoming(remainingItems, expand_3_4_loop);
190    // There may be one or two remaining full packs and/or a partial pack.
191    //
192    // We have several cases depending on the number of reumaing items.  Let N = packSize
193    // (a) 0 remaining items: all done
194    // (b) 1..3N/4 remaining items:  do Step1 only, no items or pending data will remain
195    // (c) 3N/4+1 .. N remaining items:  do Step 1, do Step 2 for pending data from Step 1 only, there is no more input.
196    // (d) N+1 .. 6N/4 remaining items:  do Step 1 and Step 2, no items or pending data will remain.
197    // (e) 6N/4+1 .. 2N remaining items: do Steps 1 and 2, do Step 3 for pending data only, there is no more input.
198    // (f) 2N+1 .. 9N/4 remaining items: do Steps 1 and 2, do Step 3 up to the first write only.
199    // (g) 9N/4+1 .. 3N - 1 remaining items: do Steps 1, 2 and 3.
200    Value * condition_a = iBuilder->CreateICmpEQ(loopExitItemsRemain, ConstantInt::getNullValue(iBuilder->getSizeTy()));
201    iBuilder->CreateCondBr(condition_a, itemsDone, finalStep1);
202    // Final Step1 processing
203    iBuilder->SetInsertPoint(finalStep1);
204    pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopExitInput_ptr, packAlign));
205    expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
206    iBuilder->CreateAlignedStore(expand0, loopExitOutput_ptr, packAlign);
207    Value * condition_b = iBuilder->CreateICmpULE(loopExitItemsRemain, ConstantInt::get(iBuilder->getSizeTy(), 3 * PACK_SIZE/4));
208    iBuilder->CreateCondBr(condition_b, itemsDone, finalStep2);
209    // Final Step 2 processing
210    iBuilder->SetInsertPoint(finalStep2);
211    Value * condition_c = iBuilder->CreateICmpULE(loopExitItemsRemain, packSize);
212    iBuilder->CreateCondBr(condition_c, step2store, step2load);
213    iBuilder->SetInsertPoint(step2load);
214    inPack1_ptr = iBuilder->CreateGEP(loopExitInput_ptr, {iBuilder->getInt32(1)});
215    pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
216    iBuilder->CreateBr(step2store);
217    iBuilder->SetInsertPoint(step2store);
218    PHINode * pack1phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
219    pack1phi->addIncoming(undefPack, finalStep2);
220    pack1phi->addIncoming(pack1, step2load);
221    outPack1_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, {iBuilder->getInt32(1)});
222    expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1phi, expand_3_4_shuffle[1]));
223    iBuilder->CreateAlignedStore(expand1, outPack1_ptr, packAlign);
224    Value * condition_d = iBuilder->CreateICmpULE(loopExitItemsRemain, ConstantInt::get(iBuilder->getSizeTy(), 6 * PACK_SIZE/4));
225    iBuilder->CreateCondBr(condition_d, itemsDone, finalStep3);
226    // Final Step 3
227    iBuilder->SetInsertPoint(finalStep3);
228    Value * condition_e = iBuilder->CreateICmpULE(loopExitItemsRemain, ConstantInt::get(iBuilder->getSizeTy(), 2 * PACK_SIZE));
229    iBuilder->CreateCondBr(condition_e, step3store, step3load);
230    iBuilder->SetInsertPoint(step3load);
231    inPack2_ptr = iBuilder->CreateGEP(loopExitInput_ptr, {iBuilder->getInt32(2)});
232    pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
233    iBuilder->CreateBr(step3store);
234    iBuilder->SetInsertPoint(step3store);
235    PHINode * pack2phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
236    pack2phi->addIncoming(undefPack, finalStep3);
237    pack2phi->addIncoming(pack2, step3load);
238    outPack2_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, {iBuilder->getInt32(2)});
239    expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1phi, pack2phi, expand_3_4_shuffle[2]));
240    iBuilder->CreateAlignedStore(expand2, outPack2_ptr, packAlign);
241    Value * condition_f = iBuilder->CreateICmpULE(loopExitItemsRemain, ConstantInt::get(iBuilder->getSizeTy(), 9 * PACK_SIZE/4));
242    iBuilder->CreateCondBr(condition_f, itemsDone, step3store2);
243    iBuilder->SetInsertPoint(step3store2);
244    outPack3_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, {iBuilder->getInt32(3)});
245    expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2phi, undefPack, expand_3_4_shuffle[3]));
246    iBuilder->CreateAlignedStore(expand3, outPack3_ptr, packAlign);
247    iBuilder->CreateBr(itemsDone);
248    //
249    iBuilder->SetInsertPoint(itemsDone);
250
251    processed = iBuilder->CreateAdd(processed, itemsToDo);
252    setProcessedItemCount(self, processed);
253
254    setScalarField(self, blockNoScalar, iBuilder->CreateUDiv(processed, stride));
255    // We have produced 4 output bytes for every 3 input bytes.  If the number of input
256    // bytes is not a multiple of 3, then we have one more output byte for each excess
257    // input byte.
258    Value * totalProduced = iBuilder->CreateAdd(iBuilder->CreateMul(iBuilder->CreateUDiv(processed, Const3), Const4), iBuilder->CreateURem(processed, Const3));
259    setProducedItemCount(self, totalProduced);
260    Value * ssStructPtr = getStreamSetStructPtr(self, "expandedStream");
261
262    Value * producerPosPtr = mStreamSetOutputBuffers[0]->getProducerPosPtr(ssStructPtr);
263
264    iBuilder->CreateAtomicStoreRelease(totalProduced, producerPosPtr);
265   
266    iBuilder->CreateCondBr(inFinalSegment, setTermination, expand3_4_exit);
267    iBuilder->SetInsertPoint(setTermination);
268#ifndef NDEBUG
269//    iBuilder->CallPrintInt(mKernelName + " termination in segment ", segmentNo);
270#endif
271    setTerminationSignal(self);
272    mStreamSetOutputBuffers[0]->setEndOfInput(ssStructPtr);
273    iBuilder->CreateBr(expand3_4_exit);
274    iBuilder->SetInsertPoint(expand3_4_exit);
275    // Must be the last action, for synchronization.
276    iBuilder->CreateBr(finalExit);
277   
278    iBuilder->SetInsertPoint(finalExit);
279    iBuilder->CreateRetVoid();
280    iBuilder->restoreIP(savePoint);
281}
282
283
284// The doBlock method is deprecated.   But in case it is used, just call doSegment with
285// 1 as the number of blocks to do.
286void expand3_4Kernel::generateDoBlockMethod() {
287    auto savePoint = iBuilder->saveIP();
288    Module * m = iBuilder->getModule();
289    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
290    Function * doSegmentFunction = m->getFunction(mKernelName + doSegment_suffix);
291    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
292    Value * self = getParameter(doBlockFunction, "self");
293    iBuilder->CreateCall(doSegmentFunction, {self, ConstantInt::get(iBuilder->getSizeTy(), 1)});
294    iBuilder->CreateRetVoid();
295    iBuilder->restoreIP(savePoint);
296}
297
298// Radix 64 determination, converting 3 bytes to 4 6-bit values.
299//
300//  00000000|zyxwvuts|rqpmnlkj|hgfedcba    Original
301//           zy                            bits to move 6 positions right
302//             xwvuts                      bits to move 8 positions left
303//                    rqpm                 bits to move 4 positions right
304//                        nlkj             bits to move 10 positions left
305//                             hqfedc      bits to move 2 positions right
306//                                   ba    bits to move 12 positions left
307//    xwvuts|  nlkjzy|  barqpm|  hgfedc    Target
308void radix64Kernel::generateDoBlockLogic(Value * self, Value * blockNo) {
309    Value * expandedStream = getStreamSetBlockPtr(self, "expandedStream", blockNo);
310    Value * radix64stream = getStreamSetBlockPtr(self, "radix64stream",blockNo);
311
312    Value * step_right_6 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00C00000));
313    Value * step_left_8 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x003F0000));
314    Value * step_right_4 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x0000F000));
315    Value * step_left_10 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000F00));
316    Value * step_right_2 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x000000FC));
317    Value * step_left_12 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000003));
318   
319    for (unsigned i = 0; i < 8; i++) {
320        Value * bytepack = iBuilder->CreateBlockAlignedLoad(expandedStream, {iBuilder->getInt32(0), iBuilder->getInt32(0), iBuilder->getInt32(i)});
321
322        Value * right_6_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_6), 6);
323        Value * right_4_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_4), 4);
324        Value * right_2_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_2), 2);
325        Value * left_8_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_8), 8);
326        Value * left_10_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_10), 10);
327        Value * left_12_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_12), 12);
328
329        Value * mid = right_6_result;
330        mid = iBuilder->simd_or(mid, right_4_result);
331        mid = iBuilder->simd_or(mid, right_2_result);
332        mid = iBuilder->simd_or(mid, left_8_result);
333        mid = iBuilder->simd_or(mid, left_10_result);
334        mid = iBuilder->simd_or(mid, left_12_result);
335        Value * radix64pack = iBuilder->bitCast(mid);
336
337        iBuilder->CreateBlockAlignedStore(radix64pack, radix64stream, {iBuilder->getInt32(0), iBuilder->getInt32(0), iBuilder->getInt32(i)});
338    }
339    Value * produced = getProducedItemCount(self);
340    produced = iBuilder->CreateAdd(produced, ConstantInt::get(iBuilder->getSizeTy(), iBuilder->getStride()));
341    setProducedItemCount(self, produced);   
342}
343
344void radix64Kernel::generateFinalBlockMethod() {
345    auto savePoint = iBuilder->saveIP();
346    Module * m = iBuilder->getModule();
347    Function * finalBlockFunction = m->getFunction(mKernelName + finalBlock_suffix);
348    BasicBlock * radix64_fb_entry = BasicBlock::Create(iBuilder->getContext(), "radix64_fb_entry", finalBlockFunction, 0);
349    iBuilder->SetInsertPoint(radix64_fb_entry);
350    BasicBlock * radix64_loop = BasicBlock::Create(iBuilder->getContext(), "radix64_loop", finalBlockFunction, 0);
351    BasicBlock * loopExit = BasicBlock::Create(iBuilder->getContext(), "loopExit", finalBlockFunction, 0);
352    BasicBlock * handleRemainFirstByte = BasicBlock::Create(iBuilder->getContext(), "handleRemainFirstByte", finalBlockFunction, 0);
353    BasicBlock * handleRemainSecondByte = BasicBlock::Create(iBuilder->getContext(), "handleRemainSecondByte", finalBlockFunction, 0);
354    BasicBlock * handleNoRemainSecondByte = BasicBlock::Create(iBuilder->getContext(), "handleNoRemainSecondByte", finalBlockFunction, 0);
355    BasicBlock * fbExit = BasicBlock::Create(iBuilder->getContext(), "fbExit", finalBlockFunction, 0);
356    // Final Block arguments: self, remaining.
357    Function::arg_iterator args = finalBlockFunction->arg_begin();
358    Value * self = &*(args++);
359    Value * remainingBytes = &*(args++);
360    Value * remainMod4 = iBuilder->CreateAnd(remainingBytes, ConstantInt::get(iBuilder->getSizeTy(), 3));
361
362    const unsigned PACK_SIZE = iBuilder->getStride()/8;
363    Constant * packSize = ConstantInt::get(iBuilder->getSizeTy(), PACK_SIZE);
364    Value * blockNo = getScalarField(self, blockNoScalar);
365    Value * expandedstream_ptr = getStreamSetBlockPtr(self, "expandedStream", blockNo);
366    Value * radix64stream_ptr = getStreamSetBlockPtr(self, "radix64stream", blockNo);
367    Type * i8_t = iBuilder->getInt8Ty();
368
369    Value * step_right_6 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00C00000));
370    Value * step_left_8 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x003F0000));
371    Value * step_right_4 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x0000F000));
372    Value * step_left_10 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000F00));
373    Value * step_right_2 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x000000FC));
374    Value * step_left_12 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000003));
375
376
377    // Enter the loop only if there is at least one byte remaining to process.
378    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, ConstantInt::get(iBuilder->getSizeTy(), 0)), fbExit, radix64_loop);
379
380    iBuilder->SetInsertPoint(radix64_loop);
381    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
382    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
383    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), radix64_fb_entry);
384    loopRemain->addIncoming(remainingBytes, radix64_fb_entry);
385
386    Value * bytepack = iBuilder->CreateBlockAlignedLoad(expandedstream_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(0), idx});
387    Value * right_6_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_6), 6);
388    Value * right_4_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_4), 4);
389    Value * right_2_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_2), 2);
390    Value * left_8_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_8), 8);
391    Value * left_10_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_10), 10);
392    Value * left_12_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_12), 12);
393
394    Value * mid = right_6_result;
395    mid = iBuilder->simd_or(mid, right_4_result);
396    mid = iBuilder->simd_or(mid, right_2_result);
397    mid = iBuilder->simd_or(mid, left_8_result);
398    mid = iBuilder->simd_or(mid, left_10_result);
399    mid = iBuilder->simd_or(mid, left_12_result);
400    Value * radix64pack = iBuilder->bitCast(mid);
401
402    iBuilder->CreateBlockAlignedStore(radix64pack, radix64stream_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(0), idx});
403
404    Value* nextIdx = iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1));
405    idx->addIncoming(nextIdx, radix64_loop);
406    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
407    loopRemain->addIncoming(remainAfterLoop, radix64_loop);
408
409    Value* continueLoop = iBuilder->CreateICmpULT(remainAfterLoop, packSize);
410    iBuilder->CreateCondBr(continueLoop, radix64_loop, loopExit);
411
412    iBuilder->SetInsertPoint(loopExit);
413    // All base64 data has been computed, but we may need to set one or two '=' padding bytes.
414    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, ConstantInt::get(iBuilder->getSizeTy(), 0)), fbExit, handleRemainFirstByte);
415    iBuilder->SetInsertPoint(handleRemainFirstByte);
416    // At least one padding byte required.
417
418    Value * i8output_ptr = iBuilder->CreatePointerCast(radix64stream_ptr, iBuilder->getInt8PtrTy());
419    Value * i8input_ptr = iBuilder->CreatePointerCast(expandedstream_ptr, iBuilder->getInt8PtrTy());
420    Value * remainOutputStart = iBuilder->CreateSub(remainingBytes, remainMod4);
421
422    Value * firstRemainByte = iBuilder->CreateLoad(iBuilder->CreateGEP(i8input_ptr, {iBuilder->getInt32(0)}));
423
424    Value * first_move_right_2_mask = ConstantInt::get(iBuilder->getInt8Ty(), 0xFC);
425    Value * first_output_byte = iBuilder->CreateLShr(iBuilder->CreateAnd(firstRemainByte, first_move_right_2_mask), 2);
426
427    Value * first_move_left_4_mask = ConstantInt::get(iBuilder->getInt8Ty(), 0x03);
428    Value * first_move_left_4_byte = iBuilder->CreateShl(iBuilder->CreateAnd(firstRemainByte, first_move_left_4_mask), 4);
429
430    iBuilder->CreateStore(first_output_byte, iBuilder->CreateGEP(i8output_ptr, {iBuilder->CreateAdd(remainOutputStart, iBuilder->getInt64(0))}));
431
432
433    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, ConstantInt::get(iBuilder->getSizeTy(), 1)), handleNoRemainSecondByte, handleRemainSecondByte);
434    iBuilder->SetInsertPoint(handleRemainSecondByte);
435
436    Value * secondRemainByte = iBuilder->CreateLoad(iBuilder->CreateGEP(i8input_ptr, {iBuilder->getInt32(1)}));
437    Value * second_move_right_4_mask = ConstantInt::get(iBuilder->getInt8Ty(), 0xF0);
438    Value * second_move_right_4_byte = iBuilder->CreateLShr(iBuilder->CreateAnd(secondRemainByte, second_move_right_4_mask), 4);
439    Value * second_output_byte = iBuilder->CreateOr(first_move_left_4_byte, second_move_right_4_byte);
440    iBuilder->CreateStore(second_output_byte, iBuilder->CreateGEP(i8output_ptr, {iBuilder->CreateAdd(remainOutputStart, iBuilder->getInt64(1))}));
441
442    Value * second_move_left_2_mask = ConstantInt::get(iBuilder->getInt8Ty(), 0x0F);
443    Value * second_move_left_2_byte = iBuilder->CreateShl(iBuilder->CreateAnd(secondRemainByte, second_move_left_2_mask), 2);
444    iBuilder->CreateStore(second_move_left_2_byte, iBuilder->CreateGEP(i8output_ptr, {iBuilder->CreateAdd(remainOutputStart, iBuilder->getInt64(2))}));
445    iBuilder->CreateBr(fbExit);
446
447    iBuilder->SetInsertPoint(handleNoRemainSecondByte);
448    iBuilder->CreateStore(first_move_left_4_byte, iBuilder->CreateGEP(i8output_ptr, {iBuilder->CreateAdd(remainOutputStart, iBuilder->getInt64(1))}));
449    iBuilder->CreateBr(fbExit);
450
451    iBuilder->SetInsertPoint(fbExit);
452    Value * outputNumberAdd = iBuilder->CreateSelect(iBuilder->CreateICmpEQ(remainMod4, ConstantInt::get(iBuilder->getSizeTy(), 0)), ConstantInt::get(iBuilder->getSizeTy(), 0), ConstantInt::get(iBuilder->getSizeTy(), 1));
453    Value * produced = iBuilder->CreateAdd(getProducedItemCount(self), iBuilder->CreateAdd(remainingBytes, outputNumberAdd));
454    setProducedItemCount(self, produced);
455
456    iBuilder->CreateRetVoid();
457    iBuilder->restoreIP(savePoint);
458}
459
460void radix64Kernel::generateDoBlockMethod() {
461    auto savePoint = iBuilder->saveIP();
462
463    Function * doBlockFunction = iBuilder->getModule()->getFunction(mKernelName + doBlock_suffix);
464
465    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
466
467    Value * self = getParameter(doBlockFunction, "self");
468    Value * blockNo = getScalarField(self, blockNoScalar);
469
470    generateDoBlockLogic(self, blockNo);
471
472    iBuilder->CreateRetVoid();
473    iBuilder->restoreIP(savePoint);
474}
475
476
477void base64Kernel::generateDoBlockLogic(Value * self, Value * blockNo) {
478    Value * radix64stream_ptr = getStreamSetBlockPtr(self, "radix64stream", blockNo);
479    Value * base64stream_ptr = getStreamSetBlockPtr(self, "base64stream", blockNo);
480    Type * i8_t = iBuilder->getInt8Ty();
481   
482    for (unsigned i = 0; i < 8; i++) {
483        Value * bytepack = iBuilder->CreateBlockAlignedLoad(radix64stream_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(0), iBuilder->getInt32(i)});
484        Value * mask_gt_25 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 25)));
485        Value * mask_gt_51 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 51)));
486        Value * mask_eq_62 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 62)));
487        Value * mask_eq_63 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 63)));
488        // Strategy:
489        // 1. add ord('A') = 65 to all radix64 values, this sets the correct values for entries 0 to 25.
490        // 2. add ord('a') - ord('A') - (26 - 0) = 6 to all values >25, this sets the correct values for entries 0 to 51
491        // 3. subtract ord('a') - ord('0') + (52 - 26) = 75 to all values > 51, this sets the correct values for entries 0 to 61
492        // 4. subtract ord('0') - ord('+') + (62 - 52) = 15 for all values = 62
493        // 4. subtract ord('0') - ord('/') + (63 - 62) = 2 for all values = 63
494        Value * t0_25 = iBuilder->simd_add(8, bytepack, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 'A')));
495        Value * t0_51 = iBuilder->simd_add(8, t0_25, iBuilder->simd_and(mask_gt_25, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 6))));
496        Value * t0_61 = iBuilder->simd_sub(8, t0_51, iBuilder->simd_and(mask_gt_51, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 75))));
497        Value * t0_62 = iBuilder->simd_sub(8, t0_61, iBuilder->simd_and(mask_eq_62, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 15))));
498        Value * base64pack = iBuilder->simd_sub(8, t0_62, iBuilder->simd_and(mask_eq_63, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 2))));
499        iBuilder->CreateBlockAlignedStore(iBuilder->bitCast(base64pack), base64stream_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(0), iBuilder->getInt32(i)});
500    }
501    Value * produced = getProducedItemCount(self);
502    produced = iBuilder->CreateAdd(produced, ConstantInt::get(iBuilder->getSizeTy(), iBuilder->getStride()));
503    setProducedItemCount(self, produced);   
504}
505
506
507// Special processing for the base 64 format.   The output must always contain a multiple
508// of 4 bytes.   When the number of radix 64 values is not a multiple of 4
509// number of radix 64 values
510void base64Kernel::generateFinalBlockMethod() {
511    auto savePoint = iBuilder->saveIP();
512    Module * m = iBuilder->getModule();
513    Function * finalBlockFunction = m->getFunction(mKernelName + finalBlock_suffix);
514    BasicBlock * base64_fb_entry = BasicBlock::Create(iBuilder->getContext(), "base64_fb_entry", finalBlockFunction, 0);
515    iBuilder->SetInsertPoint(base64_fb_entry);
516    BasicBlock * base64_loop = BasicBlock::Create(iBuilder->getContext(), "base64_loop", finalBlockFunction, 0);
517    BasicBlock * loopExit = BasicBlock::Create(iBuilder->getContext(), "loopExit", finalBlockFunction, 0);
518    BasicBlock * doPadding = BasicBlock::Create(iBuilder->getContext(), "doPadding", finalBlockFunction, 0);
519    BasicBlock * doPadding2 = BasicBlock::Create(iBuilder->getContext(), "doPadding2", finalBlockFunction, 0);
520    BasicBlock * fbExit = BasicBlock::Create(iBuilder->getContext(), "fbExit", finalBlockFunction, 0);
521    // Final Block arguments: self, remaining.
522    Function::arg_iterator args = finalBlockFunction->arg_begin();
523    Value * self = &*(args++);
524    Value * remainingBytes = &*(args++);
525    Value * remainMod4 = iBuilder->CreateAnd(remainingBytes, ConstantInt::get(iBuilder->getSizeTy(), 3));
526    Value * padBytes = iBuilder->CreateSub(ConstantInt::get(iBuilder->getSizeTy(), 4), remainMod4);
527    padBytes = iBuilder->CreateAnd(padBytes, ConstantInt::get(iBuilder->getSizeTy(), 3));
528
529    const unsigned PACK_SIZE = iBuilder->getStride()/8;
530    Constant * packSize = ConstantInt::get(iBuilder->getSizeTy(), PACK_SIZE);
531    Value * blockNo = getScalarField(self, blockNoScalar);
532    Value * radix64stream_ptr = getStreamSetBlockPtr(self, "radix64stream", blockNo);
533    Value * base64stream_ptr = getStreamSetBlockPtr(self, "base64stream", blockNo);
534    Type * i8_t = iBuilder->getInt8Ty();
535   
536    // Enter the loop only if there is at least one byte remaining to process.
537    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, ConstantInt::get(iBuilder->getSizeTy(), 0)), fbExit, base64_loop);
538   
539    iBuilder->SetInsertPoint(base64_loop);
540    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
541    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
542    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), base64_fb_entry);
543    loopRemain->addIncoming(remainingBytes, base64_fb_entry);
544    Value * bytepack = iBuilder->CreateBlockAlignedLoad(radix64stream_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(0), idx});
545    Value * mask_gt_25 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 25)));
546    Value * mask_gt_51 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 51)));
547    Value * mask_eq_62 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 62)));
548    Value * mask_eq_63 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 63)));
549    Value * t0_25 = iBuilder->simd_add(8, bytepack, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 'A')));
550    Value * t0_51 = iBuilder->simd_add(8, t0_25, iBuilder->simd_and(mask_gt_25, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 6))));
551    Value * t0_61 = iBuilder->simd_sub(8, t0_51, iBuilder->simd_and(mask_gt_51, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 75))));
552    Value * t0_62 = iBuilder->simd_sub(8, t0_61, iBuilder->simd_and(mask_eq_62, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 15))));
553    Value * base64pack = iBuilder->simd_sub(8, t0_62, iBuilder->simd_and(mask_eq_63, iBuilder->simd_fill(8, ConstantInt::get(i8_t, 2))));
554    iBuilder->CreateBlockAlignedStore(iBuilder->bitCast(base64pack), base64stream_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(0), idx});
555    idx->addIncoming(iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1)), base64_loop);
556    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
557    loopRemain->addIncoming(remainAfterLoop, base64_loop);
558
559    Value* continueLoop = iBuilder->CreateICmpULT(remainAfterLoop, packSize);
560    iBuilder->CreateCondBr(continueLoop, base64_loop, loopExit);
561
562    iBuilder->SetInsertPoint(loopExit);
563    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(padBytes, ConstantInt::get(iBuilder->getSizeTy(), 0)), fbExit, doPadding);
564    iBuilder->SetInsertPoint(doPadding);
565    Value * i8output_ptr = iBuilder->CreatePointerCast(base64stream_ptr, iBuilder->getInt8PtrTy());
566    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8output_ptr, {remainingBytes}));
567    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, ConstantInt::get(iBuilder->getSizeTy(), 3)), fbExit, doPadding2);
568    iBuilder->SetInsertPoint(doPadding2);
569    Value * finalPadPos = iBuilder->CreateAdd(remainingBytes, ConstantInt::get(iBuilder->getSizeTy(), 1));
570    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8output_ptr, {finalPadPos}));
571    iBuilder->CreateBr(fbExit);
572    iBuilder->SetInsertPoint(fbExit);
573    Value * produced = iBuilder->CreateAdd(getProducedItemCount(self), iBuilder->CreateAdd(remainingBytes, padBytes));
574    setProducedItemCount(self, produced);
575
576
577    iBuilder->CreateRetVoid();
578    iBuilder->restoreIP(savePoint);
579}
580
581void base64Kernel::generateDoBlockMethod() {
582    auto savePoint = iBuilder->saveIP();
583
584    Function * doBlockFunction = iBuilder->getModule()->getFunction(mKernelName + doBlock_suffix);
585
586    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
587
588    Value * self = getParameter(doBlockFunction, "self");
589    Value * blockNo = getScalarField(self, blockNoScalar);
590
591    generateDoBlockLogic(self, blockNo);
592
593    iBuilder->CreateRetVoid();
594    iBuilder->restoreIP(savePoint);
595}
596
597}
Note: See TracBrowser for help on using the repository browser.