source: icGREP/icgrep-devel/icgrep/kernels/radix64.cpp @ 5277

Last change on this file since 5277 was 5277, checked in by cameron, 3 years ago

radix64/base64 fixes

File size: 35.1 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "radix64.h"
6#include <kernels/streamset.h>
7#include <IR_Gen/idisa_builder.h>
8#include <llvm/IR/Module.h>
9#include <llvm/Support/raw_ostream.h>
10
11using namespace llvm;
12
13namespace kernel {
14
15// This kernel produces an expanded input stream by duplicating every third byte.
16// It is implemented using SIMD shufflevector operations.  With 16-byte registers,
17// a single shufflevector operation produces 16 bytes of output data from the
18// 12 bytes of input data.   With 32-byte registers, 32 bytes of output data are
19// produced from 24 bytes of input data.
20//
21// Using aligned SIMD loads, an inner loop processes three registers full of input
22// data (i.e., three BytePacks) to produce four registers full of output.   This is
23// a 3 step process.
24// Step 1:  Load input_pack0, apply the shuffle operation to produce output_pack0.
25//          At this point 3/4 of the data in input_pack0 has been processed.
26// Step 2:  Load input_pack1, apply a shuffle operation to use the remaining
27//          1/4 of input_pack0 and 1/2 of input_pack1 to produce output_pack1.
28//          At this point 1/2 of the data in input_pack1 has been processed.
29// Step 3:  Load input_pack2, apply a shuffle operation to use the remaining 1/2
30//          of input_pack1 and 1/4 of input_pack2 to produce output_pack2.
31//          Then apply a further shuffle operation to use the remaining 3/4 of
32//          input_pack2 to produce output_pack3.
33
34// The doSegment method processes input in terms of tripleBlocks, 3 blocks of input,
35// producing 4 blocks of output.   Unless less than one tripleBlock remains, the
36// doSegment method always processes an integral number of tripleBlocks as a logical
37// segment.  Both input and output buffers are hence maintained at block boundaries,
38// with the input data completely processed for each tripleBlock.
39//
40// The pipeline must guarantee that the doSegment method is called with
41// a continuous buffer for the full segment (number of blocks).
42
43   
44expand3_4Kernel::expand3_4Kernel(IDISA::IDISA_Builder * iBuilder) :
45    KernelBuilder(iBuilder, "expand3_4",
46                  {Binding{iBuilder->getStreamSetTy(1, 8), "sourceStream"}},
47                  {Binding{iBuilder->getStreamSetTy(1, 8), "expandedStream"}},
48                  {}, {}, {}) {
49        setDoBlockUpdatesProducedItemCountsAttribute(true);
50    }
51   
52   
53void expand3_4Kernel::generateDoSegmentMethod() const {
54    IDISA::IDISA_Builder::InsertPoint savePoint = iBuilder->saveIP();
55    Module * m = iBuilder->getModule();
56    Function * doSegmentFunction = m->getFunction(mKernelName + doSegment_suffix);
57    BasicBlock * expand2_3entry = BasicBlock::Create(iBuilder->getContext(), "expand2_3entry", doSegmentFunction, 0);
58    iBuilder->SetInsertPoint(expand2_3entry);
59    BasicBlock * expand_3_4_loop = BasicBlock::Create(iBuilder->getContext(), "expand_3_4_loop", doSegmentFunction, 0);
60    BasicBlock * expand3_4_loop_exit = BasicBlock::Create(iBuilder->getContext(), "expand3_4_loop_exit", doSegmentFunction, 0);
61    BasicBlock * finalStep1 = BasicBlock::Create(iBuilder->getContext(), "finalStep1", doSegmentFunction, 0);
62    BasicBlock * finalStep2 = BasicBlock::Create(iBuilder->getContext(), "finalStep2", doSegmentFunction, 0);
63    BasicBlock * step2load = BasicBlock::Create(iBuilder->getContext(), "step2load", doSegmentFunction, 0);
64    BasicBlock * step2store = BasicBlock::Create(iBuilder->getContext(), "step2store", doSegmentFunction, 0);
65    BasicBlock * finalStep3 = BasicBlock::Create(iBuilder->getContext(), "finalStep3", doSegmentFunction, 0);
66    BasicBlock * step3load = BasicBlock::Create(iBuilder->getContext(), "step3load", doSegmentFunction, 0);
67    BasicBlock * step3store = BasicBlock::Create(iBuilder->getContext(), "step3store", doSegmentFunction, 0);
68    BasicBlock * step3store2 = BasicBlock::Create(iBuilder->getContext(), "step3store2", doSegmentFunction, 0);
69    BasicBlock * itemsDone = BasicBlock::Create(iBuilder->getContext(), "itemsDone", doSegmentFunction, 0);
70    BasicBlock * expand3_4_final = BasicBlock::Create(iBuilder->getContext(), "expand3_4_final", doSegmentFunction, 0);
71    BasicBlock * expand3_4_exit = BasicBlock::Create(iBuilder->getContext(), "expand3_4_exit", doSegmentFunction, 0);
72   
73    // Determine the require shufflevector constants.
74    const unsigned PACK_SIZE = iBuilder->getStride()/8;
75   
76    // Construct a list of indexes in  the form
77    // 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, ...
78    unsigned sourceByteIndex = 0;
79    unsigned expand3_4_index[PACK_SIZE];
80    for (unsigned i = 0; i < PACK_SIZE; i++) {
81        expand3_4_index[i] = sourceByteIndex;
82        if (i % 4 != 2) sourceByteIndex++;
83    }
84    unsigned const expand3_4_offset[4] = {PACK_SIZE, 3*PACK_SIZE/4, PACK_SIZE/2, PACK_SIZE/4};
85    Value * expand_3_4_shuffle[4];
86    for (unsigned j = 0; j < 4; j++) {
87        std::vector<Constant *> Idxs;
88        for (unsigned i = 0; i < PACK_SIZE; i++) {
89            Idxs.push_back(ConstantInt::get(iBuilder->getInt32Ty(), expand3_4_offset[j] + expand3_4_index[i]));
90        }
91        expand_3_4_shuffle[j] = ConstantVector::get(Idxs);
92    }
93    Constant * Const3 = iBuilder->getSize(3);
94    Constant * Const4 = iBuilder->getSize(4);
95    Constant * tripleBlockSize = iBuilder->getSize(3 * iBuilder->getStride());
96    Constant * stride = iBuilder->getSize(iBuilder->getStride());
97    Constant * packSize = iBuilder->getSize(PACK_SIZE);
98    Constant * triplePackSize = iBuilder->getSize(3 * PACK_SIZE); // 3 packs per loop.
99    UndefValue * undefPack = UndefValue::get(iBuilder->fwVectorType(8));
100   
101    const unsigned packAlign = iBuilder->getBitBlockWidth()/8;
102
103    Function::arg_iterator args = doSegmentFunction->arg_begin();
104    Value * self = &*(args++);
105    Value * doFinal = &*(args++);
106    Value * producerPos = &*(args++);
107    Value * processed = getProcessedItemCount(self, "sourceStream");
108    Value * itemsAvail = iBuilder->CreateSub(producerPos, processed);
109   
110    //
111    // The main loop processes 3 packs of data at a time.  For doFinal
112    // processing, process all the remaining sets of 3 packs, otherwise
113    // process in multiples of 3 full blocks of data.
114    //
115    Value * loopDivisor = iBuilder->CreateSelect(doFinal, triplePackSize, tripleBlockSize);
116    Value * excessItems = iBuilder->CreateURem(itemsAvail, loopDivisor);
117    Value * loopItemsToDo = iBuilder->CreateSub(itemsAvail, excessItems);
118
119    Value * blockNo = getScalarField(self, blockNoScalar);
120
121    // A block is made up of 8 packs.  Get the pointer to the first pack (changes the type of the pointer only).
122    Value * sourcePackPtr = getStream(self, "sourceStream", blockNo, iBuilder->getInt32(0), iBuilder->getInt32(0));
123
124    Value * outputGenerated = getProducedItemCount(self, "expandedStream"); // bytes previously generated to output
125    Value * outputBlockNo = iBuilder->CreateUDiv(outputGenerated, stride);
126    Value * outputPackPtr = getStream(self, "expandedStream", outputBlockNo, iBuilder->getInt32(0), iBuilder->getInt32(0));
127
128    Value * hasFullLoop = iBuilder->CreateICmpUGE(loopItemsToDo, triplePackSize);
129
130    iBuilder->CreateCondBr(hasFullLoop, expand_3_4_loop, expand3_4_loop_exit);
131    iBuilder->SetInsertPoint(expand_3_4_loop);
132    PHINode * loopInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
133    PHINode * loopOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
134    PHINode * loopItemsRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
135
136    loopInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
137    loopOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
138    loopItemsRemain->addIncoming(loopItemsToDo, expand2_3entry);
139
140    // Step 1 of the main loop.
141    Value * pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopInput_ptr, packAlign));
142    Value * expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
143    iBuilder->CreateAlignedStore(expand0, loopOutput_ptr, packAlign);
144    // Step 2 of the main loop.
145    Value * inPack1_ptr = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(1));
146    Value * outPack1_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(1));
147    Value * pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
148    Value * expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1, expand_3_4_shuffle[1]));
149    iBuilder->CreateAlignedStore(expand1, outPack1_ptr, packAlign);
150    // Step 3 of the main loop.
151    Value * inPack2_ptr = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(2));
152    Value * outPack2_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(2));
153    Value * pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
154    Value * expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1, pack2, expand_3_4_shuffle[2]));
155    iBuilder->CreateAlignedStore(expand2, outPack2_ptr, packAlign);
156    Value * outPack3_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(3));
157    Value * expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2, undefPack, expand_3_4_shuffle[3]));
158    iBuilder->CreateAlignedStore(expand3, outPack3_ptr, packAlign);
159
160    Value * loopNextInputPack = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(3));
161    Value * remainingItems = iBuilder->CreateSub(loopItemsRemain, triplePackSize);
162
163    Value * loopNextOutputPack;
164    loopNextOutputPack = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(4));
165
166    loopInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
167    loopOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
168    loopItemsRemain->addIncoming(remainingItems, expand_3_4_loop);
169
170    Value * continueLoop = iBuilder->CreateICmpUGE(remainingItems, triplePackSize);
171    iBuilder->CreateCondBr(continueLoop, expand_3_4_loop, expand3_4_loop_exit);
172   
173    iBuilder->SetInsertPoint(expand3_4_loop_exit);
174    PHINode * loopExitInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
175    PHINode * loopExitOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
176    loopExitInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
177    loopExitOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
178    loopExitInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
179    loopExitOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
180
181    // Update the produced and processed items count based on the loopItemsToDo value.
182    processed = iBuilder->CreateAdd(processed, loopItemsToDo);
183    setProcessedItemCount(self, "sourceStream", processed);
184   
185    setScalarField(self, blockNoScalar, iBuilder->CreateUDiv(processed, stride));
186    // We have produced 4 output bytes for every 3 input bytes.
187    Value * totalProduced = iBuilder->CreateMul(iBuilder->CreateUDiv(processed, Const3), Const4);
188    setProducedItemCount(self, "expandedStream", totalProduced);
189   
190    // Except for final segment processing, we are done.
191    iBuilder->CreateCondBr(doFinal, expand3_4_final, expand3_4_exit);
192
193    // Final segment processing.   Less than a triplePack remains.
194    iBuilder->SetInsertPoint(expand3_4_final);
195   
196    // There may be one or two remaining full packs and/or a partial pack.
197    //
198    // We have several cases depending on the number of reumaing items.  Let N = packSize
199    // (a) 0 remaining items: all done
200    // (b) 1..3N/4 remaining items:  do Step1 only, no items or pending data will remain
201    // (c) 3N/4+1 .. N remaining items:  do Step 1, do Step 2 for pending data from Step 1 only, there is no more input.
202    // (d) N+1 .. 6N/4 remaining items:  do Step 1 and Step 2, no items or pending data will remain.
203    // (e) 6N/4+1 .. 2N remaining items: do Steps 1 and 2, do Step 3 for pending data only, there is no more input.
204    // (f) 2N+1 .. 9N/4 remaining items: do Steps 1 and 2, do Step 3 up to the first write only.
205    // (g) 9N/4+1 .. 3N - 1 remaining items: do Steps 1, 2 and 3.
206    Value * condition_a = iBuilder->CreateICmpEQ(excessItems, ConstantInt::getNullValue(iBuilder->getSizeTy()));
207    iBuilder->CreateCondBr(condition_a, itemsDone, finalStep1);
208    // Final Step1 processing
209    iBuilder->SetInsertPoint(finalStep1);
210    pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopExitInput_ptr, packAlign));
211    expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
212    iBuilder->CreateAlignedStore(expand0, loopExitOutput_ptr, packAlign);
213    Value * condition_b = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(3 * PACK_SIZE/4));
214    iBuilder->CreateCondBr(condition_b, itemsDone, finalStep2);
215    // Final Step 2 processing
216    iBuilder->SetInsertPoint(finalStep2);
217    Value * condition_c = iBuilder->CreateICmpULE(excessItems, packSize);
218    iBuilder->CreateCondBr(condition_c, step2store, step2load);
219    iBuilder->SetInsertPoint(step2load);
220    inPack1_ptr = iBuilder->CreateGEP(loopExitInput_ptr, iBuilder->getInt32(1));
221    pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
222    iBuilder->CreateBr(step2store);
223    iBuilder->SetInsertPoint(step2store);
224    PHINode * pack1phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
225    pack1phi->addIncoming(undefPack, finalStep2);
226    pack1phi->addIncoming(pack1, step2load);
227    outPack1_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(1));
228    expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1phi, expand_3_4_shuffle[1]));
229    iBuilder->CreateAlignedStore(expand1, outPack1_ptr, packAlign);
230    Value * condition_d = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(6 * PACK_SIZE/4));
231    iBuilder->CreateCondBr(condition_d, itemsDone, finalStep3);
232    // Final Step 3
233    iBuilder->SetInsertPoint(finalStep3);
234    Value * condition_e = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(2 * PACK_SIZE));
235    iBuilder->CreateCondBr(condition_e, step3store, step3load);
236    iBuilder->SetInsertPoint(step3load);
237    inPack2_ptr = iBuilder->CreateGEP(loopExitInput_ptr, iBuilder->getInt32(2));
238    pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
239    iBuilder->CreateBr(step3store);
240    iBuilder->SetInsertPoint(step3store);
241    PHINode * pack2phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
242    pack2phi->addIncoming(undefPack, finalStep3);
243    pack2phi->addIncoming(pack2, step3load);
244    outPack2_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(2));
245    expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1phi, pack2phi, expand_3_4_shuffle[2]));
246    iBuilder->CreateAlignedStore(expand2, outPack2_ptr, packAlign);
247    Value * condition_f = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(9 * PACK_SIZE/4));
248    iBuilder->CreateCondBr(condition_f, itemsDone, step3store2);
249    iBuilder->SetInsertPoint(step3store2);
250    outPack3_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(3));
251    expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2phi, undefPack, expand_3_4_shuffle[3]));
252    iBuilder->CreateAlignedStore(expand3, outPack3_ptr, packAlign);
253    iBuilder->CreateBr(itemsDone);
254    //
255    iBuilder->SetInsertPoint(itemsDone);
256    processed = iBuilder->CreateAdd(processed, excessItems);
257    setProcessedItemCount(self, "sourceStream", processed);
258
259    setScalarField(self, blockNoScalar, iBuilder->CreateUDiv(processed, stride));
260    // We have produced 4 output bytes for every 3 input bytes.  If the number of input
261    // bytes is not a multiple of 3, then we have one more output byte for each excess
262    // input byte.
263    totalProduced = iBuilder->CreateAdd(iBuilder->CreateMul(iBuilder->CreateUDiv(processed, Const3), Const4), iBuilder->CreateURem(processed, Const3));
264    setProducedItemCount(self, "expandedStream", totalProduced);
265   
266    iBuilder->CreateBr(expand3_4_exit);
267    iBuilder->SetInsertPoint(expand3_4_exit);
268    iBuilder->CreateRetVoid();
269    iBuilder->restoreIP(savePoint);
270}
271
272
273// Radix 64 determination, converting 3 bytes to 4 6-bit values.
274//
275//  00000000|zyxwvuts|rqpmnlkj|hgfedcba    Original
276//           zy                            bits to move 6 positions right
277//             xwvuts                      bits to move 8 positions left
278//                    rqpm                 bits to move 4 positions right
279//                        nlkj             bits to move 10 positions left
280//                             hgfedc      bits to move 2 positions right
281//                                   ba    bits to move 12 positions left
282//    xwvuts|  nlkjzy|  barqpm|  hgfedc    Target
283void radix64Kernel::generateDoBlockLogic(Value * self, Value * blockNo) const {
284
285    Value * step_right_6 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00C00000));
286    Value * step_left_8 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x003F0000));
287    Value * step_right_4 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x0000F000));
288    Value * step_left_10 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000F00));
289    Value * step_right_2 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x000000FC));
290    Value * step_left_12 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000003));
291   
292    for (unsigned i = 0; i < 8; i++) {
293        Value * expandedStream = getStream(self, "expandedStream", blockNo, iBuilder->getInt32(0), iBuilder->getInt32(i));
294        Value * bytepack = iBuilder->CreateBlockAlignedLoad(expandedStream);
295
296        Value * right_6_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_6), 6);
297        Value * right_4_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_4), 4);
298        Value * right_2_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_2), 2);
299        Value * left_8_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_8), 8);
300        Value * left_10_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_10), 10);
301        Value * left_12_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_12), 12);
302
303        Value * mid = right_6_result;
304        mid = iBuilder->simd_or(mid, right_4_result);
305        mid = iBuilder->simd_or(mid, right_2_result);
306        mid = iBuilder->simd_or(mid, left_8_result);
307        mid = iBuilder->simd_or(mid, left_10_result);
308        mid = iBuilder->simd_or(mid, left_12_result);
309        Value * radix64pack = iBuilder->bitCast(mid);
310
311        Value * radix64stream = getStream(self, "radix64stream",blockNo, iBuilder->getInt32(0), iBuilder->getInt32(i));
312        iBuilder->CreateBlockAlignedStore(radix64pack, radix64stream);
313    }
314    Value * produced = getProducedItemCount(self, "radix64stream");
315    produced = iBuilder->CreateAdd(produced, iBuilder->getSize(iBuilder->getStride()));
316    setProducedItemCount(self, "radix64stream", produced);   
317}
318
319void radix64Kernel::generateFinalBlockMethod() const {
320    auto savePoint = iBuilder->saveIP();
321    Module * m = iBuilder->getModule();
322    Function * finalBlockFunction = m->getFunction(mKernelName + finalBlock_suffix);
323    BasicBlock * radix64_fb_entry = BasicBlock::Create(iBuilder->getContext(), "radix64_fb_entry", finalBlockFunction, 0);
324    iBuilder->SetInsertPoint(radix64_fb_entry);
325    BasicBlock * radix64_loop = BasicBlock::Create(iBuilder->getContext(), "radix64_loop", finalBlockFunction, 0);
326    BasicBlock * loopExit = BasicBlock::Create(iBuilder->getContext(), "loopExit", finalBlockFunction, 0);
327    BasicBlock * handleRemainFirstByte = BasicBlock::Create(iBuilder->getContext(), "handleRemainFirstByte", finalBlockFunction, 0);
328    BasicBlock * handleRemainSecondByte = BasicBlock::Create(iBuilder->getContext(), "handleRemainSecondByte", finalBlockFunction, 0);
329    BasicBlock * handleNoRemainSecondByte = BasicBlock::Create(iBuilder->getContext(), "handleNoRemainSecondByte", finalBlockFunction, 0);
330    BasicBlock * fbExit = BasicBlock::Create(iBuilder->getContext(), "fbExit", finalBlockFunction, 0);
331    // Final Block arguments: self, remaining.
332    Function::arg_iterator args = finalBlockFunction->arg_begin();
333    Value * self = &*(args++);
334    Value * remainingBytes = &*(args++);
335    Value * remainMod4 = iBuilder->CreateAnd(remainingBytes, iBuilder->getSize(3));
336
337    const unsigned PACK_SIZE = iBuilder->getStride()/8;
338    Constant * packSize = iBuilder->getSize(PACK_SIZE);
339    Value * blockNo = getScalarField(self, blockNoScalar);
340
341    Value * step_right_6 = iBuilder->simd_fill(32, iBuilder->getInt32(0x00C00000));
342    Value * step_left_8 = iBuilder->simd_fill(32, iBuilder->getInt32(0x003F0000));
343    Value * step_right_4 = iBuilder->simd_fill(32, iBuilder->getInt32(0x0000F000));
344    Value * step_left_10 = iBuilder->simd_fill(32, iBuilder->getInt32(0x00000F00));
345    Value * step_right_2 = iBuilder->simd_fill(32, iBuilder->getInt32(0x000000FC));
346    Value * step_left_12 = iBuilder->simd_fill(32, iBuilder->getInt32(0x00000003));
347
348
349    // Enter the loop only if there is at least one byte remaining to process.
350    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, radix64_loop);
351
352    iBuilder->SetInsertPoint(radix64_loop);
353    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
354    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
355    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), radix64_fb_entry);
356    loopRemain->addIncoming(remainingBytes, radix64_fb_entry);
357
358    Value * expandedStreamLoopPtr = getStream(self, "expandedStream", blockNo, iBuilder->getInt32(0), idx);
359    Value * bytepack = iBuilder->CreateBlockAlignedLoad(expandedStreamLoopPtr);
360    Value * right_6_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_6), 6);
361    Value * right_4_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_4), 4);
362    Value * right_2_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_2), 2);
363    Value * left_8_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_8), 8);
364    Value * left_10_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_10), 10);
365    Value * left_12_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_12), 12);
366
367    Value * mid = right_6_result;
368    mid = iBuilder->simd_or(mid, right_4_result);
369    mid = iBuilder->simd_or(mid, right_2_result);
370    mid = iBuilder->simd_or(mid, left_8_result);
371    mid = iBuilder->simd_or(mid, left_10_result);
372    mid = iBuilder->simd_or(mid, left_12_result);
373    Value * radix64pack = iBuilder->bitCast(mid);
374
375    Value * radix64streamPtr = getStream(self, "radix64stream", blockNo, iBuilder->getInt32(0), idx);
376    iBuilder->CreateBlockAlignedStore(radix64pack, radix64streamPtr);
377
378    Value* nextIdx = iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1));
379    idx->addIncoming(nextIdx, radix64_loop);
380    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
381    loopRemain->addIncoming(remainAfterLoop, radix64_loop);
382
383    Value* continueLoop = iBuilder->CreateICmpULT(remainAfterLoop, packSize);
384    iBuilder->CreateCondBr(continueLoop, radix64_loop, loopExit);
385
386    iBuilder->SetInsertPoint(loopExit);
387    // All base64 data has been computed, but we may need to set one or two '=' padding bytes.
388    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, iBuilder->getSize(0)), fbExit, handleRemainFirstByte);
389    iBuilder->SetInsertPoint(handleRemainFirstByte);
390    // At least one padding byte required.
391    Value * i8input_ptr = getStreamView(iBuilder->getInt8PtrTy(), self, "expandedStream", blockNo, iBuilder->getInt32(0));
392    Value * remainOutputStart = iBuilder->CreateSub(remainingBytes, remainMod4);
393
394    Value * firstRemainByte = iBuilder->CreateLoad(i8input_ptr);
395
396    Value * first_move_right_2_mask = ConstantInt::get(iBuilder->getInt8Ty(), 0xFC);
397    Value * first_output_byte = iBuilder->CreateLShr(iBuilder->CreateAnd(firstRemainByte, first_move_right_2_mask), 2);
398
399    Value * first_move_left_4_mask = ConstantInt::get(iBuilder->getInt8Ty(), 0x03);
400    Value * first_move_left_4_byte = iBuilder->CreateShl(iBuilder->CreateAnd(firstRemainByte, first_move_left_4_mask), 4);
401
402
403    Value * i8OutPtr0 = getStreamView(iBuilder->getInt8PtrTy(), self, "radix64stream", blockNo, remainOutputStart);
404
405    iBuilder->CreateStore(first_output_byte, i8OutPtr0);
406
407    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, iBuilder->getSize(1)), handleNoRemainSecondByte, handleRemainSecondByte);
408    iBuilder->SetInsertPoint(handleRemainSecondByte);
409
410    Value * secondRemainByte = iBuilder->CreateLoad(iBuilder->CreateGEP(i8input_ptr, iBuilder->getInt32(1)));
411    Value * second_move_right_4_mask = ConstantInt::get(iBuilder->getInt8Ty(), 0xF0);
412    Value * second_move_right_4_byte = iBuilder->CreateLShr(iBuilder->CreateAnd(secondRemainByte, second_move_right_4_mask), 4);
413    Value * second_output_byte = iBuilder->CreateOr(first_move_left_4_byte, second_move_right_4_byte);
414
415    Value * i8OutPtr1 = getStreamView(iBuilder->getInt8PtrTy(), self, "radix64stream", blockNo, iBuilder->CreateAdd(remainOutputStart, iBuilder->getInt64(1)));
416
417    iBuilder->CreateStore(second_output_byte, i8OutPtr1);
418
419    Value * second_move_left_2_mask = ConstantInt::get(iBuilder->getInt8Ty(), 0x0F);
420    Value * second_move_left_2_byte = iBuilder->CreateShl(iBuilder->CreateAnd(secondRemainByte, second_move_left_2_mask), 2);
421
422    Value * i8OutPtr2 = getStreamView(iBuilder->getInt8PtrTy(), self, "radix64stream", blockNo, iBuilder->CreateAdd(remainOutputStart, iBuilder->getInt64(2)));
423
424    iBuilder->CreateStore(second_move_left_2_byte, i8OutPtr2);
425    iBuilder->CreateBr(fbExit);
426
427    iBuilder->SetInsertPoint(handleNoRemainSecondByte);
428
429    i8OutPtr1 = getStreamView(iBuilder->getInt8PtrTy(), self, "radix64stream", blockNo, iBuilder->CreateAdd(remainOutputStart, iBuilder->getInt64(1)));
430
431    iBuilder->CreateStore(first_move_left_4_byte, i8OutPtr1);
432    iBuilder->CreateBr(fbExit);
433
434    iBuilder->SetInsertPoint(fbExit);
435    Value * outputNumberAdd = iBuilder->CreateSelect(iBuilder->CreateICmpEQ(remainMod4, iBuilder->getSize(0)), iBuilder->getSize(0), iBuilder->getSize(1));
436    Value * produced = iBuilder->CreateAdd(getProducedItemCount(self, "radix64stream"), iBuilder->CreateAdd(remainingBytes, outputNumberAdd));
437    setProducedItemCount(self, "radix64stream", produced);
438
439    iBuilder->CreateRetVoid();
440    iBuilder->restoreIP(savePoint);
441}
442
443   
444radix64Kernel::radix64Kernel(IDISA::IDISA_Builder * iBuilder) :
445    KernelBuilder(iBuilder, "radix64",
446                  {Binding{iBuilder->getStreamSetTy(1, 8), "expandedStream"}},
447                  {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
448                  {}, {}, {}) {
449        setDoBlockUpdatesProducedItemCountsAttribute(true);
450}
451   
452void radix64Kernel::generateDoBlockMethod() const {
453    auto savePoint = iBuilder->saveIP();
454
455    Function * doBlockFunction = iBuilder->getModule()->getFunction(mKernelName + doBlock_suffix);
456
457    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
458
459    Value * self = getParameter(doBlockFunction, "self");
460    Value * blockNo = getScalarField(self, blockNoScalar);
461
462    generateDoBlockLogic(self, blockNo);
463
464    iBuilder->CreateRetVoid();
465    iBuilder->restoreIP(savePoint);
466}
467
468base64Kernel::base64Kernel(IDISA::IDISA_Builder * iBuilder) :
469    KernelBuilder(iBuilder, "base64",
470                  {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
471                  {Binding{iBuilder->getStreamSetTy(1, 8), "base64stream"}},
472                  {}, {}, {}) {
473        setDoBlockUpdatesProducedItemCountsAttribute(true);
474    }
475   
476
477void base64Kernel::generateDoBlockLogic(Value * self, Value * blockNo) const {       
478    for (unsigned i = 0; i < 8; i++) {
479        Value * radix64stream_ptr = getStream(self, "radix64stream", blockNo, iBuilder->getInt32(0), iBuilder->getInt32(i));
480        Value * bytepack = iBuilder->CreateBlockAlignedLoad(radix64stream_ptr);
481        Value * mask_gt_25 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(25)));
482        Value * mask_gt_51 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(51)));
483        Value * mask_eq_62 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(62)));
484        Value * mask_eq_63 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(63)));
485        // Strategy:
486        // 1. add ord('A') = 65 to all radix64 values, this sets the correct values for entries 0 to 25.
487        // 2. add ord('a') - ord('A') - (26 - 0) = 6 to all values >25, this sets the correct values for entries 0 to 51
488        // 3. subtract ord('a') - ord('0') + (52 - 26) = 75 to all values > 51, this sets the correct values for entries 0 to 61
489        // 4. subtract ord('0') - ord('+') + (62 - 52) = 15 for all values = 62
490        // 4. subtract ord('0') - ord('/') + (63 - 62) = 2 for all values = 63
491        Value * t0_25 = iBuilder->simd_add(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8('A')));
492        Value * t0_51 = iBuilder->simd_add(8, t0_25, iBuilder->simd_and(mask_gt_25, iBuilder->simd_fill(8, iBuilder->getInt8(6))));
493        Value * t0_61 = iBuilder->simd_sub(8, t0_51, iBuilder->simd_and(mask_gt_51, iBuilder->simd_fill(8, iBuilder->getInt8(75))));
494        Value * t0_62 = iBuilder->simd_sub(8, t0_61, iBuilder->simd_and(mask_eq_62, iBuilder->simd_fill(8, iBuilder->getInt8(15))));
495        Value * base64pack = iBuilder->simd_sub(8, t0_62, iBuilder->simd_and(mask_eq_63, iBuilder->simd_fill(8, iBuilder->getInt8(2))));
496        Value * base64stream_ptr = getStream(self, "base64stream", blockNo, iBuilder->getInt32(0), iBuilder->getInt32(i));
497        iBuilder->CreateBlockAlignedStore(iBuilder->bitCast(base64pack), base64stream_ptr);
498    }
499    Value * produced = getProducedItemCount(self, "base64stream");
500    produced = iBuilder->CreateAdd(produced, iBuilder->getSize(iBuilder->getStride()));
501    setProducedItemCount(self, "base64stream", produced);
502}
503
504
505// Special processing for the base 64 format.   The output must always contain a multiple
506// of 4 bytes.   When the number of radix 64 values is not a multiple of 4,
507// padding bytes are added to bring the output up to the next multiple of 4.
508void base64Kernel::generateFinalBlockMethod() const {
509    auto savePoint = iBuilder->saveIP();
510    Module * m = iBuilder->getModule();
511    Function * finalBlockFunction = m->getFunction(mKernelName + finalBlock_suffix);
512    BasicBlock * base64_fb_entry = BasicBlock::Create(iBuilder->getContext(), "base64_fb_entry", finalBlockFunction, 0);
513    iBuilder->SetInsertPoint(base64_fb_entry);
514    BasicBlock * base64_loop = BasicBlock::Create(iBuilder->getContext(), "base64_loop", finalBlockFunction, 0);
515    BasicBlock * loopExit = BasicBlock::Create(iBuilder->getContext(), "loopExit", finalBlockFunction, 0);
516    BasicBlock * doPadding = BasicBlock::Create(iBuilder->getContext(), "doPadding", finalBlockFunction, 0);
517    BasicBlock * doPadding2 = BasicBlock::Create(iBuilder->getContext(), "doPadding2", finalBlockFunction, 0);
518    BasicBlock * fbExit = BasicBlock::Create(iBuilder->getContext(), "fbExit", finalBlockFunction, 0);
519    // Final Block arguments: self, remaining.
520    Function::arg_iterator args = finalBlockFunction->arg_begin();
521    Value * self = &*(args++);
522    Value * remainingBytes = &*(args++);
523    Value * remainMod4 = iBuilder->CreateAnd(remainingBytes, iBuilder->getSize(3));
524    Value * padBytes = iBuilder->CreateSub(iBuilder->getSize(4), remainMod4);
525    padBytes = iBuilder->CreateAnd(padBytes, iBuilder->getSize(3));
526
527    Constant * packSize = iBuilder->getSize(iBuilder->getStride() / 8);
528    Value * blockNo = getScalarField(self, blockNoScalar);
529
530    // Enter the loop only if there is at least one byte remaining to process.
531    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, base64_loop);
532   
533    iBuilder->SetInsertPoint(base64_loop);
534    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
535    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
536    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), base64_fb_entry);
537    loopRemain->addIncoming(remainingBytes, base64_fb_entry);
538    Value * radix64streamPtr = getStream(self, "radix64stream", blockNo, iBuilder->getInt32(0), idx);
539    Value * bytepack = iBuilder->CreateBlockAlignedLoad(radix64streamPtr);
540    Value * mask_gt_25 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(25)));
541    Value * mask_gt_51 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(51)));
542    Value * mask_eq_62 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(62)));
543    Value * mask_eq_63 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(63)));
544    Value * t0_25 = iBuilder->simd_add(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8('A')));
545    Value * t0_51 = iBuilder->simd_add(8, t0_25, iBuilder->simd_and(mask_gt_25, iBuilder->simd_fill(8, iBuilder->getInt8(6))));
546    Value * t0_61 = iBuilder->simd_sub(8, t0_51, iBuilder->simd_and(mask_gt_51, iBuilder->simd_fill(8, iBuilder->getInt8(75))));
547    Value * t0_62 = iBuilder->simd_sub(8, t0_61, iBuilder->simd_and(mask_eq_62, iBuilder->simd_fill(8, iBuilder->getInt8(15))));
548    Value * base64pack = iBuilder->simd_sub(8, t0_62, iBuilder->simd_and(mask_eq_63, iBuilder->simd_fill(8, iBuilder->getInt8(2))));
549    Value * base64streamPtr = getStream(self, "base64stream", blockNo, iBuilder->getInt32(0), idx);
550    iBuilder->CreateBlockAlignedStore(iBuilder->bitCast(base64pack), base64streamPtr);
551    idx->addIncoming(iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1)), base64_loop);
552    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
553    loopRemain->addIncoming(remainAfterLoop, base64_loop);
554
555    Value* continueLoop = iBuilder->CreateICmpULT(remainAfterLoop, packSize);
556    iBuilder->CreateCondBr(continueLoop, base64_loop, loopExit);
557
558    iBuilder->SetInsertPoint(loopExit);
559    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(padBytes, iBuilder->getSize(0)), fbExit, doPadding);
560
561    iBuilder->SetInsertPoint(doPadding);
562    Value * i8output_ptr = getStreamView(iBuilder->getInt8PtrTy(), self, "base64stream", blockNo, iBuilder->getInt32(0));
563    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8output_ptr, remainingBytes));
564    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, iBuilder->getSize(3)), fbExit, doPadding2);
565    iBuilder->SetInsertPoint(doPadding2);
566    Value * finalPadPos = iBuilder->CreateAdd(remainingBytes, iBuilder->getSize(1));
567    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8output_ptr, finalPadPos));
568    iBuilder->CreateBr(fbExit);
569    iBuilder->SetInsertPoint(fbExit);
570    Value * produced = iBuilder->CreateAdd(getProducedItemCount(self, "base64stream"), iBuilder->CreateAdd(remainingBytes, padBytes));
571    setProducedItemCount(self, "base64stream", produced);
572    iBuilder->CreateRetVoid();
573    iBuilder->restoreIP(savePoint);
574}
575
576void base64Kernel::generateDoBlockMethod() const {
577    auto savePoint = iBuilder->saveIP();
578
579    Function * doBlockFunction = iBuilder->getModule()->getFunction(mKernelName + doBlock_suffix);
580
581    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
582
583    Value * self = getParameter(doBlockFunction, "self");
584    Value * blockNo = getScalarField(self, blockNoScalar);
585
586    generateDoBlockLogic(self, blockNo);
587
588    iBuilder->CreateRetVoid();
589    iBuilder->restoreIP(savePoint);
590}
591
592}
Note: See TracBrowser for help on using the repository browser.