source: icGREP/icgrep-devel/icgrep/kernels/radix64.cpp @ 5497

Last change on this file since 5497 was 5440, checked in by nmedfort, 2 years ago

Large refactoring step. Removed IR generation code from Kernel (formally KernelBuilder?) and moved it into the new KernelBuilder? class.

File size: 23.5 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "radix64.h"
6#include <kernels/streamset.h>
7#include <kernels/kernel_builder.h>
8
9using namespace llvm;
10
11namespace kernel {
12
13// This kernel produces an expanded input stream by duplicating every third byte.
14// It is implemented using SIMD shufflevector operations.  With 16-byte registers,
15// a single shufflevector operation produces 16 bytes of output data from the
16// 12 bytes of input data.   With 32-byte registers, 32 bytes of output data are
17// produced from 24 bytes of input data.
18//
19// Using aligned SIMD loads, an inner loop processes three registers full of input
20// data (i.e., three BytePacks) to produce four registers full of output.   This is
21// a 3 step process.
22// Step 1:  Load input_pack0, apply the shuffle operation to produce output_pack0.
23//          At this point 3/4 of the data in input_pack0 has been processed.
24// Step 2:  Load input_pack1, apply a shuffle operation to use the remaining
25//          1/4 of input_pack0 and 1/2 of input_pack1 to produce output_pack1.
26//          At this point 1/2 of the data in input_pack1 has been processed.
27// Step 3:  Load input_pack2, apply a shuffle operation to use the remaining 1/2
28//          of input_pack1 and 1/4 of input_pack2 to produce output_pack2.
29//          Then apply a further shuffle operation to use the remaining 3/4 of
30//          input_pack2 to produce output_pack3.
31
32// The doSegment method processes input in terms of tripleBlocks, 3 blocks of input,
33// producing 4 blocks of output.   Unless less than one tripleBlock remains, the
34// doSegment method always processes an integral number of tripleBlocks as a logical
35// segment.  Both input and output buffers are hence maintained at block boundaries,
36// with the input data completely processed for each tripleBlock.
37//
38// The pipeline must guarantee that the doSegment method is called with
39// a contiguous buffer for the full segment (number of blocks).
40
41void expand3_4Kernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
42
43    BasicBlock * expand2_3entry = iBuilder->GetInsertBlock();
44    BasicBlock * expand_3_4_loop = iBuilder->CreateBasicBlock("expand_3_4_loop");
45    BasicBlock * expand3_4_loop_exit = iBuilder->CreateBasicBlock("expand3_4_loop_exit");
46    BasicBlock * finalStep1 = iBuilder->CreateBasicBlock("finalStep1");
47    BasicBlock * finalStep2 = iBuilder->CreateBasicBlock("finalStep2");
48    BasicBlock * step2load = iBuilder->CreateBasicBlock("step2load");
49    BasicBlock * step2store = iBuilder->CreateBasicBlock("step2store");
50    BasicBlock * finalStep3 = iBuilder->CreateBasicBlock("finalStep3");
51    BasicBlock * step3load = iBuilder->CreateBasicBlock("step3load");
52    BasicBlock * step3store = iBuilder->CreateBasicBlock("step3store");
53    BasicBlock * step3store2 = iBuilder->CreateBasicBlock("step3store2");
54    BasicBlock * itemsDone = iBuilder->CreateBasicBlock("itemsDone");
55    BasicBlock * expand3_4_final = iBuilder->CreateBasicBlock("expand3_4_final");
56    BasicBlock * expand3_4_exit = iBuilder->CreateBasicBlock("expand3_4_exit");
57   
58    // Determine the require shufflevector constants.
59    const unsigned PACK_SIZE = iBuilder->getStride()/8;
60   
61    // Construct a list of indexes in  the form
62    // 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, ...
63    unsigned sourceByteIndex = 0;
64    unsigned expand3_4_index[PACK_SIZE];
65    for (unsigned i = 0; i < PACK_SIZE; i++) {
66        expand3_4_index[i] = sourceByteIndex;
67        if (i % 4 != 2) sourceByteIndex++;
68    }
69    unsigned const expand3_4_offset[4] = {PACK_SIZE, 3*PACK_SIZE/4, PACK_SIZE/2, PACK_SIZE/4};
70    Value * expand_3_4_shuffle[4];
71    for (unsigned j = 0; j < 4; j++) {
72        std::vector<Constant *> Idxs;
73        for (unsigned i = 0; i < PACK_SIZE; i++) {
74            Idxs.push_back(ConstantInt::get(iBuilder->getInt32Ty(), expand3_4_offset[j] + expand3_4_index[i]));
75        }
76        expand_3_4_shuffle[j] = ConstantVector::get(Idxs);
77    }
78
79    Constant * tripleBlockSize = iBuilder->getSize(3 * iBuilder->getStride());
80    Constant * packSize = iBuilder->getSize(PACK_SIZE);
81    Constant * triplePackSize = iBuilder->getSize(3 * PACK_SIZE); // 3 packs per loop.
82    UndefValue * undefPack = UndefValue::get(iBuilder->fwVectorType(8));
83   
84    const unsigned packAlign = iBuilder->getBitBlockWidth()/8;
85
86    Value * processed = iBuilder->getProcessedItemCount("sourceStream");
87    Value * available = iBuilder->getAvailableItemCount("sourceStream");
88    Value * itemsAvail = iBuilder->CreateSub(available, processed);
89   
90    //
91    // The main loop processes 3 packs of data at a time.  For doFinal
92    // processing, process all the remaining sets of 3 packs, otherwise
93    // process in multiples of 3 full blocks of data.
94    //
95    Value * loopDivisor = iBuilder->CreateSelect(getIsFinal(), triplePackSize, tripleBlockSize);
96    Value * excessItems = iBuilder->CreateURem(itemsAvail, loopDivisor);
97    Value * loopItemsToDo = iBuilder->CreateSub(itemsAvail, excessItems);
98
99    // A block is made up of 8 packs.  Get the pointer to the first pack (changes the type of the pointer only).
100    Value * sourcePackPtr = iBuilder->getInputStreamPackPtr("sourceStream", iBuilder->getInt32(0), iBuilder->getInt32(0));
101    Value * outputPackPtr = iBuilder->getOutputStreamPackPtr("expandedStream", iBuilder->getInt32(0), iBuilder->getInt32(0));
102
103    Value * hasFullLoop = iBuilder->CreateICmpUGE(loopItemsToDo, triplePackSize);
104
105    iBuilder->CreateCondBr(hasFullLoop, expand_3_4_loop, expand3_4_loop_exit);
106    iBuilder->SetInsertPoint(expand_3_4_loop);
107    PHINode * loopInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
108    PHINode * loopOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
109    PHINode * loopItemsRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
110
111    loopInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
112    loopOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
113    loopItemsRemain->addIncoming(loopItemsToDo, expand2_3entry);
114
115    // Step 1 of the main loop.
116    Value * pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopInput_ptr, packAlign));
117    Value * expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
118    iBuilder->CreateBlockAlignedStore(expand0, loopOutput_ptr);
119    // Step 2 of the main loop.
120    Value * inPack1_ptr = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(1));
121    Value * outPack1_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(1));
122    Value * pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
123    Value * expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1, expand_3_4_shuffle[1]));
124    iBuilder->CreateBlockAlignedStore(expand1, outPack1_ptr);
125    // Step 3 of the main loop.
126    Value * inPack2_ptr = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(2));
127    Value * outPack2_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(2));
128    Value * pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
129    Value * expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1, pack2, expand_3_4_shuffle[2]));
130    iBuilder->CreateBlockAlignedStore(expand2, outPack2_ptr);
131    Value * outPack3_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(3));
132    Value * expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2, undefPack, expand_3_4_shuffle[3]));
133    iBuilder->CreateBlockAlignedStore(expand3, outPack3_ptr);
134
135    Value * loopNextInputPack = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(3));
136    Value * remainingItems = iBuilder->CreateSub(loopItemsRemain, triplePackSize);
137
138    Value * loopNextOutputPack;
139    loopNextOutputPack = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(4));
140
141    loopInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
142    loopOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
143    loopItemsRemain->addIncoming(remainingItems, expand_3_4_loop);
144
145    Value * continueLoop = iBuilder->CreateICmpUGE(remainingItems, triplePackSize);
146    iBuilder->CreateCondBr(continueLoop, expand_3_4_loop, expand3_4_loop_exit);
147   
148    iBuilder->SetInsertPoint(expand3_4_loop_exit);
149    PHINode * loopExitInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
150    PHINode * loopExitOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
151    loopExitInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
152    loopExitOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
153    loopExitInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
154    loopExitOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
155
156    // Update the processed items count based on the loopItemsToDo value.
157    processed = iBuilder->CreateAdd(processed, loopItemsToDo);
158    iBuilder->setProcessedItemCount("sourceStream", processed);
159
160
161    // Except for final segment processing, we are done.
162    iBuilder->CreateCondBr(getIsFinal(), expand3_4_final, expand3_4_exit);
163
164    // Final segment processing.   Less than a triplePack remains.
165    iBuilder->SetInsertPoint(expand3_4_final);
166   
167    // There may be one or two remaining full packs and/or a partial pack.
168    //
169    // We have several cases depending on the number of reumaing items.  Let N = packSize
170    // (a) 0 remaining items: all done
171    // (b) 1..3N/4 remaining items:  do Step1 only, no items or pending data will remain
172    // (c) 3N/4+1 .. N remaining items:  do Step 1, do Step 2 for pending data from Step 1 only, there is no more input.
173    // (d) N+1 .. 6N/4 remaining items:  do Step 1 and Step 2, no items or pending data will remain.
174    // (e) 6N/4+1 .. 2N remaining items: do Steps 1 and 2, do Step 3 for pending data only, there is no more input.
175    // (f) 2N+1 .. 9N/4 remaining items: do Steps 1 and 2, do Step 3 up to the first write only.
176    // (g) 9N/4+1 .. 3N - 1 remaining items: do Steps 1, 2 and 3.
177    Value * condition_a = iBuilder->CreateICmpEQ(excessItems, ConstantInt::getNullValue(iBuilder->getSizeTy()));
178    iBuilder->CreateCondBr(condition_a, itemsDone, finalStep1);
179    // Final Step1 processing
180    iBuilder->SetInsertPoint(finalStep1);
181    pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopExitInput_ptr, packAlign));
182    expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
183    iBuilder->CreateAlignedStore(expand0, loopExitOutput_ptr, packAlign);
184    Value * condition_b = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(3 * PACK_SIZE/4));
185    iBuilder->CreateCondBr(condition_b, itemsDone, finalStep2);
186    // Final Step 2 processing
187    iBuilder->SetInsertPoint(finalStep2);
188    Value * condition_c = iBuilder->CreateICmpULE(excessItems, packSize);
189    iBuilder->CreateCondBr(condition_c, step2store, step2load);
190    iBuilder->SetInsertPoint(step2load);
191    inPack1_ptr = iBuilder->CreateGEP(loopExitInput_ptr, iBuilder->getInt32(1));
192    pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
193    iBuilder->CreateBr(step2store);
194    iBuilder->SetInsertPoint(step2store);
195    PHINode * pack1phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
196    pack1phi->addIncoming(undefPack, finalStep2);
197    pack1phi->addIncoming(pack1, step2load);
198    outPack1_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(1));
199    expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1phi, expand_3_4_shuffle[1]));
200    iBuilder->CreateAlignedStore(expand1, outPack1_ptr, packAlign);
201    Value * condition_d = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(6 * PACK_SIZE/4));
202    iBuilder->CreateCondBr(condition_d, itemsDone, finalStep3);
203    // Final Step 3
204    iBuilder->SetInsertPoint(finalStep3);
205    Value * condition_e = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(2 * PACK_SIZE));
206    iBuilder->CreateCondBr(condition_e, step3store, step3load);
207    iBuilder->SetInsertPoint(step3load);
208    inPack2_ptr = iBuilder->CreateGEP(loopExitInput_ptr, iBuilder->getInt32(2));
209    pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
210    iBuilder->CreateBr(step3store);
211    iBuilder->SetInsertPoint(step3store);
212    PHINode * pack2phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
213    pack2phi->addIncoming(undefPack, finalStep3);
214    pack2phi->addIncoming(pack2, step3load);
215    outPack2_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(2));
216    expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1phi, pack2phi, expand_3_4_shuffle[2]));
217    iBuilder->CreateAlignedStore(expand2, outPack2_ptr, packAlign);
218    Value * condition_f = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(9 * PACK_SIZE/4));
219    iBuilder->CreateCondBr(condition_f, itemsDone, step3store2);
220    iBuilder->SetInsertPoint(step3store2);
221    outPack3_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(3));
222    expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2phi, undefPack, expand_3_4_shuffle[3]));
223    iBuilder->CreateAlignedStore(expand3, outPack3_ptr, packAlign);
224    iBuilder->CreateBr(itemsDone);
225    //
226    iBuilder->SetInsertPoint(itemsDone);
227    processed = iBuilder->CreateAdd(processed, excessItems);
228    iBuilder->setProcessedItemCount("sourceStream", processed);
229
230   
231    iBuilder->CreateBr(expand3_4_exit);
232    iBuilder->SetInsertPoint(expand3_4_exit);
233}
234
235
236// Radix 64 determination, converting 3 bytes to 4 6-bit values.
237//
238//  00000000|zyxwvuts|rqpmnlkj|hgfedcba    Original
239//           zy                            bits to move 6 positions right
240//             xwvuts                      bits to move 8 positions left
241//                    rqpm                 bits to move 4 positions right
242//                        nlkj             bits to move 10 positions left
243//                             hqfedc      bits to move 2 positions right
244//                                   ba    bits to move 12 positions left
245//    xwvuts|  nlkjzy|  barqpm|  hgfedc    Target
246inline Value * radix64Kernel::processPackData(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * bytepack) const {
247
248    Value * step_right_6 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00C00000));
249    Value * right_6_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_6), 6);
250
251    Value * step_left_8 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x003F0000));
252    Value * left_8_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_8), 8);
253    Value * mid = iBuilder->simd_or(right_6_result, left_8_result);
254
255    Value * step_right_4 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x0000F000));
256    Value * right_4_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_4), 4);
257    mid = iBuilder->simd_or(mid, right_4_result);
258
259    Value * step_left_10 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000F00));
260    Value * left_10_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_10), 10);
261    mid = iBuilder->simd_or(mid, left_10_result);
262
263    Value * step_right_2 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x000000FC));
264    Value * right_2_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_2), 2);
265    mid = iBuilder->simd_or(mid, right_2_result);
266
267    Value * step_left_12 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000003));
268    Value * left_12_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_12), 12);
269    mid = iBuilder->simd_or(mid, left_12_result);
270
271    return iBuilder->bitCast(mid);
272}
273
274void radix64Kernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
275    for (unsigned i = 0; i < 8; i++) {
276        Value * bytepack = iBuilder->loadInputStreamPack("expandedStream", iBuilder->getInt32(0), iBuilder->getInt32(i));
277        Value * radix64pack = processPackData(iBuilder, bytepack);
278        iBuilder->storeOutputStreamPack("radix64stream", iBuilder->getInt32(0), iBuilder->getInt32(i), radix64pack);
279    }
280}
281
282void radix64Kernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
283
284    BasicBlock * entry = iBuilder->GetInsertBlock();
285    BasicBlock * radix64_loop = iBuilder->CreateBasicBlock("radix64_loop");
286    BasicBlock * fbExit = iBuilder->CreateBasicBlock("fbExit");
287   
288    const unsigned PACK_SIZE = iBuilder->getStride()/8;
289    Constant * packSize = iBuilder->getSize(PACK_SIZE);
290
291    // Enter the loop only if there is at least one byte remaining to process.
292    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, radix64_loop);
293
294    iBuilder->SetInsertPoint(radix64_loop);
295    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
296    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
297    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), entry);
298    loopRemain->addIncoming(remainingBytes, entry);
299
300    Value * bytepack = iBuilder->loadInputStreamPack("expandedStream", iBuilder->getInt32(0), idx);
301    Value * radix64pack = processPackData(iBuilder, bytepack);
302    iBuilder->storeOutputStreamPack("radix64stream", iBuilder->getInt32(0), idx, radix64pack);
303
304    Value* nextIdx = iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1));
305    idx->addIncoming(nextIdx, radix64_loop);
306    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
307    loopRemain->addIncoming(remainAfterLoop, radix64_loop);
308
309    Value* continueLoop = iBuilder->CreateICmpSGT(remainAfterLoop, iBuilder->getSize(0));
310
311    iBuilder->CreateCondBr(continueLoop, radix64_loop, fbExit);
312
313    iBuilder->SetInsertPoint(fbExit);
314}
315
316inline llvm::Value* base64Kernel::processPackData(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value* bytepack) const {
317    Value * mask_gt_25 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(25)));
318    Value * mask_gt_51 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(51)));
319    Value * mask_eq_62 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(62)));
320    Value * mask_eq_63 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(63)));
321    // Strategy:
322    // 1. add ord('A') = 65 to all radix64 values, this sets the correct values for entries 0 to 25.
323    // 2. add ord('a') - ord('A') - (26 - 0) = 6 to all values >25, this sets the correct values for entries 0 to 51
324    // 3. subtract ord('a') - ord('0') + (52 - 26) = 75 to all values > 51, this sets the correct values for entries 0 to 61
325    // 4. subtract ord('0') - ord('+') + (62 - 52) = 15 for all values = 62
326    // 4. add ord('/') - ord('0') - (63 - 52) = 3 for all values = 63
327    Value * t0_25 = iBuilder->simd_add(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8('A')));
328    Value * t0_51 = iBuilder->simd_add(8, t0_25, iBuilder->simd_and(mask_gt_25, iBuilder->simd_fill(8, iBuilder->getInt8(6))));
329    Value * t0_61 = iBuilder->simd_sub(8, t0_51, iBuilder->simd_and(mask_gt_51, iBuilder->simd_fill(8, iBuilder->getInt8(75))));
330    Value * t0_62 = iBuilder->simd_sub(8, t0_61, iBuilder->simd_and(mask_eq_62, iBuilder->simd_fill(8, iBuilder->getInt8(15))));
331    return iBuilder->bitCast(iBuilder->simd_sub(8, t0_62, iBuilder->simd_and(mask_eq_63, iBuilder->simd_fill(8, iBuilder->getInt8(12)))));
332}
333
334void base64Kernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
335    for (unsigned i = 0; i < 8; i++) {
336        Value * bytepack = iBuilder->loadInputStreamPack("radix64stream", iBuilder->getInt32(0), iBuilder->getInt32(i));
337        Value * base64pack = processPackData(iBuilder, bytepack);
338        iBuilder->storeOutputStreamPack("base64stream", iBuilder->getInt32(0), iBuilder->getInt32(i), base64pack);
339    }
340}
341
342// Special processing for the base 64 format.   The output must always contain a multiple
343// of 4 bytes.   When the number of radix 64 values is not a multiple of 4
344// number of radix 64 values
345void base64Kernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
346
347    BasicBlock * entry = iBuilder->GetInsertBlock();
348    BasicBlock * base64_loop = iBuilder->CreateBasicBlock("base64_loop");
349    BasicBlock * loopExit = iBuilder->CreateBasicBlock("loopExit");
350    BasicBlock * doPadding = iBuilder->CreateBasicBlock("doPadding");
351    BasicBlock * doPadding2 = iBuilder->CreateBasicBlock("doPadding2");
352    BasicBlock * fbExit = iBuilder->CreateBasicBlock("fbExit");
353
354    Value * remainMod4 = iBuilder->CreateAnd(remainingBytes, iBuilder->getSize(3));
355    Value * padBytes = iBuilder->CreateSub(iBuilder->getSize(4), remainMod4);
356    padBytes = iBuilder->CreateAnd(padBytes, iBuilder->getSize(3));
357
358    Constant * packSize = iBuilder->getSize(iBuilder->getStride() / 8);
359
360    // Enter the loop only if there is at least one byte remaining to process.
361    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, base64_loop);
362
363    iBuilder->SetInsertPoint(base64_loop);
364    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
365    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
366    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), entry);
367    loopRemain->addIncoming(remainingBytes, entry);
368    Value * bytepack = iBuilder->loadInputStreamPack("radix64stream", iBuilder->getInt32(0), idx);
369    Value * base64pack = processPackData(iBuilder, bytepack);
370    iBuilder->storeOutputStreamPack("base64stream", iBuilder->getInt32(0), idx, base64pack);
371    idx->addIncoming(iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1)), base64_loop);
372    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
373    loopRemain->addIncoming(remainAfterLoop, base64_loop);
374
375    Value* continueLoop = iBuilder->CreateICmpSGT(remainAfterLoop, iBuilder->getSize(0));
376    iBuilder->CreateCondBr(continueLoop, base64_loop, loopExit);
377
378    iBuilder->SetInsertPoint(loopExit);
379    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(padBytes, iBuilder->getSize(0)), fbExit, doPadding);
380
381    iBuilder->SetInsertPoint(doPadding);
382    Value * i8output_ptr = iBuilder->getOutputStreamBlockPtr("base64stream", iBuilder->getInt32(0));
383    i8output_ptr = iBuilder->CreatePointerCast(i8output_ptr, iBuilder->getInt8PtrTy());
384    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8output_ptr, remainingBytes));
385    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, iBuilder->getSize(3)), fbExit, doPadding2);
386    iBuilder->SetInsertPoint(doPadding2);
387    Value * finalPadPos = iBuilder->CreateAdd(remainingBytes, iBuilder->getSize(1));
388    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8output_ptr, finalPadPos));
389    iBuilder->CreateBr(fbExit);
390    iBuilder->SetInsertPoint(fbExit);
391}
392
393expand3_4Kernel::expand3_4Kernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
394: SegmentOrientedKernel("expand3_4",
395            {Binding{iBuilder->getStreamSetTy(1, 8), "sourceStream"}},
396            {Binding{iBuilder->getStreamSetTy(1, 8), "expandedStream", FixedRatio(4,3)}},
397            {}, {}, {}) {
398}
399
400radix64Kernel::radix64Kernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
401: BlockOrientedKernel("radix64",
402            {Binding{iBuilder->getStreamSetTy(1, 8), "expandedStream"}},
403            {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
404            {}, {}, {}) {
405}
406
407base64Kernel::base64Kernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
408: BlockOrientedKernel("base64",
409            {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
410            {Binding{iBuilder->getStreamSetTy(1, 8), "base64stream", RoundUpToMultiple(4)}},
411            {}, {}, {}) {
412}
413
414}
Note: See TracBrowser for help on using the repository browser.