source: icGREP/icgrep-devel/icgrep/kernels/radix64.cpp @ 5356

Last change on this file since 5356 was 5356, checked in by cameron, 2 years ago

Eliminate obsolete DoBlockUpdatesProducedItemCounts? attribute

File size: 26.1 KB
RevLine 
[5216]1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
[5232]5#include "radix64.h"
[5267]6#include <kernels/streamset.h>
[5238]7#include <IR_Gen/idisa_builder.h>
[5260]8#include <llvm/IR/Module.h>
[5216]9#include <llvm/Support/raw_ostream.h>
10
11using namespace llvm;
12
[5260]13namespace kernel {
14
// This kernel produces an expanded input stream by duplicating every third byte.
// It is implemented using SIMD shufflevector operations.  With 16-byte registers,
// a single shufflevector operation produces 16 bytes of output data from
// 12 bytes of input data.   With 32-byte registers, 32 bytes of output data are
// produced from 24 bytes of input data.
//
// Using aligned SIMD loads, an inner loop processes three registers full of input
// data (i.e., three BytePacks) to produce four registers full of output.   This is
// a 3 step process.
// Step 1:  Load input_pack0, apply the shuffle operation to produce output_pack0.
//          At this point 3/4 of the data in input_pack0 has been processed.
// Step 2:  Load input_pack1, apply a shuffle operation to use the remaining
//          1/4 of input_pack0 and 1/2 of input_pack1 to produce output_pack1.
//          At this point 1/2 of the data in input_pack1 has been processed.
// Step 3:  Load input_pack2, apply a shuffle operation to use the remaining 1/2
//          of input_pack1 and 1/4 of input_pack2 to produce output_pack2.
//          Then apply a further shuffle operation to use the remaining 3/4 of
//          input_pack2 to produce output_pack3.
33
// The doSegment method processes input in terms of tripleBlocks, 3 blocks of input,
// producing 4 blocks of output.   Unless less than one tripleBlock remains, the
// doSegment method always processes an integral number of tripleBlocks as a logical
// segment.  Both input and output buffers are hence maintained at block boundaries,
// with the input data completely processed for each tripleBlock.
//
// The pipeline must guarantee that the doSegment method is called with
// a contiguous buffer for the full segment (number of blocks).
42
[5292]43void expand3_4Kernel::generateDoSegmentMethod(Value *doFinal, const std::vector<Value *> &producerPos) {
[5286]44
45    BasicBlock * expand2_3entry = iBuilder->GetInsertBlock();
[5292]46    BasicBlock * expand_3_4_loop = CreateBasicBlock("expand_3_4_loop");
47    BasicBlock * expand3_4_loop_exit = CreateBasicBlock("expand3_4_loop_exit");
48    BasicBlock * finalStep1 = CreateBasicBlock("finalStep1");
49    BasicBlock * finalStep2 = CreateBasicBlock("finalStep2");
50    BasicBlock * step2load = CreateBasicBlock("step2load");
51    BasicBlock * step2store = CreateBasicBlock("step2store");
52    BasicBlock * finalStep3 = CreateBasicBlock("finalStep3");
53    BasicBlock * step3load = CreateBasicBlock("step3load");
54    BasicBlock * step3store = CreateBasicBlock("step3store");
55    BasicBlock * step3store2 = CreateBasicBlock("step3store2");
56    BasicBlock * itemsDone = CreateBasicBlock("itemsDone");
57    BasicBlock * expand3_4_final = CreateBasicBlock("expand3_4_final");
58    BasicBlock * expand3_4_exit = CreateBasicBlock("expand3_4_exit");
[5216]59   
60    // Determine the require shufflevector constants.
61    const unsigned PACK_SIZE = iBuilder->getStride()/8;
62   
63    // Construct a list of indexes in  the form
64    // 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, ...
65    unsigned sourceByteIndex = 0;
66    unsigned expand3_4_index[PACK_SIZE];
67    for (unsigned i = 0; i < PACK_SIZE; i++) {
68        expand3_4_index[i] = sourceByteIndex;
69        if (i % 4 != 2) sourceByteIndex++;
70    }
71    unsigned const expand3_4_offset[4] = {PACK_SIZE, 3*PACK_SIZE/4, PACK_SIZE/2, PACK_SIZE/4};
72    Value * expand_3_4_shuffle[4];
73    for (unsigned j = 0; j < 4; j++) {
74        std::vector<Constant *> Idxs;
75        for (unsigned i = 0; i < PACK_SIZE; i++) {
76            Idxs.push_back(ConstantInt::get(iBuilder->getInt32Ty(), expand3_4_offset[j] + expand3_4_index[i]));
77        }
78        expand_3_4_shuffle[j] = ConstantVector::get(Idxs);
[5232]79    }
[5325]80
[5277]81    Constant * tripleBlockSize = iBuilder->getSize(3 * iBuilder->getStride());
[5246]82    Constant * packSize = iBuilder->getSize(PACK_SIZE);
[5277]83    Constant * triplePackSize = iBuilder->getSize(3 * PACK_SIZE); // 3 packs per loop.
[5260]84    UndefValue * undefPack = UndefValue::get(iBuilder->fwVectorType(8));
[5216]85   
86    const unsigned packAlign = iBuilder->getBitBlockWidth()/8;
[5276]87
[5292]88    Value * processed = getProcessedItemCount("sourceStream");
[5286]89    Value * itemsAvail = iBuilder->CreateSub(producerPos[0], processed);
[5216]90   
[5277]91    //
92    // The main loop processes 3 packs of data at a time.  For doFinal
93    // processing, process all the remaining sets of 3 packs, otherwise
94    // process in multiples of 3 full blocks of data.
95    //
96    Value * loopDivisor = iBuilder->CreateSelect(doFinal, triplePackSize, tripleBlockSize);
97    Value * excessItems = iBuilder->CreateURem(itemsAvail, loopDivisor);
98    Value * loopItemsToDo = iBuilder->CreateSub(itemsAvail, excessItems);
[5232]99
[5260]100    // A block is made up of 8 packs.  Get the pointer to the first pack (changes the type of the pointer only).
[5317]101    Value * sourcePackPtr = getInputStreamPackPtr("sourceStream", iBuilder->getInt32(0), iBuilder->getInt32(0));
102    Value * outputPackPtr = getOutputStreamPackPtr("expandedStream", iBuilder->getInt32(0), iBuilder->getInt32(0));
[5232]103
[5277]104    Value * hasFullLoop = iBuilder->CreateICmpUGE(loopItemsToDo, triplePackSize);
[5232]105
[5216]106    iBuilder->CreateCondBr(hasFullLoop, expand_3_4_loop, expand3_4_loop_exit);
107    iBuilder->SetInsertPoint(expand_3_4_loop);
108    PHINode * loopInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
109    PHINode * loopOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
110    PHINode * loopItemsRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
[5232]111
[5216]112    loopInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
113    loopOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
[5277]114    loopItemsRemain->addIncoming(loopItemsToDo, expand2_3entry);
[5232]115
[5216]116    // Step 1 of the main loop.
117    Value * pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopInput_ptr, packAlign));
118    Value * expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
[5297]119    iBuilder->CreateBlockAlignedStore(expand0, loopOutput_ptr);
[5216]120    // Step 2 of the main loop.
[5240]121    Value * inPack1_ptr = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(1));
122    Value * outPack1_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(1));
[5216]123    Value * pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
124    Value * expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1, expand_3_4_shuffle[1]));
[5297]125    iBuilder->CreateBlockAlignedStore(expand1, outPack1_ptr);
[5216]126    // Step 3 of the main loop.
[5240]127    Value * inPack2_ptr = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(2));
128    Value * outPack2_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(2));
[5216]129    Value * pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
130    Value * expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1, pack2, expand_3_4_shuffle[2]));
[5297]131    iBuilder->CreateBlockAlignedStore(expand2, outPack2_ptr);
[5240]132    Value * outPack3_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(3));
[5216]133    Value * expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2, undefPack, expand_3_4_shuffle[3]));
[5297]134    iBuilder->CreateBlockAlignedStore(expand3, outPack3_ptr);
[5232]135
[5240]136    Value * loopNextInputPack = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(3));
[5277]137    Value * remainingItems = iBuilder->CreateSub(loopItemsRemain, triplePackSize);
[5232]138
139    Value * loopNextOutputPack;
[5240]140    loopNextOutputPack = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(4));
[5232]141
[5216]142    loopInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
143    loopOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
144    loopItemsRemain->addIncoming(remainingItems, expand_3_4_loop);
[5232]145
[5277]146    Value * continueLoop = iBuilder->CreateICmpUGE(remainingItems, triplePackSize);
[5216]147    iBuilder->CreateCondBr(continueLoop, expand_3_4_loop, expand3_4_loop_exit);
[5277]148   
[5216]149    iBuilder->SetInsertPoint(expand3_4_loop_exit);
150    PHINode * loopExitInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
151    PHINode * loopExitOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
152    loopExitInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
153    loopExitOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
154    loopExitInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
155    loopExitOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
[5277]156
[5327]157    // Update the processed items count based on the loopItemsToDo value.
[5277]158    processed = iBuilder->CreateAdd(processed, loopItemsToDo);
[5292]159    setProcessedItemCount("sourceStream", processed);
[5325]160
161
[5277]162    // Except for final segment processing, we are done.
163    iBuilder->CreateCondBr(doFinal, expand3_4_final, expand3_4_exit);
164
165    // Final segment processing.   Less than a triplePack remains.
166    iBuilder->SetInsertPoint(expand3_4_final);
167   
[5232]168    // There may be one or two remaining full packs and/or a partial pack.
[5216]169    //
170    // We have several cases depending on the number of reumaing items.  Let N = packSize
171    // (a) 0 remaining items: all done
172    // (b) 1..3N/4 remaining items:  do Step1 only, no items or pending data will remain
173    // (c) 3N/4+1 .. N remaining items:  do Step 1, do Step 2 for pending data from Step 1 only, there is no more input.
174    // (d) N+1 .. 6N/4 remaining items:  do Step 1 and Step 2, no items or pending data will remain.
175    // (e) 6N/4+1 .. 2N remaining items: do Steps 1 and 2, do Step 3 for pending data only, there is no more input.
176    // (f) 2N+1 .. 9N/4 remaining items: do Steps 1 and 2, do Step 3 up to the first write only.
177    // (g) 9N/4+1 .. 3N - 1 remaining items: do Steps 1, 2 and 3.
[5277]178    Value * condition_a = iBuilder->CreateICmpEQ(excessItems, ConstantInt::getNullValue(iBuilder->getSizeTy()));
[5216]179    iBuilder->CreateCondBr(condition_a, itemsDone, finalStep1);
180    // Final Step1 processing
181    iBuilder->SetInsertPoint(finalStep1);
182    pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopExitInput_ptr, packAlign));
183    expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
184    iBuilder->CreateAlignedStore(expand0, loopExitOutput_ptr, packAlign);
[5277]185    Value * condition_b = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(3 * PACK_SIZE/4));
[5216]186    iBuilder->CreateCondBr(condition_b, itemsDone, finalStep2);
187    // Final Step 2 processing
188    iBuilder->SetInsertPoint(finalStep2);
[5277]189    Value * condition_c = iBuilder->CreateICmpULE(excessItems, packSize);
[5216]190    iBuilder->CreateCondBr(condition_c, step2store, step2load);
191    iBuilder->SetInsertPoint(step2load);
[5240]192    inPack1_ptr = iBuilder->CreateGEP(loopExitInput_ptr, iBuilder->getInt32(1));
[5216]193    pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
194    iBuilder->CreateBr(step2store);
195    iBuilder->SetInsertPoint(step2store);
196    PHINode * pack1phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
197    pack1phi->addIncoming(undefPack, finalStep2);
198    pack1phi->addIncoming(pack1, step2load);
[5240]199    outPack1_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(1));
[5216]200    expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1phi, expand_3_4_shuffle[1]));
201    iBuilder->CreateAlignedStore(expand1, outPack1_ptr, packAlign);
[5277]202    Value * condition_d = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(6 * PACK_SIZE/4));
[5216]203    iBuilder->CreateCondBr(condition_d, itemsDone, finalStep3);
204    // Final Step 3
205    iBuilder->SetInsertPoint(finalStep3);
[5277]206    Value * condition_e = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(2 * PACK_SIZE));
[5216]207    iBuilder->CreateCondBr(condition_e, step3store, step3load);
208    iBuilder->SetInsertPoint(step3load);
[5240]209    inPack2_ptr = iBuilder->CreateGEP(loopExitInput_ptr, iBuilder->getInt32(2));
[5216]210    pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
211    iBuilder->CreateBr(step3store);
212    iBuilder->SetInsertPoint(step3store);
213    PHINode * pack2phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
214    pack2phi->addIncoming(undefPack, finalStep3);
215    pack2phi->addIncoming(pack2, step3load);
[5240]216    outPack2_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(2));
[5216]217    expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1phi, pack2phi, expand_3_4_shuffle[2]));
218    iBuilder->CreateAlignedStore(expand2, outPack2_ptr, packAlign);
[5277]219    Value * condition_f = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(9 * PACK_SIZE/4));
[5216]220    iBuilder->CreateCondBr(condition_f, itemsDone, step3store2);
221    iBuilder->SetInsertPoint(step3store2);
[5240]222    outPack3_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(3));
[5216]223    expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2phi, undefPack, expand_3_4_shuffle[3]));
224    iBuilder->CreateAlignedStore(expand3, outPack3_ptr, packAlign);
225    iBuilder->CreateBr(itemsDone);
226    //
227    iBuilder->SetInsertPoint(itemsDone);
[5277]228    processed = iBuilder->CreateAdd(processed, excessItems);
[5292]229    setProcessedItemCount("sourceStream", processed);
[5232]230
[5325]231   
[5216]232    iBuilder->CreateBr(expand3_4_exit);
233    iBuilder->SetInsertPoint(expand3_4_exit);
234}
235
236
// Radix 64 determination, converting 3 bytes to 4 6-bit values.
//
//  00000000|zyxwvuts|rqpmnlkj|hgfedcba    Original
//           zy                            bits to move 6 positions right
//             xwvuts                      bits to move 8 positions left
//                    rqpm                 bits to move 4 positions right
//                        nlkj             bits to move 10 positions left
//                             hgfedc      bits to move 2 positions right
//                                   ba    bits to move 12 positions left
//    xwvuts|  nlkjzy|  barqpm|  hgfedc    Target
[5297]247inline Value * radix64Kernel::processPackData(llvm::Value * bytepack) const {
248
[5232]249    Value * step_right_6 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00C00000));
[5297]250    Value * right_6_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_6), 6);
251
[5232]252    Value * step_left_8 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x003F0000));
[5297]253    Value * left_8_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_8), 8);
254    Value * mid = iBuilder->simd_or(right_6_result, left_8_result);
255
[5232]256    Value * step_right_4 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x0000F000));
[5297]257    Value * right_4_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_4), 4);
258    mid = iBuilder->simd_or(mid, right_4_result);
259
[5232]260    Value * step_left_10 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000F00));
[5297]261    Value * left_10_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_10), 10);
262    mid = iBuilder->simd_or(mid, left_10_result);
263
[5232]264    Value * step_right_2 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x000000FC));
[5297]265    Value * right_2_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_2), 2);
266    mid = iBuilder->simd_or(mid, right_2_result);
267
[5232]268    Value * step_left_12 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000003));
[5288]269    Value * left_12_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_12), 12);
[5297]270    mid = iBuilder->simd_or(mid, left_12_result);
[5288]271
[5297]272    return iBuilder->bitCast(mid);
[5288]273}
274
[5297]275void radix64Kernel::generateDoBlockMethod() {
[5219]276    for (unsigned i = 0; i < 8; i++) {
[5317]277        Value * bytepack = loadInputStreamPack("expandedStream", iBuilder->getInt32(0), iBuilder->getInt32(i));
[5288]278        Value * radix64pack = processPackData(bytepack);
[5325]279        storeOutputStreamPack("radix64stream", iBuilder->getInt32(0), iBuilder->getInt32(i), radix64pack);
[5219]280    }
281}
282
[5297]283void radix64Kernel::generateFinalBlockMethod(Value * remainingBytes) {
[5285]284
285    BasicBlock * entry = iBuilder->GetInsertBlock();
[5292]286    BasicBlock * radix64_loop = CreateBasicBlock("radix64_loop");
287    BasicBlock * fbExit = CreateBasicBlock("fbExit");
[5325]288   
[5232]289    const unsigned PACK_SIZE = iBuilder->getStride()/8;
[5246]290    Constant * packSize = iBuilder->getSize(PACK_SIZE);
[5232]291
292    // Enter the loop only if there is at least one byte remaining to process.
[5234]293    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, radix64_loop);
[5232]294
295    iBuilder->SetInsertPoint(radix64_loop);
296    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
297    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
[5285]298    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), entry);
299    loopRemain->addIncoming(remainingBytes, entry);
[5232]300
[5317]301    Value * bytepack = loadInputStreamPack("expandedStream", iBuilder->getInt32(0), idx);
[5288]302    Value * radix64pack = processPackData(bytepack);
[5317]303    storeOutputStreamPack("radix64stream", iBuilder->getInt32(0), idx, radix64pack);
[5232]304
305    Value* nextIdx = iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1));
306    idx->addIncoming(nextIdx, radix64_loop);
307    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
308    loopRemain->addIncoming(remainAfterLoop, radix64_loop);
309
[5291]310    Value* continueLoop = iBuilder->CreateICmpSGT(remainAfterLoop, iBuilder->getSize(0));
[5288]311
[5325]312    iBuilder->CreateCondBr(continueLoop, radix64_loop, fbExit);
[5232]313
314    iBuilder->SetInsertPoint(fbExit);
315}
316
[5297]317inline llvm::Value* base64Kernel::processPackData(llvm::Value* bytepack) const {
[5288]318    Value * mask_gt_25 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(25)));
319    Value * mask_gt_51 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(51)));
320    Value * mask_eq_62 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(62)));
321    Value * mask_eq_63 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(63)));
322    // Strategy:
323    // 1. add ord('A') = 65 to all radix64 values, this sets the correct values for entries 0 to 25.
324    // 2. add ord('a') - ord('A') - (26 - 0) = 6 to all values >25, this sets the correct values for entries 0 to 51
325    // 3. subtract ord('a') - ord('0') + (52 - 26) = 75 to all values > 51, this sets the correct values for entries 0 to 61
326    // 4. subtract ord('0') - ord('+') + (62 - 52) = 15 for all values = 62
327    // 4. add ord('/') - ord('0') - (63 - 52) = 3 for all values = 63
328    Value * t0_25 = iBuilder->simd_add(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8('A')));
329    Value * t0_51 = iBuilder->simd_add(8, t0_25, iBuilder->simd_and(mask_gt_25, iBuilder->simd_fill(8, iBuilder->getInt8(6))));
330    Value * t0_61 = iBuilder->simd_sub(8, t0_51, iBuilder->simd_and(mask_gt_51, iBuilder->simd_fill(8, iBuilder->getInt8(75))));
331    Value * t0_62 = iBuilder->simd_sub(8, t0_61, iBuilder->simd_and(mask_eq_62, iBuilder->simd_fill(8, iBuilder->getInt8(15))));
[5317]332    return iBuilder->bitCast(iBuilder->simd_sub(8, t0_62, iBuilder->simd_and(mask_eq_63, iBuilder->simd_fill(8, iBuilder->getInt8(12)))));
[5288]333}
334
[5297]335void base64Kernel::generateDoBlockMethod() {
[5219]336    for (unsigned i = 0; i < 8; i++) {
[5317]337        Value * bytepack = loadInputStreamPack("radix64stream", iBuilder->getInt32(0), iBuilder->getInt32(i));
338        Value * base64pack = processPackData(bytepack);
339        storeOutputStreamPack("base64stream", iBuilder->getInt32(0), iBuilder->getInt32(i), base64pack);
[5219]340    }
341}
342
[5307]343//// Special processing for the base 64 format.   The output must always contain a multiple
344//// of 4 bytes.   When the number of radix 64 values is not a multiple of 4
345//// number of radix 64 values
346//void base64Kernel::generateFinalBlockMethod(Value * remainingBytes) {
347
348//    BasicBlock * entry = iBuilder->GetInsertBlock();
349//    BasicBlock * base64_loop = CreateBasicBlock("base64_loop");
350//    BasicBlock * loopExit = CreateBasicBlock("loopExit");
351//    BasicBlock * doPadding = CreateBasicBlock("doPadding");
352//    BasicBlock * doPadding2 = CreateBasicBlock("doPadding2");
353//    BasicBlock * fbExit = CreateBasicBlock("fbExit");
354
355//    Value * remainMod4 = iBuilder->CreateAnd(remainingBytes, iBuilder->getSize(3));
356//    Value * padBytes = iBuilder->CreateSub(iBuilder->getSize(4), remainMod4);
357//    padBytes = iBuilder->CreateAnd(padBytes, iBuilder->getSize(3));
358
359//    Constant * packSize = iBuilder->getSize(iBuilder->getStride() / 8);
360
361//    // Enter the loop only if there is at least one byte remaining to process.
362//    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, base64_loop);
363
364//    iBuilder->SetInsertPoint(base64_loop);
365//    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
366//    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
367//    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), entry);
368//    loopRemain->addIncoming(remainingBytes, entry);
369//    Value * radix64streamPtr = getInputStream("radix64stream", iBuilder->getInt32(0), idx);
370//    Value * bytepack = iBuilder->CreateBlockAlignedLoad(radix64streamPtr);
371//    Value * base64pack = processPackData(bytepack);
372//    Value * base64streamPtr = getOutputStream("base64stream", iBuilder->getInt32(0), idx);
373
374//    iBuilder->CreateBlockAlignedStore(iBuilder->bitCast(base64pack), base64streamPtr);
375//    idx->addIncoming(iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1)), base64_loop);
376//    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
377//    loopRemain->addIncoming(remainAfterLoop, base64_loop);
378
379//    Value* continueLoop = iBuilder->CreateICmpSGT(remainAfterLoop, iBuilder->getSize(0));
380//    iBuilder->CreateCondBr(continueLoop, base64_loop, loopExit);
381
382//    iBuilder->SetInsertPoint(loopExit);
383//    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(padBytes, iBuilder->getSize(0)), fbExit, doPadding);
384
385//    iBuilder->SetInsertPoint(doPadding);
386
387//    base64streamPtr = getOutputStream("base64stream", iBuilder->getInt32(0), idx);
388//    Value * i8streamPtr = iBuilder->CreatePointerCast(base64streamPtr, iBuilder->getInt8PtrTy());
389//    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8streamPtr, remainingBytes));
390//    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, iBuilder->getSize(3)), fbExit, doPadding2);
391//    iBuilder->SetInsertPoint(doPadding2);
392//    Value * finalPadPos = iBuilder->CreateAdd(remainingBytes, iBuilder->getSize(1));
393//    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8streamPtr, finalPadPos));
394//    iBuilder->CreateBr(fbExit);
395//    iBuilder->SetInsertPoint(fbExit);
396//    Value * produced = iBuilder->CreateAdd(getProducedItemCount("base64stream"), iBuilder->CreateAdd(remainingBytes, padBytes));
397//    setProducedItemCount("base64stream", produced);
398//}
399
// Special processing for the base 64 format.   The output must always contain a multiple
// of 4 bytes.   When the number of radix 64 values is not a multiple of 4,
// one or two '=' padding characters are appended after the last value.
[5297]403void base64Kernel::generateFinalBlockMethod(Value * remainingBytes) {
[5285]404
405    BasicBlock * entry = iBuilder->GetInsertBlock();
[5292]406    BasicBlock * base64_loop = CreateBasicBlock("base64_loop");
407    BasicBlock * loopExit = CreateBasicBlock("loopExit");
408    BasicBlock * doPadding = CreateBasicBlock("doPadding");
409    BasicBlock * doPadding2 = CreateBasicBlock("doPadding2");
410    BasicBlock * fbExit = CreateBasicBlock("fbExit");
[5285]411
[5246]412    Value * remainMod4 = iBuilder->CreateAnd(remainingBytes, iBuilder->getSize(3));
413    Value * padBytes = iBuilder->CreateSub(iBuilder->getSize(4), remainMod4);
414    padBytes = iBuilder->CreateAnd(padBytes, iBuilder->getSize(3));
[5219]415
[5234]416    Constant * packSize = iBuilder->getSize(iBuilder->getStride() / 8);
[5260]417
[5219]418    // Enter the loop only if there is at least one byte remaining to process.
[5234]419    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, base64_loop);
[5285]420
[5219]421    iBuilder->SetInsertPoint(base64_loop);
422    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
423    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
[5285]424    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), entry);
425    loopRemain->addIncoming(remainingBytes, entry);
[5317]426    Value * bytepack = loadInputStreamPack("radix64stream", iBuilder->getInt32(0), idx);
[5288]427    Value * base64pack = processPackData(bytepack);
[5317]428    storeOutputStreamPack("base64stream", iBuilder->getInt32(0), idx, base64pack);
[5219]429    idx->addIncoming(iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1)), base64_loop);
[5232]430    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
431    loopRemain->addIncoming(remainAfterLoop, base64_loop);
432
[5290]433    Value* continueLoop = iBuilder->CreateICmpSGT(remainAfterLoop, iBuilder->getSize(0));
[5232]434    iBuilder->CreateCondBr(continueLoop, base64_loop, loopExit);
435
[5219]436    iBuilder->SetInsertPoint(loopExit);
[5234]437    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(padBytes, iBuilder->getSize(0)), fbExit, doPadding);
[5260]438
[5219]439    iBuilder->SetInsertPoint(doPadding);
[5317]440    Value * i8output_ptr = getOutputStreamBlockPtr("base64stream", iBuilder->getInt32(0));
[5307]441    i8output_ptr = iBuilder->CreatePointerCast(i8output_ptr, iBuilder->getInt8PtrTy());
[5240]442    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8output_ptr, remainingBytes));
[5234]443    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, iBuilder->getSize(3)), fbExit, doPadding2);
[5219]444    iBuilder->SetInsertPoint(doPadding2);
[5234]445    Value * finalPadPos = iBuilder->CreateAdd(remainingBytes, iBuilder->getSize(1));
[5240]446    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8output_ptr, finalPadPos));
[5219]447    iBuilder->CreateBr(fbExit);
448    iBuilder->SetInsertPoint(fbExit);
449}
[5232]450
[5283]451expand3_4Kernel::expand3_4Kernel(IDISA::IDISA_Builder * iBuilder)
452: SegmentOrientedKernel(iBuilder, "expand3_4",
[5297]453            {Binding{iBuilder->getStreamSetTy(1, 8), "sourceStream"}},
[5328]454            {Binding{iBuilder->getStreamSetTy(1, 8), "expandedStream", FixedRatio(4,3)}},
[5297]455            {}, {}, {}) {
[5232]456}
[5283]457
458radix64Kernel::radix64Kernel(IDISA::IDISA_Builder * iBuilder)
[5297]459: BlockOrientedKernel(iBuilder, "radix64",
460            {Binding{iBuilder->getStreamSetTy(1, 8), "expandedStream"}},
461            {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
462            {}, {}, {}) {
[5283]463}
464
465base64Kernel::base64Kernel(IDISA::IDISA_Builder * iBuilder)
[5297]466: BlockOrientedKernel(iBuilder, "base64",
467            {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
[5328]468            {Binding{iBuilder->getStreamSetTy(1, 8), "base64stream", RoundUpToMultiple(4)}},
[5297]469            {}, {}, {}) {
[5283]470}
471
472}
Note: See TracBrowser for help on using the repository browser.