source: icGREP/icgrep-devel/icgrep/kernels/radix64.cpp @ 5941

Last change on this file since 5941 was 5831, checked in by nmedfort, 21 months ago

Potential bug fix for 32-bit

File size: 17.5 KB
RevLine 
[5216]1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
[5232]5#include "radix64.h"
[5267]6#include <kernels/streamset.h>
[5436]7#include <kernels/kernel_builder.h>
[5216]8
9using namespace llvm;
10
[5260]11namespace kernel {
12
[5216]13// This kernel produces an expanded input stream by duplicating every third byte.
14// It is implemented using SIMD shufflevector operations.  With 16-byte registers,
15// a single shufflevector operation produces 16 bytes of output data from the
16// 12 bytes of input data.   With 32-byte registers, 32 bytes of output data are
17// produced from 24 bytes of input data.
18//
19// Using aligned SIMD loads, an inner loop processes three registers full of input
[5307]20// data (i.e., three BytePacks) to produce four registers full of output.   This is
[5216]21// a 3 step process.
22// Step 1:  Load input_pack0, apply the shuffle operation to produce output_pack0.
23//          At this point 3/4 of the data in input_pack0 has been processed.
24// Step 2:  Load input_pack1, apply a shuffle operation to use the remaining
25//          1/4 of input_pack0 and 1/2 of input_pack1 to produce output_pack1.
26//          At this point 1/2 of the data in input_pack1 has been processed.
27// Step 3:  Load input_pack2, apply a shuffle operation to use the remaining 1/2
28//          of input_pack1 and 1/4 of input_pack2 to produce output_pack2.
29//          Then apply a further shuffle opertaion to use the remaining 3/4 of
30//          input_pack2 to produce output_pack3.
31
[5508]32// The MultiBlockLogic is based on a natural stride taking 3 packs at a time.
33// In this case, the output produced is exactly 4 packs or 4 blocks, with no pending
34// data maintained in the kernel state.
[5216]35//
[5508]36// When processing the final partial stride of data, the kernel performs full
37// triple-pack processing for each full or partial triple-pack remaining,
38// relying on the MultiBlockKernel builder to only copy the correct number
39// of bytes to the actual output stream.
[5216]40
[5831]41void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
[5286]42
43    BasicBlock * expand2_3entry = iBuilder->GetInsertBlock();
[5440]44    BasicBlock * expand_3_4_loop = iBuilder->CreateBasicBlock("expand_3_4_loop");
45    BasicBlock * expand3_4_exit = iBuilder->CreateBasicBlock("expand3_4_exit");
[5216]46   
47    // Determine the require shufflevector constants.
[5508]48    const unsigned PACK_SIZE = iBuilder->getBitBlockWidth()/8;
[5216]49   
50    // Construct a list of indexes in  the form
51    // 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, ...
52    unsigned sourceByteIndex = 0;
53    unsigned expand3_4_index[PACK_SIZE];
54    for (unsigned i = 0; i < PACK_SIZE; i++) {
55        expand3_4_index[i] = sourceByteIndex;
56        if (i % 4 != 2) sourceByteIndex++;
57    }
58    unsigned const expand3_4_offset[4] = {PACK_SIZE, 3*PACK_SIZE/4, PACK_SIZE/2, PACK_SIZE/4};
59    Value * expand_3_4_shuffle[4];
60    for (unsigned j = 0; j < 4; j++) {
61        std::vector<Constant *> Idxs;
62        for (unsigned i = 0; i < PACK_SIZE; i++) {
63            Idxs.push_back(ConstantInt::get(iBuilder->getInt32Ty(), expand3_4_offset[j] + expand3_4_index[i]));
64        }
65        expand_3_4_shuffle[j] = ConstantVector::get(Idxs);
[5232]66    }
[5325]67
[5277]68    Constant * triplePackSize = iBuilder->getSize(3 * PACK_SIZE); // 3 packs per loop.
[5260]69    UndefValue * undefPack = UndefValue::get(iBuilder->fwVectorType(8));
[5216]70   
71    const unsigned packAlign = iBuilder->getBitBlockWidth()/8;
[5276]72
[5706]73    Value * itemsToDo = mAvailableItemCount[0];
[5507]74
[5706]75    Value * sourceStream = iBuilder->getInputStreamBlockPtr("sourceStream", iBuilder->getInt32(0));
76    Value * expandedStream = iBuilder->getOutputStreamBlockPtr("expand34Stream", iBuilder->getInt32(0));
77
[5508]78    // The main loop processes 3 packs of data at a time.
[5594]79    // The initial pack offsets may be nonzero.
[5599]80    sourceStream = iBuilder->CreatePointerCast(sourceStream, iBuilder->getInt8PtrTy());
81    expandedStream = iBuilder->CreatePointerCast(expandedStream, iBuilder->getInt8PtrTy());
[5594]82    Value * offset = iBuilder->CreateURem(iBuilder->getProcessedItemCount("sourceStream"), iBuilder->getSize(iBuilder->getBitBlockWidth()));
[5599]83    Value * sourcePackPtr = iBuilder->CreatePointerCast(iBuilder->CreateGEP(sourceStream, offset), iBuilder->getBitBlockType()->getPointerTo());
[5594]84    offset = iBuilder->CreateURem(iBuilder->getProducedItemCount("expand34Stream"), iBuilder->getSize(iBuilder->getBitBlockWidth()));
[5599]85    Value * outputPackPtr = iBuilder->CreatePointerCast(iBuilder->CreateGEP(expandedStream, offset), iBuilder->getBitBlockType()->getPointerTo());
[5508]86    iBuilder->CreateCondBr(iBuilder->CreateICmpSGT(itemsToDo, iBuilder->getSize(0)), expand_3_4_loop, expand3_4_exit);
87   
[5216]88    iBuilder->SetInsertPoint(expand_3_4_loop);
89    PHINode * loopInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
90    PHINode * loopOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
91    PHINode * loopItemsRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
[5232]92
[5216]93    loopInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
94    loopOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
[5508]95    loopItemsRemain->addIncoming(itemsToDo, expand2_3entry);
[5232]96
[5507]97
[5216]98    // Step 1 of the main loop.
99    Value * pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopInput_ptr, packAlign));
100    Value * expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
[5297]101    iBuilder->CreateBlockAlignedStore(expand0, loopOutput_ptr);
[5216]102    // Step 2 of the main loop.
[5240]103    Value * inPack1_ptr = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(1));
104    Value * outPack1_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(1));
[5216]105    Value * pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
106    Value * expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1, expand_3_4_shuffle[1]));
[5297]107    iBuilder->CreateBlockAlignedStore(expand1, outPack1_ptr);
[5216]108    // Step 3 of the main loop.
[5240]109    Value * inPack2_ptr = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(2));
110    Value * outPack2_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(2));
[5216]111    Value * pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
112    Value * expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1, pack2, expand_3_4_shuffle[2]));
[5297]113    iBuilder->CreateBlockAlignedStore(expand2, outPack2_ptr);
[5240]114    Value * outPack3_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(3));
[5216]115    Value * expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2, undefPack, expand_3_4_shuffle[3]));
[5297]116    iBuilder->CreateBlockAlignedStore(expand3, outPack3_ptr);
[5232]117
[5240]118    Value * loopNextInputPack = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(3));
[5277]119    Value * remainingItems = iBuilder->CreateSub(loopItemsRemain, triplePackSize);
[5232]120
121    Value * loopNextOutputPack;
[5240]122    loopNextOutputPack = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(4));
[5232]123
[5216]124    loopInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
125    loopOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
126    loopItemsRemain->addIncoming(remainingItems, expand_3_4_loop);
[5232]127
[5508]128    Value * continueLoop = iBuilder->CreateICmpSGT(remainingItems, iBuilder->getSize(0));
129    iBuilder->CreateCondBr(continueLoop, expand_3_4_loop, expand3_4_exit);
[5277]130   
[5216]131    iBuilder->SetInsertPoint(expand3_4_exit);
[5755]132
[5706]133}
[5216]134
135
[5219]136// Radix 64 determination, converting 3 bytes to 4 6-bit values.
137//
[5232]138//  00000000|zyxwvuts|rqpmnlkj|hgfedcba    Original
139//           zy                            bits to move 6 positions right
140//             xwvuts                      bits to move 8 positions left
141//                    rqpm                 bits to move 4 positions right
142//                        nlkj             bits to move 10 positions left
143//                             hqfedc      bits to move 2 positions right
144//                                   ba    bits to move 12 positions left
145//    xwvuts|  nlkjzy|  barqpm|  hgfedc    Target
[5440]146inline Value * radix64Kernel::processPackData(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * bytepack) const {
[5297]147
[5232]148    Value * step_right_6 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00C00000));
[5297]149    Value * right_6_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_6), 6);
150
[5232]151    Value * step_left_8 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x003F0000));
[5297]152    Value * left_8_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_8), 8);
153    Value * mid = iBuilder->simd_or(right_6_result, left_8_result);
154
[5232]155    Value * step_right_4 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x0000F000));
[5297]156    Value * right_4_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_4), 4);
157    mid = iBuilder->simd_or(mid, right_4_result);
158
[5232]159    Value * step_left_10 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000F00));
[5297]160    Value * left_10_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_10), 10);
161    mid = iBuilder->simd_or(mid, left_10_result);
162
[5232]163    Value * step_right_2 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x000000FC));
[5297]164    Value * right_2_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_2), 2);
165    mid = iBuilder->simd_or(mid, right_2_result);
166
[5232]167    Value * step_left_12 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000003));
[5288]168    Value * left_12_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_12), 12);
[5297]169    mid = iBuilder->simd_or(mid, left_12_result);
[5288]170
[5297]171    return iBuilder->bitCast(mid);
[5288]172}
173
[5440]174void radix64Kernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
[5219]175    for (unsigned i = 0; i < 8; i++) {
[5440]176        Value * bytepack = iBuilder->loadInputStreamPack("expandedStream", iBuilder->getInt32(0), iBuilder->getInt32(i));
177        Value * radix64pack = processPackData(iBuilder, bytepack);
178        iBuilder->storeOutputStreamPack("radix64stream", iBuilder->getInt32(0), iBuilder->getInt32(i), radix64pack);
[5219]179    }
180}
181
[5440]182void radix64Kernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
[5285]183
184    BasicBlock * entry = iBuilder->GetInsertBlock();
[5440]185    BasicBlock * radix64_loop = iBuilder->CreateBasicBlock("radix64_loop");
186    BasicBlock * fbExit = iBuilder->CreateBasicBlock("fbExit");
[5325]187   
[5232]188    const unsigned PACK_SIZE = iBuilder->getStride()/8;
[5246]189    Constant * packSize = iBuilder->getSize(PACK_SIZE);
[5232]190
191    // Enter the loop only if there is at least one byte remaining to process.
[5234]192    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, radix64_loop);
[5232]193
194    iBuilder->SetInsertPoint(radix64_loop);
195    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
196    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
[5285]197    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), entry);
198    loopRemain->addIncoming(remainingBytes, entry);
[5232]199
[5440]200    Value * bytepack = iBuilder->loadInputStreamPack("expandedStream", iBuilder->getInt32(0), idx);
201    Value * radix64pack = processPackData(iBuilder, bytepack);
202    iBuilder->storeOutputStreamPack("radix64stream", iBuilder->getInt32(0), idx, radix64pack);
[5232]203
204    Value* nextIdx = iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1));
205    idx->addIncoming(nextIdx, radix64_loop);
206    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
207    loopRemain->addIncoming(remainAfterLoop, radix64_loop);
208
[5291]209    Value* continueLoop = iBuilder->CreateICmpSGT(remainAfterLoop, iBuilder->getSize(0));
[5288]210
[5325]211    iBuilder->CreateCondBr(continueLoop, radix64_loop, fbExit);
[5232]212
213    iBuilder->SetInsertPoint(fbExit);
214}
215
[5440]216inline llvm::Value* base64Kernel::processPackData(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value* bytepack) const {
[5288]217    Value * mask_gt_25 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(25)));
218    Value * mask_gt_51 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(51)));
219    Value * mask_eq_62 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(62)));
220    Value * mask_eq_63 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(63)));
221    // Strategy:
222    // 1. add ord('A') = 65 to all radix64 values, this sets the correct values for entries 0 to 25.
223    // 2. add ord('a') - ord('A') - (26 - 0) = 6 to all values >25, this sets the correct values for entries 0 to 51
224    // 3. subtract ord('a') - ord('0') + (52 - 26) = 75 to all values > 51, this sets the correct values for entries 0 to 61
225    // 4. subtract ord('0') - ord('+') + (62 - 52) = 15 for all values = 62
226    // 4. add ord('/') - ord('0') - (63 - 52) = 3 for all values = 63
227    Value * t0_25 = iBuilder->simd_add(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8('A')));
228    Value * t0_51 = iBuilder->simd_add(8, t0_25, iBuilder->simd_and(mask_gt_25, iBuilder->simd_fill(8, iBuilder->getInt8(6))));
229    Value * t0_61 = iBuilder->simd_sub(8, t0_51, iBuilder->simd_and(mask_gt_51, iBuilder->simd_fill(8, iBuilder->getInt8(75))));
230    Value * t0_62 = iBuilder->simd_sub(8, t0_61, iBuilder->simd_and(mask_eq_62, iBuilder->simd_fill(8, iBuilder->getInt8(15))));
[5317]231    return iBuilder->bitCast(iBuilder->simd_sub(8, t0_62, iBuilder->simd_and(mask_eq_63, iBuilder->simd_fill(8, iBuilder->getInt8(12)))));
[5288]232}
233
[5440]234void base64Kernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
[5219]235    for (unsigned i = 0; i < 8; i++) {
[5440]236        Value * bytepack = iBuilder->loadInputStreamPack("radix64stream", iBuilder->getInt32(0), iBuilder->getInt32(i));
237        Value * base64pack = processPackData(iBuilder, bytepack);
238        iBuilder->storeOutputStreamPack("base64stream", iBuilder->getInt32(0), iBuilder->getInt32(i), base64pack);
[5219]239    }
240}
241
242// Special processing for the base 64 format.   The output must always contain a multiple
243// of 4 bytes.   When the number of radix 64 values is not a multiple of 4
244// number of radix 64 values
[5440]245void base64Kernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
[5285]246
247    BasicBlock * entry = iBuilder->GetInsertBlock();
[5440]248    BasicBlock * base64_loop = iBuilder->CreateBasicBlock("base64_loop");
249    BasicBlock * loopExit = iBuilder->CreateBasicBlock("loopExit");
250    BasicBlock * doPadding = iBuilder->CreateBasicBlock("doPadding");
251    BasicBlock * doPadding2 = iBuilder->CreateBasicBlock("doPadding2");
252    BasicBlock * fbExit = iBuilder->CreateBasicBlock("fbExit");
[5285]253
[5246]254    Value * remainMod4 = iBuilder->CreateAnd(remainingBytes, iBuilder->getSize(3));
255    Value * padBytes = iBuilder->CreateSub(iBuilder->getSize(4), remainMod4);
256    padBytes = iBuilder->CreateAnd(padBytes, iBuilder->getSize(3));
[5219]257
[5234]258    Constant * packSize = iBuilder->getSize(iBuilder->getStride() / 8);
[5260]259
[5219]260    // Enter the loop only if there is at least one byte remaining to process.
[5234]261    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, base64_loop);
[5285]262
[5219]263    iBuilder->SetInsertPoint(base64_loop);
264    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
265    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
[5285]266    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), entry);
267    loopRemain->addIncoming(remainingBytes, entry);
[5440]268    Value * bytepack = iBuilder->loadInputStreamPack("radix64stream", iBuilder->getInt32(0), idx);
269    Value * base64pack = processPackData(iBuilder, bytepack);
270    iBuilder->storeOutputStreamPack("base64stream", iBuilder->getInt32(0), idx, base64pack);
[5219]271    idx->addIncoming(iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1)), base64_loop);
[5232]272    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
273    loopRemain->addIncoming(remainAfterLoop, base64_loop);
274
[5290]275    Value* continueLoop = iBuilder->CreateICmpSGT(remainAfterLoop, iBuilder->getSize(0));
[5232]276    iBuilder->CreateCondBr(continueLoop, base64_loop, loopExit);
277
[5219]278    iBuilder->SetInsertPoint(loopExit);
[5234]279    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(padBytes, iBuilder->getSize(0)), fbExit, doPadding);
[5260]280
[5219]281    iBuilder->SetInsertPoint(doPadding);
[5440]282    Value * i8output_ptr = iBuilder->getOutputStreamBlockPtr("base64stream", iBuilder->getInt32(0));
[5307]283    i8output_ptr = iBuilder->CreatePointerCast(i8output_ptr, iBuilder->getInt8PtrTy());
[5240]284    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8output_ptr, remainingBytes));
[5234]285    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainMod4, iBuilder->getSize(3)), fbExit, doPadding2);
[5219]286    iBuilder->SetInsertPoint(doPadding2);
[5234]287    Value * finalPadPos = iBuilder->CreateAdd(remainingBytes, iBuilder->getSize(1));
[5240]288    iBuilder->CreateStore(ConstantInt::get(iBuilder->getInt8Ty(), '='), iBuilder->CreateGEP(i8output_ptr, finalPadPos));
[5219]289    iBuilder->CreateBr(fbExit);
290    iBuilder->SetInsertPoint(fbExit);
291}
[5232]292
[5436]293expand3_4Kernel::expand3_4Kernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
[5507]294: MultiBlockKernel("expand3_4",
[5706]295            {Binding{iBuilder->getStreamSetTy(1, 8), "sourceStream", FixedRate(3)}},
296            {Binding{iBuilder->getStreamSetTy(1, 8), "expand34Stream", FixedRate(4)}},
[5297]297            {}, {}, {}) {
[5706]298
[5232]299}
[5283]300
[5436]301radix64Kernel::radix64Kernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
[5435]302: BlockOrientedKernel("radix64",
[5297]303            {Binding{iBuilder->getStreamSetTy(1, 8), "expandedStream"}},
304            {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
305            {}, {}, {}) {
[5283]306}
307
[5436]308base64Kernel::base64Kernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
[5435]309: BlockOrientedKernel("base64",
[5297]310            {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
[5706]311            {Binding{iBuilder->getStreamSetTy(1, 8), "base64stream", FixedRate(1), RoundUpTo(4)}},
[5297]312            {}, {}, {}) {
[5283]313}
314
315}
Note: See TracBrowser for help on using the repository browser.