source: icGREP/icgrep-devel/icgrep/kernels/radix64.cpp @ 6133

Last change on this file since 6133 was 5998, checked in by nmedfort, 14 months ago

Added temporary buffer functionality to the pipeline for single stream source buffers. Fixed memory leak from UCD::UnicodeBreakRE()

File size: 15.4 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "radix64.h"
6#include <kernels/streamset.h>
7#include <kernels/kernel_builder.h>
8
9using namespace llvm;
10
11namespace kernel {
12
13// This kernel produces an expanded input stream by duplicating every third byte.
14// It is implemented using SIMD shufflevector operations.  With 16-byte registers,
15// a single shufflevector operation produces 16 bytes of output data from the
16// 12 bytes of input data.   With 32-byte registers, 32 bytes of output data are
17// produced from 24 bytes of input data.
18//
19// Using aligned SIMD loads, an inner loop processes three registers full of input
20// data (i.e., three BytePacks) to produce four registers full of output.   This is
21// a 3 step process.
22// Step 1:  Load input_pack0, apply the shuffle operation to produce output_pack0.
23//          At this point 3/4 of the data in input_pack0 has been processed.
24// Step 2:  Load input_pack1, apply a shuffle operation to use the remaining
25//          1/4 of input_pack0 and 1/2 of input_pack1 to produce output_pack1.
26//          At this point 1/2 of the data in input_pack1 has been processed.
27// Step 3:  Load input_pack2, apply a shuffle operation to use the remaining 1/2
28//          of input_pack1 and 1/4 of input_pack2 to produce output_pack2.
29//          Then apply a further shuffle opertaion to use the remaining 3/4 of
30//          input_pack2 to produce output_pack3.
31
32// The MultiBlockLogic is based on a natural stride taking 3 packs at a time.
33// In this case, the output produced is exactly 4 packs or 4 blocks, with no pending
34// data maintained in the kernel state.
35//
36// When processing the final partial stride of data, the kernel performs full
37// triple-pack processing for each full or partial triple-pack remaining,
38// relying on the MultiBlockKernel builder to only copy the correct number
39// of bytes to the actual output stream.
40
41void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &b, Value * const numOfStrides) {
42
43    BasicBlock * expand2_3entry = b->GetInsertBlock();
44    BasicBlock * expand_3_4_loop = b->CreateBasicBlock("expand_3_4_loop");
45    BasicBlock * expand3_4_exit = b->CreateBasicBlock("expand3_4_exit");
46
47    // Determine the require shufflevector constants.
48    const unsigned PACK_SIZE = b->getBitBlockWidth()/8;
49
50    ConstantInt * const ZERO = b->getSize(0);
51    ConstantInt * const ONE = b->getSize(1);
52    ConstantInt * const THREE = b->getSize(3);
53    ConstantInt * const FOUR = b->getSize(4);
54    ConstantInt * const SEVEN = b->getSize(7);
55
56    // Construct a list of indexes in  the form
57    // 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, ...
58    unsigned sourceByteIndex = 0;
59    unsigned expand3_4_index[PACK_SIZE];
60    for (unsigned i = 0; i < PACK_SIZE; i++) {
61        expand3_4_index[i] = sourceByteIndex;
62        if (i % 4 != 2) sourceByteIndex++;
63    }
64    unsigned const expand3_4_offset[4] = {PACK_SIZE, 3*PACK_SIZE/4, PACK_SIZE/2, PACK_SIZE/4};
65    Value * expand_3_4_shuffle[4];
66    for (unsigned j = 0; j < 4; j++) {
67        std::vector<Constant *> Idxs;
68        for (unsigned i = 0; i < PACK_SIZE; i++) {
69            Idxs.push_back(ConstantInt::get(b->getInt32Ty(), expand3_4_offset[j] + expand3_4_index[i]));
70        }
71        expand_3_4_shuffle[j] = ConstantVector::get(Idxs);
72    }
73
74    UndefValue * undefPack = UndefValue::get(b->fwVectorType(8));
75    Value * const numOfBlocks = b->CreateMul(numOfStrides, b->getSize(8));
76    // The main loop processes 3 packs of data at a time.
77    b->CreateBr(expand_3_4_loop);
78
79    b->SetInsertPoint(expand_3_4_loop);
80    PHINode * strideOffset = b->CreatePHI(b->getSizeTy(), 2);
81    strideOffset->addIncoming(ZERO, expand2_3entry);
82
83    Value * const baseInputOffset = b->CreateMul(strideOffset, THREE);
84    Value * const baseOutputOffset = b->CreateMul(strideOffset, FOUR);
85    Value * carryOver = undefPack;
86    for (unsigned i = 0; i < 3; ++i) {
87        ConstantInt * const index = b->getSize(i);
88        Value * const inputOffset = b->CreateAdd(baseInputOffset, index);
89        Value * const inputPackIndex = b->CreateAnd(inputOffset, SEVEN);
90        Value * const inputBlockOffset = b->CreateLShr(inputOffset, THREE);
91        Value * const input = b->fwCast(8, b->loadInputStreamPack("sourceStream", ZERO, inputPackIndex, inputBlockOffset));
92        Value * const expanded = b->CreateShuffleVector(carryOver, input, expand_3_4_shuffle[i]);
93        Value * const outputOffset = b->CreateAdd(baseOutputOffset, index);
94        Value * const outputPackIndex = b->CreateAnd(outputOffset, SEVEN);
95        Value * const outputBlockOffset = b->CreateLShr(outputOffset, THREE);
96        b->storeOutputStreamPack("expand34Stream", ZERO, outputPackIndex, outputBlockOffset, b->bitCast(expanded));
97        carryOver = input;
98    }
99    Value * expanded = b->CreateShuffleVector(carryOver, undefPack, expand_3_4_shuffle[3]);
100    Value * outputOffset = b->CreateAdd(baseOutputOffset, THREE);
101    Value * const outputPackIndex = b->CreateAnd(outputOffset, SEVEN);
102    Value * const outputBlockOffset = b->CreateLShr(outputOffset, THREE);
103    b->storeOutputStreamPack("expand34Stream", ZERO, outputPackIndex, outputBlockOffset, b->bitCast(expanded));
104
105    Value * const nextStrideOffset = b->CreateAdd(strideOffset, ONE);
106    strideOffset->addIncoming(nextStrideOffset, expand_3_4_loop);
107    Value * continueLoop = b->CreateICmpULT(nextStrideOffset, numOfBlocks);
108    b->CreateCondBr(continueLoop, expand_3_4_loop, expand3_4_exit);
109
110    b->SetInsertPoint(expand3_4_exit);
111}
112
113
114// Radix 64 determination, converting 3 bytes to 4 6-bit values.
115//
116//  00000000|zyxwvuts|rqpmnlkj|hgfedcba    Original
117//           zy                            bits to move 6 positions right
118//             xwvuts                      bits to move 8 positions left
119//                    rqpm                 bits to move 4 positions right
120//                        nlkj             bits to move 10 positions left
121//                             hqfedc      bits to move 2 positions right
122//                                   ba    bits to move 12 positions left
123//    xwvuts|  nlkjzy|  barqpm|  hgfedc    Target
124inline Value * radix64Kernel::processPackData(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * bytepack) const {
125
126    Value * step_right_6 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00C00000));
127    Value * right_6_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_6), 6);
128
129    Value * step_left_8 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x003F0000));
130    Value * left_8_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_8), 8);
131    Value * mid = iBuilder->simd_or(right_6_result, left_8_result);
132
133    Value * step_right_4 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x0000F000));
134    Value * right_4_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_4), 4);
135    mid = iBuilder->simd_or(mid, right_4_result);
136
137    Value * step_left_10 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000F00));
138    Value * left_10_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_10), 10);
139    mid = iBuilder->simd_or(mid, left_10_result);
140
141    Value * step_right_2 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x000000FC));
142    Value * right_2_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_2), 2);
143    mid = iBuilder->simd_or(mid, right_2_result);
144
145    Value * step_left_12 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000003));
146    Value * left_12_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_12), 12);
147    mid = iBuilder->simd_or(mid, left_12_result);
148
149    return iBuilder->bitCast(mid);
150}
151
152void radix64Kernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
153    for (unsigned i = 0; i < 8; i++) {
154        Value * bytepack = iBuilder->loadInputStreamPack("expandedStream", iBuilder->getInt32(0), iBuilder->getInt32(i));
155        Value * radix64pack = processPackData(iBuilder, bytepack);
156        iBuilder->storeOutputStreamPack("radix64stream", iBuilder->getInt32(0), iBuilder->getInt32(i), radix64pack);
157    }
158}
159
160void radix64Kernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
161
162    BasicBlock * entry = iBuilder->GetInsertBlock();
163    BasicBlock * radix64_loop = iBuilder->CreateBasicBlock("radix64_loop");
164    BasicBlock * fbExit = iBuilder->CreateBasicBlock("fbExit");
165   
166    const unsigned PACK_SIZE = iBuilder->getStride()/8;
167    Constant * packSize = iBuilder->getSize(PACK_SIZE);
168
169    // Enter the loop only if there is at least one byte remaining to process.
170    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, radix64_loop);
171
172    iBuilder->SetInsertPoint(radix64_loop);
173    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
174    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
175    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), entry);
176    loopRemain->addIncoming(remainingBytes, entry);
177
178    Value * bytepack = iBuilder->loadInputStreamPack("expandedStream", iBuilder->getInt32(0), idx);
179    Value * radix64pack = processPackData(iBuilder, bytepack);
180    iBuilder->storeOutputStreamPack("radix64stream", iBuilder->getInt32(0), idx, radix64pack);
181
182    Value* nextIdx = iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1));
183    idx->addIncoming(nextIdx, radix64_loop);
184    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
185    loopRemain->addIncoming(remainAfterLoop, radix64_loop);
186
187    Value* continueLoop = iBuilder->CreateICmpSGT(remainAfterLoop, iBuilder->getSize(0));
188
189    iBuilder->CreateCondBr(continueLoop, radix64_loop, fbExit);
190
191    iBuilder->SetInsertPoint(fbExit);
192}
193
194inline llvm::Value* base64Kernel::processPackData(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value* bytepack) const {
195    Value * mask_gt_25 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(25)));
196    Value * mask_gt_51 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(51)));
197    Value * mask_eq_62 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(62)));
198    Value * mask_eq_63 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(63)));
199    // Strategy:
200    // 1. add ord('A') = 65 to all radix64 values, this sets the correct values for entries 0 to 25.
201    // 2. add ord('a') - ord('A') - (26 - 0) = 6 to all values >25, this sets the correct values for entries 0 to 51
202    // 3. subtract ord('a') - ord('0') + (52 - 26) = 75 to all values > 51, this sets the correct values for entries 0 to 61
203    // 4. subtract ord('0') - ord('+') + (62 - 52) = 15 for all values = 62
204    // 4. add ord('/') - ord('0') - (63 - 52) = 3 for all values = 63
205    Value * t0_25 = iBuilder->simd_add(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8('A')));
206    Value * t0_51 = iBuilder->simd_add(8, t0_25, iBuilder->simd_and(mask_gt_25, iBuilder->simd_fill(8, iBuilder->getInt8(6))));
207    Value * t0_61 = iBuilder->simd_sub(8, t0_51, iBuilder->simd_and(mask_gt_51, iBuilder->simd_fill(8, iBuilder->getInt8(75))));
208    Value * t0_62 = iBuilder->simd_sub(8, t0_61, iBuilder->simd_and(mask_eq_62, iBuilder->simd_fill(8, iBuilder->getInt8(15))));
209    return iBuilder->bitCast(iBuilder->simd_sub(8, t0_62, iBuilder->simd_and(mask_eq_63, iBuilder->simd_fill(8, iBuilder->getInt8(12)))));
210}
211
212void base64Kernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
213    for (unsigned i = 0; i < 8; i++) {
214        Value * bytepack = iBuilder->loadInputStreamPack("radix64stream", iBuilder->getInt32(0), iBuilder->getInt32(i));
215        Value * base64pack = processPackData(iBuilder, bytepack);
216        iBuilder->storeOutputStreamPack("base64stream", iBuilder->getInt32(0), iBuilder->getInt32(i), base64pack);
217    }
218}
219
220// Special processing for the base 64 format.   The output must always contain a multiple
221// of 4 bytes.   When the number of radix 64 values is not a multiple of 4
222// number of radix 64 values
223void base64Kernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
224
225    BasicBlock * entry = b->GetInsertBlock();
226    BasicBlock * base64_loop = b->CreateBasicBlock("base64_loop");
227    BasicBlock * loopExit = b->CreateBasicBlock("loopExit");
228    BasicBlock * doPadding = b->CreateBasicBlock("doPadding");
229    BasicBlock * doPadding2 = b->CreateBasicBlock("doPadding2");
230    BasicBlock * fbExit = b->CreateBasicBlock("fbExit");
231
232    Constant * const ZERO = b->getSize(0);
233    Constant * const ONE = b->getSize(1);
234    Constant * const THREE = b->getSize(3);
235    Constant * const PADDING = b->getInt8('=');
236
237    Value * remainMod4 = b->CreateAnd(remainingBytes, THREE);
238    Value * padBytes = b->CreateAnd(b->CreateSub(b->getSize(4), remainMod4), THREE);
239
240    Constant * const PACK_SIZE = b->getSize(b->getStride() / 8);
241
242    // Enter the loop only if there is at least one byte remaining to process.
243    b->CreateCondBr(b->CreateICmpEQ(remainingBytes, ZERO), fbExit, base64_loop);
244
245    b->SetInsertPoint(base64_loop);
246    PHINode * idx = b->CreatePHI(b->getSizeTy(), 2);
247    PHINode * loopRemain = b->CreatePHI(b->getSizeTy(), 2);
248    idx->addIncoming(ZERO, entry);
249    loopRemain->addIncoming(remainingBytes, entry);
250    Value * bytepack = b->loadInputStreamPack("radix64stream", ZERO, idx);
251    Value * base64pack = processPackData(b, bytepack);
252    b->storeOutputStreamPack("base64stream", ZERO, idx, base64pack);
253    idx->addIncoming(b->CreateAdd(idx, ONE), base64_loop);
254    Value * remainAfterLoop = b->CreateSub(loopRemain, PACK_SIZE);
255    loopRemain->addIncoming(remainAfterLoop, base64_loop);
256    Value * continueLoop = b->CreateICmpUGT(loopRemain, PACK_SIZE);
257    b->CreateCondBr(continueLoop, base64_loop, loopExit);
258
259    b->SetInsertPoint(loopExit);
260    b->CreateCondBr(b->CreateICmpEQ(padBytes, ZERO), fbExit, doPadding);
261
262    b->SetInsertPoint(doPadding);
263    Value * i8output_ptr = b->getOutputStreamBlockPtr("base64stream", ZERO);
264    i8output_ptr = b->CreatePointerCast(i8output_ptr, b->getInt8PtrTy());
265    b->CreateStore(PADDING, b->CreateGEP(i8output_ptr, remainingBytes));
266    b->CreateCondBr(b->CreateICmpEQ(remainMod4, THREE), fbExit, doPadding2);
267
268    b->SetInsertPoint(doPadding2);
269    Value * finalPadPos = b->CreateAdd(remainingBytes, ONE);
270    b->CreateStore(PADDING, b->CreateGEP(i8output_ptr, finalPadPos));
271    b->CreateBr(fbExit);
272    b->SetInsertPoint(fbExit);
273}
274
275expand3_4Kernel::expand3_4Kernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
276: MultiBlockKernel("expand3_4",
277            {Binding{iBuilder->getStreamSetTy(1, 8), "sourceStream", FixedRate(3)}},
278            {Binding{iBuilder->getStreamSetTy(1, 8), "expand34Stream", FixedRate(4)}},
279            {}, {}, {}) {
280
281}
282
283radix64Kernel::radix64Kernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
284: BlockOrientedKernel("radix64",
285            {Binding{iBuilder->getStreamSetTy(1, 8), "expandedStream"}},
286            {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
287            {}, {}, {}) {
288}
289
290base64Kernel::base64Kernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
291: BlockOrientedKernel("base64",
292            {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
293            {Binding{iBuilder->getStreamSetTy(1, 8), "base64stream", FixedRate(1), RoundUpTo(4)}},
294            {}, {}, {}) {
295}
296
297}
Note: See TracBrowser for help on using the repository browser.