source: icGREP/icgrep-devel/icgrep/kernels/radix64.cpp

Last change on this file was 6261, checked in by nmedfort, 7 months ago

Work on OptimizationBranch?; revisited pipeline termination

File size: 15.2 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "radix64.h"
6#include <kernels/streamset.h>
7#include <kernels/kernel_builder.h>
8
9using namespace llvm;
10
11namespace kernel {
12
13// This kernel produces an expanded input stream by duplicating every third byte.
14// It is implemented using SIMD shufflevector operations.  With 16-byte registers,
15// a single shufflevector operation produces 16 bytes of output data from the
16// 12 bytes of input data.   With 32-byte registers, 32 bytes of output data are
17// produced from 24 bytes of input data.
18//
19// Using aligned SIMD loads, an inner loop processes three registers full of input
20// data (i.e., three BytePacks) to produce four registers full of output.   This is
21// a 3 step process.
22// Step 1:  Load input_pack0, apply the shuffle operation to produce output_pack0.
23//          At this point 3/4 of the data in input_pack0 has been processed.
24// Step 2:  Load input_pack1, apply a shuffle operation to use the remaining
25//          1/4 of input_pack0 and 1/2 of input_pack1 to produce output_pack1.
26//          At this point 1/2 of the data in input_pack1 has been processed.
27// Step 3:  Load input_pack2, apply a shuffle operation to use the remaining 1/2
28//          of input_pack1 and 1/4 of input_pack2 to produce output_pack2.
29//          Then apply a further shuffle opertaion to use the remaining 3/4 of
30//          input_pack2 to produce output_pack3.
31
32// The MultiBlockLogic is based on a natural stride taking 3 packs at a time.
33// In this case, the output produced is exactly 4 packs or 4 blocks, with no pending
34// data maintained in the kernel state.
35//
36// When processing the final partial stride of data, the kernel performs full
37// triple-pack processing for each full or partial triple-pack remaining,
38// relying on the MultiBlockKernel builder to only copy the correct number
39// of bytes to the actual output stream.
40
41void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &b, Value * const numOfStrides) {
42
43    BasicBlock * expand2_3entry = b->GetInsertBlock();
44    BasicBlock * expand_3_4_loop = b->CreateBasicBlock("expand_3_4_loop");
45    BasicBlock * expand3_4_exit = b->CreateBasicBlock("expand3_4_exit");
46
47    // Determine the require shufflevector constants.
48    const unsigned PACK_SIZE = b->getBitBlockWidth()/8;
49
50    ConstantInt * const ZERO = b->getSize(0);
51    ConstantInt * const ONE = b->getSize(1);
52    ConstantInt * const THREE = b->getSize(3);
53    ConstantInt * const FOUR = b->getSize(4);
54    ConstantInt * const SEVEN = b->getSize(7);
55
56    // Construct a list of indexes in  the form
57    // 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, ...
58    unsigned sourceByteIndex = 0;
59    unsigned expand3_4_index[PACK_SIZE];
60    for (unsigned i = 0; i < PACK_SIZE; i++) {
61        expand3_4_index[i] = sourceByteIndex;
62        if (i % 4 != 2) sourceByteIndex++;
63    }
64    unsigned const expand3_4_offset[4] = {PACK_SIZE, 3*PACK_SIZE/4, PACK_SIZE/2, PACK_SIZE/4};
65    Value * expand_3_4_shuffle[4];
66    for (unsigned j = 0; j < 4; j++) {
67        std::vector<Constant *> Idxs;
68        for (unsigned i = 0; i < PACK_SIZE; i++) {
69            Idxs.push_back(ConstantInt::get(b->getInt32Ty(), expand3_4_offset[j] + expand3_4_index[i]));
70        }
71        expand_3_4_shuffle[j] = ConstantVector::get(Idxs);
72    }
73
74    UndefValue * undefPack = UndefValue::get(b->fwVectorType(8));
75    Value * const numOfBlocks = b->CreateMul(numOfStrides, b->getSize(8));
76    // The main loop processes 3 packs of data at a time.
77    b->CreateBr(expand_3_4_loop);
78
79    b->SetInsertPoint(expand_3_4_loop);
80    PHINode * strideOffset = b->CreatePHI(b->getSizeTy(), 2);
81    strideOffset->addIncoming(ZERO, expand2_3entry);
82
83    Value * const baseInputOffset = b->CreateMul(strideOffset, THREE);
84    Value * const baseOutputOffset = b->CreateMul(strideOffset, FOUR);
85    Value * carryOver = undefPack;
86    for (unsigned i = 0; i < 3; ++i) {
87        ConstantInt * const index = b->getSize(i);
88        Value * const inputOffset = b->CreateAdd(baseInputOffset, index);
89        Value * const inputPackIndex = b->CreateAnd(inputOffset, SEVEN);
90        Value * const inputBlockOffset = b->CreateLShr(inputOffset, THREE);
91        Value * const input = b->fwCast(8, b->loadInputStreamPack("sourceStream", ZERO, inputPackIndex, inputBlockOffset));
92        Value * const expanded = b->CreateShuffleVector(carryOver, input, expand_3_4_shuffle[i]);
93        Value * const outputOffset = b->CreateAdd(baseOutputOffset, index);
94        Value * const outputPackIndex = b->CreateAnd(outputOffset, SEVEN);
95        Value * const outputBlockOffset = b->CreateLShr(outputOffset, THREE);
96        b->storeOutputStreamPack("expand34Stream", ZERO, outputPackIndex, outputBlockOffset, b->bitCast(expanded));
97        carryOver = input;
98    }
99    Value * expanded = b->CreateShuffleVector(carryOver, undefPack, expand_3_4_shuffle[3]);
100    Value * outputOffset = b->CreateAdd(baseOutputOffset, THREE);
101    Value * const outputPackIndex = b->CreateAnd(outputOffset, SEVEN);
102    Value * const outputBlockOffset = b->CreateLShr(outputOffset, THREE);
103    b->storeOutputStreamPack("expand34Stream", ZERO, outputPackIndex, outputBlockOffset, b->bitCast(expanded));
104
105    Value * const nextStrideOffset = b->CreateAdd(strideOffset, ONE);
106    strideOffset->addIncoming(nextStrideOffset, expand_3_4_loop);
107    Value * continueLoop = b->CreateICmpULT(nextStrideOffset, numOfBlocks);
108    b->CreateCondBr(continueLoop, expand_3_4_loop, expand3_4_exit);
109
110    b->SetInsertPoint(expand3_4_exit);
111}
112
113
114// Radix 64 determination, converting 3 bytes to 4 6-bit values.
115//
116//  00000000|zyxwvuts|rqpmnlkj|hgfedcba    Original
117//           zy                            bits to move 6 positions right
118//             xwvuts                      bits to move 8 positions left
119//                    rqpm                 bits to move 4 positions right
120//                        nlkj             bits to move 10 positions left
121//                             hqfedc      bits to move 2 positions right
122//                                   ba    bits to move 12 positions left
123//    xwvuts|  nlkjzy|  barqpm|  hgfedc    Target
124inline Value * radix64Kernel::processPackData(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * bytepack) const {
125
126    Value * step_right_6 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00C00000));
127    Value * right_6_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_6), 6);
128
129    Value * step_left_8 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x003F0000));
130    Value * left_8_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_8), 8);
131    Value * mid = iBuilder->simd_or(right_6_result, left_8_result);
132
133    Value * step_right_4 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x0000F000));
134    Value * right_4_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_4), 4);
135    mid = iBuilder->simd_or(mid, right_4_result);
136
137    Value * step_left_10 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000F00));
138    Value * left_10_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_10), 10);
139    mid = iBuilder->simd_or(mid, left_10_result);
140
141    Value * step_right_2 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x000000FC));
142    Value * right_2_result = iBuilder->simd_srli(32, iBuilder->simd_and(bytepack, step_right_2), 2);
143    mid = iBuilder->simd_or(mid, right_2_result);
144
145    Value * step_left_12 = iBuilder->simd_fill(32, ConstantInt::get(iBuilder->getInt32Ty(), 0x00000003));
146    Value * left_12_result = iBuilder->simd_slli(32, iBuilder->simd_and(bytepack, step_left_12), 12);
147    mid = iBuilder->simd_or(mid, left_12_result);
148
149    return iBuilder->bitCast(mid);
150}
151
152void radix64Kernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
153    for (unsigned i = 0; i < 8; i++) {
154        Value * bytepack = iBuilder->loadInputStreamPack("expandedStream", iBuilder->getInt32(0), iBuilder->getInt32(i));
155        Value * radix64pack = processPackData(iBuilder, bytepack);
156        iBuilder->storeOutputStreamPack("radix64stream", iBuilder->getInt32(0), iBuilder->getInt32(i), radix64pack);
157    }
158}
159
160void radix64Kernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
161
162    BasicBlock * entry = iBuilder->GetInsertBlock();
163    BasicBlock * radix64_loop = iBuilder->CreateBasicBlock("radix64_loop");
164    BasicBlock * fbExit = iBuilder->CreateBasicBlock("fbExit");
165
166    const unsigned PACK_SIZE = iBuilder->getStride()/8;
167    Constant * packSize = iBuilder->getSize(PACK_SIZE);
168
169    // Enter the loop only if there is at least one byte remaining to process.
170    iBuilder->CreateCondBr(iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0)), fbExit, radix64_loop);
171
172    iBuilder->SetInsertPoint(radix64_loop);
173    PHINode * idx = iBuilder->CreatePHI(iBuilder->getInt32Ty(), 2);
174    PHINode * loopRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
175    idx->addIncoming(ConstantInt::getNullValue(iBuilder->getInt32Ty()), entry);
176    loopRemain->addIncoming(remainingBytes, entry);
177
178    Value * bytepack = iBuilder->loadInputStreamPack("expandedStream", iBuilder->getInt32(0), idx);
179    Value * radix64pack = processPackData(iBuilder, bytepack);
180    iBuilder->storeOutputStreamPack("radix64stream", iBuilder->getInt32(0), idx, radix64pack);
181
182    Value* nextIdx = iBuilder->CreateAdd(idx, ConstantInt::get(iBuilder->getInt32Ty(), 1));
183    idx->addIncoming(nextIdx, radix64_loop);
184    Value* remainAfterLoop = iBuilder->CreateSub(loopRemain, packSize);
185    loopRemain->addIncoming(remainAfterLoop, radix64_loop);
186
187    Value* continueLoop = iBuilder->CreateICmpSGT(remainAfterLoop, iBuilder->getSize(0));
188
189    iBuilder->CreateCondBr(continueLoop, radix64_loop, fbExit);
190
191    iBuilder->SetInsertPoint(fbExit);
192}
193
194inline llvm::Value* base64Kernel::processPackData(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value* bytepack) const {
195    Value * mask_gt_25 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(25)));
196    Value * mask_gt_51 = iBuilder->simd_ugt(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(51)));
197    Value * mask_eq_62 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(62)));
198    Value * mask_eq_63 = iBuilder->simd_eq(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8(63)));
199    // Strategy:
200    // 1. add ord('A') = 65 to all radix64 values, this sets the correct values for entries 0 to 25.
201    // 2. add ord('a') - ord('A') - (26 - 0) = 6 to all values >25, this sets the correct values for entries 0 to 51
202    // 3. subtract ord('a') - ord('0') + (52 - 26) = 75 to all values > 51, this sets the correct values for entries 0 to 61
203    // 4. subtract ord('0') - ord('+') + (62 - 52) = 15 for all values = 62
204    // 4. add ord('/') - ord('0') - (63 - 52) = 3 for all values = 63
205    Value * t0_25 = iBuilder->simd_add(8, bytepack, iBuilder->simd_fill(8, iBuilder->getInt8('A')));
206    Value * t0_51 = iBuilder->simd_add(8, t0_25, iBuilder->simd_and(mask_gt_25, iBuilder->simd_fill(8, iBuilder->getInt8(6))));
207    Value * t0_61 = iBuilder->simd_sub(8, t0_51, iBuilder->simd_and(mask_gt_51, iBuilder->simd_fill(8, iBuilder->getInt8(75))));
208    Value * t0_62 = iBuilder->simd_sub(8, t0_61, iBuilder->simd_and(mask_eq_62, iBuilder->simd_fill(8, iBuilder->getInt8(15))));
209    return iBuilder->bitCast(iBuilder->simd_sub(8, t0_62, iBuilder->simd_and(mask_eq_63, iBuilder->simd_fill(8, iBuilder->getInt8(12)))));
210}
211
212void base64Kernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
213    for (unsigned i = 0; i < 8; i++) {
214        Value * bytepack = iBuilder->loadInputStreamPack("radix64stream", iBuilder->getInt32(0), iBuilder->getInt32(i));
215        Value * base64pack = processPackData(iBuilder, bytepack);
216        iBuilder->storeOutputStreamPack("base64stream", iBuilder->getInt32(0), iBuilder->getInt32(i), base64pack);
217    }
218}
219
220// Special processing for the base 64 format.   The output must always contain a multiple
221// of 4 bytes.   When the number of radix 64 values is not a multiple of 4
222// number of radix 64 values
223void base64Kernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
224
225    BasicBlock * entry = b->GetInsertBlock();
226    BasicBlock * base64_loop = b->CreateBasicBlock("base64_loop");
227    BasicBlock * loopExit = b->CreateBasicBlock("loopExit");
228    BasicBlock * doPadding = b->CreateBasicBlock("doPadding");
229    BasicBlock * doPadding2 = b->CreateBasicBlock("doPadding2");
230    BasicBlock * fbExit = b->CreateBasicBlock("fbExit");
231
232    Constant * const ZERO = b->getSize(0);
233    Constant * const ONE = b->getSize(1);
234    Constant * const THREE = b->getSize(3);
235    Constant * const PADDING = b->getInt8('=');
236
237    Value * remainMod4 = b->CreateAnd(remainingBytes, THREE);
238    Value * padBytes = b->CreateAnd(b->CreateSub(b->getSize(4), remainMod4), THREE);
239
240    Constant * const PACK_SIZE = b->getSize(b->getStride() / 8);
241
242    // Enter the loop only if there is at least one byte remaining to process.
243    b->CreateCondBr(b->CreateICmpEQ(remainingBytes, ZERO), fbExit, base64_loop);
244
245    b->SetInsertPoint(base64_loop);
246    PHINode * idx = b->CreatePHI(b->getSizeTy(), 2);
247    PHINode * loopRemain = b->CreatePHI(b->getSizeTy(), 2);
248    idx->addIncoming(ZERO, entry);
249    loopRemain->addIncoming(remainingBytes, entry);
250    Value * bytepack = b->loadInputStreamPack("radix64stream", ZERO, idx);
251    Value * base64pack = processPackData(b, bytepack);
252    b->storeOutputStreamPack("base64stream", ZERO, idx, base64pack);
253    idx->addIncoming(b->CreateAdd(idx, ONE), base64_loop);
254    Value * remainAfterLoop = b->CreateSub(loopRemain, PACK_SIZE);
255    loopRemain->addIncoming(remainAfterLoop, base64_loop);
256    Value * continueLoop = b->CreateICmpUGT(loopRemain, PACK_SIZE);
257    b->CreateCondBr(continueLoop, base64_loop, loopExit);
258
259    b->SetInsertPoint(loopExit);
260    b->CreateCondBr(b->CreateICmpEQ(padBytes, ZERO), fbExit, doPadding);
261
262    b->SetInsertPoint(doPadding);
263    Value * i8output_ptr = b->getOutputStreamBlockPtr("base64stream", ZERO);
264    i8output_ptr = b->CreatePointerCast(i8output_ptr, b->getInt8PtrTy());
265    b->CreateStore(PADDING, b->CreateGEP(i8output_ptr, remainingBytes));
266    b->CreateCondBr(b->CreateICmpEQ(remainMod4, THREE), fbExit, doPadding2);
267
268    b->SetInsertPoint(doPadding2);
269    Value * finalPadPos = b->CreateAdd(remainingBytes, ONE);
270    b->CreateStore(PADDING, b->CreateGEP(i8output_ptr, finalPadPos));
271    b->CreateBr(fbExit);
272    b->SetInsertPoint(fbExit);
273}
274
275expand3_4Kernel::expand3_4Kernel(const std::unique_ptr<kernel::KernelBuilder> & b, StreamSet *input, StreamSet *expandedOutput)
276: MultiBlockKernel(b, "expand3_4",
277{Binding{"sourceStream", input, FixedRate(3)}},
278{Binding{"expand34Stream", expandedOutput, FixedRate(4)}},
279{}, {}, {}) {
280
281}
282
283radix64Kernel::radix64Kernel(const std::unique_ptr<kernel::KernelBuilder> & b, StreamSet * input, StreamSet * output)
284: BlockOrientedKernel(b, "radix64",
285            {Binding{"expandedStream", input}},
286            {Binding{"radix64stream", output}},
287            {}, {}, {}) {
288}
289
290base64Kernel::base64Kernel(const std::unique_ptr<kernel::KernelBuilder> & b, StreamSet * input, StreamSet * output)
291: BlockOrientedKernel(b, "base64",
292{Binding{"radix64stream", input}},
293{Binding{"base64stream", output, FixedRate(1), RoundUpTo(4)}},
294{}, {}, {}) {
295
296}
297
298}
Note: See TracBrowser for help on using the repository browser.