source: icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp @ 5755

Last change on this file since 5755 was 5755, checked in by nmedfort, 16 months ago

Bug fixes and simplified MultiBlockKernel logic

File size: 8.8 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "s2p_kernel.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14const int PACK_LANES = 2;
15
16void s2p_step(const std::unique_ptr<KernelBuilder> & iBuilder, Value * s0, Value * s1, Value * hi_mask, unsigned shift, Value * &p0, Value * &p1) {
17    Value * t0 = nullptr;
18    Value * t1 = nullptr;
19    if ((iBuilder->getBitBlockWidth() == 256) && (PACK_LANES == 2)) {
20        Value * x0 = iBuilder->esimd_mergel(128, s0, s1);
21        Value * x1 = iBuilder->esimd_mergeh(128, s0, s1);
22        t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1);
23        t1 = iBuilder->hsimd_packl_in_lanes(PACK_LANES, 16, x0, x1);
24    } else {
25        t0 = iBuilder->hsimd_packh(16, s0, s1);
26        t1 = iBuilder->hsimd_packl(16, s0, s1);
27    }
28    p0 = iBuilder->simd_if(1, hi_mask, t0, iBuilder->simd_srli(16, t1, shift));
29    p1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, t0, shift), t1);
30}
31
32void s2p(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[]) {
33    Value * bit00224466[4];
34    Value * bit11335577[4];
35
36    for (unsigned i = 0; i < 4; i++) {
37        Value * s0 = input[2 * i];
38        Value * s1 = input[2 * i + 1];
39        s2p_step(iBuilder, s0, s1, iBuilder->simd_himask(2), 1, bit00224466[i], bit11335577[i]);
40    }
41    Value * bit00004444[2];
42    Value * bit22226666[2];
43    Value * bit11115555[2];
44    Value * bit33337777[2];
45    for (unsigned j = 0; j<2; j++) {
46        s2p_step(iBuilder, bit00224466[2*j], bit00224466[2*j+1],
47                 iBuilder->simd_himask(4), 2, bit00004444[j], bit22226666[j]);
48        s2p_step(iBuilder, bit11335577[2*j], bit11335577[2*j+1],
49                 iBuilder->simd_himask(4), 2, bit11115555[j], bit33337777[j]);
50    }
51    s2p_step(iBuilder, bit00004444[0], bit00004444[1], iBuilder->simd_himask(8), 4, output[0], output[4]);
52    s2p_step(iBuilder, bit11115555[0], bit11115555[1], iBuilder->simd_himask(8), 4, output[1], output[5]);
53    s2p_step(iBuilder, bit22226666[0], bit22226666[1], iBuilder->simd_himask(8), 4, output[2], output[6]);
54    s2p_step(iBuilder, bit33337777[0], bit33337777[1], iBuilder->simd_himask(8), 4, output[3], output[7]);
55}
56
57/* Alternative transposition model, but small field width packs are problematic. */
58#if 0
59void s2p_ideal(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[]) {
60    Value * hi_nybble[4];
61    Value * lo_nybble[4];
62    for (unsigned i = 0; i<4; i++) {
63        Value * s0 = input[2*i];
64        Value * s1 = input[2*i+1];
65        hi_nybble[i] = iBuilder->hsimd_packh(8, s0, s1);
66        lo_nybble[i] = iBuilder->hsimd_packl(8, s0, s1);
67    }
68    Value * pair01[2];
69    Value * pair23[2];
70    Value * pair45[2];
71    Value * pair67[2];
72    for (unsigned i = 0; i<2; i++) {
73        pair01[i] = iBuilder->hsimd_packh(4, hi_nybble[2*i], hi_nybble[2*i+1]);
74        pair23[i] = iBuilder->hsimd_packl(4, hi_nybble[2*i], hi_nybble[2*i+1]);
75        pair45[i] = iBuilder->hsimd_packh(4, lo_nybble[2*i], lo_nybble[2*i+1]);
76        pair67[i] = iBuilder->hsimd_packl(4, lo_nybble[2*i], lo_nybble[2*i+1]);
77    }
78    output[0] = iBuilder->hsimd_packh(2, pair01[0], pair01[1]);
79    output[1] = iBuilder->hsimd_packl(2, pair01[0], pair01[1]);
80    output[2] = iBuilder->hsimd_packh(2, pair23[0], pair23[1]);
81    output[3] = iBuilder->hsimd_packl(2, pair23[0], pair23[1]);
82    output[4] = iBuilder->hsimd_packh(2, pair45[0], pair45[1]);
83    output[5] = iBuilder->hsimd_packl(2, pair45[0], pair45[1]);
84    output[6] = iBuilder->hsimd_packh(2, pair67[0], pair67[1]);
85    output[7] = iBuilder->hsimd_packl(2, pair67[0], pair67[1]);
86}
87#endif
88   
89#if 0
90void generateS2P_16Kernel(const std::unique_ptr<KernelBuilder> & iBuilder, Kernel * kBuilder) {
91    kBuilder->addInputStream(16, "unit_pack");
92    for(unsigned i = 0; i < 16; i++) {
93            kBuilder->addOutputStream(1);
94    }
95    Value * ptr = kBuilder->getInputStream(0);
96
97    Value * lo[8];
98    Value * hi[8];
99    for (unsigned i = 0; i < 8; i++) {
100        Value * s0 = iBuilder->CreateBlockAlignedLoad(ptr, {iBuilder->getInt32(0), iBuilder->getInt32(2 * i)});
101        Value * s1 = iBuilder->CreateBlockAlignedLoad(ptr, {iBuilder->getInt32(0), iBuilder->getInt32(2 * i + 1)});
102        lo[i] = iBuilder->hsimd_packl(16, s0, s1);
103        hi[i] = iBuilder->hsimd_packh(16, s0, s1);
104    }
105
106    Value * output[16];
107    s2p(iBuilder, lo, output);
108    s2p(iBuilder, hi, output + 8);
109    for (unsigned j = 0; j < 16; j++) {
110        iBuilder->CreateBlockAlignedStore(output[j], kBuilder->getOutputStream(j));
111    }
112}   
113#endif
114#ifdef S2P_MULTIBLOCK
115
116void S2PKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb) {
117    BasicBlock * entry = kb->GetInsertBlock();
118    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
119    BasicBlock * s2pDone = kb->CreateBasicBlock("s2pDone");
120   
121    Function::arg_iterator args = mCurrentMethod->arg_begin();
122    args++; //self
123    Value * itemsToDo = &*(args++);
124    // Get pointer to start of the StreamSetBlock containing unprocessed input items.
125    Value * byteStreamPtr = &*(args++);
126    Value * basisBitsPtr = &*(args++);
127   
128    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
129    Value * blocksToDo = kb->CreateUDivCeil(itemsToDo, blockWidth); // 1 if this is the final block
130   
131    kb->CreateBr(processBlock);
132   
133    kb->SetInsertPoint(processBlock);
134    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
135    blockOffsetPhi->addIncoming(kb->getSize(0), entry);
136
137    Value * bytePackPtr = kb->CreateGEP(byteStreamPtr, {blockOffsetPhi, kb->getInt32(0), kb->getInt32(0)});
138    Value * basisBlockPtr = kb->CreateGEP(basisBitsPtr, blockOffsetPhi);
139    Value * bytepack[8];
140    for (unsigned i = 0; i < 8; i++) {
141        bytepack[i] = kb->CreateBlockAlignedLoad(kb->CreateGEP(bytePackPtr, kb->getInt32(i)));
142    }
143    Value * basisbits[8];
144    s2p(kb, bytepack, basisbits);
145    for (unsigned basis_idx = 0; basis_idx < 8; ++basis_idx) {
146        kb->CreateBlockAlignedStore(basisbits[basis_idx], kb->CreateGEP(basisBlockPtr, {kb->getSize(0), kb->getInt32(basis_idx)}));
147    }
148    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
149    Value * moreToDo = kb->CreateICmpULT(blockOffsetPhi, blocksToDo);
150    blockOffsetPhi->addIncoming(nextBlk, processBlock);
151    kb->CreateCondBr(moreToDo, processBlock, s2pDone);
152    kb->SetInsertPoint(s2pDone);
153}
154#else
155void S2PKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
156    Value * bytepack[8];
157    for (unsigned i = 0; i < 8; i++) {
158        if (mAligned) {
159            bytepack[i] = iBuilder->loadInputStreamPack("byteStream", iBuilder->getInt32(0), iBuilder->getInt32(i));
160        } else {
161            Value * ptr = iBuilder->getInputStreamPackPtr("byteStream", iBuilder->getInt32(0), iBuilder->getInt32(i));
162            // CreateLoad defaults to aligned here, so we need to force the alignment to 1 byte.
163            bytepack[i] = iBuilder->CreateAlignedLoad(ptr, 1);
164        }
165    }
166    Value * basisbits[8];
167    s2p(iBuilder, bytepack, basisbits);
168    for (unsigned i = 0; i < 8; ++i) {
169        iBuilder->storeOutputStreamBlock("basisBits", iBuilder->getInt32(i), basisbits[i]);
170    }
171}
172
173void S2PKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
174    // Prepare the s2p final block function:
175    // assumption: if remaining bytes is greater than 0, it is safe to read a full block of bytes.
176    //  if remaining bytes is zero, no read should be performed (e.g. for mmapped buffer).
177 
178    BasicBlock * finalPartialBlock = iBuilder->CreateBasicBlock("partial");
179    BasicBlock * finalEmptyBlock = iBuilder->CreateBasicBlock("empty");
180    BasicBlock * exitBlock = iBuilder->CreateBasicBlock("exit");
181   
182    Value * emptyBlockCond = iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0));
183    iBuilder->CreateCondBr(emptyBlockCond, finalEmptyBlock, finalPartialBlock);
184    iBuilder->SetInsertPoint(finalPartialBlock);
185    CreateDoBlockMethodCall(iBuilder);
186   
187    iBuilder->CreateBr(exitBlock);
188   
189    iBuilder->SetInsertPoint(finalEmptyBlock);
190
191    for (unsigned i = 0; i < 8; ++i) {
192        iBuilder->storeOutputStreamBlock("basisBits", iBuilder->getInt32(i), Constant::getNullValue(iBuilder->getBitBlockType()));
193    }
194
195    iBuilder->CreateBr(exitBlock);
196   
197    iBuilder->SetInsertPoint(exitBlock);
198}
199#endif
200
201S2PKernel::S2PKernel(const std::unique_ptr<KernelBuilder> & b, bool aligned)
202#ifdef S2P_MULTIBLOCK
203    : MultiBlockKernel(aligned ? "s2p" : "s2p_unaligned",
204#else
205        : BlockOrientedKernel(aligned ? "s2p" : "s2p_unaligned",
206#endif
207    {Binding{b->getStreamSetTy(1, 8), "byteStream", FixedRate(), Principal()}},
208    {Binding{b->getStreamSetTy(8, 1), "basisBits"}}, {}, {}, {}),
209  mAligned(aligned) {
210    setNoTerminateAttribute(true);
211}
212
213}
Note: See TracBrowser for help on using the repository browser.