source: icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp @ 5902

Last change on this file since 5902 was 5837, checked in by cameron, 18 months ago

Pablo packh/packl and transposition with -enable-pablo-s2p

File size: 10.3 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "s2p_kernel.h"
7#include <kernels/kernel_builder.h>
8#include <pablo/pabloAST.h>
9#include <pablo/builder.hpp>
10#include <pablo/pe_pack.h>
11
12#include <llvm/Support/raw_ostream.h>
13
14using namespace llvm;
15
16namespace kernel {
17
18const int PACK_LANES = 2;
19
20void s2p_step(const std::unique_ptr<KernelBuilder> & iBuilder, Value * s0, Value * s1, Value * hi_mask, unsigned shift, Value * &p0, Value * &p1) {
21    Value * t0 = nullptr;
22    Value * t1 = nullptr;
23    if ((iBuilder->getBitBlockWidth() == 256) && (PACK_LANES == 2)) {
24        Value * x0 = iBuilder->esimd_mergel(128, s0, s1);
25        Value * x1 = iBuilder->esimd_mergeh(128, s0, s1);
26        t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1);
27        t1 = iBuilder->hsimd_packl_in_lanes(PACK_LANES, 16, x0, x1);
28    } else {
29        t0 = iBuilder->hsimd_packh(16, s0, s1);
30        t1 = iBuilder->hsimd_packl(16, s0, s1);
31    }
32    p0 = iBuilder->simd_if(1, hi_mask, t0, iBuilder->simd_srli(16, t1, shift));
33    p1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, t0, shift), t1);
34}
35
36void s2p(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[]) {
37    Value * bit00224466[4];
38    Value * bit11335577[4];
39
40    for (unsigned i = 0; i < 4; i++) {
41        Value * s0 = input[2 * i];
42        Value * s1 = input[2 * i + 1];
43        s2p_step(iBuilder, s0, s1, iBuilder->simd_himask(2), 1, bit00224466[i], bit11335577[i]);
44    }
45    Value * bit00004444[2];
46    Value * bit22226666[2];
47    Value * bit11115555[2];
48    Value * bit33337777[2];
49    for (unsigned j = 0; j<2; j++) {
50        s2p_step(iBuilder, bit00224466[2*j], bit00224466[2*j+1],
51                 iBuilder->simd_himask(4), 2, bit00004444[j], bit22226666[j]);
52        s2p_step(iBuilder, bit11335577[2*j], bit11335577[2*j+1],
53                 iBuilder->simd_himask(4), 2, bit11115555[j], bit33337777[j]);
54    }
55    s2p_step(iBuilder, bit00004444[0], bit00004444[1], iBuilder->simd_himask(8), 4, output[0], output[4]);
56    s2p_step(iBuilder, bit11115555[0], bit11115555[1], iBuilder->simd_himask(8), 4, output[1], output[5]);
57    s2p_step(iBuilder, bit22226666[0], bit22226666[1], iBuilder->simd_himask(8), 4, output[2], output[6]);
58    s2p_step(iBuilder, bit33337777[0], bit33337777[1], iBuilder->simd_himask(8), 4, output[3], output[7]);
59}
60
61/* Alternative transposition model, but small field width packs are problematic. */
62#if 0
63void s2p_ideal(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[]) {
64    Value * hi_nybble[4];
65    Value * lo_nybble[4];
66    for (unsigned i = 0; i<4; i++) {
67        Value * s0 = input[2*i];
68        Value * s1 = input[2*i+1];
69        hi_nybble[i] = iBuilder->hsimd_packh(8, s0, s1);
70        lo_nybble[i] = iBuilder->hsimd_packl(8, s0, s1);
71    }
72    Value * pair01[2];
73    Value * pair23[2];
74    Value * pair45[2];
75    Value * pair67[2];
76    for (unsigned i = 0; i<2; i++) {
77        pair01[i] = iBuilder->hsimd_packh(4, hi_nybble[2*i], hi_nybble[2*i+1]);
78        pair23[i] = iBuilder->hsimd_packl(4, hi_nybble[2*i], hi_nybble[2*i+1]);
79        pair45[i] = iBuilder->hsimd_packh(4, lo_nybble[2*i], lo_nybble[2*i+1]);
80        pair67[i] = iBuilder->hsimd_packl(4, lo_nybble[2*i], lo_nybble[2*i+1]);
81    }
82    output[0] = iBuilder->hsimd_packh(2, pair01[0], pair01[1]);
83    output[1] = iBuilder->hsimd_packl(2, pair01[0], pair01[1]);
84    output[2] = iBuilder->hsimd_packh(2, pair23[0], pair23[1]);
85    output[3] = iBuilder->hsimd_packl(2, pair23[0], pair23[1]);
86    output[4] = iBuilder->hsimd_packh(2, pair45[0], pair45[1]);
87    output[5] = iBuilder->hsimd_packl(2, pair45[0], pair45[1]);
88    output[6] = iBuilder->hsimd_packh(2, pair67[0], pair67[1]);
89    output[7] = iBuilder->hsimd_packl(2, pair67[0], pair67[1]);
90}
91#endif
92   
93#if 0
94void generateS2P_16Kernel(const std::unique_ptr<KernelBuilder> & iBuilder, Kernel * kBuilder) {
95    kBuilder->addInputStream(16, "unit_pack");
96    for(unsigned i = 0; i < 16; i++) {
97            kBuilder->addOutputStream(1);
98    }
99    Value * ptr = kBuilder->getInputStream(0);
100
101    Value * lo[8];
102    Value * hi[8];
103    for (unsigned i = 0; i < 8; i++) {
104        Value * s0 = iBuilder->CreateBlockAlignedLoad(ptr, {iBuilder->getInt32(0), iBuilder->getInt32(2 * i)});
105        Value * s1 = iBuilder->CreateBlockAlignedLoad(ptr, {iBuilder->getInt32(0), iBuilder->getInt32(2 * i + 1)});
106        lo[i] = iBuilder->hsimd_packl(16, s0, s1);
107        hi[i] = iBuilder->hsimd_packh(16, s0, s1);
108    }
109
110    Value * output[16];
111    s2p(iBuilder, lo, output);
112    s2p(iBuilder, hi, output + 8);
113    for (unsigned j = 0; j < 16; j++) {
114        iBuilder->CreateBlockAlignedStore(output[j], kBuilder->getOutputStream(j));
115    }
116}   
117#endif
118#ifdef S2P_MULTIBLOCK
119
120void S2PKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb) {
121    BasicBlock * entry = kb->GetInsertBlock();
122    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
123    BasicBlock * s2pDone = kb->CreateBasicBlock("s2pDone");
124   
125    Function::arg_iterator args = mCurrentMethod->arg_begin();
126    args++; //self
127    Value * itemsToDo = &*(args++);
128    // Get pointer to start of the StreamSetBlock containing unprocessed input items.
129    Value * byteStreamPtr = &*(args++);
130    Value * basisBitsPtr = &*(args++);
131   
132    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
133    Value * blocksToDo = kb->CreateUDivCeil(itemsToDo, blockWidth); // 1 if this is the final block
134   
135    kb->CreateBr(processBlock);
136   
137    kb->SetInsertPoint(processBlock);
138    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
139    blockOffsetPhi->addIncoming(kb->getSize(0), entry);
140
141    Value * bytePackPtr = kb->CreateGEP(byteStreamPtr, {blockOffsetPhi, kb->getInt32(0), kb->getInt32(0)});
142    Value * basisBlockPtr = kb->CreateGEP(basisBitsPtr, blockOffsetPhi);
143    Value * bytepack[8];
144    for (unsigned i = 0; i < 8; i++) {
145        bytepack[i] = kb->CreateBlockAlignedLoad(kb->CreateGEP(bytePackPtr, kb->getInt32(i)));
146    }
147    Value * basisbits[8];
148    s2p(kb, bytepack, basisbits);
149    for (unsigned basis_idx = 0; basis_idx < 8; ++basis_idx) {
150        kb->CreateBlockAlignedStore(basisbits[basis_idx], kb->CreateGEP(basisBlockPtr, {kb->getSize(0), kb->getInt32(basis_idx)}));
151    }
152    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
153    Value * moreToDo = kb->CreateICmpULT(blockOffsetPhi, blocksToDo);
154    blockOffsetPhi->addIncoming(nextBlk, processBlock);
155    kb->CreateCondBr(moreToDo, processBlock, s2pDone);
156    kb->SetInsertPoint(s2pDone);
157}
158#else
159void S2PKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
160    Value * bytepack[8];
161    for (unsigned i = 0; i < 8; i++) {
162        if (mAligned) {
163            bytepack[i] = iBuilder->loadInputStreamPack("byteStream", iBuilder->getInt32(0), iBuilder->getInt32(i));
164        } else {
165            Value * ptr = iBuilder->getInputStreamPackPtr("byteStream", iBuilder->getInt32(0), iBuilder->getInt32(i));
166            // CreateLoad defaults to aligned here, so we need to force the alignment to 1 byte.
167            bytepack[i] = iBuilder->CreateAlignedLoad(ptr, 1);           
168        }
169    }
170    Value * basisbits[8];
171    s2p(iBuilder, bytepack, basisbits);
172    for (unsigned i = 0; i < 8; ++i) {
173        iBuilder->storeOutputStreamBlock("basisBits", iBuilder->getInt32(i), basisbits[i]);
174    }
175}
176
177void S2PKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
178    // Prepare the s2p final block function:
179    // assumption: if remaining bytes is greater than 0, it is safe to read a full block of bytes.
180    //  if remaining bytes is zero, no read should be performed (e.g. for mmapped buffer).
181 
182    BasicBlock * finalPartialBlock = iBuilder->CreateBasicBlock("partial");
183    BasicBlock * finalEmptyBlock = iBuilder->CreateBasicBlock("empty");
184    BasicBlock * exitBlock = iBuilder->CreateBasicBlock("exit");
185   
186    Value * emptyBlockCond = iBuilder->CreateICmpEQ(remainingBytes, iBuilder->getSize(0));
187    iBuilder->CreateCondBr(emptyBlockCond, finalEmptyBlock, finalPartialBlock);
188    iBuilder->SetInsertPoint(finalPartialBlock);
189    CreateDoBlockMethodCall(iBuilder);
190   
191    iBuilder->CreateBr(exitBlock);
192   
193    iBuilder->SetInsertPoint(finalEmptyBlock);
194
195    for (unsigned i = 0; i < 8; ++i) {
196        iBuilder->storeOutputStreamBlock("basisBits", iBuilder->getInt32(i), Constant::getNullValue(iBuilder->getBitBlockType()));
197    }
198
199    iBuilder->CreateBr(exitBlock);
200   
201    iBuilder->SetInsertPoint(exitBlock);
202}
203#endif
204
205S2PKernel::S2PKernel(const std::unique_ptr<KernelBuilder> & b, bool aligned)
206#ifdef S2P_MULTIBLOCK
207    : MultiBlockKernel(aligned ? "s2p" : "s2p_unaligned",
208#else
209        : BlockOrientedKernel(aligned ? "s2p" : "s2p_unaligned",
210#endif
211    {Binding{b->getStreamSetTy(1, 8), "byteStream", FixedRate(), Principal()}},
212    {Binding{b->getStreamSetTy(8, 1), "basisBits"}}, {}, {}, {}),
213  mAligned(aligned) {
214    if (!aligned) {
215        mStreamSetInputs[0].addAttribute(Misaligned());
216    }
217}
218void S2P_PabloKernel::generatePabloMethod() {
219    pablo::PabloBlock * const pb = getEntryScope();
220    const unsigned steps = std::log2(mCodeUnitWidth);
221    std::vector<PabloAST *> streamSet[steps + 1];
222    for (unsigned i = 0; i <= steps; i++) {
223        streamSet[i].resize(1<<i);
224    }
225    streamSet[0][0] = pb->createExtract(getInputStreamVar("codeUnitStream"), pb->getInteger(0));
226    unsigned streamWidth = mCodeUnitWidth;
227    for (unsigned i = 1; i <= steps; i++) {
228        for (unsigned j = 0; j < streamSet[i-1].size(); j++) {
229            auto strm = streamSet[i-1][j];
230            streamSet[i][2*j] = pb->createPackL(pb->getInteger(streamWidth), strm);
231            streamSet[i][2*j+1] = pb->createPackH(pb->getInteger(streamWidth), strm);
232        }
233        streamWidth = streamWidth/2;
234    }
235    for (unsigned bit = 0; bit < mCodeUnitWidth; bit++) {
236        pb->createAssign(pb->createExtract(getOutputStreamVar("basisBits"), pb->getInteger(bit)), streamSet[steps][mCodeUnitWidth-1-bit]);
237    }
238}
239
240S2P_PabloKernel::S2P_PabloKernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned codeUnitWidth)
241: PabloKernel(b, "s2p_pablo" + std::to_string(codeUnitWidth),
242    {Binding{b->getStreamSetTy(1, codeUnitWidth), "codeUnitStream"}},
243    {Binding{b->getStreamSetTy(codeUnitWidth, 1), "basisBits"}}),
244  mCodeUnitWidth(codeUnitWidth) {
245}
246
247
248}
Note: See TracBrowser for help on using the repository browser.