source: icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp @ 6026

Last change on this file since 6026 was 6026, checked in by xwa163, 14 months ago
  1. Implement SwizzledMultiplePDEPkernel with the same logic as new PDEPkernel, remove LZ4MultiplePDEPkernel, improve the performance
  2. Remove some unnecessary includes
  3. Add prefix for some kernels
  4. Remove a legacy kernel
File size: 7.9 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "s2p_kernel.h"
7#include <kernels/kernel_builder.h>
8#include <pablo/pabloAST.h>
9#include <pablo/builder.hpp>
10#include <pablo/pe_pack.h>
11
12#include <llvm/Support/raw_ostream.h>
13
14using namespace llvm;
15
16namespace kernel {
17
18const int PACK_LANES = 2;
19
20void s2p_step(const std::unique_ptr<KernelBuilder> & iBuilder, Value * s0, Value * s1, Value * hi_mask, unsigned shift, Value * &p0, Value * &p1) {
21    Value * t0 = nullptr;
22    Value * t1 = nullptr;
23    if ((iBuilder->getBitBlockWidth() == 256) && (PACK_LANES == 2)) {
24        Value * x0 = iBuilder->esimd_mergel(128, s0, s1);
25        Value * x1 = iBuilder->esimd_mergeh(128, s0, s1);
26        t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1);
27        t1 = iBuilder->hsimd_packl_in_lanes(PACK_LANES, 16, x0, x1);
28    } else {
29        t0 = iBuilder->hsimd_packh(16, s0, s1);
30        t1 = iBuilder->hsimd_packl(16, s0, s1);
31    }
32    p0 = iBuilder->simd_if(1, hi_mask, t0, iBuilder->simd_srli(16, t1, shift));
33    p1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, t0, shift), t1);
34}
35
36void s2p(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[]) {
37    Value * bit00224466[4];
38    Value * bit11335577[4];
39
40    for (unsigned i = 0; i < 4; i++) {
41        Value * s0 = input[2 * i];
42        Value * s1 = input[2 * i + 1];
43        s2p_step(iBuilder, s0, s1, iBuilder->simd_himask(2), 1, bit00224466[i], bit11335577[i]);
44    }
45    Value * bit00004444[2];
46    Value * bit22226666[2];
47    Value * bit11115555[2];
48    Value * bit33337777[2];
49    for (unsigned j = 0; j<2; j++) {
50        s2p_step(iBuilder, bit00224466[2*j], bit00224466[2*j+1],
51                 iBuilder->simd_himask(4), 2, bit00004444[j], bit22226666[j]);
52        s2p_step(iBuilder, bit11335577[2*j], bit11335577[2*j+1],
53                 iBuilder->simd_himask(4), 2, bit11115555[j], bit33337777[j]);
54    }
55    s2p_step(iBuilder, bit00004444[0], bit00004444[1], iBuilder->simd_himask(8), 4, output[0], output[4]);
56    s2p_step(iBuilder, bit11115555[0], bit11115555[1], iBuilder->simd_himask(8), 4, output[1], output[5]);
57    s2p_step(iBuilder, bit22226666[0], bit22226666[1], iBuilder->simd_himask(8), 4, output[2], output[6]);
58    s2p_step(iBuilder, bit33337777[0], bit33337777[1], iBuilder->simd_himask(8), 4, output[3], output[7]);
59}
60
/* Alternative transposition model; disabled because packs with small field widths are problematic. */
#if 0
void s2p_ideal(const std::unique_ptr<KernelBuilder> & iBuilder, Value * input[], Value * output[]) {
    // Stage 1: separate every pair of inputs into high and low halves (field width 8).
    Value * highHalf[4];
    Value * lowHalf[4];
    for (unsigned k = 0; k != 4; ++k) {
        Value * const first = input[2 * k];
        Value * const second = input[2 * k + 1];
        highHalf[k] = iBuilder->hsimd_packh(8, first, second);
        lowHalf[k] = iBuilder->hsimd_packl(8, first, second);
    }

    // Stage 2: pack again at field width 4.
    Value * pair01[2];
    Value * pair23[2];
    Value * pair45[2];
    Value * pair67[2];
    for (unsigned k = 0; k != 2; ++k) {
        const unsigned lo = 2 * k;
        const unsigned hi = lo + 1;
        pair01[k] = iBuilder->hsimd_packh(4, highHalf[lo], highHalf[hi]);
        pair23[k] = iBuilder->hsimd_packl(4, highHalf[lo], highHalf[hi]);
        pair45[k] = iBuilder->hsimd_packh(4, lowHalf[lo], lowHalf[hi]);
        pair67[k] = iBuilder->hsimd_packl(4, lowHalf[lo], lowHalf[hi]);
    }

    // Stage 3: final pack at field width 2; pair group k fills output[2k] and output[2k + 1].
    Value ** const pairs[4] = { pair01, pair23, pair45, pair67 };
    for (unsigned k = 0; k != 4; ++k) {
        output[2 * k] = iBuilder->hsimd_packh(2, pairs[k][0], pairs[k][1]);
        output[2 * k + 1] = iBuilder->hsimd_packl(2, pairs[k][0], pairs[k][1]);
    }
}
#endif
92   
#if 0
// Disabled kernel generator: declares a 16-bit "unit_pack" input stream and
// sixteen 1-bit output streams, splits the loaded packs into low and high
// halves, and runs s2p over each half.
void generateS2P_16Kernel(const std::unique_ptr<KernelBuilder> & iBuilder, Kernel * kBuilder) {
    kBuilder->addInputStream(16, "unit_pack");
    for (unsigned n = 0; n != 16; ++n) {
        kBuilder->addOutputStream(1);
    }
    Value * const packs = kBuilder->getInputStream(0);

    Value * lowHalf[8];
    Value * highHalf[8];
    for (unsigned k = 0; k != 8; ++k) {
        Value * const first = iBuilder->CreateBlockAlignedLoad(packs, {iBuilder->getInt32(0), iBuilder->getInt32(2 * k)});
        Value * const second = iBuilder->CreateBlockAlignedLoad(packs, {iBuilder->getInt32(0), iBuilder->getInt32(2 * k + 1)});
        lowHalf[k] = iBuilder->hsimd_packl(16, first, second);
        highHalf[k] = iBuilder->hsimd_packh(16, first, second);
    }

    // The low halves fill results[0..7], the high halves results[8..15].
    Value * results[16];
    s2p(iBuilder, lowHalf, &results[0]);
    s2p(iBuilder, highHalf, &results[8]);
    for (unsigned n = 0; n != 16; ++n) {
        iBuilder->CreateBlockAlignedStore(results[n], kBuilder->getOutputStream(n));
    }
}
#endif
118
119void S2PKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfBlocks) {
120    BasicBlock * entry = kb->GetInsertBlock();
121    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
122    BasicBlock * s2pDone = kb->CreateBasicBlock("s2pDone");
123    Constant * const ZERO = kb->getSize(0);
124
125    kb->CreateBr(processBlock);
126   
127    kb->SetInsertPoint(processBlock);
128    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
129    blockOffsetPhi->addIncoming(ZERO, entry);
130
131    Value * bytepack[8];
132    for (unsigned i = 0; i < 8; i++) {
133        if (mAligned) {
134            bytepack[i] = kb->loadInputStreamPack("byteStream", ZERO, kb->getInt32(i), blockOffsetPhi);
135        } else {
136            Value * ptr = kb->getInputStreamPackPtr("byteStream", ZERO, kb->getInt32(i), blockOffsetPhi);
137            // CreateLoad defaults to aligned here, so we need to force the alignment to 1 byte.
138            bytepack[i] = kb->CreateAlignedLoad(ptr, 1);
139        }
140    }
141    Value * basisbits[8];
142    s2p(kb, bytepack, basisbits);
143    for (unsigned i = 0; i < 8; ++i) {
144        kb->storeOutputStreamBlock("basisBits", kb->getInt32(i), blockOffsetPhi, basisbits[i]);
145    }
146    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
147    blockOffsetPhi->addIncoming(nextBlk, processBlock);
148    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
149    kb->CreateCondBr(moreToDo, processBlock, s2pDone);
150    kb->SetInsertPoint(s2pDone);
151}
152S2PKernel::S2PKernel(const std::unique_ptr<KernelBuilder> & b, bool aligned, std::string prefix)
153: MultiBlockKernel(aligned ? prefix + "s2p" : prefix + "s2p_unaligned",
154    {Binding{b->getStreamSetTy(1, 8), "byteStream", FixedRate(), Principal()}},
155    {Binding{b->getStreamSetTy(8, 1), "basisBits"}}, {}, {}, {}),
156  mAligned(aligned) {
157    if (!aligned) {
158        mStreamSetInputs[0].addAttribute(Misaligned());
159    }
160}
161void S2P_PabloKernel::generatePabloMethod() {
162    pablo::PabloBlock * const pb = getEntryScope();
163    const unsigned steps = std::log2(mCodeUnitWidth);
164    std::vector<PabloAST *> streamSet[steps + 1];
165    for (unsigned i = 0; i <= steps; i++) {
166        streamSet[i].resize(1<<i);
167    }
168    streamSet[0][0] = pb->createExtract(getInputStreamVar("codeUnitStream"), pb->getInteger(0));
169    unsigned streamWidth = mCodeUnitWidth;
170    for (unsigned i = 1; i <= steps; i++) {
171        for (unsigned j = 0; j < streamSet[i-1].size(); j++) {
172            auto strm = streamSet[i-1][j];
173            streamSet[i][2*j] = pb->createPackL(pb->getInteger(streamWidth), strm);
174            streamSet[i][2*j+1] = pb->createPackH(pb->getInteger(streamWidth), strm);
175        }
176        streamWidth = streamWidth/2;
177    }
178    for (unsigned bit = 0; bit < mCodeUnitWidth; bit++) {
179        pb->createAssign(pb->createExtract(getOutputStreamVar("basisBits"), pb->getInteger(bit)), streamSet[steps][mCodeUnitWidth-1-bit]);
180    }
181}
182
183S2P_PabloKernel::S2P_PabloKernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned codeUnitWidth)
184: PabloKernel(b, "s2p_pablo" + std::to_string(codeUnitWidth),
185    {Binding{b->getStreamSetTy(1, codeUnitWidth), "codeUnitStream"}},
186    {Binding{b->getStreamSetTy(codeUnitWidth, 1), "basisBits"}}),
187  mCodeUnitWidth(codeUnitWidth) {
188}
189
190
191}
Note: See TracBrowser for help on using the repository browser.