source: icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp @ 5230

Last change on this file since 5230 was 5230, checked in by nmedfort, 2 years ago

Multi-threading support for PabloAST / PabloCompiler. Requires unique LLVM Context / Module for each thread.

File size: 8.2 KB
RevLine 
[4939]1/*
[4959]2 *  Copyright (c) 2016 International Characters.
[4939]3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "s2p_kernel.h"
[4959]6#include <kernels/kernel.h>
7#include <IDISA/idisa_builder.h>
[5063]8#include <llvm/Support/raw_ostream.h>
[4939]9
[4974]10namespace kernel {
[5063]11using namespace llvm;
[4974]12
// Lane count handed to the *_in_lanes pack operations in s2p_step.  That
// in-lane path is only selected when this value is 2 (and the bit block
// width is 256); with the value 1 the plain hsimd_packh/packl path is used.
constexpr int PACK_LANES = 1;
[4939]14
15void s2p_step(IDISA::IDISA_Builder * iBuilder, Value * s0, Value * s1, Value * hi_mask, unsigned shift, Value * &p0, Value * &p1) {
[4957]16    Value * t0 = nullptr;
17    Value * t1 = nullptr;
18    if ((iBuilder->getBitBlockWidth() == 256) && (PACK_LANES == 2)) {
19        Value * x0 = iBuilder->esimd_mergel(128, s0, s1);
20        Value * x1 = iBuilder->esimd_mergeh(128, s0, s1);
21        t0 = iBuilder->hsimd_packh_in_lanes(PACK_LANES, 16, x0, x1);
22        t1 = iBuilder->hsimd_packl_in_lanes(PACK_LANES, 16, x0, x1);
23    }
24    else {
25        t0 = iBuilder->hsimd_packh(16, s0, s1);
26        t1 = iBuilder->hsimd_packl(16, s0, s1);
27    }
[4939]28    p0 = iBuilder->simd_if(1, hi_mask, t0, iBuilder->simd_srli(16, t1, shift));
29    p1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, t0, shift), t1);
30}
31
[5045]32void s2p(IDISA::IDISA_Builder * iBuilder, Value * input[], Value * output[]) {
[4939]33    Value * bit00224466[4];
34    Value * bit11335577[4];
[4986]35
36    for (unsigned i = 0; i < 4; i++) {
[5045]37        Value * s0 = input[2 * i];
38        Value * s1 = input[2 * i + 1];
[4959]39        s2p_step(iBuilder, s0, s1, iBuilder->simd_himask(2), 1, bit00224466[i], bit11335577[i]);
[4939]40    }
41    Value * bit00004444[2];
42    Value * bit22226666[2];
43    Value * bit11115555[2];
44    Value * bit33337777[2];
45    for (unsigned j = 0; j<2; j++) {
46        s2p_step(iBuilder, bit00224466[2*j], bit00224466[2*j+1],
47                 iBuilder->simd_himask(4), 2, bit00004444[j], bit22226666[j]);
48        s2p_step(iBuilder, bit11335577[2*j], bit11335577[2*j+1],
49                 iBuilder->simd_himask(4), 2, bit11115555[j], bit33337777[j]);
50    }
[4959]51    s2p_step(iBuilder, bit00004444[0], bit00004444[1], iBuilder->simd_himask(8), 4, output[0], output[4]);
52    s2p_step(iBuilder, bit11115555[0], bit11115555[1], iBuilder->simd_himask(8), 4, output[1], output[5]);
53    s2p_step(iBuilder, bit22226666[0], bit22226666[1], iBuilder->simd_himask(8), 4, output[2], output[6]);
54    s2p_step(iBuilder, bit33337777[0], bit33337777[1], iBuilder->simd_himask(8), 4, output[3], output[7]);
[4939]55}
56
/* Alternative transposition model, but small field width packs are problematic. */
#if 0
/*
 * Disabled variant of s2p that uses packh/packl directly at field widths
 * 8, 4 and 2 instead of going through s2p_step.  Reads input[0..7] pairwise
 * and writes output[0..7].
 */
void s2p_ideal(IDISA::IDISA_Builder * iBuilder, Value * input[], Value * output[]) {
    // Width-8 packs: one hi/lo nybble value per input pair.
    Value * hi_nybble[4];
    Value * lo_nybble[4];
    for (unsigned k = 0; k < 4; ++k) {
        Value * first = input[2 * k];
        Value * second = input[2 * k + 1];
        hi_nybble[k] = iBuilder->hsimd_packh(8, first, second);
        lo_nybble[k] = iBuilder->hsimd_packl(8, first, second);
    }

    // Width-4 packs: split each nybble group into bit pairs.
    Value * pair01[2];
    Value * pair23[2];
    Value * pair45[2];
    Value * pair67[2];
    for (unsigned k = 0; k < 2; ++k) {
        const unsigned a = 2 * k;
        const unsigned b = a + 1;
        pair01[k] = iBuilder->hsimd_packh(4, hi_nybble[a], hi_nybble[b]);
        pair23[k] = iBuilder->hsimd_packl(4, hi_nybble[a], hi_nybble[b]);
        pair45[k] = iBuilder->hsimd_packh(4, lo_nybble[a], lo_nybble[b]);
        pair67[k] = iBuilder->hsimd_packl(4, lo_nybble[a], lo_nybble[b]);
    }

    // Width-2 packs: each bit-pair group yields two consecutive outputs.
    Value ** const pairs[4] = {pair01, pair23, pair45, pair67};
    for (unsigned g = 0; g < 4; ++g) {
        output[2 * g] = iBuilder->hsimd_packh(2, pairs[g][0], pairs[g][1]);
        output[2 * g + 1] = iBuilder->hsimd_packl(2, pairs[g][0], pairs[g][1]);
    }
}
#endif
88   
89   
#if 0

/*
 * Disabled kernel generator for 16-bit code units.
 * Declares one 16-bit input stream ("unit_pack") and sixteen 1-bit output
 * streams, splits each pair of loaded blocks with packl/packh at width 16,
 * runs s2p on the two resulting groups of eight, and stores the sixteen
 * results to the output streams.
 */
void generateS2P_16Kernel(Module *, IDISA::IDISA_Builder * iBuilder, KernelBuilder * kBuilder) {
    kBuilder->addInputStream(16, "unit_pack");
    for (unsigned n = 0; n < 16; ++n) {
        kBuilder->addOutputStream(1);
    }
    kBuilder->prepareFunction();

    Value * inputPtr = kBuilder->getInputStream(0);

    Value * lowGroup[8];
    Value * highGroup[8];
    for (unsigned k = 0; k < 8; ++k) {
        Value * first = iBuilder->CreateBlockAlignedLoad(inputPtr, {iBuilder->getInt32(0), iBuilder->getInt32(2 * k)});
        Value * second = iBuilder->CreateBlockAlignedLoad(inputPtr, {iBuilder->getInt32(0), iBuilder->getInt32(2 * k + 1)});
        lowGroup[k] = iBuilder->hsimd_packl(16, first, second);
        highGroup[k] = iBuilder->hsimd_packh(16, first, second);
    }

    // First eight results come from the packl group, the last eight from the packh group.
    Value * streams[16];
    s2p(iBuilder, lowGroup, streams);
    s2p(iBuilder, highGroup, streams + 8);
    for (unsigned n = 0; n < 16; ++n) {
        iBuilder->CreateBlockAlignedStore(streams[n], kBuilder->getOutputStream(n));
    }
    kBuilder->finalize();
}

#endif
[4976]121   
[5230]122void S2PKernel::generateFinalBlockMethod() {
[5074]123    /* Prepare the s2p final block function:
[5051]124     assumption: if remaining bytes is greater than 0, it is safe to read a full block of bytes.
125     if remaining bytes is zero, no read should be performed (e.g. for mmapped buffer).
126     */
[5202]127    auto savePoint = iBuilder->saveIP();
[5063]128    Module * m = iBuilder->getModule();
129    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
130    Function * finalBlockFunction = m->getFunction(mKernelName + finalBlock_suffix);
[5051]131    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "fb_entry", finalBlockFunction, 0));
[5063]132   
[5051]133    Value * self = getParameter(finalBlockFunction, "self");
134    Value * remainingBytes = getParameter(finalBlockFunction, "remainingBytes");
135   
136    BasicBlock * finalPartialBlock = BasicBlock::Create(iBuilder->getContext(), "partial", finalBlockFunction, 0);
137    BasicBlock * finalEmptyBlock = BasicBlock::Create(iBuilder->getContext(), "empty", finalBlockFunction, 0);
138    BasicBlock * exitBlock = BasicBlock::Create(iBuilder->getContext(), "exit", finalBlockFunction, 0);
139   
[5106]140    Value * emptyBlockCond = iBuilder->CreateICmpEQ(remainingBytes, ConstantInt::get(iBuilder->getSizeTy(), 0));
[5051]141    iBuilder->CreateCondBr(emptyBlockCond, finalEmptyBlock, finalPartialBlock);
142    iBuilder->SetInsertPoint(finalPartialBlock);
[5096]143    iBuilder->CreateCall(doBlockFunction, {self});
[5183]144    /* Adjust the produced item count */
145    Value * produced = getProducedItemCount(self);
146    produced = iBuilder->CreateSub(produced, ConstantInt::get(iBuilder->getSizeTy(), iBuilder->getStride()));
147    setProducedItemCount(self, iBuilder->CreateAdd(produced, remainingBytes));
[5051]148   
149    iBuilder->CreateBr(exitBlock);
150   
151    iBuilder->SetInsertPoint(finalEmptyBlock);
[5096]152    Value * blockNo = getScalarField(self, blockNoScalar);
[5104]153    Value * basisBitsBlock_ptr = getStreamSetBlockPtr(self, "basisBits", blockNo);
[5051]154    iBuilder->CreateStore(Constant::getNullValue(basisBitsBlock_ptr->getType()->getPointerElementType()), basisBitsBlock_ptr);
155    iBuilder->CreateBr(exitBlock);
156   
157    iBuilder->SetInsertPoint(exitBlock);
158    iBuilder->CreateRetVoid();
[5063]159    iBuilder->restoreIP(savePoint);
160}
[5051]161
[5074]162   
[5230]163void S2PKernel::generateDoBlockLogic(Value * self, Value * blockNo) {
[5217]164
[5229]165    Value * byteStream = getStreamSetBlockPtr(self, "byteStream", blockNo);
[5202]166    Value * basisBits = getStreamSetBlockPtr(self, "basisBits", blockNo);
167
168    Value * bytepack[8];
[5063]169    for (unsigned i = 0; i < 8; i++) {
[5229]170        Value * ptr = iBuilder->CreateGEP(byteStream, {iBuilder->getInt32(0), iBuilder->getInt32(0), iBuilder->getInt32(i)});
[5217]171        bytepack[i] = iBuilder->CreateBlockAlignedLoad(ptr);
[5063]172    }
[5202]173    Value * bitblock[8];
174    s2p(iBuilder, bytepack, bitblock);
175    for (unsigned i = 0; i < 8; ++i) {
176        iBuilder->CreateBlockAlignedStore(bitblock[i], basisBits, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
[5063]177    }
[5183]178    Value * produced = getProducedItemCount(self);
179    produced = iBuilder->CreateAdd(produced, ConstantInt::get(iBuilder->getSizeTy(), iBuilder->getStride()));
180    setProducedItemCount(self, produced);   
[5174]181}
182   
[5230]183void S2PKernel::generateDoBlockMethod() {
[5202]184    auto savePoint = iBuilder->saveIP();
185
186    Function * doBlockFunction = iBuilder->getModule()->getFunction(mKernelName + doBlock_suffix);
[5174]187   
188    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
189   
190    Value * self = getParameter(doBlockFunction, "self");
191    Value * blockNo = getScalarField(self, blockNoScalar);
192   
193    generateDoBlockLogic(self, blockNo);
[5202]194
[5063]195    iBuilder->CreateRetVoid();
196    iBuilder->restoreIP(savePoint);
[4976]197}
[5051]198}
Note: See TracBrowser for help on using the repository browser.