source: icGREP/icgrep-devel/icgrep/kernels/p2s_kernel.cpp @ 5217

Last change on this file since 5217 was 5217, checked in by nmedfort, 2 years ago

Merged PabloFunction and PabloKernel classes. Updated projects where necessary.

File size: 10.8 KB
Line 
1#include "p2s_kernel.h"
2#include "kernels/kernel.h"
3#include "IDISA/idisa_builder.h"
4#include <llvm/IR/TypeBuilder.h>
5#include <llvm/IR/Type.h>
6#include <iostream>
7#include <stdint.h>
8#include <llvm/Support/FileSystem.h>
9#include <llvm/Support/raw_ostream.h>
10
11
12
13namespace kernel{
14       
15void p2s_step(IDISA::IDISA_Builder * iBuilder, Value * p0, Value * p1, Value * hi_mask, unsigned shift, Value * &s1, Value * &s0) {
16    Value * t0 = iBuilder->simd_if(1, hi_mask, p0, iBuilder->simd_srli(16, p1, shift));
17    Value * t1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, p0, shift), p1);
18    s1 = iBuilder->esimd_mergeh(8, t1, t0);
19    s0 = iBuilder->esimd_mergel(8, t1, t0);
20}
21
22inline void p2s(IDISA::IDISA_Builder * iBuilder, Value * p[], Value * s[]) {
23    Value * bit00004444[2];
24    Value * bit22226666[2];
25    Value * bit11115555[2];
26    Value * bit33337777[2];
27    p2s_step(iBuilder, p[0], p[4], iBuilder->simd_himask(8), 4, bit00004444[1], bit00004444[0]);
28    p2s_step(iBuilder, p[1], p[5], iBuilder->simd_himask(8), 4, bit11115555[1], bit11115555[0]);
29    p2s_step(iBuilder, p[2], p[6], iBuilder->simd_himask(8), 4, bit22226666[1], bit22226666[0]);
30    p2s_step(iBuilder, p[3], p[7], iBuilder->simd_himask(8), 4, bit33337777[1], bit33337777[0]);
31
32    Value * bit00224466[4];
33    Value * bit11335577[4];
34    for (unsigned j = 0; j<2; j++) {
35        p2s_step(iBuilder, bit00004444[j], bit22226666[j],iBuilder->simd_himask(4), 2, bit00224466[2*j+1], bit00224466[2*j]);
36        p2s_step(iBuilder, bit11115555[j], bit33337777[j],iBuilder->simd_himask(4), 2, bit11335577[2*j+1], bit11335577[2*j]);
37    }
38    for (unsigned j = 0; j<4; j++) {
39        p2s_step(iBuilder, bit00224466[j], bit11335577[j], iBuilder->simd_himask(2), 1, s[2*j+1], s[2*j]);
40    }
41}
42               
43void p2sKernel::generateDoBlockMethod() {
44    auto savePoint = iBuilder->saveIP();
45    Module * m = iBuilder->getModule();
46   
47    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
48   
49    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
50   
51    Value * self = getParameter(doBlockFunction, "self");
52    Value * blockNo = getScalarField(self, blockNoScalar);
53    Value * basisBitsBlock_ptr = getStreamSetBlockPtr(self, "basisBits", blockNo);
54    Value * byteStreamBlock_ptr = getStreamSetBlockPtr(self, "byteStream", blockNo);
55
56    Value * p_bitblock[8];
57    for (unsigned i = 0; i < 8; i++) {
58        p_bitblock[i] = iBuilder->CreateBlockAlignedLoad(basisBitsBlock_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
59    }
60    Value * s_bytepack[8];
61    p2s(iBuilder, p_bitblock, s_bytepack);
62    for (unsigned j = 0; j < 8; ++j) {
63        iBuilder->CreateBlockAlignedStore(s_bytepack[j], byteStreamBlock_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(0), iBuilder->getInt32(j)});
64    }
65    iBuilder->CreateRetVoid();
66    iBuilder->restoreIP(savePoint);
67}
68       
69void p2sKernel_withCompressedOutput::generateDoBlockMethod() {
70    auto savePoint = iBuilder->saveIP();
71    Module * m = iBuilder->getModule();
72    Type * i8PtrTy = iBuilder->getInt8PtrTy(); 
73    Type * i32 = iBuilder->getIntNTy(32); 
74    Type * bitBlockPtrTy = llvm::PointerType::get(iBuilder->getBitBlockType(), 0); 
75   
76    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
77   
78    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
79    Value * self = getParameter(doBlockFunction, "self");
80    Value * blockNo = getScalarField(self, blockNoScalar);
81    Value * basisBitsBlock_ptr = getStreamSetBlockPtr(self, "basisBits", blockNo);
82    Value * delCountBlock_ptr = getStreamSetBlockPtr(self, "deletionCounts", blockNo);
83    Value * byteStreamBlock_ptr = getStreamSetBlockPtr(self, "byteStream", blockNo);
84   
85    Value * p_bitblock[8];
86    for (unsigned i = 0; i < 8; i++) {
87        p_bitblock[i] = iBuilder->CreateBlockAlignedLoad(basisBitsBlock_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
88    }
89    Value * s_bytepack[8];
90    p2s(iBuilder, p_bitblock, s_bytepack);
91   
92    unsigned units_per_register = iBuilder->getBitBlockWidth()/8;
93   
94    Value * unit_counts = iBuilder->fwCast(units_per_register, iBuilder->CreateBlockAlignedLoad(delCountBlock_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(0)}));
95   
96    Value * output_ptr = iBuilder->CreateBitCast(byteStreamBlock_ptr, i8PtrTy);
97    Value * offset = ConstantInt::get(i32, 0);
98   
99    for (unsigned j = 0; j < 8; ++j) {
100        iBuilder->CreateAlignedStore(s_bytepack[j], iBuilder->CreateBitCast(iBuilder->CreateGEP(output_ptr, offset), bitBlockPtrTy), 1);
101        offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unit_counts, iBuilder->getInt32(j)), i32);
102    }
103    iBuilder->CreateRetVoid();
104    iBuilder->restoreIP(savePoint);
105}
106
107void p2s_16Kernel::generateDoBlockMethod() {
108    auto savePoint = iBuilder->saveIP();
109    Module * m = iBuilder->getModule();
110   
111    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
112   
113    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
114    Value * self = getParameter(doBlockFunction, "self");
115    Value * blockNo = getScalarField(self, blockNoScalar);
116    Value * basisBitsBlock_ptr = getStreamSetBlockPtr(self, "basisBits", blockNo);
117    Value * i16StreamBlock_ptr = getStreamSetBlockPtr(self, "i16Stream", blockNo);
118   
119    Value * hi_input[8];
120    for (unsigned j = 0; j < 8; ++j) {
121        hi_input[j] = iBuilder->CreateBlockAlignedLoad(basisBitsBlock_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(j)});
122    }
123    Value * hi_bytes[8];
124    p2s(iBuilder, hi_input, hi_bytes);
125   
126    Value * lo_input[8];
127    for (unsigned j = 0; j < 8; ++j) {
128        lo_input[j] = iBuilder->CreateBlockAlignedLoad(basisBitsBlock_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(j+8)});
129    }
130    Value * lo_bytes[8];
131    p2s(iBuilder, lo_input, lo_bytes);
132   
133    for (unsigned j = 0; j < 8; ++j) {
134        Value * merge0 = iBuilder->bitCast(iBuilder->esimd_mergel(8, hi_bytes[j], lo_bytes[j]));
135        Value * merge1 = iBuilder->bitCast(iBuilder->esimd_mergeh(8, hi_bytes[j], lo_bytes[j]));
136        // iBuilder->getInt32(0),
137        iBuilder->CreateBlockAlignedStore(merge0, i16StreamBlock_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(2*j)});
138        iBuilder->CreateBlockAlignedStore(merge1, i16StreamBlock_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(2*j+1)});
139    }
140    iBuilder->CreateRetVoid();
141    iBuilder->restoreIP(savePoint);
142}
143
144void p2s_16Kernel_withCompressedOutput::generateDoBlockMethod() {
145    auto savePoint = iBuilder->saveIP();
146    Module * m = iBuilder->getModule();
147    Type * i32 = iBuilder->getIntNTy(32);
148    Type * bitBlockPtrTy = llvm::PointerType::get(iBuilder->getBitBlockType(), 0);
149
150    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
151
152    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
153    Constant * stride = ConstantInt::get(iBuilder->getSizeTy(), iBuilder->getStride());
154
155    Value * self = getParameter(doBlockFunction, "self");
156    Value * blockNo = getScalarField(self, blockNoScalar);
157    Value * basisBitsBlock_ptr = getStreamSetBlockPtr(self, "basisBits", blockNo);
158    Value * delCountBlock_ptr = getStreamSetBlockPtr(self, "deletionCounts", blockNo);
159    Value * i16UnitsGenerated = getProducedItemCount(self); // units generated to buffer
160    Value * i16BlockNo = iBuilder->CreateUDiv(i16UnitsGenerated, stride);
161
162    Value * i16StreamBase_ptr = iBuilder->CreateBitCast(getStreamSetBlockPtr(self, "i16Stream", i16BlockNo), PointerType::get(iBuilder->getInt16Ty(), 0));
163
164    Value * u16_output_ptr = iBuilder->CreateGEP(i16StreamBase_ptr, iBuilder->CreateURem(i16UnitsGenerated, stride));
165
166
167    Value * hi_input[8];
168    for (unsigned j = 0; j < 8; ++j) {
169        hi_input[j] = iBuilder->CreateBlockAlignedLoad(basisBitsBlock_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(j)});
170    }
171    Value * hi_bytes[8];
172    p2s(iBuilder, hi_input, hi_bytes);
173
174    Value * lo_input[8];
175    for (unsigned j = 0; j < 8; ++j) {
176        lo_input[j] = iBuilder->CreateBlockAlignedLoad(basisBitsBlock_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(j+8)});
177    }
178    Value * lo_bytes[8];
179    p2s(iBuilder, lo_input, lo_bytes);
180
181    const auto UTF_16_units_per_register = iBuilder->getBitBlockWidth() / 16;
182
183    Value * unit_counts = iBuilder->fwCast(UTF_16_units_per_register, iBuilder->CreateBlockAlignedLoad(delCountBlock_ptr, {iBuilder->getInt32(0), iBuilder->getInt32(0)}));
184
185    Value * offset = ConstantInt::get(i32, 0);
186
187    for (unsigned j = 0; j < 8; ++j) {
188        Value * merge0 = iBuilder->bitCast(iBuilder->esimd_mergel(8, hi_bytes[j], lo_bytes[j]));
189        Value * merge1 = iBuilder->bitCast(iBuilder->esimd_mergeh(8, hi_bytes[j], lo_bytes[j]));
190        iBuilder->CreateAlignedStore(merge0, iBuilder->CreateBitCast(iBuilder->CreateGEP(u16_output_ptr, offset), bitBlockPtrTy), 1);
191        offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unit_counts, iBuilder->getInt32(2*j)), i32);
192        iBuilder->CreateAlignedStore(merge1, iBuilder->CreateBitCast(iBuilder->CreateGEP(u16_output_ptr, offset), bitBlockPtrTy), 1);
193        offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unit_counts, iBuilder->getInt32(2*j+1)), i32);
194    }
195
196    i16UnitsGenerated = iBuilder->CreateAdd(i16UnitsGenerated, iBuilder->CreateZExt(offset, iBuilder->getSizeTy()));
197    setProducedItemCount(self, i16UnitsGenerated);
198    iBuilder->CreateRetVoid();
199    iBuilder->restoreIP(savePoint);
200}
201
202void p2s_16Kernel_withCompressedOutput::generateFinalBlockMethod() {
203    auto savePoint = iBuilder->saveIP();
204    Module * m = iBuilder->getModule();
205    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
206    Function * finalBlockFunction = m->getFunction(mKernelName + finalBlock_suffix);
207    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "fb_entry", finalBlockFunction, 0));
208    // Final Block arguments: self, remaining, then the standard DoBlock args.
209    Function::arg_iterator args = finalBlockFunction->arg_begin();
210    Value * self = &*(args++);
211    /* Skip "remaining" arg */ args++;
212    std::vector<Value *> doBlockArgs = {self};
213    while (args != finalBlockFunction->arg_end()){
214        doBlockArgs.push_back(&*args++);
215    }
216    Value * i16UnitsGenerated = getProducedItemCount(self); // units generated to buffer
217
218    iBuilder->CreateCall(doBlockFunction, doBlockArgs);
219    i16UnitsGenerated = getProducedItemCount(self); // units generated to buffer
220    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
221        Value * ssStructPtr = getStreamSetStructPtr(self, mStreamSetOutputs[i].name);
222        Value * producerPosPtr = mStreamSetOutputBuffers[i]->getProducerPosPtr(ssStructPtr);
223        iBuilder->CreateAtomicStoreRelease(i16UnitsGenerated, producerPosPtr);
224    }
225    iBuilder->CreateRetVoid();
226    iBuilder->restoreIP(savePoint);
227}
228   
229   
230}
Note: See TracBrowser for help on using the repository browser.