source: icGREP/icgrep-devel/icgrep/kernels/p2s_kernel.cpp @ 5260

Last change on this file since 5260 was 5260, checked in by nmedfort, 9 months ago

Changes working towards simplifying access to stream elements, plus some modifications to simplify includes / forward declarations within the CodeGen library.

File size: 10.3 KB
Line 
1#include "p2s_kernel.h"
2#include "kernels/kernel.h"
3#include "IR_Gen/idisa_builder.h"
4#include <llvm/IR/Type.h>
5#include <llvm/IR/Module.h>
6#include <iostream>
7#include <stdint.h>
8#include <llvm/Support/FileSystem.h>
9#include <llvm/Support/raw_ostream.h>
10
11using namespace llvm;
12
13namespace kernel{
14       
15void p2s_step(IDISA::IDISA_Builder * iBuilder, Value * p0, Value * p1, Value * hi_mask, unsigned shift, Value * &s1, Value * &s0) {
16    Value * t0 = iBuilder->simd_if(1, hi_mask, p0, iBuilder->simd_srli(16, p1, shift));
17    Value * t1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, p0, shift), p1);
18    s1 = iBuilder->esimd_mergeh(8, t1, t0);
19    s0 = iBuilder->esimd_mergel(8, t1, t0);
20}
21
22inline void p2s(IDISA::IDISA_Builder * iBuilder, Value * p[], Value * s[]) {
23    Value * bit00004444[2];
24    Value * bit22226666[2];
25    Value * bit11115555[2];
26    Value * bit33337777[2];
27    p2s_step(iBuilder, p[0], p[4], iBuilder->simd_himask(8), 4, bit00004444[1], bit00004444[0]);
28    p2s_step(iBuilder, p[1], p[5], iBuilder->simd_himask(8), 4, bit11115555[1], bit11115555[0]);
29    p2s_step(iBuilder, p[2], p[6], iBuilder->simd_himask(8), 4, bit22226666[1], bit22226666[0]);
30    p2s_step(iBuilder, p[3], p[7], iBuilder->simd_himask(8), 4, bit33337777[1], bit33337777[0]);
31
32    Value * bit00224466[4];
33    Value * bit11335577[4];
34    for (unsigned j = 0; j<2; j++) {
35        p2s_step(iBuilder, bit00004444[j], bit22226666[j],iBuilder->simd_himask(4), 2, bit00224466[2*j+1], bit00224466[2*j]);
36        p2s_step(iBuilder, bit11115555[j], bit33337777[j],iBuilder->simd_himask(4), 2, bit11335577[2*j+1], bit11335577[2*j]);
37    }
38    for (unsigned j = 0; j<4; j++) {
39        p2s_step(iBuilder, bit00224466[j], bit11335577[j], iBuilder->simd_himask(2), 1, s[2*j+1], s[2*j]);
40    }
41}
42               
43void P2SKernel::generateDoBlockMethod() const {
44    auto savePoint = iBuilder->saveIP();
45    Module * m = iBuilder->getModule();
46   
47    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
48   
49    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
50   
51    Value * self = getParameter(doBlockFunction, "self");
52    Value * blockNo = getScalarField(self, blockNoScalar);
53    Value * p_bitblock[8];
54    for (unsigned i = 0; i < 8; i++) {
55        Value * ptr = getStream(self, "basisBits", blockNo, iBuilder->getInt32(i));
56        p_bitblock[i] = iBuilder->CreateBlockAlignedLoad(ptr);
57    }
58    Value * s_bytepack[8];
59    p2s(iBuilder, p_bitblock, s_bytepack);
60    for (unsigned j = 0; j < 8; ++j) {
61        Value * ptr = getStream(self, "byteStream", blockNo, iBuilder->getInt32(0), iBuilder->getInt32(j));
62        iBuilder->CreateBlockAlignedStore(s_bytepack[j], ptr);
63    }
64    iBuilder->CreateRetVoid();
65    iBuilder->restoreIP(savePoint);
66}
67
68void P2SKernelWithCompressedOutput::generateDoBlockMethod() const {
69    auto savePoint = iBuilder->saveIP();
70    Module * m = iBuilder->getModule();
71    Type * i8PtrTy = iBuilder->getInt8PtrTy();
72    Type * i32 = iBuilder->getIntNTy(32);
73    Type * bitBlockPtrTy = llvm::PointerType::get(iBuilder->getBitBlockType(), 0);
74
75    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
76
77    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
78    Value * self = getParameter(doBlockFunction, "self");
79    Value * blockNo = getScalarField(self, blockNoScalar);
80
81
82
83    Value * basisBits[8];
84    for (unsigned i = 0; i < 8; i++) {
85        Value * basisBitsBlock_ptr = getStream(self, "basisBits", blockNo, iBuilder->getInt32(i));
86        basisBits[i] = iBuilder->CreateBlockAlignedLoad(basisBitsBlock_ptr);
87    }
88    Value * bytePack[8];
89    p2s(iBuilder, basisBits, bytePack);
90
91    unsigned units_per_register = iBuilder->getBitBlockWidth()/8;
92    Value * delCountBlock_ptr = getStream(self, "deletionCounts", blockNo, iBuilder->getInt32(0));
93    Value * unit_counts = iBuilder->fwCast(units_per_register, iBuilder->CreateBlockAlignedLoad(delCountBlock_ptr));
94
95    Value * output_ptr = getStreamView(i8PtrTy, self, "byteStream", blockNo, iBuilder->getInt32(0));
96    Value * offset = iBuilder->getInt32(0);
97    for (unsigned j = 0; j < 8; ++j) {
98        iBuilder->CreateAlignedStore(bytePack[j], iBuilder->CreateBitCast(iBuilder->CreateGEP(output_ptr, offset), bitBlockPtrTy), 1);
99        offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unit_counts, iBuilder->getInt32(j)), i32);
100    }
101    iBuilder->CreateRetVoid();
102    iBuilder->restoreIP(savePoint);
103}
104
105void P2S16Kernel::generateDoBlockMethod() const {
106    auto savePoint = iBuilder->saveIP();
107    Module * m = iBuilder->getModule();
108   
109    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
110   
111    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
112    Value * self = getParameter(doBlockFunction, "self");
113    Value * blockNo = getScalarField(self, blockNoScalar);   
114   
115    Value * hi_input[8];
116    for (unsigned j = 0; j < 8; ++j) {
117        Value * ptr = getStream(self, "basisBits", blockNo, iBuilder->getInt32(0), iBuilder->getInt32(j));
118        hi_input[j] = iBuilder->CreateBlockAlignedLoad(ptr);
119    }
120    Value * hi_bytes[8];
121    p2s(iBuilder, hi_input, hi_bytes);
122   
123    Value * lo_input[8];
124    for (unsigned j = 0; j < 8; ++j) {
125        Value * ptr = getStream(self, "basisBits", blockNo, iBuilder->getInt32(0), iBuilder->getInt32(j + 8));
126        lo_input[j] = iBuilder->CreateBlockAlignedLoad(ptr);
127    }
128    Value * lo_bytes[8];
129    p2s(iBuilder, lo_input, lo_bytes);
130   
131    for (unsigned j = 0; j < 8; ++j) {
132        Value * merge0 = iBuilder->bitCast(iBuilder->esimd_mergel(8, hi_bytes[j], lo_bytes[j]));
133        Value * merge1 = iBuilder->bitCast(iBuilder->esimd_mergeh(8, hi_bytes[j], lo_bytes[j]));
134        Value * ptr0 = getStream(self, "i16Stream", blockNo, iBuilder->getInt32(2 * j));
135        iBuilder->CreateBlockAlignedStore(merge0, ptr0);
136        Value * ptr1 = getStream(self, "i16Stream", blockNo, iBuilder->getInt32(2 * j + 1));
137        iBuilder->CreateBlockAlignedStore(merge1, ptr1);
138    }
139    iBuilder->CreateRetVoid();
140    iBuilder->restoreIP(savePoint);
141}
142
143void P2S16KernelWithCompressedOutput::generateDoBlockMethod() const {
144    auto savePoint = iBuilder->saveIP();
145    Module * m = iBuilder->getModule();
146    Type * i32 = iBuilder->getIntNTy(32);
147    Type * bitBlockPtrTy = llvm::PointerType::get(iBuilder->getBitBlockType(), 0);
148
149    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
150
151    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "entry", doBlockFunction, 0));
152
153    Value * self = getParameter(doBlockFunction, "self");
154    Value * blockNo = getScalarField(self, blockNoScalar);
155
156    Value * hi_input[8];
157    for (unsigned j = 0; j < 8; ++j) {
158        Value * ptr = getStream(self, "basisBits", blockNo, iBuilder->getInt32(j));
159        hi_input[j] = iBuilder->CreateBlockAlignedLoad(ptr);
160    }
161    Value * hi_bytes[8];
162    p2s(iBuilder, hi_input, hi_bytes);
163
164    Value * lo_input[8];
165    for (unsigned j = 0; j < 8; ++j) {
166        Value * ptr = getStream(self, "basisBits", blockNo, iBuilder->getInt32(j + 8));
167        lo_input[j] = iBuilder->CreateBlockAlignedLoad(ptr);
168    }
169    Value * lo_bytes[8];
170    p2s(iBuilder, lo_input, lo_bytes);
171
172    Value * delCountBlock_ptr = getStream(self, "deletionCounts", blockNo, iBuilder->getInt32(0));
173    Value * unit_counts = iBuilder->fwCast(iBuilder->getBitBlockWidth() / 16, iBuilder->CreateBlockAlignedLoad(delCountBlock_ptr));
174
175    PointerType * int16PtrTy = PointerType::get(iBuilder->getInt16Ty(), 0);
176    ConstantInt * stride = iBuilder->getSize(iBuilder->getStride());
177    Value * i16UnitsGenerated = getProducedItemCount(self, "i16Stream"); // units generated to buffer
178    Value * i16BlockNo = iBuilder->CreateUDiv(i16UnitsGenerated, stride);
179    Value * u16_output_ptr = getStreamView(int16PtrTy, self, "i16Stream", i16BlockNo, iBuilder->CreateURem(i16UnitsGenerated, stride));
180    Value * offset = ConstantInt::get(i32, 0);
181    for (unsigned j = 0; j < 8; ++j) {
182        Value * merge0 = iBuilder->bitCast(iBuilder->esimd_mergel(8, hi_bytes[j], lo_bytes[j]));
183        Value * merge1 = iBuilder->bitCast(iBuilder->esimd_mergeh(8, hi_bytes[j], lo_bytes[j]));
184        iBuilder->CreateAlignedStore(merge0, iBuilder->CreateBitCast(iBuilder->CreateGEP(u16_output_ptr, offset), bitBlockPtrTy), 1);
185        offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unit_counts, iBuilder->getInt32(2 * j)), i32);
186        iBuilder->CreateAlignedStore(merge1, iBuilder->CreateBitCast(iBuilder->CreateGEP(u16_output_ptr, offset), bitBlockPtrTy), 1);
187        offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unit_counts, iBuilder->getInt32(2 * j + 1)), i32);
188    }
189    i16UnitsGenerated = iBuilder->CreateAdd(i16UnitsGenerated, iBuilder->CreateZExt(offset, iBuilder->getSizeTy()));
190    setProducedItemCount(self, "i16Stream", i16UnitsGenerated);
191    iBuilder->CreateRetVoid();
192    iBuilder->restoreIP(savePoint);
193}
194
195void P2S16KernelWithCompressedOutput::generateFinalBlockMethod() const {
196    auto savePoint = iBuilder->saveIP();
197    Module * m = iBuilder->getModule();
198    Function * doBlockFunction = m->getFunction(mKernelName + doBlock_suffix);
199    Function * finalBlockFunction = m->getFunction(mKernelName + finalBlock_suffix);
200    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "fb_entry", finalBlockFunction, 0));
201    // Final Block arguments: self, remaining, then the standard DoBlock args.
202    Function::arg_iterator args = finalBlockFunction->arg_begin();
203    Value * self = &*(args++);
204    /* Skip "remaining" arg */ args++;
205    std::vector<Value *> doBlockArgs = {self};
206    while (args != finalBlockFunction->arg_end()){
207        doBlockArgs.push_back(&*args++);
208    }
209    Value * i16UnitsGenerated = getProducedItemCount(self, "i16Stream"); // units generated to buffer
210    iBuilder->CreateCall(doBlockFunction, doBlockArgs);
211    i16UnitsGenerated = getProducedItemCount(self, "i16Stream"); // units generated to buffer
212    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
213        Value * ssStructPtr = getStreamSetStructPtr(self, mStreamSetOutputs[i].name);
214        Value * producerPosPtr = mStreamSetOutputBuffers[i]->getProducerPosPtr(ssStructPtr);
215        iBuilder->CreateAtomicStoreRelease(i16UnitsGenerated, producerPosPtr);
216    }
217    iBuilder->CreateRetVoid();
218    iBuilder->restoreIP(savePoint);
219}
220   
221}
Note: See TracBrowser for help on using the repository browser.