source: icGREP/icgrep-devel/icgrep/kernels/p2s_kernel.cpp @ 5040

Last change on this file since 5040 was 5040, checked in by cameron, 3 years ago

Buffered mode for u8u16 output

File size: 7.3 KB
Line 
1#include "p2s_kernel.h"
2#include "kernels/kernel.h"
3#include "IDISA/idisa_builder.h"
4#include <llvm/IR/TypeBuilder.h>
5#include <llvm/IR/Type.h>
6#include <iostream>
7#include <stdint.h>
8#include <llvm/Support/FileSystem.h>
9#include <llvm/Support/raw_ostream.h>
10
11
12
13extern "C" {
14    void buffered_write(const char * ptr, size_t bytes) {
15        outs().write(ptr, bytes);
16    }
17};
18
19namespace kernel{
20       
21void p2s_step(IDISA::IDISA_Builder * iBuilder, Value * p0, Value * p1, Value * hi_mask, unsigned shift, Value * &s1, Value * &s0) {
22    Value * t0 = iBuilder->simd_if(1, hi_mask, p0, iBuilder->simd_srli(16, p1, shift));
23    Value * t1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, p0, shift), p1);
24    s1 = iBuilder->esimd_mergeh(8, t1, t0);
25    s0 = iBuilder->esimd_mergel(8, t1, t0);
26}
27
28inline void p2s(IDISA::IDISA_Builder * iBuilder, Value * p[], Value * s[]) {
29    Value * bit00004444[2];
30    Value * bit22226666[2];
31    Value * bit11115555[2];
32    Value * bit33337777[2];
33    p2s_step(iBuilder, p[0], p[4], iBuilder->simd_himask(8), 4, bit00004444[1], bit00004444[0]);
34    p2s_step(iBuilder, p[1], p[5], iBuilder->simd_himask(8), 4, bit11115555[1], bit11115555[0]);
35    p2s_step(iBuilder, p[2], p[6], iBuilder->simd_himask(8), 4, bit22226666[1], bit22226666[0]);
36    p2s_step(iBuilder, p[3], p[7], iBuilder->simd_himask(8), 4, bit33337777[1], bit33337777[0]);
37
38    Value * bit00224466[4];
39    Value * bit11335577[4];
40    for (unsigned j = 0; j<2; j++) {
41        p2s_step(iBuilder, bit00004444[j], bit22226666[j],iBuilder->simd_himask(4), 2, bit00224466[2*j+1], bit00224466[2*j]);
42        p2s_step(iBuilder, bit11115555[j], bit33337777[j],iBuilder->simd_himask(4), 2, bit11335577[2*j+1], bit11335577[2*j]);
43    }
44    for (unsigned j = 0; j<4; j++) {
45        p2s_step(iBuilder, bit00224466[j], bit11335577[j], iBuilder->simd_himask(2), 1, s[2*j+1], s[2*j]);
46    }
47}
48               
49void generateP2SKernel(Module * m, IDISA::IDISA_Builder * iBuilder, KernelBuilder * kBuilder) {
50    for (unsigned i = 0; i < 8; ++i) {
51        kBuilder->addInputStream(1);
52    }
53    kBuilder->addOutputStream(8);
54    kBuilder->prepareFunction();
55    Value * input[8];
56    for (unsigned j = 0; j < 8; ++j) {
57        input[j] = iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(j));
58    }
59    Value * output[8];
60    p2s(iBuilder, input, output);
61    Value * output_ptr = kBuilder->getOutputStream(0);
62    for (unsigned j = 0; j < 8; ++j) {
63
64        iBuilder->CreateBlockAlignedStore(output[j], iBuilder->CreateGEP(output_ptr, std::vector<Value *>({ iBuilder->getInt32(0), iBuilder->getInt32(j) })));
65    }
66    kBuilder->finalize();
67}
68
69void generateP2S_16Kernel(Module * m, IDISA::IDISA_Builder * iBuilder, KernelBuilder * kBuilder) {
70    for (unsigned i = 0; i < 16; ++i) {
71        kBuilder->addInputStream(1);
72    }
73    kBuilder->addOutputStream(16);
74    kBuilder->prepareFunction();
75    Value * hi_input[8];
76    for (unsigned j = 0; j < 8; ++j) {
77        hi_input[j] = iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(j));
78    }
79    Value * hi_bytes[8];
80    p2s(iBuilder, hi_input, hi_bytes);
81   
82    Value * lo_input[8];
83    for (unsigned j = 0; j < 8; ++j) {
84        lo_input[j] = iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(j+8));
85    }
86    Value * lo_bytes[8];
87    p2s(iBuilder, lo_input, lo_bytes);
88   
89    Value * output_ptr = kBuilder->getOutputStream(0);
90    for (unsigned j = 0; j < 8; ++j) {
91        Value * merge0 = iBuilder->bitCast(iBuilder->esimd_mergel(8, hi_bytes[j], lo_bytes[j]));
92        Value * merge1 = iBuilder->bitCast(iBuilder->esimd_mergeh(8, hi_bytes[j], lo_bytes[j]));
93        iBuilder->CreateBlockAlignedStore(merge0, iBuilder->CreateGEP(output_ptr, std::vector<Value *>({ iBuilder->getInt32(0), iBuilder->getInt32(2*j) })));
94        iBuilder->CreateBlockAlignedStore(merge1, iBuilder->CreateGEP(output_ptr, std::vector<Value *>({ iBuilder->getInt32(0), iBuilder->getInt32(2*j+1) })));
95    }
96    kBuilder->finalize();
97}
98   
99Function * create_write(Module * const mod) {
100    Function * write = mod->getFunction("write");
101    if (write == nullptr) {
102        FunctionType *write_type =
103        TypeBuilder<long(int, char *, long), false>::get(mod->getContext());
104        write = cast<Function>(mod->getOrInsertFunction("write", write_type,
105                                                        AttributeSet().addAttribute(mod->getContext(), 2U, Attribute::NoAlias)));
106    }
107    return write;
108}
109
// Buffer size, in bytes (64 KiB), that generateP2S_16_withCompressedOutputKernel
// passes to outs().SetBufferSize().
const size_t OutputBufferSize = 64 * 1024;
111
112void generateP2S_16_withCompressedOutputKernel(Module * m, IDISA::IDISA_Builder * iBuilder, KernelBuilder * kBuilder) {
113    outs().SetBufferSize(OutputBufferSize);
114    for (unsigned i = 0; i < 16; ++i) {
115        kBuilder->addInputStream(1);
116    }       
117    kBuilder->addInputStream(1);  // partial popcounts
118    kBuilder->addOutputStream(16);
119
120    kBuilder->prepareFunction();
121   
122    Type * i8PtrTy = iBuilder->getInt8PtrTy(); 
123    Type * i64 = iBuilder->getIntNTy(64); 
124    Type * bitBlockPtrTy = llvm::PointerType::get(iBuilder->getBitBlockType(), 0); 
125   
126    Function * writefn = cast<Function>(m->getOrInsertFunction("buffered_write", iBuilder->getVoidTy(), i8PtrTy, i64, nullptr));
127   
128    Value * hi_input[8];
129    for (unsigned j = 0; j < 8; ++j) {
130        hi_input[j] = iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(j));
131    }
132    Value * hi_bytes[8];
133    p2s(iBuilder, hi_input, hi_bytes);
134   
135    Value * lo_input[8];
136    for (unsigned j = 0; j < 8; ++j) {
137        lo_input[j] = iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(j+8));
138    }
139    Value * lo_bytes[8];
140    p2s(iBuilder, lo_input, lo_bytes);
141   
142    unsigned UTF_16_units_per_register = iBuilder->getBitBlockWidth()/16;
143   
144    Value * partial_counts = iBuilder->fwCast(UTF_16_units_per_register, iBuilder->CreateBlockAlignedLoad(kBuilder->getInputStream(16)));
145    if (UTF_16_units_per_register < 16) {
146        partial_counts = iBuilder->CreateZExt(partial_counts, VectorType::get(iBuilder->getIntNTy(16), iBuilder->getBitBlockWidth()/UTF_16_units_per_register));
147    }
148    Value * byte_counts = iBuilder->CreateAdd(partial_counts, partial_counts); // double the code unit count to get byte counts
149   
150    Value * output_ptr = iBuilder->CreateBitCast(kBuilder->getOutputStream(0), i8PtrTy);
151    Value * byte_offset = ConstantInt::get(i64, 0);
152   
153    for (unsigned j = 0; j < 8; ++j) {
154        Value * merge0 = iBuilder->bitCast(iBuilder->esimd_mergel(8, hi_bytes[j], lo_bytes[j]));
155        Value * merge1 = iBuilder->bitCast(iBuilder->esimd_mergeh(8, hi_bytes[j], lo_bytes[j]));
156        //iBuilder->CallPrintRegister("merge0", merge0);
157        iBuilder->CreateAlignedStore(merge0, iBuilder->CreateBitCast(iBuilder->CreateGEP(output_ptr, byte_offset), bitBlockPtrTy), 1);
158        byte_offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(byte_counts, iBuilder->getInt32(2*j)), i64);
159        //iBuilder->CallPrintInt("byte_offset", byte_offset);
160        iBuilder->CreateAlignedStore(merge1, iBuilder->CreateBitCast(iBuilder->CreateGEP(output_ptr, byte_offset), bitBlockPtrTy), 1);
161        //iBuilder->CallPrintRegister("merge1", merge1);
162        byte_offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(byte_counts, iBuilder->getInt32(2*j+1)), i64);
163        //iBuilder->CallPrintInt("byte_offset", byte_offset);
164    }
165    iBuilder->CreateCall(writefn, std::vector<Value *>({output_ptr, byte_offset}));
166   
167    kBuilder->finalize();
168}
169
170}
Note: See TracBrowser for help on using the repository browser.