source: icGREP/icgrep-devel/icgrep/u8u16.cpp @ 6199

Last change on this file since 6199 was 6199, checked in by cameron, 7 months ago

Fix FieldCompress kernels to have a user-settable field width; update to use extraction method

File size: 16.0 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <IR_Gen/idisa_target.h>                   // for GetIDISA_Builder
8#include <cc/alphabet.h>
9#include <cc/cc_compiler.h>                        // for CC_Compiler
10#include <kernels/pipeline_builder.h>
11#include <kernels/deletion.h>                      // for DeletionKernel
12#include <kernels/swizzle.h>                      // for DeletionKernel
13#include <kernels/source_kernel.h>
14#include <kernels/p2s_kernel.h>                    // for P2S16KernelWithCom...
15#include <kernels/s2p_kernel.h>                    // for S2PKernel
16#include <kernels/stdout_kernel.h>                 // for StdOutKernel_
17#include <llvm/ExecutionEngine/ExecutionEngine.h>  // for ExecutionEngine
18#include <llvm/IR/Function.h>                      // for Function, Function...
19#include <llvm/IR/Module.h>                        // for Module
20#include <llvm/IR/Verifier.h>                      // for verifyModule
21#include <llvm/Support/CommandLine.h>              // for ParseCommandLineOp...
22#include <llvm/Support/Debug.h>                    // for dbgs
23#include <pablo/pablo_kernel.h>                    // for PabloKernel
24#include <pablo/pablo_toolchain.h>                 // for pablo_function_passes
25#include <kernels/kernel_builder.h>
26#include <pablo/pe_zeroes.h>
27#include <toolchain/toolchain.h>
28#include <toolchain/cpudriver.h>
29#include <kernels/streamset.h>
30#include <llvm/ADT/StringRef.h>
31#include <llvm/IR/CallingConv.h>
32#include <llvm/IR/DerivedTypes.h>
33#include <llvm/IR/LLVMContext.h>
34#include <llvm/IR/Value.h>
35#include <llvm/Support/Compiler.h>
36#include <pablo/builder.hpp>
37#include <boost/interprocess/anonymous_shared_memory.hpp>
38#include <boost/interprocess/mapped_region.hpp>
39#include <iostream>
40
41using namespace pablo;
42using namespace kernel;
43using namespace llvm;
44using namespace codegen;
45
46static cl::OptionCategory u8u16Options("u8u16 Options", "Transcoding control options.");
47static cl::opt<std::string> inputFile(cl::Positional, cl::desc("<input file>"), cl::Required, cl::cat(u8u16Options));
48static cl::opt<std::string> outputFile(cl::Positional, cl::desc("<output file>"), cl::cat(u8u16Options));
49static cl::opt<bool> enableAVXdel("enable-AVX-deletion", cl::desc("Enable AVX2 deletion algorithms."), cl::cat(u8u16Options));
50static cl::opt<bool> mMapBuffering("mmap-buffering", cl::desc("Enable mmap buffering."), cl::cat(u8u16Options));
51static cl::opt<bool> memAlignBuffering("memalign-buffering", cl::desc("Enable posix_memalign buffering."), cl::cat(u8u16Options));
52
53inline bool useAVX2() {
54    return enableAVXdel && AVX2_available() && codegen::BlockSize == 256;
55}
56
57class U8U16Kernel final: public pablo::PabloKernel {
58public:
59    U8U16Kernel(const std::unique_ptr<kernel::KernelBuilder> & b, StreamSet * BasisBits, StreamSet * u8bits, StreamSet * DelMask);
60    bool isCachable() const override { return true; }
61    bool hasSignature() const override { return false; }
62    void generatePabloMethod() override;
63};
64
65U8U16Kernel::U8U16Kernel(const std::unique_ptr<kernel::KernelBuilder> & b, StreamSet * BasisBits, StreamSet * u8bits, StreamSet * selectors)
66: PabloKernel(b, "u8u16",
67// input
68{Binding{"u8bit", BasisBits}},
69// outputs
70{Binding{"u16bit", u8bits},
71 Binding{"selectors", selectors}}) {
72
73}
74
// Builds the Pablo logic that converts the 8 UTF-8 basis bit streams of the
// "u8bit" input into 16 UTF-16 bit streams written to "u16bit" (u16_hi in
// output streams 0-7, u16_lo in streams 8-15) plus a "selectors" stream that
// is the complement of delmask.  Multi-byte handling is nested in three
// scopes attached with createIf: nonASCII (nAb), pfx34 (p34b) and pfx4 (p4b).
// The UTF-16 bits stay indexed by UTF-8 byte position; nothing is deleted here.
75void U8U16Kernel::generatePabloMethod() {
76    PabloBuilder main(getEntryScope());
77    Zeroes * zeroes = main.createZeroes();
78   
79    //  input: 8 basis bit streams
80    std::vector<PabloAST *> u8_bits = getInputStreamSet("u8bit");
81
82    //  output: 16 u8-indexed streams + delmask stream.  An error stream
    //  (error_mask) is also computed below, but nothing in this method
    //  writes it to an output.
83    Var * u16_hi[8];
84    for (int i = 0; i < 8; ++i) {
85        u16_hi[i] = main.createVar("u16_hi" + std::to_string(i), zeroes);
86    }
87    Var * u16_lo[8];
88    for (int i = 0; i < 8; ++i) {
89        u16_lo[i] = main.createVar("u16_lo" + std::to_string(i), zeroes);
90    }
91   
92    Var * delmask = main.createVar("delmask", zeroes);
93    Var * error_mask = main.createVar("error_mask", zeroes);
94
    // Character-class compiler over the input basis bits, using big-endian
    // bit numbering (u8_bits[0] is the most significant bit of each byte).
95    cc::Parabix_CC_Compiler ccc(getEntryScope(), u8_bits, cc::BitNumbering::BigEndian);
96
97    // The logic for processing non-ASCII bytes will be embedded within an if-hierarchy.
98    PabloAST * nonASCII = ccc.compileCC(re::makeByte(0x80, 0xFF));
99
100    // Builder for the if statement handling all non-ASCII logic
101    auto nAb = main.createScope();
102    // Bits 3 through 7 of a 2-byte prefix are data bits, needed to
103    // produce the UTF-16 code unit data ...,
    // so each is advanced by one position to line up with the following byte.
104    PabloAST * bit3a1 = nAb.createAdvance(u8_bits[3], 1);
105    PabloAST * bit4a1 = nAb.createAdvance(u8_bits[4], 1);
106    PabloAST * bit5a1 = nAb.createAdvance(u8_bits[5], 1);
107    PabloAST * bit6a1 = nAb.createAdvance(u8_bits[6], 1);
108    PabloAST * bit7a1 = nAb.createAdvance(u8_bits[7], 1);
109
110    // Entry condition for 3 or 4 byte sequences: we have a prefix byte in the range 0xE0-0xFF.
111    PabloAST * pfx34 = ccc.compileCC(re::makeByte(0xE0, 0xFF), nAb);
112    // Builder for the if statement handling all logic for 3- and 4-byte sequences.
113    auto p34b = nAb.createScope();
114    // Bits 4 through 7 of a 3-byte prefix are data bits.  They must be moved
115    // to the final position of the 3-byte sequence.
    // (The "a2" streams are the "a1" streams advanced one further position.)
116    PabloAST * bit2a1 = p34b.createAdvance(u8_bits[2], 1);
117    PabloAST * bit4a2 = p34b.createAdvance(bit4a1, 1);
118    PabloAST * bit5a2 = p34b.createAdvance(bit5a1, 1);
119    PabloAST * bit6a2 = p34b.createAdvance(bit6a1, 1);
120    PabloAST * bit7a2 = p34b.createAdvance(bit7a1, 1);
121
    // Scope markers created in nAb and assigned inside the nested scopes;
    // u8scope33 and u8scope44 are read back in nAb when u8lastscope is formed.
122    Var * const u8scope32 = nAb.createVar("u8scope32", zeroes);
123    Var * const u8scope33 = nAb.createVar("u8scope33", zeroes);
124    Var * const u8scope44 = nAb.createVar("u8scope44", zeroes);
125
126    //
127    // Logic for 4-byte UTF-8 sequences
128    //
129    // Entry condition for 4 byte sequences: we have a prefix byte in the range 0xF0-0xFF.
130    PabloAST * pfx4 = ccc.compileCC(re::makeByte(0xF0, 0xFF), p34b);
131    // Builder for the if statement handling all logic for 4-byte sequences only.
132    auto p4b = p34b.createScope();
133    // Illegal 4-byte sequences
    // F0 followed by 0x80-0x8F, F4 followed by 0x90-0xBF, and any byte in 0xF5-0xFF.
134    PabloAST * F0 = ccc.compileCC(re::makeByte(0xF0), p4b);
135    PabloAST * F4 = ccc.compileCC(re::makeByte(0xF4), p4b);
136    PabloAST * F0_err = p4b.createAnd(p4b.createAdvance(F0, 1), ccc.compileCC(re::makeByte(0x80, 0x8F), p4b));
137    PabloAST * F4_err = p4b.createAnd(p4b.createAdvance(F4, 1), ccc.compileCC(re::makeByte(0x90, 0xBF), p4b));
138    PabloAST * F5_FF = ccc.compileCC(re::makeByte(0xF5, 0xFF), p4b);
139
    // Created in p34b so it can be folded into EX_FX_err after the 4-byte branch.
140    Var * FX_err = p34b.createVar("FX_err", zeroes);
141    p4b.createAssign(FX_err, p4b.createOr(F5_FF, p4b.createOr(F0_err, F4_err)));
142    //
143    // 4-byte prefixes have a scope that extends over the next 3 bytes.
144
145    Var * u8scope42 = p34b.createVar("u8scope42", zeroes);
146    Var * u8scope43 = p34b.createVar("u8scope43", zeroes);
147
148    p4b.createAssign(u8scope42, p4b.createAdvance(pfx4, 1));
149    p4b.createAssign(u8scope43, p4b.createAdvance(u8scope42, 1));
150    p4b.createAssign(u8scope44, p4b.createAdvance(u8scope43, 1));
151    //
152
153    //  From the 4-byte sequence 11110abc 10defghi 10jklmno 10pqrstu,
154    //  we must calculate the value abcde - 1 to produce the bit values
155    //  for u16_hi6, hi7, lo0, lo1 at the scope43 position.
    //  At that position bit2a1/bit3a1 carry d/e (second byte advanced by 1)
    //  and bit6a2/bit7a2 carry b/c (prefix advanced by 2).
    //  These four Vars are created in nAb, where they are read after the branch.
156    Var * s43_lo0 = nAb.createVar("scope43_lo0", zeroes);
157    Var * s43_lo1 = nAb.createVar("scope43_lo1", zeroes);
158    Var * s43_hi6 = nAb.createVar("scope43_hi6", zeroes);
159    Var * s43_hi7 = nAb.createVar("scope43_hi7", zeroes);
160
    // The remaining scope43 bits are created in main because they are read
    // there when the low-byte outputs are assembled.
161    Var * s43_lo2 = main.createVar("scope43_lo2", zeroes);
162    Var * s43_lo3 = main.createVar("scope43_lo3", zeroes);
163    Var * s43_lo4 = main.createVar("scope43_lo4", zeroes);
164    Var * s43_lo5 = main.createVar("scope43_lo5", zeroes);
165    Var * s43_lo6 = main.createVar("scope43_lo6", zeroes);
166    Var * s43_lo7 = main.createVar("scope43_lo7", zeroes);
167
    // Ripple-borrow subtraction of 1, least significant bit (e) first.
168    p4b.createAssign(s43_lo1, p4b.createAnd(u8scope43, p4b.createNot(bit3a1)));           // e - 1
169    p4b.createAssign(s43_lo0, p4b.createAnd(u8scope43, p4b.createXor(bit2a1, s43_lo1)));  // d - borrow
170    PabloAST * brw1 = p4b.createAnd(s43_lo1, p4b.createNot(bit2a1));
171    p4b.createAssign(s43_hi7, p4b.createAnd(u8scope43, p4b.createXor(bit7a2, brw1)));     // c - borrow
172    PabloAST * brw2 = p4b.createAnd(brw1, p4b.createNot(bit7a2));
173    p4b.createAssign(s43_hi6, p4b.createAnd(u8scope43, p4b.createXor(bit6a2, brw2)));     // b - borrow
174    //
    // Remaining low-byte bits at scope43: bits 4-7 of the previous byte
    // (advanced by 1) followed by bits 2-3 of the current byte.
175    p4b.createAssign(s43_lo2, p4b.createAnd(u8scope43, bit4a1));
176    p4b.createAssign(s43_lo3, p4b.createAnd(u8scope43, bit5a1));
177    p4b.createAssign(s43_lo4, p4b.createAnd(u8scope43, bit6a1));
178    p4b.createAssign(s43_lo5, p4b.createAnd(u8scope43, bit7a1));
179    p4b.createAssign(s43_lo6, p4b.createAnd(u8scope43, u8_bits[2]));
180    p4b.createAssign(s43_lo7, p4b.createAnd(u8scope43, u8_bits[3]));
181    //
182    //
    // Attach the 4-byte scope to p34b, guarded by pfx4.
183    p34b.createIf(pfx4, p4b);
184    //
185    // Combined logic for 3 and 4 byte sequences
186    //
187    PabloAST * pfx3 = ccc.compileCC(re::makeByte(0xE0, 0xEF), p34b);
188
189    p34b.createAssign(u8scope32, p34b.createAdvance(pfx3, 1));
190    p34b.createAssign(u8scope33, p34b.createAdvance(u8scope32, 1));
191
192    // Illegal 3-byte sequences
    // E0 followed by 0x80-0x9F and ED followed by 0xA0-0xBF.
193    PabloAST * E0 = ccc.compileCC(re::makeByte(0xE0), p34b);
194    PabloAST * ED = ccc.compileCC(re::makeByte(0xED), p34b);
195    PabloAST * E0_err = p34b.createAnd(p34b.createAdvance(E0, 1), ccc.compileCC(re::makeByte(0x80, 0x9F), p34b));
196    PabloAST * ED_err = p34b.createAnd(p34b.createAdvance(ED, 1), ccc.compileCC(re::makeByte(0xA0, 0xBF), p34b));
197    Var * EX_FX_err = nAb.createVar("EX_FX_err", zeroes);
198
199    p34b.createAssign(EX_FX_err, p34b.createOr(p34b.createOr(E0_err, ED_err), FX_err));
200    // Two surrogate UTF-16 units are computed at the 3rd and 4th positions of 4-byte sequences.
201    PabloAST * surrogate = p34b.createOr(u8scope43, u8scope44);
202
    // p34del: the position right after a 3-byte prefix or a 4-byte prefix;
    // it is OR-ed into delmask below.
203    Var * p34del = nAb.createVar("p34del", zeroes);
204    p34b.createAssign(p34del, p34b.createOr(u8scope32, u8scope42));
205
206
207    // The high 5 bits of the UTF-16 code unit are only nonzero for 3 and 4-byte
208    // UTF-8 sequences.
    // At surrogate positions bits 0, 1, 3 and 4 are forced on and bit 2 stays
    // off, i.e. the pattern 11011 in the top five bits.
209    p34b.createAssign(u16_hi[0], p34b.createOr(p34b.createAnd(u8scope33, bit4a2), surrogate));
210    p34b.createAssign(u16_hi[1], p34b.createOr(p34b.createAnd(u8scope33, bit5a2), surrogate));
211    p34b.createAssign(u16_hi[2], p34b.createAnd(u8scope33, bit6a2));
212    p34b.createAssign(u16_hi[3], p34b.createOr(p34b.createAnd(u8scope33, bit7a2), surrogate));
213    p34b.createAssign(u16_hi[4], p34b.createOr(p34b.createAnd(u8scope33, bit2a1), surrogate));
214
215    //
    // Attach the 3/4-byte scope to nAb, guarded by pfx34.
216    nAb.createIf(pfx34, p34b);
217    //
218    // Combined logic for 2, 3 and 4 byte sequences
219    //
220
    // u8lastscope: the final byte position of every 2-, 3- and 4-byte sequence.
221    Var * u8lastscope = main.createVar("u8lastscope", zeroes);
222
223    PabloAST * pfx2 = ccc.compileCC(re::makeByte(0xC0, 0xDF), nAb);
224    PabloAST * u8scope22 = nAb.createAdvance(pfx2, 1);
225    nAb.createAssign(u8lastscope, nAb.createOr(u8scope22, nAb.createOr(u8scope33, u8scope44)));
    // NOTE(review): u8anyscope covers scope22/33/44/32/42 but not u8scope43, so
    // the third byte of a 4-byte sequence (itself a 0x80-0xBF suffix) appears to
    // be reported by scope_suffix_mismatch.  error_mask is not consumed in this
    // method, so this has no visible effect here - confirm before relying on it.
226    PabloAST * u8anyscope = nAb.createOr(u8lastscope, p34del);
227
228    PabloAST * C0_C1_err = ccc.compileCC(re::makeByte(0xC0, 0xC1), nAb);
    // Positions where "a suffix is expected" and "the byte is 0x80-0xBF" disagree.
229    PabloAST * scope_suffix_mismatch = nAb.createXor(u8anyscope, ccc.compileCC(re::makeByte(0x80, 0xBF), nAb));
230    nAb.createAssign(error_mask, nAb.createOr(scope_suffix_mismatch, nAb.createOr(C0_C1_err, EX_FX_err)));
    // delmask: every byte in 0xC0-0xFF plus the p34del positions.
231    nAb.createAssign(delmask, nAb.createOr(p34del, ccc.compileCC(re::makeByte(0xC0, 0xFF), nAb)));
232
233    // The low 3 bits of the high byte of the UTF-16 code unit as well as the high bit of the
234    // low byte are only nonzero for 2, 3 and 4 byte sequences.
    // u8scope44 forces u16_hi[5] on at the fourth byte of a 4-byte sequence;
    // at scope43 it stays off, which separates the two surrogate units.
235    nAb.createAssign(u16_hi[5], nAb.createOr(nAb.createAnd(u8lastscope, bit3a1), u8scope44));
236    nAb.createAssign(u16_hi[6], nAb.createOr(nAb.createAnd(u8lastscope, bit4a1), s43_hi6));
237    nAb.createAssign(u16_hi[7], nAb.createOr(nAb.createAnd(u8lastscope, bit5a1), s43_hi7));
238    nAb.createAssign(u16_lo[0], nAb.createOr(nAb.createAnd(u8lastscope, bit6a1), s43_lo0));
239
    // Multi-byte contribution to u16_lo[1]; created in main because it is
    // combined with the ASCII contribution there.
240    Var * p234_lo1 = main.createVar("p234_lo1", zeroes);
241
242    nAb.createAssign(p234_lo1, nAb.createOr(nAb.createAnd(u8lastscope, bit7a1), s43_lo1));
243
    // Attach the whole non-ASCII scope to main, guarded by nonASCII.
244    main.createIf(nonASCII, nAb);
245    //
246    //
    // Low-byte bits that also receive ASCII data.  u16_lo[1] takes bit 1 from
    // ASCII bytes only; u16_lo[2..7] take bits 2-7 from ASCII bytes and from
    // the last byte of multi-byte sequences, OR-ed with the scope43 values.
247    PabloAST * ASCII = ccc.compileCC(re::makeByte(0x0, 0x7F));
248    PabloAST * last_byte = main.createOr(ASCII, u8lastscope);
249    main.createAssign(u16_lo[1], main.createOr(main.createAnd(ASCII, u8_bits[1]), p234_lo1));
250    main.createAssign(u16_lo[2], main.createOr(main.createAnd(last_byte, u8_bits[2]), s43_lo2));
251    main.createAssign(u16_lo[3], main.createOr(main.createAnd(last_byte, u8_bits[3]), s43_lo3));
252    main.createAssign(u16_lo[4], main.createOr(main.createAnd(last_byte, u8_bits[4]), s43_lo4));
253    main.createAssign(u16_lo[5], main.createOr(main.createAnd(last_byte, u8_bits[5]), s43_lo5));
254    main.createAssign(u16_lo[6], main.createOr(main.createAnd(last_byte, u8_bits[6]), s43_lo6));
255    main.createAssign(u16_lo[7], main.createOr(main.createAnd(last_byte, u8_bits[7]), s43_lo7));
256
    // Copy the results to the "u16bit" output: u16_hi into streams 0-7,
    // u16_lo into streams 8-15.
257    Var * output = getOutputStreamVar("u16bit");
258    for (unsigned i = 0; i < 8; i++) {
259        main.createAssign(main.createExtract(output, i), u16_hi[i]);
260    }
261    for (unsigned i = 0; i < 8; i++) {
262        main.createAssign(main.createExtract(output, i + 8), u16_lo[i]);
263    }
    // selectors = NOT delmask, passed through createInFile (presumably to clear
    // positions beyond the end of the input - confirm), written to stream 0 of
    // the "selectors" output.
264    PabloAST * selectors = main.createInFile(main.createNot(delmask));
265    main.createAssign(main.createExtract(getOutputStreamVar("selectors"), main.getInteger(0)), selectors);
266}
267
268typedef void (*u8u16FunctionType)(uint32_t fd, const char *);
269
270u8u16FunctionType generatePipeline(CPUDriver & pxDriver) {
271
272    auto & b = pxDriver.getBuilder();
273    auto P = pxDriver.makePipeline({Binding{b->getInt32Ty(), "inputFileDecriptor"}, Binding{b->getInt8PtrTy(), "outputFileName"}}, {});
274    Scalar * fileDescriptor = P->getInputScalar("inputFileDecriptor");
275    // File data from mmap
276    StreamSet * const ByteStream = P->CreateStreamSet(1, 8);
277    P->CreateKernelCall<MMapSourceKernel>(fileDescriptor, ByteStream);
278
279    // Transposed bits from s2p
280    StreamSet * BasisBits = P->CreateStreamSet(8);
281    P->CreateKernelCall<S2PKernel>(ByteStream, BasisBits, cc::BitNumbering::BigEndian);
282
283    // Calculate UTF-16 data bits through bitwise logic on u8-indexed streams.
284    StreamSet * u8bits = P->CreateStreamSet(16);
285    StreamSet * selectors = P->CreateStreamSet();
286    P->CreateKernelCall<U8U16Kernel>(BasisBits, u8bits, selectors);
287
288    StreamSet * u16bits = P->CreateStreamSet(16);
289    StreamSet * u16bytes = P->CreateStreamSet(1, 16);
290    if (useAVX2()) {
291        // Allocate space for fully compressed swizzled UTF-16 bit streams
292        std::vector<StreamSet *> u16Swizzles(4);
293        u16Swizzles[0] = P->CreateStreamSet(4);
294        u16Swizzles[1] = P->CreateStreamSet(4);
295        u16Swizzles[2] = P->CreateStreamSet(4);
296        u16Swizzles[3] = P->CreateStreamSet(4);
297
298        // Apply a deletion algorithm to discard all but the final position of the UTF-8
299        // sequences (bit streams) for each UTF-16 code unit. Also compresses and swizzles the result.
300        P->CreateKernelCall<SwizzledDeleteByPEXTkernel>(selectors, u8bits, u16Swizzles);
301        // Produce unswizzled UTF-16 bit streams
302        P->CreateKernelCall<SwizzleGenerator>(u16Swizzles, std::vector<StreamSet *>{u16bits});
303        P->CreateKernelCall<P2S16Kernel>(u16bits, u16bytes, cc::BitNumbering::BigEndian);
304    } else {
305        P->CreateKernelCall<FieldCompressKernel>(b->getBitBlockWidth()/16, u8bits, selectors, u16bits);
306        P->CreateKernelCall<P2S16KernelWithCompressedOutput>(u16bits, selectors, u16bytes, cc::BitNumbering::BigEndian);
307    }
308
309    Scalar * outputFileName = P->getInputScalar("outputFileName");
310    P->CreateKernelCall<FileSink>(outputFileName, u16bytes);
311
312    return reinterpret_cast<u8u16FunctionType>(P->compile());
313}
314
315size_t file_size(const int fd) {
316    struct stat st;
317    if (LLVM_UNLIKELY(fstat(fd, &st) != 0)) {
318        st.st_size = 0;
319    }
320    return st.st_size;
321}
322
323int main(int argc, char *argv[]) {
324    codegen::ParseCommandLineOptions(argc, argv, {&u8u16Options, pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
325    CPUDriver pxDriver("u8u16");
326    auto u8u16Function = generatePipeline(pxDriver);
327    const int fd = open(inputFile.c_str(), O_RDONLY);
328    if (LLVM_UNLIKELY(fd == -1)) {
329        std::cerr << "Error: cannot open " << inputFile << " for processing. Skipped.\n";
330    } else {
331        u8u16Function(fd, outputFile.c_str());
332        close(fd);
333    }
334    return 0;
335}
336
337                       
338
Note: See TracBrowser for help on using the repository browser.