source: icGREP/icgrep-devel/icgrep/u8u16.cpp

Last change on this file was 6296, checked in by cameron, 4 months ago

Merge branch 'master' of https://cs-git-research.cs.surrey.sfu.ca/cameron/parabix-devel

File size: 18.0 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7
8#include <IR_Gen/idisa_target.h>
9
10#include <cc/alphabet.h>
11#include <cc/cc_compiler.h>
12#include <cc/cc_kernel.h>
13#include <kernels/deletion.h>
14#include <kernels/kernel_builder.h>
15#include <kernels/p2s_kernel.h>
16#include <kernels/pipeline_builder.h>
17#include <kernels/s2p_kernel.h>
18#include <kernels/source_kernel.h>
19#include <kernels/stdout_kernel.h>
20#include <kernels/swizzle.h>
21#include <kernels/zeroextend.h>
22#include <pablo/builder.hpp>
23#include <pablo/pablo_kernel.h>
24#include <pablo/pablo_toolchain.h>
25#include <pablo/pe_zeroes.h>
26#include <toolchain/cpudriver.h>
27#include <toolchain/toolchain.h>
28
29#include <sys/stat.h>
30#include <fcntl.h>
31#include <iostream>
32
33using namespace pablo;
34using namespace kernel;
35using namespace llvm;
36using namespace codegen;
37using namespace re;
38
39static cl::OptionCategory u8u16Options("u8u16 Options", "Transcoding control options.");
40static cl::opt<std::string> inputFile(cl::Positional, cl::desc("<input file>"), cl::Required, cl::cat(u8u16Options));
41static cl::opt<std::string> outputFile(cl::Positional, cl::desc("<output file>"), cl::cat(u8u16Options));
42static cl::opt<bool> enableAVXdel("enable-AVX-deletion", cl::desc("Enable AVX2 deletion algorithms."), cl::cat(u8u16Options));
43
44static cl::opt<bool> BranchingMode("branch", cl::desc("Use Experimental branching pipeline mode"), cl::cat(u8u16Options));
45
46inline bool useAVX2() {
47    return enableAVXdel && AVX2_available() && codegen::BlockSize == 256;
48}
49
50class U8U16Kernel final: public pablo::PabloKernel {
51public:
52    U8U16Kernel(const std::unique_ptr<kernel::KernelBuilder> & b, StreamSet * BasisBits, StreamSet * u8bits, StreamSet * DelMask);
53    bool isCachable() const override { return true; }
54    bool hasSignature() const override { return false; }
55    void generatePabloMethod() override;
56};
57
58U8U16Kernel::U8U16Kernel(const std::unique_ptr<kernel::KernelBuilder> & b, StreamSet * BasisBits, StreamSet * u8bits, StreamSet * selectors)
59: PabloKernel(b, "u8u16",
60// input
61{Binding{"u8bit", BasisBits}},
62// outputs
63{Binding{"u16bit", u8bits},
64 Binding{"selectors", selectors}}) {
65
66}
67
// Builds the Pablo program of the "u8u16" kernel.  From the eight "u8bit"
// input streams it derives sixteen UTF-16 bit streams (written to "u16bit")
// and one "selectors" stream, computed as the in-file complement of delmask.
//
// The non-ASCII logic is nested in three scopes, each attached with createIf:
//   nAb  - entered on bytes 0x80-0xFF,
//   p34b - entered on prefix bytes 0xE0-0xFF (inside nAb),
//   p4b  - entered on prefix bytes 0xF0-0xFF (inside p34b).
// A Var that is read outside the scope that assigns it is created in an
// enclosing scope (or in main) with zeroes as its initial value.
68void U8U16Kernel::generatePabloMethod() {
69    PabloBuilder main(getEntryScope());
70    Zeroes * zeroes = main.createZeroes();
71
72    //  input: 8 basis bit streams
73    std::vector<PabloAST *> u8_bits = getInputStreamSet("u8bit");
74
75    //  Working Vars: 16 UTF-16 bit streams (high byte, low byte), the deletion
    //  mask and an error mask.  All of them start out as zeroes.
76    Var * u16_hi[8];
77    for (int i = 0; i < 8; ++i) {
78        u16_hi[i] = main.createVar("u16_hi" + std::to_string(i), zeroes);
79    }
80    Var * u16_lo[8];
81    for (int i = 0; i < 8; ++i) {
82        u16_lo[i] = main.createVar("u16_lo" + std::to_string(i), zeroes);
83    }
84
85    Var * delmask = main.createVar("delmask", zeroes);
    // NOTE(review): error_mask is assigned inside the non-ASCII scope below but
    // is never read, nor written to an output stream, anywhere in this method.
86    Var * error_mask = main.createVar("error_mask", zeroes);
87
88    cc::Parabix_CC_Compiler ccc(getEntryScope(), u8_bits);
89
90    // The logic for processing non-ASCII bytes will be embedded within an if-hierarchy.
91    PabloAST * nonASCII = ccc.compileCC(makeByte(0x80, 0xFF));
92
93    // Builder for the if statement handling all non-ASCII logic
94    auto nAb = main.createScope();
95    // Bits 4 through 0 of the preceding byte (each stream advanced by one position).
96    // For a 2-byte prefix these are the data bits needed for the UTF-16 code unit.
97    PabloAST * bit4a1 = nAb.createAdvance(u8_bits[4], 1);
98    PabloAST * bit3a1 = nAb.createAdvance(u8_bits[3], 1);
99    PabloAST * bit2a1 = nAb.createAdvance(u8_bits[2], 1);
100    PabloAST * bit1a1 = nAb.createAdvance(u8_bits[1], 1);
101    PabloAST * bit0a1 = nAb.createAdvance(u8_bits[0], 1);
102
103    // Entry condition for 3 or 4 byte sequences: we have a prefix byte in the range 0xE0-0xFF.
104    PabloAST * pfx34 = ccc.compileCC(makeByte(0xE0, 0xFF), nAb);
105    // Builder for the if statement handling all logic for 3- and 4-byte sequences.
106    auto p34b = nAb.createScope();
107    // Bits 3 through 0 of a 3-byte prefix are data bits.  They must be moved
108    // to the final position of the 3-byte sequence (two advances in total).
    // bit5a1 is bit 5 of the preceding byte; it is used at scope33 and scope43.
109    PabloAST * bit5a1 = p34b.createAdvance(u8_bits[5], 1);
110    PabloAST * bit3a2 = p34b.createAdvance(bit3a1, 1);
111    PabloAST * bit2a2 = p34b.createAdvance(bit2a1, 1);
112    PabloAST * bit1a2 = p34b.createAdvance(bit1a1, 1);
113    PabloAST * bit0a2 = p34b.createAdvance(bit0a1, 1);
114
    // Scope markers created in nAb because nAb reads them after the p34b
    // if-statement: u8scope32/33 are assigned in p34b, u8scope44 in p4b.
115    Var * const u8scope32 = nAb.createVar("u8scope32", zeroes);
116    Var * const u8scope33 = nAb.createVar("u8scope33", zeroes);
117    Var * const u8scope44 = nAb.createVar("u8scope44", zeroes);
118
119    //
120    // Logic for 4-byte UTF-8 sequences
121    //
122    // Entry condition for 4-byte sequences: we have a prefix byte in the range 0xF0-0xFF.
123    PabloAST * pfx4 = ccc.compileCC(makeByte(0xF0, 0xFF), p34b);
124    // Builder for the if statement handling all logic for 4-byte sequences only.
125    auto p4b = p34b.createScope();
126    // Illegal 4-byte sequences: 0xF0 followed by 0x80-0x8F, 0xF4 followed by
    // 0x90-0xBF, and any byte in 0xF5-0xFF.
127    PabloAST * F0 = ccc.compileCC(makeByte(0xF0), p4b);
128    PabloAST * F4 = ccc.compileCC(makeByte(0xF4), p4b);
129    PabloAST * F0_err = p4b.createAnd(p4b.createAdvance(F0, 1), ccc.compileCC(makeByte(0x80, 0x8F), p4b));
130    PabloAST * F4_err = p4b.createAnd(p4b.createAdvance(F4, 1), ccc.compileCC(makeByte(0x90, 0xBF), p4b));
131    PabloAST * F5_FF = ccc.compileCC(makeByte(0xF5, 0xFF), p4b);
132
133    Var * FX_err = p34b.createVar("FX_err", zeroes);
134    p4b.createAssign(FX_err, p4b.createOr(F5_FF, p4b.createOr(F0_err, F4_err)));
135    //
136    // 4-byte prefixes have a scope that extends over the next 3 bytes.
137
138    Var * u8scope42 = p34b.createVar("u8scope42", zeroes);
139    Var * u8scope43 = p34b.createVar("u8scope43", zeroes);
140
141    p4b.createAssign(u8scope42, p4b.createAdvance(pfx4, 1));
142    p4b.createAssign(u8scope43, p4b.createAdvance(u8scope42, 1));
143    p4b.createAssign(u8scope44, p4b.createAdvance(u8scope43, 1));
144    //
145
146    //  From the 4-byte sequence 11110abc 10defghi 10jklmno 10pqrstu,
147    //  we must calculate the value abcde - 1 to produce the bit values
148    //  for u16_hi1, hi0, lo7, lo6 at the scope43 position.
    //  These four Vars are created in nAb because nAb reads them below; the
    //  six scope43_lo5..lo0 Vars are created in main, which reads them.
149    Var * s43_lo7 = nAb.createVar("scope43_lo7", zeroes);
150    Var * s43_lo6 = nAb.createVar("scope43_lo6", zeroes);
151    Var * s43_hi1 = nAb.createVar("scope43_hi1", zeroes);
152    Var * s43_hi0 = nAb.createVar("scope43_hi0", zeroes);
153
154    Var * s43_lo5 = main.createVar("scope43_lo5", zeroes);
155    Var * s43_lo4 = main.createVar("scope43_lo4", zeroes);
156    Var * s43_lo3 = main.createVar("scope43_lo3", zeroes);
157    Var * s43_lo2 = main.createVar("scope43_lo2", zeroes);
158    Var * s43_lo1 = main.createVar("scope43_lo1", zeroes);
159    Var * s43_lo0 = main.createVar("scope43_lo0", zeroes);
160
    // Bitwise subtraction of 1 from abcde: the lowest bit is inverted and a
    // borrow (brw1, brw2) ripples upward while the lower bits were zero.
161    p4b.createAssign(s43_lo6, p4b.createAnd(u8scope43, p4b.createNot(bit4a1)));           // e - 1
162    p4b.createAssign(s43_lo7, p4b.createAnd(u8scope43, p4b.createXor(bit5a1, s43_lo6)));  // d - borrow
163    PabloAST * brw1 = p4b.createAnd(s43_lo6, p4b.createNot(bit5a1));
164    p4b.createAssign(s43_hi0, p4b.createAnd(u8scope43, p4b.createXor(bit0a2, brw1)));     // c - borrow
165    PabloAST * brw2 = p4b.createAnd(brw1, p4b.createNot(bit0a2));
166    p4b.createAssign(s43_hi1, p4b.createAnd(u8scope43, p4b.createXor(bit1a2, brw2)));     // b - borrow
167    //
    // Remaining low bits at scope43: bits 3..0 of the preceding byte followed
    // by bits 5..4 of the current byte.
168    p4b.createAssign(s43_lo5, p4b.createAnd(u8scope43, bit3a1));
169    p4b.createAssign(s43_lo4, p4b.createAnd(u8scope43, bit2a1));
170    p4b.createAssign(s43_lo3, p4b.createAnd(u8scope43, bit1a1));
171    p4b.createAssign(s43_lo2, p4b.createAnd(u8scope43, bit0a1));
172    p4b.createAssign(s43_lo1, p4b.createAnd(u8scope43, u8_bits[5]));
173    p4b.createAssign(s43_lo0, p4b.createAnd(u8scope43, u8_bits[4]));
174    //
175    //
176    p34b.createIf(pfx4, p4b);
177    //
178    // Combined logic for 3 and 4 byte sequences
179    //
180    PabloAST * pfx3 = ccc.compileCC(makeByte(0xE0, 0xEF), p34b);
181
182    p34b.createAssign(u8scope32, p34b.createAdvance(pfx3, 1));
183    p34b.createAssign(u8scope33, p34b.createAdvance(u8scope32, 1));
184
185    // Illegal 3-byte sequences: 0xE0 followed by 0x80-0x9F and 0xED followed
    // by 0xA0-0xBF.
186    PabloAST * E0 = ccc.compileCC(makeByte(0xE0), p34b);
187    PabloAST * ED = ccc.compileCC(makeByte(0xED), p34b);
188    PabloAST * E0_err = p34b.createAnd(p34b.createAdvance(E0, 1), ccc.compileCC(makeByte(0x80, 0x9F), p34b));
189    PabloAST * ED_err = p34b.createAnd(p34b.createAdvance(ED, 1), ccc.compileCC(makeByte(0xA0, 0xBF), p34b));
190    Var * EX_FX_err = nAb.createVar("EX_FX_err", zeroes);
191
192    p34b.createAssign(EX_FX_err, p34b.createOr(p34b.createOr(E0_err, ED_err), FX_err));
193    // Two surrogate UTF-16 units are computed at the 3rd and 4th positions of 4-byte sequences.
194    PabloAST * surrogate = p34b.createOr(u8scope43, u8scope44);
195
    // p34del marks the byte directly after a 3- or 4-byte prefix; it is
    // folded into delmask further down.
196    Var * p34del = nAb.createVar("p34del", zeroes);
197    p34b.createAssign(p34del, p34b.createOr(u8scope32, u8scope42));
198
199
200    // The high 5 bits of the UTF-16 code unit are only nonzero for 3 and 4-byte
201    // UTF-8 sequences.  At surrogate positions bits 7, 6, 4 and 3 are forced on
    // while bit 5 is left to the scope33 term alone.
202    p34b.createAssign(u16_hi[7], p34b.createOr(p34b.createAnd(u8scope33, bit3a2), surrogate));
203    p34b.createAssign(u16_hi[6], p34b.createOr(p34b.createAnd(u8scope33, bit2a2), surrogate));
204    p34b.createAssign(u16_hi[5], p34b.createAnd(u8scope33, bit1a2));
205    p34b.createAssign(u16_hi[4], p34b.createOr(p34b.createAnd(u8scope33, bit0a2), surrogate));
206    p34b.createAssign(u16_hi[3], p34b.createOr(p34b.createAnd(u8scope33, bit5a1), surrogate));
207
208    //
209    nAb.createIf(pfx34, p34b);
210    //
211    // Combined logic for 2, 3 and 4 byte sequences
212    //
213
    // u8lastscope marks the final byte position of 2-, 3- and 4-byte sequences.
214    Var * u8lastscope = main.createVar("u8lastscope", zeroes);
215
216    PabloAST * pfx2 = ccc.compileCC(makeByte(0xC0, 0xDF), nAb);
217    PabloAST * u8scope22 = nAb.createAdvance(pfx2, 1);
218    nAb.createAssign(u8lastscope, nAb.createOr(u8scope22, nAb.createOr(u8scope33, u8scope44)));
    // NOTE(review): u8anyscope covers scope22/33/44 (u8lastscope) and scope32/42
    // (p34del) but not u8scope43, so the XOR below would also flag the third
    // byte of a 4-byte sequence when that byte is in 0x80-0xBF.  error_mask is
    // unused in this method, so nothing observable depends on it -- confirm
    // whether this is intended.
219    PabloAST * u8anyscope = nAb.createOr(u8lastscope, p34del);
220
221    PabloAST * C0_C1_err = ccc.compileCC(makeByte(0xC0, 0xC1), nAb);
222    PabloAST * scope_suffix_mismatch = nAb.createXor(u8anyscope, ccc.compileCC(makeByte(0x80, 0xBF), nAb));
223    nAb.createAssign(error_mask, nAb.createOr(scope_suffix_mismatch, nAb.createOr(C0_C1_err, EX_FX_err)));
    // Positions to delete: every byte in 0xC0-0xFF plus the p34del positions.
224    nAb.createAssign(delmask, nAb.createOr(p34del, ccc.compileCC(makeByte(0xC0, 0xFF), nAb)));
225
226    // The low 3 bits of the high byte of the UTF-16 code unit as well as the high bit of the
227    // low byte are only nonzero for 2, 3 and 4 byte sequences.
228    nAb.createAssign(u16_hi[2], nAb.createOr(nAb.createAnd(u8lastscope, bit4a1), u8scope44));
229    nAb.createAssign(u16_hi[1], nAb.createOr(nAb.createAnd(u8lastscope, bit3a1), s43_hi1));
230    nAb.createAssign(u16_hi[0], nAb.createOr(nAb.createAnd(u8lastscope, bit2a1), s43_hi0));
231    nAb.createAssign(u16_lo[7], nAb.createOr(nAb.createAnd(u8lastscope, bit1a1), s43_lo7));
232
    // Non-ASCII contribution to low bit 6; merged with the ASCII case in main.
233    Var * p234_lo6 = main.createVar("p234_lo6", zeroes);
234
235    nAb.createAssign(p234_lo6, nAb.createOr(nAb.createAnd(u8lastscope, bit0a1), s43_lo6));
236
237    main.createIf(nonASCII, nAb);
238    //
239    // Low byte bits 6..0: taken from the current byte at ASCII positions (bit 6)
    // or at ASCII/last-scope positions (bits 5..0), OR-ed with the scope43 values.
240    PabloAST * ASCII = ccc.compileCC(makeByte(0x0, 0x7F));
241    PabloAST * last_byte = main.createOr(ASCII, u8lastscope);
242    main.createAssign(u16_lo[6], main.createOr(main.createAnd(ASCII, u8_bits[6]), p234_lo6));
243    main.createAssign(u16_lo[5], main.createOr(main.createAnd(last_byte, u8_bits[5]), s43_lo5));
244    main.createAssign(u16_lo[4], main.createOr(main.createAnd(last_byte, u8_bits[4]), s43_lo4));
245    main.createAssign(u16_lo[3], main.createOr(main.createAnd(last_byte, u8_bits[3]), s43_lo3));
246    main.createAssign(u16_lo[2], main.createOr(main.createAnd(last_byte, u8_bits[2]), s43_lo2));
247    main.createAssign(u16_lo[1], main.createOr(main.createAnd(last_byte, u8_bits[1]), s43_lo1));
248    main.createAssign(u16_lo[0], main.createOr(main.createAnd(last_byte, u8_bits[0]), s43_lo0));
249
    // Write-out: streams 8..15 of "u16bit" receive u16_hi[0..7] and streams
    // 0..7 receive u16_lo[0..7].
250    Var * output = getOutputStreamVar("u16bit");
251    for (unsigned i = 0; i < 8; i++) {
252        main.createAssign(main.createExtract(output, i + 8), u16_hi[i]);
253    }
254    for (unsigned i = 0; i < 8; i++) {
255        main.createAssign(main.createExtract(output, i), u16_lo[i]);
256    }
    // The selector stream is the complement of delmask, passed through createInFile.
257    PabloAST * selectors = main.createInFile(main.createNot(delmask));
258    main.createAssign(main.createExtract(getOutputStreamVar("selectors"), main.getInteger(0)), selectors);
259}
260
// Signature of the compiled pipeline entry point: an input file descriptor
// followed by the output file name.
using u8u16FunctionType = void (*)(uint32_t fd, const char *);
262
263// ------------------------------------------------------
264
265u8u16FunctionType generatePipeline(CPUDriver & pxDriver) {
266
267    auto & b = pxDriver.getBuilder();
268    auto P = pxDriver.makePipeline({Binding{b->getInt32Ty(), "inputFileDecriptor"}, Binding{b->getInt8PtrTy(), "outputFileName"}}, {});
269    Scalar * fileDescriptor = P->getInputScalar("inputFileDecriptor");
270    // File data from mmap
271    StreamSet * const ByteStream = P->CreateStreamSet(1, 8);
272    P->CreateKernelCall<MMapSourceKernel>(fileDescriptor, ByteStream);
273
274    // Transposed bits from s2p
275    StreamSet * BasisBits = P->CreateStreamSet(8);
276    P->CreateKernelCall<S2PKernel>(ByteStream, BasisBits);
277
278    // Calculate UTF-16 data bits through bitwise logic on u8-indexed streams.
279    StreamSet * u8bits = P->CreateStreamSet(16);
280    StreamSet * selectors = P->CreateStreamSet();
281    P->CreateKernelCall<U8U16Kernel>(BasisBits, u8bits, selectors);
282
283    StreamSet * u16bits = P->CreateStreamSet(16);
284    StreamSet * u16bytes = P->CreateStreamSet(1, 16);
285    if (useAVX2()) {
286        // Allocate space for fully compressed swizzled UTF-16 bit streams
287        std::vector<StreamSet *> u16Swizzles(4);
288        u16Swizzles[0] = P->CreateStreamSet(4);
289        u16Swizzles[1] = P->CreateStreamSet(4);
290        u16Swizzles[2] = P->CreateStreamSet(4);
291        u16Swizzles[3] = P->CreateStreamSet(4);
292
293        // Apply a deletion algorithm to discard all but the final position of the UTF-8
294        // sequences (bit streams) for each UTF-16 code unit. Also compresses and swizzles the result.
295        P->CreateKernelCall<SwizzledDeleteByPEXTkernel>(selectors, u8bits, u16Swizzles);
296        // Produce unswizzled UTF-16 bit streams
297        P->CreateKernelCall<SwizzleGenerator>(u16Swizzles, std::vector<StreamSet *>{u16bits});
298        P->CreateKernelCall<P2S16Kernel>(u16bits, u16bytes);
299    } else {
300        P->CreateKernelCall<FieldCompressKernel>(b->getBitBlockWidth()/16, u8bits, selectors, u16bits);
301        P->CreateKernelCall<P2S16KernelWithCompressedOutput>(u16bits, selectors, u16bytes);
302    }
303
304    Scalar * outputFileName = P->getInputScalar("outputFileName");
305    P->CreateKernelCall<FileSink>(outputFileName, u16bytes);
306
307    return reinterpret_cast<u8u16FunctionType>(P->compile());
308}
309
310// ------------------------------------------------------
311
312void makeNonAsciiBranch(const std::unique_ptr<PipelineBuilder> & P, const unsigned FieldWidth, StreamSet * const ByteStream, StreamSet * const u16bytes) {
313
314    // Transposed bits from s2p
315    StreamSet * BasisBits = P->CreateStreamSet(8);
316    P->CreateKernelCall<S2PKernel>(ByteStream, BasisBits);
317
318    // Calculate UTF-16 data bits through bitwise logic on u8-indexed streams.
319    StreamSet * u8bits = P->CreateStreamSet(16);
320    StreamSet * selectors = P->CreateStreamSet();
321    P->CreateKernelCall<U8U16Kernel>(BasisBits, u8bits, selectors);
322
323    StreamSet * u16bits = P->CreateStreamSet(16);
324    if (useAVX2()) {
325        // Allocate space for fully compressed swizzled UTF-16 bit streams
326        std::vector<StreamSet *> u16Swizzles(4);
327        u16Swizzles[0] = P->CreateStreamSet(4);
328        u16Swizzles[1] = P->CreateStreamSet(4);
329        u16Swizzles[2] = P->CreateStreamSet(4);
330        u16Swizzles[3] = P->CreateStreamSet(4);
331        // Apply a deletion algorithm to discard all but the final position of the UTF-8
332        // sequences (bit streams) for each UTF-16 code unit. Also compresses and swizzles the result.
333        P->CreateKernelCall<SwizzledDeleteByPEXTkernel>(selectors, u8bits, u16Swizzles);
334        // Produce unswizzled UTF-16 bit streams
335        P->CreateKernelCall<SwizzleGenerator>(u16Swizzles, std::vector<StreamSet *>{u16bits});
336        P->CreateKernelCall<P2S16Kernel>(u16bits, u16bytes);
337    } else {
338        P->CreateKernelCall<FieldCompressKernel>(FieldWidth, u8bits, selectors, u16bits);
339        P->CreateKernelCall<P2S16KernelWithCompressedOutput>(u16bits, selectors, u16bytes);
340    }
341}
342
343void makeAllAsciiBranch(const std::unique_ptr<PipelineBuilder> & P, StreamSet * const ByteStream, StreamSet * const u16bytes) {
344    P->CreateKernelCall<ZeroExtend>(ByteStream, u16bytes);
345}
346
347u8u16FunctionType generatePipeline2(CPUDriver & pxDriver) {
348
349    auto & b = pxDriver.getBuilder();
350    auto P = pxDriver.makePipeline({Binding{b->getInt32Ty(), "inputFileDecriptor"}, Binding{b->getInt8PtrTy(), "outputFileName"}}, {});
351    Scalar * fileDescriptor = P->getInputScalar("inputFileDecriptor");
352    // File data from mmap
353    StreamSet * const ByteStream = P->CreateStreamSet(1, 8);
354    StreamSet * const u16bytes = P->CreateStreamSet(1, 16);
355    P->CreateKernelCall<MMapSourceKernel>(fileDescriptor, ByteStream);
356
357    StreamSet * const nonAscii =  P->CreateStreamSet();
358
359    CC * const nonAsciiCC = makeByte(0x80, 0xFF);
360    P->CreateKernelCall<CharacterClassKernelBuilder>(
361        "nonASCII", std::vector<CC *>{nonAsciiCC}, ByteStream, nonAscii);
362
363    auto B = P->CreateOptimizationBranch(nonAscii,
364        {Binding{"ByteStream", ByteStream}, Binding{"condition", nonAscii}}, {Binding{"u16bytes", u16bytes, BoundedRate(0, 1)}});
365
366    makeAllAsciiBranch(B->getAllZeroBranch(), ByteStream, u16bytes);
367
368    makeNonAsciiBranch(B->getNonZeroBranch(), b->getBitBlockWidth() / 16, ByteStream, u16bytes);
369
370    Scalar * outputFileName = P->getInputScalar("outputFileName");
371    P->CreateKernelCall<FileSink>(outputFileName, u16bytes);
372
373    return reinterpret_cast<u8u16FunctionType>(P->compile());
374}
375
// Returns the st_size value fstat reports for fd, or 0 when fstat fails.
size_t file_size(const int fd) {
    struct stat info;
    const bool statOk = (fstat(fd, &info) == 0);
    return statOk ? info.st_size : 0;
}
383
384int main(int argc, char *argv[]) {
385    codegen::ParseCommandLineOptions(argc, argv, {&u8u16Options, pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
386    CPUDriver pxDriver("u8u16");
387    u8u16FunctionType u8u16Function = nullptr;
388    if (BranchingMode) {
389        u8u16Function = generatePipeline2(pxDriver);
390    } else {
391        u8u16Function = generatePipeline(pxDriver);
392    }
393    const int fd = open(inputFile.c_str(), O_RDONLY);
394    if (LLVM_UNLIKELY(fd == -1)) {
395        std::cerr << "Error: cannot open " << inputFile << " for processing. Skipped.\n";
396    } else {
397        u8u16Function(fd, outputFile.c_str());
398        close(fd);
399    }
400    return 0;
401}
402
403
404
Note: See TracBrowser for help on using the repository browser.