source: icGREP/icgrep-devel/icgrep/u8u16.cpp @ 5474

Last change on this file since 5474 was 5474, checked in by nmedfort, 2 years ago

Eliminated ExecutionEngine memory leak. Intentionally broke compatibility with prior versions to ensure unchecked-in projects are restructured.

File size: 24.8 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <IR_Gen/idisa_target.h>                   // for GetIDISA_Builder
8#include <cc/cc_compiler.h>                        // for CC_Compiler
9#include <kernels/deletion.h>                      // for DeletionKernel
10#include <kernels/swizzle.h>                      // for DeletionKernel
11#include <kernels/source_kernel.h>
12#include <kernels/p2s_kernel.h>                    // for P2S16KernelWithCom...
13#include <kernels/s2p_kernel.h>                    // for S2PKernel
14#include <kernels/stdout_kernel.h>                 // for StdOutKernel
15#include <llvm/ExecutionEngine/ExecutionEngine.h>  // for ExecutionEngine
16#include <llvm/IR/Function.h>                      // for Function, Function...
17#include <llvm/IR/Module.h>                        // for Module
18#include <llvm/IR/Verifier.h>                      // for verifyModule
19#include <llvm/Support/CommandLine.h>              // for ParseCommandLineOp...
20#include <llvm/Support/Debug.h>                    // for dbgs
21#include <pablo/pablo_kernel.h>                    // for PabloKernel
22#include <pablo/pablo_toolchain.h>                 // for pablo_function_passes
23#include <kernels/kernel_builder.h>
24#include <pablo/pe_zeroes.h>
25#include <toolchain/toolchain.h>
26#include <toolchain/cpudriver.h>
27#include <kernels/streamset.h>
28#include <llvm/ADT/StringRef.h>
29#include <llvm/IR/CallingConv.h>
30#include <llvm/IR/DerivedTypes.h>
31#include <llvm/IR/LLVMContext.h>
32#include <llvm/IR/Value.h>
33#include <llvm/Support/Compiler.h>
34#include <pablo/builder.hpp>
35#include <boost/interprocess/anonymous_shared_memory.hpp>
36#include <boost/interprocess/mapped_region.hpp>
37#include <iostream>
38
39using namespace pablo;
40using namespace kernel;
41using namespace parabix;
42using namespace llvm;
43
44static cl::OptionCategory u8u16Options("u8u16 Options", "Transcoding control options.");
45static cl::opt<std::string> inputFile(cl::Positional, cl::desc("<input file>"), cl::Required, cl::cat(u8u16Options));
46static cl::opt<std::string> outputFile(cl::Positional, cl::desc("<output file>"),  cl::Required, cl::cat(u8u16Options));
47static cl::opt<bool> enableAVXdel("enable-AVX-deletion", cl::desc("Enable AVX2 deletion algorithms."), cl::cat(u8u16Options));
48static cl::opt<bool> mMapBuffering("mmap-buffering", cl::desc("Enable mmap buffering."), cl::cat(u8u16Options));
49static cl::opt<bool> memAlignBuffering("memalign-buffering", cl::desc("Enable posix_memalign buffering."), cl::cat(u8u16Options));
50
51class U8U16Kernel final: public pablo::PabloKernel {
52public:
53    U8U16Kernel(const std::unique_ptr<kernel::KernelBuilder> & b);
54    bool isCachable() const override { return true; }
55    bool hasSignature() const override { return false; }
56    void generatePabloMethod() override;
57};
58
59U8U16Kernel::U8U16Kernel(const std::unique_ptr<kernel::KernelBuilder> & b)
60: PabloKernel(b, "u8u16",
61{Binding{b->getStreamSetTy(8, 1), "u8bit"}},
62{Binding{b->getStreamSetTy(16, 1), "u16bit"}, Binding{b->getStreamSetTy(1, 1), "delMask"}, Binding{b->getStreamSetTy(1, 1), "errMask"}}) {
63
64}
65
66void U8U16Kernel::generatePabloMethod() {
67
68    //  input: 8 basis bit streams
69
70    const auto u8bitSet = getInputStreamVar("u8bit");
71
72    //  output: 16 u8-indexed streams, + delmask stream + error stream
73
74    cc::CC_Compiler ccc(this, u8bitSet);
75
76    PabloBuilder & main = ccc.getBuilder();
77    const auto u8_bits = ccc.getBasisBits();
78
79    Zeroes * zeroes = main.createZeroes();
80
81    // Outputs
82    Var * u16_hi[8];
83    for (int i = 0; i < 8; ++i) {
84        u16_hi[i] = main.createVar("u16_hi" + std::to_string(i), zeroes);
85    }
86    Var * u16_lo[8];
87    for (int i = 0; i < 8; ++i) {
88        u16_lo[i] = main.createVar("u16_lo" + std::to_string(i), zeroes);
89    }
90    Var * delmask = main.createVar("delmask", zeroes);
91    Var * error_mask = main.createVar("error_mask", zeroes);
92
93    // The logic for processing non-ASCII bytes will be embedded within an if-hierarchy.
94    PabloAST * nonASCII = ccc.compileCC(re::makeCC(0x80, 0xFF));
95
96    // Builder for the if statement handling all non-ASCII logic
97    PabloBuilder nAb = PabloBuilder::Create(main);
98    // Bits 3 through 7 of a 2-byte prefix are data bits, needed to
99    // produce the UTF-16 code unit data ...,
100    PabloAST * bit3a1 = nAb.createAdvance(u8_bits[3], 1);
101    PabloAST * bit4a1 = nAb.createAdvance(u8_bits[4], 1);
102    PabloAST * bit5a1 = nAb.createAdvance(u8_bits[5], 1);
103    PabloAST * bit6a1 = nAb.createAdvance(u8_bits[6], 1);
104    PabloAST * bit7a1 = nAb.createAdvance(u8_bits[7], 1);
105
106    // Entry condition for 3 or 4 byte sequences: we have a prefix byte in the range 0xE0-0xFF.
107    PabloAST * pfx34 = ccc.compileCC(re::makeCC(0xE0, 0xFF), nAb);
108    // Builder for the if statement handling all logic for 3- and 4-byte sequences.
109    PabloBuilder p34b = PabloBuilder::Create(nAb);
110    // Bits 4 through 7 of a 3-byte prefix are data bits.  They must be moved
111    // to the final position of the 3-byte sequence.
112    PabloAST * bit2a1 = p34b.createAdvance(u8_bits[2], 1);
113    PabloAST * bit4a2 = p34b.createAdvance(bit4a1, 1);
114    PabloAST * bit5a2 = p34b.createAdvance(bit5a1, 1);
115    PabloAST * bit6a2 = p34b.createAdvance(bit6a1, 1);
116    PabloAST * bit7a2 = p34b.createAdvance(bit7a1, 1);
117
118    Var * const u8scope32 = nAb.createVar("u8scope32", zeroes);
119    Var * const u8scope33 = nAb.createVar("u8scope33", zeroes);
120    Var * const u8scope44 = nAb.createVar("u8scope44", zeroes);
121
122    //
123    // Logic for 4-byte UTF-8 sequences
124    //
125    // Entry condition  or 4 byte sequences: we have a prefix byte in the range 0xF0-0xFF.
126    PabloAST * pfx4 = ccc.compileCC(re::makeCC(0xF0, 0xFF), p34b);
127    // Builder for the if statement handling all logic for 4-byte sequences only.
128    PabloBuilder p4b = PabloBuilder::Create(p34b);
129    // Illegal 4-byte sequences
130    PabloAST * F0 = ccc.compileCC(re::makeCC(0xF0), p4b);
131    PabloAST * F4 = ccc.compileCC(re::makeCC(0xF4), p4b);
132    PabloAST * F0_err = p4b.createAnd(p4b.createAdvance(F0, 1), ccc.compileCC(re::makeCC(0x80, 0x8F), p4b));
133    PabloAST * F4_err = p4b.createAnd(p4b.createAdvance(F4, 1), ccc.compileCC(re::makeCC(0x90, 0xBF), p4b));
134    PabloAST * F5_FF = ccc.compileCC(re::makeCC(0xF5, 0xFF), p4b);
135
136    Var * FX_err = p34b.createVar("FX_err", zeroes);
137    p4b.createAssign(FX_err, p4b.createOr(F5_FF, p4b.createOr(F0_err, F4_err)));
138    //
139    // 4-byte prefixes have a scope that extends over the next 3 bytes.
140
141    Var * u8scope42 = p34b.createVar("u8scope42", zeroes);
142    Var * u8scope43 = p34b.createVar("u8scope43", zeroes);
143
144    p4b.createAssign(u8scope42, p4b.createAdvance(pfx4, 1));
145    p4b.createAssign(u8scope43, p4b.createAdvance(u8scope42, 1));
146    p4b.createAssign(u8scope44, p4b.createAdvance(u8scope43, 1));
147    //
148
149    //  From the 4-byte sequence 11110abc 10defghi 10jklmno 10pqrstu,
150    //  we must calculate the value abcde - 1 to produce the bit values
151    //  for u16_hi6, hi7, lo0, lo1 at the scope43 position.
152    Var * s43_lo0 = nAb.createVar("scope43_lo0", zeroes);
153    Var * s43_lo1 = nAb.createVar("scope43_lo1", zeroes);
154    Var * s43_hi6 = nAb.createVar("scope43_hi6", zeroes);
155    Var * s43_hi7 = nAb.createVar("scope43_hi7", zeroes);
156
157    Var * s43_lo2 = main.createVar("scope43_lo2", zeroes);
158    Var * s43_lo3 = main.createVar("scope43_lo3", zeroes);
159    Var * s43_lo4 = main.createVar("scope43_lo4", zeroes);
160    Var * s43_lo5 = main.createVar("scope43_lo5", zeroes);
161    Var * s43_lo6 = main.createVar("scope43_lo6", zeroes);
162    Var * s43_lo7 = main.createVar("scope43_lo7", zeroes);
163
164    p4b.createAssign(s43_lo1, p4b.createAnd(u8scope43, p4b.createNot(bit3a1)));           // e - 1
165    p4b.createAssign(s43_lo0, p4b.createAnd(u8scope43, p4b.createXor(bit2a1, s43_lo1)));  // d - borrow
166    PabloAST * brw1 = p4b.createAnd(s43_lo1, p4b.createNot(bit2a1));
167    p4b.createAssign(s43_hi7, p4b.createAnd(u8scope43, p4b.createXor(bit7a2, brw1)));     // c - borrow
168    PabloAST * brw2 = p4b.createAnd(brw1, p4b.createNot(bit7a2));
169    p4b.createAssign(s43_hi6, p4b.createAnd(u8scope43, p4b.createXor(bit6a2, brw2)));     // b - borrow
170    //
171    p4b.createAssign(s43_lo2, p4b.createAnd(u8scope43, bit4a1));
172    p4b.createAssign(s43_lo3, p4b.createAnd(u8scope43, bit5a1));
173    p4b.createAssign(s43_lo4, p4b.createAnd(u8scope43, bit6a1));
174    p4b.createAssign(s43_lo5, p4b.createAnd(u8scope43, bit7a1));
175    p4b.createAssign(s43_lo6, p4b.createAnd(u8scope43, u8_bits[2]));
176    p4b.createAssign(s43_lo7, p4b.createAnd(u8scope43, u8_bits[3]));
177    //
178    //
179    p34b.createIf(pfx4, p4b);
180    //
181    // Combined logic for 3 and 4 byte sequences
182    //
183    PabloAST * pfx3 = ccc.compileCC(re::makeCC(0xE0, 0xEF), p34b);
184
185    p34b.createAssign(u8scope32, p34b.createAdvance(pfx3, 1));
186    p34b.createAssign(u8scope33, p34b.createAdvance(u8scope32, 1));
187
188    // Illegal 3-byte sequences
189    PabloAST * E0 = ccc.compileCC(re::makeCC(0xE0), p34b);
190    PabloAST * ED = ccc.compileCC(re::makeCC(0xED), p34b);
191    PabloAST * E0_err = p34b.createAnd(p34b.createAdvance(E0, 1), ccc.compileCC(re::makeCC(0x80, 0x9F), p34b));
192    PabloAST * ED_err = p34b.createAnd(p34b.createAdvance(ED, 1), ccc.compileCC(re::makeCC(0xA0, 0xBF), p34b));
193    Var * EX_FX_err = nAb.createVar("EX_FX_err", zeroes);
194
195    p34b.createAssign(EX_FX_err, p34b.createOr(p34b.createOr(E0_err, ED_err), FX_err));
196    // Two surrogate UTF-16 units are computed at the 3rd and 4th positions of 4-byte sequences.
197    PabloAST * surrogate = p34b.createOr(u8scope43, u8scope44);
198
199    Var * p34del = nAb.createVar("p34del", zeroes);
200    p34b.createAssign(p34del, p34b.createOr(u8scope32, u8scope42));
201
202
203    // The high 5 bits of the UTF-16 code unit are only nonzero for 3 and 4-byte
204    // UTF-8 sequences.
205    p34b.createAssign(u16_hi[0], p34b.createOr(p34b.createAnd(u8scope33, bit4a2), surrogate));
206    p34b.createAssign(u16_hi[1], p34b.createOr(p34b.createAnd(u8scope33, bit5a2), surrogate));
207    p34b.createAssign(u16_hi[2], p34b.createAnd(u8scope33, bit6a2));
208    p34b.createAssign(u16_hi[3], p34b.createOr(p34b.createAnd(u8scope33, bit7a2), surrogate));
209    p34b.createAssign(u16_hi[4], p34b.createOr(p34b.createAnd(u8scope33, bit2a1), surrogate));
210
211    //
212    nAb.createIf(pfx34, p34b);
213    //
214    // Combined logic for 2, 3 and 4 byte sequences
215    //
216
217    Var * u8lastscope = main.createVar("u8lastscope", zeroes);
218
219    PabloAST * pfx2 = ccc.compileCC(re::makeCC(0xC0, 0xDF), nAb);
220    PabloAST * u8scope22 = nAb.createAdvance(pfx2, 1);
221    nAb.createAssign(u8lastscope, nAb.createOr(u8scope22, nAb.createOr(u8scope33, u8scope44)));
222    PabloAST * u8anyscope = nAb.createOr(u8lastscope, p34del);
223
224    PabloAST * C0_C1_err = ccc.compileCC(re::makeCC(0xC0, 0xC1), nAb);
225    PabloAST * scope_suffix_mismatch = nAb.createXor(u8anyscope, ccc.compileCC(re::makeCC(0x80, 0xBF), nAb));
226    nAb.createAssign(error_mask, nAb.createOr(scope_suffix_mismatch, nAb.createOr(C0_C1_err, EX_FX_err)));
227    nAb.createAssign(delmask, nAb.createOr(p34del, ccc.compileCC(re::makeCC(0xC0, 0xFF), nAb)));
228
229    // The low 3 bits of the high byte of the UTF-16 code unit as well as the high bit of the
230    // low byte are only nonzero for 2, 3 and 4 byte sequences.
231    nAb.createAssign(u16_hi[5], nAb.createOr(nAb.createAnd(u8lastscope, bit3a1), u8scope44));
232    nAb.createAssign(u16_hi[6], nAb.createOr(nAb.createAnd(u8lastscope, bit4a1), s43_hi6));
233    nAb.createAssign(u16_hi[7], nAb.createOr(nAb.createAnd(u8lastscope, bit5a1), s43_hi7));
234    nAb.createAssign(u16_lo[0], nAb.createOr(nAb.createAnd(u8lastscope, bit6a1), s43_lo0));
235
236    Var * p234_lo1 = main.createVar("p234_lo1", zeroes);
237
238    nAb.createAssign(p234_lo1, nAb.createOr(nAb.createAnd(u8lastscope, bit7a1), s43_lo1));
239
240    main.createIf(nonASCII, nAb);
241    //
242    //
243    PabloAST * ASCII = ccc.compileCC(re::makeCC(0x0, 0x7F));
244    PabloAST * last_byte = main.createOr(ASCII, u8lastscope);
245    main.createAssign(u16_lo[1], main.createOr(main.createAnd(ASCII, u8_bits[1]), p234_lo1));
246    main.createAssign(u16_lo[2], main.createOr(main.createAnd(last_byte, u8_bits[2]), s43_lo2));
247    main.createAssign(u16_lo[3], main.createOr(main.createAnd(last_byte, u8_bits[3]), s43_lo3));
248    main.createAssign(u16_lo[4], main.createOr(main.createAnd(last_byte, u8_bits[4]), s43_lo4));
249    main.createAssign(u16_lo[5], main.createOr(main.createAnd(last_byte, u8_bits[5]), s43_lo5));
250    main.createAssign(u16_lo[6], main.createOr(main.createAnd(last_byte, u8_bits[6]), s43_lo6));
251    main.createAssign(u16_lo[7], main.createOr(main.createAnd(last_byte, u8_bits[7]), s43_lo7));
252
253    Var * output = getOutputStreamVar("u16bit");
254    Var * delmask_out = getOutputStreamVar("delMask");
255    Var * error_mask_out = getOutputStreamVar("errMask");
256
257    for (unsigned i = 0; i < 8; i++) {
258        main.createAssign(main.createExtract(output, i), u16_hi[i]);
259    }
260    for (unsigned i = 0; i < 8; i++) {
261        main.createAssign(main.createExtract(output, i + 8), u16_lo[i]);
262    }
263    main.createAssign(main.createExtract(delmask_out, main.getInteger(0)), delmask);
264    main.createAssign(main.createExtract(error_mask_out,  main.getInteger(0)), error_mask);
265}
266
267void u8u16PipelineAVX2Gen(ParabixDriver & pxDriver) {
268
269    auto & iBuilder = pxDriver.getBuilder();
270    Module * mod = iBuilder->getModule();
271    const unsigned segmentSize = codegen::SegmentSize;
272    const unsigned bufferSegments = codegen::ThreadNum+1;
273
274    assert (iBuilder);
275
276    Type * const voidTy = iBuilder->getVoidTy();
277    Type * const bitBlockType = iBuilder->getBitBlockType();
278    Type * const outputType = ArrayType::get(ArrayType::get(bitBlockType, 16), 1)->getPointerTo();
279
280    Function * const main = cast<Function>(mod->getOrInsertFunction("Main", voidTy, iBuilder->getInt32Ty(), outputType, nullptr));
281    main->setCallingConv(CallingConv::C);
282    Function::arg_iterator args = main->arg_begin();
283
284    Value * const fileDecriptor = &*(args++);
285    fileDecriptor->setName("fileDecriptor");
286    Value * const outputStream = &*(args++);
287    outputStream->setName("outputStream");
288
289    iBuilder->SetInsertPoint(BasicBlock::Create(mod->getContext(), "entry", main,0));
290   
291    // File data from mmap
292    StreamSetBuffer * ByteStream = pxDriver.addBuffer(make_unique<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8)));
293   
294    Kernel * mmapK = pxDriver.addKernelInstance(make_unique<MMapSourceKernel>(iBuilder, segmentSize));
295    mmapK->setInitialArguments({fileDecriptor});
296    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
297   
298    // Transposed bits from s2p
299    StreamSetBuffer * BasisBits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), segmentSize * bufferSegments));
300   
301    Kernel * s2pk = pxDriver.addKernelInstance(make_unique<S2PKernel>(iBuilder));
302    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
303   
304    // Calculate UTF-16 data bits through bitwise logic on u8-indexed streams.
305    StreamSetBuffer * U8u16Bits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments));
306    StreamSetBuffer * DelMask = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
307    StreamSetBuffer * ErrorMask = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
308   
309    Kernel * u8u16k = pxDriver.addKernelInstance(make_unique<U8U16Kernel>(iBuilder));
310    pxDriver.makeKernelCall(u8u16k, {BasisBits}, {U8u16Bits, DelMask, ErrorMask});
311   
312    // Apply a deletion algorithm to discard all but the final position of the UTF-8
313    // sequences for each UTF-16 code unit. Swizzle the results.
314    StreamSetBuffer * SwizzleFields0 = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments));
315    StreamSetBuffer * SwizzleFields1 = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments));
316    StreamSetBuffer * SwizzleFields2 = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments));
317    StreamSetBuffer * SwizzleFields3 = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments));
318    StreamSetBuffer * DeletionCounts = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
319   
320    Kernel * delK = pxDriver.addKernelInstance(make_unique<DeleteByPEXTkernel>(iBuilder, 64, 16, true));
321    pxDriver.makeKernelCall(delK, {U8u16Bits, DelMask}, {SwizzleFields0, SwizzleFields1, SwizzleFields2, SwizzleFields3, DeletionCounts});
322   
323    //  Produce fully compressed swizzled UTF-16 bit streams
324    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer(make_unique<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1));
325    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer(make_unique<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1));
326    StreamSetBuffer * u16Swizzle2 = pxDriver.addBuffer(make_unique<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1));
327    StreamSetBuffer * u16Swizzle3 = pxDriver.addBuffer(make_unique<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1));
328   
329    Kernel * compressK = pxDriver.addKernelInstance(make_unique<SwizzledBitstreamCompressByCount>(iBuilder, 16));
330    pxDriver.makeKernelCall(compressK, {DeletionCounts, SwizzleFields0, SwizzleFields1, SwizzleFields2, SwizzleFields3},
331                           {u16Swizzle0, u16Swizzle1, u16Swizzle2, u16Swizzle3});
332   
333    // Produce unswizzled UTF-16 bit streams
334    StreamSetBuffer * u16bits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments));
335   
336    Kernel * unSwizzleK = pxDriver.addKernelInstance(make_unique<SwizzleGenerator>(iBuilder, 16, 1, 4));
337    pxDriver.makeKernelCall(unSwizzleK, {u16Swizzle0, u16Swizzle1, u16Swizzle2, u16Swizzle3}, {u16bits});
338   
339    Kernel * p2sk = pxDriver.addKernelInstance(make_unique<P2S16Kernel>(iBuilder));
340   
341    Kernel * outK = pxDriver.addKernelInstance(make_unique<FileSink>(iBuilder, 16));
342    Value * fName = iBuilder->CreatePointerCast(iBuilder->GetString(outputFile.c_str()), iBuilder->getInt8PtrTy());
343    outK->setInitialArguments({fName});
344       
345    // Different choices for the output buffer depending on chosen option.
346    StreamSetBuffer * U16out = nullptr;
347    if (mMapBuffering || memAlignBuffering) {
348        U16out = pxDriver.addExternalBuffer(make_unique<ExternalBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 16), outputStream));
349    } else {
350        U16out = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 16), segmentSize * bufferSegments));
351    }
352    pxDriver.makeKernelCall(p2sk, {u16bits}, {U16out});
353    pxDriver.makeKernelCall(outK, {U16out}, {});
354   
355    pxDriver.generatePipelineIR();
356   
357    iBuilder->CreateRetVoid();
358   
359    pxDriver.finalizeObject();
360}
361
362void u8u16PipelineGen(ParabixDriver & pxDriver) {
363   
364    auto & iBuilder = pxDriver.getBuilder();
365    Module * mod = iBuilder->getModule();
366   
367    const unsigned segmentSize = codegen::SegmentSize;
368    const unsigned bufferSegments = codegen::ThreadNum+1;
369    Type * const voidTy = iBuilder->getVoidTy();
370    Type * const bitBlockType = iBuilder->getBitBlockType();
371    Type * const outputType = ArrayType::get(ArrayType::get(bitBlockType, 16), 1)->getPointerTo();
372   
373    Function * const main = cast<Function>(mod->getOrInsertFunction("Main", voidTy, iBuilder->getInt32Ty(), outputType, nullptr));
374    main->setCallingConv(CallingConv::C);
375    Function::arg_iterator args = main->arg_begin();
376   
377    Value * const fileDecriptor = &*(args++);
378    fileDecriptor->setName("fileDecriptor");
379    Value * const outputStream = &*(args++);
380    outputStream->setName("outputStream");
381
382    iBuilder->SetInsertPoint(BasicBlock::Create(mod->getContext(), "entry", main,0));
383
384    // File data from mmap
385    StreamSetBuffer * ByteStream = pxDriver.addBuffer(make_unique<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8)));
386   
387    Kernel * mmapK = pxDriver.addKernelInstance(make_unique<MMapSourceKernel>(iBuilder, segmentSize));
388    mmapK->setInitialArguments({fileDecriptor});
389    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
390   
391    // Transposed bits from s2p
392    StreamSetBuffer * BasisBits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), segmentSize * bufferSegments));
393   
394    Kernel * s2pk = pxDriver.addKernelInstance(make_unique<S2PKernel>(iBuilder));
395    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
396   
397    // Calculate UTF-16 data bits through bitwise logic on u8-indexed streams.
398    StreamSetBuffer * U8u16Bits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments));
399    StreamSetBuffer * DelMask = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
400    StreamSetBuffer * ErrorMask = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
401   
402    Kernel * u8u16k = pxDriver.addKernelInstance(make_unique<U8U16Kernel>(iBuilder));
403    pxDriver.makeKernelCall(u8u16k, {BasisBits}, {U8u16Bits, DelMask, ErrorMask});
404   
405    StreamSetBuffer * U16Bits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments));
406   
407    StreamSetBuffer * DeletionCounts = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
408
409    Kernel * delK = pxDriver.addKernelInstance(make_unique<DeletionKernel>(iBuilder, iBuilder->getBitBlockWidth()/16, 16));
410    pxDriver.makeKernelCall(delK, {U8u16Bits, DelMask}, {U16Bits, DeletionCounts});
411   
412    Kernel * p2sk = pxDriver.addKernelInstance(make_unique<P2S16KernelWithCompressedOutput>(iBuilder));
413   
414    Kernel * outK = pxDriver.addKernelInstance(make_unique<FileSink>(iBuilder, 16));
415    Value * fName = iBuilder->CreatePointerCast(iBuilder->GetString(outputFile.c_str()), iBuilder->getInt8PtrTy());
416    outK->setInitialArguments({fName});
417   
418    // Different choices for the output buffer depending on chosen option.
419    StreamSetBuffer * U16out = nullptr;
420    if (mMapBuffering || memAlignBuffering) {
421        U16out = pxDriver.addExternalBuffer(make_unique<ExternalBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 16), outputStream));
422    } else {
423        U16out = pxDriver.addBuffer(make_unique<CircularCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 16), segmentSize * bufferSegments, 1 /*overflow block*/));
424    }
425    pxDriver.makeKernelCall(p2sk, {U16Bits, DeletionCounts}, {U16out});
426    pxDriver.makeKernelCall(outK, {U16out}, {});
427   
428    pxDriver.generatePipelineIR();
429   
430    iBuilder->CreateRetVoid();
431
432    pxDriver.finalizeObject();
433}
434
// Signature of the JIT-compiled "Main" entry point: an input file descriptor
// and a pointer to a caller-provided output buffer (may be null).
using u8u16FunctionType = void (*)(uint32_t fd, char * output_data);
436
/// Returns the size in bytes of the file referred to by `fd`, as reported by
/// fstat(), or 0 when fstat() fails.
size_t file_size(const int fd) {
    struct stat info;
    const bool statFailed = (fstat(fd, &info) != 0);
    if (statFailed) {
        // An unreadable descriptor is reported as an empty file.
        return 0;
    }
    return info.st_size;
}
444
445void u8u16(u8u16FunctionType fn_ptr, const std::string & fileName) {
446    const int fd = open(fileName.c_str(), O_RDONLY);
447    if (LLVM_UNLIKELY(fd == -1)) {
448        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
449    } else {
450        const auto fileSize = file_size(fd);
451        if (mMapBuffering) {
452            boost::interprocess::mapped_region outputBuffer(boost::interprocess::anonymous_shared_memory(2 * fileSize));
453            outputBuffer.advise(boost::interprocess::mapped_region::advice_willneed);
454            outputBuffer.advise(boost::interprocess::mapped_region::advice_sequential);
455            fn_ptr(fd, static_cast<char*>(outputBuffer.get_address()));
456        } else if (memAlignBuffering) {
457            char * outputBuffer;
458            const auto r = posix_memalign(reinterpret_cast<void **>(&outputBuffer), 32, 2 * fileSize);
459            if (LLVM_UNLIKELY(r != 0)) {
460                throw std::runtime_error("posix_memalign failed with return code " + std::to_string(r));
461            }
462            fn_ptr(fd, outputBuffer);
463            free(reinterpret_cast<void *>(outputBuffer));
464        } else { /* No external output buffer */
465            fn_ptr(fd, nullptr);
466        }
467        close(fd);
468    }
469}
470
471int main(int argc, char *argv[]) {
472    AddParabixVersionPrinter();
473    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&u8u16Options, pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
474    cl::ParseCommandLineOptions(argc, argv);
475    ParabixDriver pxDriver("u8u16");
476    if (enableAVXdel && AVX2_available() && codegen::BlockSize==256) {
477        u8u16PipelineAVX2Gen(pxDriver);
478    } else {
479        u8u16PipelineGen(pxDriver);
480    }
481    auto u8u16Function = reinterpret_cast<u8u16FunctionType>(pxDriver.getMain());
482    u8u16(u8u16Function, inputFile);
483    return 0;
484}
485
486                       
Note: See TracBrowser for help on using the repository browser.