source: icGREP/icgrep-devel/icgrep/u8u16.cpp @ 5423

Last change on this file since 5423 was 5418, checked in by nmedfort, 2 years ago

Removed non-functional CUDA code from icgrep and consolidated grep and multigrep mode into a single function; allowed segment parallel pipeline to utilize process as its initial thread; modified MMapSourceKernel to map and perform mmap directly and advise the OS to drop consumed data streams.

File size: 25.7 KB
RevLine 
[5005]1/*
[5036]2 *  Copyright (c) 2016 International Characters.
[5005]3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
[5267]7#include <IR_Gen/idisa_builder.h>                  // for IDISA_Builder
8#include <IR_Gen/idisa_target.h>                   // for GetIDISA_Builder
9#include <cc/cc_compiler.h>                        // for CC_Compiler
10#include <kernels/deletion.h>                      // for DeletionKernel
[5355]11#include <kernels/swizzle.h>                      // for SwizzleGenerator
[5267]12#include <kernels/mmap_kernel.h>                   // for MMapSourceKernel
13#include <kernels/p2s_kernel.h>                    // for P2S16KernelWithCom...
14#include <kernels/s2p_kernel.h>                    // for S2PKernel
15#include <kernels/stdout_kernel.h>                 // for StdOutKernel
16#include <llvm/ExecutionEngine/ExecutionEngine.h>  // for ExecutionEngine
17#include <llvm/IR/Function.h>                      // for Function, Function...
18#include <llvm/IR/Module.h>                        // for Module
19#include <llvm/IR/Verifier.h>                      // for verifyModule
20#include <llvm/Support/CommandLine.h>              // for ParseCommandLineOp...
21#include <llvm/Support/Debug.h>                    // for dbgs
22#include <pablo/pablo_kernel.h>                    // for PabloKernel
23#include <pablo/pablo_toolchain.h>                 // for pablo_function_passes
24#include <pablo/pe_zeroes.h>
[5402]25#include <kernels/toolchain.h>
[5267]26#include "kernels/streamset.h"                     // for CircularBuffer
27#include <kernels/pipeline.h>
28#include "llvm/ADT/StringRef.h"                    // for StringRef
29#include "llvm/IR/CallingConv.h"                   // for ::C
30#include "llvm/IR/DerivedTypes.h"                  // for ArrayType, Pointer...
31#include "llvm/IR/LLVMContext.h"                   // for LLVMContext
32#include "llvm/IR/Value.h"                         // for Value
33#include "llvm/Support/Compiler.h"                 // for LLVM_UNLIKELY
34#include <pablo/builder.hpp>                       // for PabloBuilder
[5418]35#include <boost/interprocess/anonymous_shared_memory.hpp>
36#include <boost/interprocess/mapped_region.hpp>
[5005]37#include <iostream>
38
[5267]39using namespace pablo;
40using namespace kernel;
41using namespace parabix;
42using namespace llvm;
[5007]43
[5267]44static cl::OptionCategory u8u16Options("u8u16 Options", "Transcoding control options.");
[5306]45static cl::opt<std::string> inputFile(cl::Positional, cl::desc("<input file>"), cl::Required, cl::cat(u8u16Options));
46static cl::opt<std::string> outputFile(cl::Positional, cl::desc("<output file>"),  cl::Required, cl::cat(u8u16Options));
[5355]47static cl::opt<bool> enableAVXdel("enable-AVX-deletion", cl::desc("Enable AVX2 deletion algorithms."), cl::cat(u8u16Options));
[5191]48static cl::opt<bool> mMapBuffering("mmap-buffering", cl::desc("Enable mmap buffering."), cl::cat(u8u16Options));
49static cl::opt<bool> memAlignBuffering("memalign-buffering", cl::desc("Enable posix_memalign buffering."), cl::cat(u8u16Options));
[5154]50
[5005]51
[5414]52std::unique_ptr<PabloKernel> u8u16_pablo(IDISA::IDISA_Builder * iBuilder) {
53
54    auto kernel = std::unique_ptr<PabloKernel>(new PabloKernel(iBuilder, "u8u16",
55                       {Binding{iBuilder->getStreamSetTy(8, 1), "u8bit"}},
56                       {Binding{iBuilder->getStreamSetTy(16, 1), "u16bit"},
57                           Binding{iBuilder->getStreamSetTy(1, 1), "delMask"},
58                           Binding{iBuilder->getStreamSetTy(1, 1), "errMask"}}, {}));
59   
[5005]60    //  input: 8 basis bit streams
[5299]61   
[5310]62    const auto u8bitSet = kernel->getInputStreamVar("u8bit");
[5299]63   
[5005]64    //  output: 16 u8-indexed streams, + delmask stream + error stream
65   
[5414]66    cc::CC_Compiler ccc(kernel.get(), u8bitSet);
[5299]67   
[5202]68    PabloBuilder & main = ccc.getBuilder();
69    const auto u8_bits = ccc.getBasisBits();
[5299]70   
[5267]71    Zeroes * zeroes = main.createZeroes();
[5202]72
[5005]73    // Outputs
[5202]74    Var * u16_hi[8];
75    for (int i = 0; i < 8; ++i) {
76        u16_hi[i] = main.createVar("u16_hi" + std::to_string(i), zeroes);
77    }
78    Var * u16_lo[8];
79    for (int i = 0; i < 8; ++i) {
80        u16_lo[i] = main.createVar("u16_lo" + std::to_string(i), zeroes);
81    }
82    Var * delmask = main.createVar("delmask", zeroes);
83    Var * error_mask = main.createVar("error_mask", zeroes);
84
85    // The logic for processing non-ASCII bytes will be embedded within an if-hierarchy.
[5005]86    PabloAST * nonASCII = ccc.compileCC(re::makeCC(0x80, 0xFF));
87   
88    // Builder for the if statement handling all non-ASCII logic
[5202]89    PabloBuilder nAb = PabloBuilder::Create(main);
[5005]90    // Bits 3 through 7 of a 2-byte prefix are data bits, needed to
91    // produce the UTF-16 code unit data ...,
92    PabloAST * bit3a1 = nAb.createAdvance(u8_bits[3], 1);
93    PabloAST * bit4a1 = nAb.createAdvance(u8_bits[4], 1);
94    PabloAST * bit5a1 = nAb.createAdvance(u8_bits[5], 1);
95    PabloAST * bit6a1 = nAb.createAdvance(u8_bits[6], 1);
96    PabloAST * bit7a1 = nAb.createAdvance(u8_bits[7], 1);
97   
98    // Entry condition for 3 or 4 byte sequences: we have a prefix byte in the range 0xE0-0xFF.
99    PabloAST * pfx34 = ccc.compileCC(re::makeCC(0xE0, 0xFF), nAb);
100    // Builder for the if statement handling all logic for 3- and 4-byte sequences.
101    PabloBuilder p34b = PabloBuilder::Create(nAb);
102    // Bits 4 through 7 of a 3-byte prefix are data bits.  They must be moved
103    // to the final position of the 3-byte sequence.
104    PabloAST * bit2a1 = p34b.createAdvance(u8_bits[2], 1);
105    PabloAST * bit4a2 = p34b.createAdvance(bit4a1, 1);
106    PabloAST * bit5a2 = p34b.createAdvance(bit5a1, 1);
107    PabloAST * bit6a2 = p34b.createAdvance(bit6a1, 1);
108    PabloAST * bit7a2 = p34b.createAdvance(bit7a1, 1);
[5202]109
110
111    Var * const u8scope32 = nAb.createVar("u8scope32", zeroes);
112    Var * const u8scope33 = nAb.createVar("u8scope33", zeroes);
113    Var * const u8scope44 = nAb.createVar("u8scope44", zeroes);
114
[5005]115    //
116    // Logic for 4-byte UTF-8 sequences
117    //
118    // Entry condition  or 4 byte sequences: we have a prefix byte in the range 0xF0-0xFF.
119    PabloAST * pfx4 = ccc.compileCC(re::makeCC(0xF0, 0xFF), p34b);
120    // Builder for the if statement handling all logic for 4-byte sequences only.
121    PabloBuilder p4b = PabloBuilder::Create(p34b);
122    // Illegal 4-byte sequences
123    PabloAST * F0 = ccc.compileCC(re::makeCC(0xF0), p4b);
124    PabloAST * F4 = ccc.compileCC(re::makeCC(0xF4), p4b);
125    PabloAST * F0_err = p4b.createAnd(p4b.createAdvance(F0, 1), ccc.compileCC(re::makeCC(0x80, 0x8F), p4b));
126    PabloAST * F4_err = p4b.createAnd(p4b.createAdvance(F4, 1), ccc.compileCC(re::makeCC(0x90, 0xBF), p4b));
127    PabloAST * F5_FF = ccc.compileCC(re::makeCC(0xF5, 0xFF), p4b);
[5202]128
129    Var * FX_err = p34b.createVar("FX_err", zeroes);
130    p4b.createAssign(FX_err, p4b.createOr(F5_FF, p4b.createOr(F0_err, F4_err)));
[5005]131    //
132    // 4-byte prefixes have a scope that extends over the next 3 bytes.
[5202]133
134    Var * u8scope42 = p34b.createVar("u8scope42", zeroes);
135    Var * u8scope43 = p34b.createVar("u8scope43", zeroes);
136
137    p4b.createAssign(u8scope42, p4b.createAdvance(pfx4, 1));
138    p4b.createAssign(u8scope43, p4b.createAdvance(u8scope42, 1));
139    p4b.createAssign(u8scope44, p4b.createAdvance(u8scope43, 1));
[5005]140    //
141   
142    //  From the 4-byte sequence 11110abc 10defghi 10jklmno 10pqrstu,
143    //  we must calculate the value abcde - 1 to produce the bit values
144    //  for u16_hi6, hi7, lo0, lo1 at the scope43 position.
[5202]145    Var * s43_lo0 = nAb.createVar("scope43_lo0", zeroes);
146    Var * s43_lo1 = nAb.createVar("scope43_lo1", zeroes);
147    Var * s43_hi6 = nAb.createVar("scope43_hi6", zeroes);
148    Var * s43_hi7 = nAb.createVar("scope43_hi7", zeroes);
149
150    Var * s43_lo2 = main.createVar("scope43_lo2", zeroes);
151    Var * s43_lo3 = main.createVar("scope43_lo3", zeroes);
152    Var * s43_lo4 = main.createVar("scope43_lo4", zeroes);
153    Var * s43_lo5 = main.createVar("scope43_lo5", zeroes);
154    Var * s43_lo6 = main.createVar("scope43_lo6", zeroes);
155    Var * s43_lo7 = main.createVar("scope43_lo7", zeroes);
156
157    p4b.createAssign(s43_lo1, p4b.createAnd(u8scope43, p4b.createNot(bit3a1)));           // e - 1
158    p4b.createAssign(s43_lo0, p4b.createAnd(u8scope43, p4b.createXor(bit2a1, s43_lo1)));  // d - borrow
[5005]159    PabloAST * brw1 = p4b.createAnd(s43_lo1, p4b.createNot(bit2a1));
[5202]160    p4b.createAssign(s43_hi7, p4b.createAnd(u8scope43, p4b.createXor(bit7a2, brw1)));     // c - borrow
[5005]161    PabloAST * brw2 = p4b.createAnd(brw1, p4b.createNot(bit7a2));
[5202]162    p4b.createAssign(s43_hi6, p4b.createAnd(u8scope43, p4b.createXor(bit6a2, brw2)));     // b - borrow
[5005]163    //
[5202]164    p4b.createAssign(s43_lo2, p4b.createAnd(u8scope43, bit4a1));
165    p4b.createAssign(s43_lo3, p4b.createAnd(u8scope43, bit5a1));
166    p4b.createAssign(s43_lo4, p4b.createAnd(u8scope43, bit6a1));
167    p4b.createAssign(s43_lo5, p4b.createAnd(u8scope43, bit7a1));
168    p4b.createAssign(s43_lo6, p4b.createAnd(u8scope43, u8_bits[2]));
169    p4b.createAssign(s43_lo7, p4b.createAnd(u8scope43, u8_bits[3]));
[5005]170    //
171    //
[5202]172    p34b.createIf(pfx4, p4b);
[5005]173    //
174    // Combined logic for 3 and 4 byte sequences
175    //
176    PabloAST * pfx3 = ccc.compileCC(re::makeCC(0xE0, 0xEF), p34b);
177
[5202]178    p34b.createAssign(u8scope32, p34b.createAdvance(pfx3, 1));
179    p34b.createAssign(u8scope33, p34b.createAdvance(u8scope32, 1));
180
[5005]181    // Illegal 3-byte sequences
182    PabloAST * E0 = ccc.compileCC(re::makeCC(0xE0), p34b);
183    PabloAST * ED = ccc.compileCC(re::makeCC(0xED), p34b);
184    PabloAST * E0_err = p34b.createAnd(p34b.createAdvance(E0, 1), ccc.compileCC(re::makeCC(0x80, 0x9F), p34b));
185    PabloAST * ED_err = p34b.createAnd(p34b.createAdvance(ED, 1), ccc.compileCC(re::makeCC(0xA0, 0xBF), p34b));
[5202]186    Var * EX_FX_err = nAb.createVar("EX_FX_err", zeroes);
187
188    p34b.createAssign(EX_FX_err, p34b.createOr(p34b.createOr(E0_err, ED_err), FX_err));
[5005]189    // Two surrogate UTF-16 units are computed at the 3rd and 4th positions of 4-byte sequences.
190    PabloAST * surrogate = p34b.createOr(u8scope43, u8scope44);
191   
[5202]192    Var * p34del = nAb.createVar("p34del", zeroes);
193    p34b.createAssign(p34del, p34b.createOr(u8scope32, u8scope42));
[5005]194
195
196    // The high 5 bits of the UTF-16 code unit are only nonzero for 3 and 4-byte
197    // UTF-8 sequences.
[5202]198    p34b.createAssign(u16_hi[0], p34b.createOr(p34b.createAnd(u8scope33, bit4a2), surrogate));
199    p34b.createAssign(u16_hi[1], p34b.createOr(p34b.createAnd(u8scope33, bit5a2), surrogate));
200    p34b.createAssign(u16_hi[2], p34b.createAnd(u8scope33, bit6a2));
201    p34b.createAssign(u16_hi[3], p34b.createOr(p34b.createAnd(u8scope33, bit7a2), surrogate));
202    p34b.createAssign(u16_hi[4], p34b.createOr(p34b.createAnd(u8scope33, bit2a1), surrogate));
[5005]203   
204    //
[5202]205    nAb.createIf(pfx34, p34b);
[5005]206    //
207    // Combined logic for 2, 3 and 4 byte sequences
208    //
[5202]209
210    Var * u8lastscope = main.createVar("u8lastscope", zeroes);
211
[5005]212    PabloAST * pfx2 = ccc.compileCC(re::makeCC(0xC0, 0xDF), nAb);
213    PabloAST * u8scope22 = nAb.createAdvance(pfx2, 1);
[5202]214    nAb.createAssign(u8lastscope, nAb.createOr(u8scope22, nAb.createOr(u8scope33, u8scope44)));
[5005]215    PabloAST * u8anyscope = nAb.createOr(u8lastscope, p34del);
216
217    PabloAST * C0_C1_err = ccc.compileCC(re::makeCC(0xC0, 0xC1), nAb);
218    PabloAST * scope_suffix_mismatch = nAb.createXor(u8anyscope, ccc.compileCC(re::makeCC(0x80, 0xBF), nAb));
[5202]219    nAb.createAssign(error_mask, nAb.createOr(scope_suffix_mismatch, nAb.createOr(C0_C1_err, EX_FX_err)));
220    nAb.createAssign(delmask, nAb.createOr(p34del, ccc.compileCC(re::makeCC(0xC0, 0xFF), nAb)));
[5005]221   
222    // The low 3 bits of the high byte of the UTF-16 code unit as well as the high bit of the
223    // low byte are only nonzero for 2, 3 and 4 byte sequences.
[5202]224    nAb.createAssign(u16_hi[5], nAb.createOr(nAb.createAnd(u8lastscope, bit3a1), u8scope44));
225    nAb.createAssign(u16_hi[6], nAb.createOr(nAb.createAnd(u8lastscope, bit4a1), s43_hi6));
226    nAb.createAssign(u16_hi[7], nAb.createOr(nAb.createAnd(u8lastscope, bit5a1), s43_hi7));
227    nAb.createAssign(u16_lo[0], nAb.createOr(nAb.createAnd(u8lastscope, bit6a1), s43_lo0));
[5005]228
[5202]229    Var * p234_lo1 = main.createVar("p234_lo1", zeroes);
230
231    nAb.createAssign(p234_lo1, nAb.createOr(nAb.createAnd(u8lastscope, bit7a1), s43_lo1));
232
233    main.createIf(nonASCII, nAb);
[5005]234    //
235    //
236    PabloAST * ASCII = ccc.compileCC(re::makeCC(0x0, 0x7F));
[5202]237    PabloAST * last_byte = main.createOr(ASCII, u8lastscope);
238    main.createAssign(u16_lo[1], main.createOr(main.createAnd(ASCII, u8_bits[1]), p234_lo1));
239    main.createAssign(u16_lo[2], main.createOr(main.createAnd(last_byte, u8_bits[2]), s43_lo2));
240    main.createAssign(u16_lo[3], main.createOr(main.createAnd(last_byte, u8_bits[3]), s43_lo3));
241    main.createAssign(u16_lo[4], main.createOr(main.createAnd(last_byte, u8_bits[4]), s43_lo4));
242    main.createAssign(u16_lo[5], main.createOr(main.createAnd(last_byte, u8_bits[5]), s43_lo5));
243    main.createAssign(u16_lo[6], main.createOr(main.createAnd(last_byte, u8_bits[6]), s43_lo6));
244    main.createAssign(u16_lo[7], main.createOr(main.createAnd(last_byte, u8_bits[7]), s43_lo7));
[5005]245   
[5310]246    Var * output = kernel->getOutputStreamVar("u16bit");
247    Var * delmask_out = kernel->getOutputStreamVar("delMask");
248    Var * error_mask_out = kernel->getOutputStreamVar("errMask");
[5299]249   
[5005]250    for (unsigned i = 0; i < 8; i++) {
[5202]251        main.createAssign(main.createExtract(output, i), u16_hi[i]);
[5005]252    }
[5202]253    for (unsigned i = 0; i < 8; i++) {
254        main.createAssign(main.createExtract(output, i + 8), u16_lo[i]);
255    }
[5217]256    main.createAssign(main.createExtract(delmask_out, main.getInteger(0)), delmask);
257    main.createAssign(main.createExtract(error_mask_out,  main.getInteger(0)), error_mask);
[5005]258
[5414]259    pablo_function_passes(kernel.get());
260    return kernel;
[5005]261}
262
[5395]263void u8u16PipelineAVX2Gen(ParabixDriver & pxDriver) {
[5005]264
[5395]265    IDISA::IDISA_Builder * iBuilder = pxDriver.getIDISA_Builder();
266    Module * mod = iBuilder->getModule();
[5154]267    const unsigned segmentSize = codegen::SegmentSize;
[5301]268    const unsigned bufferSegments = codegen::ThreadNum+1;
[5362]269
[5217]270    assert (iBuilder);
[5154]271
[5254]272    Type * const voidTy = iBuilder->getVoidTy();
273    Type * const bitBlockType = iBuilder->getBitBlockType();
274    Type * const outputType = ArrayType::get(ArrayType::get(bitBlockType, 16), 1)->getPointerTo();
[5418]275
276    Function * const main = cast<Function>(mod->getOrInsertFunction("Main", voidTy, iBuilder->getInt32Ty(), outputType, nullptr));
[5254]277    main->setCallingConv(CallingConv::C);
278    Function::arg_iterator args = main->arg_begin();
[5418]279
280    Value * const fileDecriptor = &*(args++);
281    fileDecriptor->setName("fileDecriptor");
[5254]282    Value * const outputStream = &*(args++);
283    outputStream->setName("outputStream");
284
[5409]285    iBuilder->SetInsertPoint(BasicBlock::Create(mod->getContext(), "entry", main,0));
286   
[5355]287    // File data from mmap
[5418]288    StreamSetBuffer * ByteStream = pxDriver.addBuffer(make_unique<SourceFileBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8)));
[5409]289   
[5414]290    KernelBuilder * mmapK = pxDriver.addKernelInstance(make_unique<MMapSourceKernel>(iBuilder, segmentSize));
[5418]291    mmapK->setInitialArguments({fileDecriptor});
[5414]292    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
[5409]293   
[5355]294    // Transposed bits from s2p
[5409]295    StreamSetBuffer * BasisBits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), segmentSize * bufferSegments));
296   
[5414]297    KernelBuilder * s2pk = pxDriver.addKernelInstance(make_unique<S2PKernel>(iBuilder));
298    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
[5355]299   
300    // Calculate UTF-16 data bits through bitwise logic on u8-indexed streams.
[5409]301    StreamSetBuffer * U8u16Bits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments));
302    StreamSetBuffer * DelMask = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
303    StreamSetBuffer * ErrorMask = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
304   
[5414]305    KernelBuilder * u8u16k = pxDriver.addKernelInstance(u8u16_pablo(iBuilder));
306    pxDriver.makeKernelCall(u8u16k, {BasisBits}, {U8u16Bits, DelMask, ErrorMask});
[5154]307   
[5355]308    // Apply a deletion algorithm to discard all but the final position of the UTF-8
[5362]309    // sequences for each UTF-16 code unit. Swizzle the results.
[5409]310    StreamSetBuffer * SwizzleFields0 = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments));
311    StreamSetBuffer * SwizzleFields1 = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments));
312    StreamSetBuffer * SwizzleFields2 = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments));
313    StreamSetBuffer * SwizzleFields3 = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments));
314    StreamSetBuffer * DeletionCounts = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
315   
[5414]316    KernelBuilder * delK = pxDriver.addKernelInstance(make_unique<DeleteByPEXTkernel>(iBuilder, 64, 16, true));
317    pxDriver.makeKernelCall(delK, {U8u16Bits, DelMask}, {SwizzleFields0, SwizzleFields1, SwizzleFields2, SwizzleFields3, DeletionCounts});
[5409]318   
[5355]319    //  Produce fully compressed swizzled UTF-16 bit streams
[5409]320    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer(make_unique<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1));
321    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer(make_unique<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1));
322    StreamSetBuffer * u16Swizzle2 = pxDriver.addBuffer(make_unique<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1));
323    StreamSetBuffer * u16Swizzle3 = pxDriver.addBuffer(make_unique<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1));
324   
[5414]325    KernelBuilder * compressK = pxDriver.addKernelInstance(make_unique<SwizzledBitstreamCompressByCount>(iBuilder, 16));
326    pxDriver.makeKernelCall(compressK, {DeletionCounts, SwizzleFields0, SwizzleFields1, SwizzleFields2, SwizzleFields3},
[5409]327                           {u16Swizzle0, u16Swizzle1, u16Swizzle2, u16Swizzle3});
328   
[5355]329    // Produce unswizzled UTF-16 bit streams
[5409]330    StreamSetBuffer * u16bits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments));
[5355]331   
[5414]332    KernelBuilder * unSwizzleK = pxDriver.addKernelInstance(make_unique<SwizzleGenerator>(iBuilder, 16, 1, 4));
333    pxDriver.makeKernelCall(unSwizzleK, {u16Swizzle0, u16Swizzle1, u16Swizzle2, u16Swizzle3}, {u16bits});
[5409]334   
[5414]335    KernelBuilder * p2sk = pxDriver.addKernelInstance(make_unique<P2S16Kernel>(iBuilder));
[5409]336   
[5414]337    KernelBuilder * outK = pxDriver.addKernelInstance(make_unique<FileSink>(iBuilder, 16));
[5409]338    Value * fName = iBuilder->CreatePointerCast(iBuilder->CreateGlobalString(outputFile.c_str()), iBuilder->getInt8PtrTy());
[5414]339    outK->setInitialArguments({fName});
340       
[5409]341    // Different choices for the output buffer depending on chosen option.
342    StreamSetBuffer * U16out = nullptr;
[5355]343    if (mMapBuffering || memAlignBuffering) {
[5409]344        U16out = pxDriver.addExternalBuffer(make_unique<ExternalFileBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 16)), outputStream);
[5355]345    } else {
[5409]346        U16out = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 16), segmentSize * bufferSegments));
[5355]347    }
[5414]348    pxDriver.makeKernelCall(p2sk, {u16bits}, {U16out});
349    pxDriver.makeKernelCall(outK, {U16out}, {});
[5355]350   
[5395]351    pxDriver.generatePipelineIR();
[5409]352   
[5355]353    iBuilder->CreateRetVoid();
[5409]354   
[5395]355    pxDriver.linkAndFinalize();
[5355]356}
357
[5395]358void u8u16PipelineGen(ParabixDriver & pxDriver) {
[5355]359   
[5395]360    IDISA::IDISA_Builder * iBuilder = pxDriver.getIDISA_Builder();
361    Module * mod = iBuilder->getModule();
362   
[5355]363    const unsigned segmentSize = codegen::SegmentSize;
364    const unsigned bufferSegments = codegen::ThreadNum+1;
365   
366    assert (iBuilder);
367   
368    Type * const voidTy = iBuilder->getVoidTy();
369    Type * const bitBlockType = iBuilder->getBitBlockType();
370    Type * const outputType = ArrayType::get(ArrayType::get(bitBlockType, 16), 1)->getPointerTo();
371   
[5418]372    Function * const main = cast<Function>(mod->getOrInsertFunction("Main", voidTy, iBuilder->getInt32Ty(), outputType, nullptr));
[5355]373    main->setCallingConv(CallingConv::C);
374    Function::arg_iterator args = main->arg_begin();
375   
[5418]376    Value * const fileDecriptor = &*(args++);
377    fileDecriptor->setName("fileDecriptor");
[5355]378    Value * const outputStream = &*(args++);
379    outputStream->setName("outputStream");
[5418]380
[5409]381    iBuilder->SetInsertPoint(BasicBlock::Create(mod->getContext(), "entry", main,0));
382
383    // File data from mmap
[5418]384    StreamSetBuffer * ByteStream = pxDriver.addBuffer(make_unique<SourceFileBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8)));
[5355]385   
[5414]386    KernelBuilder * mmapK = pxDriver.addKernelInstance(make_unique<MMapSourceKernel>(iBuilder, segmentSize));
[5418]387    mmapK->setInitialArguments({fileDecriptor});
[5414]388    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
[5355]389   
[5409]390    // Transposed bits from s2p
391    StreamSetBuffer * BasisBits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), segmentSize * bufferSegments));
[5355]392   
[5414]393    KernelBuilder * s2pk = pxDriver.addKernelInstance(make_unique<S2PKernel>(iBuilder));
394    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
395   
[5409]396    // Calculate UTF-16 data bits through bitwise logic on u8-indexed streams.
397    StreamSetBuffer * U8u16Bits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments));
398    StreamSetBuffer * DelMask = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
399    StreamSetBuffer * ErrorMask = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
[5355]400   
[5414]401    KernelBuilder * u8u16k = pxDriver.addKernelInstance(u8u16_pablo(iBuilder));
402    pxDriver.makeKernelCall(u8u16k, {BasisBits}, {U8u16Bits, DelMask, ErrorMask});
[5355]403   
[5409]404    StreamSetBuffer * U16Bits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments));
405   
406    StreamSetBuffer * DeletionCounts = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments));
407
[5414]408    KernelBuilder * delK = pxDriver.addKernelInstance(make_unique<DeletionKernel>(iBuilder, iBuilder->getBitBlockWidth()/16, 16));
409    pxDriver.makeKernelCall(delK, {U8u16Bits, DelMask}, {U16Bits, DeletionCounts});
[5355]410   
[5414]411    KernelBuilder * p2sk = pxDriver.addKernelInstance(make_unique<P2S16KernelWithCompressedOutput>(iBuilder));
[5355]412   
[5414]413    KernelBuilder * outK = pxDriver.addKernelInstance(make_unique<FileSink>(iBuilder, 16));
[5409]414    Value * fName = iBuilder->CreatePointerCast(iBuilder->CreateGlobalString(outputFile.c_str()), iBuilder->getInt8PtrTy());
[5414]415    outK->setInitialArguments({fName});
[5355]416   
[5409]417    // Different choices for the output buffer depending on chosen option.
418    StreamSetBuffer * U16out = nullptr;
[5191]419    if (mMapBuffering || memAlignBuffering) {
[5409]420        U16out = pxDriver.addExternalBuffer(make_unique<ExternalFileBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 16)), outputStream);
[5217]421    } else {
[5409]422        U16out = pxDriver.addBuffer(make_unique<CircularCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 16), segmentSize * bufferSegments, 1 /*overflow block*/));
[5191]423    }
[5414]424    pxDriver.makeKernelCall(p2sk, {U16Bits, DeletionCounts}, {U16out});
425    pxDriver.makeKernelCall(outK, {U16out}, {});
[5355]426   
[5395]427    pxDriver.generatePipelineIR();
[5355]428   
[5071]429    iBuilder->CreateRetVoid();
[5401]430
[5395]431    pxDriver.linkAndFinalize();
[5071]432}
433
434
435
// Signature of the JIT-compiled "Main" entry point: the input file descriptor and an
// optional caller-provided output buffer (nullptr when no external buffer is used).
using u8u16FunctionType = void (*)(uint32_t fd, char * output_data);
[5005]437
438u8u16FunctionType u8u16CodeGen(void) {
[5176]439    LLVMContext TheContext;                           
440    Module * M = new Module("u8u16", TheContext);
[5033]441    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
[5395]442    ParabixDriver pxDriver(idb);
443   
444    if (enableAVXdel && AVX2_available() && codegen::BlockSize==256) {
445        //u8u16PipelineAVX2(M, idb)
446        u8u16PipelineAVX2Gen(pxDriver);
447    }
448    else{
449        //u8u16Pipeline(M, idb);
450        u8u16PipelineGen(pxDriver);
451    }
452    u8u16FunctionType main = reinterpret_cast<u8u16FunctionType>(pxDriver.getPointerToMain());
[5005]453
454    delete idb;
[5395]455    return main;
[5005]456}
457
// Return the size in bytes reported by fstat for the open descriptor fd,
// or 0 when fstat fails (for example, when fd is not a valid descriptor).
size_t file_size(const int fd) {
    struct stat st;
    if (fstat(fd, &st) != 0) {
        return 0;
    }
    return static_cast<size_t>(st.st_size);
}
465
[5007]466void u8u16(u8u16FunctionType fn_ptr, const std::string & fileName) {
[5418]467    const int fd = open(fileName.c_str(), O_RDONLY);
468    if (LLVM_UNLIKELY(fd == -1)) {
469        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
[5005]470    } else {
[5418]471        const auto fileSize = file_size(fd);
472        if (mMapBuffering) {
473            boost::interprocess::mapped_region outputBuffer(boost::interprocess::anonymous_shared_memory(2 * fileSize));
474            outputBuffer.advise(boost::interprocess::mapped_region::advice_willneed);
475            outputBuffer.advise(boost::interprocess::mapped_region::advice_sequential);
476            fn_ptr(fd, static_cast<char*>(outputBuffer.get_address()));
477        } else if (memAlignBuffering) {
478            char * outputBuffer;
479            const auto r = posix_memalign(reinterpret_cast<void **>(&outputBuffer), 32, 2 * fileSize);
480            if (LLVM_UNLIKELY(r != 0)) {
481                throw std::runtime_error("posix_memalign failed with return code " + std::to_string(r));
482            }
483            fn_ptr(fd, outputBuffer);
484            free(reinterpret_cast<void *>(outputBuffer));
485        } else { /* No external output buffer */
486            fn_ptr(fd, nullptr);
[5234]487        }
[5418]488        close(fd);
[5191]489    }
[5005]490}
491
492int main(int argc, char *argv[]) {
[5373]493    AddParabixVersionPrinter();
[5036]494    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&u8u16Options, pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
[5005]495    cl::ParseCommandLineOptions(argc, argv);
[5402]496    u8u16(u8u16CodeGen(), inputFile);
[5005]497    return 0;
498}
499
500                       
Note: See TracBrowser for help on using the repository browser.