source: icGREP/icgrep-devel/icgrep/u32u8.cpp @ 6079

Last change on this file since 6079 was 6075, checked in by cameron, 17 months ago

Final block processing for u32u8 mask calculation

File size: 19.2 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <IR_Gen/idisa_target.h>                   // for GetIDISA_Builder
8#include <cc/cc_compiler.h>                        // for CC_Compiler
9#include <kernels/deletion.h>                      // for DeletionKernel
10#include <kernels/source_kernel.h>
11#include <kernels/p2s_kernel.h>                    // for P2S16KernelWithCom...
12#include <kernels/s2p_kernel.h>                    // for S2PKernel
13#include <kernels/stdout_kernel.h>                 // for StdOutKernel_
14#include <kernels/pdep_kernel.h>
15#include <llvm/IR/Function.h>                      // for Function, Function...
16#include <llvm/IR/Module.h>                        // for Module
17#include <llvm/Support/CommandLine.h>              // for ParseCommandLineOp...
18#include <llvm/Support/Debug.h>                    // for dbgs
19#include <pablo/pablo_kernel.h>                    // for PabloKernel
20#include <pablo/pablo_toolchain.h>                 // for pablo_function_passes
21#include <kernels/kernel_builder.h>
22#include <pablo/pe_zeroes.h>
23#include <toolchain/toolchain.h>
24#include <toolchain/cpudriver.h>
25#include <kernels/streamset.h>
26#include <llvm/ADT/StringRef.h>
27#include <llvm/IR/CallingConv.h>
28#include <llvm/IR/DerivedTypes.h>
29#include <llvm/IR/LLVMContext.h>
30#include <llvm/IR/Value.h>
31#include <llvm/Support/Compiler.h>
32#include <llvm/Support/raw_ostream.h>
33#include <pablo/builder.hpp>
34#include <fcntl.h>
35
36using namespace pablo;
37using namespace kernel;
38using namespace parabix;
39using namespace llvm;
40
41static cl::OptionCategory u32u8Options("u32u8 Options", "Transcoding control options.");
42static cl::opt<std::string> inputFile(cl::Positional, cl::desc("<input file>"), cl::Required, cl::cat(u32u8Options));
43
44//
45// UTF-8 encoding requires one to four bytes per Unicode character.
46// To generate UTF-8 encoded output from sets of basis bit streams
47// representing Unicode characters (that is, codepoint-indexed streams
48// having one bit position per codepoint), deposit masks are needed
49// to identify the positions at which bits for each character are
50// to be deposited.   A UTF-8 deposit mask will have one to four bit
51// positions per character depending on the character being encoded, that is,
52// depending on the number of bytes needed to encode the character.   Within
53// each group of one to four positions for a single character, a deposit mask
54// must have exactly one 1 bit set.  Different deposit masks are used for
55// depositing bits, depending on the destination byte position within the
56// ultimate 4 byte sequence.
57//
58// The following deposit masks (shown in little-endian representation) are
59// used for depositing bits.
60//
61//  UTF-8 sequence length:          1     2     3       4
62//  Unicode bit position:
63//  Unicode codepoint bits 0-5      1    10   100    1000    u8final
64//  Bits 6-11                       1    01   010    0100    u8mask6_11
65//  Bits 12-17                      1    01   001    0010    u8mask12_17
66//  Bits 18-20                      1    01   001    0001    u8initial
67//
68//  To compute UTF-8 deposit masks, we begin by constructing an extraction
69//  mask having 4 bit positions per character, but with the number of
70//  1 bits to be kept dependent on the sequence length.  When this extraction
71//  mask is applied to the repeating constant 4-bit mask 1000, u8final above
72//  is produced. 
73//
74//  UTF-8 sequence length:             1     2     3       4
75//  extraction mask                 1000  1100  1110    1111
76//  constant mask                   1000  1000  1000    1000
77//  final position mask             1     10    100     1000
//  From this mask, other masks may subsequently be computed by
//  bitwise logic and shifting.
80//
81//  The UTF8fieldDepositMask kernel produces this deposit mask
82//  within 64-bit fields.
83
84class UTF8fieldDepositMask final : public BlockOrientedKernel {
85public:
86    UTF8fieldDepositMask(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned depositFieldWidth = 64);
87private:
88    void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & b) override;
89    void generateFinalBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & b, llvm::Value * const remainingBytes) override;
90    const unsigned mDepositFieldWidth;
91};
92
93UTF8fieldDepositMask::UTF8fieldDepositMask(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned depositFieldWidth)
94: BlockOrientedKernel("u8depositMask",
95            {Binding{b->getStreamSetTy(1, 21), "basis"}},
96            {Binding{b->getStreamSetTy(1, 1), "fieldDepositMask", FixedRate(4)}, 
97                Binding{b->getStreamSetTy(1, 1), "codeUnitCounts", FixedRate(4), RoundUpTo(b->getBitBlockWidth())}},
98            {}, {}, {Binding{b->getBitBlockType(), "EOFmask"}}), mDepositFieldWidth(depositFieldWidth) {
99}
100
101
102void UTF8fieldDepositMask::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
103    // If any of bits 16 through 20 are 1, a four-byte UTF-8 sequence is required.
104    Value * u8len4 = b->loadInputStreamBlock("basis", b->getSize(16), b->getSize(0));
105    u8len4 = b->CreateOr(u8len4, b->loadInputStreamBlock("basis", b->getSize(17), b->getSize(0)));
106    u8len4 = b->CreateOr(u8len4, b->loadInputStreamBlock("basis", b->getSize(18), b->getSize(0)));
107    u8len4 = b->CreateOr(u8len4, b->loadInputStreamBlock("basis", b->getSize(19), b->getSize(0)));
108    u8len4 = b->CreateOr(u8len4, b->loadInputStreamBlock("basis", b->getSize(20), b->getSize(0)), "u8len4");
109    Value * u8len34 = u8len4;
110    // Otherwise, if any of bits 11 through 15 are 1, a three-byte UTF-8 sequence is required.
111    u8len34 = b->CreateOr(u8len34, b->loadInputStreamBlock("basis", b->getSize(11), b->getSize(0)));
112    u8len34 = b->CreateOr(u8len34, b->loadInputStreamBlock("basis", b->getSize(12), b->getSize(0)));
113    u8len34 = b->CreateOr(u8len34, b->loadInputStreamBlock("basis", b->getSize(13), b->getSize(0)));
114    u8len34 = b->CreateOr(u8len34, b->loadInputStreamBlock("basis", b->getSize(14), b->getSize(0)));
115    u8len34 = b->CreateOr(u8len34, b->loadInputStreamBlock("basis", b->getSize(15), b->getSize(0)));
116    Value * nonASCII = u8len34;
117    // Otherwise, if any of bits 7 through 10 are 1, a two-byte UTF-8 sequence is required.
118    nonASCII = b->CreateOr(nonASCII, b->loadInputStreamBlock("basis", b->getSize(7), b->getSize(0)));
119    nonASCII = b->CreateOr(nonASCII, b->loadInputStreamBlock("basis", b->getSize(8), b->getSize(0)));
120    nonASCII = b->CreateOr(nonASCII, b->loadInputStreamBlock("basis", b->getSize(9), b->getSize(0)));
121    nonASCII = b->CreateOr(nonASCII, b->loadInputStreamBlock("basis", b->getSize(10), b->getSize(0)), "nonASCII");
122    //
123    //  UTF-8 sequence length:    1     2     3       4
124    //  extraction mask        1000  1100  1110    1111
125    //  interleave u8len3|u8len4, allOnes() for bits 1, 3:  x..., ..x.
126    //  interleave prefix4, u8len2|u8len3|u8len4 for bits 0, 2:  .x.., ...x
127    Value * fileExtentMask = b->CreateNot(b->getScalarField("EOFmask"));
128   
129    Value * maskA_lo = b->esimd_mergel(1, u8len34, fileExtentMask);
130    Value * maskA_hi = b->esimd_mergeh(1, u8len34, fileExtentMask);
131    Value * maskB_lo = b->esimd_mergel(1, u8len4, nonASCII);
132    Value * maskB_hi = b->esimd_mergeh(1, u8len4, nonASCII);
133    Value * extraction_mask[4];
134    extraction_mask[0] = b->esimd_mergel(1, maskB_lo, maskA_lo);
135    extraction_mask[1] = b->esimd_mergeh(1, maskB_lo, maskA_lo);
136    extraction_mask[2] = b->esimd_mergel(1, maskB_hi, maskA_hi);
137    extraction_mask[3] = b->esimd_mergeh(1, maskB_hi, maskA_hi);
138    const unsigned bw = b->getBitBlockWidth();
139    Constant * mask1000 = Constant::getIntegerValue(b->getIntNTy(bw), APInt::getSplat(bw, APInt::getHighBitsSet(4, 1)));
140    for (unsigned j = 0; j < 4; ++j) {
141        Value * deposit_mask = b->simd_pext(mDepositFieldWidth, mask1000, extraction_mask[j]);
142        Value * unit_counts = b->simd_popcount(mDepositFieldWidth, extraction_mask[j]);
143        b->storeOutputStreamBlock("fieldDepositMask", b->getSize(0), b->getSize(j), deposit_mask);
144        b->storeOutputStreamBlock("codeUnitCounts", b->getSize(0), b->getSize(j), unit_counts);
145    }
146}
147void UTF8fieldDepositMask::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * const remainingBytes) {
148    // Standard Pablo convention for final block processing: set a bit marking
149    // the position just past EOF, as well as a mask marking all positions past EOF.
150    b->setScalarField("EOFmask", b->bitblock_mask_from(remainingBytes));
151    CreateDoBlockMethodCall(b);
152}
153
154
155//
156// Given a u8-indexed bit stream marking the final code unit position
157// of each UTF-8 sequence, this kernel computes the deposit masks
158// u8initial, u8mask12_17, and u8mask6_11.
159//
160class UTF8_DepositMasks : public pablo::PabloKernel {
161public:
162    UTF8_DepositMasks(const std::unique_ptr<kernel::KernelBuilder> & kb);
163    bool isCachable() const override { return true; }
164    bool hasSignature() const override { return false; }
165protected:
166    void generatePabloMethod() override;
167};
168
169UTF8_DepositMasks::UTF8_DepositMasks (const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
170: PabloKernel(iBuilder, "UTF8_DepositMasks",
171              {Binding{iBuilder->getStreamSetTy(1), "u8final", FixedRate(1), LookAhead(2)}},
172              {Binding{iBuilder->getStreamSetTy(1), "u8initial"},
173               Binding{iBuilder->getStreamSetTy(1), "u8mask12_17"},
174               Binding{iBuilder->getStreamSetTy(1), "u8mask6_11"}}) {}
175
176void UTF8_DepositMasks::generatePabloMethod() {
177    PabloBuilder pb(getEntryScope());
178    PabloAST * u8final = pb.createExtract(getInputStreamVar("u8final"), pb.getInteger(0));
179    PabloAST * nonFinal = pb.createNot(u8final, "nonFinal");
180    PabloAST * initial = pb.createInFile(pb.createNot(pb.createAdvance(nonFinal, 1)), "u8initial");
181    PabloAST * ASCII = pb.createAnd(u8final, initial);
182    PabloAST * lookAheadFinal = pb.createLookahead(u8final, 1, "lookaheadFinal");
183    // Eliminate lookahead positions that are the final position of the prior unit.
184    PabloAST * secondLast = pb.createAnd(lookAheadFinal, nonFinal);
185    PabloAST * u8mask6_11 = pb.createOr(secondLast, ASCII, "u8mask6_11");
186    PabloAST * prefix2 = pb.createAnd(secondLast, initial);
187    PabloAST * lookAhead2 = pb.createLookahead(u8final, 2, "lookahead2");
188    PabloAST * thirdLast = pb.createAnd(pb.createAnd(lookAhead2, nonFinal), pb.createNot(secondLast));
189    PabloAST * u8mask12_17 = pb.createOr(thirdLast, pb.createOr(prefix2, ASCII), "u8mask12_17");
190    pb.createAssign(pb.createExtract(getOutputStreamVar("u8initial"), pb.getInteger(0)), initial);
191    pb.createAssign(pb.createExtract(getOutputStreamVar("u8mask6_11"), pb.getInteger(0)), u8mask6_11);
192    pb.createAssign(pb.createExtract(getOutputStreamVar("u8mask12_17"), pb.getInteger(0)), u8mask12_17);
193}
194
195// This kernel assembles the UTF-8 basis bit data, given four sets of deposited
196// bits bits 18-20, 11-17, 6-11 and 0-5, as weil as the marker streams u8initial,
197// u8final, u8prefix3 and u8prefix4.
198//
199class UTF8assembly : public pablo::PabloKernel {
200public:
201    UTF8assembly(const std::unique_ptr<kernel::KernelBuilder> & kb);
202    bool isCachable() const override { return true; }
203    bool hasSignature() const override { return false; }
204protected:
205    void generatePabloMethod() override;
206};
207
208UTF8assembly::UTF8assembly (const std::unique_ptr<kernel::KernelBuilder> & b)
209: PabloKernel(b, "UTF8assembly",
210              {Binding{b->getStreamSetTy(3), "dep18_20"},
211                Binding{b->getStreamSetTy(6), "dep12_17"},
212                Binding{b->getStreamSetTy(6), "dep6_11"},
213                Binding{b->getStreamSetTy(6), "dep0_5"},
214                Binding{b->getStreamSetTy(1), "u8initial"},
215                Binding{b->getStreamSetTy(1), "u8final"},
216                Binding{b->getStreamSetTy(1), "u8mask6_11"},
217                Binding{b->getStreamSetTy(1), "u8mask12_17"}},
218              {Binding{b->getStreamSetTy(8), "u8basis"}}) {}
219
220void UTF8assembly::generatePabloMethod() {
221    PabloBuilder pb(getEntryScope());
222    std::vector<PabloAST *> dep18_20 = getInputStreamSet("dep18_20");
223    std::vector<PabloAST *> dep12_17 = getInputStreamSet("dep12_17");
224    std::vector<PabloAST *> dep6_11 = getInputStreamSet("dep6_11");
225    std::vector<PabloAST *> dep0_5 = getInputStreamSet("dep0_5");
226    PabloAST * u8initial = pb.createExtract(getInputStreamVar("u8initial"), pb.getInteger(0));
227    PabloAST * u8final = pb.createExtract(getInputStreamVar("u8final"), pb.getInteger(0));
228    PabloAST * u8mask6_11 = pb.createExtract(getInputStreamVar("u8mask6_11"), pb.getInteger(0));
229    PabloAST * u8mask12_17 = pb.createExtract(getInputStreamVar("u8mask12_17"), pb.getInteger(0));
230    PabloAST * ASCII = pb.createAnd(u8initial, u8final);
231    PabloAST * nonASCII = pb.createNot(ASCII, "nonASCII");
232    PabloAST * u8basis[8];
233    //
234    // Deposit bit 6 is either used for bit 6 of an ASCII code unit, or
235    // bit 0 for nonASCII units.   Extract the ASCII case separately.
236    PabloAST * ASCIIbit6 = pb.createAnd(dep6_11[0], ASCII);
237    dep6_11[0] = pb.createAnd(dep6_11[0], nonASCII);
238    for (unsigned i = 0; i < 6; i++) {
239        u8basis[i] = pb.createOr(dep0_5[i], dep6_11[i]);
240        u8basis[i] = pb.createOr(u8basis[i], dep12_17[i], "basis" + std::to_string(i));
241        if (i < 3) u8basis[i] = pb.createOr(u8basis[i], dep18_20[i]);
242    }
243    // The high bit of UTF-8 prefix and suffix bytes (any nonASCII byte) is always 1.
244    u8basis[7] = nonASCII;
245    // The second highest bit of UTF-8 units is 1 for any prefix, or ASCII byte with
246    // a 1 in bit 6 of the Unicode representation.
247    u8basis[6] = pb.createOr(pb.createAnd(u8initial, nonASCII), ASCIIbit6, "basis6");
248    //
249    // For any prefix of a 3-byte or 4-byte sequence the third highest bit is set to 1.
250    u8basis[5] = pb.createOr(u8basis[5], pb.createAnd(u8initial, pb.createNot(u8mask6_11)), "basis5");
251    // For any prefix of a 4-byte sequence the fourth highest bit is set to 1.
252    u8basis[4] = pb.createOr(u8basis[4], pb.createAnd(u8initial, pb.createNot(u8mask12_17)), "basis4");
253    for (unsigned i = 0; i < 8; i++) {
254        pb.createAssign(pb.createExtract(getOutputStreamVar("u8basis"), pb.getInteger(i)), u8basis[i]);
255    }
256}
257
258void u32u8_gen (ParabixDriver & pxDriver) {
259    auto & idb = pxDriver.getBuilder();
260    Module * mod = idb->getModule();
261
262    const unsigned u32buffersize = codegen::SegmentSize * codegen::ThreadNum;
263        const unsigned u8buffersize = 4 * u32buffersize;
264
265    Type * const voidTy = idb->getVoidTy();
266   
267    Function * const main = cast<Function>(mod->getOrInsertFunction("Main", voidTy, idb->getInt32Ty(), nullptr));
268    main->setCallingConv(CallingConv::C);
269    Function::arg_iterator args = main->arg_begin();
270   
271    Value * const fileDecriptor = &*(args++);
272    fileDecriptor->setName("fileDecriptor");
273   
274    idb->SetInsertPoint(BasicBlock::Create(mod->getContext(), "entry", main,0));
275   
276    // File data from mmap
277    StreamSetBuffer * codeUnitStream = pxDriver.addBuffer<ExternalBuffer>(idb, idb->getStreamSetTy(1, 32));
278   
279    Kernel * mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(idb, 32);
280    mmapK->setInitialArguments({fileDecriptor});
281    pxDriver.makeKernelCall(mmapK, {}, {codeUnitStream});
282   
283    // Source buffers for transposed UTF-32 basis bits.
284    StreamSetBuffer * u32basis = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(21), u32buffersize);
285   
286    kernel::Kernel * s2p21K = pxDriver.addKernelInstance<S2P_21Kernel>(idb);
287    pxDriver.makeKernelCall(s2p21K, {codeUnitStream}, {u32basis});
288
289    StreamSetBuffer * u8unitCounts = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1), u8buffersize);
290
291        // Buffers for calculated deposit masks.
292    StreamSetBuffer * u8fieldMask = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1), u8buffersize);
293    StreamSetBuffer * u8final = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1), u8buffersize);
294    StreamSetBuffer * u8initial = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1), u8buffersize);
295    StreamSetBuffer * u8mask12_17 = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1), u8buffersize);
296    StreamSetBuffer * u8mask6_11 = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1), u8buffersize);
297
298    // Intermediate buffers for deposited bits
299    StreamSetBuffer * deposit18_20 = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(3), u8buffersize);
300    StreamSetBuffer * deposit12_17 = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(6), u8buffersize);
301    StreamSetBuffer * deposit6_11 = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(6), u8buffersize);
302    StreamSetBuffer * deposit0_5 = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(6), u8buffersize);
303
304    // Final buffers for computed UTF-8 basis bits and byte stream.
305    StreamSetBuffer * u8basis = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(8), u8buffersize);
306    StreamSetBuffer * u8bytes = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 8), u8buffersize);
307
308    // Calculate the u8final deposit mask.
309    kernel::Kernel * fieldDepositMaskK = pxDriver.addKernelInstance<UTF8fieldDepositMask>(idb);
310    pxDriver.makeKernelCall(fieldDepositMaskK, {u32basis}, {u8fieldMask, u8unitCounts});
311    kernel::Kernel * streamK = pxDriver.addKernelInstance<StreamCompressKernel>(idb, 64, 1);
312    pxDriver.makeKernelCall(streamK, {u8fieldMask, u8unitCounts}, {u8final});
313
314    kernel::Kernel * maskK = pxDriver.addKernelInstance<UTF8_DepositMasks>(idb);
315    pxDriver.makeKernelCall(maskK, {u8final}, {u8initial, u8mask12_17, u8mask6_11});
316   
317    StreamDepositCompiler deposit18_20compiler(pxDriver, 21, 18, 3, u32buffersize);
318    deposit18_20compiler.makeCall(u8initial, u32basis, deposit18_20);
319   
320    StreamDepositCompiler deposit12_17compiler(pxDriver, 21, 12, 6, u32buffersize);
321    deposit12_17compiler.makeCall(u8mask12_17, u32basis, deposit12_17);
322   
323    StreamDepositCompiler deposit6_11compiler(pxDriver, 21, 6, 6, u32buffersize);
324    deposit6_11compiler.makeCall(u8mask6_11, u32basis, deposit6_11);
325   
326    StreamDepositCompiler deposit0_5compiler(pxDriver, 21, 0, 6, u32buffersize);
327    deposit0_5compiler.makeCall(u8final, u32basis, deposit0_5);
328   
329    kernel::Kernel * u8assemblyK = pxDriver.addKernelInstance<UTF8assembly>(idb);
330    pxDriver.makeKernelCall(u8assemblyK, {deposit18_20, deposit12_17, deposit6_11, deposit0_5,
331                                          u8initial, u8final, u8mask6_11, u8mask12_17},
332                                         {u8basis});
333
334    kernel::Kernel * p2sK = pxDriver.addKernelInstance<P2SKernel>(idb);
335    pxDriver.makeKernelCall(p2sK, {u8basis}, {u8bytes});
336
337    kernel::Kernel * outK = pxDriver.addKernelInstance<StdOutKernel>(idb, 8);
338    pxDriver.makeKernelCall(outK, {u8bytes}, {});
339
340    pxDriver.generatePipelineIR();
341
342    pxDriver.deallocateBuffers();
343
344    idb->CreateRetVoid();
345
346    pxDriver.finalizeObject();
347}
348
// Signature of the generated entry point: takes the input file descriptor.
using u32u8FunctionType = void (*)(uint32_t fd);
350
351int main(int argc, char *argv[]) {
352    codegen::ParseCommandLineOptions(argc, argv, {&u32u8Options, pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
353    ParabixDriver pxDriver("u32u8");
354    u32u8_gen(pxDriver);
355    auto u32u8Function = reinterpret_cast<u32u8FunctionType>(pxDriver.getMain());
356    const int fd = open(inputFile.c_str(), O_RDONLY);
357    if (LLVM_UNLIKELY(fd == -1)) {
358        errs() << "Error: cannot open " << inputFile << " for processing. Skipped.\n";
359    } else {
360        u32u8Function(fd);
361        close(fd);
362    }
363    return 0;
364}
Note: See TracBrowser for help on using the repository browser.