source: icGREP/icgrep-devel/icgrep/u32u8.cpp

Last change on this file was 6261, checked in by nmedfort, 9 months ago

Work on OptimizationBranch?; revisited pipeline termination

File size: 18.0 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <IR_Gen/idisa_target.h>                   // for GetIDISA_Builder
8#include <cc/cc_compiler.h>                        // for CC_Compiler
9#include <kernels/deletion.h>                      // for DeletionKernel
10#include <kernels/source_kernel.h>
11#include <kernels/p2s_kernel.h>                    // for P2S16KernelWithCom...
12#include <kernels/s2p_kernel.h>                    // for S2PKernel
13#include <kernels/stdout_kernel.h>                 // for StdOutKernel_
14#include <kernels/pdep_kernel.h>
15#include <llvm/IR/Function.h>                      // for Function, Function...
16#include <llvm/IR/Module.h>                        // for Module
17#include <llvm/Support/CommandLine.h>              // for ParseCommandLineOp...
18#include <llvm/Support/Debug.h>                    // for dbgs
19#include <pablo/pablo_kernel.h>                    // for PabloKernel
20#include <pablo/pablo_toolchain.h>                 // for pablo_function_passes
21#include <kernels/kernel_builder.h>
22#include <pablo/pe_zeroes.h>
23#include <toolchain/toolchain.h>
24#include <toolchain/cpudriver.h>
25#include <kernels/streamset.h>
26#include <kernels/hex_convert.h>
27#include <llvm/ADT/StringRef.h>
28#include <llvm/IR/CallingConv.h>
29#include <llvm/IR/DerivedTypes.h>
30#include <llvm/IR/LLVMContext.h>
31#include <llvm/IR/Value.h>
32#include <llvm/Support/Compiler.h>
33#include <llvm/Support/raw_ostream.h>
34#include <pablo/builder.hpp>
35#include <fcntl.h>
36#include <kernels/pipeline_builder.h>
37
38using namespace pablo;
39using namespace kernel;
40using namespace llvm;
41using namespace codegen;
42
43static cl::OptionCategory u32u8Options("u32u8 Options", "Transcoding control options.");
44static cl::opt<std::string> inputFile(cl::Positional, cl::desc("<input file>"), cl::Required, cl::cat(u32u8Options));
45
46//
47// UTF-8 encoding requires one to four bytes per Unicode character.
48// To generate UTF-8 encoded output from sets of basis bit streams
49// representing Unicode characters (that is, codepoint-indexed streams
50// having one bit position per codepoint), deposit masks are needed
51// to identify the positions at which bits for each character are
52// to be deposited.   A UTF-8 deposit mask will have one to four bit
53// positions per character depending on the character being encoded, that is,
54// depending on the number of bytes needed to encode the character.   Within
55// each group of one to four positions for a single character, a deposit mask
56// must have exactly one 1 bit set.  Different deposit masks are used for
57// depositing bits, depending on the destination byte position within the
58// ultimate 4 byte sequence.
59//
60// The following deposit masks (shown in little-endian representation) are
61// used for depositing bits.
62//
63//  UTF-8 sequence length:          1     2     3       4
64//  Unicode bit position:
65//  Unicode codepoint bits 0-5      1    10   100    1000    u8final
66//  Bits 6-11                       1    01   010    0100    u8mask6_11
67//  Bits 12-17                      1    01   001    0010    u8mask12_17
68//  Bits 18-20                      1    01   001    0001    u8initial
69//
70//  To compute UTF-8 deposit masks, we begin by constructing an extraction
71//  mask having 4 bit positions per character, but with the number of
72//  1 bits to be kept dependent on the sequence length.  When this extraction
73//  mask is applied to the repeating constant 4-bit mask 1000, u8final above
74//  is produced.
75//
76//  UTF-8 sequence length:             1     2     3       4
77//  extraction mask                 1000  1100  1110    1111
78//  constant mask                   1000  1000  1000    1000
79//  final position mask             1     10    100     1000
80//  From this mask, other masks may subsequently be computed by
81//  bitwise logic and shifting.
82//
83//  The UTF8fieldDepositMask kernel produces this deposit mask
84//  within 64-bit fields.
85
86class UTF8fieldDepositMask final : public BlockOrientedKernel {
87public:
88    UTF8fieldDepositMask(const std::unique_ptr<KernelBuilder> & b, StreamSet * u32basis, StreamSet * u8fieldMask, StreamSet * u8unitCounts, unsigned depositFieldWidth = sizeof(size_t) * 8);
89private:
90    void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) override;
91    void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const remainingBytes) override;
92    const unsigned mDepositFieldWidth;
93};
94
95UTF8fieldDepositMask::UTF8fieldDepositMask(const std::unique_ptr<KernelBuilder> & b, StreamSet * u32basis, StreamSet * u8fieldMask, StreamSet * u8unitCounts, unsigned depositFieldWidth)
96: BlockOrientedKernel(b, "u8depositMask",
97{Binding{"basis", u32basis}},
98{Binding{"fieldDepositMask", u8fieldMask, FixedRate(4)},
99Binding{"extractionMask", u8unitCounts, FixedRate(4)}},
100{}, {}, {Binding{b->getBitBlockType(), "EOFmask"}})
101, mDepositFieldWidth(depositFieldWidth) {
102
103}
104
105
106void UTF8fieldDepositMask::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
107    Value * fileExtentMask = b->CreateNot(b->getScalarField("EOFmask"));
108    // If any of bits 16 through 20 are 1, a four-byte UTF-8 sequence is required.
109    Value * u8len4 = b->loadInputStreamBlock("basis", b->getSize(16), b->getSize(0));
110    u8len4 = b->CreateOr(u8len4, b->loadInputStreamBlock("basis", b->getSize(17), b->getSize(0)));
111    u8len4 = b->CreateOr(u8len4, b->loadInputStreamBlock("basis", b->getSize(18), b->getSize(0)));
112    u8len4 = b->CreateOr(u8len4, b->loadInputStreamBlock("basis", b->getSize(19), b->getSize(0)));
113    u8len4 = b->CreateOr(u8len4, b->loadInputStreamBlock("basis", b->getSize(20), b->getSize(0)), "u8len4");
114    u8len4 = b->CreateAnd(u8len4, fileExtentMask);
115    Value * u8len34 = u8len4;
116    // Otherwise, if any of bits 11 through 15 are 1, a three-byte UTF-8 sequence is required.
117    u8len34 = b->CreateOr(u8len34, b->loadInputStreamBlock("basis", b->getSize(11), b->getSize(0)));
118    u8len34 = b->CreateOr(u8len34, b->loadInputStreamBlock("basis", b->getSize(12), b->getSize(0)));
119    u8len34 = b->CreateOr(u8len34, b->loadInputStreamBlock("basis", b->getSize(13), b->getSize(0)));
120    u8len34 = b->CreateOr(u8len34, b->loadInputStreamBlock("basis", b->getSize(14), b->getSize(0)));
121    u8len34 = b->CreateOr(u8len34, b->loadInputStreamBlock("basis", b->getSize(15), b->getSize(0)));
122    u8len34 = b->CreateAnd(u8len34, fileExtentMask);
123    Value * nonASCII = u8len34;
124    // Otherwise, if any of bits 7 through 10 are 1, a two-byte UTF-8 sequence is required.
125    nonASCII = b->CreateOr(nonASCII, b->loadInputStreamBlock("basis", b->getSize(7), b->getSize(0)));
126    nonASCII = b->CreateOr(nonASCII, b->loadInputStreamBlock("basis", b->getSize(8), b->getSize(0)));
127    nonASCII = b->CreateOr(nonASCII, b->loadInputStreamBlock("basis", b->getSize(9), b->getSize(0)));
128    nonASCII = b->CreateOr(nonASCII, b->loadInputStreamBlock("basis", b->getSize(10), b->getSize(0)), "nonASCII");
129    nonASCII = b->CreateAnd(nonASCII, fileExtentMask);
130    //
131    //  UTF-8 sequence length:    1     2     3       4
132    //  extraction mask        1000  1100  1110    1111
133    //  interleave u8len3|u8len4, allOnes() for bits 1, 3:  x..., ..x.
134    //  interleave prefix4, u8len2|u8len3|u8len4 for bits 0, 2:  .x.., ...x
135
136    Value * maskA_lo = b->esimd_mergel(1, u8len34, fileExtentMask);
137    Value * maskA_hi = b->esimd_mergeh(1, u8len34, fileExtentMask);
138    Value * maskB_lo = b->esimd_mergel(1, u8len4, nonASCII);
139    Value * maskB_hi = b->esimd_mergeh(1, u8len4, nonASCII);
140    Value * extraction_mask[4];
141    extraction_mask[0] = b->esimd_mergel(1, maskB_lo, maskA_lo);
142    extraction_mask[1] = b->esimd_mergeh(1, maskB_lo, maskA_lo);
143    extraction_mask[2] = b->esimd_mergel(1, maskB_hi, maskA_hi);
144    extraction_mask[3] = b->esimd_mergeh(1, maskB_hi, maskA_hi);
145    const unsigned bw = b->getBitBlockWidth();
146    Constant * mask1000 = Constant::getIntegerValue(b->getIntNTy(bw), APInt::getSplat(bw, APInt::getHighBitsSet(4, 1)));
147    for (unsigned j = 0; j < 4; ++j) {
148        Value * deposit_mask = b->simd_pext(mDepositFieldWidth, mask1000, extraction_mask[j]);
149        b->storeOutputStreamBlock("fieldDepositMask", b->getSize(0), b->getSize(j), deposit_mask);
150        b->storeOutputStreamBlock("extractionMask", b->getSize(0), b->getSize(j), extraction_mask[j]);
151    }
152}
153void UTF8fieldDepositMask::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * const remainingBytes) {
154    // Standard Pablo convention for final block processing: set a bit marking
155    // the position just past EOF, as well as a mask marking all positions past EOF.
156    b->setScalarField("EOFmask", b->bitblock_mask_from(remainingBytes));
157    CreateDoBlockMethodCall(b);
158}
159
160
161//
162// Given a u8-indexed bit stream marking the final code unit position
163// of each UTF-8 sequence, this kernel computes the deposit masks
164// u8initial, u8mask12_17, and u8mask6_11.
165//
166class UTF8_DepositMasks : public pablo::PabloKernel {
167public:
168    UTF8_DepositMasks(const std::unique_ptr<KernelBuilder> & kb, StreamSet * u8final, StreamSet * u8initial, StreamSet * u8mask12_17, StreamSet * u8mask6_11);
169    bool isCachable() const override { return true; }
170    bool hasSignature() const override { return false; }
171protected:
172    void generatePabloMethod() override;
173};
174
175UTF8_DepositMasks::UTF8_DepositMasks (const std::unique_ptr<KernelBuilder> & iBuilder, StreamSet * u8final, StreamSet * u8initial, StreamSet * u8mask12_17, StreamSet * u8mask6_11)
176: PabloKernel(iBuilder, "UTF8_DepositMasks",
177              {Binding{"u8final", u8final, FixedRate(1), LookAhead(2)}},
178              {Binding{"u8initial", u8initial},
179               Binding{"u8mask12_17", u8mask12_17},
180               Binding{"u8mask6_11", u8mask6_11}}) {}
181
182void UTF8_DepositMasks::generatePabloMethod() {
183    PabloBuilder pb(getEntryScope());
184    PabloAST * u8final = pb.createExtract(getInputStreamVar("u8final"), pb.getInteger(0));
185    PabloAST * nonFinal = pb.createNot(u8final, "nonFinal");
186    PabloAST * initial = pb.createInFile(pb.createNot(pb.createAdvance(nonFinal, 1)), "u8initial");
187    PabloAST * ASCII = pb.createAnd(u8final, initial);
188    PabloAST * lookAheadFinal = pb.createLookahead(u8final, 1, "lookaheadFinal");
189    // Eliminate lookahead positions that are the final position of the prior unit.
190    PabloAST * secondLast = pb.createAnd(lookAheadFinal, nonFinal);
191    PabloAST * u8mask6_11 = pb.createInFile(pb.createOr(secondLast, ASCII, "u8mask6_11"));
192    PabloAST * prefix2 = pb.createAnd(secondLast, initial);
193    PabloAST * lookAhead2 = pb.createLookahead(u8final, 2, "lookahead2");
194    PabloAST * thirdLast = pb.createAnd(pb.createAnd(lookAhead2, nonFinal), pb.createNot(secondLast));
195    PabloAST * u8mask12_17 = pb.createInFile(pb.createOr(thirdLast, pb.createOr(prefix2, ASCII), "u8mask12_17"));
196    pb.createAssign(pb.createExtract(getOutputStreamVar("u8initial"), pb.getInteger(0)), initial);
197    pb.createAssign(pb.createExtract(getOutputStreamVar("u8mask6_11"), pb.getInteger(0)), u8mask6_11);
198    pb.createAssign(pb.createExtract(getOutputStreamVar("u8mask12_17"), pb.getInteger(0)), u8mask12_17);
199}
200
201// This kernel assembles the UTF-8 basis bit data, given four sets of deposited
202// bits 18-20, 12-17, 6-11 and 0-5, as well as the marker streams u8initial,
203// u8final, u8mask6_11 and u8mask12_17.
204//
205class UTF8assembly : public pablo::PabloKernel {
206public:
207    UTF8assembly(const std::unique_ptr<KernelBuilder> & kb,
208                 StreamSet * deposit18_20, StreamSet * deposit12_17, StreamSet * deposit6_11, StreamSet * deposit0_5,
209                 StreamSet * u8initial, StreamSet * u8final, StreamSet * u8mask6_11, StreamSet * u8mask12_17,
210                 StreamSet * u8basis);
211    bool isCachable() const override { return true; }
212    bool hasSignature() const override { return false; }
213protected:
214    void generatePabloMethod() override;
215};
216
217UTF8assembly::UTF8assembly (const std::unique_ptr<KernelBuilder> & b,
218                            StreamSet * deposit18_20, StreamSet * deposit12_17, StreamSet * deposit6_11, StreamSet * deposit0_5,
219                            StreamSet * u8initial, StreamSet * u8final, StreamSet * u8mask6_11, StreamSet * u8mask12_17,
220                            StreamSet * u8basis)
221: PabloKernel(b, "UTF8assembly",
222{Binding{"dep18_20", deposit18_20},
223 Binding{"dep12_17", deposit12_17},
224 Binding{"dep6_11", deposit6_11},
225 Binding{"dep0_5", deposit0_5},
226 Binding{"u8initial", u8initial},
227 Binding{"u8final", u8final},
228 Binding{"u8mask6_11", u8mask6_11},
229 Binding{"u8mask12_17", u8mask12_17}},
230{Binding{"u8basis", u8basis}}) {
231
232}
233
234void UTF8assembly::generatePabloMethod() {
235    PabloBuilder pb(getEntryScope());
236    std::vector<PabloAST *> dep18_20 = getInputStreamSet("dep18_20");
237    std::vector<PabloAST *> dep12_17 = getInputStreamSet("dep12_17");
238    std::vector<PabloAST *> dep6_11 = getInputStreamSet("dep6_11");
239    std::vector<PabloAST *> dep0_5 = getInputStreamSet("dep0_5");
240    PabloAST * u8initial = pb.createExtract(getInputStreamVar("u8initial"), pb.getInteger(0));
241    PabloAST * u8final = pb.createExtract(getInputStreamVar("u8final"), pb.getInteger(0));
242    PabloAST * u8mask6_11 = pb.createExtract(getInputStreamVar("u8mask6_11"), pb.getInteger(0));
243    PabloAST * u8mask12_17 = pb.createExtract(getInputStreamVar("u8mask12_17"), pb.getInteger(0));
244    PabloAST * ASCII = pb.createAnd(u8initial, u8final);
245    PabloAST * nonASCII = pb.createNot(ASCII, "nonASCII");
246    PabloAST * u8basis[8];
247    //
248    // Deposit bit 6 is either used for bit 6 of an ASCII code unit, or
249    // bit 0 for nonASCII units.   Extract the ASCII case separately.
250    PabloAST * ASCIIbit6 = pb.createAnd(dep6_11[0], ASCII);
251    dep6_11[0] = pb.createAnd(dep6_11[0], nonASCII);
252    for (unsigned i = 0; i < 6; i++) {
253        u8basis[i] = pb.createOr(dep0_5[i], dep6_11[i]);
254        u8basis[i] = pb.createOr(u8basis[i], dep12_17[i], "basis" + std::to_string(i));
255        if (i < 3) u8basis[i] = pb.createOr(u8basis[i], dep18_20[i]);
256    }
257    // The high bit of UTF-8 prefix and suffix bytes (any nonASCII byte) is always 1.
258    u8basis[7] = nonASCII;
259    // The second highest bit of UTF-8 units is 1 for any prefix, or ASCII byte with
260    // a 1 in bit 6 of the Unicode representation.
261    u8basis[6] = pb.createOr(pb.createAnd(u8initial, nonASCII), ASCIIbit6, "basis6");
262    //
263    // For any prefix of a 3-byte or 4-byte sequence the third highest bit is set to 1.
264    u8basis[5] = pb.createOr(u8basis[5], pb.createAnd(u8initial, pb.createNot(u8mask6_11)), "basis5");
265    // For any prefix of a 4-byte sequence the fourth highest bit is set to 1.
266    u8basis[4] = pb.createOr(u8basis[4], pb.createAnd(u8initial, pb.createNot(u8mask12_17)), "basis4");
267    for (unsigned i = 0; i < 8; i++) {
268        pb.createAssign(pb.createExtract(getOutputStreamVar("u8basis"), pb.getInteger(i)), u8basis[i]);
269    }
270}
271
272void deposit(const std::unique_ptr<ProgramBuilder> & P, const unsigned base, const unsigned count, StreamSet * mask, StreamSet * inputs, StreamSet * outputs) {
273    StreamSet * const expanded = P->CreateStreamSet(count);
274    P->CreateKernelCall<StreamExpandKernel>(inputs, base, mask, expanded);
275    if (AVX2_available() && BMI2_available()) {
276        P->CreateKernelCall<PDEPFieldDepositKernel>(mask, expanded, outputs);
277    } else {
278        P->CreateKernelCall<FieldDepositKernel>(mask, expanded, outputs);
279    }
280}
281
282typedef void (*u32u8FunctionType)(uint32_t fd);
283
284u32u8FunctionType u32u8_gen (CPUDriver & pxDriver) {
285
286    auto & iBuilder = pxDriver.getBuilder();
287    Type * const int32Ty = iBuilder->getInt32Ty();
288    auto P = pxDriver.makePipeline({Binding{int32Ty, "fd"}});
289
290    Scalar * const fileDescriptor = P->getInputScalar("fd");
291
292    // Source data
293    StreamSet * const codeUnitStream = P->CreateStreamSet(1, 32);
294    P->CreateKernelCall<MMapSourceKernel>(fileDescriptor, codeUnitStream);
295
296    // Source buffers for transposed UTF-32 basis bits.
297    StreamSet * const u32basis = P->CreateStreamSet(21);
298    P->CreateKernelCall<S2P_21Kernel>(codeUnitStream, u32basis);
299
300    // Buffers for calculated deposit masks.
301    StreamSet * const u8fieldMask = P->CreateStreamSet();
302    StreamSet * const u8final = P->CreateStreamSet();
303    StreamSet * const u8initial = P->CreateStreamSet();
304    StreamSet * const u8mask12_17 = P->CreateStreamSet();
305    StreamSet * const u8mask6_11 = P->CreateStreamSet();
306
307    // Intermediate buffers for deposited bits
308    StreamSet * const deposit18_20 = P->CreateStreamSet(3);
309    StreamSet * const deposit12_17 = P->CreateStreamSet(6);
310    StreamSet * const deposit6_11 = P->CreateStreamSet(6);
311    StreamSet * const deposit0_5 = P->CreateStreamSet(6);
312
313    // Final buffers for computed UTF-8 basis bits and byte stream.
314    StreamSet * const u8basis = P->CreateStreamSet(8);
315    StreamSet * const u8bytes = P->CreateStreamSet(1, 8);
316
317    // Calculate the u8final deposit mask.
318    StreamSet * const extractionMask = P->CreateStreamSet();
319    P->CreateKernelCall<UTF8fieldDepositMask>(u32basis, u8fieldMask, extractionMask);
320    P->CreateKernelCall<StreamCompressKernel>(u8fieldMask, extractionMask, u8final);
321
322    P->CreateKernelCall<UTF8_DepositMasks>(u8final, u8initial, u8mask12_17, u8mask6_11);
323
324    deposit(P, 18, 3, u8initial, u32basis, deposit18_20);
325    deposit(P, 12, 6, u8mask12_17, u32basis, deposit12_17);
326    deposit(P, 6, 6, u8mask6_11, u32basis, deposit6_11);
327    deposit(P, 0, 6, u8final, u32basis, deposit0_5);
328
329    P->CreateKernelCall<UTF8assembly>(deposit18_20, deposit12_17, deposit6_11, deposit0_5,
330                                      u8initial, u8final, u8mask6_11, u8mask12_17,
331                                      u8basis);
332
333    P->CreateKernelCall<P2SKernel>(u8basis, u8bytes);
334
335    P->CreateKernelCall<StdOutKernel>(u8bytes);
336
337    return reinterpret_cast<u32u8FunctionType>(P->compile());
338}
339
340int main(int argc, char *argv[]) {
341    codegen::ParseCommandLineOptions(argc, argv, {&u32u8Options, pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
342    CPUDriver pxDriver("u32u8");
343    auto u32u8Function = u32u8_gen(pxDriver);
344    const int fd = open(inputFile.c_str(), O_RDONLY);
345    if (LLVM_UNLIKELY(fd == -1)) {
346        errs() << "Error: cannot open " << inputFile << " for processing. Skipped.\n";
347    } else {
348        u32u8Function(fd);
349        close(fd);
350    }
351    return 0;
352}
Note: See TracBrowser for help on using the repository browser.