source: icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp @ 5881

Last change on this file since 5881 was 5881, checked in by cameron, 14 months ago

Grapheme Cluster Break kernel

File size: 13.1 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "grep_kernel.h"
7#include <boost/uuid/sha1.hpp>
8#include <re/printer_re.h>
9#include <re/re_toolchain.h>
10#include <re/re_reverse.h>
11#include <pablo/pablo_toolchain.h>
12#include <kernels/kernel_builder.h>
13#include <pablo/builder.hpp>
14#include <pablo/pe_ones.h>          // for Ones
15#include <pablo/pe_var.h>           // for Var
16#include <pablo/pe_zeroes.h>        // for Zeroes
17#include <pablo/boolean.h>
18#include <pablo/pe_count.h>
19#include <pablo/pe_matchstar.h>
20#include <cc/cc_compiler.h>         // for CC_Compiler
21#include <cc/alphabet.h>
22#include <cc/multiplex_CCs.h>
23#include <re/re_compiler.h>
24#include <llvm/Support/raw_ostream.h>
25
26using namespace kernel;
27using namespace pablo;
28using namespace re;
29using namespace llvm;
30
31inline static std::string sha1sum(const std::string & str) {
32    char buffer[41];    // 40 hex-digits and the terminating null
33    uint32_t digest[5]; // 160 bits in total
34    boost::uuids::detail::sha1 sha1;
35    sha1.process_bytes(str.c_str(), str.size());
36    sha1.get_digest(digest);
37    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
38             digest[0], digest[1], digest[2], digest[3], digest[4]);
39    return std::string(buffer);
40}
41
42void RequiredStreams_UTF8::generatePabloMethod() {
43    PabloBuilder pb(getEntryScope());
44    cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"));
45   
46    PabloAST * const LF = pb.createExtract(getInput(1), pb.getInteger(0), "LF");
47    PabloAST * const CR = ccc.compileCC(makeByte(0x0D));
48    PabloAST * const LF_VT_FF_CR = ccc.compileCC("LF,VT,FF,CR", makeByte(0x0A, 0x0D), pb);
49    Var * const LineBreak = pb.createVar("LineBreak", LF_VT_FF_CR);
50   
51    // Remove the CR of any CR+LF
52    Var * const CRLF = pb.createVar("CRLF", pb.createZeroes());
53    auto crb = pb.createScope();
54    pb.createIf(CR, crb);
55    PabloAST * const lookaheadLF = crb.createLookahead(LF, 1, "lookaheadLF");
56    PabloAST * const crlf = crb.createAnd(CR, lookaheadLF);
57    crb.createAssign(CRLF, crlf);
58    PabloAST * removedCRLF = crb.createAnd(LineBreak, crb.createNot(CRLF));
59    crb.createAssign(LineBreak, removedCRLF);
60
61   
62    Zeroes * const ZEROES = pb.createZeroes();
63    PabloAST * const u8pfx = ccc.compileCC(makeByte(0xC0, 0xFF));
64
65
66    Var * const nonFinal = pb.createVar("nonFinal", u8pfx);
67    Var * const u8invalid = pb.createVar("u8invalid", ZEROES);
68    Var * const valid_pfx = pb.createVar("valid_pfx", u8pfx);
69
70    auto it = pb.createScope();
71    pb.createIf(u8pfx, it);
72    PabloAST * const u8pfx2 = ccc.compileCC(makeByte(0xC2, 0xDF), it);
73    PabloAST * const u8pfx3 = ccc.compileCC(makeByte(0xE0, 0xEF), it);
74    PabloAST * const u8pfx4 = ccc.compileCC(makeByte(0xF0, 0xF4), it);
75    PabloAST * const u8suffix = ccc.compileCC("u8suffix", makeByte(0x80, 0xBF), it);
76   
77    //
78    // Two-byte sequences
79    Var * const anyscope = it.createVar("anyscope", ZEROES);
80    auto it2 = it.createScope();
81    it.createIf(u8pfx2, it2);
82    it2.createAssign(anyscope, it2.createAdvance(u8pfx2, 1));
83    PabloAST * NEL = it2.createAnd(it2.createAdvance(ccc.compileCC(makeByte(0xC2), it2), 1), ccc.compileCC(makeByte(0x85), it2), "NEL");
84    it2.createAssign(LineBreak, it2.createOr(LineBreak, NEL));
85
86
87    //
88    // Three-byte sequences   
89    Var * const EF_invalid = it.createVar("EF_invalid", ZEROES);
90    auto it3 = it.createScope();
91    it.createIf(u8pfx3, it3);
92    PabloAST * const u8scope32 = it3.createAdvance(u8pfx3, 1);
93    it3.createAssign(nonFinal, it3.createOr(nonFinal, u8scope32));
94    PabloAST * const u8scope33 = it3.createAdvance(u8pfx3, 2);
95    PabloAST * const u8scope3X = it3.createOr(u8scope32, u8scope33);
96    it3.createAssign(anyscope, it3.createOr(anyscope, u8scope3X));
97    PabloAST * const E0_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xE0), it3), 1), ccc.compileCC(makeByte(0x80, 0x9F), it3));
98    PabloAST * const ED_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xED), it3), 1), ccc.compileCC(makeByte(0xA0, 0xBF), it3));
99    PabloAST * const EX_invalid = it3.createOr(E0_invalid, ED_invalid);
100    it3.createAssign(EF_invalid, EX_invalid);
101    PabloAST * E2_80 = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xE2), it3), 1), ccc.compileCC(makeByte(0x80), it3));
102    PabloAST * LS_PS = it3.createAnd(it3.createAdvance(E2_80, 1), ccc.compileCC(makeByte(0xA8,0xA9), it3), "LS_PS");
103    it3.createAssign(LineBreak, it3.createOr(LineBreak, LS_PS));
104
105    //
106    // Four-byte sequences
107    auto it4 = it.createScope();
108    it.createIf(u8pfx4, it4);
109    PabloAST * const u8scope42 = it4.createAdvance(u8pfx4, 1, "u8scope42");
110    PabloAST * const u8scope43 = it4.createAdvance(u8scope42, 1, "u8scope43");
111    PabloAST * const u8scope44 = it4.createAdvance(u8scope43, 1, "u8scope44");
112    PabloAST * const u8scope4nonfinal = it4.createOr(u8scope42, u8scope43);
113    it4.createAssign(nonFinal, it4.createOr(nonFinal, u8scope4nonfinal));
114    PabloAST * const u8scope4X = it4.createOr(u8scope4nonfinal, u8scope44);
115    it4.createAssign(anyscope, it4.createOr(anyscope, u8scope4X));
116    PabloAST * const F0_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeByte(0xF0), it4), 1), ccc.compileCC(makeByte(0x80, 0x8F), it4));
117    PabloAST * const F4_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeByte(0xF4), it4), 1), ccc.compileCC(makeByte(0x90, 0xBF), it4));
118    PabloAST * const FX_invalid = it4.createOr(F0_invalid, F4_invalid);
119    it4.createAssign(EF_invalid, it4.createOr(EF_invalid, FX_invalid));
120   
121    //
122    // Invalid cases
123    PabloAST * const legalpfx = it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4);
124    //  Any scope that does not have a suffix byte, and any suffix byte that is not in
125    //  a scope is a mismatch, i.e., invalid UTF-8.
126    PabloAST * const mismatch = it.createXor(anyscope, u8suffix);
127    //
128    PabloAST * const pfx_invalid = it.createXor(valid_pfx, legalpfx);
129    it.createAssign(u8invalid, it.createOr(pfx_invalid, it.createOr(mismatch, EF_invalid)));
130    PabloAST * const u8valid = it.createNot(u8invalid, "u8valid");
131    //
132    //
133    it.createAssign(nonFinal, it.createAnd(nonFinal, u8valid));
134    pb.createAssign(nonFinal, pb.createOr(nonFinal, CRLF));
135    PabloAST * unterminatedLineAtEOF = pb.createAtEOF(pb.createAdvance(pb.createNot(LineBreak), 1), "unterminatedLineAtEOF");
136   
137    Var * const required = getOutputStreamVar("nonFinal");
138    pb.createAssign(pb.createExtract(required, pb.getInteger(0)), nonFinal);
139    pb.createAssign(pb.createExtract(getOutputStreamVar("UnicodeLB"), pb.getInteger(0)), pb.createOr(LineBreak, unterminatedLineAtEOF, "EOL"));
140}
141
142RequiredStreams_UTF8::RequiredStreams_UTF8(const std::unique_ptr<kernel::KernelBuilder> & kb)
143: PabloKernel(kb, "RequiredStreams_UTF8",
144// input
145{Binding{kb->getStreamSetTy(8), "basis"}, Binding{kb->getStreamSetTy(1), "lf", FixedRate(), LookAhead(1)}},
146// output
147{Binding{kb->getStreamSetTy(1), "nonFinal", FixedRate()},
148 Binding{kb->getStreamSetTy(1), "UnicodeLB", FixedRate(), Add1()}}) {
149
150}
151
152void RequiredStreams_UTF16::generatePabloMethod() {
153    PabloBuilder pb(getEntryScope());
154    cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"));
155   
156    PabloAST * u16hi_hi_surrogate = ccc.compileCC(makeCC(0xD800, 0xDBFF, &cc::UTF16));    //u16hi_hi_surrogate = [\xD8-\xDB]
157    PabloAST * u16hi_lo_surrogate = ccc.compileCC(makeCC(0xDC00, 0xDFFF, &cc::UTF16));    //u16hi_lo_surrogate = [\xDC-\xDF]
158   
159    PabloAST * invalidTemp = pb.createAdvance(u16hi_hi_surrogate, 1, "InvalidTemp");
160    PabloAST * u16invalid = pb.createXor(invalidTemp, u16hi_lo_surrogate, "u16invalid");
161
162    PabloAST * u16valid = pb.createNot(u16invalid, "u16valid");
163    PabloAST * nonFinal = pb.createAnd(u16hi_hi_surrogate, u16valid, "nonfinal");
164
165    PabloAST * u16single_temp = pb.createOr(ccc.compileCC(makeCC(0x0000, 0xD7FF, &cc::UTF16)), ccc.compileCC(makeCC(0xE000, 0xFFFF, &cc::UTF16)));
166    PabloAST * u16single = pb.createAnd(u16single_temp, pb.createNot(u16invalid));
167
168    PabloAST * const nonFinalCodeUnits = pb.createExtract(getInput(1), pb.getInteger(0));
169    PabloAST * const initial = pb.createOr(u16single, u16hi_hi_surrogate, "initial");
170    PabloAST * const final = pb.createNot(pb.createOr(pb.createOr(u16hi_hi_surrogate, u16invalid), nonFinalCodeUnits), "final");
171
172    Var * const required = getOutputStreamVar("required");
173    pb.createAssign(pb.createExtract(required, pb.getInteger(0)), initial);
174    pb.createAssign(pb.createExtract(required, pb.getInteger(1)), nonFinal);
175    pb.createAssign(pb.createExtract(required, pb.getInteger(2)), final);
176
177}
178
179RequiredStreams_UTF16::RequiredStreams_UTF16(const std::unique_ptr<kernel::KernelBuilder> & kb)
180: PabloKernel(kb, "RequiredStreams_UTF16",               
181// inputs
182{Binding{kb->getStreamSetTy(8), "basis"}},
183// output
184{Binding{kb->getStreamSetTy(3), "required", FixedRate(), Add1()}}) {
185
186}
187
188ICGrepSignature::ICGrepSignature(re::RE * const re_ast)
189: mRE(re_ast)
190, mSignature(Printer_RE::PrintRE(mRE)) {
191   
192}
193
194// Helper to compute stream set inputs to pass into PabloKernel constructor.
195inline std::vector<Binding> icGrepInputs(const std::unique_ptr<kernel::KernelBuilder> & b,
196                                         const std::vector<std::string> & externals,
197                                         const std::vector<cc::Alphabet *> & alphabets) {
198    std::vector<Binding> streamSetInputs = {
199        Binding{b->getStreamSetTy(8), "basis"},
200    };
201    for (auto & e : externals) {
202        streamSetInputs.push_back(Binding{b->getStreamSetTy(1, 1), e});
203    }
204    for (const auto & alphabet : alphabets) {
205        unsigned basis_size = cast<cc::MultiplexedAlphabet>(alphabet)->getMultiplexedCCs().size();
206        streamSetInputs.push_back(Binding{b->getStreamSetTy(basis_size, 1), alphabet->getName() + "_basis"});
207    }
208    return streamSetInputs;
209}
210
211ICGrepKernel::ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & b, RE * const re, std::vector<std::string> externals, std::vector<cc::Alphabet *> alphabets)
212: ICGrepSignature(re)
213, PabloKernel(b, "ic" + sha1sum(mSignature),
214// inputs
215icGrepInputs(b, externals, alphabets),
216// output
217{Binding{b->getStreamSetTy(1, 1), "matches", FixedRate(), Add1()}})
218, mExternals(externals)
219, mAlphabets(alphabets) {
220}
221
222std::string ICGrepKernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> &) {
223    return mSignature;
224}
225
226void ICGrepKernel::generatePabloMethod() {
227    PabloBuilder pb(getEntryScope());
228    cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"));
229    RE_Compiler re_compiler(this, ccc);
230    for (auto & e : mExternals) {
231        re_compiler.addPrecompiled(e, pb.createExtract(getInputStreamVar(e), pb.getInteger(0)));
232    }
233    for (auto a : mAlphabets) {
234        auto mpx_basis = getInputStreamSet(a->getName() + "_basis");
235        re_compiler.addAlphabet(a, mpx_basis);
236    }
237    PabloAST * const matches = re_compiler.compile(mRE);
238    Var * const output = getOutputStreamVar("matches");
239    pb.createAssign(pb.createExtract(output, pb.getInteger(0)), matches);
240}
241
242void MatchedLinesKernel::generatePabloMethod() {
243    PabloBuilder pb(getEntryScope());
244    PabloAST * matchResults = pb.createExtract(getInputStreamVar("matchResults"), pb.getInteger(0));
245    PabloAST * lineBreaks = pb.createExtract(getInputStreamVar("lineBreaks"), pb.getInteger(0));
246    PabloAST * notLB = pb.createNot(lineBreaks);
247    PabloAST * match_follow = pb.createMatchStar(matchResults, notLB);
248    Var * const matchedLines = getOutputStreamVar("matchedLines");
249    pb.createAssign(pb.createExtract(matchedLines, pb.getInteger(0)), pb.createAnd(match_follow, lineBreaks));
250}
251
252MatchedLinesKernel::MatchedLinesKernel (const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
253: PabloKernel(iBuilder, "MatchedLines",
254// inputs
255{Binding{iBuilder->getStreamSetTy(1), "matchResults"}
256,Binding{iBuilder->getStreamSetTy(1), "lineBreaks"}},
257// output
258{Binding{iBuilder->getStreamSetTy(1), "matchedLines"}}) {
259
260}
261
262
263void InvertMatchesKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
264    Value * input = iBuilder->loadInputStreamBlock("matchedLines", iBuilder->getInt32(0));
265    Value * lbs = iBuilder->loadInputStreamBlock("lineBreaks", iBuilder->getInt32(0));
266    Value * inverted = iBuilder->CreateXor(input, lbs);
267    iBuilder->storeOutputStreamBlock("nonMatches", iBuilder->getInt32(0), inverted);
268}
269
270InvertMatchesKernel::InvertMatchesKernel(const std::unique_ptr<kernel::KernelBuilder> & builder)
271: BlockOrientedKernel("Invert",
272// Inputs
273{Binding{builder->getStreamSetTy(1, 1), "matchedLines"}, Binding{builder->getStreamSetTy(1, 1), "lineBreaks"}},
274// Outputs
275{Binding{builder->getStreamSetTy(1, 1), "nonMatches"}},
276// Input/Output Scalars and internal state
277{}, {}, {}) {
278
279}
280
281
282void PopcountKernel::generatePabloMethod() {
283    auto pb = this->getEntryScope();
284    const auto toCount = pb->createExtract(getInputStreamVar("toCount"), pb->getInteger(0));
285    pablo::Var * countResult = getOutputScalarVar("countResult");
286    pb->createAssign(countResult, pb->createCount(toCount));
287}
288
289PopcountKernel::PopcountKernel (const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
290: PabloKernel(iBuilder, "Popcount",
291{Binding{iBuilder->getStreamSetTy(1), "toCount"}},
292{},
293{},
294{Binding{iBuilder->getSizeTy(), "countResult"}}) {
295
296}
Note: See TracBrowser for help on using the repository browser.