source: icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp @ 5888

Last change on this file since 5888 was 5888, checked in by cameron, 17 months ago

Allow RE compilers to be associated with any Pablo block, not just kernel entry

File size: 13.1 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "grep_kernel.h"
7#include <boost/uuid/sha1.hpp>
8#include <re/printer_re.h>
9#include <re/re_toolchain.h>
10#include <re/re_reverse.h>
11#include <pablo/pablo_toolchain.h>
12#include <kernels/kernel_builder.h>
13#include <pablo/builder.hpp>
14#include <pablo/pe_ones.h>          // for Ones
15#include <pablo/pe_var.h>           // for Var
16#include <pablo/pe_zeroes.h>        // for Zeroes
17#include <pablo/boolean.h>
18#include <pablo/pe_count.h>
19#include <pablo/pe_matchstar.h>
20#include <cc/cc_compiler.h>         // for CC_Compiler
21#include <cc/alphabet.h>
22#include <cc/multiplex_CCs.h>
23#include <re/re_compiler.h>
24#include <llvm/Support/raw_ostream.h>
25
26using namespace kernel;
27using namespace pablo;
28using namespace re;
29using namespace llvm;
30
31inline static std::string sha1sum(const std::string & str) {
32    char buffer[41];    // 40 hex-digits and the terminating null
33    uint32_t digest[5]; // 160 bits in total
34    boost::uuids::detail::sha1 sha1;
35    sha1.process_bytes(str.c_str(), str.size());
36    sha1.get_digest(digest);
37    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
38             digest[0], digest[1], digest[2], digest[3], digest[4]);
39    return std::string(buffer);
40}
41
42void RequiredStreams_UTF8::generatePabloMethod() {
43    PabloBuilder pb(getEntryScope());
44    cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"));
45   
46    PabloAST * const LF = pb.createExtract(getInput(1), pb.getInteger(0), "LF");
47    PabloAST * const CR = ccc.compileCC(makeByte(0x0D));
48    PabloAST * const LF_VT_FF_CR = ccc.compileCC("LF,VT,FF,CR", makeByte(0x0A, 0x0D), pb);
49    Var * const LineBreak = pb.createVar("LineBreak", LF_VT_FF_CR);
50   
51    // Remove the CR of any CR+LF
52    Var * const CRLF = pb.createVar("CRLF", pb.createZeroes());
53    auto crb = pb.createScope();
54    pb.createIf(CR, crb);
55    PabloAST * const lookaheadLF = crb.createLookahead(LF, 1, "lookaheadLF");
56    PabloAST * const crlf = crb.createAnd(CR, lookaheadLF);
57    crb.createAssign(CRLF, crlf);
58    PabloAST * removedCRLF = crb.createAnd(LineBreak, crb.createNot(CRLF));
59    crb.createAssign(LineBreak, removedCRLF);
60
61   
62    Zeroes * const ZEROES = pb.createZeroes();
63    PabloAST * const u8pfx = ccc.compileCC(makeByte(0xC0, 0xFF));
64
65
66    Var * const nonFinal = pb.createVar("nonFinal", u8pfx);
67    Var * const u8invalid = pb.createVar("u8invalid", ZEROES);
68    Var * const valid_pfx = pb.createVar("valid_pfx", u8pfx);
69
70    auto it = pb.createScope();
71    pb.createIf(u8pfx, it);
72    PabloAST * const u8pfx2 = ccc.compileCC(makeByte(0xC2, 0xDF), it);
73    PabloAST * const u8pfx3 = ccc.compileCC(makeByte(0xE0, 0xEF), it);
74    PabloAST * const u8pfx4 = ccc.compileCC(makeByte(0xF0, 0xF4), it);
75    PabloAST * const u8suffix = ccc.compileCC("u8suffix", makeByte(0x80, 0xBF), it);
76   
77    //
78    // Two-byte sequences
79    Var * const anyscope = it.createVar("anyscope", ZEROES);
80    auto it2 = it.createScope();
81    it.createIf(u8pfx2, it2);
82    it2.createAssign(anyscope, it2.createAdvance(u8pfx2, 1));
83    PabloAST * NEL = it2.createAnd(it2.createAdvance(ccc.compileCC(makeByte(0xC2), it2), 1), ccc.compileCC(makeByte(0x85), it2), "NEL");
84    it2.createAssign(LineBreak, it2.createOr(LineBreak, NEL));
85
86
87    //
88    // Three-byte sequences   
89    Var * const EF_invalid = it.createVar("EF_invalid", ZEROES);
90    auto it3 = it.createScope();
91    it.createIf(u8pfx3, it3);
92    PabloAST * const u8scope32 = it3.createAdvance(u8pfx3, 1);
93    it3.createAssign(nonFinal, it3.createOr(nonFinal, u8scope32));
94    PabloAST * const u8scope33 = it3.createAdvance(u8pfx3, 2);
95    PabloAST * const u8scope3X = it3.createOr(u8scope32, u8scope33);
96    it3.createAssign(anyscope, it3.createOr(anyscope, u8scope3X));
97    PabloAST * const E0_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xE0), it3), 1), ccc.compileCC(makeByte(0x80, 0x9F), it3));
98    PabloAST * const ED_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xED), it3), 1), ccc.compileCC(makeByte(0xA0, 0xBF), it3));
99    PabloAST * const EX_invalid = it3.createOr(E0_invalid, ED_invalid);
100    it3.createAssign(EF_invalid, EX_invalid);
101    PabloAST * E2_80 = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xE2), it3), 1), ccc.compileCC(makeByte(0x80), it3));
102    PabloAST * LS_PS = it3.createAnd(it3.createAdvance(E2_80, 1), ccc.compileCC(makeByte(0xA8,0xA9), it3), "LS_PS");
103    it3.createAssign(LineBreak, it3.createOr(LineBreak, LS_PS));
104
105    //
106    // Four-byte sequences
107    auto it4 = it.createScope();
108    it.createIf(u8pfx4, it4);
109    PabloAST * const u8scope42 = it4.createAdvance(u8pfx4, 1, "u8scope42");
110    PabloAST * const u8scope43 = it4.createAdvance(u8scope42, 1, "u8scope43");
111    PabloAST * const u8scope44 = it4.createAdvance(u8scope43, 1, "u8scope44");
112    PabloAST * const u8scope4nonfinal = it4.createOr(u8scope42, u8scope43);
113    it4.createAssign(nonFinal, it4.createOr(nonFinal, u8scope4nonfinal));
114    PabloAST * const u8scope4X = it4.createOr(u8scope4nonfinal, u8scope44);
115    it4.createAssign(anyscope, it4.createOr(anyscope, u8scope4X));
116    PabloAST * const F0_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeByte(0xF0), it4), 1), ccc.compileCC(makeByte(0x80, 0x8F), it4));
117    PabloAST * const F4_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeByte(0xF4), it4), 1), ccc.compileCC(makeByte(0x90, 0xBF), it4));
118    PabloAST * const FX_invalid = it4.createOr(F0_invalid, F4_invalid);
119    it4.createAssign(EF_invalid, it4.createOr(EF_invalid, FX_invalid));
120   
121    //
122    // Invalid cases
123    PabloAST * const legalpfx = it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4);
124    //  Any scope that does not have a suffix byte, and any suffix byte that is not in
125    //  a scope is a mismatch, i.e., invalid UTF-8.
126    PabloAST * const mismatch = it.createXor(anyscope, u8suffix);
127    //
128    PabloAST * const pfx_invalid = it.createXor(valid_pfx, legalpfx);
129    it.createAssign(u8invalid, it.createOr(pfx_invalid, it.createOr(mismatch, EF_invalid)));
130    PabloAST * const u8valid = it.createNot(u8invalid, "u8valid");
131    //
132    //
133    it.createAssign(nonFinal, it.createAnd(nonFinal, u8valid));
134    pb.createAssign(nonFinal, pb.createOr(nonFinal, CRLF));
135    PabloAST * unterminatedLineAtEOF = pb.createAtEOF(pb.createAdvance(pb.createNot(LineBreak), 1), "unterminatedLineAtEOF");
136   
137    Var * const required = getOutputStreamVar("nonFinal");
138    pb.createAssign(pb.createExtract(required, pb.getInteger(0)), nonFinal);
139    pb.createAssign(pb.createExtract(getOutputStreamVar("UnicodeLB"), pb.getInteger(0)), pb.createOr(LineBreak, unterminatedLineAtEOF, "EOL"));
140}
141
142RequiredStreams_UTF8::RequiredStreams_UTF8(const std::unique_ptr<kernel::KernelBuilder> & kb)
143: PabloKernel(kb, "RequiredStreams_UTF8",
144// input
145{Binding{kb->getStreamSetTy(8), "basis"}, Binding{kb->getStreamSetTy(1), "lf", FixedRate(), LookAhead(1)}},
146// output
147{Binding{kb->getStreamSetTy(1), "nonFinal", FixedRate()},
148 Binding{kb->getStreamSetTy(1), "UnicodeLB", FixedRate(), Add1()}}) {
149
150}
151
152void RequiredStreams_UTF16::generatePabloMethod() {
153    PabloBuilder pb(getEntryScope());
154    cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"));
155   
156    PabloAST * u16hi_hi_surrogate = ccc.compileCC(makeCC(0xD800, 0xDBFF, &cc::UTF16));    //u16hi_hi_surrogate = [\xD8-\xDB]
157    PabloAST * u16hi_lo_surrogate = ccc.compileCC(makeCC(0xDC00, 0xDFFF, &cc::UTF16));    //u16hi_lo_surrogate = [\xDC-\xDF]
158   
159    PabloAST * invalidTemp = pb.createAdvance(u16hi_hi_surrogate, 1, "InvalidTemp");
160    PabloAST * u16invalid = pb.createXor(invalidTemp, u16hi_lo_surrogate, "u16invalid");
161
162    PabloAST * u16valid = pb.createNot(u16invalid, "u16valid");
163    PabloAST * nonFinal = pb.createAnd(u16hi_hi_surrogate, u16valid, "nonfinal");
164
165    PabloAST * u16single_temp = pb.createOr(ccc.compileCC(makeCC(0x0000, 0xD7FF, &cc::UTF16)), ccc.compileCC(makeCC(0xE000, 0xFFFF, &cc::UTF16)));
166    PabloAST * u16single = pb.createAnd(u16single_temp, pb.createNot(u16invalid));
167
168    PabloAST * const nonFinalCodeUnits = pb.createExtract(getInput(1), pb.getInteger(0));
169    PabloAST * const initial = pb.createOr(u16single, u16hi_hi_surrogate, "initial");
170    PabloAST * const final = pb.createNot(pb.createOr(pb.createOr(u16hi_hi_surrogate, u16invalid), nonFinalCodeUnits), "final");
171
172    Var * const required = getOutputStreamVar("required");
173    pb.createAssign(pb.createExtract(required, pb.getInteger(0)), initial);
174    pb.createAssign(pb.createExtract(required, pb.getInteger(1)), nonFinal);
175    pb.createAssign(pb.createExtract(required, pb.getInteger(2)), final);
176
177}
178
179RequiredStreams_UTF16::RequiredStreams_UTF16(const std::unique_ptr<kernel::KernelBuilder> & kb)
180: PabloKernel(kb, "RequiredStreams_UTF16",               
181// inputs
182{Binding{kb->getStreamSetTy(8), "basis"}},
183// output
184{Binding{kb->getStreamSetTy(3), "required", FixedRate(), Add1()}}) {
185
186}
187
188ICGrepSignature::ICGrepSignature(re::RE * const re_ast)
189: mRE(re_ast)
190, mSignature(Printer_RE::PrintRE(mRE)) {
191   
192}
193
194// Helper to compute stream set inputs to pass into PabloKernel constructor.
195inline std::vector<Binding> icGrepInputs(const std::unique_ptr<kernel::KernelBuilder> & b,
196                                         const std::vector<std::string> & externals,
197                                         const std::vector<cc::Alphabet *> & alphabets) {
198    std::vector<Binding> streamSetInputs = {
199        Binding{b->getStreamSetTy(8), "basis"},
200    };
201    for (auto & e : externals) {
202        streamSetInputs.push_back(Binding{b->getStreamSetTy(1, 1), e});
203    }
204    for (const auto & alphabet : alphabets) {
205        unsigned basis_size = cast<cc::MultiplexedAlphabet>(alphabet)->getMultiplexedCCs().size();
206        streamSetInputs.push_back(Binding{b->getStreamSetTy(basis_size, 1), alphabet->getName() + "_basis"});
207    }
208    return streamSetInputs;
209}
210
211ICGrepKernel::ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & b, RE * const re, std::vector<std::string> externals, std::vector<cc::Alphabet *> alphabets)
212: ICGrepSignature(re)
213, PabloKernel(b, "ic" + sha1sum(mSignature),
214// inputs
215icGrepInputs(b, externals, alphabets),
216// output
217{Binding{b->getStreamSetTy(1, 1), "matches", FixedRate(), Add1()}})
218, mExternals(externals)
219, mAlphabets(alphabets) {
220}
221
222std::string ICGrepKernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> &) {
223    return mSignature;
224}
225
226void ICGrepKernel::generatePabloMethod() {
227    PabloBuilder pb(getEntryScope());
228    cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"));
229    RE_Compiler re_compiler(getEntryScope(), ccc);
230    for (auto & e : mExternals) {
231        re_compiler.addPrecompiled(e, pb.createExtract(getInputStreamVar(e), pb.getInteger(0)));
232    }
233    for (auto a : mAlphabets) {
234        auto mpx_basis = getInputStreamSet(a->getName() + "_basis");
235        re_compiler.addAlphabet(a, mpx_basis);
236    }
237    PabloAST * const matches = re_compiler.compile(mRE);
238    Var * const output = getOutputStreamVar("matches");
239    pb.createAssign(pb.createExtract(output, pb.getInteger(0)), matches);
240}
241
242void MatchedLinesKernel::generatePabloMethod() {
243    PabloBuilder pb(getEntryScope());
244    PabloAST * matchResults = pb.createExtract(getInputStreamVar("matchResults"), pb.getInteger(0));
245    PabloAST * lineBreaks = pb.createExtract(getInputStreamVar("lineBreaks"), pb.getInteger(0));
246    PabloAST * notLB = pb.createNot(lineBreaks);
247    PabloAST * match_follow = pb.createMatchStar(matchResults, notLB);
248    Var * const matchedLines = getOutputStreamVar("matchedLines");
249    pb.createAssign(pb.createExtract(matchedLines, pb.getInteger(0)), pb.createAnd(match_follow, lineBreaks));
250}
251
252MatchedLinesKernel::MatchedLinesKernel (const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
253: PabloKernel(iBuilder, "MatchedLines",
254// inputs
255{Binding{iBuilder->getStreamSetTy(1), "matchResults"}
256,Binding{iBuilder->getStreamSetTy(1), "lineBreaks"}},
257// output
258{Binding{iBuilder->getStreamSetTy(1), "matchedLines"}}) {
259
260}
261
262
263void InvertMatchesKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
264    Value * input = iBuilder->loadInputStreamBlock("matchedLines", iBuilder->getInt32(0));
265    Value * lbs = iBuilder->loadInputStreamBlock("lineBreaks", iBuilder->getInt32(0));
266    Value * inverted = iBuilder->CreateXor(input, lbs);
267    iBuilder->storeOutputStreamBlock("nonMatches", iBuilder->getInt32(0), inverted);
268}
269
270InvertMatchesKernel::InvertMatchesKernel(const std::unique_ptr<kernel::KernelBuilder> & builder)
271: BlockOrientedKernel("Invert",
272// Inputs
273{Binding{builder->getStreamSetTy(1, 1), "matchedLines"}, Binding{builder->getStreamSetTy(1, 1), "lineBreaks"}},
274// Outputs
275{Binding{builder->getStreamSetTy(1, 1), "nonMatches"}},
276// Input/Output Scalars and internal state
277{}, {}, {}) {
278
279}
280
281
282void PopcountKernel::generatePabloMethod() {
283    auto pb = this->getEntryScope();
284    const auto toCount = pb->createExtract(getInputStreamVar("toCount"), pb->getInteger(0));
285    pablo::Var * countResult = getOutputScalarVar("countResult");
286    pb->createAssign(countResult, pb->createCount(toCount));
287}
288
289PopcountKernel::PopcountKernel (const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
290: PabloKernel(iBuilder, "Popcount",
291{Binding{iBuilder->getStreamSetTy(1), "toCount"}},
292{},
293{},
294{Binding{iBuilder->getSizeTy(), "countResult"}}) {
295
296}
Note: See TracBrowser for help on using the repository browser.