source: icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp @ 5890

Last change on this file since 5890 was 5890, checked in by cameron, 14 months ago

Fixes for byte/bit kernel; match results

File size: 16.1 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "grep_kernel.h"
7#include <boost/uuid/sha1.hpp>
8#include <re/printer_re.h>
9#include <re/re_toolchain.h>
10#include <re/re_reverse.h>
11#include <pablo/codegenstate.h>
12#include <pablo/pablo_toolchain.h>
13#include <kernels/kernel_builder.h>
14#include <pablo/builder.hpp>
15#include <pablo/pe_ones.h>          // for Ones
16#include <pablo/pe_var.h>           // for Var
17#include <pablo/pe_zeroes.h>        // for Zeroes
18#include <pablo/boolean.h>
19#include <pablo/pe_count.h>
20#include <pablo/pe_matchstar.h>
21#include <pablo/pe_pack.h>
22#include <cc/cc_compiler.h>         // for CC_Compiler
23#include <cc/alphabet.h>
24#include <cc/multiplex_CCs.h>
25#include <re/re_compiler.h>
26#include <llvm/Support/raw_ostream.h>
27
28using namespace kernel;
29using namespace pablo;
30using namespace re;
31using namespace llvm;
32
33inline static std::string sha1sum(const std::string & str) {
34    char buffer[41];    // 40 hex-digits and the terminating null
35    uint32_t digest[5]; // 160 bits in total
36    boost::uuids::detail::sha1 sha1;
37    sha1.process_bytes(str.c_str(), str.size());
38    sha1.get_digest(digest);
39    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
40             digest[0], digest[1], digest[2], digest[3], digest[4]);
41    return std::string(buffer);
42}
43
44void RequiredStreams_UTF8::generatePabloMethod() {
45    PabloBuilder pb(getEntryScope());
46    cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"));
47   
48    PabloAST * const LF = pb.createExtract(getInput(1), pb.getInteger(0), "LF");
49    PabloAST * const CR = ccc.compileCC(makeByte(0x0D));
50    PabloAST * const LF_VT_FF_CR = ccc.compileCC("LF,VT,FF,CR", makeByte(0x0A, 0x0D), pb);
51    Var * const LineBreak = pb.createVar("LineBreak", LF_VT_FF_CR);
52   
53    // Remove the CR of any CR+LF
54    Var * const CRLF = pb.createVar("CRLF", pb.createZeroes());
55    auto crb = pb.createScope();
56    pb.createIf(CR, crb);
57    PabloAST * const lookaheadLF = crb.createLookahead(LF, 1, "lookaheadLF");
58    PabloAST * const crlf = crb.createAnd(CR, lookaheadLF);
59    crb.createAssign(CRLF, crlf);
60    PabloAST * removedCRLF = crb.createAnd(LineBreak, crb.createNot(CRLF));
61    crb.createAssign(LineBreak, removedCRLF);
62
63   
64    Zeroes * const ZEROES = pb.createZeroes();
65    PabloAST * const u8pfx = ccc.compileCC(makeByte(0xC0, 0xFF));
66
67
68    Var * const nonFinal = pb.createVar("nonFinal", u8pfx);
69    Var * const u8invalid = pb.createVar("u8invalid", ZEROES);
70    Var * const valid_pfx = pb.createVar("valid_pfx", u8pfx);
71
72    auto it = pb.createScope();
73    pb.createIf(u8pfx, it);
74    PabloAST * const u8pfx2 = ccc.compileCC(makeByte(0xC2, 0xDF), it);
75    PabloAST * const u8pfx3 = ccc.compileCC(makeByte(0xE0, 0xEF), it);
76    PabloAST * const u8pfx4 = ccc.compileCC(makeByte(0xF0, 0xF4), it);
77    PabloAST * const u8suffix = ccc.compileCC("u8suffix", makeByte(0x80, 0xBF), it);
78   
79    //
80    // Two-byte sequences
81    Var * const anyscope = it.createVar("anyscope", ZEROES);
82    auto it2 = it.createScope();
83    it.createIf(u8pfx2, it2);
84    it2.createAssign(anyscope, it2.createAdvance(u8pfx2, 1));
85    PabloAST * NEL = it2.createAnd(it2.createAdvance(ccc.compileCC(makeByte(0xC2), it2), 1), ccc.compileCC(makeByte(0x85), it2), "NEL");
86    it2.createAssign(LineBreak, it2.createOr(LineBreak, NEL));
87
88
89    //
90    // Three-byte sequences   
91    Var * const EF_invalid = it.createVar("EF_invalid", ZEROES);
92    auto it3 = it.createScope();
93    it.createIf(u8pfx3, it3);
94    PabloAST * const u8scope32 = it3.createAdvance(u8pfx3, 1);
95    it3.createAssign(nonFinal, it3.createOr(nonFinal, u8scope32));
96    PabloAST * const u8scope33 = it3.createAdvance(u8pfx3, 2);
97    PabloAST * const u8scope3X = it3.createOr(u8scope32, u8scope33);
98    it3.createAssign(anyscope, it3.createOr(anyscope, u8scope3X));
99    PabloAST * const E0_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xE0), it3), 1), ccc.compileCC(makeByte(0x80, 0x9F), it3));
100    PabloAST * const ED_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xED), it3), 1), ccc.compileCC(makeByte(0xA0, 0xBF), it3));
101    PabloAST * const EX_invalid = it3.createOr(E0_invalid, ED_invalid);
102    it3.createAssign(EF_invalid, EX_invalid);
103    PabloAST * E2_80 = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xE2), it3), 1), ccc.compileCC(makeByte(0x80), it3));
104    PabloAST * LS_PS = it3.createAnd(it3.createAdvance(E2_80, 1), ccc.compileCC(makeByte(0xA8,0xA9), it3), "LS_PS");
105    it3.createAssign(LineBreak, it3.createOr(LineBreak, LS_PS));
106
107    //
108    // Four-byte sequences
109    auto it4 = it.createScope();
110    it.createIf(u8pfx4, it4);
111    PabloAST * const u8scope42 = it4.createAdvance(u8pfx4, 1, "u8scope42");
112    PabloAST * const u8scope43 = it4.createAdvance(u8scope42, 1, "u8scope43");
113    PabloAST * const u8scope44 = it4.createAdvance(u8scope43, 1, "u8scope44");
114    PabloAST * const u8scope4nonfinal = it4.createOr(u8scope42, u8scope43);
115    it4.createAssign(nonFinal, it4.createOr(nonFinal, u8scope4nonfinal));
116    PabloAST * const u8scope4X = it4.createOr(u8scope4nonfinal, u8scope44);
117    it4.createAssign(anyscope, it4.createOr(anyscope, u8scope4X));
118    PabloAST * const F0_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeByte(0xF0), it4), 1), ccc.compileCC(makeByte(0x80, 0x8F), it4));
119    PabloAST * const F4_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeByte(0xF4), it4), 1), ccc.compileCC(makeByte(0x90, 0xBF), it4));
120    PabloAST * const FX_invalid = it4.createOr(F0_invalid, F4_invalid);
121    it4.createAssign(EF_invalid, it4.createOr(EF_invalid, FX_invalid));
122   
123    //
124    // Invalid cases
125    PabloAST * const legalpfx = it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4);
126    //  Any scope that does not have a suffix byte, and any suffix byte that is not in
127    //  a scope is a mismatch, i.e., invalid UTF-8.
128    PabloAST * const mismatch = it.createXor(anyscope, u8suffix);
129    //
130    PabloAST * const pfx_invalid = it.createXor(valid_pfx, legalpfx);
131    it.createAssign(u8invalid, it.createOr(pfx_invalid, it.createOr(mismatch, EF_invalid)));
132    PabloAST * const u8valid = it.createNot(u8invalid, "u8valid");
133    //
134    //
135    it.createAssign(nonFinal, it.createAnd(nonFinal, u8valid));
136    pb.createAssign(nonFinal, pb.createOr(nonFinal, CRLF));
137    PabloAST * unterminatedLineAtEOF = pb.createAtEOF(pb.createAdvance(pb.createNot(LineBreak), 1), "unterminatedLineAtEOF");
138   
139    Var * const required = getOutputStreamVar("nonFinal");
140    pb.createAssign(pb.createExtract(required, pb.getInteger(0)), nonFinal);
141    pb.createAssign(pb.createExtract(getOutputStreamVar("UnicodeLB"), pb.getInteger(0)), pb.createOr(LineBreak, unterminatedLineAtEOF, "EOL"));
142}
143
144RequiredStreams_UTF8::RequiredStreams_UTF8(const std::unique_ptr<kernel::KernelBuilder> & kb)
145: PabloKernel(kb, "RequiredStreams_UTF8",
146// input
147{Binding{kb->getStreamSetTy(8), "basis"}, Binding{kb->getStreamSetTy(1), "lf", FixedRate(), LookAhead(1)}},
148// output
149{Binding{kb->getStreamSetTy(1), "nonFinal", FixedRate()},
150 Binding{kb->getStreamSetTy(1), "UnicodeLB", FixedRate()}}) {
151
152}
153
154void RequiredStreams_UTF16::generatePabloMethod() {
155    PabloBuilder pb(getEntryScope());
156    cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"));
157   
158    PabloAST * u16hi_hi_surrogate = ccc.compileCC(makeCC(0xD800, 0xDBFF, &cc::UTF16));    //u16hi_hi_surrogate = [\xD8-\xDB]
159    PabloAST * u16hi_lo_surrogate = ccc.compileCC(makeCC(0xDC00, 0xDFFF, &cc::UTF16));    //u16hi_lo_surrogate = [\xDC-\xDF]
160   
161    PabloAST * invalidTemp = pb.createAdvance(u16hi_hi_surrogate, 1, "InvalidTemp");
162    PabloAST * u16invalid = pb.createXor(invalidTemp, u16hi_lo_surrogate, "u16invalid");
163
164    PabloAST * u16valid = pb.createNot(u16invalid, "u16valid");
165    PabloAST * nonFinal = pb.createAnd(u16hi_hi_surrogate, u16valid, "nonfinal");
166
167    PabloAST * u16single_temp = pb.createOr(ccc.compileCC(makeCC(0x0000, 0xD7FF, &cc::UTF16)), ccc.compileCC(makeCC(0xE000, 0xFFFF, &cc::UTF16)));
168    PabloAST * u16single = pb.createAnd(u16single_temp, pb.createNot(u16invalid));
169
170    PabloAST * const nonFinalCodeUnits = pb.createExtract(getInput(1), pb.getInteger(0));
171    PabloAST * const initial = pb.createOr(u16single, u16hi_hi_surrogate, "initial");
172    PabloAST * const final = pb.createNot(pb.createOr(pb.createOr(u16hi_hi_surrogate, u16invalid), nonFinalCodeUnits), "final");
173
174    Var * const required = getOutputStreamVar("required");
175    pb.createAssign(pb.createExtract(required, pb.getInteger(0)), initial);
176    pb.createAssign(pb.createExtract(required, pb.getInteger(1)), nonFinal);
177    pb.createAssign(pb.createExtract(required, pb.getInteger(2)), final);
178
179}
180
181RequiredStreams_UTF16::RequiredStreams_UTF16(const std::unique_ptr<kernel::KernelBuilder> & kb)
182: PabloKernel(kb, "RequiredStreams_UTF16",               
183// inputs
184{Binding{kb->getStreamSetTy(8), "basis"}},
185// output
186{Binding{kb->getStreamSetTy(3), "required", FixedRate(), Add1()}}) {
187
188}
189
190ICGrepSignature::ICGrepSignature(re::RE * const re_ast)
191: mRE(re_ast)
192, mSignature(Printer_RE::PrintRE(mRE)) {
193   
194}
195
196// Helper to compute stream set inputs to pass into PabloKernel constructor.
197inline std::vector<Binding> icGrepInputs(const std::unique_ptr<kernel::KernelBuilder> & b,
198                                         const std::vector<std::string> & externals,
199                                         const std::vector<cc::Alphabet *> & alphabets) {
200    std::vector<Binding> streamSetInputs = {
201        Binding{b->getStreamSetTy(8), "basis"},
202    };
203    for (auto & e : externals) {
204        streamSetInputs.push_back(Binding{b->getStreamSetTy(1, 1), e});
205    }
206    for (const auto & alphabet : alphabets) {
207        unsigned basis_size = cast<cc::MultiplexedAlphabet>(alphabet)->getMultiplexedCCs().size();
208        streamSetInputs.push_back(Binding{b->getStreamSetTy(basis_size, 1), alphabet->getName() + "_basis"});
209    }
210    return streamSetInputs;
211}
212
213ICGrepKernel::ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & b, RE * const re, std::vector<std::string> externals, std::vector<cc::Alphabet *> alphabets)
214: ICGrepSignature(re)
215, PabloKernel(b, "ic" + sha1sum(mSignature),
216// inputs
217icGrepInputs(b, externals, alphabets),
218// output
219{Binding{b->getStreamSetTy(1, 1), "matches", FixedRate(), Add1()}})
220, mExternals(externals)
221, mAlphabets(alphabets) {
222}
223
224std::string ICGrepKernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> &) {
225    return mSignature;
226}
227
228void ICGrepKernel::generatePabloMethod() {
229    PabloBuilder pb(getEntryScope());
230    cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"));
231    RE_Compiler re_compiler(getEntryScope(), ccc);
232    for (auto & e : mExternals) {
233        re_compiler.addPrecompiled(e, pb.createExtract(getInputStreamVar(e), pb.getInteger(0)));
234    }
235    for (auto a : mAlphabets) {
236        auto mpx_basis = getInputStreamSet(a->getName() + "_basis");
237        re_compiler.addAlphabet(a, mpx_basis);
238    }
239    PabloAST * const matches = re_compiler.compile(mRE);
240    Var * const output = getOutputStreamVar("matches");
241    pb.createAssign(pb.createExtract(output, pb.getInteger(0)), matches);
242}
243
244// Helper to compute stream set inputs to pass into PabloKernel constructor.
245inline std::vector<Binding> byteBitGrepInputs(const std::unique_ptr<kernel::KernelBuilder> & b,
246                                         const std::vector<std::string> & externals) {
247    std::vector<Binding> streamSetInputs = {
248        Binding{b->getStreamSetTy(1, 8), "bytedata"},
249    };
250    for (auto & e : externals) {
251        streamSetInputs.push_back(Binding{b->getStreamSetTy(1, 1), e});
252    }
253    return streamSetInputs;
254}
255
256
257ByteBitGrepSignature::ByteBitGrepSignature(RE * prefix, RE * suffix)
258: mPrefixRE(prefix)
259, mSuffixRE(suffix)
260, mSignature(Printer_RE::PrintRE(mPrefixRE) + Printer_RE::PrintRE(mSuffixRE) ) {
261   
262}
263
264ByteBitGrepKernel::ByteBitGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & b, RE * const prefixRE, RE * const suffixRE, std::vector<std::string> externals)
265: ByteBitGrepSignature(prefixRE, suffixRE)
266, PabloKernel(b, "bBc" + sha1sum(mSignature),
267              // inputs
268              byteBitGrepInputs(b, externals),
269              // output
270{Binding{b->getStreamSetTy(1, 1), "matches", FixedRate(), Add1()}})
271, mExternals(externals) {
272}
273
274std::string ByteBitGrepKernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> &) {
275    return mSignature;
276}
277
278
279void ByteBitGrepKernel::generatePabloMethod() {
280    PabloBuilder pb(getEntryScope());
281    PabloAST * u8bytes = pb.createExtract(getInput(0), pb.getInteger(0));
282    cc::Direct_CC_Compiler dcc(getEntryScope(), u8bytes);
283    RE_Compiler re_byte_compiler(getEntryScope(), dcc);
284    for (auto & e : mExternals) {
285        re_byte_compiler.addPrecompiled(e, pb.createExtract(getInputStreamVar(e), pb.getInteger(0)));
286    }
287    PabloAST * const prefixMatches = re_byte_compiler.compile(mPrefixRE);
288   
289    PabloBlock * scope1 = getEntryScope()->createScope();
290    pb.createIf(prefixMatches, scope1);
291   
292    PabloAST * nybbles[2];
293    nybbles[0] = scope1->createPackL(scope1->getInteger(8), u8bytes);
294    nybbles[1] = scope1->createPackH(scope1->getInteger(8), u8bytes);
295   
296    PabloAST * bitpairs[4];
297    for (unsigned i = 0; i < 2; i++) {
298        bitpairs[2*i] = scope1->createPackL(scope1->getInteger(4), nybbles[i]);
299        bitpairs[2*i + 1] = scope1->createPackH(scope1->getInteger(4), nybbles[i]);
300    }
301   
302    std::vector<PabloAST *> basis(8);
303    for (unsigned i = 0; i < 4; i++) {
304        basis[7-2*i] = scope1->createPackL(scope1->getInteger(2), bitpairs[i]);
305        basis[7-(2*i + 1)] = scope1->createPackH(scope1->getInteger(2), bitpairs[i]);
306    }
307   
308    cc::Parabix_CC_Compiler ccc(scope1, basis);
309    RE_Compiler re_compiler(scope1, ccc);
310    PabloAST * const matches = re_compiler.compile(mSuffixRE, prefixMatches);
311    Var * const output = getOutputStreamVar("matches");
312    pb.createAssign(pb.createExtract(output, pb.getInteger(0)), matches);
313}
314
315
316
317
318void MatchedLinesKernel::generatePabloMethod() {
319    PabloBuilder pb(getEntryScope());
320    PabloAST * matchResults = pb.createExtract(getInputStreamVar("matchResults"), pb.getInteger(0));
321    PabloAST * lineBreaks = pb.createExtract(getInputStreamVar("lineBreaks"), pb.getInteger(0));
322    PabloAST * notLB = pb.createNot(lineBreaks);
323    PabloAST * match_follow = pb.createMatchStar(matchResults, notLB);
324    PabloAST * unterminatedLineAtEOF = pb.createAtEOF(pb.createAdvance(notLB, 1), "unterminatedLineAtEOF");
325    Var * const matchedLines = getOutputStreamVar("matchedLines");
326    pb.createAssign(pb.createExtract(matchedLines, pb.getInteger(0)), pb.createAnd(match_follow, pb.createOr(lineBreaks, unterminatedLineAtEOF)));
327}
328
329MatchedLinesKernel::MatchedLinesKernel (const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
330: PabloKernel(iBuilder, "MatchedLines",
331// inputs
332{Binding{iBuilder->getStreamSetTy(1), "matchResults"}
333,Binding{iBuilder->getStreamSetTy(1), "lineBreaks"}},
334// output
335{Binding{iBuilder->getStreamSetTy(1), "matchedLines", FixedRate(), Add1()}}) {
336
337}
338
339
340void InvertMatchesKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
341    Value * input = iBuilder->loadInputStreamBlock("matchedLines", iBuilder->getInt32(0));
342    Value * lbs = iBuilder->loadInputStreamBlock("lineBreaks", iBuilder->getInt32(0));
343    Value * inverted = iBuilder->CreateXor(input, lbs);
344    iBuilder->storeOutputStreamBlock("nonMatches", iBuilder->getInt32(0), inverted);
345}
346
347InvertMatchesKernel::InvertMatchesKernel(const std::unique_ptr<kernel::KernelBuilder> & builder)
348: BlockOrientedKernel("Invert",
349// Inputs
350{Binding{builder->getStreamSetTy(1, 1), "matchedLines"}, Binding{builder->getStreamSetTy(1, 1), "lineBreaks"}},
351// Outputs
352{Binding{builder->getStreamSetTy(1, 1), "nonMatches"}},
353// Input/Output Scalars and internal state
354{}, {}, {}) {
355
356}
357
358
359void PopcountKernel::generatePabloMethod() {
360    auto pb = this->getEntryScope();
361    const auto toCount = pb->createExtract(getInputStreamVar("toCount"), pb->getInteger(0));
362    pablo::Var * countResult = getOutputScalarVar("countResult");
363    pb->createAssign(countResult, pb->createCount(toCount));
364}
365
366PopcountKernel::PopcountKernel (const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
367: PabloKernel(iBuilder, "Popcount",
368{Binding{iBuilder->getStreamSetTy(1), "toCount"}},
369{},
370{},
371{Binding{iBuilder->getSizeTy(), "countResult"}}) {
372
373}
Note: See TracBrowser for help on using the repository browser.