source: icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp @ 5620

Last change on this file since 5620 was 5585, checked in by xuedongx, 22 months ago

use multiplexed character classes as the input to grep kernel, restructure the icGrep pipeline: Matches = RE_compiler<regexp>(CharacterClasses?, LineBreaks?)

File size: 10.9 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "grep_kernel.h"
7#include <boost/uuid/sha1.hpp>
8#include <re/printer_re.h>
9#include <re/re_toolchain.h>
10#include <pablo/pablo_toolchain.h>
11#include <kernels/kernel_builder.h>
12#include <pablo/builder.hpp>
13#include <pablo/pe_ones.h>          // for Ones
14#include <pablo/pe_var.h>           // for Var
15#include <pablo/pe_zeroes.h>        // for Zeroes
16#include <pablo/boolean.h>
17#include <pablo/pe_count.h>
18#include <pablo/pe_matchstar.h>
19#include "cc/cc_compiler.h"         // for CC_Compiler
20
21#include <llvm/Support/raw_ostream.h>
22
23using namespace kernel;
24using namespace pablo;
25using namespace re;
26using namespace llvm;
27
28inline static std::string sha1sum(const std::string & str) {
29    char buffer[41];    // 40 hex-digits and the terminating null
30    uint32_t digest[5]; // 160 bits in total
31    boost::uuids::detail::sha1 sha1;
32    sha1.process_bytes(str.c_str(), str.size());
33    sha1.get_digest(digest);
34    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
35             digest[0], digest[1], digest[2], digest[3], digest[4]);
36    return std::string(buffer);
37}
38
39RegularExpressionOptimizer::RegularExpressionOptimizer(re::RE * const re_ast)
40: mRE(regular_expression_passes(re_ast))
41, mSignature(Printer_RE::PrintRE(mRE)) {
42
43}
44
45void RequiredStreams_UTF8::generatePabloMethod() {
46   
47    cc::CC_Compiler ccc(this, getInput(0));
48    auto & pb = ccc.getBuilder();
49    Zeroes * const zero = pb.createZeroes();
50    PabloAST * LF = ccc.compileCC("LF", makeCC(0x0A), pb);
51    PabloAST * CR = ccc.compileCC(makeCC(0x0D));
52
53    Var * crlf = pb.createVar("crlf", zero);
54    PabloBuilder crb = PabloBuilder::Create(pb);
55    PabloAST * cr1 = crb.createAdvance(CR, 1, "cr1");
56    crb.createAssign(crlf, crb.createAnd(cr1, LF));
57    pb.createIf(CR, crb);
58   
59    Var * u8invalid = pb.createVar("u8invalid", zero);
60    Var * valid_pfx = pb.createVar("valid_pfx", zero);
61    Var * nonFinal = pb.createVar("nonfinal", zero);
62    PabloAST * u8pfx = ccc.compileCC(makeCC(0xC0, 0xFF));
63   
64    PabloBuilder it = PabloBuilder::Create(pb);
65
66    pb.createIf(u8pfx, it);
67    PabloAST * u8pfx2 = ccc.compileCC(makeCC(0xC2, 0xDF), it);
68    PabloAST * u8pfx3 = ccc.compileCC(makeCC(0xE0, 0xEF), it);
69    PabloAST * u8pfx4 = ccc.compileCC(makeCC(0xF0, 0xF4), it);
70    PabloAST * u8suffix = ccc.compileCC("u8suffix", makeCC(0x80, 0xBF), it);
71   
72    //
73    // Two-byte sequences
74    Var * u8scope22 = it.createVar("u8scope22", zero);
75    PabloBuilder it2 = PabloBuilder::Create(it);
76    it2.createAssign(u8scope22, it2.createAdvance(u8pfx2, 1));
77    it.createIf(u8pfx2, it2);
78    //
79    // Three-byte sequences
80   
81    Var * u8scope32 = it.createVar("u8scope32", zero);
82    Var * u8scope3X = it.createVar("u8scope3X", zero);
83    Var * EX_invalid = it.createVar("EX_invalid", zero);
84    PabloBuilder it3 = PabloBuilder::Create(it);
85    it.createIf(u8pfx3, it3);
86    it3.createAssign(u8scope32, it3.createAdvance(u8pfx3, 1));
87    PabloAST * u8scope33 = it3.createAdvance(u8pfx3, 2);
88    it3.createAssign(u8scope3X, it3.createOr(u8scope32, u8scope33));
89    PabloAST * E0_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeCC(0xE0), it3), 1), ccc.compileCC(makeCC(0x80, 0x9F), it3));
90    PabloAST * ED_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeCC(0xED), it3), 1), ccc.compileCC(makeCC(0xA0, 0xBF), it3));
91    it3.createAssign(EX_invalid, it3.createOr(E0_invalid, ED_invalid));
92   
93    //
94    // Four-byte sequences
95    Var * u8scope4nonfinal = it.createVar("u8scope4nonfinal", zero);
96    Var * u8scope4X = it.createVar("u8scope4X", zero);
97    Var * FX_invalid = it.createVar("FX_invalid", zero);
98    PabloBuilder it4 = PabloBuilder::Create(it);
99    it.createIf(u8pfx4, it4);
100    PabloAST * u8scope42 = it4.createAdvance(u8pfx4, 1, "u8scope42");
101    PabloAST * u8scope43 = it4.createAdvance(u8scope42, 1, "u8scope43");
102    PabloAST * u8scope44 = it4.createAdvance(u8scope43, 1, "u8scope44");
103    it4.createAssign(u8scope4nonfinal, it4.createOr(u8scope42, u8scope43));
104    it4.createAssign(u8scope4X, it4.createOr(u8scope4nonfinal, u8scope44));
105    PabloAST * F0_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeCC(0xF0), it4), 1), ccc.compileCC(makeCC(0x80, 0x8F), it4));
106    PabloAST * F4_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeCC(0xF4), it4), 1), ccc.compileCC(makeCC(0x90, 0xBF), it4));
107    it4.createAssign(FX_invalid, it4.createOr(F0_invalid, F4_invalid));
108   
109    //
110    // Invalid cases
111    PabloAST * anyscope = it.createOr(u8scope22, it.createOr(u8scope3X, u8scope4X));
112    PabloAST * legalpfx = it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4);
113    //  Any scope that does not have a suffix byte, and any suffix byte that is not in
114    //  a scope is a mismatch, i.e., invalid UTF-8.
115    PabloAST * mismatch = it.createXor(anyscope, u8suffix);
116    //
117    PabloAST * EF_invalid = it.createOr(EX_invalid, FX_invalid);
118    PabloAST * pfx_invalid = it.createXor(u8pfx, legalpfx);
119    it.createAssign(u8invalid, it.createOr(pfx_invalid, it.createOr(mismatch, EF_invalid)));
120    PabloAST * u8valid = it.createNot(u8invalid, "u8valid");
121    //
122    //
123   
124    it.createAssign(valid_pfx, it.createAnd(u8pfx, u8valid));
125    it.createAssign(nonFinal, it.createAnd(it.createOr(it.createOr(u8pfx, u8scope32), u8scope4nonfinal), u8valid));
126   
127    PabloAST * u8single = pb.createAnd(ccc.compileCC(makeCC(0x00, 0x7F)), pb.createNot(u8invalid));
128   
129    Var * const required = getOutputStreamVar("required");
130    pb.createAssign(pb.createExtract(required, pb.getInteger(0)), pb.createOr(u8single, valid_pfx, "initial"));
131    pb.createAssign(pb.createExtract(required, pb.getInteger(1)), nonFinal);
132    pb.createAssign(pb.createExtract(required, pb.getInteger(2)), pb.createNot(pb.createOr(nonFinal, u8invalid), "final"));
133    pb.createAssign(pb.createExtract(required, pb.getInteger(3)), crlf);
134}
135
136RequiredStreams_UTF8::RequiredStreams_UTF8(const std::unique_ptr<kernel::KernelBuilder> & kb)
137: PabloKernel(kb, "RequiredStreams_UTF8",               
138              {Binding{kb->getStreamSetTy(8), "basis"}}, 
139              {Binding{kb->getStreamSetTy(4), "required"}},
140              {},
141              {}) {
142}
143
144void RequiredStreams_UTF16::generatePabloMethod() {
145   
146    cc::CC_Compiler ccc(this, getInput(0));
147    auto & pb = ccc.getBuilder();
148   
149    PabloAST * LF = ccc.compileCC("LF", makeCC(0x000A), pb);
150    PabloAST * CR = ccc.compileCC("CR", makeCC(0x000D), pb);
151    PabloAST * cr1 = pb.createAdvance(CR, 1, "cr1");
152   
153    PabloAST * u16hi_hi_surrogate = ccc.compileCC(makeCC(0xD800, 0xDBFF));    //u16hi_hi_surrogate = [\xD8-\xDB]
154    PabloAST * u16hi_lo_surrogate = ccc.compileCC(makeCC(0xDC00, 0xDFFF));    //u16hi_lo_surrogate = [\xDC-\xDF]
155   
156    PabloAST * invalidTemp = pb.createAdvance(u16hi_hi_surrogate, 1, "InvalidTemp");
157    PabloAST * u16invalid = pb.createXor(invalidTemp, u16hi_lo_surrogate, "u16invalid");
158    PabloAST * u16valid = pb.createNot(u16invalid, "u16valid");
159   
160    PabloAST * u16single_temp = pb.createOr(ccc.compileCC(makeCC(0x0000, 0xD7FF)), ccc.compileCC(makeCC(0xE000, 0xFFFF)));
161    PabloAST * u16single = pb.createAnd(u16single_temp, pb.createNot(u16invalid));
162
163    Var * const required = getOutputStreamVar("required");
164    pb.createAssign(pb.createExtract(required, pb.getInteger(0)), pb.createOr(u16single, u16hi_hi_surrogate, "initial"));
165    pb.createAssign(pb.createExtract(required, pb.getInteger(1)), pb.createAnd(u16hi_hi_surrogate, u16valid, "nonfinal"));
166    pb.createAssign(pb.createExtract(required, pb.getInteger(2)), pb.createNot(pb.createOr(u16hi_hi_surrogate, u16invalid), "final"));
167    pb.createAssign(pb.createExtract(required, pb.getInteger(3)), pb.createAnd(cr1, LF, "crlf"));
168}
169
170RequiredStreams_UTF16::RequiredStreams_UTF16(const std::unique_ptr<kernel::KernelBuilder> & kb)
171: PabloKernel(kb, "RequiredStreams_UTF16",               
172              {Binding{kb->getStreamSetTy(16), "basis"}}, 
173              {Binding{kb->getStreamSetTy(4), "required"}},
174              {},
175              {}) {
176}
177
178
179ICGrepKernel::ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, RE * const re, bool cc, unsigned cc_size)
180: RegularExpressionOptimizer(re)
181, PabloKernel(iBuilder,
182              "ic" + sha1sum(mSignature),
183              {Binding{iBuilder->getStreamSetTy(cc ? cc_size : 8), "basis"}, Binding{iBuilder->getStreamSetTy(1, 1), "linebreak"}, Binding{iBuilder->getStreamSetTy(4, 1), "required"}},
184              {Binding{iBuilder->getStreamSetTy(1, 1), "matches"}}) {
185
186}
187
188std::string ICGrepKernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> &) {
189    return mSignature;
190}
191
192void ICGrepKernel::generatePabloMethod() {
193    re2pablo_compiler(this, mRE);
194}
195
196void MatchedLinesKernel::generatePabloMethod() {
197    auto pb = this->getEntryBlock();
198    PabloAST * matchResults = pb->createExtract(getInputStreamVar("matchResults"), pb->getInteger(0));
199    PabloAST * lineBreaks = pb->createExtract(getInputStreamVar("lineBreaks"), pb->getInteger(0));
200    PabloAST * notLB = pb->createNot(lineBreaks);
201    PabloAST * match_follow = pb->createMatchStar(matchResults, notLB);
202    Var * const matchedLines = getOutputStreamVar("matchedLines");
203    pb->createAssign(pb->createExtract(matchedLines, pb->getInteger(0)), pb->createAnd(match_follow, lineBreaks));
204}
205
206MatchedLinesKernel::MatchedLinesKernel (const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
207: PabloKernel(iBuilder, "MatchedLines",
208              {Binding{iBuilder->getStreamSetTy(1), "matchResults"}, Binding{iBuilder->getStreamSetTy(1), "lineBreaks"}},
209              {Binding{iBuilder->getStreamSetTy(1), "matchedLines"}},
210              {},
211              {}) {
212}
213
214
215void InvertMatchesKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
216    Value * input = iBuilder->loadInputStreamBlock("matchedLines", iBuilder->getInt32(0));
217    Value * lbs = iBuilder->loadInputStreamBlock("lineBreaks", iBuilder->getInt32(0));
218    Value * inverted = iBuilder->CreateXor(input, lbs);
219    iBuilder->storeOutputStreamBlock("nonMatches", iBuilder->getInt32(0), inverted);
220}
221
222InvertMatchesKernel::InvertMatchesKernel(const std::unique_ptr<kernel::KernelBuilder> & builder)
223: BlockOrientedKernel("Invert", {Binding{builder->getStreamSetTy(1, 1), "matchedLines"}, Binding{builder->getStreamSetTy(1, 1), "lineBreaks"}}, {Binding{builder->getStreamSetTy(1, 1), "nonMatches"}}, {}, {}, {}) {
224    setNoTerminateAttribute(true);   
225}
226
227
228void PopcountKernel::generatePabloMethod() {
229    auto pb = this->getEntryBlock();
230    const auto toCount = pb->createExtract(getInputStreamVar("toCount"), pb->getInteger(0));
231    pablo::Var * countResult = getOutputScalarVar("countResult");
232    pb->createAssign(countResult, pb->createCount(toCount));
233}
234
235PopcountKernel::PopcountKernel (const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
236: PabloKernel(iBuilder, "Popcount",
237              {Binding{iBuilder->getStreamSetTy(1), "toCount"}},
238              {},
239              {},
240              {Binding{iBuilder->getSizeTy(), "countResult"}}) {
241}
242
Note: See TracBrowser for help on using the repository browser.