source: icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp @ 5863

Last change on this file since 5863 was 5863, checked in by cameron, 15 months ago

Eliminate mInitial - only 1 required UTF-8 stream

File size: 11.2 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "grep_kernel.h"
7#include <boost/uuid/sha1.hpp>
8#include <re/printer_re.h>
9#include <re/re_toolchain.h>
10#include <re/re_reverse.h>
11#include <pablo/pablo_toolchain.h>
12#include <kernels/kernel_builder.h>
13#include <pablo/builder.hpp>
14#include <pablo/pe_ones.h>          // for Ones
15#include <pablo/pe_var.h>           // for Var
16#include <pablo/pe_zeroes.h>        // for Zeroes
17#include <pablo/boolean.h>
18#include <pablo/pe_count.h>
19#include <pablo/pe_matchstar.h>
20#include <cc/cc_compiler.h>         // for CC_Compiler
21#include <cc/alphabet.h>
22#include <cc/multiplex_CCs.h>
23#include <re/re_compiler.h>
24#include <llvm/Support/raw_ostream.h>
25
26using namespace kernel;
27using namespace pablo;
28using namespace re;
29using namespace llvm;
30
31inline static std::string sha1sum(const std::string & str) {
32    char buffer[41];    // 40 hex-digits and the terminating null
33    uint32_t digest[5]; // 160 bits in total
34    boost::uuids::detail::sha1 sha1;
35    sha1.process_bytes(str.c_str(), str.size());
36    sha1.get_digest(digest);
37    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
38             digest[0], digest[1], digest[2], digest[3], digest[4]);
39    return std::string(buffer);
40}
41
42void RequiredStreams_UTF8::generatePabloMethod() {
43    PabloBuilder pb(getEntryScope());
44    cc::Parabix_CC_Compiler ccc(this, getInputStreamSet("basis"));
45    Zeroes * const ZEROES = pb.createZeroes();
46    PabloAST * const u8pfx = ccc.compileCC(makeByte(0xC0, 0xFF));
47
48
49    Var * const nonFinal = pb.createVar("nonFinal", u8pfx);
50    Var * const u8invalid = pb.createVar("u8invalid", ZEROES);
51    Var * const valid_pfx = pb.createVar("valid_pfx", u8pfx);
52
53    auto it = pb.createScope();
54    pb.createIf(u8pfx, it);
55    PabloAST * const u8pfx2 = ccc.compileCC(makeByte(0xC2, 0xDF), it);
56    PabloAST * const u8pfx3 = ccc.compileCC(makeByte(0xE0, 0xEF), it);
57    PabloAST * const u8pfx4 = ccc.compileCC(makeByte(0xF0, 0xF4), it);
58    PabloAST * const u8suffix = ccc.compileCC("u8suffix", makeByte(0x80, 0xBF), it);
59   
60    //
61    // Two-byte sequences
62    Var * const anyscope = it.createVar("anyscope", ZEROES);
63    auto it2 = it.createScope();
64    it.createIf(u8pfx2, it2);
65    it2.createAssign(anyscope, it2.createAdvance(u8pfx2, 1));
66
67    //
68    // Three-byte sequences   
69    Var * const EF_invalid = it.createVar("EF_invalid", ZEROES);
70    auto it3 = it.createScope();
71    it.createIf(u8pfx3, it3);
72    PabloAST * const u8scope32 = it3.createAdvance(u8pfx3, 1);
73    it3.createAssign(nonFinal, it3.createOr(nonFinal, u8scope32));
74    PabloAST * const u8scope33 = it3.createAdvance(u8pfx3, 2);
75    PabloAST * const u8scope3X = it3.createOr(u8scope32, u8scope33);
76    it3.createAssign(anyscope, it3.createOr(anyscope, u8scope3X));
77    PabloAST * const E0_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xE0), it3), 1), ccc.compileCC(makeByte(0x80, 0x9F), it3));
78    PabloAST * const ED_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xED), it3), 1), ccc.compileCC(makeByte(0xA0, 0xBF), it3));
79    PabloAST * const EX_invalid = it3.createOr(E0_invalid, ED_invalid);
80    it3.createAssign(EF_invalid, EX_invalid);
81
82
83    //
84    // Four-byte sequences
85    auto it4 = it.createScope();
86    it.createIf(u8pfx4, it4);
87    PabloAST * const u8scope42 = it4.createAdvance(u8pfx4, 1, "u8scope42");
88    PabloAST * const u8scope43 = it4.createAdvance(u8scope42, 1, "u8scope43");
89    PabloAST * const u8scope44 = it4.createAdvance(u8scope43, 1, "u8scope44");
90    PabloAST * const u8scope4nonfinal = it4.createOr(u8scope42, u8scope43);
91    it4.createAssign(nonFinal, it4.createOr(nonFinal, u8scope4nonfinal));
92    PabloAST * const u8scope4X = it4.createOr(u8scope4nonfinal, u8scope44);
93    it4.createAssign(anyscope, it4.createOr(anyscope, u8scope4X));
94    PabloAST * const F0_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeByte(0xF0), it4), 1), ccc.compileCC(makeByte(0x80, 0x8F), it4));
95    PabloAST * const F4_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeByte(0xF4), it4), 1), ccc.compileCC(makeByte(0x90, 0xBF), it4));
96    PabloAST * const FX_invalid = it4.createOr(F0_invalid, F4_invalid);
97    it4.createAssign(EF_invalid, it4.createOr(EF_invalid, FX_invalid));
98   
99    //
100    // Invalid cases
101    PabloAST * const legalpfx = it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4);
102    //  Any scope that does not have a suffix byte, and any suffix byte that is not in
103    //  a scope is a mismatch, i.e., invalid UTF-8.
104    PabloAST * const mismatch = it.createXor(anyscope, u8suffix);
105    //
106    PabloAST * const pfx_invalid = it.createXor(valid_pfx, legalpfx);
107    it.createAssign(u8invalid, it.createOr(pfx_invalid, it.createOr(mismatch, EF_invalid)));
108    PabloAST * const u8valid = it.createNot(u8invalid, "u8valid");
109    //
110    //
111    it.createAssign(nonFinal, it.createAnd(nonFinal, u8valid));
112
113    Var * const required = getOutputStreamVar("required");
114    pb.createAssign(pb.createExtract(required, pb.getInteger(0)), nonFinal);
115}
116
117RequiredStreams_UTF8::RequiredStreams_UTF8(const std::unique_ptr<kernel::KernelBuilder> & kb)
118: PabloKernel(kb, "RequiredStreams_UTF8",
119// input
120{Binding{kb->getStreamSetTy(8), "basis"}},
121// output
122{Binding{kb->getStreamSetTy(1), "required", FixedRate()}}) {
123
124}
125
126void RequiredStreams_UTF16::generatePabloMethod() {
127    PabloBuilder pb(getEntryScope());
128    cc::Parabix_CC_Compiler ccc(this, getInputStreamSet("basis"));
129   
130    PabloAST * u16hi_hi_surrogate = ccc.compileCC(makeCC(0xD800, 0xDBFF, &cc::UTF16));    //u16hi_hi_surrogate = [\xD8-\xDB]
131    PabloAST * u16hi_lo_surrogate = ccc.compileCC(makeCC(0xDC00, 0xDFFF, &cc::UTF16));    //u16hi_lo_surrogate = [\xDC-\xDF]
132   
133    PabloAST * invalidTemp = pb.createAdvance(u16hi_hi_surrogate, 1, "InvalidTemp");
134    PabloAST * u16invalid = pb.createXor(invalidTemp, u16hi_lo_surrogate, "u16invalid");
135
136    PabloAST * u16valid = pb.createNot(u16invalid, "u16valid");
137    PabloAST * nonFinal = pb.createAnd(u16hi_hi_surrogate, u16valid, "nonfinal");
138
139    PabloAST * u16single_temp = pb.createOr(ccc.compileCC(makeCC(0x0000, 0xD7FF, &cc::UTF16)), ccc.compileCC(makeCC(0xE000, 0xFFFF, &cc::UTF16)));
140    PabloAST * u16single = pb.createAnd(u16single_temp, pb.createNot(u16invalid));
141
142    PabloAST * const nonFinalCodeUnits = pb.createExtract(getInput(1), pb.getInteger(0));
143    PabloAST * const initial = pb.createOr(u16single, u16hi_hi_surrogate, "initial");
144    PabloAST * const final = pb.createNot(pb.createOr(pb.createOr(u16hi_hi_surrogate, u16invalid), nonFinalCodeUnits), "final");
145
146    Var * const required = getOutputStreamVar("required");
147    pb.createAssign(pb.createExtract(required, pb.getInteger(0)), initial);
148    pb.createAssign(pb.createExtract(required, pb.getInteger(1)), nonFinal);
149    pb.createAssign(pb.createExtract(required, pb.getInteger(2)), final);
150
151}
152
153RequiredStreams_UTF16::RequiredStreams_UTF16(const std::unique_ptr<kernel::KernelBuilder> & kb)
154: PabloKernel(kb, "RequiredStreams_UTF16",               
155// inputs
156{Binding{kb->getStreamSetTy(8), "basis"}},
157// output
158{Binding{kb->getStreamSetTy(3), "required", FixedRate(), Add1()}}) {
159
160}
161
162ICGrepSignature::ICGrepSignature(re::RE * const re_ast)
163: mRE(re_ast)
164, mSignature(Printer_RE::PrintRE(mRE)) {
165   
166}
167
168// Helper to compute stream set inputs to pass into PabloKernel constructor.
169inline std::vector<Binding> icGrepInputs(const std::unique_ptr<kernel::KernelBuilder> & b,
170                                         const std::vector<cc::Alphabet *> & alphabets) {
171    std::vector<Binding> streamSetInputs = {
172        Binding{b->getStreamSetTy(8), "basis"},
173        Binding{b->getStreamSetTy(1, 1), "linebreak"},
174        Binding{b->getStreamSetTy(1, 1), "cr+lf"},
175        Binding{b->getStreamSetTy(1, 1), "required"}
176    };
177    for (const auto & alphabet : alphabets) {
178        unsigned basis_size = cast<cc::MultiplexedAlphabet>(alphabet)->getMultiplexedCCs().size();
179        streamSetInputs.push_back(Binding{b->getStreamSetTy(basis_size, 1), alphabet->getName() + "_basis"});
180    }
181    return streamSetInputs;
182}
183
184ICGrepKernel::ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & b, RE * const re, std::vector<cc::Alphabet *> alphabets)
185: ICGrepSignature(re)
186, PabloKernel(b, "ic" + sha1sum(mSignature),
187// inputs
188icGrepInputs(b, alphabets),
189// output
190{Binding{b->getStreamSetTy(1, 1), "matches", FixedRate(), Add1()}})
191, mAlphabets(alphabets) {
192
193}
194
195std::string ICGrepKernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> &) {
196    return mSignature;
197}
198
199void ICGrepKernel::generatePabloMethod() {
200    PabloBuilder pb(getEntryScope());
201    cc::Parabix_CC_Compiler ccc(this, getInputStreamSet("basis"));
202    RE_Compiler re_compiler(this, ccc);
203    for (auto a : mAlphabets) {
204        auto mpx_basis = getInputStreamSet(a->getName() + "_basis");
205        re_compiler.addAlphabet(a, mpx_basis);
206    }
207    PabloAST * const matches = re_compiler.compile(mRE);
208    Var * const output = getOutputStreamVar("matches");
209    pb.createAssign(pb.createExtract(output, pb.getInteger(0)), matches);
210}
211
212void MatchedLinesKernel::generatePabloMethod() {
213    PabloBuilder pb(getEntryScope());
214    PabloAST * matchResults = pb.createExtract(getInputStreamVar("matchResults"), pb.getInteger(0));
215    PabloAST * lineBreaks = pb.createExtract(getInputStreamVar("lineBreaks"), pb.getInteger(0));
216    PabloAST * notLB = pb.createNot(lineBreaks);
217    PabloAST * match_follow = pb.createMatchStar(matchResults, notLB);
218    Var * const matchedLines = getOutputStreamVar("matchedLines");
219    pb.createAssign(pb.createExtract(matchedLines, pb.getInteger(0)), pb.createAnd(match_follow, lineBreaks));
220}
221
222MatchedLinesKernel::MatchedLinesKernel (const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
223: PabloKernel(iBuilder, "MatchedLines",
224// inputs
225{Binding{iBuilder->getStreamSetTy(1), "matchResults"}
226,Binding{iBuilder->getStreamSetTy(1), "lineBreaks"}},
227// output
228{Binding{iBuilder->getStreamSetTy(1), "matchedLines"}}) {
229
230}
231
232
233void InvertMatchesKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
234    Value * input = iBuilder->loadInputStreamBlock("matchedLines", iBuilder->getInt32(0));
235    Value * lbs = iBuilder->loadInputStreamBlock("lineBreaks", iBuilder->getInt32(0));
236    Value * inverted = iBuilder->CreateXor(input, lbs);
237    iBuilder->storeOutputStreamBlock("nonMatches", iBuilder->getInt32(0), inverted);
238}
239
240InvertMatchesKernel::InvertMatchesKernel(const std::unique_ptr<kernel::KernelBuilder> & builder)
241: BlockOrientedKernel("Invert",
242// Inputs
243{Binding{builder->getStreamSetTy(1, 1), "matchedLines"}, Binding{builder->getStreamSetTy(1, 1), "lineBreaks"}},
244// Outputs
245{Binding{builder->getStreamSetTy(1, 1), "nonMatches"}},
246// Input/Output Scalars and internal state
247{}, {}, {}) {
248
249}
250
251
252void PopcountKernel::generatePabloMethod() {
253    auto pb = this->getEntryScope();
254    const auto toCount = pb->createExtract(getInputStreamVar("toCount"), pb->getInteger(0));
255    pablo::Var * countResult = getOutputScalarVar("countResult");
256    pb->createAssign(countResult, pb->createCount(toCount));
257}
258
259PopcountKernel::PopcountKernel (const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
260: PabloKernel(iBuilder, "Popcount",
261{Binding{iBuilder->getStreamSetTy(1), "toCount"}},
262{},
263{},
264{Binding{iBuilder->getSizeTy(), "countResult"}}) {
265
266}
Note: See TracBrowser for help on using the repository browser.