source: icGREP/icgrep-devel/icgrep/lzparabix/LZParabixGrepGenerator.cpp @ 6137

Last change on this file since 6137 was 6137, checked in by xwa163, 11 months ago
  1. LZ4 ScanMatch pipeline
  2. Refactor LZ4 Generator
  3. Adjust some naming
File size: 15.5 KB
Line 
1//
2// Created by wxy325 on 2018/6/19.
3//
4
#include "LZParabixGrepGenerator.h"


#include <boost/iostreams/device/mapped_file.hpp>

#include <llvm/Support/PrettyStackTrace.h>

#include <cc/cc_compiler.h>

#include <kernels/cc_kernel.h>
#include <kernels/s2p_kernel.h>
#include <kernels/p2s_kernel.h>
#include <kernels/source_kernel.h>
#include <kernels/stdout_kernel.h>
#include <kernels/kernel_builder.h>
#include <kernels/swizzle.h>
#include <re/re_toolchain.h>

#include <re/collect_ccs.h>
#include <re/replaceCC.h>
#include <re/re_seq.h>
#include <re/re_cc.h>

#include <UCD/resolve_properties.h>
#include <kernels/charclasses.h>
#include <kernels/grep_kernel.h>
#include <kernels/UCD_property_kernel.h>
#include <kernels/grapheme_kernel.h>
#include <kernels/linebreak_kernel.h>
#include <kernels/streams_merge.h>
#include <kernels/scanmatchgen.h>
#include <kernels/until_n.h>
#include <re/casing.h>
#include <re/exclude_CC.h>
#include <re/to_utf8.h>
#include <re/re_analysis.h>
#include <re/re_name_resolve.h>
#include <re/re_name_gather.h>
#include <re/re_multiplex.h>
#include <re/re_utility.h>
#include <re/grapheme_clusters.h>
#include <re/printer_re.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Support/Debug.h>
#include <llvm/Support/ErrorHandling.h>
#include <kernels/fake_stream_generating_kernel.h>
#include <re/re_alt.h>
51
52namespace re { class CC; }
53
54using namespace llvm;
55using namespace parabix;
56using namespace kernel;
57using namespace grep;
58
59
60LZParabixGrepGenerator::LZParabixGrepGenerator(bool enableMultiplexing): LZParabixGenerator(), mEnableMultiplexing(enableMultiplexing) {
61    mGrepRecordBreak = grep::GrepRecordBreakKind::LF;
62    mMoveMatchesToEOL = true;
63}
64
65void LZParabixGrepGenerator::initREs(std::vector<re::RE *> &REs) {
66    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
67        mBreakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
68    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
69        mBreakCC = re::makeByte(0);  // Null
70    } else {
71        mBreakCC = re::makeByte(0x0A); // LF
72    }
73    re::RE * anchorRE = mBreakCC;
74    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
75        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
76        anchorName->setDefinition(re::makeUnicodeBreak());
77        anchorRE = anchorName;
78    }
79
80    mREs = REs;
81    bool allAnchored = true;
82    for(unsigned i = 0; i < mREs.size(); ++i) {
83        if (!hasEndAnchor(mREs[i])) allAnchored = false;
84        mREs[i] = resolveModesAndExternalSymbols(mREs[i]);
85        mREs[i] = re::exclude_CC(mREs[i], mBreakCC);
86        mREs[i] = resolveAnchors(mREs[i], anchorRE);
87        re::gatherUnicodeProperties(mREs[i], mUnicodeProperties);
88        mREs[i] = regular_expression_passes(mREs[i]);
89    }
90    if (allAnchored && (mGrepRecordBreak != GrepRecordBreakKind::Unicode)) mMoveMatchesToEOL = false;
91}
92
93void LZParabixGrepGenerator::generateCountOnlyMainFunc(const std::unique_ptr<kernel::KernelBuilder> &iBuilder) {
94    Module * M = iBuilder->getModule();
95    Type * const int64Ty = iBuilder->getInt64Ty();
96    Type * const sizeTy = iBuilder->getSizeTy();
97    Type * const boolTy = iBuilder->getIntNTy(sizeof(bool) * 8);
98//    Type * const voidTy = iBuilder->getVoidTy();
99    Type * const inputType = iBuilder->getInt8PtrTy();
100
101    Function * const main = cast<Function>(M->getOrInsertFunction("Main", int64Ty, inputType, sizeTy, sizeTy, boolTy, nullptr));
102    main->setCallingConv(CallingConv::C);
103    Function::arg_iterator args = main->arg_begin();
104    mInputStream = &*(args++);
105    mInputStream->setName("input");
106
107    mHeaderSize = &*(args++);
108    mHeaderSize->setName("mHeaderSize");
109
110    mFileSize = &*(args++);
111    mFileSize->setName("mFileSize");
112
113    mHasBlockChecksum = &*(args++);
114    mHasBlockChecksum->setName("mHasBlockChecksum");
115    // TODO for now, we do not handle blockCheckSum
116    mHasBlockChecksum = iBuilder->getInt1(false);
117
118    iBuilder->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", main, 0));
119}
120
121void LZParabixGrepGenerator::generateCountOnlyAioPipeline(re::RE *regex, bool swizzledDecompression ) {
122    auto & iBuilder = mPxDriver.getBuilder();
123    this->generateCountOnlyMainFunc(iBuilder);
124
125    this->generateLoadByteStreamAndBitStream(iBuilder);
126
127
128    StreamSetBuffer * LineBreakStream;
129    StreamSetBuffer * Matches;
130    std::vector<re::RE*> res = {regex};
131    if (mEnableMultiplexing) {
132        std::tie(LineBreakStream, Matches) = multiplexingGrepPipeline(res);
133    } else {
134        std::tie(LineBreakStream, Matches) = grepPipeline(res, swizzledDecompression);
135    }
136
137
138    kernel::Kernel * matchCountK = mPxDriver.addKernelInstance<kernel::PopcountKernel>(iBuilder);
139    mPxDriver.makeKernelCall(matchCountK, {Matches}, {});
140    mPxDriver.generatePipelineIR();
141
142    iBuilder->setKernel(matchCountK);
143    Value * matchedLineCount = iBuilder->getAccumulator("countResult");
144    matchedLineCount = iBuilder->CreateZExt(matchedLineCount, iBuilder->getInt64Ty());
145
146    mPxDriver.deallocateBuffers();
147
148    iBuilder->CreateRet(matchedLineCount);
149
150    mPxDriver.finalizeObject();
151}
152
153
154std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *> LZParabixGrepGenerator::multiplexingGrepPipeline(std::vector<re::RE *> &REs) {
155
156    this->initREs(REs);
157    auto mGrepDriver = &mPxDriver;
158
159    auto & idb = mGrepDriver->getBuilder();
160    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
161    const unsigned baseBufferSize = this->getInputBufferBlocks(idb);
162    int MaxCountFlag = 0;
163
164    //  Regular Expression Processing and Analysis Phase
165    const auto nREs = mREs.size();
166
167    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
168
169
170    std::map<std::string, StreamSetBuffer *> propertyStream;
171
172    std::vector<std::string> externalStreamNames;
173    std::set<re::Name *> UnicodeProperties;
174
175    re::CC* linefeedCC = re::makeCC(0x0A);
176
177    re::Seq* seq = re::makeSeq();
178    seq->push_back(mREs[0]);
179    seq->push_back(std::move(linefeedCC));
180
181    const auto UnicodeSets = re::collectCCs(seq, &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
182    StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize, 1);
183
184    this->generateBlockData(idb);
185    StreamSetBuffer * const LiteralBitStream = this->extractLiteralBitStream(idb);
186
187    mpx = make_unique<cc::MultiplexedAlphabet>("mpx", UnicodeSets);
188    mREs[0] = transformCCs(mpx.get(), mREs[0]);
189
190    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
191    auto numOfCharacterClasses = mpx_basis.size();
192//    llvm::outs() << "numOfCharacterClasses:" << numOfCharacterClasses << "\n";
193    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize, 1);
194
195    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), false, cc::BitNumbering::BigEndian);
196    mGrepDriver->makeKernelCall(ccK, {LiteralBitStream}, {CharClasses});
197
198    StreamSetBuffer * newLineBreak = nullptr;
199
200
201    StreamSetBuffer * uncompressedCharClasses = nullptr;
202    StreamSetBuffer * u8NoFinalStream = nullptr;
203    StreamSetBuffer * fakeMatchCopiedBits = nullptr;
204
205
206    bool allCcByteLength = re::isAllCcByteLength(mREs[0]);
207    if (allCcByteLength) {
208        auto ret = this->generateBitStreamDecompression(idb, {CharClasses});
209        uncompressedCharClasses = ret[0];
210        fakeMatchCopiedBits = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(8), this->getInputBufferBlocks(idb), 1);
211        u8NoFinalStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1), this->getInputBufferBlocks(idb), 1);
212
213        Kernel* fakeStreamGeneratorK = mPxDriver.addKernelInstance<FakeStreamGeneratingKernel>(idb, numOfCharacterClasses, std::vector<unsigned>({8, 1}));
214        mPxDriver.makeKernelCall(fakeStreamGeneratorK, {uncompressedCharClasses}, {fakeMatchCopiedBits, u8NoFinalStream});
215    } else {
216        re::RE* nonFinalName = re::makeAlt({re::makeByte(0xC2, 0xF4),
217                                            re::makeSeq({re::makeByte(0xE0, 0xF4), re::makeByte(0x80, 0xBF)}),
218                                            re::makeSeq({re::makeByte(0xF0, 0xF4), re::makeByte(0x80, 0xBF), re::makeByte(0x80, 0xBF)})});
219        StreamSetBuffer * compressedU8NoFinalStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), this->getInputBufferBlocks(idb));
220        kernel::Kernel * u8NoFinalK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, nonFinalName, externalStreamNames, std::vector<cc::Alphabet *>(), cc::BitNumbering::BigEndian);
221        mGrepDriver->makeKernelCall(u8NoFinalK, {LiteralBitStream}, {compressedU8NoFinalStream});
222
223        auto ret = this->generateBitStreamDecompression(idb, {CharClasses, compressedU8NoFinalStream});
224
225        uncompressedCharClasses = ret[0];
226        u8NoFinalStream = ret[1];
227
228        fakeMatchCopiedBits = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(8), this->getInputBufferBlocks(idb), 1);
229        Kernel* fakeStreamGeneratorK = mPxDriver.addKernelInstance<FakeStreamGeneratingKernel>(idb, numOfCharacterClasses, 8);
230        mPxDriver.makeKernelCall(fakeStreamGeneratorK, {uncompressedCharClasses}, {fakeMatchCopiedBits});
231
232    }
233
234
235    newLineBreak = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), this->getInputBufferBlocks(idb));
236    kernel::Kernel * lineFeedGrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, transformCCs(mpx.get(), linefeedCC), externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()}, cc::BitNumbering::BigEndian);
237    mGrepDriver->makeKernelCall(lineFeedGrepK, {fakeMatchCopiedBits, uncompressedCharClasses}, {newLineBreak});
238
239    externalStreamNames.push_back("UTF8_nonfinal");
240
241    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[0], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()}, cc::BitNumbering::BigEndian);
242    mGrepDriver->makeKernelCall(icgrepK, {fakeMatchCopiedBits, u8NoFinalStream, uncompressedCharClasses}, {MatchResults});
243    MatchResultsBufs[0] = MatchResults;
244
245
246    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
247    if (mREs.size() > 1) {
248        MergedResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
249        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
250        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
251    }
252    StreamSetBuffer * Matches = MergedResults;
253    if (mMoveMatchesToEOL) {
254        StreamSetBuffer * OriginalMatches = Matches;
255        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
256        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
257        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, newLineBreak}, {Matches});
258    }
259
260    if (MaxCountFlag > 0) {
261        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
262        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
263        StreamSetBuffer * const AllMatches = Matches;
264        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
265        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
266    }
267
268    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(newLineBreak, Matches);
269};
270
271
272std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *>
273LZParabixGrepGenerator::grepPipeline(std::vector<re::RE *> &REs, bool swizzledDecompression) {
274
275    this->initREs(REs);
276    auto mGrepDriver = &mPxDriver;
277
278    auto & idb = mGrepDriver->getBuilder();
279    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
280    const unsigned baseBufferSize = this->getInputBufferBlocks(idb);
281    int MaxCountFlag = 0;
282
283    //  Regular Expression Processing and Analysis Phase
284    const auto nREs = mREs.size();
285
286    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
287
288
289    this->generateBlockData(idb);
290    StreamSetBuffer * const LiteralBitStream = this->extractLiteralBitStream(idb);
291//    auto compressedLineBreakStream = this->linefeedStreamFromUncompressedBits(LiteralBitStream);
292
293
294    StreamSetBuffer * uncompressedBasisBits = nullptr;
295    if (swizzledDecompression) {
296        uncompressedBasisBits = this->generateSwizzledBitStreamDecompression(idb, LiteralBitStream);
297    } else {
298        auto ret = this->generateBitStreamDecompression(idb, {LiteralBitStream});
299        uncompressedBasisBits = ret[0];
300    }
301
302    StreamSetBuffer * LineBreakStream = this->linefeedStreamFromUncompressedBits(uncompressedBasisBits);
303
304    std::map<std::string, StreamSetBuffer *> propertyStream;
305
306    for(unsigned i = 0; i < nREs; ++i) {
307        std::vector<std::string> externalStreamNames;
308        std::vector<StreamSetBuffer *> icgrepInputSets = {uncompressedBasisBits};
309
310        std::set<re::Name *> UnicodeProperties;
311
312        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
313        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>(), cc::BitNumbering::BigEndian);
314        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
315        MatchResultsBufs[i] = MatchResults;
316    }
317
318    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
319    if (mREs.size() > 1) {
320        MergedResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
321        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
322        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
323    }
324    StreamSetBuffer * Matches = MergedResults;
325    if (mMoveMatchesToEOL) {
326        StreamSetBuffer * OriginalMatches = Matches;
327        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
328        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
329        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
330    }
331
332    if (MaxCountFlag > 0) {
333        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
334        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
335        StreamSetBuffer * const AllMatches = Matches;
336        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
337        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
338    }
339
340    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
341}
342
343parabix::StreamSetBuffer *
344LZParabixGrepGenerator::linefeedStreamFromUncompressedBits(parabix::StreamSetBuffer *uncompressedBasisBits) {
345    auto & idb = mPxDriver.getBuilder();
346    const unsigned baseBufferSize = this->getInputBufferBlocks(idb);
347    StreamSetBuffer * LineFeedStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
348    kernel::Kernel * linefeedK = mPxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()}, cc::BitNumbering::BigEndian);
349    mPxDriver.makeKernelCall(linefeedK, {uncompressedBasisBits}, {LineFeedStream});
350    return LineFeedStream;
351}
352
353CountOnlyGrepMainFunctionType LZParabixGrepGenerator::getCountOnlyGrepMainFunction() {
354    return reinterpret_cast<CountOnlyGrepMainFunctionType>(mPxDriver.getMain());
355}
Note: See TracBrowser for help on using the repository browser.