source: icGREP/icgrep-devel/icgrep/lzparabix/LZParabixGrepGenerator.cpp @ 6119

Last change on this file since 6119 was 6119, checked in by xwa163, 10 months ago
  1. Add some BasisSetNumbering options to fix a bug in multiplexing
  2. Use BigEndian BitNumbering for the lz4 and lzparabix related pipelines
  3. Support multiplexing in LZ4BitStreamAio pipeline
File size: 13.4 KB
Line 
1//
2// Created by wxy325 on 2018/6/19.
3//
4
5#include "LZParabixGrepGenerator.h"
6
7
8#include <boost/iostreams/device/mapped_file.hpp>
9
10#include <llvm/Support/PrettyStackTrace.h>
11
12#include <cc/cc_compiler.h>
13
14#include <kernels/cc_kernel.h>
15#include <kernels/s2p_kernel.h>
16#include <kernels/p2s_kernel.h>
17#include <kernels/source_kernel.h>
18#include <kernels/stdout_kernel.h>
19#include <kernels/kernel_builder.h>
20#include <kernels/swizzle.h>
21#include <re/re_toolchain.h>
22
23#include <re/collect_ccs.h>
24#include <re/replaceCC.h>
25
26#include <UCD/resolve_properties.h>
27#include <kernels/charclasses.h>
28#include <kernels/grep_kernel.h>
29#include <kernels/UCD_property_kernel.h>
30#include <kernels/grapheme_kernel.h>
31#include <kernels/linebreak_kernel.h>
32#include <kernels/streams_merge.h>
33#include <kernels/scanmatchgen.h>
34#include <kernels/until_n.h>
35#include <re/casing.h>
36#include <re/exclude_CC.h>
37#include <re/to_utf8.h>
38#include <re/re_analysis.h>
39#include <re/re_name_resolve.h>
40#include <re/re_name_gather.h>
41#include <re/re_multiplex.h>
42#include <re/re_utility.h>
43#include <re/grapheme_clusters.h>
44#include <re/printer_re.h>
45#include <llvm/Support/raw_ostream.h>
46#include <llvm/Support/Debug.h>
47#include <kernels/fake_stream_generating_kernel.h>
48
49namespace re { class CC; }
50
51using namespace llvm;
52using namespace parabix;
53using namespace kernel;
54using namespace grep;
55
56
57LZParabixGrepGenerator::LZParabixGrepGenerator(bool enableMultiplexing): LZParabixGenerator(), mEnableMultiplexing(enableMultiplexing) {
58    mGrepRecordBreak = grep::GrepRecordBreakKind::LF;
59    mMoveMatchesToEOL = true;
60}
61
62void LZParabixGrepGenerator::initREs(std::vector<re::RE *> &REs) {
63    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
64        mBreakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
65    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
66        mBreakCC = re::makeByte(0);  // Null
67    } else {
68        mBreakCC = re::makeByte(0x0A); // LF
69    }
70    re::RE * anchorRE = mBreakCC;
71    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
72        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
73        anchorName->setDefinition(re::makeUnicodeBreak());
74        anchorRE = anchorName;
75    }
76
77    mREs = REs;
78    bool allAnchored = true;
79    for(unsigned i = 0; i < mREs.size(); ++i) {
80        if (!hasEndAnchor(mREs[i])) allAnchored = false;
81        mREs[i] = resolveModesAndExternalSymbols(mREs[i]);
82        mREs[i] = re::exclude_CC(mREs[i], mBreakCC);
83        mREs[i] = resolveAnchors(mREs[i], anchorRE);
84        re::gatherUnicodeProperties(mREs[i], mUnicodeProperties);
85        mREs[i] = regular_expression_passes(mREs[i]);
86    }
87    if (allAnchored && (mGrepRecordBreak != GrepRecordBreakKind::Unicode)) mMoveMatchesToEOL = false;
88}
89
90void LZParabixGrepGenerator::generateCountOnlyMainFunc(const std::unique_ptr<kernel::KernelBuilder> &iBuilder) {
91    Module * M = iBuilder->getModule();
92    Type * const int64Ty = iBuilder->getInt64Ty();
93    Type * const sizeTy = iBuilder->getSizeTy();
94    Type * const boolTy = iBuilder->getIntNTy(sizeof(bool) * 8);
95//    Type * const voidTy = iBuilder->getVoidTy();
96    Type * const inputType = iBuilder->getInt8PtrTy();
97
98    Function * const main = cast<Function>(M->getOrInsertFunction("Main", int64Ty, inputType, sizeTy, sizeTy, boolTy, nullptr));
99    main->setCallingConv(CallingConv::C);
100    Function::arg_iterator args = main->arg_begin();
101    mInputStream = &*(args++);
102    mInputStream->setName("input");
103
104    mHeaderSize = &*(args++);
105    mHeaderSize->setName("mHeaderSize");
106
107    mFileSize = &*(args++);
108    mFileSize->setName("mFileSize");
109
110    mHasBlockChecksum = &*(args++);
111    mHasBlockChecksum->setName("mHasBlockChecksum");
112    // TODO for now, we do not handle blockCheckSum
113    mHasBlockChecksum = iBuilder->getInt1(false);
114
115    iBuilder->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", main, 0));
116}
117
118void LZParabixGrepGenerator::generateCountOnlyAioPipeline(re::RE *regex) {
119    auto & iBuilder = mPxDriver.getBuilder();
120    this->generateCountOnlyMainFunc(iBuilder);
121
122    this->generateLoadByteStreamAndBitStream(iBuilder);
123
124
125    StreamSetBuffer * LineBreakStream;
126    StreamSetBuffer * Matches;
127    std::vector<re::RE*> res = {regex};
128    if (mEnableMultiplexing) {
129        std::tie(LineBreakStream, Matches) = multiplexingGrepPipeline(res);
130    } else {
131        std::tie(LineBreakStream, Matches) = grepPipeline(res);
132    }
133
134//    Kernel * outK = mPxDriver.addKernelInstance<FileSink>(iBuilder, 8);
135//    outK->setInitialArguments({iBuilder->GetString("/Users/wxy325/developer/LZ4-sample-files/workspace/lz4d-normal/8k_.txt")});
136//    mPxDriver.makeKernelCall(outK, {decompressedStream}, {});
137
138    kernel::Kernel * matchCountK = mPxDriver.addKernelInstance<kernel::PopcountKernel>(iBuilder);
139    mPxDriver.makeKernelCall(matchCountK, {Matches}, {});
140    mPxDriver.generatePipelineIR();
141
142    iBuilder->setKernel(matchCountK);
143    Value * matchedLineCount = iBuilder->getAccumulator("countResult");
144    matchedLineCount = iBuilder->CreateZExt(matchedLineCount, iBuilder->getInt64Ty());
145
146    mPxDriver.deallocateBuffers();
147
148    iBuilder->CreateRet(matchedLineCount);
149
150    mPxDriver.finalizeObject();
151}
152
153
154std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *> LZParabixGrepGenerator::multiplexingGrepPipeline(std::vector<re::RE *> &REs) {
155
156    this->initREs(REs);
157    auto mGrepDriver = &mPxDriver;
158
159    auto & idb = mGrepDriver->getBuilder();
160    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
161    const unsigned baseBufferSize = this->getInputBufferBlocks(idb);
162    int MaxCountFlag = 0;
163
164    //  Regular Expression Processing and Analysis Phase
165    const auto nREs = mREs.size();
166
167    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
168
169
170    std::map<std::string, StreamSetBuffer *> propertyStream;
171
172    std::vector<std::string> externalStreamNames;
173    std::set<re::Name *> UnicodeProperties;
174
175    const auto UnicodeSets = re::collectCCs(mREs[0], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
176    StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
177
178    mpx = make_unique<cc::MultiplexedAlphabet>("mpx", UnicodeSets);
179    mREs[0] = transformCCs(mpx.get(), mREs[0]);
180    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
181    auto numOfCharacterClasses = mpx_basis.size();
182    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
183
184    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), false, cc::BitNumbering::BigEndian);
185    mGrepDriver->makeKernelCall(ccK, {mCompressedBasisBits}, {CharClasses});
186
187    StreamSetBuffer * CompressedLineFeedStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
188    kernel::Kernel * linefeedK = mPxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()}, cc::BitNumbering::BigEndian);
189    mPxDriver.makeKernelCall(linefeedK, {mCompressedBasisBits}, {CompressedLineFeedStream});
190
191    auto ret = this->generateAioBitStreamDecompressoin(idb, {CharClasses, CompressedLineFeedStream});
192
193    StreamSetBuffer * decompressedCharClasses = ret[0];
194    StreamSetBuffer * LineBreakStream = ret[1];
195
196
197    StreamSetBuffer * fakeMatchCopiedBits = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(8), this->getInputBufferBlocks(idb));
198    Kernel* fakeStreamGeneratorK = mPxDriver.addKernelInstance<FakeStreamGeneratingKernel>(idb, numOfCharacterClasses, 8);
199    mPxDriver.makeKernelCall(fakeStreamGeneratorK, {decompressedCharClasses}, {fakeMatchCopiedBits});
200
201    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[0], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()}, cc::BitNumbering::BigEndian);
202    mGrepDriver->makeKernelCall(icgrepK, {fakeMatchCopiedBits, decompressedCharClasses}, {MatchResults});
203    MatchResultsBufs[0] = MatchResults;
204
205    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
206    if (mREs.size() > 1) {
207        MergedResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
208        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
209        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
210    }
211    StreamSetBuffer * Matches = MergedResults;
212    if (mMoveMatchesToEOL) {
213        StreamSetBuffer * OriginalMatches = Matches;
214        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
215        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
216        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
217    }
218
219    if (MaxCountFlag > 0) {
220        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
221        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
222        StreamSetBuffer * const AllMatches = Matches;
223        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
224        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
225    }
226
227    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
228};
229
230
231std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *>
232LZParabixGrepGenerator::grepPipeline(std::vector<re::RE *> &REs) {
233
234    this->initREs(REs);
235    auto mGrepDriver = &mPxDriver;
236
237    auto & idb = mGrepDriver->getBuilder();
238    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
239    const unsigned baseBufferSize = this->getInputBufferBlocks(idb);
240    int MaxCountFlag = 0;
241
242    //  Regular Expression Processing and Analysis Phase
243    const auto nREs = mREs.size();
244
245    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
246
247    StreamSetBuffer * CompressedLineBreakStream = this->linefeedStreamFromDecompressedBits(mCompressedBasisBits);
248    auto ret = this->generateAioBitStreamDecompressoin(idb, {mCompressedBasisBits, CompressedLineBreakStream});
249    StreamSetBuffer * decompressedBasisBits = ret[0];
250    StreamSetBuffer * LineBreakStream = ret[1];
251
252//    StreamSetBuffer * decompressedBasisBits = this->generateAioBitStreamDecompressoin(idb, {mCompressedBasisBits})[0];
253//    StreamSetBuffer * LineBreakStream = this->linefeedStreamFromDecompressedBits(decompressedBasisBits);
254
255    std::map<std::string, StreamSetBuffer *> propertyStream;
256
257    for(unsigned i = 0; i < nREs; ++i) {
258        std::vector<std::string> externalStreamNames;
259        std::vector<StreamSetBuffer *> icgrepInputSets = {decompressedBasisBits};
260
261        std::set<re::Name *> UnicodeProperties;
262
263        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
264        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>(), cc::BitNumbering::BigEndian);
265        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
266        MatchResultsBufs[i] = MatchResults;
267    }
268
269    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
270    if (mREs.size() > 1) {
271        MergedResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
272        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
273        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
274    }
275    StreamSetBuffer * Matches = MergedResults;
276    if (mMoveMatchesToEOL) {
277        StreamSetBuffer * OriginalMatches = Matches;
278        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
279        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
280        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
281    }
282
283    if (MaxCountFlag > 0) {
284        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
285        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
286        StreamSetBuffer * const AllMatches = Matches;
287        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
288        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
289    }
290
291    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
292}
293
294parabix::StreamSetBuffer *
295LZParabixGrepGenerator::linefeedStreamFromDecompressedBits(parabix::StreamSetBuffer *decompressedBasisBits) {
296    auto & idb = mPxDriver.getBuilder();
297    const unsigned baseBufferSize = this->getInputBufferBlocks(idb);
298    StreamSetBuffer * LineFeedStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
299    kernel::Kernel * linefeedK = mPxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()}, cc::BitNumbering::BigEndian);
300    mPxDriver.makeKernelCall(linefeedK, {decompressedBasisBits}, {LineFeedStream});
301    return LineFeedStream;
302}
303
304CountOnlyGrepMainFunctionType LZParabixGrepGenerator::getCountOnlyGrepMainFunction() {
305    return reinterpret_cast<CountOnlyGrepMainFunctionType>(mPxDriver.getMain());
306}
Note: See TracBrowser for help on using the repository browser.