source: icGREP/icgrep-devel/icgrep/lzparabix/LZParabixGrepGenerator.cpp @ 6114

Last change on this file since 6114 was 6114, checked in by xwa163, 10 months ago

Initial check-in of the kernels and pipelines for the new compression format (lzparabix), including the compressor, decoder, and grep.

File size: 13.1 KB
Line 
1//
2// Created by wxy325 on 2018/6/19.
3//
4
#include "LZParabixGrepGenerator.h"


#include <boost/iostreams/device/mapped_file.hpp>

#include <llvm/Support/PrettyStackTrace.h>

#include <cc/cc_compiler.h>

#include <kernels/cc_kernel.h>
#include <kernels/s2p_kernel.h>
#include <kernels/p2s_kernel.h>
#include <kernels/source_kernel.h>
#include <kernels/stdout_kernel.h>
#include <kernels/kernel_builder.h>
#include <kernels/swizzle.h>
#include <re/re_toolchain.h>

#include <re/collect_ccs.h>
#include <re/replaceCC.h>

#include <UCD/resolve_properties.h>
#include <kernels/charclasses.h>
#include <kernels/grep_kernel.h>
#include <kernels/UCD_property_kernel.h>
#include <kernels/grapheme_kernel.h>
#include <kernels/linebreak_kernel.h>
#include <kernels/streams_merge.h>
#include <kernels/scanmatchgen.h>
#include <kernels/until_n.h>
#include <re/casing.h>
#include <re/exclude_CC.h>
#include <re/to_utf8.h>
#include <re/re_analysis.h>
#include <re/re_name_resolve.h>
#include <re/re_name_gather.h>
#include <re/re_multiplex.h>
#include <re/re_utility.h>
#include <re/grapheme_clusters.h>
#include <re/printer_re.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Support/Debug.h>
#include <llvm/Support/ErrorHandling.h>
#include <kernels/fake_stream_generating_kernel.h>
48
49namespace re { class CC; }
50
51using namespace llvm;
52using namespace parabix;
53using namespace kernel;
54using namespace grep;
55
56
57LZParabixGrepGenerator::LZParabixGrepGenerator(bool enableMultiplexing): LZParabixGenerator(), mEnableMultiplexing(enableMultiplexing) {
58    mGrepRecordBreak = grep::GrepRecordBreakKind::LF;
59    mMoveMatchesToEOL = true;
60}
61
62void LZParabixGrepGenerator::initREs(std::vector<re::RE *> &REs) {
63    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
64        mBreakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
65    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
66        mBreakCC = re::makeByte(0);  // Null
67    } else {
68        mBreakCC = re::makeByte(0x0A); // LF
69    }
70    re::RE * anchorRE = mBreakCC;
71    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
72        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
73        anchorName->setDefinition(re::makeUnicodeBreak());
74        anchorRE = anchorName;
75    }
76
77    mREs = REs;
78    bool allAnchored = true;
79    for(unsigned i = 0; i < mREs.size(); ++i) {
80        if (!hasEndAnchor(mREs[i])) allAnchored = false;
81        mREs[i] = resolveModesAndExternalSymbols(mREs[i]);
82        mREs[i] = re::exclude_CC(mREs[i], mBreakCC);
83        mREs[i] = resolveAnchors(mREs[i], anchorRE);
84        re::gatherUnicodeProperties(mREs[i], mUnicodeProperties);
85        mREs[i] = regular_expression_passes(mREs[i]);
86    }
87    if (allAnchored && (mGrepRecordBreak != GrepRecordBreakKind::Unicode)) mMoveMatchesToEOL = false;
88}
89
90void LZParabixGrepGenerator::generateCountOnlyMainFunc(const std::unique_ptr<kernel::KernelBuilder> &iBuilder) {
91    Module * M = iBuilder->getModule();
92    Type * const int64Ty = iBuilder->getInt64Ty();
93    Type * const sizeTy = iBuilder->getSizeTy();
94    Type * const boolTy = iBuilder->getIntNTy(sizeof(bool) * 8);
95//    Type * const voidTy = iBuilder->getVoidTy();
96    Type * const inputType = iBuilder->getInt8PtrTy();
97
98    Function * const main = cast<Function>(M->getOrInsertFunction("Main", int64Ty, inputType, sizeTy, sizeTy, boolTy, nullptr));
99    main->setCallingConv(CallingConv::C);
100    Function::arg_iterator args = main->arg_begin();
101    mInputStream = &*(args++);
102    mInputStream->setName("input");
103
104    mHeaderSize = &*(args++);
105    mHeaderSize->setName("mHeaderSize");
106
107    mFileSize = &*(args++);
108    mFileSize->setName("mFileSize");
109
110    mHasBlockChecksum = &*(args++);
111    mHasBlockChecksum->setName("mHasBlockChecksum");
112    // TODO for now, we do not handle blockCheckSum
113    mHasBlockChecksum = iBuilder->getInt1(false);
114
115    iBuilder->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", main, 0));
116}
117
118void LZParabixGrepGenerator::generateCountOnlyAioPipeline(re::RE *regex) {
119    auto & iBuilder = mPxDriver.getBuilder();
120    this->generateCountOnlyMainFunc(iBuilder);
121
122    this->generateLoadByteStreamAndBitStream(iBuilder);
123
124
125    StreamSetBuffer * LineBreakStream;
126    StreamSetBuffer * Matches;
127    std::vector<re::RE*> res = {regex};
128//    std::tie(LineBreakStream, Matches) = grepPipeline(res);
129    std::tie(LineBreakStream, Matches) = multiplexingGrepPipeline(res);
130
131
132//    Kernel * outK = mPxDriver.addKernelInstance<FileSink>(iBuilder, 8);
133//    outK->setInitialArguments({iBuilder->GetString("/Users/wxy325/developer/LZ4-sample-files/workspace/lz4d-normal/8k_.txt")});
134//    mPxDriver.makeKernelCall(outK, {decompressedStream}, {});
135
136    kernel::Kernel * matchCountK = mPxDriver.addKernelInstance<kernel::PopcountKernel>(iBuilder);
137    mPxDriver.makeKernelCall(matchCountK, {Matches}, {});
138    mPxDriver.generatePipelineIR();
139
140    iBuilder->setKernel(matchCountK);
141    Value * matchedLineCount = iBuilder->getAccumulator("countResult");
142    matchedLineCount = iBuilder->CreateZExt(matchedLineCount, iBuilder->getInt64Ty());
143
144    mPxDriver.deallocateBuffers();
145
146    iBuilder->CreateRet(matchedLineCount);
147
148    mPxDriver.finalizeObject();
149}
150
151
152std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *> LZParabixGrepGenerator::multiplexingGrepPipeline(std::vector<re::RE *> &REs) {
153
154    this->initREs(REs);
155    auto mGrepDriver = &mPxDriver;
156
157    auto & idb = mGrepDriver->getBuilder();
158    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
159    const unsigned baseBufferSize = this->getInputBufferBlocks(idb);
160    int MaxCountFlag = 0;
161
162    //  Regular Expression Processing and Analysis Phase
163    const auto nREs = mREs.size();
164
165    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
166
167
168    std::map<std::string, StreamSetBuffer *> propertyStream;
169
170    std::vector<std::string> externalStreamNames;
171    std::set<re::Name *> UnicodeProperties;
172
173    const auto UnicodeSets = re::collectCCs(mREs[0], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
174    StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
175
176    mpx = make_unique<cc::MultiplexedAlphabet>("mpx", UnicodeSets);
177    mREs[0] = transformCCs(mpx.get(), mREs[0]);
178    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
179    auto numOfCharacterClasses = mpx_basis.size();
180    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
181
182    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
183    mGrepDriver->makeKernelCall(ccK, {mCompressedBasisBits}, {CharClasses});
184
185    StreamSetBuffer * CompressedLineFeedStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
186    kernel::Kernel * linefeedK = mPxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
187    mPxDriver.makeKernelCall(linefeedK, {mCompressedBasisBits}, {CompressedLineFeedStream});
188
189    auto ret = this->generateAioBitStreamDecompressoin(idb, {CharClasses, CompressedLineFeedStream});
190
191    StreamSetBuffer * decompressedCharClasses = ret[0];
192    StreamSetBuffer * LineBreakStream = ret[1];
193
194
195    StreamSetBuffer * fakeMatchCopiedBits = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(8), this->getInputBufferBlocks(idb));
196    Kernel* fakeStreamGeneratorK = mPxDriver.addKernelInstance<FakeStreamGeneratingKernel>(idb, numOfCharacterClasses, 8);
197    mPxDriver.makeKernelCall(fakeStreamGeneratorK, {decompressedCharClasses}, {fakeMatchCopiedBits});
198
199    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[0], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
200    mGrepDriver->makeKernelCall(icgrepK, {fakeMatchCopiedBits, decompressedCharClasses}, {MatchResults});
201    MatchResultsBufs[0] = MatchResults;
202
203    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
204    if (mREs.size() > 1) {
205        MergedResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
206        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
207        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
208    }
209    StreamSetBuffer * Matches = MergedResults;
210    if (mMoveMatchesToEOL) {
211        StreamSetBuffer * OriginalMatches = Matches;
212        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
213        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
214        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
215    }
216
217    if (MaxCountFlag > 0) {
218        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
219        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
220        StreamSetBuffer * const AllMatches = Matches;
221        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
222        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
223    }
224
225    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
226};
227
228
229std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *>
230LZParabixGrepGenerator::grepPipeline(std::vector<re::RE *> &REs) {
231
232    this->initREs(REs);
233    auto mGrepDriver = &mPxDriver;
234
235    auto & idb = mGrepDriver->getBuilder();
236    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
237    const unsigned baseBufferSize = this->getInputBufferBlocks(idb);
238    int MaxCountFlag = 0;
239
240    //  Regular Expression Processing and Analysis Phase
241    const auto nREs = mREs.size();
242
243    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
244
245    StreamSetBuffer * CompressedLineBreakStream = this->linefeedStreamFromDecompressedBits(mCompressedBasisBits);
246    auto ret = this->generateAioBitStreamDecompressoin(idb, {mCompressedBasisBits, CompressedLineBreakStream});
247    StreamSetBuffer * decompressedBasisBits = ret[0];
248    StreamSetBuffer * LineBreakStream = ret[1];
249
250//    StreamSetBuffer * decompressedBasisBits = this->generateAioBitStreamDecompressoin(idb, {mCompressedBasisBits})[0];
251//    StreamSetBuffer * LineBreakStream = this->linefeedStreamFromDecompressedBits(decompressedBasisBits);
252
253    std::map<std::string, StreamSetBuffer *> propertyStream;
254
255    for(unsigned i = 0; i < nREs; ++i) {
256        std::vector<std::string> externalStreamNames;
257        std::vector<StreamSetBuffer *> icgrepInputSets = {decompressedBasisBits};
258
259        std::set<re::Name *> UnicodeProperties;
260
261        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
262        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
263        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
264        MatchResultsBufs[i] = MatchResults;
265    }
266
267    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
268    if (mREs.size() > 1) {
269        MergedResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
270        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
271        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
272    }
273    StreamSetBuffer * Matches = MergedResults;
274    if (mMoveMatchesToEOL) {
275        StreamSetBuffer * OriginalMatches = Matches;
276        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
277        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
278        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
279    }
280
281    if (MaxCountFlag > 0) {
282        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
283        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
284        StreamSetBuffer * const AllMatches = Matches;
285        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
286        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
287    }
288
289    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
290}
291
292parabix::StreamSetBuffer *
293LZParabixGrepGenerator::linefeedStreamFromDecompressedBits(parabix::StreamSetBuffer *decompressedBasisBits) {
294    auto & idb = mPxDriver.getBuilder();
295    const unsigned baseBufferSize = this->getInputBufferBlocks(idb);
296    StreamSetBuffer * LineFeedStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
297    kernel::Kernel * linefeedK = mPxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
298    mPxDriver.makeKernelCall(linefeedK, {decompressedBasisBits}, {LineFeedStream});
299    return LineFeedStream;
300}
301
302CountOnlyGrepMainFunctionType LZParabixGrepGenerator::getCountOnlyGrepMainFunction() {
303    return reinterpret_cast<CountOnlyGrepMainFunctionType>(mPxDriver.getMain());
304}
Note: See TracBrowser for help on using the repository browser.