source: icGREP/icgrep-devel/icgrep/lzparabix/LZParabixGrepGenerator.cpp @ 6123

Last change on this file since 6123 was 6123, checked in by xwa163, 15 months ago

Encode BitStream directly in LZParabix compressed file

File size: 13.7 KB
Line 
1//
2// Created by wxy325 on 2018/6/19.
3//
4
#include "LZParabixGrepGenerator.h"


#include <boost/iostreams/device/mapped_file.hpp>

#include <llvm/Support/PrettyStackTrace.h>
#include <llvm/Support/ErrorHandling.h>

#include <cc/cc_compiler.h>

#include <kernels/cc_kernel.h>
#include <kernels/s2p_kernel.h>
#include <kernels/p2s_kernel.h>
#include <kernels/source_kernel.h>
#include <kernels/stdout_kernel.h>
#include <kernels/kernel_builder.h>
#include <kernels/swizzle.h>
#include <re/re_toolchain.h>

#include <re/collect_ccs.h>
#include <re/replaceCC.h>

#include <UCD/resolve_properties.h>
#include <kernels/charclasses.h>
#include <kernels/grep_kernel.h>
#include <kernels/UCD_property_kernel.h>
#include <kernels/grapheme_kernel.h>
#include <kernels/linebreak_kernel.h>
#include <kernels/streams_merge.h>
#include <kernels/scanmatchgen.h>
#include <kernels/until_n.h>
#include <re/casing.h>
#include <re/exclude_CC.h>
#include <re/to_utf8.h>
#include <re/re_analysis.h>
#include <re/re_name_resolve.h>
#include <re/re_name_gather.h>
#include <re/re_multiplex.h>
#include <re/re_utility.h>
#include <re/grapheme_clusters.h>
#include <re/printer_re.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Support/Debug.h>
#include <kernels/fake_stream_generating_kernel.h>
48
49namespace re { class CC; }
50
51using namespace llvm;
52using namespace parabix;
53using namespace kernel;
54using namespace grep;
55
56
57LZParabixGrepGenerator::LZParabixGrepGenerator(bool enableMultiplexing): LZParabixGenerator(), mEnableMultiplexing(enableMultiplexing) {
58    mGrepRecordBreak = grep::GrepRecordBreakKind::LF;
59    mMoveMatchesToEOL = true;
60}
61
62void LZParabixGrepGenerator::initREs(std::vector<re::RE *> &REs) {
63    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
64        mBreakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
65    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
66        mBreakCC = re::makeByte(0);  // Null
67    } else {
68        mBreakCC = re::makeByte(0x0A); // LF
69    }
70    re::RE * anchorRE = mBreakCC;
71    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
72        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
73        anchorName->setDefinition(re::makeUnicodeBreak());
74        anchorRE = anchorName;
75    }
76
77    mREs = REs;
78    bool allAnchored = true;
79    for(unsigned i = 0; i < mREs.size(); ++i) {
80        if (!hasEndAnchor(mREs[i])) allAnchored = false;
81        mREs[i] = resolveModesAndExternalSymbols(mREs[i]);
82        mREs[i] = re::exclude_CC(mREs[i], mBreakCC);
83        mREs[i] = resolveAnchors(mREs[i], anchorRE);
84        re::gatherUnicodeProperties(mREs[i], mUnicodeProperties);
85        mREs[i] = regular_expression_passes(mREs[i]);
86    }
87    if (allAnchored && (mGrepRecordBreak != GrepRecordBreakKind::Unicode)) mMoveMatchesToEOL = false;
88}
89
90void LZParabixGrepGenerator::generateCountOnlyMainFunc(const std::unique_ptr<kernel::KernelBuilder> &iBuilder) {
91    Module * M = iBuilder->getModule();
92    Type * const int64Ty = iBuilder->getInt64Ty();
93    Type * const sizeTy = iBuilder->getSizeTy();
94    Type * const boolTy = iBuilder->getIntNTy(sizeof(bool) * 8);
95//    Type * const voidTy = iBuilder->getVoidTy();
96    Type * const inputType = iBuilder->getInt8PtrTy();
97
98    Function * const main = cast<Function>(M->getOrInsertFunction("Main", int64Ty, inputType, sizeTy, sizeTy, boolTy, nullptr));
99    main->setCallingConv(CallingConv::C);
100    Function::arg_iterator args = main->arg_begin();
101    mInputStream = &*(args++);
102    mInputStream->setName("input");
103
104    mHeaderSize = &*(args++);
105    mHeaderSize->setName("mHeaderSize");
106
107    mFileSize = &*(args++);
108    mFileSize->setName("mFileSize");
109
110    mHasBlockChecksum = &*(args++);
111    mHasBlockChecksum->setName("mHasBlockChecksum");
112    // TODO for now, we do not handle blockCheckSum
113    mHasBlockChecksum = iBuilder->getInt1(false);
114
115    iBuilder->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", main, 0));
116}
117
118void LZParabixGrepGenerator::generateCountOnlyAioPipeline(re::RE *regex) {
119    auto & iBuilder = mPxDriver.getBuilder();
120    this->generateCountOnlyMainFunc(iBuilder);
121
122    this->generateLoadByteStreamAndBitStream(iBuilder);
123
124
125    StreamSetBuffer * LineBreakStream;
126    StreamSetBuffer * Matches;
127    std::vector<re::RE*> res = {regex};
128    if (mEnableMultiplexing) {
129        std::tie(LineBreakStream, Matches) = multiplexingGrepPipeline(res);
130    } else {
131        std::tie(LineBreakStream, Matches) = grepPipeline(res);
132    }
133
134//    Kernel * outK = mPxDriver.addKernelInstance<FileSink>(iBuilder, 8);
135//    outK->setInitialArguments({iBuilder->GetString("/Users/wxy325/developer/LZ4-sample-files/workspace/lz4d-normal/8k_.txt")});
136//    mPxDriver.makeKernelCall(outK, {decompressedStream}, {});
137
138    kernel::Kernel * matchCountK = mPxDriver.addKernelInstance<kernel::PopcountKernel>(iBuilder);
139    mPxDriver.makeKernelCall(matchCountK, {Matches}, {});
140    mPxDriver.generatePipelineIR();
141
142    iBuilder->setKernel(matchCountK);
143    Value * matchedLineCount = iBuilder->getAccumulator("countResult");
144    matchedLineCount = iBuilder->CreateZExt(matchedLineCount, iBuilder->getInt64Ty());
145
146    mPxDriver.deallocateBuffers();
147
148    iBuilder->CreateRet(matchedLineCount);
149
150    mPxDriver.finalizeObject();
151}
152
153
154std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *> LZParabixGrepGenerator::multiplexingGrepPipeline(std::vector<re::RE *> &REs) {
155
156    this->initREs(REs);
157    auto mGrepDriver = &mPxDriver;
158
159    auto & idb = mGrepDriver->getBuilder();
160    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
161    const unsigned baseBufferSize = this->getInputBufferBlocks(idb);
162    int MaxCountFlag = 0;
163
164    //  Regular Expression Processing and Analysis Phase
165    const auto nREs = mREs.size();
166
167    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
168
169
170    std::map<std::string, StreamSetBuffer *> propertyStream;
171
172    std::vector<std::string> externalStreamNames;
173    std::set<re::Name *> UnicodeProperties;
174
175    const auto UnicodeSets = re::collectCCs(mREs[0], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
176    StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
177
178
179    this->generateBlockData(idb);
180    StreamSetBuffer * const LiteralBitStream = this->extractLiteralBitStream(idb);
181
182    mpx = make_unique<cc::MultiplexedAlphabet>("mpx", UnicodeSets);
183    mREs[0] = transformCCs(mpx.get(), mREs[0]);
184    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
185    auto numOfCharacterClasses = mpx_basis.size();
186    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
187
188    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), false, cc::BitNumbering::BigEndian);
189    mGrepDriver->makeKernelCall(ccK, {LiteralBitStream}, {CharClasses});
190
191    StreamSetBuffer * CompressedLineFeedStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
192    kernel::Kernel * linefeedK = mPxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()}, cc::BitNumbering::BigEndian);
193    mPxDriver.makeKernelCall(linefeedK, {LiteralBitStream}, {CompressedLineFeedStream});
194
195    auto ret = this->generateBitStreamDecompression(idb, {CharClasses, CompressedLineFeedStream});
196//    auto ret = this->generateAioBitStreamDecompressoin(idb, {CharClasses, CompressedLineFeedStream});
197
198    StreamSetBuffer * decompressedCharClasses = ret[0];
199    StreamSetBuffer * LineBreakStream = ret[1];
200
201
202    StreamSetBuffer * fakeMatchCopiedBits = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(8), this->getInputBufferBlocks(idb));
203    Kernel* fakeStreamGeneratorK = mPxDriver.addKernelInstance<FakeStreamGeneratingKernel>(idb, numOfCharacterClasses, 8);
204    mPxDriver.makeKernelCall(fakeStreamGeneratorK, {decompressedCharClasses}, {fakeMatchCopiedBits});
205
206    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[0], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()}, cc::BitNumbering::BigEndian);
207    mGrepDriver->makeKernelCall(icgrepK, {fakeMatchCopiedBits, decompressedCharClasses}, {MatchResults});
208    MatchResultsBufs[0] = MatchResults;
209
210    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
211    if (mREs.size() > 1) {
212        MergedResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
213        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
214        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
215    }
216    StreamSetBuffer * Matches = MergedResults;
217    if (mMoveMatchesToEOL) {
218        StreamSetBuffer * OriginalMatches = Matches;
219        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
220        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
221        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
222    }
223
224    if (MaxCountFlag > 0) {
225        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
226        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
227        StreamSetBuffer * const AllMatches = Matches;
228        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
229        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
230    }
231
232    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
233};
234
235
236std::pair<parabix::StreamSetBuffer *, parabix::StreamSetBuffer *>
237LZParabixGrepGenerator::grepPipeline(std::vector<re::RE *> &REs) {
238
239    this->initREs(REs);
240    auto mGrepDriver = &mPxDriver;
241
242    auto & idb = mGrepDriver->getBuilder();
243    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
244    const unsigned baseBufferSize = this->getInputBufferBlocks(idb);
245    int MaxCountFlag = 0;
246
247    //  Regular Expression Processing and Analysis Phase
248    const auto nREs = mREs.size();
249
250    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
251
252
253    this->generateBlockData(idb);
254    StreamSetBuffer * const LiteralBitStream = this->extractLiteralBitStream(idb);
255    auto compressedLineBreakStream = this->linefeedStreamFromDecompressedBits(LiteralBitStream);
256
257
258    auto ret = this->generateBitStreamDecompression(idb, {LiteralBitStream, compressedLineBreakStream});
259    StreamSetBuffer * decompressedBasisBits = ret[0];
260    StreamSetBuffer * LineBreakStream = ret[1];
261
262//    StreamSetBuffer * decompressedBasisBits = this->generateAioBitStreamDecompressoin(idb, {mCompressedBasisBits})[0];
263//    StreamSetBuffer * LineBreakStream = this->linefeedStreamFromDecompressedBits(decompressedBasisBits);
264
265    std::map<std::string, StreamSetBuffer *> propertyStream;
266
267    for(unsigned i = 0; i < nREs; ++i) {
268        std::vector<std::string> externalStreamNames;
269        std::vector<StreamSetBuffer *> icgrepInputSets = {decompressedBasisBits};
270
271        std::set<re::Name *> UnicodeProperties;
272
273        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
274        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>(), cc::BitNumbering::BigEndian);
275        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
276        MatchResultsBufs[i] = MatchResults;
277    }
278
279    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
280    if (mREs.size() > 1) {
281        MergedResults = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
282        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
283        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
284    }
285    StreamSetBuffer * Matches = MergedResults;
286    if (mMoveMatchesToEOL) {
287        StreamSetBuffer * OriginalMatches = Matches;
288        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
289        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
290        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
291    }
292
293    if (MaxCountFlag > 0) {
294        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
295        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
296        StreamSetBuffer * const AllMatches = Matches;
297        Matches = mGrepDriver->addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
298        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
299    }
300
301    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
302}
303
304parabix::StreamSetBuffer *
305LZParabixGrepGenerator::linefeedStreamFromDecompressedBits(parabix::StreamSetBuffer *decompressedBasisBits) {
306    auto & idb = mPxDriver.getBuilder();
307    const unsigned baseBufferSize = this->getInputBufferBlocks(idb);
308    StreamSetBuffer * LineFeedStream = mPxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
309    kernel::Kernel * linefeedK = mPxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()}, cc::BitNumbering::BigEndian);
310    mPxDriver.makeKernelCall(linefeedK, {decompressedBasisBits}, {LineFeedStream});
311    return LineFeedStream;
312}
313
314CountOnlyGrepMainFunctionType LZParabixGrepGenerator::getCountOnlyGrepMainFunction() {
315    return reinterpret_cast<CountOnlyGrepMainFunctionType>(mPxDriver.getMain());
316}
Note: See TracBrowser for help on using the repository browser.