source: icGREP/icgrep-devel/icgrep/lz4/LZ4Generator.cpp @ 5921

Last change on this file since 5921 was 5921, checked in by xwa163, 12 months ago
  1. Initial check-in of the new approach for the LZ4 index decoder, which always uses a 4 MB buffer
  2. Add test cases for the new approach (for now, the test cases will fail when the test file is larger than 4 MB)
File size: 16.4 KB
Line 
1
2#include "LZ4Generator.h"
3
4#include <boost/filesystem.hpp>
5#include <boost/iostreams/device/mapped_file.hpp>
6
7#include <llvm/Support/CommandLine.h>
8#include <llvm/Support/PrettyStackTrace.h>
9
10#include <cc/cc_compiler.h>
11
12#include <lz4FrameDecoder.h>
13#include <kernels/streamset.h>
14#include <kernels/cc_kernel.h>
15#include <kernels/s2p_kernel.h>
16#include <kernels/p2s_kernel.h>
17#include <kernels/source_kernel.h>
18#include <kernels/stdout_kernel.h>
19#include <kernels/lz4/lz4_extract_e_m0.h>
20#include <kernels/lz4/lz4_generate_deposit_stream.h>
21#include <kernels/lz4/lz4_numbers_to_bitstream_kernel.h>
22#include <kernels/lz4/lz4_bitstream_not_kernel.h>
23#include <kernels/kernel_builder.h>
24#include <kernels/lz4/lz4_block_decoder.h>
25#include <kernels/deletion.h>
26#include <kernels/swizzle.h>
27#include <kernels/pdep_kernel.h>
28#include <kernels/lz4/lz4_multiple_pdep_kernel.h>
29#include <kernels/lz4/lz4_match_copy_kernel.h>
30#include <kernels/lz4/lz4_swizzled_match_copy_kernel.h>
31#include <kernels/lz4/lz4_block_decoder_new.h>
32#include <kernels/lz4/lz4_index_builder.h>
33
34namespace re { class CC; }
35
36using namespace llvm;
37using namespace parabix;
38using namespace kernel;
39
40LZ4Generator::LZ4Generator():pxDriver("lz4d") {
41
42}
43
44MainFunctionType LZ4Generator::getMainFunc() {
45    return reinterpret_cast<MainFunctionType>(pxDriver.getMain());
46}
47
48
49
50void LZ4Generator::generateExtractOnlyPipeline(const std::string& outputFile) {
51    auto & iBuilder = pxDriver.getBuilder();
52    this->generateMainFunc(iBuilder);
53
54    StreamSetBuffer * const DecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
55
56    // GeneratePipeline
57    this->generateLoadByteStreamAndBitStream(iBuilder);
58
59    this->generateExtractAndDepositMarkers(iBuilder);
60
61    auto swizzle = this->generateSwizzleExtractData(iBuilder);
62
63
64    // Produce unswizzled bit streams
65    StreamSetBuffer * extractedbits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8), this->getInputBufferBlocks());
66    Kernel * unSwizzleK = pxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 8, 1, 2);
67
68    pxDriver.makeKernelCall(unSwizzleK, {swizzle.first, swizzle.second}, {extractedbits});
69
70
71    Kernel * p2sK = pxDriver.addKernelInstance<P2SKernel>(iBuilder);
72    pxDriver.makeKernelCall(p2sK, {extractedbits}, {DecompressedByteStream});
73
74    // --------------------------------------------------------
75    // End
76    Kernel * outK = pxDriver.addKernelInstance<FileSink>(iBuilder, 8);
77
78    outK->setInitialArguments({iBuilder->GetString(outputFile)});
79    pxDriver.makeKernelCall(outK, {DecompressedByteStream}, {});
80
81    pxDriver.generatePipelineIR();
82    pxDriver.deallocateBuffers();
83
84    iBuilder->CreateRetVoid();
85
86    pxDriver.finalizeObject();
87}
88
89void LZ4Generator::generateExtractAndDepositOnlyPipeline(const std::string &outputFile) {
90    auto & iBuilder = pxDriver.getBuilder();
91    this->generateMainFunc(iBuilder);
92
93    StreamSetBuffer * const DecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
94    StreamSetBuffer * const FinalDecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
95
96
97
98    // GeneratePipeline
99    this->generateLoadByteStreamAndBitStream(iBuilder);
100    this->generateExtractAndDepositMarkers(iBuilder);
101
102    auto swizzle = this->generateSwizzleExtractData(iBuilder);
103
104    StreamSetBuffer * depositedSwizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
105    StreamSetBuffer * depositedSwizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
106
107    Kernel * multiplePdepK = pxDriver.addKernelInstance<LZ4MultiplePDEPkernel>(iBuilder, 4, 2, 4);
108    pxDriver.makeKernelCall(multiplePdepK, {DepositMarker, swizzle.first, swizzle.second}, {depositedSwizzle0, depositedSwizzle1});
109
110    // Produce unswizzled bit streams
111    StreamSetBuffer * extractedbits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8), this->getInputBufferBlocks());
112    Kernel * unSwizzleK = pxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 8, 1, 2);
113    pxDriver.makeKernelCall(unSwizzleK, {depositedSwizzle0, depositedSwizzle1}, {extractedbits});
114
115//    pxDriver.makeKernelCall(unSwizzleK, {u16Swizzle0, u16Swizzle1}, {extractedbits});
116
117    Kernel * p2sK = pxDriver.addKernelInstance<P2SKernel>(iBuilder);
118    pxDriver.makeKernelCall(p2sK, {extractedbits}, {DecompressedByteStream});
119
120    // --------------------------------------------------------
121    // End
122    Kernel * outK = pxDriver.addKernelInstance<FileSink>(iBuilder, 8);
123    outK->setInitialArguments({iBuilder->GetString(outputFile)});
124    pxDriver.makeKernelCall(outK, {DecompressedByteStream}, {});
125
126    pxDriver.generatePipelineIR();
127    pxDriver.deallocateBuffers();
128
129    iBuilder->CreateRetVoid();
130
131    pxDriver.finalizeObject();
132}
133
134void LZ4Generator::generatePipeline(const std::string& outputFile) {
135    auto & iBuilder = pxDriver.getBuilder();
136    this->generateMainFunc(iBuilder);
137
138    StreamSetBuffer * const DecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
139//    StreamSetBuffer * const FinalDecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
140
141
142    // GeneratePipeline
143    this->generateLoadByteStreamAndBitStream(iBuilder);
144    this->generateExtractAndDepositMarkers(iBuilder);
145
146    auto swizzle = this->generateSwizzleExtractData(iBuilder);
147
148    //TODO buffer blocks should be decompressedBufferBlocks
149    StreamSetBuffer * depositedSwizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
150    StreamSetBuffer * depositedSwizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
151
152    Kernel * multiplePdepK = pxDriver.addKernelInstance<LZ4MultiplePDEPkernel>(iBuilder, 4, 2, 4);
153    pxDriver.makeKernelCall(multiplePdepK, {DepositMarker, swizzle.first, swizzle.second}, {depositedSwizzle0, depositedSwizzle1});
154
155
156    StreamSetBuffer * matchCopiedSwizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
157    StreamSetBuffer * matchCopiedSwizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
158
159    Kernel * swizzledMatchCopyK = pxDriver.addKernelInstance<LZ4SwizzledMatchCopyKernel>(iBuilder, 4, 2, 4);
160    pxDriver.makeKernelCall(swizzledMatchCopyK, {M0_Start, M0_End, Match_Offset, depositedSwizzle0, depositedSwizzle1}, {matchCopiedSwizzle0, matchCopiedSwizzle1});
161
162
163    // Produce unswizzled bit streams
164    StreamSetBuffer * extractedbits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8), this->getInputBufferBlocks());
165    Kernel * unSwizzleK = pxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 8, 1, 2);
166    pxDriver.makeKernelCall(unSwizzleK, {matchCopiedSwizzle0, matchCopiedSwizzle1}, {extractedbits});
167//    pxDriver.makeKernelCall(unSwizzleK, {depositedSwizzle0, depositedSwizzle1}, {extractedbits});
168
169//    pxDriver.makeKernelCall(unSwizzleK, {u16Swizzle0, u16Swizzle1}, {extractedbits});
170
171
172    Kernel * p2sK = pxDriver.addKernelInstance<P2SKernel>(iBuilder);
173    pxDriver.makeKernelCall(p2sK, {extractedbits}, {DecompressedByteStream});
174
175//    Kernel * matchCopyK = pxDriver.addKernelInstance<LZ4MatchCopyKernel>(iBuilder);
176//    pxDriver.makeKernelCall(matchCopyK, {DecompressedByteStream, M0_Start, M0_End, Match_Offset}, {FinalDecompressedByteStream});
177
178    // --------------------------------------------------------
179    // End
180    Kernel * outK = pxDriver.addKernelInstance<FileSink>(iBuilder, 8);
181    outK->setInitialArguments({iBuilder->GetString(outputFile)});
182    pxDriver.makeKernelCall(outK, {DecompressedByteStream}, {});
183//    pxDriver.makeKernelCall(outK, {FinalDecompressedByteStream}, {});
184
185    pxDriver.generatePipelineIR();
186    pxDriver.deallocateBuffers();
187
188    iBuilder->CreateRetVoid();
189
190    pxDriver.finalizeObject();
191}
192
193void LZ4Generator::generateMainFunc(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
194    Module * M = iBuilder->getModule();
195    Type * const sizeTy = iBuilder->getSizeTy();
196    Type * const boolTy = iBuilder->getIntNTy(sizeof(bool) * 8);
197    Type * const voidTy = iBuilder->getVoidTy();
198    Type * const inputType = iBuilder->getInt8PtrTy();
199
200    Function * const main = cast<Function>(M->getOrInsertFunction("Main", voidTy, inputType, sizeTy, sizeTy, boolTy, nullptr));
201    main->setCallingConv(CallingConv::C);
202    Function::arg_iterator args = main->arg_begin();
203    inputStream = &*(args++);
204    inputStream->setName("input");
205
206    headerSize = &*(args++);
207    headerSize->setName("headerSize");
208
209    fileSize = &*(args++);
210    fileSize->setName("fileSize");
211
212    hasBlockChecksum = &*(args++);
213    hasBlockChecksum->setName("hasBlockChecksum");
214
215    iBuilder->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", main, 0));
216}
217
218void LZ4Generator::generateLoadByteStreamAndBitStream(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
219    ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
220    BasisBits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), this->getInputBufferBlocks());
221
222
223    kernel::Kernel * sourceK = pxDriver.addKernelInstance<MemorySourceKernel>(iBuilder, iBuilder->getInt8PtrTy());
224    sourceK->setInitialArguments({inputStream, fileSize});
225    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
226    Kernel * s2pk = pxDriver.addKernelInstance<S2PKernel>(iBuilder, /*aligned = */ true);
227    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
228}
229
230void LZ4Generator::generateExtractAndDepositMarkers(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
231    //// Decode Block Information
232    StreamSetBuffer * const BlockData_IsCompressed = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
233    StreamSetBuffer * const BlockData_BlockStart = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
234    StreamSetBuffer * const BlockData_BlockEnd = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
235
236    Kernel * blockDecoderK = pxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder);
237    blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(hasBlockChecksum, iBuilder->getInt1Ty()), headerSize});
238    pxDriver.makeKernelCall(blockDecoderK, {ByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd});
239
240
241    //// Generate Helper Markers Extenders, FX, XF
242    StreamSetBuffer * const Extenders = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
243    StreamSetBuffer * const CC_0xFX = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
244    StreamSetBuffer * const CC_0xXF = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
245
246
247    Kernel * extenderK = pxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "extenders", std::vector<re::CC *>{re::makeCC(0xFF)}, 8);
248    pxDriver.makeKernelCall(extenderK, {BasisBits}, {Extenders});
249
250    re::CC* xfCC = re::makeCC(0x0f);
251    re::CC* fxCC = re::makeCC(0xf0);
252    for (re::codepoint_t i = 1; i <= 0xf; i++) {
253        xfCC = re::makeCC(xfCC, re::makeCC(i * 0x10 + 0x0f));
254        fxCC = re::makeCC(fxCC, re::makeCC(0xf0 + i));
255    }
256
257    Kernel * CC_0xFXKernel = pxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "CC_0xFX", std::vector<re::CC *>{fxCC}, 8);
258    pxDriver.makeKernelCall(CC_0xFXKernel, {BasisBits}, {CC_0xFX});
259
260    Kernel * CC_0xXFKernel = pxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "CC_0xXF", std::vector<re::CC *>{xfCC}, 8);
261    pxDriver.makeKernelCall(CC_0xXFKernel, {BasisBits}, {CC_0xXF});
262
263
264    //// Generate Extract/Deposit Markers, M0_Start, M0_End, MatchOffset
265
266    size_t m0BufferSize = this->getDecompressedBufferBlocks() * 2;
267    size_t e1BufferSize = this->getInputBufferBlocks();
268
269    M0_Start = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
270    M0_End = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
271
272    //TODO handle uncompressed part
273    StreamSetBuffer * const UncompressedStartPos = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
274    StreamSetBuffer * const UncompressedLength = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
275    StreamSetBuffer * const UncompressedOutputPos = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
276
277    EMarker = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), e1BufferSize);
278    StreamSetBuffer * const M0Marker = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), m0BufferSize);
279    DepositMarker = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), m0BufferSize);
280    Match_Offset = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
281
282
283
284    std::map<std::string, size_t> m_empty = {};
285
286    Kernel * extractEM0K = pxDriver.addKernelInstance<LZ4ExtractEM0Kernel>(iBuilder, m_empty);
287
288    pxDriver.makeKernelCall(
289            extractEM0K,
290            {
291                    ByteStream,
292                    Extenders,
293                    CC_0xFX,
294                    CC_0xXF,
295
296                    // Block Data
297                    BlockData_IsCompressed,
298                    BlockData_BlockStart,
299                    BlockData_BlockEnd
300            }, {
301                    //Uncompressed Data
302                    UncompressedStartPos,
303                    UncompressedLength,
304                    UncompressedOutputPos,
305
306                    EMarker,
307                    M0_Start,
308                    M0_End,
309                    Match_Offset
310            });
311
312
313
314    Kernel * buildM0StartMarkerK = pxDriver.addKernelInstance<LZ4NumbersToBitstreamKernel>("buildM0Marker", iBuilder);
315    pxDriver.makeKernelCall(buildM0StartMarkerK, {M0_Start, M0_End}, {M0Marker});
316
317
318    Kernel * generateDepositK = pxDriver.addKernelInstance<LZ4GenerateDepositStreamKernel>(iBuilder);
319    pxDriver.makeKernelCall(generateDepositK, {M0Marker}, {DepositMarker}); // TODO deposit
320
321}
322
323std::pair<StreamSetBuffer*, StreamSetBuffer*> LZ4Generator::generateSwizzleExtractData(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
324    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
325    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
326
327
328    StreamSetBuffer * const DeletionMask = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
329
330    Kernel * ExtractToDeletionMaskK = pxDriver.addKernelInstance<LZ4BitStreamNotKernel>(iBuilder);
331    pxDriver.makeKernelCall(
332            ExtractToDeletionMaskK,
333            {
334                    EMarker
335            }, {
336                    DeletionMask
337            }
338    );
339
340    Kernel * delK = pxDriver.addKernelInstance<SwizzledDeleteByPEXTkernel>(iBuilder, 64, 8);
341    pxDriver.makeKernelCall(delK, {DeletionMask, BasisBits}, {u16Swizzle0, u16Swizzle1});
342    return std::make_pair(u16Swizzle0, u16Swizzle1);
343}
344
345int LZ4Generator::getInputBufferBlocks() {
346    const int segmentSize = codegen::SegmentSize;
347    const int bufferSegments = codegen::BufferSegments * codegen::ThreadNum * 8 * 16 * 32 * 2;
348    return segmentSize * bufferSegments * 16;
349}
350
351int LZ4Generator::getDecompressedBufferBlocks() {
352    const unsigned copyBackWindowBlocks = 256U * 256U / codegen::BlockSize;
353    // At least * 2 since we need to leave 1 window as source of match copy,
354    // while the other window as the destination for match copy
355    const unsigned decompressBufBlocks = copyBackWindowBlocks * 2;
356    return decompressBufBlocks;
357}
358
359
360
361
362// Kernel Pipeline
Note: See TracBrowser for help on using the repository browser.