source: icGREP/icgrep-devel/icgrep/lz4/LZ4Generator.cpp @ 5895

Last change on this file since 5895 was 5895, checked in by xwa163, 15 months ago
  1. Fix match copy kernel in large file for new infrastructure
  2. Enable testing for full LZ4 decode pipeline
File size: 15.5 KB
Line 
1
2#include "LZ4Generator.h"
3
4#include <boost/filesystem.hpp>
5#include <boost/iostreams/device/mapped_file.hpp>
6
7#include <llvm/Support/CommandLine.h>
8#include <llvm/Support/PrettyStackTrace.h>
9
10#include <cc/cc_compiler.h>
11
12#include <lz4FrameDecoder.h>
13#include <kernels/streamset.h>
14#include <kernels/cc_kernel.h>
15#include <kernels/s2p_kernel.h>
16#include <kernels/p2s_kernel.h>
17#include <kernels/source_kernel.h>
18#include <kernels/stdout_kernel.h>
19#include <kernels/lz4/lz4_extract_e_m0.h>
20#include <kernels/lz4/lz4_generate_deposit_stream.h>
21#include <kernels/lz4/lz4_numbers_to_bitstream_kernel.h>
22//#include <kernels/LZ4MarkerToMaskKernel.h>
23#include <kernels/lz4/lz4_bitstream_not_kernel.h>
24#include <kernels/kernel_builder.h>
25#include <kernels/lz4/lz4_block_decoder.h>
26#include <kernels/deletion.h>
27#include <kernels/swizzle.h>
28#include <kernels/pdep_kernel.h>
29#include <kernels/lz4/lz4_multiple_pdep_kernel.h>
30#include <kernels/lz4/lz4_match_copy_kernel.h>
31
32namespace re { class CC; }
33
34using namespace llvm;
35using namespace parabix;
36using namespace kernel;
37
38LZ4Generator::LZ4Generator():pxDriver("lz4d") {
39
40}
41
42MainFunctionType LZ4Generator::getMainFunc() {
43    return reinterpret_cast<MainFunctionType>(pxDriver.getMain());
44}
45
46
47
48void LZ4Generator::generateExtractOnlyPipeline(const std::string& outputFile) {
49    auto & iBuilder = pxDriver.getBuilder();
50    this->generateMainFunc(iBuilder);
51
52    StreamSetBuffer * const DecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
53
54    // GeneratePipeline
55    this->generateLoadByteStreamAndBitStream(iBuilder);
56
57    this->generateExtractAndDepositMarkers(iBuilder);
58
59    auto swizzle = this->generateSwizzleExtractData(iBuilder);
60
61
62    // Produce unswizzled bit streams
63    StreamSetBuffer * extractedbits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8), this->getInputBufferBlocks());
64    Kernel * unSwizzleK = pxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 8, 1, 2);
65
66    pxDriver.makeKernelCall(unSwizzleK, {swizzle.first, swizzle.second}, {extractedbits});
67
68
69    Kernel * p2sK = pxDriver.addKernelInstance<P2SKernel>(iBuilder);
70    pxDriver.makeKernelCall(p2sK, {extractedbits}, {DecompressedByteStream});
71
72    // --------------------------------------------------------
73    // End
74    Kernel * outK = pxDriver.addKernelInstance<FileSink>(iBuilder, 8);
75
76    outK->setInitialArguments({iBuilder->GetString(outputFile)});
77    pxDriver.makeKernelCall(outK, {DecompressedByteStream}, {});
78
79    pxDriver.generatePipelineIR();
80    pxDriver.deallocateBuffers();
81
82    iBuilder->CreateRetVoid();
83
84    pxDriver.finalizeObject();
85}
86
87void LZ4Generator::generateExtractAndDepositOnlyPipeline(const std::string &outputFile) {
88    auto & iBuilder = pxDriver.getBuilder();
89    this->generateMainFunc(iBuilder);
90
91    StreamSetBuffer * const DecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
92    StreamSetBuffer * const FinalDecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
93
94
95
96    // GeneratePipeline
97    this->generateLoadByteStreamAndBitStream(iBuilder);
98    this->generateExtractAndDepositMarkers(iBuilder);
99
100    auto swizzle = this->generateSwizzleExtractData(iBuilder);
101
102    StreamSetBuffer * depositedSwizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
103    StreamSetBuffer * depositedSwizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
104
105    Kernel * multiplePdepK = pxDriver.addKernelInstance<LZ4MultiplePDEPkernel>(iBuilder, 4, 2, 4);
106    pxDriver.makeKernelCall(multiplePdepK, {DepositMarker, swizzle.first, swizzle.second}, {depositedSwizzle0, depositedSwizzle1});
107
108    // Produce unswizzled bit streams
109    StreamSetBuffer * extractedbits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8), this->getInputBufferBlocks());
110    Kernel * unSwizzleK = pxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 8, 1, 2);
111    pxDriver.makeKernelCall(unSwizzleK, {depositedSwizzle0, depositedSwizzle1}, {extractedbits});
112
113//    pxDriver.makeKernelCall(unSwizzleK, {u16Swizzle0, u16Swizzle1}, {extractedbits});
114
115    // TODO MatchCopy before p2s
116
117    Kernel * p2sK = pxDriver.addKernelInstance<P2SKernel>(iBuilder);
118    pxDriver.makeKernelCall(p2sK, {extractedbits}, {DecompressedByteStream});
119
120    // --------------------------------------------------------
121    // End
122    Kernel * outK = pxDriver.addKernelInstance<FileSink>(iBuilder, 8);
123    outK->setInitialArguments({iBuilder->GetString(outputFile)});
124    pxDriver.makeKernelCall(outK, {DecompressedByteStream}, {});
125
126    pxDriver.generatePipelineIR();
127    pxDriver.deallocateBuffers();
128
129    iBuilder->CreateRetVoid();
130
131    pxDriver.finalizeObject();
132}
133
134void LZ4Generator::generatePipeline(const std::string& outputFile) {
135    auto & iBuilder = pxDriver.getBuilder();
136    this->generateMainFunc(iBuilder);
137
138    StreamSetBuffer * const DecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
139    StreamSetBuffer * const FinalDecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
140
141
142
143    // GeneratePipeline
144    this->generateLoadByteStreamAndBitStream(iBuilder);
145    this->generateExtractAndDepositMarkers(iBuilder);
146
147    auto swizzle = this->generateSwizzleExtractData(iBuilder);
148
149    StreamSetBuffer * depositedSwizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
150    StreamSetBuffer * depositedSwizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
151
152    Kernel * multiplePdepK = pxDriver.addKernelInstance<LZ4MultiplePDEPkernel>(iBuilder, 4, 2, 4);
153    pxDriver.makeKernelCall(multiplePdepK, {DepositMarker, swizzle.first, swizzle.second}, {depositedSwizzle0, depositedSwizzle1});
154
155    // Produce unswizzled bit streams
156    StreamSetBuffer * extractedbits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8), this->getInputBufferBlocks());
157    Kernel * unSwizzleK = pxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 8, 1, 2);
158    pxDriver.makeKernelCall(unSwizzleK, {depositedSwizzle0, depositedSwizzle1}, {extractedbits});
159
160//    pxDriver.makeKernelCall(unSwizzleK, {u16Swizzle0, u16Swizzle1}, {extractedbits});
161
162    // TODO MatchCopy before p2s
163
164    Kernel * p2sK = pxDriver.addKernelInstance<P2SKernel>(iBuilder);
165    pxDriver.makeKernelCall(p2sK, {extractedbits}, {DecompressedByteStream});
166
167    Kernel * matchCopyK = pxDriver.addKernelInstance<LZ4MatchCopyKernel>(iBuilder);
168    pxDriver.makeKernelCall(matchCopyK, {DecompressedByteStream, M0_Start, M0_End, Match_Offset}, {FinalDecompressedByteStream});
169
170    // --------------------------------------------------------
171    // End
172    Kernel * outK = pxDriver.addKernelInstance<FileSink>(iBuilder, 8);
173    outK->setInitialArguments({iBuilder->GetString(outputFile)});
174    pxDriver.makeKernelCall(outK, {FinalDecompressedByteStream}, {});
175
176    pxDriver.generatePipelineIR();
177    pxDriver.deallocateBuffers();
178
179    iBuilder->CreateRetVoid();
180
181    pxDriver.finalizeObject();
182}
183
184void LZ4Generator::generateMainFunc(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
185    Module * M = iBuilder->getModule();
186    Type * const sizeTy = iBuilder->getSizeTy();
187    Type * const boolTy = iBuilder->getIntNTy(sizeof(bool) * 8);
188    Type * const voidTy = iBuilder->getVoidTy();
189    Type * const inputType = iBuilder->getInt8PtrTy();
190
191    Function * const main = cast<Function>(M->getOrInsertFunction("Main", voidTy, inputType, sizeTy, sizeTy, boolTy, nullptr));
192    main->setCallingConv(CallingConv::C);
193    Function::arg_iterator args = main->arg_begin();
194    inputStream = &*(args++);
195    inputStream->setName("input");
196
197    headerSize = &*(args++);
198    headerSize->setName("headerSize");
199
200    fileSize = &*(args++);
201    fileSize->setName("fileSize");
202
203    hasBlockChecksum = &*(args++);
204    hasBlockChecksum->setName("hasBlockChecksum");
205
206    iBuilder->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", main, 0));
207}
208
209void LZ4Generator::generateLoadByteStreamAndBitStream(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
210    ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
211    BasisBits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), this->getInputBufferBlocks());
212
213
214    kernel::Kernel * sourceK = pxDriver.addKernelInstance<MemorySourceKernel>(iBuilder, iBuilder->getInt8PtrTy());
215    sourceK->setInitialArguments({inputStream, fileSize});
216    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
217    Kernel * s2pk = pxDriver.addKernelInstance<S2PKernel>(iBuilder, /*aligned = */ true);
218    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
219}
220
221void LZ4Generator::generateExtractAndDepositMarkers(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
222    //// Decode Block Information
223    StreamSetBuffer * const BlockData_IsCompressed = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
224    StreamSetBuffer * const BlockData_BlockStart = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
225    StreamSetBuffer * const BlockData_BlockEnd = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
226
227    Kernel * blockDecoderK = pxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder);
228    blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(hasBlockChecksum, iBuilder->getInt1Ty()), headerSize});
229    pxDriver.makeKernelCall(blockDecoderK, {ByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd});
230
231
232    //// Generate Helper Markers Extenders, FX, XF
233    StreamSetBuffer * const Extenders = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
234    StreamSetBuffer * const CC_0xFX = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
235    StreamSetBuffer * const CC_0xXF = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
236
237
238    Kernel * extenderK = pxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "extenders", std::vector<re::CC *>{re::makeCC(0xFF)}, 8);
239    pxDriver.makeKernelCall(extenderK, {BasisBits}, {Extenders});
240
241    re::CC* xfCC = re::makeCC(0x0f);
242    re::CC* fxCC = re::makeCC(0xf0);
243    for (re::codepoint_t i = 1; i <= 0xf; i++) {
244        xfCC = re::makeCC(xfCC, re::makeCC(i * 0x10 + 0x0f));
245        fxCC = re::makeCC(fxCC, re::makeCC(0xf0 + i));
246    }
247
248    Kernel * CC_0xFXKernel = pxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "CC_0xFX", std::vector<re::CC *>{fxCC}, 8);
249    pxDriver.makeKernelCall(CC_0xFXKernel, {BasisBits}, {CC_0xFX});
250
251    Kernel * CC_0xXFKernel = pxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "CC_0xXF", std::vector<re::CC *>{xfCC}, 8);
252    pxDriver.makeKernelCall(CC_0xXFKernel, {BasisBits}, {CC_0xXF});
253
254
255    //// Generate Extract/Deposit Markers, M0_Start, M0_End, MatchOffset
256
257    size_t m0BufferSize = this->getDecompressedBufferBlocks() * 2;
258    size_t e1BufferSize = this->getInputBufferBlocks();
259
260    M0_Start = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
261    M0_End = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
262
263    //TODO handle uncompressed part
264    StreamSetBuffer * const UncompressedStartPos = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
265    StreamSetBuffer * const UncompressedLength = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
266    StreamSetBuffer * const UncompressedOutputPos = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
267
268    EMarker = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), e1BufferSize);
269    StreamSetBuffer * const M0Marker = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), m0BufferSize);
270    DepositMarker = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), m0BufferSize);
271    Match_Offset = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
272
273
274
275    std::map<std::string, size_t> m_empty = {};
276
277    Kernel * extractEM0K = pxDriver.addKernelInstance<LZ4ExtractEM0Kernel>(iBuilder, m_empty);
278
279    pxDriver.makeKernelCall(
280            extractEM0K,
281            {
282                    ByteStream,
283                    Extenders,
284                    CC_0xFX,
285                    CC_0xXF,
286
287                    // Block Data
288                    BlockData_IsCompressed,
289                    BlockData_BlockStart,
290                    BlockData_BlockEnd
291            }, {
292                    //Uncompressed Data
293                    UncompressedStartPos,
294                    UncompressedLength,
295                    UncompressedOutputPos,
296
297                    EMarker,
298                    M0_Start,
299                    M0_End,
300                    Match_Offset
301            });
302
303
304
305    Kernel * buildM0StartMarkerK = pxDriver.addKernelInstance<LZ4NumbersToBitstreamKernel>("buildM0Marker", iBuilder);
306    pxDriver.makeKernelCall(buildM0StartMarkerK, {M0_Start, M0_End}, {M0Marker});
307
308
309    Kernel * generateDepositK = pxDriver.addKernelInstance<LZ4GenerateDepositStreamKernel>(iBuilder);
310    pxDriver.makeKernelCall(generateDepositK, {M0Marker}, {DepositMarker}); // TODO deposit
311
312}
313
314std::pair<StreamSetBuffer*, StreamSetBuffer*> LZ4Generator::generateSwizzleExtractData(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
315    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
316    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
317
318
319    StreamSetBuffer * const DeletionMask = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
320
321    Kernel * ExtractToDeletionMaskK = pxDriver.addKernelInstance<LZ4BitStreamNotKernel>(iBuilder);
322    pxDriver.makeKernelCall(
323            ExtractToDeletionMaskK,
324            {
325                    EMarker
326            }, {
327                    DeletionMask
328            }
329    );
330
331    Kernel * delK = pxDriver.addKernelInstance<SwizzledDeleteByPEXTkernel>(iBuilder, 64, 8);
332    pxDriver.makeKernelCall(delK, {DeletionMask, BasisBits}, {u16Swizzle0, u16Swizzle1});
333    return std::make_pair(u16Swizzle0, u16Swizzle1);
334}
335
336int LZ4Generator::getInputBufferBlocks() {
337    const int segmentSize = codegen::SegmentSize;
338    const int bufferSegments = codegen::BufferSegments * codegen::ThreadNum * 8 * 16 * 32 * 2;
339    return segmentSize * bufferSegments * 16;
340}
341
342int LZ4Generator::getDecompressedBufferBlocks() {
343    const unsigned decompressBufBlocks = 256U * 256U / codegen::BlockSize * 2 * 2; // TODO at least *2 since we need to leave 1 for match copy window
344    return decompressBufBlocks;
345}
346
347
348
349// Kernel Pipeline
Note: See TracBrowser for help on using the repository browser.