source: icGREP/icgrep-devel/icgrep/lz4/LZ4Generator.cpp @ 5906

Last change on this file since 5906 was 5906, checked in by xwa163, 11 months ago

Implement swizzled match copy kernel, which can do match copy in swizzled bitstream form

File size: 16.5 KB
Line 
1
2#include "LZ4Generator.h"
3
4#include <boost/filesystem.hpp>
5#include <boost/iostreams/device/mapped_file.hpp>
6
7#include <llvm/Support/CommandLine.h>
8#include <llvm/Support/PrettyStackTrace.h>
9
10#include <cc/cc_compiler.h>
11
12#include <lz4FrameDecoder.h>
13#include <kernels/streamset.h>
14#include <kernels/cc_kernel.h>
15#include <kernels/s2p_kernel.h>
16#include <kernels/p2s_kernel.h>
17#include <kernels/source_kernel.h>
18#include <kernels/stdout_kernel.h>
19#include <kernels/lz4/lz4_extract_e_m0.h>
20#include <kernels/lz4/lz4_generate_deposit_stream.h>
21#include <kernels/lz4/lz4_numbers_to_bitstream_kernel.h>
22//#include <kernels/LZ4MarkerToMaskKernel.h>
23#include <kernels/lz4/lz4_bitstream_not_kernel.h>
24#include <kernels/kernel_builder.h>
25#include <kernels/lz4/lz4_block_decoder.h>
26#include <kernels/deletion.h>
27#include <kernels/swizzle.h>
28#include <kernels/pdep_kernel.h>
29#include <kernels/lz4/lz4_multiple_pdep_kernel.h>
30#include <kernels/lz4/lz4_match_copy_kernel.h>
31#include <kernels/lz4/lz4_swizzled_match_copy_kernel.h>
32
33namespace re { class CC; }
34
35using namespace llvm;
36using namespace parabix;
37using namespace kernel;
38
39LZ4Generator::LZ4Generator():pxDriver("lz4d") {
40
41}
42
43MainFunctionType LZ4Generator::getMainFunc() {
44    return reinterpret_cast<MainFunctionType>(pxDriver.getMain());
45}
46
47
48
49void LZ4Generator::generateExtractOnlyPipeline(const std::string& outputFile) {
50    auto & iBuilder = pxDriver.getBuilder();
51    this->generateMainFunc(iBuilder);
52
53    StreamSetBuffer * const DecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
54
55    // GeneratePipeline
56    this->generateLoadByteStreamAndBitStream(iBuilder);
57
58    this->generateExtractAndDepositMarkers(iBuilder);
59
60    auto swizzle = this->generateSwizzleExtractData(iBuilder);
61
62
63    // Produce unswizzled bit streams
64    StreamSetBuffer * extractedbits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8), this->getInputBufferBlocks());
65    Kernel * unSwizzleK = pxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 8, 1, 2);
66
67    pxDriver.makeKernelCall(unSwizzleK, {swizzle.first, swizzle.second}, {extractedbits});
68
69
70    Kernel * p2sK = pxDriver.addKernelInstance<P2SKernel>(iBuilder);
71    pxDriver.makeKernelCall(p2sK, {extractedbits}, {DecompressedByteStream});
72
73    // --------------------------------------------------------
74    // End
75    Kernel * outK = pxDriver.addKernelInstance<FileSink>(iBuilder, 8);
76
77    outK->setInitialArguments({iBuilder->GetString(outputFile)});
78    pxDriver.makeKernelCall(outK, {DecompressedByteStream}, {});
79
80    pxDriver.generatePipelineIR();
81    pxDriver.deallocateBuffers();
82
83    iBuilder->CreateRetVoid();
84
85    pxDriver.finalizeObject();
86}
87
88void LZ4Generator::generateExtractAndDepositOnlyPipeline(const std::string &outputFile) {
89    auto & iBuilder = pxDriver.getBuilder();
90    this->generateMainFunc(iBuilder);
91
92    StreamSetBuffer * const DecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
93    StreamSetBuffer * const FinalDecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
94
95
96
97    // GeneratePipeline
98    this->generateLoadByteStreamAndBitStream(iBuilder);
99    this->generateExtractAndDepositMarkers(iBuilder);
100
101    auto swizzle = this->generateSwizzleExtractData(iBuilder);
102
103    StreamSetBuffer * depositedSwizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
104    StreamSetBuffer * depositedSwizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
105
106    Kernel * multiplePdepK = pxDriver.addKernelInstance<LZ4MultiplePDEPkernel>(iBuilder, 4, 2, 4);
107    pxDriver.makeKernelCall(multiplePdepK, {DepositMarker, swizzle.first, swizzle.second}, {depositedSwizzle0, depositedSwizzle1});
108
109    // Produce unswizzled bit streams
110    StreamSetBuffer * extractedbits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8), this->getInputBufferBlocks());
111    Kernel * unSwizzleK = pxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 8, 1, 2);
112    pxDriver.makeKernelCall(unSwizzleK, {depositedSwizzle0, depositedSwizzle1}, {extractedbits});
113
114//    pxDriver.makeKernelCall(unSwizzleK, {u16Swizzle0, u16Swizzle1}, {extractedbits});
115
116    // TODO MatchCopy before p2s
117
118    Kernel * p2sK = pxDriver.addKernelInstance<P2SKernel>(iBuilder);
119    pxDriver.makeKernelCall(p2sK, {extractedbits}, {DecompressedByteStream});
120
121    // --------------------------------------------------------
122    // End
123    Kernel * outK = pxDriver.addKernelInstance<FileSink>(iBuilder, 8);
124    outK->setInitialArguments({iBuilder->GetString(outputFile)});
125    pxDriver.makeKernelCall(outK, {DecompressedByteStream}, {});
126
127    pxDriver.generatePipelineIR();
128    pxDriver.deallocateBuffers();
129
130    iBuilder->CreateRetVoid();
131
132    pxDriver.finalizeObject();
133}
134
135void LZ4Generator::generatePipeline(const std::string& outputFile) {
136    auto & iBuilder = pxDriver.getBuilder();
137    this->generateMainFunc(iBuilder);
138
139    StreamSetBuffer * const DecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
140//    StreamSetBuffer * const FinalDecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
141
142
143    // GeneratePipeline
144    this->generateLoadByteStreamAndBitStream(iBuilder);
145    this->generateExtractAndDepositMarkers(iBuilder);
146
147    auto swizzle = this->generateSwizzleExtractData(iBuilder);
148
149    //TODO buffer blocks should be decompressedBufferBlocks
150    StreamSetBuffer * depositedSwizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
151    StreamSetBuffer * depositedSwizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
152
153    Kernel * multiplePdepK = pxDriver.addKernelInstance<LZ4MultiplePDEPkernel>(iBuilder, 4, 2, 4);
154    pxDriver.makeKernelCall(multiplePdepK, {DepositMarker, swizzle.first, swizzle.second}, {depositedSwizzle0, depositedSwizzle1});
155
156
157    StreamSetBuffer * matchCopiedSwizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
158    StreamSetBuffer * matchCopiedSwizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
159
160    Kernel * swizzledMatchCopyK = pxDriver.addKernelInstance<LZ4SwizzledMatchCopyKernel>(iBuilder, 4, 2, 4);
161    pxDriver.makeKernelCall(swizzledMatchCopyK, {M0_Start, M0_End, Match_Offset, depositedSwizzle0, depositedSwizzle1}, {matchCopiedSwizzle0, matchCopiedSwizzle1});
162
163
164    // Produce unswizzled bit streams
165    StreamSetBuffer * extractedbits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8), this->getInputBufferBlocks());
166    Kernel * unSwizzleK = pxDriver.addKernelInstance<SwizzleGenerator>(iBuilder, 8, 1, 2);
167    pxDriver.makeKernelCall(unSwizzleK, {matchCopiedSwizzle0, matchCopiedSwizzle1}, {extractedbits});
168//    pxDriver.makeKernelCall(unSwizzleK, {depositedSwizzle0, depositedSwizzle1}, {extractedbits});
169
170//    pxDriver.makeKernelCall(unSwizzleK, {u16Swizzle0, u16Swizzle1}, {extractedbits});
171
172    // TODO MatchCopy before p2s
173
174    Kernel * p2sK = pxDriver.addKernelInstance<P2SKernel>(iBuilder);
175    pxDriver.makeKernelCall(p2sK, {extractedbits}, {DecompressedByteStream});
176
177//    Kernel * matchCopyK = pxDriver.addKernelInstance<LZ4MatchCopyKernel>(iBuilder);
178//    pxDriver.makeKernelCall(matchCopyK, {DecompressedByteStream, M0_Start, M0_End, Match_Offset}, {FinalDecompressedByteStream});
179
180    // --------------------------------------------------------
181    // End
182    Kernel * outK = pxDriver.addKernelInstance<FileSink>(iBuilder, 8);
183    outK->setInitialArguments({iBuilder->GetString(outputFile)});
184    pxDriver.makeKernelCall(outK, {DecompressedByteStream}, {});
185//    pxDriver.makeKernelCall(outK, {FinalDecompressedByteStream}, {});
186
187    pxDriver.generatePipelineIR();
188    pxDriver.deallocateBuffers();
189
190    iBuilder->CreateRetVoid();
191
192    pxDriver.finalizeObject();
193}
194
195void LZ4Generator::generateMainFunc(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
196    Module * M = iBuilder->getModule();
197    Type * const sizeTy = iBuilder->getSizeTy();
198    Type * const boolTy = iBuilder->getIntNTy(sizeof(bool) * 8);
199    Type * const voidTy = iBuilder->getVoidTy();
200    Type * const inputType = iBuilder->getInt8PtrTy();
201
202    Function * const main = cast<Function>(M->getOrInsertFunction("Main", voidTy, inputType, sizeTy, sizeTy, boolTy, nullptr));
203    main->setCallingConv(CallingConv::C);
204    Function::arg_iterator args = main->arg_begin();
205    inputStream = &*(args++);
206    inputStream->setName("input");
207
208    headerSize = &*(args++);
209    headerSize->setName("headerSize");
210
211    fileSize = &*(args++);
212    fileSize->setName("fileSize");
213
214    hasBlockChecksum = &*(args++);
215    hasBlockChecksum->setName("hasBlockChecksum");
216
217    iBuilder->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", main, 0));
218}
219
220void LZ4Generator::generateLoadByteStreamAndBitStream(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
221    ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
222    BasisBits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), this->getInputBufferBlocks());
223
224
225    kernel::Kernel * sourceK = pxDriver.addKernelInstance<MemorySourceKernel>(iBuilder, iBuilder->getInt8PtrTy());
226    sourceK->setInitialArguments({inputStream, fileSize});
227    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
228    Kernel * s2pk = pxDriver.addKernelInstance<S2PKernel>(iBuilder, /*aligned = */ true);
229    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
230}
231
232void LZ4Generator::generateExtractAndDepositMarkers(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
233    //// Decode Block Information
234    StreamSetBuffer * const BlockData_IsCompressed = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
235    StreamSetBuffer * const BlockData_BlockStart = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
236    StreamSetBuffer * const BlockData_BlockEnd = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
237
238    Kernel * blockDecoderK = pxDriver.addKernelInstance<LZ4BlockDecoderKernel>(iBuilder);
239    blockDecoderK->setInitialArguments({iBuilder->CreateTrunc(hasBlockChecksum, iBuilder->getInt1Ty()), headerSize});
240    pxDriver.makeKernelCall(blockDecoderK, {ByteStream}, {BlockData_IsCompressed, BlockData_BlockStart, BlockData_BlockEnd});
241
242
243    //// Generate Helper Markers Extenders, FX, XF
244    StreamSetBuffer * const Extenders = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
245    StreamSetBuffer * const CC_0xFX = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
246    StreamSetBuffer * const CC_0xXF = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
247
248
249    Kernel * extenderK = pxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "extenders", std::vector<re::CC *>{re::makeCC(0xFF)}, 8);
250    pxDriver.makeKernelCall(extenderK, {BasisBits}, {Extenders});
251
252    re::CC* xfCC = re::makeCC(0x0f);
253    re::CC* fxCC = re::makeCC(0xf0);
254    for (re::codepoint_t i = 1; i <= 0xf; i++) {
255        xfCC = re::makeCC(xfCC, re::makeCC(i * 0x10 + 0x0f));
256        fxCC = re::makeCC(fxCC, re::makeCC(0xf0 + i));
257    }
258
259    Kernel * CC_0xFXKernel = pxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "CC_0xFX", std::vector<re::CC *>{fxCC}, 8);
260    pxDriver.makeKernelCall(CC_0xFXKernel, {BasisBits}, {CC_0xFX});
261
262    Kernel * CC_0xXFKernel = pxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "CC_0xXF", std::vector<re::CC *>{xfCC}, 8);
263    pxDriver.makeKernelCall(CC_0xXFKernel, {BasisBits}, {CC_0xXF});
264
265
266    //// Generate Extract/Deposit Markers, M0_Start, M0_End, MatchOffset
267
268    size_t m0BufferSize = this->getDecompressedBufferBlocks() * 2;
269    size_t e1BufferSize = this->getInputBufferBlocks();
270
271    M0_Start = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
272    M0_End = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
273
274    //TODO handle uncompressed part
275    StreamSetBuffer * const UncompressedStartPos = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
276    StreamSetBuffer * const UncompressedLength = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
277    StreamSetBuffer * const UncompressedOutputPos = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
278
279    EMarker = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), e1BufferSize);
280    StreamSetBuffer * const M0Marker = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), m0BufferSize);
281    DepositMarker = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), m0BufferSize);
282    Match_Offset = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 64), this->getInputBufferBlocks());
283
284
285
286    std::map<std::string, size_t> m_empty = {};
287
288    Kernel * extractEM0K = pxDriver.addKernelInstance<LZ4ExtractEM0Kernel>(iBuilder, m_empty);
289
290    pxDriver.makeKernelCall(
291            extractEM0K,
292            {
293                    ByteStream,
294                    Extenders,
295                    CC_0xFX,
296                    CC_0xXF,
297
298                    // Block Data
299                    BlockData_IsCompressed,
300                    BlockData_BlockStart,
301                    BlockData_BlockEnd
302            }, {
303                    //Uncompressed Data
304                    UncompressedStartPos,
305                    UncompressedLength,
306                    UncompressedOutputPos,
307
308                    EMarker,
309                    M0_Start,
310                    M0_End,
311                    Match_Offset
312            });
313
314
315
316    Kernel * buildM0StartMarkerK = pxDriver.addKernelInstance<LZ4NumbersToBitstreamKernel>("buildM0Marker", iBuilder);
317    pxDriver.makeKernelCall(buildM0StartMarkerK, {M0_Start, M0_End}, {M0Marker});
318
319
320    Kernel * generateDepositK = pxDriver.addKernelInstance<LZ4GenerateDepositStreamKernel>(iBuilder);
321    pxDriver.makeKernelCall(generateDepositK, {M0Marker}, {DepositMarker}); // TODO deposit
322
323}
324
325std::pair<StreamSetBuffer*, StreamSetBuffer*> LZ4Generator::generateSwizzleExtractData(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
326    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
327    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), this->getInputBufferBlocks(), 1);
328
329
330    StreamSetBuffer * const DeletionMask = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), this->getInputBufferBlocks());
331
332    Kernel * ExtractToDeletionMaskK = pxDriver.addKernelInstance<LZ4BitStreamNotKernel>(iBuilder);
333    pxDriver.makeKernelCall(
334            ExtractToDeletionMaskK,
335            {
336                    EMarker
337            }, {
338                    DeletionMask
339            }
340    );
341
342    Kernel * delK = pxDriver.addKernelInstance<SwizzledDeleteByPEXTkernel>(iBuilder, 64, 8);
343    pxDriver.makeKernelCall(delK, {DeletionMask, BasisBits}, {u16Swizzle0, u16Swizzle1});
344    return std::make_pair(u16Swizzle0, u16Swizzle1);
345}
346
347int LZ4Generator::getInputBufferBlocks() {
348    const int segmentSize = codegen::SegmentSize;
349    const int bufferSegments = codegen::BufferSegments * codegen::ThreadNum * 8 * 16 * 32 * 2;
350    return segmentSize * bufferSegments * 16;
351}
352
353int LZ4Generator::getDecompressedBufferBlocks() {
354    const unsigned copyBackWindowBlocks = 256U * 256U / codegen::BlockSize;
355    // At least * 2 since we need to leave 1 window as source of match copy,
356    // while the other window as the destination for match copy
357    const unsigned decompressBufBlocks = copyBackWindowBlocks * 2;
358    return decompressBufBlocks;
359}
360
361
362
363// Kernel Pipeline
Note: See TracBrowser for help on using the repository browser.