source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6038

Last change on this file since 6038 was 6038, checked in by xwa163, 13 months ago

revert r6035

File size: 44.0 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <toolchain/driver.h>
8#include <toolchain/cpudriver.h>
9#include <kernels/kernel_builder.h>
10#include <llvm/Support/raw_ostream.h>
11#include <IR_Gen/idisa_target.h>
12
13using namespace llvm;
14
15namespace kernel {
16
17inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
18    Value * m = kb->simd_not(del_mask);
19    Value * mk = kb->simd_slli(fw, del_mask, 1);
20    std::vector<Value *> move_masks;
21    for (unsigned shift = 1; shift < fw; shift *= 2) {
22        Value * mp = mk;
23        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
24            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
25        }
26        Value * mv = kb->simd_and(mp, m);
27        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
28        mk = kb->simd_and(mk, kb->simd_not(mp));
29        move_masks.push_back(mv);
30    }
31    return move_masks;
32}
33
34inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
35    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
36    for (unsigned i = 0; i < mv.size(); i++) {
37        unsigned shift = 1 << i;
38        Value * t = kb->simd_and(s, mv[i]);
39        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
40    }
41    return s;
42}
43
44// Apply deletion to a set of stream_count input streams to produce a set of output streams.
45// Kernel inputs: stream_count data streams plus one del_mask stream
46// Outputs: the deleted streams, plus a partial sum popcount
47
48void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
49    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
50    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
51    for (unsigned j = 0; j < mStreamCount; ++j) {
52        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
53        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
54        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
55    }
56    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
57    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
58}
59
60void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
61    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
62    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
63    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
64    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
65    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
66    for (unsigned j = 0; j < mStreamCount; ++j) {
67        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
68        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
69        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
70    }
71    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
72    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
73}
74
75DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
76: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
77                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
78                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
79                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
80                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
81                      {}, {}, {})
82, mDeletionFieldWidth(fieldWidth)
83, mStreamCount(streamCount) {
84}
85
86void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
87    BasicBlock * entry = kb->GetInsertBlock();
88    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
89    BasicBlock * done = kb->CreateBasicBlock("done");
90    Constant * const ZERO = kb->getSize(0);
91    kb->CreateBr(processBlock);
92    kb->SetInsertPoint(processBlock);
93    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
94    blockOffsetPhi->addIncoming(ZERO, entry);
95    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
96    Value * delMask = kb->simd_not(extractionMask);
97    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
98    for (unsigned j = 0; j < mStreamCount; ++j) {
99        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
100        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
101        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
102    }
103    Value * unitCount = kb->simd_popcount(mCompressFieldWidth, extractionMask);
104    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), blockOffsetPhi, kb->bitCast(unitCount));
105    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
106    blockOffsetPhi->addIncoming(nextBlk, processBlock);
107    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
108    kb->CreateCondBr(moreToDo, processBlock, done);
109    kb->SetInsertPoint(done);
110}
111
112FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
113: MultiBlockKernel("fieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
114                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
115                          Binding{kb->getStreamSetTy(), "extractionMask"}},
116                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
117                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
118                      {}, {}, {})
119, mCompressFieldWidth(fieldWidth)
120, mStreamCount(streamCount) {
121}
122
123void PEXTFieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
124    Type * fieldTy = kb->getIntNTy(mPEXTWidth);
125    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
126    Constant * PEXT_func = nullptr;
127    Constant * popc_func = Intrinsic::getDeclaration(getModule(), Intrinsic::ctpop, fieldTy);
128    if (mPEXTWidth == 64) {
129        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
130    } else if (mPEXTWidth == 32) {
131        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
132    }
133    BasicBlock * entry = kb->GetInsertBlock();
134    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
135    BasicBlock * done = kb->CreateBasicBlock("done");
136    Constant * const ZERO = kb->getSize(0);
137    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPEXTWidth;
138    kb->CreateBr(processBlock);
139    kb->SetInsertPoint(processBlock);
140    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
141    blockOffsetPhi->addIncoming(ZERO, entry);
142    std::vector<Value *> mask(fieldsPerBlock);
143    Value * extractionMaskPtr = kb->getInputStreamBlockPtr("extractionMask", ZERO, blockOffsetPhi);
144    extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
145    Value * unitCountPtr = kb->getOutputStreamBlockPtr("unitCounts", ZERO, blockOffsetPhi);
146    unitCountPtr = kb->CreatePointerCast(unitCountPtr, fieldPtrTy);
147    for (unsigned i = 0; i < fieldsPerBlock; i++) {
148        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
149        Value * popc = kb->CreateCall(popc_func, mask[i]);
150        kb->CreateStore(popc, kb->CreateGEP(unitCountPtr, kb->getInt32(i)));
151    }
152    for (unsigned j = 0; j < mStreamCount; ++j) {
153        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
154        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
155        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
156        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
157        for (unsigned i = 0; i < fieldsPerBlock; i++) {
158            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
159            Value * compressed = kb->CreateCall(PEXT_func, {field, mask[i]});
160            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
161        }
162    }
163    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
164    blockOffsetPhi->addIncoming(nextBlk, processBlock);
165    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
166    kb->CreateCondBr(moreToDo, processBlock, done);
167    kb->SetInsertPoint(done);
168}
169
170PEXTFieldCompressKernel::PEXTFieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
171: MultiBlockKernel("PEXTfieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
172                   {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
173                       Binding{kb->getStreamSetTy(), "extractionMask"}},
174                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
175                       Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
176                   {}, {}, {})
177, mPEXTWidth(fieldWidth)
178, mStreamCount(streamCount) {
179    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PEXT width for PEXTFieldCompressKernel");
180}
181   
182StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
183: MultiBlockKernel("streamCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
184                   {Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet"},
185                       Binding{kb->getStreamSetTy(), "unitCounts"}},
186                   {Binding{kb->getStreamSetTy(streamCount), "compressedOutput", BoundedRate(0, 1)}},
187                   {}, {}, {})
188, mCompressedFieldWidth(fieldWidth)
189, mStreamCount(streamCount) {
190    addScalar(kb->getSizeTy(), "pendingItemCount");
191    for (unsigned i = 0; i < streamCount; i++) {
192        addScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
193    }
194
195}
196   
197void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
198    const unsigned fw = mCompressedFieldWidth;
199    Type * fwTy = b->getIntNTy(fw);
200    Type * sizeTy = b->getSizeTy();
201    const unsigned numFields = b->getBitBlockWidth()/fw;
202    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(fw));
203    Constant * oneSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, 1));
204    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
205    Constant * numFieldConst = ConstantInt::get(sizeTy, numFields);
206    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
207    Constant * bitBlockWidthConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
208    BasicBlock * entry = b->GetInsertBlock();
209    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
210    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
211    BasicBlock * finalWrite = b->CreateBasicBlock("finalWrite");
212    BasicBlock * updateProducedCount = b->CreateBasicBlock("updateProducedCount");
213    Constant * const ZERO = b->getSize(0);
214   
215    Value * pendingItemCount = b->getScalarField("pendingItemCount");
216    std::vector<Value *> pendingData(mStreamCount);
217    for (unsigned i = 0; i < mStreamCount; i++) {
218        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
219    }
220   
221    b->CreateBr(segmentLoop);
222    // Main Loop
223    b->SetInsertPoint(segmentLoop);
224    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
225    PHINode * outputBlockPhi = b->CreatePHI(b->getSizeTy(), 2);
226    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
227    PHINode * pendingDataPhi[mStreamCount];
228    blockOffsetPhi->addIncoming(ZERO, entry);
229    outputBlockPhi->addIncoming(ZERO, entry);
230    pendingItemsPhi->addIncoming(pendingItemCount, entry);
231    for (unsigned i = 0; i < mStreamCount; i++) {
232        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
233        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
234    }
235    Value * fieldPopCounts = b->loadInputStreamBlock("unitCounts", ZERO, blockOffsetPhi);
236    // For each field determine the (partial) sum popcount of all fields up to and
237    // including the current field.
238    Value * partialSum = fieldPopCounts;
239    for (unsigned i = 1; i < numFields; i *= 2) {
240        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
241    }
242    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
243    //
244    // Now determine for each source field the output offset of the first bit.
245    // Note that this depends on the number of pending bits.
246    //
247    Value * pendingOffset = b->CreateURem(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
248    Value * splatPending = b->simd_fill(fw, b->CreateZExtOrTrunc(pendingOffset, fwTy));
249    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
250    Value * offsets = b->simd_add(fw, b->mvmd_slli(fw, partialSum, 1), splatPending);
251    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
252   //
253    // Determine the relative field number for each output field.   Note that the total
254    // number of fields involved is numFields + 1.   However, the first field always
255    // be immediately combined into the current pending data field, so we calculate
256    // field numbers for all subsequent fields, (the fields that receive overflow bits).
257    Value * fieldNo = b->simd_srli(fw, b->simd_add(fw, partialSum, splatPending), std::log2(fw));
258  //
259    // Now process the input data block of each stream in the input stream set.
260    //
261    // First load all the stream set blocks and the pending data.
262    std::vector<Value *> sourceBlock(mStreamCount);
263    for (unsigned i = 0; i < mStreamCount; i++) {
264        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
265    }
266    // Now separate the bits of each field into ones that go into the current field
267    // and ones that go into the overflow field.   Extract the first field separately,
268    // and then shift and combine subsequent fields.
269    std::vector<Value *> pendingOutput(mStreamCount);
270    std::vector<Value *> outputFields(mStreamCount);
271    Value * backShift = b->simd_sub(fw, fwSplat, offsets);
272    for (unsigned i = 0; i < mStreamCount; i++) {
273        Value * currentFieldBits = b->simd_sllv(fw, sourceBlock[i], offsets);
274        Value * nextFieldBits = b->simd_srlv(fw, sourceBlock[i], backShift);
275        Value * firstField = b->mvmd_extract(fw, currentFieldBits, 0);
276        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
277        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
278        // shift back currentFieldBits to combine with nextFieldBits.
279        outputFields[i] = b->simd_or(b->mvmd_srli(fw, currentFieldBits, 1), nextFieldBits);
280    }
281    // Now combine forward all fields with the same field number.  This may require
282    // up to log2 numFields steps.
283    for (unsigned j = 1; j < numFields; j*=2) {
284        Value * select = b->simd_eq(fw, fieldNo, b->mvmd_slli(fw, fieldNo, j));
285        for (unsigned i = 0; i < mStreamCount; i++) {
286            Value * fields_fwd = b->mvmd_slli(fw, outputFields[i], j);
287            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
288       }
289    }
290    // Now compress the data fields, eliminating all but the last field from
291    // each run of consecutive field having the same field number as a subsequent field.
292    // But it may be that last field number is 0 which will compare equal to a 0 shifted in.
293    // So we add 1 to field numbers first.
294    Value * nonZeroFieldNo = b->simd_add(fw, fieldNo, oneSplat);
295    Value * eqNext = b->simd_eq(fw, nonZeroFieldNo, b->mvmd_srli(fw, nonZeroFieldNo, 1));
296    Value * compressMask = b->hsimd_signmask(fw, b->simd_not(eqNext));
297    for (unsigned i = 0; i < mStreamCount; i++) {
298        outputFields[i] = b->mvmd_compress(fw, outputFields[i], compressMask);
299    }
300    //
301    // Finally combine the pendingOutput and outputField data.
302    // (a) shift forward outputField data to fill the pendingOutput values.
303    // (b) shift back outputField data to clear data added to pendingOutput.
304    //
305    // However, we may need to increment pendingFieldIndex if we previously
306    // filled the field with the extracted firstField value.  The first
307    // value of the fieldNo vector will be 0 or 1.
308    // It is possible that pendingFieldIndex will reach the total number
309    // of fields held in register.  mvmd_sll may not handle this if it
310    // translates to an LLVM shl.
311    Value * increment = b->CreateZExtOrTrunc(b->mvmd_extract(fw, fieldNo, 0), sizeTy);
312    pendingFieldIdx = b->CreateAdd(pendingFieldIdx, increment);
313    Value * const pendingSpaceFilled = b->CreateICmpEQ(pendingFieldIdx, numFieldConst);
314    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
315    for (unsigned i = 0; i < mStreamCount; i++) {
316        Value * outputFwd = b->mvmd_sll(fw, outputFields[i], pendingFieldIdx);
317        outputFwd = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
318        pendingOutput[i] = b->simd_or(pendingOutput[i], outputFwd);
319        outputFields[i] = b->mvmd_srl(fw, outputFields[i], shftBack);
320    }
321    //
322    // Write the pendingOutput data to outputStream.
323    // Note: this data may be overwritten later, but we avoid branching.
324    for (unsigned i = 0; i < mStreamCount; i++) {
325        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
326    }
327    // Now determine the total amount of pending items and whether
328    // the pending data all fits within the pendingOutput.
329    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
330    Value * doesFit = b->CreateICmpULT(newPending, bitBlockWidthConst);
331    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, bitBlockWidthConst));
332    //
333    // Prepare Phi nodes for the next iteration.
334    //
335    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
336    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
337    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, b->getSize(1));
338    // But don't advance the output if all the data does fit into pendingOutput.
339    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
340    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
341    pendingItemsPhi->addIncoming(newPending, segmentLoop);
342
343    for (unsigned i = 0; i < mStreamCount; i++) {
344        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(fw, pendingOutput[i]), b->fwCast(fw, outputFields[i]));
345        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
346    }
347    //
348    // Now continue the loop if there are more blocks to process.
349    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
350    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);
351   
352    b->SetInsertPoint(segmentDone);
353    // Save kernel state.
354    b->setScalarField("pendingItemCount", newPending);
355    for (unsigned i = 0; i < mStreamCount; i++) {
356        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
357    }
358    b->CreateCondBr(mIsFinal, finalWrite, updateProducedCount);
359    b->SetInsertPoint(finalWrite);
360    for (unsigned i = 0; i < mStreamCount; i++) {
361        //Value * pending = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
362        Value * pending = b->bitCast(pendingOutput[i]);
363        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), nextOutputBlk, pending);
364    }
365    b->CreateBr(updateProducedCount);
366    b->SetInsertPoint(updateProducedCount);
367     Value * produced = b->getProducedItemCount("compressedOutput");
368    produced = b->CreateAdd(produced, b->CreateMul(nextOutputBlk, bitBlockWidthConst));
369    produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, newPending), produced);
370    b->setProducedItemCount("compressedOutput", produced);
371}
372
373SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
374: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
375                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
376                  {}, {}, {}, {})
377, mStreamCount(streamCount)
378, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
379// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
380// to prevent rounding errors.
381, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
382, mPEXTWidth(PEXT_width)
383{
384    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
385        && "mDelCountFieldWidth must be a power of 2");
386    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
387    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
388
389    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
390    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);
391
392    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
393    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
394    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
395        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
396        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
397    }
398    addScalar(b->getSizeTy(), "pendingOffset");
399}
400
401void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
402    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
403    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
404    generateProcessingLoop(b, delMask, false);
405}
406
407void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
408    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
409    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
410    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
411    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
412    generateProcessingLoop(b, delMask, true);
413}
414
415/*
416What this function does in pseudo code:
417for (mSwizzleFactor)
418    create a swizzle set containing mSwizzleFactor blocks
419    apply PEXT to each block in the swizzle set
420    store the swizzleSet in PEXTedSwizzleSets vector
421
422for (each swizzle row i)
423    for (each swizzle set j)
424        processes row i in swizzle set j
425        store output in pendingData[j]
426*/
427
428void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {
429
430    // selectors marks the positions we want to keep
431    Value * const selectors = b->CreateNot(delMask);
432
433    const auto swizzleSets = makeSwizzleSets(b, selectors);
434
435    // Compress the PEXTedSwizzleSets
436    // Output is written and committed to the output buffer one swizzle at a time.
437    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
438    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
439    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
440    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
441    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);
442
443    // All output groups have the same count.
444    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
445    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
446    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
447    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
448
449    // There is a separate vector of pending data for each swizzle group.
450    std::vector<Value *> pendingData;
451    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
452        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
453    }
454
455    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);
456
457    // For each row i
458    for (unsigned i = 0; i < mSwizzleFactor; i++) {
459
460        // Generate code for each of the mSwizzleFactor fields making up a block.
461        // We load the count for the field and process all swizzle groups accordingly.
462        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
463        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
464        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
465        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);
466
467        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
468        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
469
470        // Data from the ith swizzle pack of each group is processed
471        // according to the same newItemCount, pendingSpace, ...
472        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
473            Value * const newItems = swizzleSets[j][i];
474            // Combine as many of the new items as possible into the pending group.
475            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
476            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
477            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
478            // To avoid an unpredictable branch, always store the combined group, whether full or not.
479            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
480            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
481            // Any items in excess of the space available in the current pending group overflow for the next group.
482            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
483            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
484            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
485        }
486
487        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
488        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);
489
490        outputProduced = b->CreateAdd(outputProduced, newItemCount);
491    }
492
493    if (flush) { // incase we selected the overflow group on the final iteration
494        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
495        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
496        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
497            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
498            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
499        }
500    } else {
501        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
502            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
503        }
504        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
505        b->setScalarField("pendingOffset", pendingOffset);
506        // unless this is our final stride, don't report partially written fields.
507        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
508    }
509
510    b->setProducedItemCount("outputSwizzle0", outputProduced);
511}
512
513/*
514Apply PEXT deletion to the blocks in strms and swizzle the result.
515
516Q: Why is it advantageous to swizzle the PEXTed streams?
517
518A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
519
520Example:
521Input:     11101101
522PEXT mask: 11110000
523Output:    00001110
524
525PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
526cleared. This has implications if we're working with multiple streams.
527
528For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
529(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
530Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
531Stream 2:   qrstuv00 wxy00000 z1000000 23456000
532Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
533Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
534
535If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
536should be compress by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
537time:
538
539The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
540
541Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
542Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
543Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
544Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
545
546Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
547compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
548
549Args:
550    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
551    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
552        sequentially to PEXT a full block.)
553
554Returns:
555    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
556*/
557
558std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {
559
560    Constant * pext = nullptr;
561    if (mPEXTWidth == 64) {
562        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
563    } else if (mPEXTWidth == 32) {
564        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
565    }
566
567    Value * const m = b->fwCast(mPEXTWidth, selectors);
568
569    std::vector<Value *> masks(mSwizzleFactor);
570    for (unsigned i = 0; i < mSwizzleFactor; i++) {
571        masks[i] = b->CreateExtractElement(m, i);
572
573    }
574
575    std::vector<std::vector<Value *>> swizzleSets;
576    swizzleSets.reserve(mSwizzleSetCount);
577
578    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
579
580    UndefValue * const outputInitializer = UndefValue::get(vecTy);
581
582    std::vector<Value *> input(mSwizzleFactor);
583    // For each of the k swizzle sets required to apply PEXT to all input streams
584    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
585
586        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
587            const unsigned k = (i * mSwizzleFactor) + j;
588            if (k < mStreamCount) {
589                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
590            } else {
591                input[j] = Constant::getNullValue(vecTy);
592            }
593        }
594
595        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
596        // then splat the pext mask and apply it to each output row
597
598        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
599        // For each of the input streams
600        for (unsigned j = 0; j < mSwizzleFactor; j++) {
601            for (unsigned k = 0; k < mSwizzleFactor; k++) {
602                // Load block j,k
603                Value * const field = b->CreateExtractElement(input[j], k);
604                // Apply PEXT deletion
605                Value * const selected = b->CreateCall(pext, {field, masks[k]});
606                // Then store it as our k,j-th output
607                output[k] = b->CreateInsertElement(output[k], selected, j);
608            }
609        }
610
611        swizzleSets.emplace_back(output);
612    }
613
614    return swizzleSets;
615}
616
617// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
618// Kernel inputs: stream_count data streams plus one del_mask stream
619// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
620
621void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
622    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
623    generateProcessingLoop(kb, delMask);
624}
625
626void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
627    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
628    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
629    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
630    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
631    generateProcessingLoop(kb, delMask);
632}
633
634void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
635    Constant * PEXT_func = nullptr;
636    if (mPEXTWidth == 64) {
637        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
638    } else if (mPEXTWidth == 32) {
639        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
640    }
641    std::vector<Value *> masks(mSwizzleFactor);
642    Value * const m = kb->fwCast(mPEXTWidth, kb->simd_not(delMask));
643    for (unsigned i = 0; i < mSwizzleFactor; i++) {
644        masks[i] = kb->CreateExtractElement(m, i);
645    }
646
647    for (unsigned i = 0; i < mStreamCount; ++i) {
648        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
649        Value * value = kb->fwCast(mPEXTWidth, input);
650        Value * output = UndefValue::get(value->getType());
651        for (unsigned j = 0; j < mSwizzleFactor; j++) {
652            Value * field = kb->CreateExtractElement(value, j);
653            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
654            output = kb->CreateInsertElement(output, compressed, j);
655        }
656        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
657    }
658    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
659    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
660}
661
662DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
663: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
664              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
665                  Binding{b->getStreamSetTy(), "delMaskSet"}},
666              {}, {}, {}, {})
667, mDelCountFieldWidth(fw)
668, mStreamCount(streamCount)
669, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
670, mPEXTWidth(PEXT_width) {
671    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
672    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
673}
674
675   
676//
677// This kernel performs final stream compression for a set of N bitstreams, given
678// (a) a set of bitstreams partially compressed within K-bit fields and stored
679//     in K-bit swizzled form, and
680// (b) a stream of deletion/extraction counts per K-bit stride.
681//
682// Restrictions:  At present, only K=64 is supported.
683//                At present, N must be an exact multiple of BLOCK_SIZE/K.
684//
685// The kernel always consumes full blocks of input and emits data into the output
686// buffer in swizzles of K items at a time.   Upon completion of a segment,
687// up to K-1 pending output items per stream may be stored in the kernel state.
688//
689// Note: that both input streams and output streams are stored in swizzled form.
690//
691
692SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
693: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
694                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
695, mBitStreamCount(bitStreamCount)
696    , mFieldWidth(fieldWidth)
697    , mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
698    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
699        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
700        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
701        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
702        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
703        addScalar(kb->getBitBlockType(), "pendingSwizzleData0");
704        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
705            mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
706            mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
707            addScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
708        }
709        addScalar(kb->getSizeTy(), "pendingOffset");
710}
711   
712void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
713       
714    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
715    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());
716   
717    // Output is written and committed to the output buffer one swizzle at a time.
718    //
719    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
720    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
721   
722    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
723    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
724    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
725
726    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
727    Value * pendingOffset = kb->getScalarField("pendingOffset");
728    // There is a separate vector of pending data for each swizzle group.
729    std::vector<Value *> pendingData;
730    std::vector<Value *> outputStreamPtr;
731    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
732        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
733        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
734    }
735   
736    // Generate code for each of the mSwizzleFactor fields making up a block.
737    // We load the count for the field and process all swizzle groups accordingly.
738    for (unsigned i = 0; i < mSwizzleFactor; i++) {
739        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
740        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
741        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);
742       
743        // Data from the ith swizzle pack of each group is processed
744        // according to the same newItemCount, pendingSpace, ...
745        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
746            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
747            // Combine as many of the new items as possible into the pending group.
748            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, kb->simd_fill(mFieldWidth, pendingOffset)));
749            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
750            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
751            // Any items in excess of the space available in the current pending group overflow for the next group.
752            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
753            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
754            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
755        }
756        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
757        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
758    }
759    kb->setScalarField("pendingOffset", pendingOffset);
760    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
761    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
762    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
763        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
764    }
765    kb->setProducedItemCount("outputSwizzle0", produced);
766}
767
768void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
769    CreateDoBlockMethodCall(kb);
770    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
771    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
772   
773    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
774    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
775    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
776    Value * pendingOffset = kb->getScalarField("pendingOffset");
777
778    // Write the pending data.
779    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
780        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
781        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
782        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
783    }
784    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
785}
786   
787void StreamFilterCompiler::makeCall(parabix::StreamSetBuffer * mask, parabix::StreamSetBuffer * inputs, parabix::StreamSetBuffer * outputs) {
788    if (mBufferBlocks == 0) {
789        llvm::report_fatal_error("StreamFilterCompiler needs a non-zero bufferBlocks parameter (for now).");
790    }
791    auto & iBuilder = mDriver.getBuilder();
792    unsigned N = IDISA::getNumOfStreams(ssType);
793    if (IDISA::getStreamFieldWidth(ssType) != 1) {
794        llvm::report_fatal_error("StreamFilterCompiler only compresses bit streams (for now)");
795    }
796    Kernel * compressK = nullptr;
797    if (AVX2_available()) {
798        compressK = mDriver.addKernelInstance<PEXTFieldCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
799    } else {
800        compressK = mDriver.addKernelInstance<FieldCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
801    }
802    parabix::StreamSetBuffer * compressedFields = mDriver.addBuffer<parabix::CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(N), mBufferBlocks);
803    parabix::StreamSetBuffer * unitCounts = mDriver.addBuffer<parabix::CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), mBufferBlocks);
804   
805    mDriver.makeKernelCall(compressK, {inputs, mask}, {compressedFields, unitCounts});
806   
807    Kernel * streamK = mDriver.addKernelInstance<StreamCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
808    mDriver.makeKernelCall(streamK, {compressedFields, unitCounts}, {outputs});
809}
810
811
812}
Note: See TracBrowser for help on using the repository browser.