source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6055

Last change on this file since 6055 was 6055, checked in by cameron, 12 months ago

Various small fixes

File size: 44.0 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <toolchain/driver.h>
8#include <toolchain/cpudriver.h>
9#include <kernels/kernel_builder.h>
10#include <llvm/Support/raw_ostream.h>
11#include <IR_Gen/idisa_target.h>
12#include <llvm/IR/Intrinsics.h>
13
14using namespace llvm;
15
16namespace kernel {
17
18inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
19    Value * m = kb->simd_not(del_mask);
20    Value * mk = kb->simd_slli(fw, del_mask, 1);
21    std::vector<Value *> move_masks;
22    for (unsigned shift = 1; shift < fw; shift *= 2) {
23        Value * mp = mk;
24        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
25            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
26        }
27        Value * mv = kb->simd_and(mp, m);
28        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
29        mk = kb->simd_and(mk, kb->simd_not(mp));
30        move_masks.push_back(mv);
31    }
32    return move_masks;
33}
34
35inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
36    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
37    for (unsigned i = 0; i < mv.size(); i++) {
38        unsigned shift = 1 << i;
39        Value * t = kb->simd_and(s, mv[i]);
40        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
41    }
42    return s;
43}
44
45// Apply deletion to a set of stream_count input streams to produce a set of output streams.
46// Kernel inputs: stream_count data streams plus one del_mask stream
47// Outputs: the deleted streams, plus a partial sum popcount
48
49void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
50    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
51    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
52    for (unsigned j = 0; j < mStreamCount; ++j) {
53        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
54        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
55        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
56    }
57    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
58    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
59}
60
61void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
62    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
63    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
64    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
65    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
66    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
67    for (unsigned j = 0; j < mStreamCount; ++j) {
68        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
69        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
70        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
71    }
72    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
73    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
74}
75
76DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
77: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
78                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
79                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
80                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
81                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
82                      {}, {}, {})
83, mDeletionFieldWidth(fieldWidth)
84, mStreamCount(streamCount) {
85}
86
87void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
88    BasicBlock * entry = kb->GetInsertBlock();
89    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
90    BasicBlock * done = kb->CreateBasicBlock("done");
91    Constant * const ZERO = kb->getSize(0);
92    kb->CreateBr(processBlock);
93    kb->SetInsertPoint(processBlock);
94    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
95    blockOffsetPhi->addIncoming(ZERO, entry);
96    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
97    Value * delMask = kb->simd_not(extractionMask);
98    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
99    for (unsigned j = 0; j < mStreamCount; ++j) {
100        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
101        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
102        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
103    }
104    Value * unitCount = kb->simd_popcount(mCompressFieldWidth, extractionMask);
105    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), blockOffsetPhi, kb->bitCast(unitCount));
106    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
107    blockOffsetPhi->addIncoming(nextBlk, processBlock);
108    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
109    kb->CreateCondBr(moreToDo, processBlock, done);
110    kb->SetInsertPoint(done);
111}
112
113FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
114: MultiBlockKernel("fieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
115                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
116                          Binding{kb->getStreamSetTy(), "extractionMask"}},
117                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
118                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
119                      {}, {}, {})
120, mCompressFieldWidth(fieldWidth)
121, mStreamCount(streamCount) {
122}
123
124void PEXTFieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
125    Type * fieldTy = kb->getIntNTy(mPEXTWidth);
126    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
127    Constant * PEXT_func = nullptr;
128    Constant * popc_func = Intrinsic::getDeclaration(getModule(), Intrinsic::ctpop, fieldTy);
129    if (mPEXTWidth == 64) {
130        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
131    } else if (mPEXTWidth == 32) {
132        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
133    }
134    BasicBlock * entry = kb->GetInsertBlock();
135    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
136    BasicBlock * done = kb->CreateBasicBlock("done");
137    Constant * const ZERO = kb->getSize(0);
138    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPEXTWidth;
139    kb->CreateBr(processBlock);
140    kb->SetInsertPoint(processBlock);
141    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
142    blockOffsetPhi->addIncoming(ZERO, entry);
143    std::vector<Value *> mask(fieldsPerBlock);
144    Value * extractionMaskPtr = kb->getInputStreamBlockPtr("extractionMask", ZERO, blockOffsetPhi);
145    extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
146    Value * unitCountPtr = kb->getOutputStreamBlockPtr("unitCounts", ZERO, blockOffsetPhi);
147    unitCountPtr = kb->CreatePointerCast(unitCountPtr, fieldPtrTy);
148    for (unsigned i = 0; i < fieldsPerBlock; i++) {
149        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
150        Value * popc = kb->CreateCall(popc_func, mask[i]);
151        kb->CreateStore(popc, kb->CreateGEP(unitCountPtr, kb->getInt32(i)));
152    }
153    for (unsigned j = 0; j < mStreamCount; ++j) {
154        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
155        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
156        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
157        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
158        for (unsigned i = 0; i < fieldsPerBlock; i++) {
159            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
160            Value * compressed = kb->CreateCall(PEXT_func, {field, mask[i]});
161            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
162        }
163    }
164    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
165    blockOffsetPhi->addIncoming(nextBlk, processBlock);
166    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
167    kb->CreateCondBr(moreToDo, processBlock, done);
168    kb->SetInsertPoint(done);
169}
170
171PEXTFieldCompressKernel::PEXTFieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
172: MultiBlockKernel("PEXTfieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
173                   {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
174                       Binding{kb->getStreamSetTy(), "extractionMask"}},
175                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
176                       Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
177                   {}, {}, {})
178, mPEXTWidth(fieldWidth)
179, mStreamCount(streamCount) {
180    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PEXT width for PEXTFieldCompressKernel");
181}
182   
183StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
184: MultiBlockKernel("streamCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
185                   {Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet"},
186                       Binding{kb->getStreamSetTy(), "unitCounts"}},
187                   {Binding{kb->getStreamSetTy(streamCount), "compressedOutput", BoundedRate(0, 1)}},
188                   {}, {}, {})
189, mCompressedFieldWidth(fieldWidth)
190, mStreamCount(streamCount) {
191    addScalar(kb->getSizeTy(), "pendingItemCount");
192    for (unsigned i = 0; i < streamCount; i++) {
193        addScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
194    }
195
196}
197   
198void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
199    const unsigned fw = mCompressedFieldWidth;
200    Type * fwTy = b->getIntNTy(fw);
201    Type * sizeTy = b->getSizeTy();
202    const unsigned numFields = b->getBitBlockWidth()/fw;
203    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(fw));
204    Constant * oneSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, 1));
205    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
206    Constant * numFieldConst = ConstantInt::get(sizeTy, numFields);
207    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
208    Constant * bitBlockWidthConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
209    BasicBlock * entry = b->GetInsertBlock();
210    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
211    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
212    BasicBlock * finalWrite = b->CreateBasicBlock("finalWrite");
213    BasicBlock * updateProducedCount = b->CreateBasicBlock("updateProducedCount");
214    Constant * const ZERO = b->getSize(0);
215   
216    Value * pendingItemCount = b->getScalarField("pendingItemCount");
217    std::vector<Value *> pendingData(mStreamCount);
218    for (unsigned i = 0; i < mStreamCount; i++) {
219        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
220    }
221   
222    b->CreateBr(segmentLoop);
223    // Main Loop
224    b->SetInsertPoint(segmentLoop);
225    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
226    PHINode * outputBlockPhi = b->CreatePHI(b->getSizeTy(), 2);
227    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
228    PHINode * pendingDataPhi[mStreamCount];
229    blockOffsetPhi->addIncoming(ZERO, entry);
230    outputBlockPhi->addIncoming(ZERO, entry);
231    pendingItemsPhi->addIncoming(pendingItemCount, entry);
232    for (unsigned i = 0; i < mStreamCount; i++) {
233        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
234        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
235    }
236    Value * fieldPopCounts = b->loadInputStreamBlock("unitCounts", ZERO, blockOffsetPhi);
237    // For each field determine the (partial) sum popcount of all fields up to and
238    // including the current field.
239    Value * partialSum = fieldPopCounts;
240    for (unsigned i = 1; i < numFields; i *= 2) {
241        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
242    }
243    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
244    //
245    // Now determine for each source field the output offset of the first bit.
246    // Note that this depends on the number of pending bits.
247    //
248    Value * pendingOffset = b->CreateURem(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
249    Value * splatPending = b->simd_fill(fw, b->CreateZExtOrTrunc(pendingOffset, fwTy));
250    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
251    Value * offsets = b->simd_add(fw, b->mvmd_slli(fw, partialSum, 1), splatPending);
252    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
253   //
254    // Determine the relative field number for each output field.   Note that the total
255    // number of fields involved is numFields + 1.   However, the first field always
256    // be immediately combined into the current pending data field, so we calculate
257    // field numbers for all subsequent fields, (the fields that receive overflow bits).
258    Value * fieldNo = b->simd_srli(fw, b->simd_add(fw, partialSum, splatPending), std::log2(fw));
259  //
260    // Now process the input data block of each stream in the input stream set.
261    //
262    // First load all the stream set blocks and the pending data.
263    std::vector<Value *> sourceBlock(mStreamCount);
264    for (unsigned i = 0; i < mStreamCount; i++) {
265        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
266    }
267    // Now separate the bits of each field into ones that go into the current field
268    // and ones that go into the overflow field.   Extract the first field separately,
269    // and then shift and combine subsequent fields.
270    std::vector<Value *> pendingOutput(mStreamCount);
271    std::vector<Value *> outputFields(mStreamCount);
272    Value * backShift = b->simd_sub(fw, fwSplat, offsets);
273    for (unsigned i = 0; i < mStreamCount; i++) {
274        Value * currentFieldBits = b->simd_sllv(fw, sourceBlock[i], offsets);
275        Value * nextFieldBits = b->simd_srlv(fw, sourceBlock[i], backShift);
276        Value * firstField = b->mvmd_extract(fw, currentFieldBits, 0);
277        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
278        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
279        // shift back currentFieldBits to combine with nextFieldBits.
280        outputFields[i] = b->simd_or(b->mvmd_srli(fw, currentFieldBits, 1), nextFieldBits);
281    }
282    // Now combine forward all fields with the same field number.  This may require
283    // up to log2 numFields steps.
284    for (unsigned j = 1; j < numFields; j*=2) {
285        Value * select = b->simd_eq(fw, fieldNo, b->mvmd_slli(fw, fieldNo, j));
286        for (unsigned i = 0; i < mStreamCount; i++) {
287            Value * fields_fwd = b->mvmd_slli(fw, outputFields[i], j);
288            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
289       }
290    }
291    // Now compress the data fields, eliminating all but the last field from
292    // each run of consecutive field having the same field number as a subsequent field.
293    // But it may be that last field number is 0 which will compare equal to a 0 shifted in.
294    // So we add 1 to field numbers first.
295    Value * nonZeroFieldNo = b->simd_add(fw, fieldNo, oneSplat);
296    Value * eqNext = b->simd_eq(fw, nonZeroFieldNo, b->mvmd_srli(fw, nonZeroFieldNo, 1));
297    Value * compressMask = b->hsimd_signmask(fw, b->simd_not(eqNext));
298    for (unsigned i = 0; i < mStreamCount; i++) {
299        outputFields[i] = b->mvmd_compress(fw, outputFields[i], compressMask);
300    }
301    //
302    // Finally combine the pendingOutput and outputField data.
303    // (a) shift forward outputField data to fill the pendingOutput values.
304    // (b) shift back outputField data to clear data added to pendingOutput.
305    //
306    // However, we may need to increment pendingFieldIndex if we previously
307    // filled the field with the extracted firstField value.  The first
308    // value of the fieldNo vector will be 0 or 1.
309    // It is possible that pendingFieldIndex will reach the total number
310    // of fields held in register.  mvmd_sll may not handle this if it
311    // translates to an LLVM shl.
312    Value * increment = b->CreateZExtOrTrunc(b->mvmd_extract(fw, fieldNo, 0), sizeTy);
313    pendingFieldIdx = b->CreateAdd(pendingFieldIdx, increment);
314    Value * const pendingSpaceFilled = b->CreateICmpEQ(pendingFieldIdx, numFieldConst);
315    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
316    for (unsigned i = 0; i < mStreamCount; i++) {
317        Value * outputFwd = b->mvmd_sll(fw, outputFields[i], pendingFieldIdx);
318        outputFwd = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
319        pendingOutput[i] = b->simd_or(pendingOutput[i], outputFwd);
320        outputFields[i] = b->mvmd_srl(fw, outputFields[i], shftBack);
321    }
322    //
323    // Write the pendingOutput data to outputStream.
324    // Note: this data may be overwritten later, but we avoid branching.
325    for (unsigned i = 0; i < mStreamCount; i++) {
326        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
327    }
328    // Now determine the total amount of pending items and whether
329    // the pending data all fits within the pendingOutput.
330    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
331    Value * doesFit = b->CreateICmpULT(newPending, bitBlockWidthConst);
332    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, bitBlockWidthConst));
333    //
334    // Prepare Phi nodes for the next iteration.
335    //
336    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
337    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
338    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, b->getSize(1));
339    // But don't advance the output if all the data does fit into pendingOutput.
340    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
341    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
342    pendingItemsPhi->addIncoming(newPending, segmentLoop);
343
344    for (unsigned i = 0; i < mStreamCount; i++) {
345        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(fw, pendingOutput[i]), b->fwCast(fw, outputFields[i]));
346        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
347    }
348    //
349    // Now continue the loop if there are more blocks to process.
350    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
351    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);
352   
353    b->SetInsertPoint(segmentDone);
354    // Save kernel state.
355    b->setScalarField("pendingItemCount", newPending);
356    for (unsigned i = 0; i < mStreamCount; i++) {
357        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
358    }
359    b->CreateCondBr(mIsFinal, finalWrite, updateProducedCount);
360    b->SetInsertPoint(finalWrite);
361    for (unsigned i = 0; i < mStreamCount; i++) {
362        //Value * pending = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
363        Value * pending = b->bitCast(pendingOutput[i]);
364        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), nextOutputBlk, pending);
365    }
366    b->CreateBr(updateProducedCount);
367    b->SetInsertPoint(updateProducedCount);
368     Value * produced = b->getProducedItemCount("compressedOutput");
369    produced = b->CreateAdd(produced, b->CreateMul(nextOutputBlk, bitBlockWidthConst));
370    produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, newPending), produced);
371    b->setProducedItemCount("compressedOutput", produced);
372}
373
374SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
375: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
376                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
377                  {}, {}, {}, {})
378, mStreamCount(streamCount)
379, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
380// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
381// to prevent rounding errors.
382, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
383, mPEXTWidth(PEXT_width)
384{
385    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
386        && "mDelCountFieldWidth must be a power of 2");
387    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
388    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
389
390    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
391    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);
392
393    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
394    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
395    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
396        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
397        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
398    }
399    addScalar(b->getSizeTy(), "pendingOffset");
400}
401
402void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
403    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
404    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
405    generateProcessingLoop(b, delMask, false);
406}
407
408void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
409    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
410    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
411    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
412    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
413    generateProcessingLoop(b, delMask, true);
414}
415
416/*
417What this function does in pseudo code:
418for (mSwizzleFactor)
419    create a swizzle set containing mSwizzleFactor blocks
420    apply PEXT to each block in the swizzle set
421    store the swizzleSet in PEXTedSwizzleSets vector
422
423for (each swizzle row i)
424    for (each swizzle set j)
425        processes row i in swizzle set j
426        store output in pendingData[j]
427*/
428
429void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {
430
431    // selectors marks the positions we want to keep
432    Value * const selectors = b->CreateNot(delMask);
433
434    const auto swizzleSets = makeSwizzleSets(b, selectors);
435
436    // Compress the PEXTedSwizzleSets
437    // Output is written and committed to the output buffer one swizzle at a time.
438    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
439    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
440    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
441    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
442    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);
443
444    // All output groups have the same count.
445    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
446    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
447    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
448    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
449
450    // There is a separate vector of pending data for each swizzle group.
451    std::vector<Value *> pendingData;
452    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
453        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
454    }
455
456    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);
457
458    // For each row i
459    for (unsigned i = 0; i < mSwizzleFactor; i++) {
460
461        // Generate code for each of the mSwizzleFactor fields making up a block.
462        // We load the count for the field and process all swizzle groups accordingly.
463        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
464        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
465        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
466        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);
467
468        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
469        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
470
471        // Data from the ith swizzle pack of each group is processed
472        // according to the same newItemCount, pendingSpace, ...
473        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
474            Value * const newItems = swizzleSets[j][i];
475            // Combine as many of the new items as possible into the pending group.
476            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
477            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
478            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
479            // To avoid an unpredictable branch, always store the combined group, whether full or not.
480            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
481            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
482            // Any items in excess of the space available in the current pending group overflow for the next group.
483            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
484            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
485            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
486        }
487
488        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
489        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);
490
491        outputProduced = b->CreateAdd(outputProduced, newItemCount);
492    }
493
494    if (flush) { // incase we selected the overflow group on the final iteration
495        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
496        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
497        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
498            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
499            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
500        }
501    } else {
502        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
503            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
504        }
505        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
506        b->setScalarField("pendingOffset", pendingOffset);
507        // unless this is our final stride, don't report partially written fields.
508        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
509    }
510
511    b->setProducedItemCount("outputSwizzle0", outputProduced);
512}
513
514/*
515Apply PEXT deletion to the blocks in strms and swizzle the result.
516
517Q: Why is it advantageous to swizzle the PEXTed streams?
518
519A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
520
521Example:
522Input:     11101101
523PEXT mask: 11110000
524Output:    00001110
525
526PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
527cleared. This has implications if we're working with multiple streams.
528
529For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
530(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
531Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
532Stream 2:   qrstuv00 wxy00000 z1000000 23456000
533Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
534Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
535
536If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
537should be compress by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
538time:
539
540The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
541
542Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
543Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
544Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
545Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
546
547Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
548compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
549
550Args:
551    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
552    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
553        sequentially to PEXT a full block.)
554
555Returns:
556    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
557*/
558
559std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {
560
561    Constant * pext = nullptr;
562    if (mPEXTWidth == 64) {
563        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
564    } else if (mPEXTWidth == 32) {
565        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
566    }
567
568    Value * const m = b->fwCast(mPEXTWidth, selectors);
569
570    std::vector<Value *> masks(mSwizzleFactor);
571    for (unsigned i = 0; i < mSwizzleFactor; i++) {
572        masks[i] = b->CreateExtractElement(m, i);
573
574    }
575
576    std::vector<std::vector<Value *>> swizzleSets;
577    swizzleSets.reserve(mSwizzleSetCount);
578
579    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
580
581    UndefValue * const outputInitializer = UndefValue::get(vecTy);
582
583    std::vector<Value *> input(mSwizzleFactor);
584    // For each of the k swizzle sets required to apply PEXT to all input streams
585    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
586
587        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
588            const unsigned k = (i * mSwizzleFactor) + j;
589            if (k < mStreamCount) {
590                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
591            } else {
592                input[j] = Constant::getNullValue(vecTy);
593            }
594        }
595
596        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
597        // then splat the pext mask and apply it to each output row
598
599        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
600        // For each of the input streams
601        for (unsigned j = 0; j < mSwizzleFactor; j++) {
602            for (unsigned k = 0; k < mSwizzleFactor; k++) {
603                // Load block j,k
604                Value * const field = b->CreateExtractElement(input[j], k);
605                // Apply PEXT deletion
606                Value * const selected = b->CreateCall(pext, {field, masks[k]});
607                // Then store it as our k,j-th output
608                output[k] = b->CreateInsertElement(output[k], selected, j);
609            }
610        }
611
612        swizzleSets.emplace_back(output);
613    }
614
615    return swizzleSets;
616}
617
618// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
619// Kernel inputs: stream_count data streams plus one del_mask stream
620// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
621
622void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
623    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
624    generateProcessingLoop(kb, delMask);
625}
626
627void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
628    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
629    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
630    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
631    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
632    generateProcessingLoop(kb, delMask);
633}
634
635void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
636    Constant * PEXT_func = nullptr;
637    if (mPEXTWidth == 64) {
638        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
639    } else if (mPEXTWidth == 32) {
640        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
641    }
642    std::vector<Value *> masks(mSwizzleFactor);
643    Value * const m = kb->fwCast(mPEXTWidth, kb->simd_not(delMask));
644    for (unsigned i = 0; i < mSwizzleFactor; i++) {
645        masks[i] = kb->CreateExtractElement(m, i);
646    }
647
648    for (unsigned i = 0; i < mStreamCount; ++i) {
649        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
650        Value * value = kb->fwCast(mPEXTWidth, input);
651        Value * output = UndefValue::get(value->getType());
652        for (unsigned j = 0; j < mSwizzleFactor; j++) {
653            Value * field = kb->CreateExtractElement(value, j);
654            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
655            output = kb->CreateInsertElement(output, compressed, j);
656        }
657        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
658    }
659    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
660    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
661}
662
663DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
664: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
665              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
666                  Binding{b->getStreamSetTy(), "delMaskSet"}},
667              {}, {}, {}, {})
668, mDelCountFieldWidth(fw)
669, mStreamCount(streamCount)
670, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
671, mPEXTWidth(PEXT_width) {
672    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
673    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
674}
675
676   
677//
678// This kernel performs final stream compression for a set of N bitstreams, given
679// (a) a set of bitstreams partially compressed within K-bit fields and stored
680//     in K-bit swizzled form, and
681// (b) a stream of deletion/extraction counts per K-bit stride.
682//
683// Restrictions:  At present, only K=64 is supported.
684//                At present, N must be an exact multiple of BLOCK_SIZE/K.
685//
686// The kernel always consumes full blocks of input and emits data into the output
687// buffer in swizzles of K items at a time.   Upon completion of a segment,
688// up to K-1 pending output items per stream may be stored in the kernel state.
689//
690// Note: that both input streams and output streams are stored in swizzled form.
691//
692
693SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
694: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
695                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
696, mBitStreamCount(bitStreamCount)
697    , mFieldWidth(fieldWidth)
698    , mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
699    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
700        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
701        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
702        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
703        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
704        addScalar(kb->getBitBlockType(), "pendingSwizzleData0");
705        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
706            mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
707            mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
708            addScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
709        }
710        addScalar(kb->getSizeTy(), "pendingOffset");
711}
712   
713void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
714       
715    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
716    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());
717   
718    // Output is written and committed to the output buffer one swizzle at a time.
719    //
720    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
721    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
722   
723    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
724    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
725    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
726
727    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
728    Value * pendingOffset = kb->getScalarField("pendingOffset");
729    // There is a separate vector of pending data for each swizzle group.
730    std::vector<Value *> pendingData;
731    std::vector<Value *> outputStreamPtr;
732    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
733        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
734        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
735    }
736   
737    // Generate code for each of the mSwizzleFactor fields making up a block.
738    // We load the count for the field and process all swizzle groups accordingly.
739    for (unsigned i = 0; i < mSwizzleFactor; i++) {
740        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
741        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
742        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);
743       
744        // Data from the ith swizzle pack of each group is processed
745        // according to the same newItemCount, pendingSpace, ...
746        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
747            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
748            // Combine as many of the new items as possible into the pending group.
749            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, kb->simd_fill(mFieldWidth, pendingOffset)));
750            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
751            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
752            // Any items in excess of the space available in the current pending group overflow for the next group.
753            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
754            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
755            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
756        }
757        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
758        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
759    }
760    kb->setScalarField("pendingOffset", pendingOffset);
761    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
762    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
763    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
764        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
765    }
766    kb->setProducedItemCount("outputSwizzle0", produced);
767}
768
769void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
770    CreateDoBlockMethodCall(kb);
771    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
772    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
773   
774    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
775    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
776    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
777    Value * pendingOffset = kb->getScalarField("pendingOffset");
778
779    // Write the pending data.
780    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
781        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
782        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
783        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
784    }
785    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
786}
787   
788void StreamFilterCompiler::makeCall(parabix::StreamSetBuffer * mask, parabix::StreamSetBuffer * inputs, parabix::StreamSetBuffer * outputs) {
789    if (mBufferBlocks == 0) {
790        llvm::report_fatal_error("StreamFilterCompiler needs a non-zero bufferBlocks parameter (for now).");
791    }
792    auto & iBuilder = mDriver.getBuilder();
793    unsigned N = IDISA::getNumOfStreams(ssType);
794    if (IDISA::getStreamFieldWidth(ssType) != 1) {
795        llvm::report_fatal_error("StreamFilterCompiler only compresses bit streams (for now)");
796    }
797    Kernel * compressK = nullptr;
798    if (AVX2_available()) {
799        compressK = mDriver.addKernelInstance<PEXTFieldCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
800    } else {
801        compressK = mDriver.addKernelInstance<FieldCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
802    }
803    parabix::StreamSetBuffer * compressedFields = mDriver.addBuffer<parabix::StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(N), mBufferBlocks);
804    parabix::StreamSetBuffer * unitCounts = mDriver.addBuffer<parabix::StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(), mBufferBlocks);
805   
806    mDriver.makeKernelCall(compressK, {inputs, mask}, {compressedFields, unitCounts});
807   
808    Kernel * streamK = mDriver.addKernelInstance<StreamCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
809    mDriver.makeKernelCall(streamK, {compressedFields, unitCounts}, {outputs});
810}
811
812
813}
Note: See TracBrowser for help on using the repository browser.