source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

Last change on this file was 6261, checked in by nmedfort, 7 months ago

Work on OptimizationBranch?; revisited pipeline termination

File size: 42.3 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <toolchain/driver.h>
8#include <toolchain/cpudriver.h>
9#include <kernels/kernel_builder.h>
10#include <llvm/Support/raw_ostream.h>
11#include <IR_Gen/idisa_target.h>
12#include <llvm/IR/Intrinsics.h>
13
14using namespace llvm;
15
// Integer ceiling division: smallest q such that q * m >= n (for m > 0).
// Written as quotient-plus-remainder-test rather than (n + m - 1) / m so the
// result is correct even when n is close to SIZE_MAX, where the additive form
// would wrap around and return 0.
inline size_t ceil_udiv(const size_t n, const size_t m) {
    const size_t q = n / m;
    return (n % m == 0) ? q : q + 1;
}
19
20namespace kernel {
21
// Build the log2(fw) "move masks" for the parallel-prefix compress (deletion)
// algorithm (cf. the "compress" routine of Hacker's Delight, 7-4), applied
// independently to each fw-bit field of a block.  del_mask marks the bit
// positions to delete; the returned mask mv[i] selects, for step i, the kept
// bits that must move right by 2^i positions within their field.
inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
    Value * m = kb->simd_not(del_mask);           // bits to be kept
    Value * mk = kb->simd_slli(fw, del_mask, 1);  // positions with a deleted bit immediately to the right
    std::vector<Value *> move_masks;
    for (unsigned shift = 1; shift < fw; shift *= 2) {
        // mp = parallel suffix (xor-scan) of mk within each fw-bit field.
        Value * mp = mk;
        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
        }
        // mv: the kept bits that move in this step.
        Value * mv = kb->simd_and(mp, m);
        // Move the selected bits of the keep-mask right by the current shift.
        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
        mk = kb->simd_and(mk, kb->simd_not(mp));
        move_masks.push_back(mv);
    }
    return move_masks;
}
38
// Apply the move masks produced by parallel_prefix_deletion_masks to one data
// stream block: bits marked in del_mask are cleared, and the remaining bits of
// each fw-bit field are shifted toward the low end in log2(fw) steps, where
// step i moves the bits selected by mv[i] right by 2^i positions.
inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
    // Zero out the deleted positions before compaction.
    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
    for (unsigned i = 0; i < mv.size(); i++) {
        unsigned shift = 1 << i;  // step i moves its selected bits right by 2^i
        Value * t = kb->simd_and(s, mv[i]);
        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
    }
    return s;
}
48
49// Apply deletion to a set of stream_count input streams to produce a set of output streams.
50// Kernel inputs: stream_count data streams plus one del_mask stream
51// Outputs: the deleted streams, plus a partial sum popcount
52
53void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
54    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
55    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
56    for (unsigned j = 0; j < mStreamCount; ++j) {
57        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
58        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
59        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
60    }
61    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
62    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
63}
64
65void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
66    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
67    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
68    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
69    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
70    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
71    for (unsigned j = 0; j < mStreamCount; ++j) {
72        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
73        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
74        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
75    }
76    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
77    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
78}
79
// Construct a block-oriented deletion kernel operating on fieldWidth-bit
// fields across streamCount parallel bit streams.  Inputs are the stream set
// plus a single deletion-mask stream; outputs are the compressed streams and
// a "unitCounts" stream holding the per-field popcount of retained positions
// (rounded up to a full bit block).
DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned fieldWidth, const unsigned streamCount)
: BlockOrientedKernel(b, "del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                      {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
                          Binding{b->getStreamSetTy(), "delMaskSet"}},
                      {Binding{b->getStreamSetTy(streamCount), "outputStreamSet"},
                          Binding{b->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(b->getBitBlockWidth())}},
                      {}, {}, {})
, mDeletionFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
}
90
91void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
92    BasicBlock * entry = kb->GetInsertBlock();
93    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
94    BasicBlock * done = kb->CreateBasicBlock("done");
95    Constant * const ZERO = kb->getSize(0);
96    kb->CreateBr(processBlock);
97    kb->SetInsertPoint(processBlock);
98    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
99    blockOffsetPhi->addIncoming(ZERO, entry);
100    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
101    Value * delMask = kb->simd_not(extractionMask);
102    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
103    for (unsigned j = 0; j < mStreamCount; ++j) {
104        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
105        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
106        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
107    }
108    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
109    blockOffsetPhi->addIncoming(nextBlk, processBlock);
110    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
111    kb->CreateCondBr(moreToDo, processBlock, done);
112    kb->SetInsertPoint(done);
113}
114
// Construct a field compression kernel: bits of inputStreamSet where the
// extraction mask is 0 are deleted and the surviving bits are compacted
// toward the low end of each fw-bit field.  The kernel name encodes the
// field width and the number of streams in the set.
FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw
                                         , StreamSet * inputStreamSet, StreamSet * extractionMask
                                         , StreamSet * outputStreamSet)
: MultiBlockKernel(b, "fieldCompress" + std::to_string(fw) + "_" + std::to_string(inputStreamSet->getNumElements()),
// inputs
{Binding{"inputStreamSet", inputStreamSet},
Binding{"extractionMask", extractionMask}},
// outputs
{Binding{"outputStreamSet", outputStreamSet}},
{}, {}, {})
, mCompressFieldWidth(fw)
, mStreamCount(inputStreamSet->getNumElements()) {

}
129
// Compress each stream using the x86 BMI2 PEXT instruction, one
// mPEXTWidth-bit field at a time, with the corresponding field of the
// extraction mask serving as the PEXT selector mask.
void PEXTFieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
    Type * fieldTy = kb->getIntNTy(mPEXTWidth);
    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
    // The constructor rejects any width other than 32 or 64, so exactly one
    // of the branches below sets PEXT_func.
    Constant * PEXT_func = nullptr;
    if (mPEXTWidth == 64) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
    } else if (mPEXTWidth == 32) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
    }
    BasicBlock * entry = kb->GetInsertBlock();
    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
    BasicBlock * done = kb->CreateBasicBlock("done");
    Constant * const ZERO = kb->getSize(0);
    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPEXTWidth;
    kb->CreateBr(processBlock);
    kb->SetInsertPoint(processBlock);
    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    blockOffsetPhi->addIncoming(ZERO, entry);
    // Load all selector masks for this block once; they are shared by every stream.
    std::vector<Value *> mask(fieldsPerBlock);
    Value * extractionMaskPtr = kb->getInputStreamBlockPtr("extractionMask", ZERO, blockOffsetPhi);
    extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
    for (unsigned i = 0; i < fieldsPerBlock; i++) {
        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
    }
    // Apply PEXT to each field of each stream.  Note that PEXT leaves the
    // compressed bits at the low end of each field; fields are not repacked.
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
        for (unsigned i = 0; i < fieldsPerBlock; i++) {
            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
            Value * compressed = kb->CreateCall(PEXT_func, {field, mask[i]});
            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
        }
    }
    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
    blockOffsetPhi->addIncoming(nextBlk, processBlock);
    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
    kb->CreateCondBr(moreToDo, processBlock, done);
    kb->SetInsertPoint(done);
}
171
// Construct a PEXT-based field compression kernel.  fieldWidth selects the
// PEXT operand width and must be 32 or 64 — the only widths for which the
// x86 BMI2 PEXT intrinsics exist; any other value is a fatal error.
PEXTFieldCompressKernel::PEXTFieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel(b, "PEXTfieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                   {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
                       Binding{b->getStreamSetTy(), "extractionMask"}},
                   {Binding{b->getStreamSetTy(streamCount), "outputStreamSet"}},
                   {}, {}, {})
, mPEXTWidth(fieldWidth)
, mStreamCount(streamCount) {
    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PEXT width for PEXTFieldCompressKernel");
}
182
// Construct a stream compression kernel whose output rate is the popcount of
// the extraction mask (i.e. fully compressed output, not merely field-local
// compaction).  One bit block of pending, not-yet-committed output per stream
// is carried between strides in the "pendingOutputBlock_<i>" internal scalars.
StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b
                                           , StreamSet * source
                                           , StreamSet * extractionMask
                                           , StreamSet * compressedOutput
                                           , const unsigned FieldWidth)
: MultiBlockKernel(b, "streamCompress" + std::to_string(FieldWidth) + "_" + std::to_string(source->getNumElements()),
{Binding{"sourceStreamSet", source},
Binding{"extractionMask", extractionMask}},
{Binding{"compressedOutput", compressedOutput, PopcountOf("extractionMask"), BlockSize(1)}},
{}, {}, {})
, mCompressedFieldWidth(FieldWidth)
, mStreamCount(source->getNumElements()) {
    // One pending output block per stream, persisted across strides.
    for (unsigned i = 0; i < mStreamCount; i++) {
        addInternalScalar(b->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
    }
}
199
200void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
201    IntegerType * const fwTy = b->getIntNTy(mCompressedFieldWidth);
202    IntegerType * const sizeTy = b->getSizeTy();
203    const unsigned numFields = b->getBitBlockWidth() / mCompressedFieldWidth;
204    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(mCompressedFieldWidth));
205    Constant * oneSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, 1));
206    Constant * CFW = ConstantInt::get(fwTy, mCompressedFieldWidth);
207    Constant * fwSplat = ConstantVector::getSplat(numFields, CFW);
208    Constant * numFieldConst = ConstantInt::get(fwTy, numFields);
209    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, mCompressedFieldWidth - 1));
210    BasicBlock * entry = b->GetInsertBlock();
211    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
212    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
213    BasicBlock * finalWrite = b->CreateBasicBlock("finalWrite");
214    BasicBlock * updateProducedCount = b->CreateBasicBlock("updateProducedCount");
215    Constant * const ZERO = ConstantInt::get(sizeTy, 0);
216    Constant * const ONE = ConstantInt::get(sizeTy, 1);
217    Constant * const BlockWidth = b->getSize(b->getBitBlockWidth());
218
219    Value * produced = b->getProducedItemCount("compressedOutput");
220    Value * const pendingItemCount = b->CreateZExtOrTrunc(b->CreateURem(produced, BlockWidth), fwTy);
221
222    std::vector<Value *> pendingData(mStreamCount);
223    for (unsigned i = 0; i < mStreamCount; i++) {
224        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
225    }
226
227    b->CreateBr(segmentLoop);
228    // Main Loop
229    b->SetInsertPoint(segmentLoop);
230    PHINode * blockOffsetPhi = b->CreatePHI(sizeTy, 2);
231    PHINode * outputBlockPhi = b->CreatePHI(sizeTy, 2);
232    PHINode * pendingItemsPhi = b->CreatePHI(fwTy, 2);
233    PHINode * pendingDataPhi[mStreamCount];
234    blockOffsetPhi->addIncoming(ZERO, entry);
235    outputBlockPhi->addIncoming(ZERO, entry);
236    pendingItemsPhi->addIncoming(pendingItemCount, entry);
237    for (unsigned i = 0; i < mStreamCount; i++) {
238        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
239        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
240    }
241    Value * fieldPopCounts = b->simd_popcount(mCompressedFieldWidth, b->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi));
242    // For each field determine the (partial) sum popcount of all fields up to and
243    // including the current field.
244    Value * partialSum = fieldPopCounts;
245    for (unsigned i = 1; i < numFields; i *= 2) {
246        partialSum = b->simd_add(mCompressedFieldWidth, partialSum, b->mvmd_slli(mCompressedFieldWidth, partialSum, i));
247    }
248    // Value * blockPopCount = b->CreateZExtOrTrunc(b->mvmd_extract(mCompressedFieldWidth, partialSum, numFields - 1), fwTy);
249
250    Value * blockPopCount = b->mvmd_extract(mCompressedFieldWidth, partialSum, numFields - 1);
251
252    //
253    // Now determine for each source field the output offset of the first bit.
254    // Note that this depends on the number of pending bits.
255    //
256    Value * pendingOffset = b->CreateURem(pendingItemsPhi, CFW);
257    Value * splatPending = b->simd_fill(mCompressedFieldWidth, b->CreateZExtOrTrunc(pendingOffset, fwTy));
258    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, CFW);
259    Value * offsets = b->simd_add(mCompressedFieldWidth, b->mvmd_slli(mCompressedFieldWidth, partialSum, 1), splatPending);
260    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
261   //
262    // Determine the relative field number for each output field.   Note that the total
263    // number of fields involved is numFields + 1.   However, the first field always
264    // be immediately combined into the current pending data field, so we calculate
265    // field numbers for all subsequent fields, (the fields that receive overflow bits).
266    Value * pendingSum = b->simd_add(mCompressedFieldWidth, partialSum, splatPending);
267    Value * fieldNo = b->simd_srli(mCompressedFieldWidth, pendingSum, std::log2(mCompressedFieldWidth));
268    // Now process the input data block of each stream in the input stream set.
269    //
270    // First load all the stream set blocks and the pending data.
271    std::vector<Value *> sourceBlock(mStreamCount);
272    for (unsigned i = 0; i < mStreamCount; i++) {
273        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
274    }
275    // Now separate the bits of each field into ones that go into the current field
276    // and ones that go into the overflow field.   Extract the first field separately,
277    // and then shift and combine subsequent fields.
278    std::vector<Value *> pendingOutput(mStreamCount);
279    std::vector<Value *> outputFields(mStreamCount);
280    Value * backShift = b->simd_sub(mCompressedFieldWidth, fwSplat, offsets);
281    for (unsigned i = 0; i < mStreamCount; i++) {
282        Value * currentFieldBits = b->simd_sllv(mCompressedFieldWidth, sourceBlock[i], offsets);
283        Value * nextFieldBits = b->simd_srlv(mCompressedFieldWidth, sourceBlock[i], backShift);
284        Value * firstField = b->mvmd_extract(mCompressedFieldWidth, currentFieldBits, 0);
285        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
286        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
287        // shift back currentFieldBits to combine with nextFieldBits.
288        outputFields[i] = b->simd_or(b->mvmd_srli(mCompressedFieldWidth, currentFieldBits, 1), nextFieldBits);
289    }
290    // Now combine forward all fields with the same field number.  This may require
291    // up to log2 numFields steps.
292    for (unsigned j = 1; j < numFields; j*=2) {
293        Value * select = b->simd_eq(mCompressedFieldWidth, fieldNo, b->mvmd_slli(mCompressedFieldWidth, fieldNo, j));
294        for (unsigned i = 0; i < mStreamCount; i++) {
295            Value * fields_fwd = b->mvmd_slli(mCompressedFieldWidth, outputFields[i], j);
296            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
297       }
298    }
299    // Now compress the data fields, eliminating all but the last field from
300    // each run of consecutive field having the same field number as a subsequent field.
301    // But it may be that last field number is 0 which will compare equal to a 0 shifted in.
302    // So we add 1 to field numbers first.
303    Value * nonZeroFieldNo = b->simd_add(mCompressedFieldWidth, fieldNo, oneSplat);
304    Value * eqNext = b->simd_eq(mCompressedFieldWidth, nonZeroFieldNo, b->mvmd_srli(mCompressedFieldWidth, nonZeroFieldNo, 1));
305    Value * compressMask = b->hsimd_signmask(mCompressedFieldWidth, b->simd_not(eqNext));
306    for (unsigned i = 0; i < mStreamCount; i++) {
307        outputFields[i] = b->mvmd_compress(mCompressedFieldWidth, outputFields[i], compressMask);
308    }
309    //
310    // Finally combine the pendingOutput and outputField data.
311    // (a) shift forward outputField data to fill the pendingOutput values.
312    // (b) shift back outputField data to clear data added to pendingOutput.
313    //
314    // However, we may need to increment pendingFieldIndex if we previously
315    // filled the field with the extracted firstField value.  The first
316    // value of the fieldNo vector will be 0 or 1.
317    // It is possible that pendingFieldIndex will reach the total number
318    // of fields held in register.  mvmd_sll may not handle this if it
319    // translates to an LLVM shl.
320    Value * increment = b->CreateZExtOrTrunc(b->mvmd_extract(mCompressedFieldWidth, fieldNo, 0), fwTy);
321    pendingFieldIdx = b->CreateAdd(pendingFieldIdx, increment);
322    Value * const pendingSpaceFilled = b->CreateICmpEQ(pendingFieldIdx, numFieldConst);
323    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
324    for (unsigned i = 0; i < mStreamCount; i++) {
325        Value * shiftedField = b->mvmd_sll(mCompressedFieldWidth, outputFields[i], pendingFieldIdx);
326        Value * outputFwd = b->fwCast(mCompressedFieldWidth, shiftedField);
327        shiftedField = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
328        pendingOutput[i] = b->simd_or(pendingOutput[i], shiftedField);
329        outputFields[i] = b->mvmd_srl(mCompressedFieldWidth, outputFields[i], shftBack);
330    }
331    //
332    // Write the pendingOutput data to outputStream.
333    // Note: this data may be overwritten later, but we avoid branching.
334    for (unsigned i = 0; i < mStreamCount; i++) {
335        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
336    }
337    // Now determine the total amount of pending items and whether
338    // the pending data all fits within the pendingOutput.
339    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
340    Constant * BLOCK_WIDTH = ConstantInt::get(fwTy, b->getBitBlockWidth());
341    Value * doesFit = b->CreateICmpULT(newPending, BLOCK_WIDTH);
342    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, BLOCK_WIDTH));
343    //
344    // Prepare Phi nodes for the next iteration.
345    //
346    Value * nextBlk = b->CreateAdd(blockOffsetPhi, ONE);
347    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
348    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, ONE);
349    // But don't advance the output if all the data does fit into pendingOutput.
350    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
351    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
352    pendingItemsPhi->addIncoming(newPending, segmentLoop);
353
354    for (unsigned i = 0; i < mStreamCount; i++) {
355        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(mCompressedFieldWidth, pendingOutput[i]), b->fwCast(mCompressedFieldWidth, outputFields[i]));
356        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
357    }
358    //
359    // Now continue the loop if there are more blocks to process.
360    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
361    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);
362
363    b->SetInsertPoint(segmentDone);
364    // Save kernel state.
365    for (unsigned i = 0; i < mStreamCount; i++) {
366        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
367    }
368    b->CreateCondBr(mIsFinal, finalWrite, updateProducedCount);
369
370    b->SetInsertPoint(finalWrite);
371    for (unsigned i = 0; i < mStreamCount; i++) {
372        Value * pending = b->bitCast(pendingOutput[i]);
373        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), nextOutputBlk, pending);
374    }
375    b->CreateBr(updateProducedCount);
376
377    b->SetInsertPoint(updateProducedCount);
378
379}
380
381Bindings makeSwizzledDeleteByPEXTOutputBindings(const std::vector<StreamSet *> & outputStreamSets, const unsigned PEXTWidth) {
382    const auto n = outputStreamSets.size();
383    Bindings outputs;
384    outputs.reserve(n);
385    outputs.emplace_back("outputSwizzle0", outputStreamSets[0], PopcountOf("selectors"), BlockSize(PEXTWidth)); // PopcountOfNot("delMaskSet")
386    for (unsigned i = 1; i < n; ++i) {
387        outputs.emplace_back("outputSwizzle" + std::to_string(i), outputStreamSets[i], RateEqualTo("outputSwizzle0"), BlockSize(PEXTWidth));
388    }
389    return outputs;
390}
391
392SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b,
393                                                       StreamSet * selectors, StreamSet * inputStreamSet,
394                                                       const std::vector<StreamSet *> & outputStreamSets,
395                                                       const unsigned PEXTWidth)
396
397: MultiBlockKernel(b, "PEXTdel" + std::to_string(PEXTWidth) + "_" + std::to_string(inputStreamSet->getNumElements()),
398{Binding{"selectors", selectors}, Binding{"inputStreamSet", inputStreamSet}},
399makeSwizzledDeleteByPEXTOutputBindings(outputStreamSets, PEXTWidth),
400{}, {}, {})
401, mStreamCount(inputStreamSet->getNumElements())
402, mSwizzleFactor(b->getBitBlockWidth() / PEXTWidth)
403, mSwizzleSetCount(ceil_udiv(mStreamCount, mSwizzleFactor))
404, mPEXTWidth(PEXTWidth) {
405
406    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0) && "mDelCountFieldWidth must be a power of 2");
407    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
408    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
409    assert (mSwizzleSetCount);
410    assert (outputStreamSets.size() == mSwizzleSetCount);
411    assert (outputStreamSets[0]->getNumElements() == mSwizzleFactor);
412
413    addInternalScalar(b->getBitBlockType(), "pendingSwizzleData0");
414    for (unsigned i = 1; i < outputStreamSets.size(); ++i) {
415        assert (outputStreamSets[i]->getNumElements() == mSwizzleFactor);
416        addInternalScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
417    }
418}
419
// Delete (via PEXT) all positions where "selectors" is 0 from every input
// stream, writing swizzled output.  Output accumulates one PEXT-width field
// at a time; a partially filled group is carried across iterations (and
// across strides, via the "pendingSwizzleData<i>" scalars).  See the block
// comment following this function for why the output is swizzled.
void SwizzledDeleteByPEXTkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set

    BasicBlock * const entry = b->GetInsertBlock();
    BasicBlock * const beginLoop = b->CreateBasicBlock("beginLoop");

    ConstantInt * const ZERO = b->getSize(0);
    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);

    // All output groups have the same count.
    Value * const baseOutputProduced = b->getProducedItemCount("outputSwizzle0");
    Value * const baseProducedOffset = b->CreateAnd(baseOutputProduced, BLOCK_WIDTH_MASK);

    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData(mSwizzleSetCount);
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData[i] = b->getScalarField("pendingSwizzleData" + std::to_string(i));
    }
    b->CreateBr(beginLoop);

    b->SetInsertPoint(beginLoop);
    // Loop-carried state: stride index, the current output offset (in items),
    // and the pending (partially filled) group of each swizzle set.
    PHINode * const strideIndex = b->CreatePHI(numOfBlocks->getType(), 2);
    strideIndex->addIncoming(ZERO, entry);
    PHINode * const producedOffsetPhi = b->CreatePHI(numOfBlocks->getType(), 2);
    producedOffsetPhi->addIncoming(baseProducedOffset, entry);
    std::vector<PHINode *> pendingDataPhi(mSwizzleSetCount);
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingDataPhi[i] = b->CreatePHI(pendingData[i]->getType(), 2);
        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
        pendingData[i] = pendingDataPhi[i];
    }

    Value * const selectors = b->loadInputStreamBlock("selectors", strideIndex);

    const auto swizzleSets = makeSwizzleSets(b, selectors, strideIndex);

    // Per-field popcounts of the selectors = number of items each PEXT keeps.
    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);

    // Compress the PEXTedSwizzleSets
    // Output is written and committed to the output buffer one swizzle at a time.
    Value * producedOffset = producedOffsetPhi;

    // For each row i
    for (unsigned i = 0; i < mSwizzleFactor; i++) {

        // Generate code for each of the mSwizzleFactor fields making up a block.
        // We load the count for the field and process all swizzle groups accordingly.
        Value * const pendingOffset = b->CreateAnd(producedOffset, PEXT_WIDTH_MASK);
        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);

        Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
        Value * const spaceVector = b->simd_fill(mPEXTWidth, pendingSpace);

        // Locate the output position: which field, which swizzle row, and
        // which output block the pending group currently maps to.
        Value * const outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * const newItems = swizzleSets[j][i];
            // Combine as many of the new items as possible into the pending group.
            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            b->storeOutputStreamBlock("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset, combinedGroup);
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = b->CreateLShr(newItems, spaceVector);
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }
        producedOffset = b->CreateAdd(producedOffset, newItemCount);
    }

    BasicBlock * const finishedLoop = b->CreateBasicBlock("finishedLoop");
    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
    BasicBlock * const loopEndBlock = b->GetInsertBlock();
    strideIndex->addIncoming(nextStrideIndex, loopEndBlock);
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingDataPhi[i]->addIncoming(pendingData[i], loopEndBlock);
    }
    producedOffsetPhi->addIncoming(producedOffset, loopEndBlock);
    Value * const doneLoop = b->CreateICmpEQ(nextStrideIndex, numOfBlocks);

    b->CreateUnlikelyCondBr(doneLoop, finishedLoop, beginLoop);

    b->SetInsertPoint(finishedLoop);
    // Persist the partially filled groups for the next stride.
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
    }
}
517
518/*
519Apply PEXT deletion to the blocks in strms and swizzle the result.
520
521Q: Why is it advantageous to swizzle the PEXTed streams?
522
523A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
524
525Example:
526Input:     11101101
527PEXT mask: 11110000
528Output:    00001110
529
530PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
531cleared. This has implications if we're working with multiple streams.
532
533For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
534(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
535Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
536Stream 2:   qrstuv00 wxy00000 z1000000 23456000
537Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
538Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
539
540If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
should be compressed by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
542time:
543
544The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
545
546Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
547Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
548Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
549Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
550
551Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
552compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
553
554Args:
555    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
556    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
557        sequentially to PEXT a full block.)
558
559Returns:
560    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
561*/
562
563SwizzledDeleteByPEXTkernel::SwizzleSets SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors, Value * const strideIndex) {
564
565    Constant * pext = nullptr;
566    if (mPEXTWidth == 64) {
567        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
568    } else if (mPEXTWidth == 32) {
569        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
570    }
571
572    Value * const m = b->fwCast(mPEXTWidth, selectors);
573
574    std::vector<Value *> masks(mSwizzleFactor);
575    for (unsigned i = 0; i < mSwizzleFactor; i++) {
576        masks[i] = b->CreateExtractElement(m, i);
577
578    }
579
580    SwizzleSets swizzleSets;
581    swizzleSets.reserve(mSwizzleSetCount);
582
583    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
584
585    UndefValue * const outputInitializer = UndefValue::get(vecTy);
586
587    std::vector<Value *> input(mSwizzleFactor);
588    // For each of the k swizzle sets required to apply PEXT to all input streams
589    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
590
591        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
592            const unsigned k = (i * mSwizzleFactor) + j;
593            if (k < mStreamCount) {
594                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k), strideIndex), vecTy);
595            } else {
596                input[j] = Constant::getNullValue(vecTy);
597            }
598        }
599
600        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
601        // then splat the pext mask and apply it to each output row
602
603        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
604        // For each of the input streams
605        for (unsigned j = 0; j < mSwizzleFactor; j++) {
606            for (unsigned k = 0; k < mSwizzleFactor; k++) {
607                // Load block j,k
608                Value * const field = b->CreateExtractElement(input[j], k);
609                // Apply PEXT deletion
610                Value * const selected = b->CreateCall(pext, {field, masks[k]});
611                // Then store it as our k,j-th output
612                output[k] = b->CreateInsertElement(output[k], selected, j);
613            }
614        }
615        swizzleSets.emplace_back(output);
616    }
617
618    return swizzleSets;
619}
620
621
622// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
623// Kernel inputs: stream_count data streams plus one del_mask stream
624// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
625
626void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
627    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
628    generateProcessingLoop(kb, delMask);
629}
630
631void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
632    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
633    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
634    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
635    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
636    generateProcessingLoop(kb, delMask);
637}
638
639void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
640    Constant * PEXT_func = nullptr;
641    if (mPEXTWidth == 64) {
642        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
643    } else if (mPEXTWidth == 32) {
644        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
645    }
646    std::vector<Value *> masks(mSwizzleFactor);
647    Value * const m = kb->fwCast(mPEXTWidth, kb->simd_not(delMask));
648    for (unsigned i = 0; i < mSwizzleFactor; i++) {
649        masks[i] = kb->CreateExtractElement(m, i);
650    }
651
652    for (unsigned i = 0; i < mStreamCount; ++i) {
653        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
654        Value * value = kb->fwCast(mPEXTWidth, input);
655        Value * output = UndefValue::get(value->getType());
656        for (unsigned j = 0; j < mSwizzleFactor; j++) {
657            Value * field = kb->CreateExtractElement(value, j);
658            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
659            output = kb->CreateInsertElement(output, compressed, j);
660        }
661        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
662    }
663    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
664    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
665}
666
// Constructor: registers the streamCount-wide data input plus a single
// deletion-mask stream.  Output bindings are added in the body below.
// fw is the field width used for the per-field deletion counts; PEXT_width
// (32 or 64) is the width of each PEXT extraction field.
DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
: BlockOrientedKernel(b, "PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
                  Binding{b->getStreamSetTy(), "delMaskSet"}},
              {}, {}, {}, {})
, mDelCountFieldWidth(fw)
, mStreamCount(streamCount)
// Number of PEXT-width fields per bit block.
, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
, mPEXTWidth(PEXT_width) {
    // The compressed output is produced at the rate of the non-deleted
    // (mask == 0) positions; deletionCounts carries the per-field popcounts.
    mOutputStreamSets.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
    mOutputStreamSets.emplace_back(b->getStreamSetTy(), "deletionCounts");
}
679
680
681//
682// This kernel performs final stream compression for a set of N bitstreams, given
683// (a) a set of bitstreams partially compressed within K-bit fields and stored
684//     in K-bit swizzled form, and
685// (b) a stream of deletion/extraction counts per K-bit stride.
686//
687// Restrictions:  At present, only K=64 is supported.
688//                At present, N must be an exact multiple of BLOCK_SIZE/K.
689//
690// The kernel always consumes full blocks of input and emits data into the output
691// buffer in swizzles of K items at a time.   Upon completion of a segment,
692// up to K-1 pending output items per stream may be stored in the kernel state.
693//
// Note that both input streams and output streams are stored in swizzled form.
695//
696
// Constructor: registers the per-stride count stream, one swizzled input and
// output stream set per swizzle group, and the kernel state needed to carry
// up to fieldWidth-1 pending output items per stream between blocks.
SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned bitStreamCount, unsigned fieldWidth)
: BlockOrientedKernel(b, "swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
                     {Binding{b->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
, mBitStreamCount(bitStreamCount)
, mFieldWidth(fieldWidth)
// Number of fieldWidth-sized fields per bit block.
, mSwizzleFactor(b->getBitBlockWidth() / fieldWidth)
// Number of swizzle groups needed to cover all bit streams (ceiling division).
, mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
    assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
    assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
    // Group 0 is bound first and carries the BoundedRate; the remaining
    // groups' output rates are tied to it via RateEqualTo.
    mInputStreamSets.push_back(Binding{b->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
    mOutputStreamSets.push_back(Binding{b->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
    // One block of pending (not yet committed) output data per swizzle group.
    addInternalScalar(b->getBitBlockType(), "pendingSwizzleData0");
    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
        mInputStreamSets.push_back(Binding{b->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
        mOutputStreamSets.push_back(Binding{b->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
        addInternalScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    }
    // Number of items currently held in the pending data (shared by all groups).
    addInternalScalar(b->getSizeTy(), "pendingOffset");
}
716
// Compress one block of swizzled input per swizzle group.  For each of the
// mSwizzleFactor swizzle packs in the block, the per-stride count says how
// many items each pack contributes; those items are appended to the pending
// data and full swizzles are committed to the output buffer as they fill.
void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {

    // View the counts block as an array of mFieldWidth-bit integers so each
    // per-stride count can be loaded individually.
    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());

    // Output is written and committed to the output buffer one swizzle at a time.
    //
    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));

    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    // Position within the current output block, and the index of the swizzle
    // (of mFieldWidth items) that is next to be written.
    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);

    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
    Value * pendingOffset = kb->getScalarField("pendingOffset");
    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    std::vector<Value *> outputStreamPtr;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
    }

    // Generate code for each of the mSwizzleFactor fields making up a block.
    // We load the count for the field and process all swizzle groups accordingly.
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
        // Space remaining in the pending group before it fills a whole field.
        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);

        // Splat the current pending offset so it can shift every field at once.
        Value * const fieldWidths = kb->simd_fill(mFieldWidth, pendingOffset);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
            // Combine as many of the new items as possible into the pending group.
            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, fieldWidths));
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }
        // Advance the output index only when a full swizzle was committed;
        // the pending offset wraps modulo mFieldWidth (mFieldWidth is a power of 2).
        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
    }
    // Persist the carry state for the next block.
    kb->setScalarField("pendingOffset", pendingOffset);
//    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
//    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    }
//    kb->setProducedItemCount("outputSwizzle0", produced);
}
774
775void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
776    CreateDoBlockMethodCall(kb);
777    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
778    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
779
780    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
781    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
782    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
783//    Value * pendingOffset = kb->getScalarField("pendingOffset");
784
785    // Write the pending data.
786    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
787        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
788        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
789        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
790    }
791//    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
792}
793
794}
Note: See TracBrowser for help on using the repository browser.