source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6024

Last change on this file since 6024 was 6024, checked in by cameron, 10 months ago

mvmd_compress for SSE2, StreamCompress bug fix

/*
 *  Copyright (c) 2018 International Characters.
 *  This software is licensed to the public under the Open Software License 3.0.
 */

#include "deletion.h"
#include <kernels/kernel_builder.h>
#include <llvm/Support/raw_ostream.h>

using namespace llvm;

namespace kernel {

inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
    Value * m = kb->simd_not(del_mask);
    Value * mk = kb->simd_slli(fw, del_mask, 1);
    std::vector<Value *> move_masks;
    for (unsigned shift = 1; shift < fw; shift *= 2) {
        Value * mp = mk;
        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
        }
        Value * mv = kb->simd_and(mp, m);
        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
        mk = kb->simd_and(mk, kb->simd_not(mp));
        move_masks.push_back(mv);
    }
    return move_masks;
}

inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
    for (unsigned i = 0; i < mv.size(); i++) {
        unsigned shift = 1 << i;
        Value * t = kb->simd_and(s, mv[i]);
        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
    }
    return s;
}
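
// Illustrative scalar sketch (not part of the original kernel code, name and helper
// are hypothetical): the two helpers above apply the classic parallel-prefix compress
// technique (cf. Hacker's Delight, "Compress, or Generalized Extract") to every fw-bit
// field of a block at once.  For a single 64-bit word the same steps look like this
// (assumes <cstdint>, which the LLVM headers above pull in transitively):
inline uint64_t scalar_compress64(uint64_t x, uint64_t del_mask) {
    uint64_t m = ~del_mask;                       // mask of bits to keep
    uint64_t mk = del_mask << 1;                  // positions just left of a deleted bit
    x &= m;                                       // clear the deleted bits
    for (unsigned i = 0; i < 6; i++) {            // 6 == log2(64) rounds
        uint64_t mp = mk;
        for (unsigned j = 1; j < 64; j <<= 1) {   // parallel prefix xor-scan
            mp ^= mp << j;
        }
        uint64_t mv = mp & m;                     // kept bits that move right this round
        m = (m ^ mv) | (mv >> (1u << i));         // compress the keep mask
        uint64_t t = x & mv;
        x = (x ^ t) | (t >> (1u << i));           // move the data bits right by 2^i
        mk &= ~mp;
    }
    return x;
}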

// Apply deletion to a set of stream_count input streams to produce a set of output streams.
// Kernel inputs: stream_count data streams plus one del_mask stream
// Outputs: the deleted streams, plus a partial sum popcount
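//
// Illustrative example (not from the original source), fw = 8: for an input field holding
// bits s7 s6 s5 s4 s3 s2 s1 s0 with del_mask 00100100 (delete positions 2 and 5), the
// surviving bits are packed toward the low end of the field, giving 0 0 s7 s6 s4 s3 s1 s0,
// and the unit count for that field is popcount(~del_mask) = 6.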

void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
    }
    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
}

void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
    }
    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
}

DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                      {}, {}, {})
, mDeletionFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
}

void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
    BasicBlock * entry = kb->GetInsertBlock();
    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
    BasicBlock * done = kb->CreateBasicBlock("done");
    Constant * const ZERO = kb->getSize(0);
    kb->CreateBr(processBlock);
    kb->SetInsertPoint(processBlock);
    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    blockOffsetPhi->addIncoming(ZERO, entry);
    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
    Value * delMask = kb->simd_not(extractionMask);
    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
    }
    Value * unitCount = kb->simd_popcount(mCompressFieldWidth, extractionMask);
    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), blockOffsetPhi, kb->bitCast(unitCount));
    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
    blockOffsetPhi->addIncoming(nextBlk, processBlock);
    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
    kb->CreateCondBr(moreToDo, processBlock, done);
    kb->SetInsertPoint(done);
}

FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("fieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                          Binding{kb->getStreamSetTy(), "extractionMask"}},
                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                      {}, {}, {})
, mCompressFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
}

void PEXTFieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
    Type * fieldTy = kb->getIntNTy(mPEXTWidth);
    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
    Constant * PEXT_func = nullptr;
    Constant * popc_func = Intrinsic::getDeclaration(getModule(), Intrinsic::ctpop, fieldTy);
    if (mPEXTWidth == 64) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
    } else if (mPEXTWidth == 32) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
    }
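    // PEXT packs the bits of its first operand selected by the mask operand into the
    // low-order bit positions; e.g. pext(0b10110101, 0b11011011) == 0b00101001.  This is
    // the same per-field compression computed by the parallel-prefix kernels above.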
    BasicBlock * entry = kb->GetInsertBlock();
    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
    BasicBlock * done = kb->CreateBasicBlock("done");
    Constant * const ZERO = kb->getSize(0);
    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPEXTWidth;
    kb->CreateBr(processBlock);
    kb->SetInsertPoint(processBlock);
    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    blockOffsetPhi->addIncoming(ZERO, entry);
    std::vector<Value *> mask(fieldsPerBlock);
    Value * extractionMaskPtr = kb->getInputStreamBlockPtr("extractionMask", ZERO, blockOffsetPhi);
    extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
    Value * unitCountPtr = kb->getOutputStreamBlockPtr("unitCounts", ZERO, blockOffsetPhi);
    unitCountPtr = kb->CreatePointerCast(unitCountPtr, fieldPtrTy);
    for (unsigned i = 0; i < fieldsPerBlock; i++) {
        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
        Value * popc = kb->CreateCall(popc_func, mask[i]);
        kb->CreateStore(popc, kb->CreateGEP(unitCountPtr, kb->getInt32(i)));
    }
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
        for (unsigned i = 0; i < fieldsPerBlock; i++) {
            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
            Value * compressed = kb->CreateCall(PEXT_func, {field, mask[i]});
            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
        }
    }
    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
    blockOffsetPhi->addIncoming(nextBlk, processBlock);
    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
    kb->CreateCondBr(moreToDo, processBlock, done);
    kb->SetInsertPoint(done);
}

PEXTFieldCompressKernel::PEXTFieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("PEXTfieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                   {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                       Binding{kb->getStreamSetTy(), "extractionMask"}},
                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                       Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                   {}, {}, {})
, mPEXTWidth(fieldWidth)
, mStreamCount(streamCount) {
    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PEXT width for PEXTFieldCompressKernel");
}

StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("streamCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                   {Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet"},
                       Binding{kb->getStreamSetTy(), "unitCounts"}},
                   {Binding{kb->getStreamSetTy(streamCount), "compressedOutput", BoundedRate(0, 1)}},
                   {}, {}, {})
, mCompressedFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
    addScalar(kb->getSizeTy(), "pendingItemCount");
    for (unsigned i = 0; i < streamCount; i++) {
        addScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
    }
}

void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
    const unsigned fw = mCompressedFieldWidth;
    Type * fwTy = b->getIntNTy(fw);
    Type * sizeTy = b->getSizeTy();
    const unsigned numFields = b->getBitBlockWidth()/fw;
    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(fw));
    Constant * oneSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, 1));
    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
    Constant * numFieldConst = ConstantInt::get(sizeTy, numFields);
    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
    Constant * bitBlockWidthConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
    BasicBlock * entry = b->GetInsertBlock();
    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
    BasicBlock * finalWrite = b->CreateBasicBlock("finalWrite");
    BasicBlock * updateProducedCount = b->CreateBasicBlock("updateProducedCount");
    Constant * const ZERO = b->getSize(0);

    Value * pendingItemCount = b->getScalarField("pendingItemCount");
    std::vector<Value *> pendingData(mStreamCount);
    for (unsigned i = 0; i < mStreamCount; i++) {
        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
    }

    b->CreateBr(segmentLoop);
    // Main Loop
    b->SetInsertPoint(segmentLoop);
    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
    PHINode * outputBlockPhi = b->CreatePHI(b->getSizeTy(), 2);
    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
    std::vector<PHINode *> pendingDataPhi(mStreamCount);
    blockOffsetPhi->addIncoming(ZERO, entry);
    outputBlockPhi->addIncoming(ZERO, entry);
    pendingItemsPhi->addIncoming(pendingItemCount, entry);
    for (unsigned i = 0; i < mStreamCount; i++) {
        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
    }
    Value * fieldPopCounts = b->loadInputStreamBlock("unitCounts", ZERO, blockOffsetPhi);
    // For each field determine the (partial) sum popcount of all fields up to and
    // including the current field.
    Value * partialSum = fieldPopCounts;
    for (unsigned i = 1; i < numFields; i *= 2) {
        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
    }
    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
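    // Illustrative example (assuming fw = 64 and a 256-bit block, so numFields = 4):
    // per-field popcounts [50, 60, 40, 30] yield partialSum = [50, 110, 150, 180]
    // and blockPopCount = 180.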
    //
    // Now determine for each source field the output offset of the first bit.
    // Note that this depends on the number of pending bits.
    //
    Value * pendingOffset = b->CreateURem(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
    Value * splatPending = b->simd_fill(fw, b->CreateZExtOrTrunc(pendingOffset, fwTy));
    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
    Value * offsets = b->simd_add(fw, b->mvmd_slli(fw, partialSum, 1), splatPending);
    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
    //
    // Determine the relative field number for each output field.   Note that the total
    // number of fields involved is numFields + 1.   However, the first field will always
    // be combined immediately into the current pending data field, so we calculate
    // field numbers for all subsequent fields (the fields that receive overflow bits).
    Value * fieldNo = b->simd_srli(fw, b->simd_add(fw, partialSum, splatPending), std::log2(fw));
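    // Continuing the illustrative example with pendingItemsPhi = 20 (so pendingOffset = 20
    // and pendingFieldIdx = 0): offsets = ([0, 50, 110, 150] + 20) mod 64 = [20, 6, 2, 42]
    // and fieldNo = ([50, 110, 150, 180] + 20) / 64 = [1, 2, 2, 3].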
    //
    // Now process the input data block of each stream in the input stream set.
    //
    // First load all the stream set blocks and the pending data.
    std::vector<Value *> sourceBlock(mStreamCount);
    for (unsigned i = 0; i < mStreamCount; i++) {
        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
    }
    // Now separate the bits of each field into ones that go into the current field
    // and ones that go into the overflow field.   Extract the first field separately,
    // and then shift and combine subsequent fields.
    std::vector<Value *> pendingOutput(mStreamCount);
    std::vector<Value *> outputFields(mStreamCount);
    Value * backShift = b->simd_sub(fw, fwSplat, offsets);
    for (unsigned i = 0; i < mStreamCount; i++) {
        Value * currentFieldBits = b->simd_sllv(fw, sourceBlock[i], offsets);
        Value * nextFieldBits = b->simd_srlv(fw, sourceBlock[i], backShift);
        Value * firstField = b->mvmd_extract(fw, currentFieldBits, 0);
        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
        // shift back currentFieldBits to combine with nextFieldBits.
        outputFields[i] = b->simd_or(b->mvmd_srli(fw, currentFieldBits, 1), nextFieldBits);
    }
    // Now combine forward all fields with the same field number.  This may require
    // up to log2 numFields steps.
    for (unsigned j = 1; j < numFields; j*=2) {
        Value * select = b->simd_eq(fw, fieldNo, b->mvmd_slli(fw, fieldNo, j));
        for (unsigned i = 0; i < mStreamCount; i++) {
            Value * fields_fwd = b->mvmd_slli(fw, outputFields[i], j);
            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
        }
    }
    // Now compress the data fields, eliminating all but the last field from
    // each run of consecutive fields having the same field number as a subsequent field.
    // But it may be that the last field number is 0, which would compare equal to a 0 shifted in.
    // So we add 1 to the field numbers first.
    Value * nonZeroFieldNo = b->simd_add(fw, fieldNo, oneSplat);
    Value * eqNext = b->simd_eq(fw, nonZeroFieldNo, b->mvmd_srli(fw, nonZeroFieldNo, 1));
    Value * compressMask = b->hsimd_signmask(fw, b->simd_not(eqNext));
    for (unsigned i = 0; i < mStreamCount; i++) {
        outputFields[i] = b->mvmd_compress(fw, outputFields[i], compressMask);
    }
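    // Continuing the illustrative example: nonZeroFieldNo = [2, 3, 3, 4], so eqNext is true
    // only for field 1 (its field number matches field 2's) and compressMask = 0b1101;
    // mvmd_compress then keeps fields 0, 2 and 3, discarding field 1, whose bits were
    // already combined forward into field 2.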
    //
    // Finally combine the pendingOutput and outputField data.
    // (a) shift forward outputField data to fill the pendingOutput values.
    // (b) shift back outputField data to clear data added to pendingOutput.
    //
    // However, we may need to increment pendingFieldIdx if we previously
    // filled the field with the extracted firstField value.  The first
    // value of the fieldNo vector will be 0 or 1.
    // It is possible that pendingFieldIdx will reach the total number
    // of fields held in register.  mvmd_sll may not handle this if it
    // translates to an LLVM shl.
    Value * increment = b->CreateZExtOrTrunc(b->mvmd_extract(fw, fieldNo, 0), sizeTy);
    pendingFieldIdx = b->CreateAdd(pendingFieldIdx, increment);
    Value * const pendingSpaceFilled = b->CreateICmpEQ(pendingFieldIdx, numFieldConst);
    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
    for (unsigned i = 0; i < mStreamCount; i++) {
        Value * outputFwd = b->mvmd_sll(fw, outputFields[i], pendingFieldIdx);
        outputFwd = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
        pendingOutput[i] = b->simd_or(pendingOutput[i], outputFwd);
        outputFields[i] = b->mvmd_srl(fw, outputFields[i], shftBack);
    }
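    // Continuing the illustrative example: increment = fieldNo[0] = 1, so pendingFieldIdx
    // becomes 1, pendingSpaceFilled is false (1 != 4) and shftBack = 3: the compressed
    // fields are shifted up by one position into pendingOutput, and shifted back down by
    // three so that only the overflow field remains in outputFields.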
    //
    // Write the pendingOutput data to outputStream.
    // Note: this data may be overwritten later, but we avoid branching.
    for (unsigned i = 0; i < mStreamCount; i++) {
        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
    }
    // Now determine the total amount of pending items and whether
    // the pending data all fits within the pendingOutput.
    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
    Value * doesFit = b->CreateICmpULT(newPending, bitBlockWidthConst);
    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, bitBlockWidthConst));
    //
    // Prepare Phi nodes for the next iteration.
    //
    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, b->getSize(1));
    // But don't advance the output if all the data does fit into pendingOutput.
    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
    pendingItemsPhi->addIncoming(newPending, segmentLoop);

    for (unsigned i = 0; i < mStreamCount; i++) {
        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(fw, pendingOutput[i]), b->fwCast(fw, outputFields[i]));
        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
    }
    //
    // Now continue the loop if there are more blocks to process.
    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);

    b->SetInsertPoint(segmentDone);
    // Save kernel state.
    b->setScalarField("pendingItemCount", newPending);
    for (unsigned i = 0; i < mStreamCount; i++) {
        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
    }
    b->CreateCondBr(mIsFinal, finalWrite, updateProducedCount);
    b->SetInsertPoint(finalWrite);
    for (unsigned i = 0; i < mStreamCount; i++) {
        //Value * pending = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
        Value * pending = b->bitCast(pendingOutput[i]);
        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), nextOutputBlk, pending);
    }
    b->CreateBr(updateProducedCount);
    b->SetInsertPoint(updateProducedCount);
    Value * produced = b->getProducedItemCount("compressedOutput");
    produced = b->CreateAdd(produced, b->CreateMul(nextOutputBlk, bitBlockWidthConst));
    produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, newPending), produced);
    b->setProducedItemCount("compressedOutput", produced);
}

SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
                  {}, {}, {}, {})
, mStreamCount(streamCount)
, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
// to prevent rounding errors.
, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
, mPEXTWidth(PEXT_width)
{
    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
        && "mPEXTWidth must be a power of 2");
    assert(mSwizzleFactor > 1 && "mPEXTWidth must be less than the block width");
    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");

    // why, if we have 1 input stream, are there n output swizzle streams rather than 1 of n?
    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);

    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    }
    addScalar(b->getSizeTy(), "pendingOffset");
}

void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
    generateProcessingLoop(b, delMask, false);
}

void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
    generateProcessingLoop(b, delMask, true);
}

/*
What this function does in pseudo code:
for (each of the mSwizzleSetCount swizzle sets)
    create a swizzle set containing mSwizzleFactor blocks
    apply PEXT to each block in the swizzle set
    store the swizzle set in the PEXTedSwizzleSets vector

for (each swizzle row i)
    for (each swizzle set j)
        process row i in swizzle set j
        store output in pendingData[j]
*/

void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {

    // selectors marks the positions we want to keep
    Value * const selectors = b->CreateNot(delMask);

    const auto swizzleSets = makeSwizzleSets(b, selectors);

    // Compress the PEXTedSwizzleSets
    // Output is written and committed to the output buffer one swizzle at a time.
    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);
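    // For example (illustrative), with a 256-bit block and mPEXTWidth = 64: BLOCK_WIDTH_MASK
    // = 255, PEXT_WIDTH = 64, LOG_2_PEXT_WIDTH = 6, LOG_2_SWIZZLE_FACTOR = 2 and
    // PEXT_WIDTH_MASK = 63.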

    // All output groups have the same count.
    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);

    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
    }

    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);

    // For each row i
    for (unsigned i = 0; i < mSwizzleFactor; i++) {

        // Generate code for each of the mSwizzleFactor fields making up a block.
        // We load the count for the field and process all swizzle groups accordingly.
        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);

        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * const newItems = swizzleSets[j][i];
            // Combine as many of the new items as possible into the pending group.
            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }

        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);

        outputProduced = b->CreateAdd(outputProduced, newItemCount);
    }

    if (flush) { // in case we selected the overflow group on the final iteration
        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
        }
    } else {
        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
        }
        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
        b->setScalarField("pendingOffset", pendingOffset);
        // unless this is our final stride, don't report partially written fields.
        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
    }

    b->setProducedItemCount("outputSwizzle0", outputProduced);
}

/*
Apply PEXT deletion to the input stream blocks and swizzle the result.

Q: Why is it advantageous to swizzle the PEXTed streams?

A: PEXT doesn't compress streams: if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.

Example:
Input:     11101101
PEXT mask: 11110000
Output:    00001110

PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
cleared. This has implications if we're working with multiple streams.

For example, say we've applied PEXT to the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
Stream 2:   qrstuv00 wxy00000 z1000000 23456000
Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000

If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first segment of each stream
should be compressed by 2 bits, the second segment by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
time:

The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / PEXT field width = 4).

Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
Swizzle 4:  lmnop000 23456000 LMNOP000 23456000

Now we can compress every field of swizzle 1 by 2, every field of swizzle 2 by 5, etc. Once we've completed the
compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!

Args:
    selectors: the set of bit positions to keep (the inverse of the deletion mask); it is broken into PEXT-width
        pieces which are applied to the corresponding fields of each input stream block.

Returns:
    output (vector of vectors of Value*): Swizzled, PEXTed version of the input stream set blocks. See example above.
*/

std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {

    Constant * pext = nullptr;
    if (mPEXTWidth == 64) {
        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
    } else if (mPEXTWidth == 32) {
        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
    }

    Value * const m = b->fwCast(mPEXTWidth, selectors);

    std::vector<Value *> masks(mSwizzleFactor);
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        masks[i] = b->CreateExtractElement(m, i);
    }

    std::vector<std::vector<Value *>> swizzleSets;
    swizzleSets.reserve(mSwizzleSetCount);

    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);

    UndefValue * const outputInitializer = UndefValue::get(vecTy);

    std::vector<Value *> input(mSwizzleFactor);
    // For each of the k swizzle sets required to apply PEXT to all input streams
    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {

        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
            const unsigned k = (i * mSwizzleFactor) + j;
            if (k < mStreamCount) {
                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
            } else {
                input[j] = Constant::getNullValue(vecTy);
            }
        }

        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
        // then splat the pext mask and apply it to each output row

        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
        // For each of the input streams
        for (unsigned j = 0; j < mSwizzleFactor; j++) {
            for (unsigned k = 0; k < mSwizzleFactor; k++) {
                // Load block j,k
                Value * const field = b->CreateExtractElement(input[j], k);
                // Apply PEXT deletion
                Value * const selected = b->CreateCall(pext, {field, masks[k]});
                // Then store it as our k,j-th output
                output[k] = b->CreateInsertElement(output[k], selected, j);
            }
        }

        swizzleSets.emplace_back(output);
    }

    return swizzleSets;
}

// Apply deletion to a set of stream_count input streams to produce a set of output streams,
// using a PEXT operation on each PEXT_width field.
// Kernel inputs: stream_count data streams plus one del_mask stream
// Outputs: the deleted streams (each PEXT_width field compressed independently), plus a
// count of the retained units in each field.

void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
    generateProcessingLoop(kb, delMask);
}

void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
    generateProcessingLoop(kb, delMask);
}

void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
    Constant * PEXT_func = nullptr;
    if (mPEXTWidth == 64) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
    } else if (mPEXTWidth == 32) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
    }
    std::vector<Value *> masks(mSwizzleFactor);
    Value * const m = kb->fwCast(mPEXTWidth, kb->simd_not(delMask));
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        masks[i] = kb->CreateExtractElement(m, i);
    }

    for (unsigned i = 0; i < mStreamCount; ++i) {
        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
        Value * value = kb->fwCast(mPEXTWidth, input);
        Value * output = UndefValue::get(value->getType());
        for (unsigned j = 0; j < mSwizzleFactor; j++) {
            Value * field = kb->CreateExtractElement(value, j);
            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
            output = kb->CreateInsertElement(output, compressed, j);
        }
        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
    }
    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
}

DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
                  Binding{b->getStreamSetTy(), "delMaskSet"}},
              {}, {}, {}, {})
, mDelCountFieldWidth(fw)
, mStreamCount(streamCount)
, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
, mPEXTWidth(PEXT_width) {
    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
}

//
// This kernel performs final stream compression for a set of N bitstreams, given
// (a) a set of bitstreams partially compressed within K-bit fields and stored
//     in K-bit swizzled form, and
// (b) a stream of deletion/extraction counts per K-bit stride.
//
// Restrictions:  At present, only K=64 is supported.
//                At present, N must be an exact multiple of BLOCK_SIZE/K.
//
// The kernel always consumes full blocks of input and emits data into the output
// buffer in swizzles of K items at a time.   Upon completion of a segment,
// up to K-1 pending output items per stream may be stored in the kernel state.
//
// Note that both input streams and output streams are stored in swizzled form.
//
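// Illustrative example (K = 64): if pendingOffset is 50 and the next count is 30, then
// pendingSpace = 14, so 14 of the new items fill the current output swizzle (which is
// written out), the remaining 16 items become the new pending data, and pendingOffset
// becomes (50 + 30) mod 64 = 16.
//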

SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
, mBitStreamCount(bitStreamCount)
, mFieldWidth(fieldWidth)
, mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
, mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
    assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
    assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
    mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
    mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
    addScalar(kb->getBitBlockType(), "pendingSwizzleData0");
    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
        addScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    }
    addScalar(kb->getSizeTy(), "pendingOffset");
}

void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {

    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());

    // Output is written and committed to the output buffer one swizzle at a time.
    //
    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));

    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);

    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
    Value * pendingOffset = kb->getScalarField("pendingOffset");
    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    std::vector<Value *> outputStreamPtr;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
    }

    // Generate code for each of the mSwizzleFactor fields making up a block.
    // We load the count for the field and process all swizzle groups accordingly.
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
            // Combine as many of the new items as possible into the pending group.
            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, kb->simd_fill(mFieldWidth, pendingOffset)));
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }
        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
    }
    kb->setScalarField("pendingOffset", pendingOffset);
    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    }
    kb->setProducedItemCount("outputSwizzle0", produced);
}

void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
    CreateDoBlockMethodCall(kb);
    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));

    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
    Value * pendingOffset = kb->getScalarField("pendingOffset");

    // Write the pending data.
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
    }
    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
}
}