source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6035

Last change on this file since 6035 was 6035, checked in by xwa163, 10 months ago

PEXTFieldCompressKernel: fix bug in final block

File size: 43.3 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
15    Value * m = kb->simd_not(del_mask);
16    Value * mk = kb->simd_slli(fw, del_mask, 1);
17    std::vector<Value *> move_masks;
18    for (unsigned shift = 1; shift < fw; shift *= 2) {
19        Value * mp = mk;
20        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
21            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
22        }
23        Value * mv = kb->simd_and(mp, m);
24        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
25        mk = kb->simd_and(mk, kb->simd_not(mp));
26        move_masks.push_back(mv);
27    }
28    return move_masks;
29}
30
31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
32    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
33    for (unsigned i = 0; i < mv.size(); i++) {
34        unsigned shift = 1 << i;
35        Value * t = kb->simd_and(s, mv[i]);
36        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
37    }
38    return s;
39}
40
41// Apply deletion to a set of stream_count input streams to produce a set of output streams.
42// Kernel inputs: stream_count data streams plus one del_mask stream
43// Outputs: the deleted streams, plus a partial sum popcount
44
45void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
46    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
47    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
48    for (unsigned j = 0; j < mStreamCount; ++j) {
49        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
50        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
51        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
52    }
53    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
54    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
55}
56
57void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
58    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
59    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
60    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
61    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
62    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
63    for (unsigned j = 0; j < mStreamCount; ++j) {
64        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
65        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
66        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
67    }
68    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
69    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
70}
71
72DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
73: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
74                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
75                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
76                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
77                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
78                      {}, {}, {})
79, mDeletionFieldWidth(fieldWidth)
80, mStreamCount(streamCount) {
81}
82
83void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
84    BasicBlock * entry = kb->GetInsertBlock();
85    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
86    BasicBlock * done = kb->CreateBasicBlock("done");
87    Constant * const ZERO = kb->getSize(0);
88    kb->CreateBr(processBlock);
89    kb->SetInsertPoint(processBlock);
90    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
91    blockOffsetPhi->addIncoming(ZERO, entry);
92    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
93    Value * delMask = kb->simd_not(extractionMask);
94    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
95    for (unsigned j = 0; j < mStreamCount; ++j) {
96        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
97        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
98        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
99    }
100    Value * unitCount = kb->simd_popcount(mCompressFieldWidth, extractionMask);
101    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), blockOffsetPhi, kb->bitCast(unitCount));
102    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
103    blockOffsetPhi->addIncoming(nextBlk, processBlock);
104    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
105    kb->CreateCondBr(moreToDo, processBlock, done);
106    kb->SetInsertPoint(done);
107}
108
109FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
110: MultiBlockKernel("fieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
111                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
112                          Binding{kb->getStreamSetTy(), "extractionMask"}},
113                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
114                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
115                      {}, {}, {})
116, mCompressFieldWidth(fieldWidth)
117, mStreamCount(streamCount) {
118}
119
// Compress each stream of the input set using the x86 BMI2 PEXT instruction,
// one mPEXTWidth-bit field at a time.  For every field the popcount of the
// (possibly end-clamped) extraction mask is also stored to "unitCounts".
void PEXTFieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
    Type * fieldTy = kb->getIntNTy(mPEXTWidth);
    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
    Constant * fieldTyZero = kb->getIntN(mPEXTWidth, 0);
    // The constructor enforces mPEXTWidth == 32 or 64, so one of the branches
    // below always assigns PEXT_func.
    Constant * PEXT_func = nullptr;
    Constant * popc_func = Intrinsic::getDeclaration(getModule(), Intrinsic::ctpop, fieldTy);
    if (mPEXTWidth == 64) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
    } else if (mPEXTWidth == 32) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
    }
    BasicBlock * entry = kb->GetInsertBlock();
    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
    BasicBlock * done = kb->CreateBasicBlock("done");
    Constant * const ZERO = kb->getSize(0);
    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPEXTWidth;
    kb->CreateBr(processBlock);
    kb->SetInsertPoint(processBlock);
    // Runtime loop over the blocks of this stride.
    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    blockOffsetPhi->addIncoming(ZERO, entry);
    std::vector<Value *> mask(fieldsPerBlock);
    // Reinterpret the mask and count blocks as arrays of mPEXTWidth-bit fields.
    Value * extractionMaskPtr = kb->getInputStreamBlockPtr("extractionMask", ZERO, blockOffsetPhi);
    extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
    Value * unitCountPtr = kb->getOutputStreamBlockPtr("unitCounts", ZERO, blockOffsetPhi);
    unitCountPtr = kb->CreatePointerCast(unitCountPtr, fieldPtrTy);

    // NOTE(review): remainingMask is re-evaluated from getAvailableItemCount
    // on every runtime iteration of processBlock, but it is never reduced by
    // the items consumed in earlier blocks of the same call (blockOffsetPhi is
    // not used here).  If numOfBlocks > 1 and only the LAST block is partial,
    // the final-block clamping below may operate on the wrong remaining count.
    // TODO confirm against MultiBlockKernel's invocation contract.
    Value* remainingMask = kb->getAvailableItemCount("extractionMask");
    Value* PEXTWidth = kb->getSize(mPEXTWidth);
    for (unsigned i = 0; i < fieldsPerBlock; i++) {
        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
        // If fewer than mPEXTWidth items remain, shift the mask so the
        // popcount only sees the valid (low-order) positions.
        Value* hasFullMask = kb->CreateICmpUGE(remainingMask, PEXTWidth);
        Value* shiftAmount = kb->CreateSub(PEXTWidth, remainingMask);
        Value * targetMask = kb->CreateSelect(
                hasFullMask,
                mask[i],
                kb->CreateShl(mask[i], shiftAmount)
        );
        // Past the end of input, the count contribution is forced to zero.
        targetMask = kb->CreateSelect(kb->CreateICmpEQ(remainingMask, kb->getSize(0)), fieldTyZero, targetMask);
        Value * popc = kb->CreateCall(popc_func, targetMask);
        kb->CreateStore(popc, kb->CreateGEP(unitCountPtr, kb->getInt32(i)));
        // Consume one field's worth of items, saturating at zero.
        remainingMask = kb->CreateSelect(
                hasFullMask,
                kb->CreateSub(remainingMask, kb->getSize(mPEXTWidth)),
                kb->getSize(0)
        );
    }
    // Apply PEXT with the (unclamped) mask fields to every stream.  Note that
    // PEXT leaves extracted bits contiguous in the low-order positions.
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
        for (unsigned i = 0; i < fieldsPerBlock; i++) {
            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
            Value * compressed = kb->CreateCall(PEXT_func, {field, mask[i]});
            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
        }
    }
    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
    blockOffsetPhi->addIncoming(nextBlk, processBlock);
    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
    kb->CreateCondBr(moreToDo, processBlock, done);
    kb->SetInsertPoint(done);
}
183
184PEXTFieldCompressKernel::PEXTFieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
185: MultiBlockKernel("PEXTfieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
186                   {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
187                       Binding{kb->getStreamSetTy(), "extractionMask"}},
188                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
189                       Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
190                   {}, {}, {})
191, mPEXTWidth(fieldWidth)
192, mStreamCount(streamCount) {
193    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PEXT width for PEXTFieldCompressKernel");
194}
195   
196StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
197: MultiBlockKernel("streamCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
198                   {Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet"},
199                       Binding{kb->getStreamSetTy(), "unitCounts"}},
200                   {Binding{kb->getStreamSetTy(streamCount), "compressedOutput", BoundedRate(0, 1)}},
201                   {}, {}, {})
202, mCompressedFieldWidth(fieldWidth)
203, mStreamCount(streamCount) {
204    addScalar(kb->getSizeTy(), "pendingItemCount");
205    for (unsigned i = 0; i < streamCount; i++) {
206        addScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
207    }
208
209}
210   
211void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
212    const unsigned fw = mCompressedFieldWidth;
213    Type * fwTy = b->getIntNTy(fw);
214    Type * sizeTy = b->getSizeTy();
215    const unsigned numFields = b->getBitBlockWidth()/fw;
216    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(fw));
217    Constant * oneSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, 1));
218    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
219    Constant * numFieldConst = ConstantInt::get(sizeTy, numFields);
220    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
221    Constant * bitBlockWidthConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
222    BasicBlock * entry = b->GetInsertBlock();
223    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
224    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
225    BasicBlock * finalWrite = b->CreateBasicBlock("finalWrite");
226    BasicBlock * updateProducedCount = b->CreateBasicBlock("updateProducedCount");
227    Constant * const ZERO = b->getSize(0);
228   
229    Value * pendingItemCount = b->getScalarField("pendingItemCount");
230    std::vector<Value *> pendingData(mStreamCount);
231    for (unsigned i = 0; i < mStreamCount; i++) {
232        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
233    }
234   
235    b->CreateBr(segmentLoop);
236    // Main Loop
237    b->SetInsertPoint(segmentLoop);
238    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
239    PHINode * outputBlockPhi = b->CreatePHI(b->getSizeTy(), 2);
240    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
241    PHINode * pendingDataPhi[mStreamCount];
242    blockOffsetPhi->addIncoming(ZERO, entry);
243    outputBlockPhi->addIncoming(ZERO, entry);
244    pendingItemsPhi->addIncoming(pendingItemCount, entry);
245    for (unsigned i = 0; i < mStreamCount; i++) {
246        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
247        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
248    }
249    Value * fieldPopCounts = b->loadInputStreamBlock("unitCounts", ZERO, blockOffsetPhi);
250    // For each field determine the (partial) sum popcount of all fields up to and
251    // including the current field.
252    Value * partialSum = fieldPopCounts;
253    for (unsigned i = 1; i < numFields; i *= 2) {
254        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
255    }
256    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
257    //
258    // Now determine for each source field the output offset of the first bit.
259    // Note that this depends on the number of pending bits.
260    //
261    Value * pendingOffset = b->CreateURem(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
262    Value * splatPending = b->simd_fill(fw, b->CreateZExtOrTrunc(pendingOffset, fwTy));
263    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
264    Value * offsets = b->simd_add(fw, b->mvmd_slli(fw, partialSum, 1), splatPending);
265    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
266   //
267    // Determine the relative field number for each output field.   Note that the total
268    // number of fields involved is numFields + 1.   However, the first field always
269    // be immediately combined into the current pending data field, so we calculate
270    // field numbers for all subsequent fields, (the fields that receive overflow bits).
271    Value * fieldNo = b->simd_srli(fw, b->simd_add(fw, partialSum, splatPending), std::log2(fw));
272  //
273    // Now process the input data block of each stream in the input stream set.
274    //
275    // First load all the stream set blocks and the pending data.
276    std::vector<Value *> sourceBlock(mStreamCount);
277    for (unsigned i = 0; i < mStreamCount; i++) {
278        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
279    }
280    // Now separate the bits of each field into ones that go into the current field
281    // and ones that go into the overflow field.   Extract the first field separately,
282    // and then shift and combine subsequent fields.
283    std::vector<Value *> pendingOutput(mStreamCount);
284    std::vector<Value *> outputFields(mStreamCount);
285    Value * backShift = b->simd_sub(fw, fwSplat, offsets);
286    for (unsigned i = 0; i < mStreamCount; i++) {
287        Value * currentFieldBits = b->simd_sllv(fw, sourceBlock[i], offsets);
288        Value * nextFieldBits = b->simd_srlv(fw, sourceBlock[i], backShift);
289        Value * firstField = b->mvmd_extract(fw, currentFieldBits, 0);
290        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
291        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
292        // shift back currentFieldBits to combine with nextFieldBits.
293        outputFields[i] = b->simd_or(b->mvmd_srli(fw, currentFieldBits, 1), nextFieldBits);
294    }
295    // Now combine forward all fields with the same field number.  This may require
296    // up to log2 numFields steps.
297    for (unsigned j = 1; j < numFields; j*=2) {
298        Value * select = b->simd_eq(fw, fieldNo, b->mvmd_slli(fw, fieldNo, j));
299        for (unsigned i = 0; i < mStreamCount; i++) {
300            Value * fields_fwd = b->mvmd_slli(fw, outputFields[i], j);
301            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
302       }
303    }
304    // Now compress the data fields, eliminating all but the last field from
305    // each run of consecutive field having the same field number as a subsequent field.
306    // But it may be that last field number is 0 which will compare equal to a 0 shifted in.
307    // So we add 1 to field numbers first.
308    Value * nonZeroFieldNo = b->simd_add(fw, fieldNo, oneSplat);
309    Value * eqNext = b->simd_eq(fw, nonZeroFieldNo, b->mvmd_srli(fw, nonZeroFieldNo, 1));
310    Value * compressMask = b->hsimd_signmask(fw, b->simd_not(eqNext));
311    for (unsigned i = 0; i < mStreamCount; i++) {
312        outputFields[i] = b->mvmd_compress(fw, outputFields[i], compressMask);
313    }
314    //
315    // Finally combine the pendingOutput and outputField data.
316    // (a) shift forward outputField data to fill the pendingOutput values.
317    // (b) shift back outputField data to clear data added to pendingOutput.
318    //
319    // However, we may need to increment pendingFieldIndex if we previously
320    // filled the field with the extracted firstField value.  The first
321    // value of the fieldNo vector will be 0 or 1.
322    // It is possible that pendingFieldIndex will reach the total number
323    // of fields held in register.  mvmd_sll may not handle this if it
324    // translates to an LLVM shl.
325    Value * increment = b->CreateZExtOrTrunc(b->mvmd_extract(fw, fieldNo, 0), sizeTy);
326    pendingFieldIdx = b->CreateAdd(pendingFieldIdx, increment);
327    Value * const pendingSpaceFilled = b->CreateICmpEQ(pendingFieldIdx, numFieldConst);
328    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
329    for (unsigned i = 0; i < mStreamCount; i++) {
330        Value * outputFwd = b->mvmd_sll(fw, outputFields[i], pendingFieldIdx);
331        outputFwd = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
332        pendingOutput[i] = b->simd_or(pendingOutput[i], outputFwd);
333        outputFields[i] = b->mvmd_srl(fw, outputFields[i], shftBack);
334    }
335    //
336    // Write the pendingOutput data to outputStream.
337    // Note: this data may be overwritten later, but we avoid branching.
338    for (unsigned i = 0; i < mStreamCount; i++) {
339        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
340    }
341    // Now determine the total amount of pending items and whether
342    // the pending data all fits within the pendingOutput.
343    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
344    Value * doesFit = b->CreateICmpULT(newPending, bitBlockWidthConst);
345    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, bitBlockWidthConst));
346    //
347    // Prepare Phi nodes for the next iteration.
348    //
349    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
350    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
351    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, b->getSize(1));
352    // But don't advance the output if all the data does fit into pendingOutput.
353    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
354    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
355    pendingItemsPhi->addIncoming(newPending, segmentLoop);
356
357    for (unsigned i = 0; i < mStreamCount; i++) {
358        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(fw, pendingOutput[i]), b->fwCast(fw, outputFields[i]));
359        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
360    }
361    //
362    // Now continue the loop if there are more blocks to process.
363    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
364    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);
365   
366    b->SetInsertPoint(segmentDone);
367    // Save kernel state.
368    b->setScalarField("pendingItemCount", newPending);
369    for (unsigned i = 0; i < mStreamCount; i++) {
370        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
371    }
372    b->CreateCondBr(mIsFinal, finalWrite, updateProducedCount);
373    b->SetInsertPoint(finalWrite);
374    for (unsigned i = 0; i < mStreamCount; i++) {
375        //Value * pending = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
376        Value * pending = b->bitCast(pendingOutput[i]);
377        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), nextOutputBlk, pending);
378    }
379    b->CreateBr(updateProducedCount);
380    b->SetInsertPoint(updateProducedCount);
381     Value * produced = b->getProducedItemCount("compressedOutput");
382    produced = b->CreateAdd(produced, b->CreateMul(nextOutputBlk, bitBlockWidthConst));
383    produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, newPending), produced);
384    b->setProducedItemCount("compressedOutput", produced);
385}
386
387SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
388: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
389                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
390                  {}, {}, {}, {})
391, mStreamCount(streamCount)
392, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
393// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
394// to prevent rounding errors.
395, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
396, mPEXTWidth(PEXT_width)
397{
398    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
399        && "mDelCountFieldWidth must be a power of 2");
400    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
401    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
402
403    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
404    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);
405
406    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
407    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
408    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
409        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
410        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
411    }
412    addScalar(b->getSizeTy(), "pendingOffset");
413}
414
415void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
416    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
417    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
418    generateProcessingLoop(b, delMask, false);
419}
420
421void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
422    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
423    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
424    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
425    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
426    generateProcessingLoop(b, delMask, true);
427}
428
429/*
430What this function does in pseudo code:
431for (mSwizzleFactor)
432    create a swizzle set containing mSwizzleFactor blocks
433    apply PEXT to each block in the swizzle set
434    store the swizzleSet in PEXTedSwizzleSets vector
435
436for (each swizzle row i)
437    for (each swizzle set j)
438        processes row i in swizzle set j
439        store output in pendingData[j]
440*/
441
// Compress the input streams via PEXT into swizzled output groups, tracking a
// partially filled PEXT-width field across calls in the "pendingSwizzleData_i"
// and "pendingOffset" scalars.  When `flush` is true (the final block) the
// pending data is force-written and the full produced count is reported.
void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {

    // selectors marks the positions we want to keep
    Value * const selectors = b->CreateNot(delMask);

    const auto swizzleSets = makeSwizzleSets(b, selectors);

    // Compress the PEXTedSwizzleSets.
    // Output is written and committed to the output buffer one swizzle at a time.
    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);

    // All output groups have the same count, so "outputSwizzle0" stands in
    // for all of them; pendingOffset adds back partially-written items that
    // were deliberately excluded from the reported count last call.
    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
    // Index of the PEXT-width field within the output block being filled.
    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);

    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
    }

    // Per-field popcounts of the selectors = items surviving in each field.
    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);

    // For each row i
    for (unsigned i = 0; i < mSwizzleFactor; i++) {

        // Generate code for each of the mSwizzleFactor fields making up a block.
        // We load the count for the field and process all swizzle groups accordingly.
        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);

        // Decompose the field index into (swizzle row, block) coordinates.
        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * const newItems = swizzleSets[j][i];
            // Combine as many of the new items as possible into the pending group.
            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }

        // Advance to the next output field only when the current one filled up
        // (ZExt of the i1 comparison yields 0 or 1).
        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);

        outputProduced = b->CreateAdd(outputProduced, newItemCount);
    }

    if (flush) { // in case we selected the overflow group on the final iteration
        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
        }
    } else {
        // Persist the pending state for the next call.
        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
        }
        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
        b->setScalarField("pendingOffset", pendingOffset);
        // unless this is our final stride, don't report partially written fields.
        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
    }

    b->setProducedItemCount("outputSwizzle0", outputProduced);
}
526
527/*
528Apply PEXT deletion to the blocks in strms and swizzle the result.
529
530Q: Why is it advantageous to swizzle the PEXTed streams?
531
532A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
533
534Example:
535Input:     11101101
536PEXT mask: 11110000
537Output:    00001110
538
539PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
540cleared. This has implications if we're working with multiple streams.
541
542For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
543(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
544Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
545Stream 2:   qrstuv00 wxy00000 z1000000 23456000
546Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
547Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
548
549If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
should be compressed by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
551time:
552
553The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
554
555Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
556Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
557Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
558Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
559
Now we can compress each 32-bit segment of swizzle 1 by 2, each 32-bit segment of swizzle 2 by 5, etc. Once we've completed the
561compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
562
563Args:
564    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
565    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
566        sequentially to PEXT a full block.)
567
568Returns:
569    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
570*/
571
572std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {
573
574    Constant * pext = nullptr;
575    if (mPEXTWidth == 64) {
576        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
577    } else if (mPEXTWidth == 32) {
578        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
579    }
580
581    Value * const m = b->fwCast(mPEXTWidth, selectors);
582
583    std::vector<Value *> masks(mSwizzleFactor);
584    for (unsigned i = 0; i < mSwizzleFactor; i++) {
585        masks[i] = b->CreateExtractElement(m, i);
586
587    }
588
589    std::vector<std::vector<Value *>> swizzleSets;
590    swizzleSets.reserve(mSwizzleSetCount);
591
592    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
593
594    UndefValue * const outputInitializer = UndefValue::get(vecTy);
595
596    std::vector<Value *> input(mSwizzleFactor);
597    // For each of the k swizzle sets required to apply PEXT to all input streams
598    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
599
600        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
601            const unsigned k = (i * mSwizzleFactor) + j;
602            if (k < mStreamCount) {
603                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
604            } else {
605                input[j] = Constant::getNullValue(vecTy);
606            }
607        }
608
609        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
610        // then splat the pext mask and apply it to each output row
611
612        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
613        // For each of the input streams
614        for (unsigned j = 0; j < mSwizzleFactor; j++) {
615            for (unsigned k = 0; k < mSwizzleFactor; k++) {
616                // Load block j,k
617                Value * const field = b->CreateExtractElement(input[j], k);
618                // Apply PEXT deletion
619                Value * const selected = b->CreateCall(pext, {field, masks[k]});
620                // Then store it as our k,j-th output
621                output[k] = b->CreateInsertElement(output[k], selected, j);
622            }
623        }
624
625        swizzleSets.emplace_back(output);
626    }
627
628    return swizzleSets;
629}
630
631// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
632// Kernel inputs: stream_count data streams plus one del_mask stream
633// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
634
635void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
636    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
637    generateProcessingLoop(kb, delMask);
638}
639
640void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
641    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
642    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
643    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
644    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
645    generateProcessingLoop(kb, delMask);
646}
647
648void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
649    Constant * PEXT_func = nullptr;
650    if (mPEXTWidth == 64) {
651        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
652    } else if (mPEXTWidth == 32) {
653        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
654    }
655    std::vector<Value *> masks(mSwizzleFactor);
656    Value * const m = kb->fwCast(mPEXTWidth, kb->simd_not(delMask));
657    for (unsigned i = 0; i < mSwizzleFactor; i++) {
658        masks[i] = kb->CreateExtractElement(m, i);
659    }
660
661    for (unsigned i = 0; i < mStreamCount; ++i) {
662        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
663        Value * value = kb->fwCast(mPEXTWidth, input);
664        Value * output = UndefValue::get(value->getType());
665        for (unsigned j = 0; j < mSwizzleFactor; j++) {
666            Value * field = kb->CreateExtractElement(value, j);
667            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
668            output = kb->CreateInsertElement(output, compressed, j);
669        }
670        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
671    }
672    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
673    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
674}
675
676DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
677: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
678              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
679                  Binding{b->getStreamSetTy(), "delMaskSet"}},
680              {}, {}, {}, {})
681, mDelCountFieldWidth(fw)
682, mStreamCount(streamCount)
683, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
684, mPEXTWidth(PEXT_width) {
685    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
686    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
687}
688
689   
690//
691// This kernel performs final stream compression for a set of N bitstreams, given
692// (a) a set of bitstreams partially compressed within K-bit fields and stored
693//     in K-bit swizzled form, and
694// (b) a stream of deletion/extraction counts per K-bit stride.
695//
696// Restrictions:  At present, only K=64 is supported.
697//                At present, N must be an exact multiple of BLOCK_SIZE/K.
698//
699// The kernel always consumes full blocks of input and emits data into the output
700// buffer in swizzles of K items at a time.   Upon completion of a segment,
701// up to K-1 pending output items per stream may be stored in the kernel state.
702//
// Note that both input streams and output streams are stored in swizzled form.
704//
705
706SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
707: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
708                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
709, mBitStreamCount(bitStreamCount)
710    , mFieldWidth(fieldWidth)
711    , mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
712    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
713        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
714        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
715        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
716        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
717        addScalar(kb->getBitBlockType(), "pendingSwizzleData0");
718        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
719            mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
720            mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
721            addScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
722        }
723        addScalar(kb->getSizeTy(), "pendingOffset");
724}
725   
// Streaming compaction: for each of the mSwizzleFactor fields of the block,
// append the (already PEXT-compressed) items of every swizzle group to the
// output, packing them against the items left over from the previous field.
// A partially filled output field is carried in the "pendingSwizzleData*"
// scalars between strides; only whole fields are reported as produced.
void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {

    // Reinterpret the counts block as an array of mFieldWidth-bit integers;
    // entry i is the item count for field i of this stride.
    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());

    // Output is written and committed to the output buffer one swizzle at a time.
    //
    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));

    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
    // Index of the (possibly partially filled) swizzle within the current output block.
    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);

    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
    Value * pendingOffset = kb->getScalarField("pendingOffset");
    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    std::vector<Value *> outputStreamPtr;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
    }

    // Generate code for each of the mSwizzleFactor fields making up a block.
    // We load the count for the field and process all swizzle groups accordingly.
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
        // Free bit positions remaining in the pending (partial) output field.
        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
            // Combine as many of the new items as possible into the pending group.
            // (simd_fill splats the scalar shift amount across every field lane.)
            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, kb->simd_fill(mFieldWidth, pendingOffset)));
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }
        // Advance to the next output swizzle position only when the field was completely filled.
        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
        // New pending offset: total item count modulo the field width.
        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
    }
    kb->setScalarField("pendingOffset", pendingOffset);
    // Report only the fully written fields as produced; the partial field stays pending.
    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    }
    kb->setProducedItemCount("outputSwizzle0", produced);
}
781
782void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
783    CreateDoBlockMethodCall(kb);
784    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
785    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
786   
787    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
788    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
789    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
790    Value * pendingOffset = kb->getScalarField("pendingOffset");
791
792    // Write the pending data.
793    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
794        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
795        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
796        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
797    }
798    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
799}
800}
Note: See TracBrowser for help on using the repository browser.