source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6200

Last change on this file since 6200 was 6200, checked in by cameron, 6 months ago

Modify stream compress kernel to use PopcountOf extraction mask

File size: 42.8 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
#include "deletion.h"
#include <toolchain/driver.h>
#include <toolchain/cpudriver.h>
#include <kernels/kernel_builder.h>
#include <IR_Gen/idisa_target.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/Support/raw_ostream.h>
#include <cmath>
#include <vector>
13
14using namespace llvm;
15
// Ceiling of the unsigned division n / m.
// Computed as quotient plus a remainder test rather than the common
// (n + m - 1) / m form, which silently wraps when n is close to SIZE_MAX.
// Precondition: m != 0.
inline size_t ceil_udiv(const size_t n, const size_t m) {
    return (n / m) + ((n % m) ? 1 : 0);
}
19
20namespace kernel {
21
// Compute the per-step "move masks" for deleting bits within each fw-bit
// field of a block.  del_mask has a 1 at every position to delete.  One mask
// is produced per power-of-2 shift step (log2(fw) steps total); the i-th mask
// marks the surviving bits that must move right by 2^i at that step.
// The structure matches the parallel-suffix "compress" technique (cf.
// Hacker's Delight, ch. 7) — TODO confirm against the IDISA simd_* semantics.
inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
    Value * m = kb->simd_not(del_mask);          // bits to keep
    Value * mk = kb->simd_slli(fw, del_mask, 1); // bits immediately left of a deleted bit
    std::vector<Value *> move_masks;
    for (unsigned shift = 1; shift < fw; shift *= 2) {
        Value * mp = mk;
        // Parallel suffix: propagate mk leftward across each fw-bit field.
        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
        }
        Value * mv = kb->simd_and(mp, m); // kept bits that move by `shift` this round
        // Remove the moving bits from m at their old position, re-insert shifted.
        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
        mk = kb->simd_and(mk, kb->simd_not(mp));
        move_masks.push_back(mv);
    }
    return move_masks;
}
38
39inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
40    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
41    for (unsigned i = 0; i < mv.size(); i++) {
42        unsigned shift = 1 << i;
43        Value * t = kb->simd_and(s, mv[i]);
44        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
45    }
46    return s;
47}
48
49// Apply deletion to a set of stream_count input streams to produce a set of output streams.
50// Kernel inputs: stream_count data streams plus one del_mask stream
51// Outputs: the deleted streams, plus a partial sum popcount
52
53void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
54    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
55    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
56    for (unsigned j = 0; j < mStreamCount; ++j) {
57        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
58        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
59        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
60    }
61    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
62    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
63}
64
65void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
66    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
67    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
68    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
69    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
70    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
71    for (unsigned j = 0; j < mStreamCount; ++j) {
72        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
73        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
74        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
75    }
76    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
77    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
78}
79
// Construct a deletion kernel over streamCount parallel bit streams, with
// deletion applied independently within each fieldWidth-bit field.
// Inputs: "inputStreamSet" (streamCount streams) plus a single-stream
// "delMaskSet".  Outputs: the deleted "outputStreamSet" and "unitCounts"
// (per-field popcount of surviving units), rounded up to a full bit block.
DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                      {}, {}, {})
, mDeletionFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
}
90
91void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
92    BasicBlock * entry = kb->GetInsertBlock();
93    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
94    BasicBlock * done = kb->CreateBasicBlock("done");
95    Constant * const ZERO = kb->getSize(0);
96    kb->CreateBr(processBlock);
97    kb->SetInsertPoint(processBlock);
98    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
99    blockOffsetPhi->addIncoming(ZERO, entry);
100    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
101    Value * delMask = kb->simd_not(extractionMask);
102    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
103    for (unsigned j = 0; j < mStreamCount; ++j) {
104        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
105        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
106        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
107    }
108    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
109    blockOffsetPhi->addIncoming(nextBlk, processBlock);
110    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
111    kb->CreateCondBr(moreToDo, processBlock, done);
112    kb->SetInsertPoint(done);
113}
114
// Construct a field-compress kernel: within each fw-bit field, the bits of
// the input streams selected by a 1 in "extractionMask" are compressed
// toward the low end of that field.  Bits are not packed across field
// boundaries (contrast with StreamCompressKernel below).
FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw
                                         , StreamSet * inputStreamSet, StreamSet * extractionMask
                                         , StreamSet * outputStreamSet)
: MultiBlockKernel("fieldCompress" + std::to_string(fw) + "_" + std::to_string(inputStreamSet->getNumElements()),
// inputs
{Binding{"inputStreamSet", inputStreamSet},
Binding{"extractionMask", extractionMask}},
// outputs
{Binding{"outputStreamSet", outputStreamSet}},
{}, {}, {})
, mCompressFieldWidth(fw)
, mStreamCount(inputStreamSet->getNumElements()) {

}
129
130void PEXTFieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
131    Type * fieldTy = kb->getIntNTy(mPEXTWidth);
132    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
133    Constant * PEXT_func = nullptr;
134    if (mPEXTWidth == 64) {
135        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
136    } else if (mPEXTWidth == 32) {
137        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
138    }
139    BasicBlock * entry = kb->GetInsertBlock();
140    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
141    BasicBlock * done = kb->CreateBasicBlock("done");
142    Constant * const ZERO = kb->getSize(0);
143    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPEXTWidth;
144    kb->CreateBr(processBlock);
145    kb->SetInsertPoint(processBlock);
146    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
147    blockOffsetPhi->addIncoming(ZERO, entry);
148    std::vector<Value *> mask(fieldsPerBlock);
149    Value * extractionMaskPtr = kb->getInputStreamBlockPtr("extractionMask", ZERO, blockOffsetPhi);
150    extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
151    for (unsigned i = 0; i < fieldsPerBlock; i++) {
152        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
153    }
154    for (unsigned j = 0; j < mStreamCount; ++j) {
155        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
156        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
157        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
158        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
159        for (unsigned i = 0; i < fieldsPerBlock; i++) {
160            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
161            Value * compressed = kb->CreateCall(PEXT_func, {field, mask[i]});
162            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
163        }
164    }
165    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
166    blockOffsetPhi->addIncoming(nextBlk, processBlock);
167    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
168    kb->CreateCondBr(moreToDo, processBlock, done);
169    kb->SetInsertPoint(done);
170}
171
// Construct a PEXT-based field-compress kernel.  Only field widths of 32 and
// 64 are supported, matching the x86 BMI2 PEXT intrinsics that
// generateMultiBlockLogic emits.
PEXTFieldCompressKernel::PEXTFieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("PEXTfieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                   {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                       Binding{kb->getStreamSetTy(), "extractionMask"}},
                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
                   {}, {}, {})
, mPEXTWidth(fieldWidth)
, mStreamCount(streamCount) {
    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PEXT width for PEXTFieldCompressKernel");
}
182   
// Construct a stream-compression kernel: bits of each source stream marked
// by a 1 in the extraction mask are packed contiguously into the output,
// across field and block boundaries.  The output rate is declared as the
// popcount of the extraction mask.
StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b
                                           , StreamSet * source
                                           , StreamSet * extractionMask
                                           , StreamSet * compressedOutput
                                           , const unsigned FieldWidth)
: MultiBlockKernel("streamCompress" + std::to_string(FieldWidth) + "_" + std::to_string(source->getNumElements()),
{Binding{"sourceStreamSet", source},
Binding{"extractionMask", extractionMask}},
{Binding{"compressedOutput", compressedOutput, PopcountOf("extractionMask")}},
{}, {}, {})
, mCompressedFieldWidth(FieldWidth)
, mStreamCount(source->getNumElements()) {
    Type * const fwTy = b->getIntNTy(mCompressedFieldWidth);
    // Number of output items accumulated but not yet flushed as a full block.
    addInternalScalar(fwTy, "pendingItemCount");
    // One partially-filled output block is buffered per stream between calls.
    for (unsigned i = 0; i < mStreamCount; i++) {
        addInternalScalar(b->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
    }

}
202   
203void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
204    IntegerType * const fwTy = b->getIntNTy(mCompressedFieldWidth);
205    IntegerType * const sizeTy = b->getSizeTy();
206    const unsigned numFields = b->getBitBlockWidth() / mCompressedFieldWidth;
207    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(mCompressedFieldWidth));
208    Constant * oneSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, 1));
209    Constant * CFW = ConstantInt::get(fwTy, mCompressedFieldWidth);
210    Constant * fwSplat = ConstantVector::getSplat(numFields, CFW);
211    Constant * numFieldConst = ConstantInt::get(fwTy, numFields);
212    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, mCompressedFieldWidth - 1));
213    BasicBlock * entry = b->GetInsertBlock();
214    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
215    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
216    BasicBlock * finalWrite = b->CreateBasicBlock("finalWrite");
217    BasicBlock * updateProducedCount = b->CreateBasicBlock("updateProducedCount");
218    Constant * const ZERO = ConstantInt::get(sizeTy, 0);
219    Constant * const ONE = ConstantInt::get(sizeTy, 1);
220
221
222    Value * pendingItemCount = b->getScalarField("pendingItemCount");
223    std::vector<Value *> pendingData(mStreamCount);
224    for (unsigned i = 0; i < mStreamCount; i++) {
225        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
226    }
227   
228    b->CreateBr(segmentLoop);
229    // Main Loop
230    b->SetInsertPoint(segmentLoop);
231    PHINode * blockOffsetPhi = b->CreatePHI(sizeTy, 2);
232    PHINode * outputBlockPhi = b->CreatePHI(sizeTy, 2);
233    PHINode * pendingItemsPhi = b->CreatePHI(fwTy, 2);
234    PHINode * pendingDataPhi[mStreamCount];
235    blockOffsetPhi->addIncoming(ZERO, entry);
236    outputBlockPhi->addIncoming(ZERO, entry);
237    pendingItemsPhi->addIncoming(pendingItemCount, entry);
238    for (unsigned i = 0; i < mStreamCount; i++) {
239        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
240        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
241    }
242    Value * fieldPopCounts = b->simd_popcount(mCompressedFieldWidth, b->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi));
243    // For each field determine the (partial) sum popcount of all fields up to and
244    // including the current field.
245    Value * partialSum = fieldPopCounts;
246    for (unsigned i = 1; i < numFields; i *= 2) {
247        partialSum = b->simd_add(mCompressedFieldWidth, partialSum, b->mvmd_slli(mCompressedFieldWidth, partialSum, i));
248    }
249    // Value * blockPopCount = b->CreateZExtOrTrunc(b->mvmd_extract(mCompressedFieldWidth, partialSum, numFields - 1), fwTy);
250
251    Value * blockPopCount = b->mvmd_extract(mCompressedFieldWidth, partialSum, numFields - 1);
252
253    //
254    // Now determine for each source field the output offset of the first bit.
255    // Note that this depends on the number of pending bits.
256    //
257    Value * pendingOffset = b->CreateURem(pendingItemsPhi, CFW);
258    Value * splatPending = b->simd_fill(mCompressedFieldWidth, b->CreateZExtOrTrunc(pendingOffset, fwTy));
259    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, CFW);
260    Value * offsets = b->simd_add(mCompressedFieldWidth, b->mvmd_slli(mCompressedFieldWidth, partialSum, 1), splatPending);
261    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
262   //
263    // Determine the relative field number for each output field.   Note that the total
264    // number of fields involved is numFields + 1.   However, the first field always
265    // be immediately combined into the current pending data field, so we calculate
266    // field numbers for all subsequent fields, (the fields that receive overflow bits).
267    Value * pendingSum = b->simd_add(mCompressedFieldWidth, partialSum, splatPending);
268    Value * fieldNo = b->simd_srli(mCompressedFieldWidth, pendingSum, std::log2(mCompressedFieldWidth));
269    // Now process the input data block of each stream in the input stream set.
270    //
271    // First load all the stream set blocks and the pending data.
272    std::vector<Value *> sourceBlock(mStreamCount);
273    for (unsigned i = 0; i < mStreamCount; i++) {
274        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
275    }
276    // Now separate the bits of each field into ones that go into the current field
277    // and ones that go into the overflow field.   Extract the first field separately,
278    // and then shift and combine subsequent fields.
279    std::vector<Value *> pendingOutput(mStreamCount);
280    std::vector<Value *> outputFields(mStreamCount);
281    Value * backShift = b->simd_sub(mCompressedFieldWidth, fwSplat, offsets);
282    for (unsigned i = 0; i < mStreamCount; i++) {
283        Value * currentFieldBits = b->simd_sllv(mCompressedFieldWidth, sourceBlock[i], offsets);
284        Value * nextFieldBits = b->simd_srlv(mCompressedFieldWidth, sourceBlock[i], backShift);
285        Value * firstField = b->mvmd_extract(mCompressedFieldWidth, currentFieldBits, 0);
286        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
287        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
288        // shift back currentFieldBits to combine with nextFieldBits.
289        outputFields[i] = b->simd_or(b->mvmd_srli(mCompressedFieldWidth, currentFieldBits, 1), nextFieldBits);
290    }
291    // Now combine forward all fields with the same field number.  This may require
292    // up to log2 numFields steps.
293    for (unsigned j = 1; j < numFields; j*=2) {
294        Value * select = b->simd_eq(mCompressedFieldWidth, fieldNo, b->mvmd_slli(mCompressedFieldWidth, fieldNo, j));
295        for (unsigned i = 0; i < mStreamCount; i++) {
296            Value * fields_fwd = b->mvmd_slli(mCompressedFieldWidth, outputFields[i], j);
297            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
298       }
299    }
300    // Now compress the data fields, eliminating all but the last field from
301    // each run of consecutive field having the same field number as a subsequent field.
302    // But it may be that last field number is 0 which will compare equal to a 0 shifted in.
303    // So we add 1 to field numbers first.
304    Value * nonZeroFieldNo = b->simd_add(mCompressedFieldWidth, fieldNo, oneSplat);
305    Value * eqNext = b->simd_eq(mCompressedFieldWidth, nonZeroFieldNo, b->mvmd_srli(mCompressedFieldWidth, nonZeroFieldNo, 1));
306    Value * compressMask = b->hsimd_signmask(mCompressedFieldWidth, b->simd_not(eqNext));
307    for (unsigned i = 0; i < mStreamCount; i++) {
308        outputFields[i] = b->mvmd_compress(mCompressedFieldWidth, outputFields[i], compressMask);
309    }
310    //
311    // Finally combine the pendingOutput and outputField data.
312    // (a) shift forward outputField data to fill the pendingOutput values.
313    // (b) shift back outputField data to clear data added to pendingOutput.
314    //
315    // However, we may need to increment pendingFieldIndex if we previously
316    // filled the field with the extracted firstField value.  The first
317    // value of the fieldNo vector will be 0 or 1.
318    // It is possible that pendingFieldIndex will reach the total number
319    // of fields held in register.  mvmd_sll may not handle this if it
320    // translates to an LLVM shl.
321    Value * increment = b->CreateZExtOrTrunc(b->mvmd_extract(mCompressedFieldWidth, fieldNo, 0), fwTy);
322    pendingFieldIdx = b->CreateAdd(pendingFieldIdx, increment);
323    Value * const pendingSpaceFilled = b->CreateICmpEQ(pendingFieldIdx, numFieldConst);
324    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
325    for (unsigned i = 0; i < mStreamCount; i++) {
326        Value * shiftedField = b->mvmd_sll(mCompressedFieldWidth, outputFields[i], pendingFieldIdx);
327        Value * outputFwd = b->fwCast(mCompressedFieldWidth, shiftedField);
328        shiftedField = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
329        pendingOutput[i] = b->simd_or(pendingOutput[i], shiftedField);
330        outputFields[i] = b->mvmd_srl(mCompressedFieldWidth, outputFields[i], shftBack);
331    }
332    //
333    // Write the pendingOutput data to outputStream.
334    // Note: this data may be overwritten later, but we avoid branching.
335    for (unsigned i = 0; i < mStreamCount; i++) {
336        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
337    }
338    // Now determine the total amount of pending items and whether
339    // the pending data all fits within the pendingOutput.
340    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
341    Constant * BLOCK_WIDTH = ConstantInt::get(fwTy, b->getBitBlockWidth());
342    Value * doesFit = b->CreateICmpULT(newPending, BLOCK_WIDTH);
343    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, BLOCK_WIDTH));
344    //
345    // Prepare Phi nodes for the next iteration.
346    //
347    Value * nextBlk = b->CreateAdd(blockOffsetPhi, ONE);
348    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
349    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, ONE);
350    // But don't advance the output if all the data does fit into pendingOutput.
351    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
352    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
353    pendingItemsPhi->addIncoming(newPending, segmentLoop);
354
355    for (unsigned i = 0; i < mStreamCount; i++) {
356        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(mCompressedFieldWidth, pendingOutput[i]), b->fwCast(mCompressedFieldWidth, outputFields[i]));
357        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
358    }
359    //
360    // Now continue the loop if there are more blocks to process.
361    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
362    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);
363   
364    b->SetInsertPoint(segmentDone);
365    // Save kernel state.
366    b->setScalarField("pendingItemCount", newPending);
367    for (unsigned i = 0; i < mStreamCount; i++) {
368        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
369    }
370    b->CreateCondBr(mIsFinal, finalWrite, updateProducedCount);
371
372    b->SetInsertPoint(finalWrite);
373    for (unsigned i = 0; i < mStreamCount; i++) {
374        Value * pending = b->bitCast(pendingOutput[i]);
375        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), nextOutputBlk, pending);
376    }
377    b->CreateBr(updateProducedCount);
378
379    b->SetInsertPoint(updateProducedCount);
380    //Value * produced = b->getProducedItemCount("compressedOutput");
381    //Value * const blockOffset = b->CreateMul(nextOutputBlk, b->getSize(b->getBitBlockWidth()));
382    //produced = b->CreateAdd(produced, blockOffset);
383    //newPending = b->CreateZExtOrTrunc(newPending, sizeTy);
384    //produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, newPending), produced);
385    //b->setProducedItemCount("compressedOutput", produced);
386}
387
388Bindings makeSwizzledDeleteByPEXTOutputBindings(const std::vector<StreamSet *> & outputStreamSets, const unsigned PEXTWidth) {
389    const auto n = outputStreamSets.size();
390    Bindings outputs;
391    outputs.reserve(n);
392    outputs.emplace_back("outputSwizzle0", outputStreamSets[0], PopcountOf("selectors"), BlockSize(PEXTWidth)); // PopcountOfNot("delMaskSet")
393    for (unsigned i = 1; i < n; ++i) {
394        outputs.emplace_back("outputSwizzle" + std::to_string(i), outputStreamSets[i], RateEqualTo("outputSwizzle0"), BlockSize(PEXTWidth));
395    }
396    return outputs;
397}
398
399SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b,
400                                                       StreamSet * selectors, StreamSet * inputStreamSet,
401                                                       const std::vector<StreamSet *> & outputStreamSets,
402                                                       const unsigned PEXTWidth)
403
404: MultiBlockKernel("PEXTdel" + std::to_string(PEXTWidth) + "_" + std::to_string(inputStreamSet->getNumElements()),
405{Binding{"selectors", selectors}, Binding{"inputStreamSet", inputStreamSet}},
406makeSwizzledDeleteByPEXTOutputBindings(outputStreamSets, PEXTWidth),
407{}, {}, {})
408, mStreamCount(inputStreamSet->getNumElements())
409, mSwizzleFactor(b->getBitBlockWidth() / PEXTWidth)
410, mSwizzleSetCount(ceil_udiv(mStreamCount, mSwizzleFactor))
411, mPEXTWidth(PEXTWidth) {
412
413    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0) && "mDelCountFieldWidth must be a power of 2");
414    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
415    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
416    assert (mSwizzleSetCount);
417    assert (outputStreamSets.size() == mSwizzleSetCount);
418    assert (outputStreamSets[0]->getNumElements() == mSwizzleFactor);
419
420    addInternalScalar(b->getBitBlockType(), "pendingSwizzleData0");
421    for (unsigned i = 1; i < outputStreamSets.size(); ++i) {
422        assert (outputStreamSets[i]->getNumElements() == mSwizzleFactor);
423        addInternalScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
424    }
425}
426
// Apply PEXT deletion (selected by the "selectors" stream) to every input
// stream and write the results in swizzled form.  Output items are packed
// contiguously: a partially filled PEXT-width group is carried across
// strides in the pendingSwizzleData scalars, and output is committed one
// swizzle at a time as groups fill.
void SwizzledDeleteByPEXTkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set

    BasicBlock * const entry = b->GetInsertBlock();
    BasicBlock * const beginLoop = b->CreateBasicBlock("beginLoop");

    ConstantInt * const ZERO = b->getSize(0);
    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);

    // All output groups have the same count.
    Value * const baseOutputProduced = b->getProducedItemCount("outputSwizzle0");
    Value * const baseProducedOffset = b->CreateAnd(baseOutputProduced, BLOCK_WIDTH_MASK);

    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData(mSwizzleSetCount);
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData[i] = b->getScalarField("pendingSwizzleData" + std::to_string(i));
    }
    b->CreateBr(beginLoop);

    b->SetInsertPoint(beginLoop);
    PHINode * const strideIndex = b->CreatePHI(numOfBlocks->getType(), 2);
    strideIndex->addIncoming(ZERO, entry);
    PHINode * const producedOffsetPhi = b->CreatePHI(numOfBlocks->getType(), 2);
    producedOffsetPhi->addIncoming(baseProducedOffset, entry);
    std::vector<PHINode *> pendingDataPhi(mSwizzleSetCount);
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingDataPhi[i] = b->CreatePHI(pendingData[i]->getType(), 2);
        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
        pendingData[i] = pendingDataPhi[i];
    }

    Value * const selectors = b->loadInputStreamBlock("selectors", strideIndex);

    const auto swizzleSets = makeSwizzleSets(b, selectors, strideIndex);

    // Per-field popcount of the selectors gives the number of items each
    // PEXT operation produced.
    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);

    // Compress the PEXTedSwizzleSets
    // Output is written and committed to the output buffer one swizzle at a time.
    Value * producedOffset = producedOffsetPhi;

    // For each row i
    for (unsigned i = 0; i < mSwizzleFactor; i++) {

        // Generate code for each of the mSwizzleFactor fields making up a block.
        // We load the count for the field and process all swizzle groups accordingly.
        Value * const pendingOffset = b->CreateAnd(producedOffset, PEXT_WIDTH_MASK);
        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);

        Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
        Value * const spaceVector = b->simd_fill(mPEXTWidth, pendingSpace);

        // Locate the destination: which swizzle element and which block.
        Value * const outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * const newItems = swizzleSets[j][i];
            // Combine as many of the new items as possible into the pending group.
            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            b->storeOutputStreamBlock("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset, combinedGroup);
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = b->CreateLShr(newItems, spaceVector);
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }
        producedOffset = b->CreateAdd(producedOffset, newItemCount);
    }

    BasicBlock * const finishedLoop = b->CreateBasicBlock("finishedLoop");
    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
    BasicBlock * const loopEndBlock = b->GetInsertBlock();
    strideIndex->addIncoming(nextStrideIndex, loopEndBlock);
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingDataPhi[i]->addIncoming(pendingData[i], loopEndBlock);
    }
    producedOffsetPhi->addIncoming(producedOffset, loopEndBlock);
    Value * const doneLoop = b->CreateICmpEQ(nextStrideIndex, numOfBlocks);

    b->CreateUnlikelyCondBr(doneLoop, finishedLoop, beginLoop);

    b->SetInsertPoint(finishedLoop);
    // Persist the partially-filled groups for the next segment.
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
    }
}
524
525/*
526Apply PEXT deletion to the blocks in strms and swizzle the result.
527
528Q: Why is it advantageous to swizzle the PEXTed streams?
529
530A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
531
532Example:
533Input:     11101101
534PEXT mask: 11110000
535Output:    00001110
536
537PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
538cleared. This has implications if we're working with multiple streams.
539
540For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
541(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
542Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
543Stream 2:   qrstuv00 wxy00000 z1000000 23456000
544Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
545Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
546
547If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
should be compressed by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
549time:
550
551The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
552
553Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
554Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
555Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
556Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
557
Now we can compress each 32-bit segment of swizzle 1 by 2, each 32-bit segment of swizzle 2 by 5, etc. Once we've completed the
559compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
560
561Args:
562    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
563    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
564        sequentially to PEXT a full block.)
565
566Returns:
567    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
568*/
569
570SwizzledDeleteByPEXTkernel::SwizzleSets SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors, Value * const strideIndex) {
571
572    Constant * pext = nullptr;
573    if (mPEXTWidth == 64) {
574        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
575    } else if (mPEXTWidth == 32) {
576        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
577    }
578
579    Value * const m = b->fwCast(mPEXTWidth, selectors);
580
581    std::vector<Value *> masks(mSwizzleFactor);
582    for (unsigned i = 0; i < mSwizzleFactor; i++) {
583        masks[i] = b->CreateExtractElement(m, i);
584
585    }
586
587    SwizzleSets swizzleSets;
588    swizzleSets.reserve(mSwizzleSetCount);
589
590    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
591
592    UndefValue * const outputInitializer = UndefValue::get(vecTy);
593
594    std::vector<Value *> input(mSwizzleFactor);
595    // For each of the k swizzle sets required to apply PEXT to all input streams
596    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
597
598        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
599            const unsigned k = (i * mSwizzleFactor) + j;
600            if (k < mStreamCount) {
601                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k), strideIndex), vecTy);
602            } else {
603                input[j] = Constant::getNullValue(vecTy);
604            }
605        }
606
607        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
608        // then splat the pext mask and apply it to each output row
609
610        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
611        // For each of the input streams
612        for (unsigned j = 0; j < mSwizzleFactor; j++) {
613            for (unsigned k = 0; k < mSwizzleFactor; k++) {
614                // Load block j,k
615                Value * const field = b->CreateExtractElement(input[j], k);
616                // Apply PEXT deletion
617                Value * const selected = b->CreateCall(pext, {field, masks[k]});
618                // Then store it as our k,j-th output
619                output[k] = b->CreateInsertElement(output[k], selected, j);
620            }
621        }
622        swizzleSets.emplace_back(output);
623    }
624
625    return swizzleSets;
626}
627
628
629// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
630// Kernel inputs: stream_count data streams plus one del_mask stream
631// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
632
633void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
634    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
635    generateProcessingLoop(kb, delMask);
636}
637
638void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
639    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
640    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
641    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
642    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
643    generateProcessingLoop(kb, delMask);
644}
645
646void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
647    Constant * PEXT_func = nullptr;
648    if (mPEXTWidth == 64) {
649        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
650    } else if (mPEXTWidth == 32) {
651        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
652    }
653    std::vector<Value *> masks(mSwizzleFactor);
654    Value * const m = kb->fwCast(mPEXTWidth, kb->simd_not(delMask));
655    for (unsigned i = 0; i < mSwizzleFactor; i++) {
656        masks[i] = kb->CreateExtractElement(m, i);
657    }
658
659    for (unsigned i = 0; i < mStreamCount; ++i) {
660        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
661        Value * value = kb->fwCast(mPEXTWidth, input);
662        Value * output = UndefValue::get(value->getType());
663        for (unsigned j = 0; j < mSwizzleFactor; j++) {
664            Value * field = kb->CreateExtractElement(value, j);
665            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
666            output = kb->CreateInsertElement(output, compressed, j);
667        }
668        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
669    }
670    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
671    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
672}
673
674DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
675: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
676              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
677                  Binding{b->getStreamSetTy(), "delMaskSet"}},
678              {}, {}, {}, {})
679, mDelCountFieldWidth(fw)
680, mStreamCount(streamCount)
681, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
682, mPEXTWidth(PEXT_width) {
683    mOutputStreamSets.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
684    mOutputStreamSets.emplace_back(b->getStreamSetTy(), "deletionCounts");
685}
686
687   
688//
689// This kernel performs final stream compression for a set of N bitstreams, given
690// (a) a set of bitstreams partially compressed within K-bit fields and stored
691//     in K-bit swizzled form, and
692// (b) a stream of deletion/extraction counts per K-bit stride.
693//
694// Restrictions:  At present, only K=64 is supported.
695//                At present, N must be an exact multiple of BLOCK_SIZE/K.
696//
697// The kernel always consumes full blocks of input and emits data into the output
698// buffer in swizzles of K items at a time.   Upon completion of a segment,
699// up to K-1 pending output items per stream may be stored in the kernel state.
700//
// Note that both input streams and output streams are stored in swizzled form.
702//
703
704SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
705: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
706                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
707, mBitStreamCount(bitStreamCount)
708, mFieldWidth(fieldWidth)
709, mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
710, mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
711    assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
712    assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
713    mInputStreamSets.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
714    mOutputStreamSets.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
715    addInternalScalar(kb->getBitBlockType(), "pendingSwizzleData0");
716    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
717        mInputStreamSets.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
718        mOutputStreamSets.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
719        addInternalScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
720    }
721    addInternalScalar(kb->getSizeTy(), "pendingOffset");
722}
723   
// Process one block of swizzled input.  For each of the mSwizzleFactor fields
// of the block, the per-stride count says how many low-order items of that
// field survive; those items are appended after any pending items carried in
// the kernel state, and full mFieldWidth-item groups are committed to the
// output buffer one swizzle at a time.
void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
       
    // View the counts block as an array of mFieldWidth-bit integers, one per field.
    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());
   
    // Output is written and committed to the output buffer one swizzle at a time.
    //
    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
   
    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    // Offset within the current output block, and the swizzle index within it.
    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);

    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
    Value * pendingOffset = kb->getScalarField("pendingOffset");
    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    std::vector<Value *> outputStreamPtr;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
    }
   
    // Generate code for each of the mSwizzleFactor fields making up a block.
    // We load the count for the field and process all swizzle groups accordingly.
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
        // Room left in the pending group; if the new items fill it, a group
        // completes this iteration.
        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);
       
        // pendingOffset splatted across every field, used as a shift amount.
        Value * const fieldWidths = kb->simd_fill(mFieldWidth, pendingOffset);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
            // Combine as many of the new items as possible into the pending group.
            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, fieldWidths));
            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
            // (If it was not full, a later iteration re-stores an updated value at the same index.)
            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
            // Any items in excess of the space available in the current pending group overflow for the next group.
            // NOTE(review): when pendingOffset == 0 and a field's count equals
            // mFieldWidth, this shifts by the full field width — confirm the
            // counts stream cannot produce that combination, or that the
            // target's shift semantics make it benign.
            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }
        // Advance to the next swizzle slot only when a group was completed.
        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
        // New pending length, modulo the field width.
        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
    }
    // Persist the carry state for the next block.
    kb->setScalarField("pendingOffset", pendingOffset);
//    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
//    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    }
//    kb->setProducedItemCount("outputSwizzle0", produced);
}
781
782void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
783    CreateDoBlockMethodCall(kb);
784    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
785    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
786   
787    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
788    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
789    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
790//    Value * pendingOffset = kb->getScalarField("pendingOffset");
791
792    // Write the pending data.
793    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
794        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
795        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
796        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
797    }
798//    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
799}
800
801}
Note: See TracBrowser for help on using the repository browser.