source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6018

Last change on this file since 6018 was 6018, checked in by cameron, 12 months ago

PEXTFieldCompressKernel

/*
 *  Copyright (c) 2018 International Characters.
 *  This software is licensed to the public under the Open Software License 3.0.
 */

#include "deletion.h"
#include <kernels/kernel_builder.h>
#include <llvm/Support/raw_ostream.h>

using namespace llvm;

namespace kernel {

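// The two helpers below implement bit deletion (stream compression) within fw-bit fields
// using the parallel-prefix "compress" technique: parallel_prefix_deletion_masks precomputes
// log2(fw) movement masks from the deletion mask, and apply_parallel_prefix_deletion uses
// those masks to shift the surviving bits of a stream toward the low end of each field.
// Within each field the net effect is equivalent to PEXT(strm, ~del_mask).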
inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
    Value * m = kb->simd_not(del_mask);
    Value * mk = kb->simd_slli(fw, del_mask, 1);
    std::vector<Value *> move_masks;
    for (unsigned shift = 1; shift < fw; shift *= 2) {
        Value * mp = mk;
        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
        }
        Value * mv = kb->simd_and(mp, m);
        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
        mk = kb->simd_and(mk, kb->simd_not(mp));
        move_masks.push_back(mv);
    }
    return move_masks;
}

inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
    for (unsigned i = 0; i < mv.size(); i++) {
        unsigned shift = 1 << i;
        Value * t = kb->simd_and(s, mv[i]);
        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
    }
    return s;
}

// Apply deletion to a set of stream_count input streams to produce a set of output streams.
// Kernel inputs: stream_count data streams plus one del_mask stream
// Outputs: the deleted streams, plus a partial sum popcount
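//
// For example, with fw = 8, del_mask = 0b00110100 and an input field 0b10111010,
// the five bits at positions where del_mask is 0 are packed toward the low end of
// the field, giving 0b00010110 -- the same result PEXT would produce with the
// mask ~del_mask = 0b11001011.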

void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
    }
    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
}

void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
    }
    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
}

DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                      {}, {}, {})
, mDeletionFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
}

void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
    BasicBlock * entry = kb->GetInsertBlock();
    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
    BasicBlock * done = kb->CreateBasicBlock("done");
    Constant * const ZERO = kb->getSize(0);
    kb->CreateBr(processBlock);
    kb->SetInsertPoint(processBlock);
    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    blockOffsetPhi->addIncoming(ZERO, entry);
    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
    Value * delMask = kb->simd_not(extractionMask);
    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
    }
    Value * unitCount = kb->simd_popcount(mCompressFieldWidth, extractionMask);
    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), blockOffsetPhi, kb->bitCast(unitCount));
    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
    blockOffsetPhi->addIncoming(nextBlk, processBlock);
    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
    kb->CreateCondBr(moreToDo, processBlock, done);
    kb->SetInsertPoint(done);
}

FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("fieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                          Binding{kb->getStreamSetTy(), "extractionMask"}},
                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                      {}, {}, {})
, mCompressFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
}

void PEXTFieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
    Type * fieldTy = kb->getIntNTy(mPEXTWidth);
    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
    Constant * PEXT_func = nullptr;
    Constant * popc_func = Intrinsic::getDeclaration(getModule(), Intrinsic::ctpop, fieldTy);
    if (mPEXTWidth == 64) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
    } else if (mPEXTWidth == 32) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
    }
    BasicBlock * entry = kb->GetInsertBlock();
    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
    BasicBlock * done = kb->CreateBasicBlock("done");
    Constant * const ZERO = kb->getSize(0);
    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPEXTWidth;
    kb->CreateBr(processBlock);
    kb->SetInsertPoint(processBlock);
    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    blockOffsetPhi->addIncoming(ZERO, entry);
    std::vector<Value *> mask(fieldsPerBlock);
    Value * extractionMaskPtr = kb->getInputStreamBlockPtr("extractionMask", ZERO, blockOffsetPhi);
    extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
    Value * unitCountPtr = kb->getOutputStreamBlockPtr("unitCounts", ZERO, blockOffsetPhi);
    unitCountPtr = kb->CreatePointerCast(unitCountPtr, fieldPtrTy);
    for (unsigned i = 0; i < fieldsPerBlock; i++) {
        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
        Value * popc = kb->CreateCall(popc_func, mask[i]);
        kb->CreateStore(popc, kb->CreateGEP(unitCountPtr, kb->getInt32(i)));
    }
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
        for (unsigned i = 0; i < fieldsPerBlock; i++) {
            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
            Value * compressed = kb->CreateCall(PEXT_func, {field, mask[i]});
            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
        }
    }
    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
    blockOffsetPhi->addIncoming(nextBlk, processBlock);
    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
    kb->CreateCondBr(moreToDo, processBlock, done);
    kb->SetInsertPoint(done);
}

PEXTFieldCompressKernel::PEXTFieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("PEXTfieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                   {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                       Binding{kb->getStreamSetTy(), "extractionMask"}},
                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                       Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                   {}, {}, {})
, mPEXTWidth(fieldWidth)
, mStreamCount(streamCount) {
    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PEXT width for PEXTFieldCompressKernel");
}

StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("streamCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                   {Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet"},
                       Binding{kb->getStreamSetTy(), "unitCounts"}},
                   {Binding{kb->getStreamSetTy(streamCount), "compressedOutput", BoundedRate(0, 1)}},
                   {}, {}, {})
, mCompressedFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
    addScalar(kb->getSizeTy(), "pendingItemCount");
    for (unsigned i = 0; i < streamCount; i++) {
        addScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
    }
}

void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
    const unsigned fw = mCompressedFieldWidth;
    Type * fwTy = b->getIntNTy(fw);
    Type * sizeTy = b->getSizeTy();
    const unsigned numFields = b->getBitBlockWidth()/fw;
    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(fw));
    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
    Constant * numFieldConst = ConstantInt::get(sizeTy, numFields);
    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
    Constant * bitBlockWidthConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
    BasicBlock * entry = b->GetInsertBlock();
    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
    BasicBlock * finalWrite = b->CreateBasicBlock("finalWrite");
    BasicBlock * updateProducedCount = b->CreateBasicBlock("updateProducedCount");
    Constant * const ZERO = b->getSize(0);

    Value * pendingItemCount = b->getScalarField("pendingItemCount");
    std::vector<Value *> pendingData(mStreamCount);
    for (unsigned i = 0; i < mStreamCount; i++) {
        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
    }

    b->CreateBr(segmentLoop);
    // Main Loop
    b->SetInsertPoint(segmentLoop);
    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
    PHINode * outputBlockPhi = b->CreatePHI(b->getSizeTy(), 2);
    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
    std::vector<PHINode *> pendingDataPhi(mStreamCount);
    blockOffsetPhi->addIncoming(ZERO, entry);
    outputBlockPhi->addIncoming(ZERO, entry);
    pendingItemsPhi->addIncoming(pendingItemCount, entry);
    for (unsigned i = 0; i < mStreamCount; i++) {
        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
    }
    Value * fieldPopCounts = b->loadInputStreamBlock("unitCounts", ZERO, blockOffsetPhi);
    // For each field determine the (partial) sum popcount of all fields up to and
    // including the current field.
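    // For example (numFields = 4), if the per-field popcounts are [3, 1, 2, 4]
    // (lowest-numbered field first), the shift-and-add steps below produce the
    // inclusive prefix sums [3, 4, 6, 10]; the last field (10) is the total
    // popcount of the block, extracted as blockPopCount.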
    Value * partialSum = fieldPopCounts;
    for (unsigned i = 1; i < numFields; i *= 2) {
        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
    }
    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
    //
    // Now determine for each source field the output offset of the first bit.
    // Note that this depends on the number of pending bits.
    //
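    // For example, with fw = 64 and pendingItemsPhi = 70: pendingOffset = 70 % 64 = 6
    // and pendingFieldIdx = 70 / 64 = 1.  Field k of offsets is then
    // (exclusive prefix popcount of fields 0..k-1 + 6) mod 64, the bit position within
    // its destination field at which the data of source field k begins.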
    Value * pendingOffset = b->CreateURem(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
    Value * splatPending = b->simd_fill(fw, b->CreateZExtOrTrunc(pendingOffset, fwTy));
    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
    Value * offsets = b->simd_add(fw, b->mvmd_slli(fw, partialSum, 1), splatPending);
    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
    //
    // Determine the relative field number for each output field.  Note that the total
    // number of fields involved is numFields + 1.  However, the first field will always
    // be immediately combined into the current pending data field, so we calculate
    // field numbers for all subsequent fields (the fields that receive overflow bits).
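    // For example, continuing with fw = 64 and pendingOffset = 6: a source field whose
    // inclusive prefix popcount is 100 gets fieldNo = (100 + 6) >> log2(64) = 1,
    // identifying (per the comment above) the output field, relative to the current
    // pending field, that receives its overflow bits.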
    Value * fieldNo = b->simd_srli(fw, b->simd_add(fw, partialSum, splatPending), std::log2(fw));
    //
    // Now process the input data block of each stream in the input stream set.
    //
    // First load all the stream set blocks and the pending data.
    std::vector<Value *> sourceBlock(mStreamCount);
    for (unsigned i = 0; i < mStreamCount; i++) {
        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
    }
    // Now separate the bits of each field into ones that go into the current field
    // and ones that go into the overflow field.   Extract the first field separately,
    // and then shift and combine subsequent fields.
    std::vector<Value *> pendingOutput(mStreamCount);
    std::vector<Value *> outputFields(mStreamCount);
    Value * backShift = b->simd_sub(fw, fwSplat, offsets);
    for (unsigned i = 0; i < mStreamCount; i++) {
        Value * currentFieldBits = b->simd_sllv(fw, sourceBlock[i], offsets);
        Value * nextFieldBits = b->simd_srlv(fw, sourceBlock[i], backShift);
        Value * firstField = b->mvmd_extract(fw, currentFieldBits, 0);
        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
        // shift back currentFieldBits to combine with nextFieldBits.
        outputFields[i] = b->simd_or(b->mvmd_srli(fw, currentFieldBits, 1), nextFieldBits);
    }
    // Now combine forward all fields with the same field number.  This may require
    // up to log2 numFields steps.
    for (unsigned j = 1; j < numFields; j*=2) {
        Value * select = b->simd_eq(fw, fieldNo, b->mvmd_slli(fw, fieldNo, j));
        for (unsigned i = 0; i < mStreamCount; i++) {
            Value * fields_fwd = b->mvmd_slli(fw, outputFields[i], j);
            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
        }
    }
    // Now compress the data fields, eliminating all but the last field from
    // each run of consecutive fields having the same field number.
    Value * eqNext = b->simd_eq(fw, fieldNo, b->mvmd_srli(fw, fieldNo, 1));
    Value * compressMask = b->hsimd_signmask(fw, b->simd_not(eqNext));
    for (unsigned i = 0; i < mStreamCount; i++) {
        outputFields[i] = b->mvmd_compress(fw, outputFields[i], compressMask);
    }
    //
    // Finally combine the pendingOutput and outputField data.
    // (a) shift forward outputField data to fill the pendingOutput values.
    // (b) shift back outputField data to clear data added to pendingOutput.
    //
    // However, we may need to increment pendingFieldIdx if we previously
    // filled the field with the extracted firstField value.  The first
    // value of the fieldNo vector will be 0 or 1.
    // It is possible that pendingFieldIdx will reach the total number
    // of fields held in register.  mvmd_sll may not handle this if it
    // translates to an LLVM shl.
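    // For example, with a 256-bit block and fw = 64 (numFields = 4): if pendingFieldIdx
    // was 3 and the first source field fills the remaining space of the pending field,
    // the increment below takes pendingFieldIdx to 4 == numFields, and the full-width
    // shift is sidestepped by selecting zeroSplat via pendingSpaceFilled.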
    Value * increment = b->CreateZExtOrTrunc(b->mvmd_extract(fw, fieldNo, 0), sizeTy);
    pendingFieldIdx = b->CreateAdd(pendingFieldIdx, increment);
    Value * const pendingSpaceFilled = b->CreateICmpEQ(pendingFieldIdx, numFieldConst);
    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
    for (unsigned i = 0; i < mStreamCount; i++) {
        Value * outputFwd = b->mvmd_sll(fw, outputFields[i], pendingFieldIdx);
        outputFwd = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
        pendingOutput[i] = b->simd_or(pendingOutput[i], outputFwd);
        outputFields[i] = b->mvmd_srl(fw, outputFields[i], shftBack);
    }
    //
    // Write the pendingOutput data to outputStream.
    // Note: this data may be overwritten later, but we avoid branching.
    for (unsigned i = 0; i < mStreamCount; i++) {
        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
    }
    // Now determine the total amount of pending items and whether
    // the pending data all fits within the pendingOutput.
    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
    Value * doesFit = b->CreateICmpULT(newPending, bitBlockWidthConst);
    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, bitBlockWidthConst));
    //
    // Prepare Phi nodes for the next iteration.
    //
    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, b->getSize(1));
    // But don't advance the output if all the data does fit into pendingOutput.
    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
    pendingItemsPhi->addIncoming(newPending, segmentLoop);

    for (unsigned i = 0; i < mStreamCount; i++) {
        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(fw, pendingOutput[i]), b->fwCast(fw, outputFields[i]));
        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
    }
    //
    // Now continue the loop if there are more blocks to process.
    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);

    b->SetInsertPoint(segmentDone);
    // Save kernel state.
    b->setScalarField("pendingItemCount", newPending);
    for (unsigned i = 0; i < mStreamCount; i++) {
        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
    }
    b->CreateCondBr(mIsFinal, finalWrite, updateProducedCount);
    b->SetInsertPoint(finalWrite);
    for (unsigned i = 0; i < mStreamCount; i++) {
        //Value * pending = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
        Value * pending = b->bitCast(pendingOutput[i]);
        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), nextOutputBlk, pending);
    }
    b->CreateBr(updateProducedCount);
    b->SetInsertPoint(updateProducedCount);
    Value * produced = b->getProducedItemCount("compressedOutput");
    produced = b->CreateAdd(produced, b->CreateMul(nextOutputBlk, bitBlockWidthConst));
    produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, newPending), produced);
    b->setProducedItemCount("compressedOutput", produced);
}

SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
                  {}, {}, {}, {})
, mStreamCount(streamCount)
, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
// to prevent rounding errors.
, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
, mPEXTWidth(PEXT_width)
{
    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
        && "mPEXTWidth must be a power of 2");
    assert(mSwizzleFactor > 1 && "mPEXTWidth must be less than the block width");
    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");

    // why, if we have 1 input stream, are there n output swizzle streams rather than 1 of n?
    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);

    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    }
    addScalar(b->getSizeTy(), "pendingOffset");
}

void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
    generateProcessingLoop(b, delMask, false);
}

void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
    generateProcessingLoop(b, delMask, true);
}

/*
What this function does in pseudo code:
for (each of the mSwizzleSetCount swizzle sets)
    create a swizzle set containing mSwizzleFactor blocks
    apply PEXT to each block in the swizzle set
    store the swizzle set in the PEXTedSwizzleSets vector

for (each swizzle row i)
    for (each swizzle set j)
        process row i in swizzle set j
        store the output in pendingData[j]
*/
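// As a worked example of the index arithmetic below (assuming a 256-bit block and
// mPEXTWidth = 64, so mSwizzleFactor = 4): if outputProduced plus the pending offset
// is 420, then producedOffset = 420 & 255 = 164, outputIndex = 164 >> 6 = 2,
// swizzleIndex = 2 & 3 = 2 and blockOffset = 2 >> 2 = 0.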

void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {

    // selectors marks the positions we want to keep
    Value * const selectors = b->CreateNot(delMask);

    const auto swizzleSets = makeSwizzleSets(b, selectors);

    // Compress the PEXTed swizzle sets.
    // Output is written and committed to the output buffer one swizzle at a time.
    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);

    // All output groups have the same count.
    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);

    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
    }

    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);

    // For each row i
    for (unsigned i = 0; i < mSwizzleFactor; i++) {

        // Generate code for each of the mSwizzleFactor fields making up a block.
        // We load the count for the field and process all swizzle groups accordingly.
        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);

        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * const newItems = swizzleSets[j][i];
            // Combine as many of the new items as possible into the pending group.
            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }

        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);

        outputProduced = b->CreateAdd(outputProduced, newItemCount);
    }

    if (flush) { // in case we selected the overflow group on the final iteration
        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
        }
    } else {
        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
        }
        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
        b->setScalarField("pendingOffset", pendingOffset);
        // Unless this is our final stride, don't report partially written fields.
        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
    }

    b->setProducedItemCount("outputSwizzle0", outputProduced);
}

/*
Apply PEXT deletion to the blocks of the input stream set and swizzle the result.

Q: Why is it advantageous to swizzle the PEXTed streams?

A: PEXT doesn't compress streams: if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.

Example:
Input:     11101101
PEXT mask: 11110000
Output:    00001110

PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
cleared. This has implications if we're working with multiple streams.

For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
Stream 2:   qrstuv00 wxy00000 z1000000 23456000
Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000

If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
should be compressed by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
time:

The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).

Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
Swizzle 4:  lmnop000 23456000 LMNOP000 23456000

Now we can compress each 32-bit segment of swizzle 1 by 2, each 32-bit segment of swizzle 2 by 5, etc. Once we've completed the
compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!

Args:
    selectors: the mask of bit positions to keep (the inverse of the deletion mask). It is broken into
        PEXT-width pieces that are applied, field by field, to each block of the input stream set.

Returns:
    a vector of swizzle sets (vectors of Value*): the swizzled, PEXTed version of the input stream blocks. See example above.
*/

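// For reference, the scalar model of the per-field operation used below is the BMI2
// PEXT instruction (illustrative sketch only; the kernel emits the LLVM
// x86_bmi_pext_{32,64} intrinsics directly):
//
//     #include <immintrin.h>
//
//     // Keep the bits of 'field' selected by 'mask', packed at the low end (requires BMI2).
//     inline unsigned long long compress_field(unsigned long long field, unsigned long long mask) {
//         return _pext_u64(field, mask);
//     }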
std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {

    Constant * pext = nullptr;
    if (mPEXTWidth == 64) {
        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
    } else if (mPEXTWidth == 32) {
        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
    }

    Value * const m = b->fwCast(mPEXTWidth, selectors);

    std::vector<Value *> masks(mSwizzleFactor);
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        masks[i] = b->CreateExtractElement(m, i);
    }

    std::vector<std::vector<Value *>> swizzleSets;
    swizzleSets.reserve(mSwizzleSetCount);

    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);

    UndefValue * const outputInitializer = UndefValue::get(vecTy);

    std::vector<Value *> input(mSwizzleFactor);
    // For each of the k swizzle sets required to apply PEXT to all input streams
    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {

        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
            const unsigned k = (i * mSwizzleFactor) + j;
            if (k < mStreamCount) {
                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
            } else {
                input[j] = Constant::getNullValue(vecTy);
            }
        }

        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
        // then splat the pext mask and apply it to each output row

        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
        // For each of the input streams
        for (unsigned j = 0; j < mSwizzleFactor; j++) {
            for (unsigned k = 0; k < mSwizzleFactor; k++) {
                // Load block j,k
                Value * const field = b->CreateExtractElement(input[j], k);
                // Apply PEXT deletion
                Value * const selected = b->CreateCall(pext, {field, masks[k]});
                // Then store it as our k,j-th output
                output[k] = b->CreateInsertElement(output[k], selected, j);
            }
        }

        swizzleSets.emplace_back(output);
    }

    return swizzleSets;
}

// Apply PEXT-based deletion to a set of stream_count input streams to produce a set of output streams.
// Kernel inputs: stream_count data streams plus one del_mask stream
// Outputs: the processed streams, with the surviving bits of each PEXT-width field packed at the
// low end of that field, plus per-field counts of the retained bits ("deletionCounts").
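// For example (using the 8-bit illustration from the swizzle discussion above):
// PEXT(0b11101101, 0b11110000) = 0b00001110 -- within each PEXT-width field the
// selected bits are packed at the low end and the high-order bits are cleared.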

void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
    generateProcessingLoop(kb, delMask);
}

void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
    generateProcessingLoop(kb, delMask);
}

void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
    Constant * PEXT_func = nullptr;
    if (mPEXTWidth == 64) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
    } else if (mPEXTWidth == 32) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
    }
    std::vector<Value *> masks(mSwizzleFactor);
    Value * const m = kb->fwCast(mPEXTWidth, kb->simd_not(delMask));
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        masks[i] = kb->CreateExtractElement(m, i);
    }

    for (unsigned i = 0; i < mStreamCount; ++i) {
        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
        Value * value = kb->fwCast(mPEXTWidth, input);
        Value * output = UndefValue::get(value->getType());
        for (unsigned j = 0; j < mSwizzleFactor; j++) {
            Value * field = kb->CreateExtractElement(value, j);
            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
            output = kb->CreateInsertElement(output, compressed, j);
        }
        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
    }
    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
}

DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
                  Binding{b->getStreamSetTy(), "delMaskSet"}},
              {}, {}, {}, {})
, mDelCountFieldWidth(fw)
, mStreamCount(streamCount)
, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
, mPEXTWidth(PEXT_width) {
    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
}

//
// This kernel performs final stream compression for a set of N bitstreams, given
// (a) a set of bitstreams partially compressed within K-bit fields and stored
//     in K-bit swizzled form, and
// (b) a stream of deletion/extraction counts per K-bit stride.
//
// Restrictions:  At present, only K=64 is supported.
//                At present, N must be an exact multiple of BLOCK_SIZE/K.
//
// The kernel always consumes full blocks of input and emits data into the output
// buffer in swizzles of K items at a time.   Upon completion of a segment,
// up to K-1 pending output items per stream may be stored in the kernel state.
//
// Note that both input streams and output streams are stored in swizzled form.
//
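// For example, with K = 64: if pendingOffset is 50 and a stride's count is 30, the
// first 14 new items fill the pending swizzle (which is written out), the remaining
// 16 items become the new pending data, and pendingOffset advances to
// (50 + 30) mod 64 = 16.
//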

SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
, mBitStreamCount(bitStreamCount)
, mFieldWidth(fieldWidth)
, mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
, mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
    assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
    assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
    mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
    mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
    addScalar(kb->getBitBlockType(), "pendingSwizzleData0");
    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
        addScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    }
    addScalar(kb->getSizeTy(), "pendingOffset");
}

void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {

    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());

    // Output is written and committed to the output buffer one swizzle at a time.
    //
    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));

    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);

    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
    Value * pendingOffset = kb->getScalarField("pendingOffset");
    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    std::vector<Value *> outputStreamPtr;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
    }

    // Generate code for each of the mSwizzleFactor fields making up a block.
    // We load the count for the field and process all swizzle groups accordingly.
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
            // Combine as many of the new items as possible into the pending group.
            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, kb->simd_fill(mFieldWidth, pendingOffset)));
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }
        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
    }
    kb->setScalarField("pendingOffset", pendingOffset);
    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    }
    kb->setProducedItemCount("outputSwizzle0", produced);
}

void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
    CreateDoBlockMethodCall(kb);
    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));

    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
    Value * pendingOffset = kb->getScalarField("pendingOffset");

    // Write the pending data.
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
    }
    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
}
}