source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6184

Last change on this file since 6184 was 6184, checked in by nmedfort, 7 months ago

Initial version of PipelineKernel? + revised StreamSet? model.

File size: 44.5 KB
RevLine 
[4981]1/*
[6006]2 *  Copyright (c) 2018 International Characters.
[4981]3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
[5260]6#include "deletion.h"
[6037]7#include <toolchain/driver.h>
8#include <toolchain/cpudriver.h>
[5436]9#include <kernels/kernel_builder.h>
[5297]10#include <llvm/Support/raw_ostream.h>
[6037]11#include <IR_Gen/idisa_target.h>
[6055]12#include <llvm/IR/Intrinsics.h>
[4981]13
[5260]14using namespace llvm;
15
[6184]16inline size_t ceil_udiv(const size_t n, const size_t m) {
17    return (n + m - 1) / m;
18}
19
[5260]20namespace kernel {
21
[6006]22inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
23    Value * m = kb->simd_not(del_mask);
24    Value * mk = kb->simd_slli(fw, del_mask, 1);
[4981]25    std::vector<Value *> move_masks;
26    for (unsigned shift = 1; shift < fw; shift *= 2) {
27        Value * mp = mk;
28        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
[6006]29            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
[4981]30        }
[6006]31        Value * mv = kb->simd_and(mp, m);
32        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
33        mk = kb->simd_and(mk, kb->simd_not(mp));
[4981]34        move_masks.push_back(mv);
35    }
36    return move_masks;
37}
38
[6006]39inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
40    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
[4981]41    for (unsigned i = 0; i < mv.size(); i++) {
42        unsigned shift = 1 << i;
[6006]43        Value * t = kb->simd_and(s, mv[i]);
44        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
[4981]45    }
46    return s;
47}
48
[6004]49// Apply deletion to a set of stream_count input streams to produce a set of output streams.
50// Kernel inputs: stream_count data streams plus one del_mask stream
51// Outputs: the deleted streams, plus a partial sum popcount
52
[6006]53void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
54    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
55    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
[6004]56    for (unsigned j = 0; j < mStreamCount; ++j) {
[6006]57        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
58        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
59        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
[5007]60    }
[6006]61    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
62    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
[5007]63}
64
[6006]65void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
66    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
67    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
68    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
69    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
70    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
[6004]71    for (unsigned j = 0; j < mStreamCount; ++j) {
[6006]72        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
73        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
74        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
[6004]75    }
[6006]76    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
77    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
[6004]78}
79
[6006]80DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
[6004]81: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
[6006]82                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
83                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
84                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
85                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
[6004]86                      {}, {}, {})
87, mDeletionFieldWidth(fieldWidth)
88, mStreamCount(streamCount) {
89}
90
[6006]91void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
92    BasicBlock * entry = kb->GetInsertBlock();
93    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
94    BasicBlock * done = kb->CreateBasicBlock("done");
95    Constant * const ZERO = kb->getSize(0);
96    kb->CreateBr(processBlock);
97    kb->SetInsertPoint(processBlock);
98    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
99    blockOffsetPhi->addIncoming(ZERO, entry);
100    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
101    Value * delMask = kb->simd_not(extractionMask);
102    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
103    for (unsigned j = 0; j < mStreamCount; ++j) {
104        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
105        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
106        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
107    }
[6085]108#ifndef STREAM_COMPRESS_USING_EXTRACTION_MASK
[6006]109    Value * unitCount = kb->simd_popcount(mCompressFieldWidth, extractionMask);
110    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), blockOffsetPhi, kb->bitCast(unitCount));
[6085]111#endif
[6006]112    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
113    blockOffsetPhi->addIncoming(nextBlk, processBlock);
114    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
115    kb->CreateCondBr(moreToDo, processBlock, done);
116    kb->SetInsertPoint(done);
117}
118
[6085]119#ifdef STREAM_COMPRESS_USING_EXTRACTION_MASK
[6184]120FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b
121                                         , StreamSet * inputStreamSet, StreamSet * extractionMask
122                                         , StreamSet * outputStreamSet)
123: MultiBlockKernel("fieldCompress" + std::to_string(b->getBitBlockWidth() / inputStreamSet->getNumElements()) + "_" + std::to_string(inputStreamSet->getNumElements()),
124// inputs
125{Binding{"inputStreamSet", inputStreamSet},
126Binding{"extractionMask", extractionMask}},
127// outputs
128{Binding{"outputStreamSet", outputStreamSet}},
129{}, {}, {})
130, mCompressFieldWidth(b->getBitBlockWidth() / inputStreamSet->getNumElements())
131, mStreamCount(inputStreamSet->getNumElements()) {
132
133}
[6085]134#else
[6184]135FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b
136                                         , StreamSet * inputStreamSet, StreamSet * extractionMask
137                                         , StreamSet * outputStreamSet, StreamSet * unitCounts)
138: MultiBlockKernel("fieldCompress" + std::to_string(b->getBitBlockWidth() / inputStreamSet->getNumElements()) + "_" + std::to_string(inputStreamSet->getNumElements()),
139// inputs
140{Binding{"inputStreamSet", inputStreamSet},
141Binding{"extractionMask", extractionMask}},
142// outputs
143{Binding{"outputStreamSet", outputStreamSet},
144Binding{"unitCounts", unitCounts, FixedRate(), RoundUpTo(b->getBitBlockWidth())}},
145{}, {}, {})
146, mCompressFieldWidth(b->getBitBlockWidth() / inputStreamSet->getNumElements())
147, mStreamCount(inputStreamSet->getNumElements()) {
148
149}
[6085]150#endif
[6006]151
[6018]152void PEXTFieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
153    Type * fieldTy = kb->getIntNTy(mPEXTWidth);
154    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
155    Constant * PEXT_func = nullptr;
156    Constant * popc_func = Intrinsic::getDeclaration(getModule(), Intrinsic::ctpop, fieldTy);
157    if (mPEXTWidth == 64) {
158        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
159    } else if (mPEXTWidth == 32) {
160        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
161    }
162    BasicBlock * entry = kb->GetInsertBlock();
163    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
164    BasicBlock * done = kb->CreateBasicBlock("done");
165    Constant * const ZERO = kb->getSize(0);
166    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPEXTWidth;
167    kb->CreateBr(processBlock);
168    kb->SetInsertPoint(processBlock);
169    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
170    blockOffsetPhi->addIncoming(ZERO, entry);
171    std::vector<Value *> mask(fieldsPerBlock);
172    Value * extractionMaskPtr = kb->getInputStreamBlockPtr("extractionMask", ZERO, blockOffsetPhi);
173    extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
[6085]174#ifndef STREAM_COMPRESS_USING_EXTRACTION_MASK
[6018]175    Value * unitCountPtr = kb->getOutputStreamBlockPtr("unitCounts", ZERO, blockOffsetPhi);
176    unitCountPtr = kb->CreatePointerCast(unitCountPtr, fieldPtrTy);
177    for (unsigned i = 0; i < fieldsPerBlock; i++) {
178        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
[6038]179        Value * popc = kb->CreateCall(popc_func, mask[i]);
[6018]180        kb->CreateStore(popc, kb->CreateGEP(unitCountPtr, kb->getInt32(i)));
181    }
[6085]182#else
183    for (unsigned i = 0; i < fieldsPerBlock; i++) {
184        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
185    }
186#endif
[6018]187    for (unsigned j = 0; j < mStreamCount; ++j) {
188        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
189        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
190        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
191        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
192        for (unsigned i = 0; i < fieldsPerBlock; i++) {
193            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
194            Value * compressed = kb->CreateCall(PEXT_func, {field, mask[i]});
195            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
196        }
197    }
198    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
199    blockOffsetPhi->addIncoming(nextBlk, processBlock);
200    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
201    kb->CreateCondBr(moreToDo, processBlock, done);
202    kb->SetInsertPoint(done);
203}
204
205PEXTFieldCompressKernel::PEXTFieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
206: MultiBlockKernel("PEXTfieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
207                   {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
208                       Binding{kb->getStreamSetTy(), "extractionMask"}},
[6085]209#ifdef STREAM_COMPRESS_USING_EXTRACTION_MASK
210                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
211#else
[6018]212                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
213                       Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
[6085]214#endif
[6018]215                   {}, {}, {})
216, mPEXTWidth(fieldWidth)
217, mStreamCount(streamCount) {
218    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PEXT width for PEXTFieldCompressKernel");
219}
220   
[6184]221StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb
222                                           , StreamSet * source
223                                           #ifdef STREAM_COMPRESS_USING_EXTRACTION_MASK
224                                           , StreamSet * extractionMask
225                                           #else
226                                           , StreamSet * unitCounts
227                                           #endif
228                                           , StreamSet * compresedOutput
229                                           , const unsigned FieldWidth)
230: MultiBlockKernel("streamCompress" + std::to_string(FieldWidth) + "_" + std::to_string(source->getNumElements()),
231{Binding{"sourceStreamSet", source},
[6085]232#ifdef STREAM_COMPRESS_USING_EXTRACTION_MASK
[6184]233Binding{"extractionMask", extractionMask}},
[6085]234#else
[6184]235Binding{"unitCounts", unitCounts}},
[6085]236#endif
[6184]237{Binding{"compressedOutput", compresedOutput, BoundedRate(0, 1)}},
238{}, {}, {})
239, mCompressedFieldWidth(FieldWidth)
240, mStreamCount(source->getNumElements()) {
241    addInternalScalar(kb->getSizeTy(), "pendingItemCount");
242    for (unsigned i = 0; i < mStreamCount; i++) {
243        addInternalScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
[6013]244    }
245
246}
247   
248void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
249    const unsigned fw = mCompressedFieldWidth;
250    Type * fwTy = b->getIntNTy(fw);
251    Type * sizeTy = b->getSizeTy();
252    const unsigned numFields = b->getBitBlockWidth()/fw;
253    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(fw));
[6024]254    Constant * oneSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, 1));
[6013]255    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
256    Constant * numFieldConst = ConstantInt::get(sizeTy, numFields);
257    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
258    Constant * bitBlockWidthConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
259    BasicBlock * entry = b->GetInsertBlock();
260    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
261    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
[6014]262    BasicBlock * finalWrite = b->CreateBasicBlock("finalWrite");
263    BasicBlock * updateProducedCount = b->CreateBasicBlock("updateProducedCount");
[6013]264    Constant * const ZERO = b->getSize(0);
[6184]265
[6013]266    Value * pendingItemCount = b->getScalarField("pendingItemCount");
267    std::vector<Value *> pendingData(mStreamCount);
268    for (unsigned i = 0; i < mStreamCount; i++) {
269        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
270    }
271   
272    b->CreateBr(segmentLoop);
273    // Main Loop
274    b->SetInsertPoint(segmentLoop);
275    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
276    PHINode * outputBlockPhi = b->CreatePHI(b->getSizeTy(), 2);
277    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
278    PHINode * pendingDataPhi[mStreamCount];
279    blockOffsetPhi->addIncoming(ZERO, entry);
280    outputBlockPhi->addIncoming(ZERO, entry);
281    pendingItemsPhi->addIncoming(pendingItemCount, entry);
282    for (unsigned i = 0; i < mStreamCount; i++) {
283        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
284        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
285    }
[6085]286#ifdef STREAM_COMPRESS_USING_EXTRACTION_MASK
287    Value * fieldPopCounts = b->simd_popcount(fw, b->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi));
288#else
[6013]289    Value * fieldPopCounts = b->loadInputStreamBlock("unitCounts", ZERO, blockOffsetPhi);
[6085]290#endif
[6013]291    // For each field determine the (partial) sum popcount of all fields up to and
292    // including the current field.
293    Value * partialSum = fieldPopCounts;
294    for (unsigned i = 1; i < numFields; i *= 2) {
295        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
296    }
[6085]297    Value * blockPopCount = b->CreateZExtOrTrunc(b->mvmd_extract(fw, partialSum, numFields-1), sizeTy);
[6013]298    //
299    // Now determine for each source field the output offset of the first bit.
300    // Note that this depends on the number of pending bits.
301    //
302    Value * pendingOffset = b->CreateURem(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
303    Value * splatPending = b->simd_fill(fw, b->CreateZExtOrTrunc(pendingOffset, fwTy));
304    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
305    Value * offsets = b->simd_add(fw, b->mvmd_slli(fw, partialSum, 1), splatPending);
306    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
307   //
308    // Determine the relative field number for each output field.   Note that the total
309    // number of fields involved is numFields + 1.   However, the first field always
310    // be immediately combined into the current pending data field, so we calculate
311    // field numbers for all subsequent fields, (the fields that receive overflow bits).
312    Value * fieldNo = b->simd_srli(fw, b->simd_add(fw, partialSum, splatPending), std::log2(fw));
313  //
314    // Now process the input data block of each stream in the input stream set.
315    //
316    // First load all the stream set blocks and the pending data.
317    std::vector<Value *> sourceBlock(mStreamCount);
318    for (unsigned i = 0; i < mStreamCount; i++) {
319        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
320    }
321    // Now separate the bits of each field into ones that go into the current field
322    // and ones that go into the overflow field.   Extract the first field separately,
323    // and then shift and combine subsequent fields.
324    std::vector<Value *> pendingOutput(mStreamCount);
325    std::vector<Value *> outputFields(mStreamCount);
[6014]326    Value * backShift = b->simd_sub(fw, fwSplat, offsets);
[6013]327    for (unsigned i = 0; i < mStreamCount; i++) {
328        Value * currentFieldBits = b->simd_sllv(fw, sourceBlock[i], offsets);
[6014]329        Value * nextFieldBits = b->simd_srlv(fw, sourceBlock[i], backShift);
[6013]330        Value * firstField = b->mvmd_extract(fw, currentFieldBits, 0);
331        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
332        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
333        // shift back currentFieldBits to combine with nextFieldBits.
334        outputFields[i] = b->simd_or(b->mvmd_srli(fw, currentFieldBits, 1), nextFieldBits);
335    }
336    // Now combine forward all fields with the same field number.  This may require
337    // up to log2 numFields steps.
338    for (unsigned j = 1; j < numFields; j*=2) {
339        Value * select = b->simd_eq(fw, fieldNo, b->mvmd_slli(fw, fieldNo, j));
340        for (unsigned i = 0; i < mStreamCount; i++) {
341            Value * fields_fwd = b->mvmd_slli(fw, outputFields[i], j);
342            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
[6024]343       }
[6013]344    }
345    // Now compress the data fields, eliminating all but the last field from
[6024]346    // each run of consecutive field having the same field number as a subsequent field.
347    // But it may be that last field number is 0 which will compare equal to a 0 shifted in.
348    // So we add 1 to field numbers first.
349    Value * nonZeroFieldNo = b->simd_add(fw, fieldNo, oneSplat);
350    Value * eqNext = b->simd_eq(fw, nonZeroFieldNo, b->mvmd_srli(fw, nonZeroFieldNo, 1));
[6013]351    Value * compressMask = b->hsimd_signmask(fw, b->simd_not(eqNext));
352    for (unsigned i = 0; i < mStreamCount; i++) {
353        outputFields[i] = b->mvmd_compress(fw, outputFields[i], compressMask);
[6024]354    }
[6013]355    //
356    // Finally combine the pendingOutput and outputField data.
357    // (a) shift forward outputField data to fill the pendingOutput values.
358    // (b) shift back outputField data to clear data added to pendingOutput.
[6014]359    //
360    // However, we may need to increment pendingFieldIndex if we previously
361    // filled the field with the extracted firstField value.  The first
362    // value of the fieldNo vector will be 0 or 1.
363    // It is possible that pendingFieldIndex will reach the total number
364    // of fields held in register.  mvmd_sll may not handle this if it
365    // translates to an LLVM shl.
366    Value * increment = b->CreateZExtOrTrunc(b->mvmd_extract(fw, fieldNo, 0), sizeTy);
367    pendingFieldIdx = b->CreateAdd(pendingFieldIdx, increment);
368    Value * const pendingSpaceFilled = b->CreateICmpEQ(pendingFieldIdx, numFieldConst);
[6013]369    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
370    for (unsigned i = 0; i < mStreamCount; i++) {
[6184]371        Value * shiftedField = b->mvmd_sll(fw, outputFields[i], pendingFieldIdx);
372
373        Value * outputFwd = b->fwCast(fw, shiftedField);
374        shiftedField = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
375
376        pendingOutput[i] = b->simd_or(pendingOutput[i], shiftedField);
[6013]377        outputFields[i] = b->mvmd_srl(fw, outputFields[i], shftBack);
[6014]378    }
[6013]379    //
380    // Write the pendingOutput data to outputStream.
381    // Note: this data may be overwritten later, but we avoid branching.
382    for (unsigned i = 0; i < mStreamCount; i++) {
383        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
384    }
385    // Now determine the total amount of pending items and whether
386    // the pending data all fits within the pendingOutput.
387    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
388    Value * doesFit = b->CreateICmpULT(newPending, bitBlockWidthConst);
389    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, bitBlockWidthConst));
390    //
391    // Prepare Phi nodes for the next iteration.
392    //
393    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
394    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
395    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, b->getSize(1));
396    // But don't advance the output if all the data does fit into pendingOutput.
397    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
398    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
399    pendingItemsPhi->addIncoming(newPending, segmentLoop);
400
401    for (unsigned i = 0; i < mStreamCount; i++) {
402        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(fw, pendingOutput[i]), b->fwCast(fw, outputFields[i]));
403        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
404    }
405    //
406    // Now continue the loop if there are more blocks to process.
407    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
408    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);
409   
410    b->SetInsertPoint(segmentDone);
411    // Save kernel state.
412    b->setScalarField("pendingItemCount", newPending);
413    for (unsigned i = 0; i < mStreamCount; i++) {
414        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
415    }
[6014]416    b->CreateCondBr(mIsFinal, finalWrite, updateProducedCount);
417    b->SetInsertPoint(finalWrite);
418    for (unsigned i = 0; i < mStreamCount; i++) {
419        //Value * pending = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
420        Value * pending = b->bitCast(pendingOutput[i]);
421        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), nextOutputBlk, pending);
422    }
423    b->CreateBr(updateProducedCount);
424    b->SetInsertPoint(updateProducedCount);
[6085]425    Value * produced = b->getProducedItemCount("compressedOutput");
[6013]426    produced = b->CreateAdd(produced, b->CreateMul(nextOutputBlk, bitBlockWidthConst));
[6014]427    produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, newPending), produced);
[6013]428    b->setProducedItemCount("compressedOutput", produced);
429}
430
[6184]431Bindings makeSwizzledDeleteByPEXTOutputBindings(const std::vector<StreamSet *> & outputStreamSets, const unsigned PEXTWidth) {
432    const auto n = outputStreamSets.size();
433    Bindings outputs;
434    outputs.reserve(n);
435    outputs.emplace_back("outputSwizzle0", outputStreamSets[0], PopcountOf("selectors"), BlockSize(PEXTWidth)); // PopcountOfNot("delMaskSet")
436    for (unsigned i = 1; i < n; ++i) {
437        outputs.emplace_back("outputSwizzle" + std::to_string(i), outputStreamSets[i], RateEqualTo("outputSwizzle0"), BlockSize(PEXTWidth));
438    }
439    return outputs;
440}
441
442SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b,
443                                                       StreamSet * selectors, StreamSet * inputStreamSet,
444                                                       const std::vector<StreamSet *> & outputStreamSets,
445                                                       const unsigned PEXTWidth)
446
447: MultiBlockKernel("PEXTdel" + std::to_string(PEXTWidth) + "_" + std::to_string(inputStreamSet->getNumElements()),
448{Binding{"selectors", selectors}, Binding{"inputStreamSet", inputStreamSet}},
449std::move(makeSwizzledDeleteByPEXTOutputBindings(outputStreamSets, PEXTWidth)),
450{}, {}, {})
451, mStreamCount(inputStreamSet->getNumElements())
452, mSwizzleFactor(b->getBitBlockWidth() / PEXTWidth)
453, mSwizzleSetCount(ceil_udiv(mStreamCount, mSwizzleFactor))
454, mPEXTWidth(PEXTWidth) {
455
456    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0) && "mDelCountFieldWidth must be a power of 2");
[5540]457    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
458    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
[6184]459    assert (mSwizzleSetCount);
460    assert (outputStreamSets.size() == mSwizzleSetCount);
461    assert (outputStreamSets[0]->getNumElements() == mSwizzleFactor);
[5706]462
[6184]463    addInternalScalar(b->getBitBlockType(), "pendingSwizzleData0");
464    for (unsigned i = 1; i < outputStreamSets.size(); ++i) {
465        assert (outputStreamSets[i]->getNumElements() == mSwizzleFactor);
466        addInternalScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
[5540]467    }
468}
469
[6184]470void SwizzledDeleteByPEXTkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
[5540]471    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
472
[6184]473    BasicBlock * const entry = b->GetInsertBlock();
474    BasicBlock * const beginLoop = b->CreateBasicBlock("beginLoop");
[5540]475
[6184]476    ConstantInt * const ZERO = b->getSize(0);
[5985]477    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
478    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
479    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
480    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
481    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);
[5540]482
[5985]483    // All output groups have the same count.
[6184]484    Value * const baseOutputProduced = b->getProducedItemCount("outputSwizzle0");
485    Value * const baseProducedOffset = b->CreateAnd(baseOutputProduced, BLOCK_WIDTH_MASK);
[5985]486
[5540]487    // There is a separate vector of pending data for each swizzle group.
[6184]488    std::vector<Value *> pendingData(mSwizzleSetCount);
[5540]489    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
[6184]490        pendingData[i] = b->getScalarField("pendingSwizzleData" + std::to_string(i));
[5540]491    }
[6184]492    b->CreateBr(beginLoop);
[5540]493
[6184]494    b->SetInsertPoint(beginLoop);
495    PHINode * const strideIndex = b->CreatePHI(numOfBlocks->getType(), 2);
496    strideIndex->addIncoming(ZERO, entry);
497    PHINode * const producedOffsetPhi = b->CreatePHI(numOfBlocks->getType(), 2);
498    producedOffsetPhi->addIncoming(baseProducedOffset, entry);
499    std::vector<PHINode *> pendingDataPhi(mSwizzleSetCount);
500    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
501        pendingDataPhi[i] = b->CreatePHI(pendingData[i]->getType(), 2);
502        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
503        pendingData[i] = pendingDataPhi[i];
504    }
505
506    Value * const selectors = b->loadInputStreamBlock("selectors", strideIndex);
507
508    const auto swizzleSets = makeSwizzleSets(b, selectors, strideIndex);
509
[5985]510    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);
511
[6184]512    // Compress the PEXTedSwizzleSets
513    // Output is written and committed to the output buffer one swizzle at a time.
514    Value * producedOffset = producedOffsetPhi;
515
[5540]516    // For each row i
517    for (unsigned i = 0; i < mSwizzleFactor; i++) {
[5985]518
[5540]519        // Generate code for each of the mSwizzleFactor fields making up a block.
520        // We load the count for the field and process all swizzle groups accordingly.
[6184]521        Value * const pendingOffset = b->CreateAnd(producedOffset, PEXT_WIDTH_MASK);
[5985]522        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
523        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
524        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);
525
[6184]526        Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
527        Value * const spaceVector = b->simd_fill(mPEXTWidth, pendingSpace);
528
529        Value * const outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
[5985]530        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
531        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
532
[5540]533        // Data from the ith swizzle pack of each group is processed
534        // according to the same newItemCount, pendingSpace, ...
535        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
[5985]536            Value * const newItems = swizzleSets[j][i];
[5540]537            // Combine as many of the new items as possible into the pending group.
[5985]538            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
539            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
[5926]540            // To avoid an unpredictable branch, always store the combined group, whether full or not.
[6184]541            b->storeOutputStreamBlock("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset, combinedGroup);
[5540]542            // Any items in excess of the space available in the current pending group overflow for the next group.
[6184]543            Value * overFlowGroup = b->CreateLShr(newItems, spaceVector);
[5540]544            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
[5985]545            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
[5540]546        }
[6184]547        producedOffset = b->CreateAdd(producedOffset, newItemCount);
548    }
[5985]549
[6184]550    BasicBlock * const finishedLoop = b->CreateBasicBlock("finishedLoop");
551    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
552    BasicBlock * const loopEndBlock = b->GetInsertBlock();
553    strideIndex->addIncoming(nextStrideIndex, loopEndBlock);
554    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
555        pendingDataPhi[i]->addIncoming(pendingData[i], loopEndBlock);
[5540]556    }
[6184]557    producedOffsetPhi->addIncoming(producedOffset, loopEndBlock);
558    Value * const doneLoop = b->CreateICmpEQ(nextStrideIndex, numOfBlocks);
[5985]559
[6184]560    b->CreateUnlikelyCondBr(doneLoop, finishedLoop, beginLoop);
561
562    b->SetInsertPoint(finishedLoop);
563    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
564        b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
[5540]565    }
566}
567
568/*
569Apply PEXT deletion to the blocks in strms and swizzle the result.
570
571Q: Why is it advantageous to swizzle the PEXTed streams?
572
573A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
574
575Example:
576Input:     11101101
577PEXT mask: 11110000
578Output:    00001110
579
580PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
581cleared. This has implications if we're working with multiple streams.
582
583For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
584(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
585Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
586Stream 2:   qrstuv00 wxy00000 z1000000 23456000
587Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
588Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
589
590If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
591should be compress by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
592time:
593
594The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
595
596Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
597Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
598Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
599Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
600
[5985]601Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
[5540]602compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
603
604Args:
605    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
[5985]606    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
[5540]607        sequentially to PEXT a full block.)
608
609Returns:
610    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
611*/
[5985]612
[6184]613SwizzledDeleteByPEXTkernel::SwizzleSets SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors, Value * const strideIndex) {
[5985]614
615    Constant * pext = nullptr;
[5540]616    if (mPEXTWidth == 64) {
[5985]617        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
[5540]618    } else if (mPEXTWidth == 32) {
[5985]619        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
[5540]620    }
[5985]621
622    Value * const m = b->fwCast(mPEXTWidth, selectors);
623
624    std::vector<Value *> masks(mSwizzleFactor);
625    for (unsigned i = 0; i < mSwizzleFactor; i++) {
626        masks[i] = b->CreateExtractElement(m, i);
627
[5540]628    }
629
[6184]630    SwizzleSets swizzleSets;
[5985]631    swizzleSets.reserve(mSwizzleSetCount);
[5540]632
[5985]633    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
[5540]634
[5985]635    UndefValue * const outputInitializer = UndefValue::get(vecTy);
[5540]636
[5985]637    std::vector<Value *> input(mSwizzleFactor);
638    // For each of the k swizzle sets required to apply PEXT to all input streams
639    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
640
641        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
642            const unsigned k = (i * mSwizzleFactor) + j;
643            if (k < mStreamCount) {
[6184]644                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k), strideIndex), vecTy);
[5985]645            } else {
646                input[j] = Constant::getNullValue(vecTy);
647            }
[5540]648        }
[5985]649
650        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
651        // then splat the pext mask and apply it to each output row
652
653        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
654        // For each of the input streams
655        for (unsigned j = 0; j < mSwizzleFactor; j++) {
656            for (unsigned k = 0; k < mSwizzleFactor; k++) {
657                // Load block j,k
658                Value * const field = b->CreateExtractElement(input[j], k);
659                // Apply PEXT deletion
660                Value * const selected = b->CreateCall(pext, {field, masks[k]});
661                // Then store it as our k,j-th output
662                output[k] = b->CreateInsertElement(output[k], selected, j);
663            }
664        }
665        swizzleSets.emplace_back(output);
[5540]666    }
667
[5985]668    return swizzleSets;
[5540]669}
670
[6184]671
[5362]672// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
[5313]673// Kernel inputs: stream_count data streams plus one del_mask stream
[5362]674// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
[5313]675
[6006]676void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
677    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
678    generateProcessingLoop(kb, delMask);
[5313]679}
680
[6006]681void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
682    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
683    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
684    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
685    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
686    generateProcessingLoop(kb, delMask);
[5362]687}
688
[6006]689void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
[5985]690    Constant * PEXT_func = nullptr;
691    if (mPEXTWidth == 64) {
[6006]692        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
[5985]693    } else if (mPEXTWidth == 32) {
[6006]694        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
[5440]695    }
[5985]696    std::vector<Value *> masks(mSwizzleFactor);
[6031]697    Value * const m = kb->fwCast(mPEXTWidth, kb->simd_not(delMask));
[5985]698    for (unsigned i = 0; i < mSwizzleFactor; i++) {
[6031]699        masks[i] = kb->CreateExtractElement(m, i);
[5313]700    }
701
[5985]702    for (unsigned i = 0; i < mStreamCount; ++i) {
[6006]703        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
704        Value * value = kb->fwCast(mPEXTWidth, input);
[5985]705        Value * output = UndefValue::get(value->getType());
706        for (unsigned j = 0; j < mSwizzleFactor; j++) {
[6006]707            Value * field = kb->CreateExtractElement(value, j);
708            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
709            output = kb->CreateInsertElement(output, compressed, j);
[5362]710        }
[6006]711        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
[5362]712    }
[6006]713    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
714    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
[5362]715}
716
[5985]717DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
718: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
719              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
720                  Binding{b->getStreamSetTy(), "delMaskSet"}},
721              {}, {}, {}, {})
[5313]722, mDelCountFieldWidth(fw)
[5362]723, mStreamCount(streamCount)
[5985]724, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
725, mPEXTWidth(PEXT_width) {
[6184]726    mOutputStreamSets.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
727    mOutputStreamSets.emplace_back(b->getStreamSetTy(), "deletionCounts");
[5313]728}
729
[5355]730   
731//
732// This kernel performs final stream compression for a set of N bitstreams, given
733// (a) a set of bitstreams partially compressed within K-bit fields and stored
734//     in K-bit swizzled form, and
735// (b) a stream of deletion/extraction counts per K-bit stride.
736//
737// Restrictions:  At present, only K=64 is supported.
738//                At present, N must be an exact multiple of BLOCK_SIZE/K.
739//
740// The kernel always consumes full blocks of input and emits data into the output
741// buffer in swizzles of K items at a time.   Upon completion of a segment,
742// up to K-1 pending output items per stream may be stored in the kernel state.
743//
744// Note: that both input streams and output streams are stored in swizzled form.
745//
746
[6006]747SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
[5435]748: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
[6006]749                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
[5355]750, mBitStreamCount(bitStreamCount)
[6184]751, mFieldWidth(fieldWidth)
752, mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
753, mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
754    assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
755    assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
756    mInputStreamSets.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
757    mOutputStreamSets.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
758    addInternalScalar(kb->getBitBlockType(), "pendingSwizzleData0");
759    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
760        mInputStreamSets.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
761        mOutputStreamSets.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
762        addInternalScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
763    }
764    addInternalScalar(kb->getSizeTy(), "pendingOffset");
[5313]765}
[5355]766   
[6006]767void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
[5355]768       
[6006]769    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
770    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());
[5355]771   
772    // Output is written and committed to the output buffer one swizzle at a time.
773    //
[6006]774    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
775    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
[5355]776   
[6006]777    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
778    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
779    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
[5355]780
781    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
[6006]782    Value * pendingOffset = kb->getScalarField("pendingOffset");
[5355]783    // There is a separate vector of pending data for each swizzle group.
784    std::vector<Value *> pendingData;
785    std::vector<Value *> outputStreamPtr;
786    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
[6006]787        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
788        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
[5355]789    }
790   
791    // Generate code for each of the mSwizzleFactor fields making up a block.
792    // We load the count for the field and process all swizzle groups accordingly.
793    for (unsigned i = 0; i < mSwizzleFactor; i++) {
[6006]794        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
795        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
796        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);
[5355]797       
[6184]798        Value * const fieldWidths = kb->simd_fill(mFieldWidth, pendingOffset);
799
[5355]800        // Data from the ith swizzle pack of each group is processed
801        // according to the same newItemCount, pendingSpace, ...
802        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
[6006]803            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
[5355]804            // Combine as many of the new items as possible into the pending group.
[6184]805            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, fieldWidths));
[5985]806            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
[6006]807            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
[5355]808            // Any items in excess of the space available in the current pending group overflow for the next group.
[6006]809            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
[5355]810            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
[6006]811            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
[5355]812        }
[6006]813        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
814        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
[5355]815    }
[6006]816    kb->setScalarField("pendingOffset", pendingOffset);
[6184]817//    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
818//    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
[5355]819    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
[6006]820        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
[5355]821    }
[6184]822//    kb->setProducedItemCount("outputSwizzle0", produced);
[5355]823}
824
[6006]825void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
826    CreateDoBlockMethodCall(kb);
827    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
828    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
[5376]829   
[6006]830    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
831    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
832    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
[6184]833//    Value * pendingOffset = kb->getScalarField("pendingOffset");
[5376]834
835    // Write the pending data.
836    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
[6006]837        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
838        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
839        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
[5376]840    }
[6184]841//    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
[5355]842}
[6037]843
844}
Note: See TracBrowser for help on using the repository browser.