source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6037

/*
 *  Copyright (c) 2018 International Characters.
 *  This software is licensed to the public under the Open Software License 3.0.
 */

#include "deletion.h"
#include <toolchain/driver.h>
#include <toolchain/cpudriver.h>
#include <kernels/kernel_builder.h>
#include <llvm/Support/raw_ostream.h>
#include <IR_Gen/idisa_target.h>

using namespace llvm;

namespace kernel {

inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
    Value * m = kb->simd_not(del_mask);
    Value * mk = kb->simd_slli(fw, del_mask, 1);
    std::vector<Value *> move_masks;
    for (unsigned shift = 1; shift < fw; shift *= 2) {
        Value * mp = mk;
        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
        }
        Value * mv = kb->simd_and(mp, m);
        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
        mk = kb->simd_and(mk, kb->simd_not(mp));
        move_masks.push_back(mv);
    }
    return move_masks;
}
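
// The mask construction above is the classic parallel-suffix "compress" technique
// (cf. Hacker's Delight, section 7-4), computed independently within each fw-bit field;
// the returned masks mv[i] mark the bits that must move right by 2^i in step i.
// Illustrative trace for a single 8-bit field (bit 0 is the rightmost bit):
//     del_mask = 01100101, so m (bits to keep) = 10011010
//     step 1 (shift 1): mv = 00000010   (bit 1 moves right by 1)
//     step 2 (shift 2): mv = 00011000   (bits 3 and 4 of the partially compressed field move right by 2)
//     step 3 (shift 4): mv = 10000000   (bit 7 moves right by 4)
// After all steps the kept positions are packed into the low end of the field: 00001111.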

inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
    for (unsigned i = 0; i < mv.size(); i++) {
        unsigned shift = 1 << i;
        Value * t = kb->simd_and(s, mv[i]);
        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
    }
    return s;
}
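
// Continuing the 8-bit example above with stream bits s7..s0 and keep mask 10011010:
//     after masking:     s7 0 0 s4 s3 0 s1 0
//     shift 1 (mv[0]):   s7 0 0 s4 s3 0 0 s1
//     shift 2 (mv[1]):   s7 0 0 0  0  s4 s3 s1
//     shift 4 (mv[2]):   0  0 0 0  s7 s4 s3 s1
// i.e. the kept bits end up contiguous at the low end of the field, in stream order.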

// Apply deletion to a set of stream_count input streams to produce a set of output streams.
// Kernel inputs: stream_count data streams plus one del_mask stream
// Outputs: the deleted streams, plus a per-field popcount of retained units (unitCounts)

void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
    }
    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
}

void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
    }
    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
}

DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                      {}, {}, {})
, mDeletionFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
}
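
// Illustrative (hypothetical) use in a pipeline, following the driver API used by
// StreamFilterCompiler::makeCall at the bottom of this file; the buffer names here
// are assumptions only:
//     Kernel * delK = driver.addKernelInstance<DeletionKernel>(iBuilder, 64, 8);
//     driver.makeKernelCall(delK, {basisBits, delMask}, {deletedBits, unitCounts});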

void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
    BasicBlock * entry = kb->GetInsertBlock();
    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
    BasicBlock * done = kb->CreateBasicBlock("done");
    Constant * const ZERO = kb->getSize(0);
    kb->CreateBr(processBlock);
    kb->SetInsertPoint(processBlock);
    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    blockOffsetPhi->addIncoming(ZERO, entry);
    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
    Value * delMask = kb->simd_not(extractionMask);
    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
    }
    Value * unitCount = kb->simd_popcount(mCompressFieldWidth, extractionMask);
    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), blockOffsetPhi, kb->bitCast(unitCount));
    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
    blockOffsetPhi->addIncoming(nextBlk, processBlock);
    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
    kb->CreateCondBr(moreToDo, processBlock, done);
    kb->SetInsertPoint(done);
}

FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("fieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                          Binding{kb->getStreamSetTy(), "extractionMask"}},
                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                      {}, {}, {})
, mCompressFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
}

void PEXTFieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
    Type * fieldTy = kb->getIntNTy(mPEXTWidth);
    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
    Constant * fieldTyZero = kb->getIntN(mPEXTWidth, 0);
    Constant * PEXT_func = nullptr;
    Constant * popc_func = Intrinsic::getDeclaration(getModule(), Intrinsic::ctpop, fieldTy);
    if (mPEXTWidth == 64) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
    } else if (mPEXTWidth == 32) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
    }
    BasicBlock * entry = kb->GetInsertBlock();
    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
    BasicBlock * done = kb->CreateBasicBlock("done");
    Constant * const ZERO = kb->getSize(0);
    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPEXTWidth;
    kb->CreateBr(processBlock);
    kb->SetInsertPoint(processBlock);
    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    blockOffsetPhi->addIncoming(ZERO, entry);
    std::vector<Value *> mask(fieldsPerBlock);
    Value * extractionMaskPtr = kb->getInputStreamBlockPtr("extractionMask", ZERO, blockOffsetPhi);
    extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
    Value * unitCountPtr = kb->getOutputStreamBlockPtr("unitCounts", ZERO, blockOffsetPhi);
    unitCountPtr = kb->CreatePointerCast(unitCountPtr, fieldPtrTy);

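    // The loop below loads the extraction mask for each PEXT-width field and stores its
    // popcount as that field's unit count.  targetMask is adjusted (shifted or zeroed)
    // when fewer than PEXTWidth items remain, so that positions past the end of the
    // available input do not inflate the count; the unshifted mask[i] is what the PEXT
    // calls further down apply to the data streams.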
    Value* remainingMask = kb->getAvailableItemCount("extractionMask");
    Value* PEXTWidth = kb->getSize(mPEXTWidth);
    for (unsigned i = 0; i < fieldsPerBlock; i++) {
        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
        Value* hasFullMask = kb->CreateICmpUGE(remainingMask, PEXTWidth);
        Value* shiftAmount = kb->CreateSub(PEXTWidth, remainingMask);
        Value * targetMask = kb->CreateSelect(
                hasFullMask,
                mask[i],
                kb->CreateShl(mask[i], shiftAmount)
        );
        targetMask = kb->CreateSelect(kb->CreateICmpEQ(remainingMask, kb->getSize(0)), fieldTyZero, targetMask);
        Value * popc = kb->CreateCall(popc_func, targetMask);
        kb->CreateStore(popc, kb->CreateGEP(unitCountPtr, kb->getInt32(i)));
        remainingMask = kb->CreateSelect(
                hasFullMask,
                kb->CreateSub(remainingMask, kb->getSize(mPEXTWidth)),
                kb->getSize(0)
        );
    }
    for (unsigned j = 0; j < mStreamCount; ++j) {
        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
        for (unsigned i = 0; i < fieldsPerBlock; i++) {
            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
            Value * compressed = kb->CreateCall(PEXT_func, {field, mask[i]});
            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
        }
    }
    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
    blockOffsetPhi->addIncoming(nextBlk, processBlock);
    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
    kb->CreateCondBr(moreToDo, processBlock, done);
    kb->SetInsertPoint(done);
}
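
// For reference, the BMI2 PEXT instruction gathers the source bits selected by the mask
// into the low-order bits of the result; an 8-bit illustration:
//     pext(source = 10110010, mask = 11011010)  ==>  00010101
// so each PEXT call above compresses one PEXT-width field of one stream in place.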

PEXTFieldCompressKernel::PEXTFieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("PEXTfieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                   {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                       Binding{kb->getStreamSetTy(), "extractionMask"}},
                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                       Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                   {}, {}, {})
, mPEXTWidth(fieldWidth)
, mStreamCount(streamCount) {
    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PEXT width for PEXTFieldCompressKernel");
}

StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("streamCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                   {Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet"},
                       Binding{kb->getStreamSetTy(), "unitCounts"}},
                   {Binding{kb->getStreamSetTy(streamCount), "compressedOutput", BoundedRate(0, 1)}},
                   {}, {}, {})
, mCompressedFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
    addScalar(kb->getSizeTy(), "pendingItemCount");
    for (unsigned i = 0; i < streamCount; i++) {
        addScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
    }
}

void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
    const unsigned fw = mCompressedFieldWidth;
    Type * fwTy = b->getIntNTy(fw);
    Type * sizeTy = b->getSizeTy();
    const unsigned numFields = b->getBitBlockWidth()/fw;
    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(fw));
    Constant * oneSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, 1));
    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
    Constant * numFieldConst = ConstantInt::get(sizeTy, numFields);
    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
    Constant * bitBlockWidthConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
    BasicBlock * entry = b->GetInsertBlock();
    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
    BasicBlock * finalWrite = b->CreateBasicBlock("finalWrite");
    BasicBlock * updateProducedCount = b->CreateBasicBlock("updateProducedCount");
    Constant * const ZERO = b->getSize(0);

    Value * pendingItemCount = b->getScalarField("pendingItemCount");
    std::vector<Value *> pendingData(mStreamCount);
    for (unsigned i = 0; i < mStreamCount; i++) {
        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
    }

    b->CreateBr(segmentLoop);
    // Main Loop
    b->SetInsertPoint(segmentLoop);
    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
    PHINode * outputBlockPhi = b->CreatePHI(b->getSizeTy(), 2);
    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
    PHINode * pendingDataPhi[mStreamCount];
    blockOffsetPhi->addIncoming(ZERO, entry);
    outputBlockPhi->addIncoming(ZERO, entry);
    pendingItemsPhi->addIncoming(pendingItemCount, entry);
    for (unsigned i = 0; i < mStreamCount; i++) {
        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
    }
    Value * fieldPopCounts = b->loadInputStreamBlock("unitCounts", ZERO, blockOffsetPhi);
    // For each field determine the (partial) sum popcount of all fields up to and
    // including the current field.
    Value * partialSum = fieldPopCounts;
    for (unsigned i = 1; i < numFields; i *= 2) {
        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
    }
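    // E.g. with numFields == 4 and per-field counts <3, 5, 2, 6>, the two doubling steps
    // yield <3, 8, 7, 8> and then <3, 8, 10, 16>: each field now holds the count of all
    // units up to and including itself, and the last field (16) is the total for the block.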
    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
    //
    // Now determine for each source field the output offset of the first bit.
    // Note that this depends on the number of pending bits.
    //
    Value * pendingOffset = b->CreateURem(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
    Value * splatPending = b->simd_fill(fw, b->CreateZExtOrTrunc(pendingOffset, fwTy));
    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
    Value * offsets = b->simd_add(fw, b->mvmd_slli(fw, partialSum, 1), splatPending);
    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
    //
    // Determine the relative field number for each output field.  Note that the total
    // number of fields involved is numFields + 1.  However, the first field will always
    // be combined immediately into the current pending data field, so we calculate
    // field numbers for all subsequent fields (the fields that receive overflow bits).
    Value * fieldNo = b->simd_srli(fw, b->simd_add(fw, partialSum, splatPending), std::log2(fw));
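    // E.g. with fw == 64, pendingOffset == 10 and partial sums <40, 100, 130, 180>:
    //     offsets = (<0, 40, 100, 130> + 10) mod 64  =  <10, 50, 46, 12>
    //     fieldNo = (<40, 100, 130, 180> + 10) / 64  =  <0, 1, 2, 2>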
    //
    // Now process the input data block of each stream in the input stream set.
    //
    // First load all the stream set blocks and the pending data.
    std::vector<Value *> sourceBlock(mStreamCount);
    for (unsigned i = 0; i < mStreamCount; i++) {
        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
    }
    // Now separate the bits of each field into ones that go into the current field
    // and ones that go into the overflow field.  Extract the first field separately,
    // and then shift and combine subsequent fields.
    std::vector<Value *> pendingOutput(mStreamCount);
    std::vector<Value *> outputFields(mStreamCount);
    Value * backShift = b->simd_sub(fw, fwSplat, offsets);
    for (unsigned i = 0; i < mStreamCount; i++) {
        Value * currentFieldBits = b->simd_sllv(fw, sourceBlock[i], offsets);
        Value * nextFieldBits = b->simd_srlv(fw, sourceBlock[i], backShift);
        Value * firstField = b->mvmd_extract(fw, currentFieldBits, 0);
        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
        // shift back currentFieldBits to combine with nextFieldBits.
        outputFields[i] = b->simd_or(b->mvmd_srli(fw, currentFieldBits, 1), nextFieldBits);
    }
    // Now combine forward all fields with the same field number.  This may require
    // up to log2 numFields steps.
    for (unsigned j = 1; j < numFields; j*=2) {
        Value * select = b->simd_eq(fw, fieldNo, b->mvmd_slli(fw, fieldNo, j));
        for (unsigned i = 0; i < mStreamCount; i++) {
            Value * fields_fwd = b->mvmd_slli(fw, outputFields[i], j);
            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
        }
    }
    // Now compress the data fields, eliminating all but the last field from
    // each run of consecutive fields having the same field number as a subsequent field.
    // But it may be that the last field number is 0, which will compare equal to a 0 shifted in.
    // So we add 1 to the field numbers first.
    Value * nonZeroFieldNo = b->simd_add(fw, fieldNo, oneSplat);
    Value * eqNext = b->simd_eq(fw, nonZeroFieldNo, b->mvmd_srli(fw, nonZeroFieldNo, 1));
    Value * compressMask = b->hsimd_signmask(fw, b->simd_not(eqNext));
    for (unsigned i = 0; i < mStreamCount; i++) {
        outputFields[i] = b->mvmd_compress(fw, outputFields[i], compressMask);
    }
    //
    // Finally combine the pendingOutput and outputField data.
    // (a) shift forward outputField data to fill the pendingOutput values.
    // (b) shift back outputField data to clear data added to pendingOutput.
    //
    // However, we may need to increment pendingFieldIdx if we previously
    // filled the field with the extracted firstField value.  The first
    // value of the fieldNo vector will be 0 or 1.
    // It is possible that pendingFieldIdx will reach the total number
    // of fields held in register.  mvmd_sll may not handle this if it
    // translates to an LLVM shl.
    Value * increment = b->CreateZExtOrTrunc(b->mvmd_extract(fw, fieldNo, 0), sizeTy);
    pendingFieldIdx = b->CreateAdd(pendingFieldIdx, increment);
    Value * const pendingSpaceFilled = b->CreateICmpEQ(pendingFieldIdx, numFieldConst);
    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
    for (unsigned i = 0; i < mStreamCount; i++) {
        Value * outputFwd = b->mvmd_sll(fw, outputFields[i], pendingFieldIdx);
        outputFwd = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
        pendingOutput[i] = b->simd_or(pendingOutput[i], outputFwd);
        outputFields[i] = b->mvmd_srl(fw, outputFields[i], shftBack);
    }
    //
    // Write the pendingOutput data to outputStream.
    // Note: this data may be overwritten later, but we avoid branching.
    for (unsigned i = 0; i < mStreamCount; i++) {
        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
    }
    // Now determine the total amount of pending items and whether
    // the pending data all fits within the pendingOutput.
    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
    Value * doesFit = b->CreateICmpULT(newPending, bitBlockWidthConst);
    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, bitBlockWidthConst));
    //
    // Prepare Phi nodes for the next iteration.
    //
    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, b->getSize(1));
    // But don't advance the output if all the data does fit into pendingOutput.
    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
    pendingItemsPhi->addIncoming(newPending, segmentLoop);

    for (unsigned i = 0; i < mStreamCount; i++) {
        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(fw, pendingOutput[i]), b->fwCast(fw, outputFields[i]));
        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
    }
    //
    // Now continue the loop if there are more blocks to process.
    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);

    b->SetInsertPoint(segmentDone);
    // Save kernel state.
    b->setScalarField("pendingItemCount", newPending);
    for (unsigned i = 0; i < mStreamCount; i++) {
        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
    }
    b->CreateCondBr(mIsFinal, finalWrite, updateProducedCount);
    b->SetInsertPoint(finalWrite);
    for (unsigned i = 0; i < mStreamCount; i++) {
        //Value * pending = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
        Value * pending = b->bitCast(pendingOutput[i]);
        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), nextOutputBlk, pending);
    }
    b->CreateBr(updateProducedCount);
    b->SetInsertPoint(updateProducedCount);
    Value * produced = b->getProducedItemCount("compressedOutput");
    produced = b->CreateAdd(produced, b->CreateMul(nextOutputBlk, bitBlockWidthConst));
    produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, newPending), produced);
    b->setProducedItemCount("compressedOutput", produced);
}

SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
                  {}, {}, {}, {})
, mStreamCount(streamCount)
, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
// to prevent rounding errors.
, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
, mPEXTWidth(PEXT_width)
{
    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
        && "PEXT width must be a power of 2");
    assert(mSwizzleFactor > 1 && "PEXT width must be less than the block width");
    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");

    // why, if we have 1 input stream, are there n output swizzle streams rather than 1 of n?
    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);

    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    }
    addScalar(b->getSizeTy(), "pendingOffset");
}

void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
    generateProcessingLoop(b, delMask, false);
}

void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
    generateProcessingLoop(b, delMask, true);
}

/*
What this function does in pseudo code:
for (mSwizzleFactor)
    create a swizzle set containing mSwizzleFactor blocks
    apply PEXT to each block in the swizzle set
    store the swizzle set in the PEXTedSwizzleSets vector

for (each swizzle row i)
    for (each swizzle set j)
        process row i in swizzle set j
        store the output in pendingData[j]
*/

void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {

    // selectors marks the positions we want to keep
    Value * const selectors = b->CreateNot(delMask);

    const auto swizzleSets = makeSwizzleSets(b, selectors);

    // Compress the PEXTedSwizzleSets
    // Output is written and committed to the output buffer one swizzle at a time.
    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);

    // All output groups have the same count.
    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);

    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
    }

    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);

    // For each row i
    for (unsigned i = 0; i < mSwizzleFactor; i++) {

        // Generate code for each of the mSwizzleFactor fields making up a block.
        // We load the count for the field and process all swizzle groups accordingly.
        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);

        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * const newItems = swizzleSets[j][i];
            // Combine as many of the new items as possible into the pending group.
            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }

        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);

        outputProduced = b->CreateAdd(outputProduced, newItemCount);
    }

    if (flush) { // in case we selected the overflow group on the final iteration
        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
        }
    } else {
        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
        }
        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
        b->setScalarField("pendingOffset", pendingOffset);
        // unless this is our final stride, don't report partially written fields.
        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
    }

    b->setProducedItemCount("outputSwizzle0", outputProduced);
}

/*
Apply PEXT deletion to the blocks of the input stream set and swizzle the result.

Q: Why is it advantageous to swizzle the PEXTed streams?

A: PEXT doesn't compress streams: if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.

Example:
Input:     11101101
PEXT mask: 11110000
Output:    00001110

PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
cleared. This has implications if we're working with multiple streams.

For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
(I think this diagram is backwards; PEXTed bits should be stored in lower-order bits, not higher.)
Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
Stream 2:   qrstuv00 wxy00000 z1000000 23456000
Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000

If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 8-bit field of each
stream should be compressed by 2 bits, the second by 5, etc. If we swizzle the streams with a swizzle factor
of 4 we have a much easier time:

The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).

Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
Swizzle 4:  lmnop000 23456000 LMNOP000 23456000

Now we can compress every field of swizzle 1 by the same 2 bits, every field of swizzle 2 by the same 5 bits, etc.
Once we've completed the compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!

Args:
    selectors: the mask of bit positions to keep (the inverse of the deletion mask). It is broken into
        PEXT-width pieces; each piece is applied to the corresponding field of every input stream block.

Returns:
    output (vector of vectors of Value*): swizzled, PEXTed version of the input stream set. See example above.
*/

std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {

    Constant * pext = nullptr;
    if (mPEXTWidth == 64) {
        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
    } else if (mPEXTWidth == 32) {
        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
    }

    Value * const m = b->fwCast(mPEXTWidth, selectors);

    std::vector<Value *> masks(mSwizzleFactor);
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        masks[i] = b->CreateExtractElement(m, i);
    }

    std::vector<std::vector<Value *>> swizzleSets;
    swizzleSets.reserve(mSwizzleSetCount);

    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);

    UndefValue * const outputInitializer = UndefValue::get(vecTy);

    std::vector<Value *> input(mSwizzleFactor);
    // For each of the k swizzle sets required to apply PEXT to all input streams
    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {

        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
            const unsigned k = (i * mSwizzleFactor) + j;
            if (k < mStreamCount) {
                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
            } else {
                input[j] = Constant::getNullValue(vecTy);
            }
        }

        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
        // then splat the pext mask and apply it to each output row

        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
        // For each of the input streams
        for (unsigned j = 0; j < mSwizzleFactor; j++) {
            for (unsigned k = 0; k < mSwizzleFactor; k++) {
                // Load block j,k
                Value * const field = b->CreateExtractElement(input[j], k);
                // Apply PEXT deletion
                Value * const selected = b->CreateCall(pext, {field, masks[k]});
                // Then store it as our k,j-th output
                output[k] = b->CreateInsertElement(output[k], selected, j);
            }
        }

        swizzleSets.emplace_back(output);
    }

    return swizzleSets;
}
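
// In effect, makeSwizzleSets transposes the field-wise view of each group of mSwizzleFactor
// streams: with 4 streams s0..s3 per set and 64-bit fields, swizzleSets[0][k] holds
// <pext(s0 field k, mask k), pext(s1 field k, mask k), pext(s2 field k, mask k),
//  pext(s3 field k, mask k)>, matching the swizzle layout described in the comment above.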

// Apply deletion to a set of stream_count input streams, producing a set of output streams
// in which each PEXT-width field has been compressed individually.
// Kernel inputs: stream_count data streams plus one del_mask stream
// Outputs: the field-compressed streams, plus a count of retained units per field

void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
    generateProcessingLoop(kb, delMask);
}

void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
    generateProcessingLoop(kb, delMask);
}

void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
    Constant * PEXT_func = nullptr;
    if (mPEXTWidth == 64) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
    } else if (mPEXTWidth == 32) {
        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
    }
    std::vector<Value *> masks(mSwizzleFactor);
    Value * const m = kb->fwCast(mPEXTWidth, kb->simd_not(delMask));
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        masks[i] = kb->CreateExtractElement(m, i);
    }

    for (unsigned i = 0; i < mStreamCount; ++i) {
        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
        Value * value = kb->fwCast(mPEXTWidth, input);
        Value * output = UndefValue::get(value->getType());
        for (unsigned j = 0; j < mSwizzleFactor; j++) {
            Value * field = kb->CreateExtractElement(value, j);
            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
            output = kb->CreateInsertElement(output, compressed, j);
        }
        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
    }
    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
}

DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
                  Binding{b->getStreamSetTy(), "delMaskSet"}},
              {}, {}, {}, {})
, mDelCountFieldWidth(fw)
, mStreamCount(streamCount)
, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
, mPEXTWidth(PEXT_width) {
    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
}


//
// This kernel performs final stream compression for a set of N bitstreams, given
// (a) a set of bitstreams partially compressed within K-bit fields and stored
//     in K-bit swizzled form, and
// (b) a stream of deletion/extraction counts per K-bit stride.
//
// Restrictions:  At present, only K=64 is supported.
//                At present, N must be an exact multiple of BLOCK_SIZE/K.
//
// The kernel always consumes full blocks of input and emits data into the output
// buffer in swizzles of K items at a time.  Upon completion of a segment,
// up to K-1 pending output items per stream may be stored in the kernel state.
//
// Note that both input streams and output streams are stored in swizzled form.
//

SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
, mBitStreamCount(bitStreamCount)
, mFieldWidth(fieldWidth)
, mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
, mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
    assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
    assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
    mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
    mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
    addScalar(kb->getBitBlockType(), "pendingSwizzleData0");
    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
        addScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    }
    addScalar(kb->getSizeTy(), "pendingOffset");
}

void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {

    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());

    // Output is written and committed to the output buffer one swizzle at a time.
    //
    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));

    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);

    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
    Value * pendingOffset = kb->getScalarField("pendingOffset");
    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    std::vector<Value *> outputStreamPtr;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
    }

    // Generate code for each of the mSwizzleFactor fields making up a block.
    // We load the count for the field and process all swizzle groups accordingly.
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
            // Combine as many of the new items as possible into the pending group.
            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, kb->simd_fill(mFieldWidth, pendingOffset)));
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }
        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
    }
    kb->setScalarField("pendingOffset", pendingOffset);
    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    }
    kb->setProducedItemCount("outputSwizzle0", produced);
}
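
// Worked example of the pending/overflow bookkeeping above (fieldWidth == 64): suppose
// pendingOffset == 50 and the next field's newItemCount == 30.  Then pendingSpace == 14,
// so the low 14 new items fill out the current output swizzle (combinedGroup is stored),
// the remaining 16 items become the new pending data (overFlowGroup), outputIndex
// advances by one, and the new pendingOffset is (50 + 30) mod 64 == 16.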

void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
    CreateDoBlockMethodCall(kb);
    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));

    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
    Value * pendingOffset = kb->getScalarField("pendingOffset");

    // Write the pending data.
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
    }
    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
}

void StreamFilterCompiler::makeCall(parabix::StreamSetBuffer * mask, parabix::StreamSetBuffer * inputs, parabix::StreamSetBuffer * outputs) {
    if (mBufferBlocks == 0) {
        llvm::report_fatal_error("StreamFilterCompiler needs a non-zero bufferBlocks parameter (for now).");
    }
    auto & iBuilder = mDriver.getBuilder();
    unsigned N = IDISA::getNumOfStreams(ssType);
    if (IDISA::getStreamFieldWidth(ssType) != 1) {
        llvm::report_fatal_error("StreamFilterCompiler only compresses bit streams (for now)");
    }
    Kernel * compressK = nullptr;
    if (AVX2_available()) {
        compressK = mDriver.addKernelInstance<PEXTFieldCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
    } else {
        compressK = mDriver.addKernelInstance<FieldCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
    }
    parabix::StreamSetBuffer * compressedFields = mDriver.addBuffer<parabix::CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(N), mBufferBlocks);
    parabix::StreamSetBuffer * unitCounts = mDriver.addBuffer<parabix::CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(), mBufferBlocks);

    mDriver.makeKernelCall(compressK, {inputs, mask}, {compressedFields, unitCounts});

    Kernel * streamK = mDriver.addKernelInstance<StreamCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
    mDriver.makeKernelCall(streamK, {compressedFields, unitCounts}, {outputs});
}
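
// Illustrative (hypothetical) use of the two-stage filter pipeline built by makeCall,
// assuming a StreamFilterCompiler instance has already been constructed elsewhere (its
// constructor is declared in the header, not in this file) and that the three buffers
// name a 1-bit extraction mask and the source/filtered stream sets:
//     filterCompiler.makeCall(extractionMask, sourceStreams, filteredStreams);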


}