source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6135

Last change on this file since 6135 was 6094, checked in by cameron, 11 months ago

mvmd_compress 32 for SSE2

File size: 45.2 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <toolchain/driver.h>
8#include <toolchain/cpudriver.h>
9#include <kernels/kernel_builder.h>
10#include <llvm/Support/raw_ostream.h>
11#include <IR_Gen/idisa_target.h>
12#include <llvm/IR/Intrinsics.h>
13
14using namespace llvm;
15
16namespace kernel {
17
// Compute the per-round movement masks for bit-parallel deletion within each
// fw-bit field, following the parallel-prefix compress technique (cf.
// Hacker's Delight, section 7-4, "compress", adapted to these shift
// directions).  One mask is produced per power-of-two shift (1, 2, 4, ...,
// fw/2); mask i selects the bits that must move by 2^i during round i of
// apply_parallel_prefix_deletion.  The masks depend only on del_mask, so they
// can be computed once and reused across every stream in a stream set.
inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
    Value * m = kb->simd_not(del_mask);          // bits that survive deletion
    Value * mk = kb->simd_slli(fw, del_mask, 1); // bits with a deleted bit immediately to the right
    std::vector<Value *> move_masks;
    for (unsigned shift = 1; shift < fw; shift *= 2) {
        // mp: parallel-suffix XOR of mk — parity of deletions to the right of each position.
        Value * mp = mk;
        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
        }
        // mv: surviving bits that move by 'shift' in this round.
        Value * mv = kb->simd_and(mp, m);
        // Simulate the move on the survivor mask so the next round sees updated positions.
        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
        mk = kb->simd_and(mk, kb->simd_not(mp));
        move_masks.push_back(mv);
    }
    return move_masks;
}
34
35inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
36    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
37    for (unsigned i = 0; i < mv.size(); i++) {
38        unsigned shift = 1 << i;
39        Value * t = kb->simd_and(s, mv[i]);
40        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
41    }
42    return s;
43}
44
45// Apply deletion to a set of stream_count input streams to produce a set of output streams.
46// Kernel inputs: stream_count data streams plus one del_mask stream
47// Outputs: the deleted streams, plus a partial sum popcount
48
49void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
50    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
51    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
52    for (unsigned j = 0; j < mStreamCount; ++j) {
53        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
54        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
55        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
56    }
57    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
58    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
59}
60
61void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
62    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
63    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
64    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
65    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
66    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
67    for (unsigned j = 0; j < mStreamCount; ++j) {
68        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
69        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
70        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
71    }
72    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
73    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
74}
75
// DeletionKernel: block-oriented kernel that removes the positions marked in
// the one-stream "delMaskSet" input from each of streamCount parallel data
// streams.  fieldWidth selects the bit-parallel deletion field size.  The
// "unitCounts" output carries per-field popcounts of surviving units and is
// produced at a fixed rate, rounded up to a whole block.
DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                      {}, {}, {})
, mDeletionFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
}
86
87void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
88    BasicBlock * entry = kb->GetInsertBlock();
89    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
90    BasicBlock * done = kb->CreateBasicBlock("done");
91    Constant * const ZERO = kb->getSize(0);
92    kb->CreateBr(processBlock);
93    kb->SetInsertPoint(processBlock);
94    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
95    blockOffsetPhi->addIncoming(ZERO, entry);
96    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
97    Value * delMask = kb->simd_not(extractionMask);
98    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
99    for (unsigned j = 0; j < mStreamCount; ++j) {
100        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
101        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
102        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
103    }
104#ifndef STREAM_COMPRESS_USING_EXTRACTION_MASK
105    Value * unitCount = kb->simd_popcount(mCompressFieldWidth, extractionMask);
106    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), blockOffsetPhi, kb->bitCast(unitCount));
107#endif
108    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
109    blockOffsetPhi->addIncoming(nextBlk, processBlock);
110    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
111    kb->CreateCondBr(moreToDo, processBlock, done);
112    kb->SetInsertPoint(done);
113}
114
// FieldCompressKernel: multi-block kernel that compresses streamCount parallel
// streams within fieldWidth-bit fields, keeping the positions selected by the
// "extractionMask" input.  Unless STREAM_COMPRESS_USING_EXTRACTION_MASK is
// defined, it also produces a "unitCounts" stream of per-field popcounts
// (fixed rate, rounded up to a whole block) for the stream-compression stage.
FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("fieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                          Binding{kb->getStreamSetTy(), "extractionMask"}},
#ifdef STREAM_COMPRESS_USING_EXTRACTION_MASK
                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
#else
                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                       Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
#endif
                      {}, {}, {})
, mCompressFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
}
129
130void PEXTFieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
131    Type * fieldTy = kb->getIntNTy(mPEXTWidth);
132    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
133    Constant * PEXT_func = nullptr;
134    Constant * popc_func = Intrinsic::getDeclaration(getModule(), Intrinsic::ctpop, fieldTy);
135    if (mPEXTWidth == 64) {
136        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
137    } else if (mPEXTWidth == 32) {
138        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
139    }
140    BasicBlock * entry = kb->GetInsertBlock();
141    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
142    BasicBlock * done = kb->CreateBasicBlock("done");
143    Constant * const ZERO = kb->getSize(0);
144    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPEXTWidth;
145    kb->CreateBr(processBlock);
146    kb->SetInsertPoint(processBlock);
147    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
148    blockOffsetPhi->addIncoming(ZERO, entry);
149    std::vector<Value *> mask(fieldsPerBlock);
150    Value * extractionMaskPtr = kb->getInputStreamBlockPtr("extractionMask", ZERO, blockOffsetPhi);
151    extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
152#ifndef STREAM_COMPRESS_USING_EXTRACTION_MASK
153    Value * unitCountPtr = kb->getOutputStreamBlockPtr("unitCounts", ZERO, blockOffsetPhi);
154    unitCountPtr = kb->CreatePointerCast(unitCountPtr, fieldPtrTy);
155    for (unsigned i = 0; i < fieldsPerBlock; i++) {
156        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
157        Value * popc = kb->CreateCall(popc_func, mask[i]);
158        kb->CreateStore(popc, kb->CreateGEP(unitCountPtr, kb->getInt32(i)));
159    }
160#else
161    for (unsigned i = 0; i < fieldsPerBlock; i++) {
162        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
163    }
164#endif
165    for (unsigned j = 0; j < mStreamCount; ++j) {
166        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
167        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
168        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
169        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
170        for (unsigned i = 0; i < fieldsPerBlock; i++) {
171            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
172            Value * compressed = kb->CreateCall(PEXT_func, {field, mask[i]});
173            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
174        }
175    }
176    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
177    blockOffsetPhi->addIncoming(nextBlk, processBlock);
178    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
179    kb->CreateCondBr(moreToDo, processBlock, done);
180    kb->SetInsertPoint(done);
181}
182
// PEXTFieldCompressKernel: like FieldCompressKernel but uses the x86 BMI2
// PEXT instruction instead of the bit-parallel shift method.  fieldWidth is
// the PEXT operand width and must be 32 or 64 (enforced below).  The output
// bindings mirror FieldCompressKernel, including the optional "unitCounts"
// stream when STREAM_COMPRESS_USING_EXTRACTION_MASK is not defined.
PEXTFieldCompressKernel::PEXTFieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("PEXTfieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                   {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                       Binding{kb->getStreamSetTy(), "extractionMask"}},
#ifdef STREAM_COMPRESS_USING_EXTRACTION_MASK
                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
#else
                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                       Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
#endif
                   {}, {}, {})
, mPEXTWidth(fieldWidth)
, mStreamCount(streamCount) {
    // Only the 32- and 64-bit PEXT intrinsics exist; fail fast on anything else.
    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PEXT width for PEXTFieldCompressKernel");
}
198   
// StreamCompressKernel: consumes the field-compressed (still sparse) output of
// a field compression kernel and packs the surviving bits densely.  Depending
// on the build configuration its second input is either the extraction mask
// itself or the precomputed per-field unit counts.  A partially filled output
// block is carried between strides in kernel state: the "pendingItemCount"
// scalar plus one "pendingOutputBlock_i" scalar per stream — hence the
// bounded (data-dependent) output rate.
StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("streamCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                   {Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet"},
#ifdef STREAM_COMPRESS_USING_EXTRACTION_MASK
                       Binding{kb->getStreamSetTy(), "extractionMask"}},
#else
                   Binding{kb->getStreamSetTy(), "unitCounts"}},
#endif
                   {Binding{kb->getStreamSetTy(streamCount), "compressedOutput", BoundedRate(0, 1)}},
                   {}, {}, {})
, mCompressedFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
    addScalar(kb->getSizeTy(), "pendingItemCount");
    for (unsigned i = 0; i < streamCount; i++) {
        addScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
    }

}
217   
218void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
219    const unsigned fw = mCompressedFieldWidth;
220    Type * fwTy = b->getIntNTy(fw);
221    Type * sizeTy = b->getSizeTy();
222    const unsigned numFields = b->getBitBlockWidth()/fw;
223    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(fw));
224    Constant * oneSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, 1));
225    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
226    Constant * numFieldConst = ConstantInt::get(sizeTy, numFields);
227    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
228    Constant * bitBlockWidthConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
229    BasicBlock * entry = b->GetInsertBlock();
230    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
231    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
232    BasicBlock * finalWrite = b->CreateBasicBlock("finalWrite");
233    BasicBlock * updateProducedCount = b->CreateBasicBlock("updateProducedCount");
234    Constant * const ZERO = b->getSize(0);
235   
236    Value * pendingItemCount = b->getScalarField("pendingItemCount");
237    std::vector<Value *> pendingData(mStreamCount);
238    for (unsigned i = 0; i < mStreamCount; i++) {
239        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
240    }
241   
242    b->CreateBr(segmentLoop);
243    // Main Loop
244    b->SetInsertPoint(segmentLoop);
245    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
246    PHINode * outputBlockPhi = b->CreatePHI(b->getSizeTy(), 2);
247    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
248    PHINode * pendingDataPhi[mStreamCount];
249    blockOffsetPhi->addIncoming(ZERO, entry);
250    outputBlockPhi->addIncoming(ZERO, entry);
251    pendingItemsPhi->addIncoming(pendingItemCount, entry);
252    for (unsigned i = 0; i < mStreamCount; i++) {
253        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
254        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
255    }
256#ifdef STREAM_COMPRESS_USING_EXTRACTION_MASK
257    Value * fieldPopCounts = b->simd_popcount(fw, b->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi));
258#else
259    Value * fieldPopCounts = b->loadInputStreamBlock("unitCounts", ZERO, blockOffsetPhi);
260#endif
261    // For each field determine the (partial) sum popcount of all fields up to and
262    // including the current field.
263    Value * partialSum = fieldPopCounts;
264    for (unsigned i = 1; i < numFields; i *= 2) {
265        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
266    }
267    Value * blockPopCount = b->CreateZExtOrTrunc(b->mvmd_extract(fw, partialSum, numFields-1), sizeTy);
268    //
269    // Now determine for each source field the output offset of the first bit.
270    // Note that this depends on the number of pending bits.
271    //
272    Value * pendingOffset = b->CreateURem(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
273    Value * splatPending = b->simd_fill(fw, b->CreateZExtOrTrunc(pendingOffset, fwTy));
274    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
275    Value * offsets = b->simd_add(fw, b->mvmd_slli(fw, partialSum, 1), splatPending);
276    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
277   //
278    // Determine the relative field number for each output field.   Note that the total
279    // number of fields involved is numFields + 1.   However, the first field always
280    // be immediately combined into the current pending data field, so we calculate
281    // field numbers for all subsequent fields, (the fields that receive overflow bits).
282    Value * fieldNo = b->simd_srli(fw, b->simd_add(fw, partialSum, splatPending), std::log2(fw));
283  //
284    // Now process the input data block of each stream in the input stream set.
285    //
286    // First load all the stream set blocks and the pending data.
287    std::vector<Value *> sourceBlock(mStreamCount);
288    for (unsigned i = 0; i < mStreamCount; i++) {
289        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
290    }
291    // Now separate the bits of each field into ones that go into the current field
292    // and ones that go into the overflow field.   Extract the first field separately,
293    // and then shift and combine subsequent fields.
294    std::vector<Value *> pendingOutput(mStreamCount);
295    std::vector<Value *> outputFields(mStreamCount);
296    Value * backShift = b->simd_sub(fw, fwSplat, offsets);
297    for (unsigned i = 0; i < mStreamCount; i++) {
298        Value * currentFieldBits = b->simd_sllv(fw, sourceBlock[i], offsets);
299        Value * nextFieldBits = b->simd_srlv(fw, sourceBlock[i], backShift);
300        Value * firstField = b->mvmd_extract(fw, currentFieldBits, 0);
301        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
302        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
303        // shift back currentFieldBits to combine with nextFieldBits.
304        outputFields[i] = b->simd_or(b->mvmd_srli(fw, currentFieldBits, 1), nextFieldBits);
305    }
306    // Now combine forward all fields with the same field number.  This may require
307    // up to log2 numFields steps.
308    for (unsigned j = 1; j < numFields; j*=2) {
309        Value * select = b->simd_eq(fw, fieldNo, b->mvmd_slli(fw, fieldNo, j));
310        for (unsigned i = 0; i < mStreamCount; i++) {
311            Value * fields_fwd = b->mvmd_slli(fw, outputFields[i], j);
312            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
313       }
314    }
315    // Now compress the data fields, eliminating all but the last field from
316    // each run of consecutive field having the same field number as a subsequent field.
317    // But it may be that last field number is 0 which will compare equal to a 0 shifted in.
318    // So we add 1 to field numbers first.
319    Value * nonZeroFieldNo = b->simd_add(fw, fieldNo, oneSplat);
320    Value * eqNext = b->simd_eq(fw, nonZeroFieldNo, b->mvmd_srli(fw, nonZeroFieldNo, 1));
321    Value * compressMask = b->hsimd_signmask(fw, b->simd_not(eqNext));
322    for (unsigned i = 0; i < mStreamCount; i++) {
323        outputFields[i] = b->mvmd_compress(fw, outputFields[i], compressMask);
324    }
325    //
326    // Finally combine the pendingOutput and outputField data.
327    // (a) shift forward outputField data to fill the pendingOutput values.
328    // (b) shift back outputField data to clear data added to pendingOutput.
329    //
330    // However, we may need to increment pendingFieldIndex if we previously
331    // filled the field with the extracted firstField value.  The first
332    // value of the fieldNo vector will be 0 or 1.
333    // It is possible that pendingFieldIndex will reach the total number
334    // of fields held in register.  mvmd_sll may not handle this if it
335    // translates to an LLVM shl.
336    Value * increment = b->CreateZExtOrTrunc(b->mvmd_extract(fw, fieldNo, 0), sizeTy);
337    pendingFieldIdx = b->CreateAdd(pendingFieldIdx, increment);
338    Value * const pendingSpaceFilled = b->CreateICmpEQ(pendingFieldIdx, numFieldConst);
339    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
340    for (unsigned i = 0; i < mStreamCount; i++) {
341        Value * outputFwd = b->fwCast(fw, b->mvmd_sll(fw, outputFields[i], pendingFieldIdx));
342        outputFwd = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
343        pendingOutput[i] = b->simd_or(pendingOutput[i], outputFwd);
344        outputFields[i] = b->mvmd_srl(fw, outputFields[i], shftBack);
345    }
346    //
347    // Write the pendingOutput data to outputStream.
348    // Note: this data may be overwritten later, but we avoid branching.
349    for (unsigned i = 0; i < mStreamCount; i++) {
350        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
351    }
352    // Now determine the total amount of pending items and whether
353    // the pending data all fits within the pendingOutput.
354    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
355    Value * doesFit = b->CreateICmpULT(newPending, bitBlockWidthConst);
356    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, bitBlockWidthConst));
357    //
358    // Prepare Phi nodes for the next iteration.
359    //
360    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
361    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
362    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, b->getSize(1));
363    // But don't advance the output if all the data does fit into pendingOutput.
364    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
365    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
366    pendingItemsPhi->addIncoming(newPending, segmentLoop);
367
368    for (unsigned i = 0; i < mStreamCount; i++) {
369        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(fw, pendingOutput[i]), b->fwCast(fw, outputFields[i]));
370        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
371    }
372    //
373    // Now continue the loop if there are more blocks to process.
374    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
375    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);
376   
377    b->SetInsertPoint(segmentDone);
378    // Save kernel state.
379    b->setScalarField("pendingItemCount", newPending);
380    for (unsigned i = 0; i < mStreamCount; i++) {
381        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
382    }
383    b->CreateCondBr(mIsFinal, finalWrite, updateProducedCount);
384    b->SetInsertPoint(finalWrite);
385    for (unsigned i = 0; i < mStreamCount; i++) {
386        //Value * pending = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
387        Value * pending = b->bitCast(pendingOutput[i]);
388        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), nextOutputBlk, pending);
389    }
390    b->CreateBr(updateProducedCount);
391    b->SetInsertPoint(updateProducedCount);
392    Value * produced = b->getProducedItemCount("compressedOutput");
393    produced = b->CreateAdd(produced, b->CreateMul(nextOutputBlk, bitBlockWidthConst));
394    produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, newPending), produced);
395    b->setProducedItemCount("compressedOutput", produced);
396}
397
// SwizzledDeleteByPEXTkernel: PEXT-based deletion that emits its output in
// swizzled form (one output stream set per group of mSwizzleFactor input
// streams) so that a later compression/unswizzle stage can pack the streams
// efficiently.  PEXT_width must be 32 or 64.  Per-group pending data and a
// pending bit offset are carried between blocks in kernel scalars.
SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
                  {}, {}, {}, {})
, mStreamCount(streamCount)
, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
// to prevent rounding errors.
, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
, mPEXTWidth(PEXT_width)
{
    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
        && "mDelCountFieldWidth must be a power of 2");
    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");

    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);

    // Output bindings are added dynamically: one swizzle group plus one
    // pending-data scalar per mSwizzleSetCount; all groups share the rate of
    // the first group.
    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    }
    addScalar(b->getSizeTy(), "pendingOffset");
}
425
426void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
427    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
428    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
429    generateProcessingLoop(b, delMask, false);
430}
431
432void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
433    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
434    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
435    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
436    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
437    generateProcessingLoop(b, delMask, true);
438}
439
440/*
441What this function does in pseudo code:
442for (mSwizzleFactor)
443    create a swizzle set containing mSwizzleFactor blocks
444    apply PEXT to each block in the swizzle set
445    store the swizzleSet in PEXTedSwizzleSets vector
446
447for (each swizzle row i)
448    for (each swizzle set j)
449        processes row i in swizzle set j
450        store output in pendingData[j]
451*/
452
// Core deletion loop shared by the do-block and final-block methods.
// PEXT-compresses every stream against the keep-mask, then packs the
// variable-length per-field results into the swizzled output groups,
// carrying partially filled output fields in kernel scalars between calls.
// When 'flush' is set (final block), the pending data is forced out.
void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {

    // selectors marks the positions we want to keep
    Value * const selectors = b->CreateNot(delMask);

    const auto swizzleSets = makeSwizzleSets(b, selectors);

    // Compress the PEXTedSwizzleSets
    // Output is written and committed to the output buffer one swizzle at a time.
    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);

    // All output groups have the same count.
    // Resume from the exact bit position reached by the previous stride:
    // produced count plus the not-yet-reported pending offset.
    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);

    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
    }

    // Per-field popcounts of the keep-mask: how many items each field contributes.
    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);

    // For each row i
    for (unsigned i = 0; i < mSwizzleFactor; i++) {

        // Generate code for each of the mSwizzleFactor fields making up a block.
        // We load the count for the field and process all swizzle groups accordingly.
        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);

        // Split the linear output index into a swizzle row and a block offset.
        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * const newItems = swizzleSets[j][i];
            // Combine as many of the new items as possible into the pending group.
            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }

        // Advance the output index only when the pending field was filled.
        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);

        outputProduced = b->CreateAdd(outputProduced, newItemCount);
    }

    if (flush) { // incase we selected the overflow group on the final iteration
        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
        }
    } else {
        // Save the pending state for the next stride.
        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
        }
        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
        b->setScalarField("pendingOffset", pendingOffset);
        // unless this is our final stride, don't report partially written fields.
        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
    }

    b->setProducedItemCount("outputSwizzle0", outputProduced);
}
537
538/*
539Apply PEXT deletion to the blocks in strms and swizzle the result.
540
541Q: Why is it advantageous to swizzle the PEXTed streams?
542
543A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
544
545Example:
546Input:     11101101
547PEXT mask: 11110000
548Output:    00001110
549
550PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
551cleared. This has implications if we're working with multiple streams.
552
553For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
554(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
555Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
556Stream 2:   qrstuv00 wxy00000 z1000000 23456000
557Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
558Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
559
If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 8-bit field of each stream
should be compressed by 2 bits, the second by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
time:
563
564The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
565
566Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
567Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
568Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
569Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
570
Now we can compress each 8-bit field of swizzle 1 by 2, each 8-bit field of swizzle 2 by 5, etc. Once we've completed the
compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
573
574Args:
575    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
576    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
577        sequentially to PEXT a full block.)
578
579Returns:
580    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
581*/
582
583std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {
584
585    Constant * pext = nullptr;
586    if (mPEXTWidth == 64) {
587        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
588    } else if (mPEXTWidth == 32) {
589        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
590    }
591
592    Value * const m = b->fwCast(mPEXTWidth, selectors);
593
594    std::vector<Value *> masks(mSwizzleFactor);
595    for (unsigned i = 0; i < mSwizzleFactor; i++) {
596        masks[i] = b->CreateExtractElement(m, i);
597
598    }
599
600    std::vector<std::vector<Value *>> swizzleSets;
601    swizzleSets.reserve(mSwizzleSetCount);
602
603    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
604
605    UndefValue * const outputInitializer = UndefValue::get(vecTy);
606
607    std::vector<Value *> input(mSwizzleFactor);
608    // For each of the k swizzle sets required to apply PEXT to all input streams
609    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
610
611        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
612            const unsigned k = (i * mSwizzleFactor) + j;
613            if (k < mStreamCount) {
614                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
615            } else {
616                input[j] = Constant::getNullValue(vecTy);
617            }
618        }
619
620        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
621        // then splat the pext mask and apply it to each output row
622
623        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
624        // For each of the input streams
625        for (unsigned j = 0; j < mSwizzleFactor; j++) {
626            for (unsigned k = 0; k < mSwizzleFactor; k++) {
627                // Load block j,k
628                Value * const field = b->CreateExtractElement(input[j], k);
629                // Apply PEXT deletion
630                Value * const selected = b->CreateCall(pext, {field, masks[k]});
631                // Then store it as our k,j-th output
632                output[k] = b->CreateInsertElement(output[k], selected, j);
633            }
634        }
635
636        swizzleSets.emplace_back(output);
637    }
638
639    return swizzleSets;
640}
641
642// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
643// Kernel inputs: stream_count data streams plus one del_mask stream
644// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
645
646void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
647    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
648    generateProcessingLoop(kb, delMask);
649}
650
651void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
652    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
653    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
654    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
655    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
656    generateProcessingLoop(kb, delMask);
657}
658
659void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
660    Constant * PEXT_func = nullptr;
661    if (mPEXTWidth == 64) {
662        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
663    } else if (mPEXTWidth == 32) {
664        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
665    }
666    std::vector<Value *> masks(mSwizzleFactor);
667    Value * const m = kb->fwCast(mPEXTWidth, kb->simd_not(delMask));
668    for (unsigned i = 0; i < mSwizzleFactor; i++) {
669        masks[i] = kb->CreateExtractElement(m, i);
670    }
671
672    for (unsigned i = 0; i < mStreamCount; ++i) {
673        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
674        Value * value = kb->fwCast(mPEXTWidth, input);
675        Value * output = UndefValue::get(value->getType());
676        for (unsigned j = 0; j < mSwizzleFactor; j++) {
677            Value * field = kb->CreateExtractElement(value, j);
678            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
679            output = kb->CreateInsertElement(output, compressed, j);
680        }
681        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
682    }
683    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
684    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
685}
686
// Constructor for the PEXT-based deletion kernel.
//   fw          - field width used for the per-field deletion counts.
//   streamCount - number of parallel bit streams to compress.
//   PEXT_width  - width (in bits) of each PEXT operation; the block is
//                 processed as bitBlockWidth/PEXT_width independent fields.
// Inputs: streamCount data streams plus one deletion-mask stream.
DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
                  Binding{b->getStreamSetTy(), "delMaskSet"}},
              {}, {}, {}, {})
, mDelCountFieldWidth(fw)
, mStreamCount(streamCount)
, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
, mPEXTWidth(PEXT_width) {
    // Output rate: one item per non-deleted input position (popcount of ~delMask).
    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
}
699
700   
701//
702// This kernel performs final stream compression for a set of N bitstreams, given
703// (a) a set of bitstreams partially compressed within K-bit fields and stored
704//     in K-bit swizzled form, and
705// (b) a stream of deletion/extraction counts per K-bit stride.
706//
707// Restrictions:  At present, only K=64 is supported.
708//                At present, N must be an exact multiple of BLOCK_SIZE/K.
709//
710// The kernel always consumes full blocks of input and emits data into the output
711// buffer in swizzles of K items at a time.   Upon completion of a segment,
712// up to K-1 pending output items per stream may be stored in the kernel state.
713//
// Note that both input streams and output streams are stored in swizzled form.
715//
716
717SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
718: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
719                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
720, mBitStreamCount(bitStreamCount)
721    , mFieldWidth(fieldWidth)
722    , mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
723    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
724        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
725        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
726        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
727        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
728        addScalar(kb->getBitBlockType(), "pendingSwizzleData0");
729        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
730            mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
731            mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
732            addScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
733        }
734        addScalar(kb->getSizeTy(), "pendingOffset");
735}
736   
// Compress the swizzled input streams according to the per-field unit counts,
// packing the surviving bits contiguously into the output swizzle streams.
// Up to mFieldWidth-1 residual bits per stream are carried between blocks in
// the pendingSwizzleData scalars, with pendingOffset recording how many.
void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {

    // View the counts block as an array of mSwizzleFactor mFieldWidth-bit integers.
    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());

    // Output is written and committed to the output buffer one swizzle at a time.
    //
    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));

    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
    // Index of the next swizzle slot to write within the current output block.
    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);

    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
    Value * pendingOffset = kb->getScalarField("pendingOffset");
    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    std::vector<Value *> outputStreamPtr;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
    }

    // Generate code for each of the mSwizzleFactor fields making up a block.
    // We load the count for the field and process all swizzle groups accordingly.
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
        // Room left (in bits) in the current pending field of each stream.
        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
            // Combine as many of the new items as possible into the pending group.
            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, kb->simd_fill(mFieldWidth, pendingOffset)));
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }
        // Advance the output index only when a full swizzle was committed.
        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
        // New bit position within the pending field, modulo mFieldWidth.
        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
    }
    kb->setScalarField("pendingOffset", pendingOffset);
    // Only fully committed swizzles are reported as produced; residual bits
    // stay buffered in the pendingSwizzleData scalars until the next stride.
    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    }
    kb->setProducedItemCount("outputSwizzle0", produced);
}
792
793void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
794    CreateDoBlockMethodCall(kb);
795    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
796    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
797   
798    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
799    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
800    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
801    Value * pendingOffset = kb->getScalarField("pendingOffset");
802
803    // Write the pending data.
804    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
805        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
806        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
807        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
808    }
809    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
810}
811   
812void StreamFilterCompiler::makeCall(parabix::StreamSetBuffer * mask, parabix::StreamSetBuffer * inputs, parabix::StreamSetBuffer * outputs) {
813    if (mBufferBlocks == 0) {
814        llvm::report_fatal_error("StreamFilterCompiler needs a non-zero bufferBlocks parameter (for now).");
815    }
816    auto & iBuilder = mDriver.getBuilder();
817    unsigned N = IDISA::getNumOfStreams(ssType);
818    if (IDISA::getStreamFieldWidth(ssType) != 1) {
819        llvm::report_fatal_error("StreamFilterCompiler only compresses bit streams (for now)");
820    }
821    Kernel * compressK = nullptr;
822    if (AVX2_available()) {
823        compressK = mDriver.addKernelInstance<PEXTFieldCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
824    } else {
825        compressK = mDriver.addKernelInstance<FieldCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
826    }
827    parabix::StreamSetBuffer * compressedFields = mDriver.addBuffer<parabix::StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(N), mBufferBlocks);
828#ifndef STREAM_COMPRESS_USING_EXTRACTION_MASK
829    parabix::StreamSetBuffer * unitCounts = mDriver.addBuffer<parabix::StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(), mBufferBlocks);
830   
831    mDriver.makeKernelCall(compressK, {inputs, mask}, {compressedFields, unitCounts});
832   
833    Kernel * streamK = mDriver.addKernelInstance<StreamCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
834    mDriver.makeKernelCall(streamK, {compressedFields, unitCounts}, {outputs});
835#else
836    mDriver.makeKernelCall(compressK, {inputs, mask}, {compressedFields});
837   
838    Kernel * streamK = mDriver.addKernelInstance<StreamCompressKernel>(iBuilder, mIntraFieldCompressionWidth, N);
839    mDriver.makeKernelCall(streamK, {compressedFields, mask}, {outputs});
840#endif
841}
842
843
844}
Note: See TracBrowser for help on using the repository browser.