source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6014

Last change on this file since 6014 was 6014, checked in by cameron, 12 months ago

Stream deletion bug fixes

File size: 38.7 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
15    Value * m = kb->simd_not(del_mask);
16    Value * mk = kb->simd_slli(fw, del_mask, 1);
17    std::vector<Value *> move_masks;
18    for (unsigned shift = 1; shift < fw; shift *= 2) {
19        Value * mp = mk;
20        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
21            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
22        }
23        Value * mv = kb->simd_and(mp, m);
24        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
25        mk = kb->simd_and(mk, kb->simd_not(mp));
26        move_masks.push_back(mv);
27    }
28    return move_masks;
29}
30
31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
32    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
33    for (unsigned i = 0; i < mv.size(); i++) {
34        unsigned shift = 1 << i;
35        Value * t = kb->simd_and(s, mv[i]);
36        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
37    }
38    return s;
39}
40
41// Apply deletion to a set of stream_count input streams to produce a set of output streams.
42// Kernel inputs: stream_count data streams plus one del_mask stream
43// Outputs: the deleted streams, plus a partial sum popcount
44
45void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
46    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
47    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
48    for (unsigned j = 0; j < mStreamCount; ++j) {
49        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
50        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
51        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
52    }
53    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
54    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
55}
56
57void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
58    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
59    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
60    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
61    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
62    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
63    for (unsigned j = 0; j < mStreamCount; ++j) {
64        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
65        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
66        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
67    }
68    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
69    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
70}
71
72DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
73: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
74                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
75                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
76                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
77                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
78                      {}, {}, {})
79, mDeletionFieldWidth(fieldWidth)
80, mStreamCount(streamCount) {
81}
82
83void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
84    BasicBlock * entry = kb->GetInsertBlock();
85    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
86    BasicBlock * done = kb->CreateBasicBlock("done");
87    Constant * const ZERO = kb->getSize(0);
88    kb->CreateBr(processBlock);
89    kb->SetInsertPoint(processBlock);
90    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
91    blockOffsetPhi->addIncoming(ZERO, entry);
92    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
93    Value * delMask = kb->simd_not(extractionMask);
94    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
95    for (unsigned j = 0; j < mStreamCount; ++j) {
96        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
97        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
98        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
99    }
100    Value * unitCount = kb->simd_popcount(mCompressFieldWidth, extractionMask);
101    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), blockOffsetPhi, kb->bitCast(unitCount));
102    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
103    blockOffsetPhi->addIncoming(nextBlk, processBlock);
104    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
105    kb->CreateCondBr(moreToDo, processBlock, done);
106    kb->SetInsertPoint(done);
107}
108
109FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
110: MultiBlockKernel("fieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
111                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
112                          Binding{kb->getStreamSetTy(), "extractionMask"}},
113                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
114                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
115                      {}, {}, {})
116, mCompressFieldWidth(fieldWidth)
117, mStreamCount(streamCount) {
118}
119
120StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
121: MultiBlockKernel("streamCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
122                   {Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet"},
123                       Binding{kb->getStreamSetTy(), "unitCounts"}},
124                   {Binding{kb->getStreamSetTy(streamCount), "compressedOutput", BoundedRate(0, 1)}},
125                   {}, {}, {})
126, mCompressedFieldWidth(fieldWidth)
127, mStreamCount(streamCount) {
128    addScalar(kb->getSizeTy(), "pendingItemCount");
129    for (unsigned i = 0; i < streamCount; i++) {
130        addScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
131    }
132
133}
134   
135void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
136    const unsigned fw = mCompressedFieldWidth;
137    Type * fwTy = b->getIntNTy(fw);
138    Type * sizeTy = b->getSizeTy();
139    const unsigned numFields = b->getBitBlockWidth()/fw;
140    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(fw));
141    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
142    Constant * numFieldConst = ConstantInt::get(sizeTy, numFields);
143    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
144    Constant * bitBlockWidthConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
145    BasicBlock * entry = b->GetInsertBlock();
146    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
147    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
148    BasicBlock * finalWrite = b->CreateBasicBlock("finalWrite");
149    BasicBlock * updateProducedCount = b->CreateBasicBlock("updateProducedCount");
150    Constant * const ZERO = b->getSize(0);
151   
152    Value * pendingItemCount = b->getScalarField("pendingItemCount");
153    std::vector<Value *> pendingData(mStreamCount);
154    for (unsigned i = 0; i < mStreamCount; i++) {
155        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
156    }
157   
158    b->CreateBr(segmentLoop);
159    // Main Loop
160    b->SetInsertPoint(segmentLoop);
161    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
162    PHINode * outputBlockPhi = b->CreatePHI(b->getSizeTy(), 2);
163    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
164    PHINode * pendingDataPhi[mStreamCount];
165    blockOffsetPhi->addIncoming(ZERO, entry);
166    outputBlockPhi->addIncoming(ZERO, entry);
167    pendingItemsPhi->addIncoming(pendingItemCount, entry);
168    for (unsigned i = 0; i < mStreamCount; i++) {
169        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
170        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
171    }
172    Value * fieldPopCounts = b->loadInputStreamBlock("unitCounts", ZERO, blockOffsetPhi);
173    // For each field determine the (partial) sum popcount of all fields up to and
174    // including the current field.
175    Value * partialSum = fieldPopCounts;
176    for (unsigned i = 1; i < numFields; i *= 2) {
177        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
178    }
179    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
180    //
181    // Now determine for each source field the output offset of the first bit.
182    // Note that this depends on the number of pending bits.
183    //
184    Value * pendingOffset = b->CreateURem(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
185    Value * splatPending = b->simd_fill(fw, b->CreateZExtOrTrunc(pendingOffset, fwTy));
186    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
187    Value * offsets = b->simd_add(fw, b->mvmd_slli(fw, partialSum, 1), splatPending);
188    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
189   //
190    // Determine the relative field number for each output field.   Note that the total
191    // number of fields involved is numFields + 1.   However, the first field always
192    // be immediately combined into the current pending data field, so we calculate
193    // field numbers for all subsequent fields, (the fields that receive overflow bits).
194    Value * fieldNo = b->simd_srli(fw, b->simd_add(fw, partialSum, splatPending), std::log2(fw));
195  //
196    // Now process the input data block of each stream in the input stream set.
197    //
198    // First load all the stream set blocks and the pending data.
199    std::vector<Value *> sourceBlock(mStreamCount);
200    for (unsigned i = 0; i < mStreamCount; i++) {
201        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
202    }
203    // Now separate the bits of each field into ones that go into the current field
204    // and ones that go into the overflow field.   Extract the first field separately,
205    // and then shift and combine subsequent fields.
206    std::vector<Value *> pendingOutput(mStreamCount);
207    std::vector<Value *> outputFields(mStreamCount);
208    Value * backShift = b->simd_sub(fw, fwSplat, offsets);
209    for (unsigned i = 0; i < mStreamCount; i++) {
210        Value * currentFieldBits = b->simd_sllv(fw, sourceBlock[i], offsets);
211        Value * nextFieldBits = b->simd_srlv(fw, sourceBlock[i], backShift);
212        Value * firstField = b->mvmd_extract(fw, currentFieldBits, 0);
213        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
214        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
215        // shift back currentFieldBits to combine with nextFieldBits.
216        outputFields[i] = b->simd_or(b->mvmd_srli(fw, currentFieldBits, 1), nextFieldBits);
217    }
218    // Now combine forward all fields with the same field number.  This may require
219    // up to log2 numFields steps.
220    for (unsigned j = 1; j < numFields; j*=2) {
221        Value * select = b->simd_eq(fw, fieldNo, b->mvmd_slli(fw, fieldNo, j));
222        for (unsigned i = 0; i < mStreamCount; i++) {
223            Value * fields_fwd = b->mvmd_slli(fw, outputFields[i], j);
224            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
225        }
226    }
227    // Now compress the data fields, eliminating all but the last field from
228    // each run of consecutive field having the same field number.
229    // same field number as a subsequent field.
230    Value * eqNext = b->simd_eq(fw, fieldNo, b->mvmd_srli(fw, fieldNo, 1));
231    Value * compressMask = b->hsimd_signmask(fw, b->simd_not(eqNext));
232    for (unsigned i = 0; i < mStreamCount; i++) {
233        outputFields[i] = b->mvmd_compress(fw, outputFields[i], compressMask);
234   }
235    //
236    // Finally combine the pendingOutput and outputField data.
237    // (a) shift forward outputField data to fill the pendingOutput values.
238    // (b) shift back outputField data to clear data added to pendingOutput.
239    //
240    // However, we may need to increment pendingFieldIndex if we previously
241    // filled the field with the extracted firstField value.  The first
242    // value of the fieldNo vector will be 0 or 1.
243    // It is possible that pendingFieldIndex will reach the total number
244    // of fields held in register.  mvmd_sll may not handle this if it
245    // translates to an LLVM shl.
246    Value * increment = b->CreateZExtOrTrunc(b->mvmd_extract(fw, fieldNo, 0), sizeTy);
247    pendingFieldIdx = b->CreateAdd(pendingFieldIdx, increment);
248    Value * const pendingSpaceFilled = b->CreateICmpEQ(pendingFieldIdx, numFieldConst);
249    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
250    for (unsigned i = 0; i < mStreamCount; i++) {
251        Value * outputFwd = b->mvmd_sll(fw, outputFields[i], pendingFieldIdx);
252        outputFwd = b->CreateSelect(pendingSpaceFilled, zeroSplat, outputFwd);
253        pendingOutput[i] = b->simd_or(pendingOutput[i], outputFwd);
254        outputFields[i] = b->mvmd_srl(fw, outputFields[i], shftBack);
255    }
256    //
257    // Write the pendingOutput data to outputStream.
258    // Note: this data may be overwritten later, but we avoid branching.
259    for (unsigned i = 0; i < mStreamCount; i++) {
260        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
261    }
262    // Now determine the total amount of pending items and whether
263    // the pending data all fits within the pendingOutput.
264    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
265    Value * doesFit = b->CreateICmpULT(newPending, bitBlockWidthConst);
266    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, bitBlockWidthConst));
267    //
268    // Prepare Phi nodes for the next iteration.
269    //
270    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
271    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
272    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, b->getSize(1));
273    // But don't advance the output if all the data does fit into pendingOutput.
274    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
275    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
276    pendingItemsPhi->addIncoming(newPending, segmentLoop);
277
278    for (unsigned i = 0; i < mStreamCount; i++) {
279        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(fw, pendingOutput[i]), b->fwCast(fw, outputFields[i]));
280        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
281    }
282    //
283    // Now continue the loop if there are more blocks to process.
284    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
285    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);
286   
287    b->SetInsertPoint(segmentDone);
288    // Save kernel state.
289    b->setScalarField("pendingItemCount", newPending);
290    for (unsigned i = 0; i < mStreamCount; i++) {
291        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
292    }
293    b->CreateCondBr(mIsFinal, finalWrite, updateProducedCount);
294    b->SetInsertPoint(finalWrite);
295    for (unsigned i = 0; i < mStreamCount; i++) {
296        //Value * pending = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
297        Value * pending = b->bitCast(pendingOutput[i]);
298        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), nextOutputBlk, pending);
299    }
300    b->CreateBr(updateProducedCount);
301    b->SetInsertPoint(updateProducedCount);
302     Value * produced = b->getProducedItemCount("compressedOutput");
303    produced = b->CreateAdd(produced, b->CreateMul(nextOutputBlk, bitBlockWidthConst));
304    produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, newPending), produced);
305    b->setProducedItemCount("compressedOutput", produced);
306}
307
308SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
309: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
310                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
311                  {}, {}, {}, {})
312, mStreamCount(streamCount)
313, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
314// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
315// to prevent rounding errors.
316, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
317, mPEXTWidth(PEXT_width)
318{
319    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
320        && "mDelCountFieldWidth must be a power of 2");
321    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
322    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
323
324    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
325    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);
326
327    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
328    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
329    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
330        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
331        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
332    }
333    addScalar(b->getSizeTy(), "pendingOffset");
334}
335
336void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
337    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
338    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
339    generateProcessingLoop(b, delMask, false);
340}
341
342void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
343    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
344    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
345    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
346    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
347    generateProcessingLoop(b, delMask, true);
348}
349
350/*
351What this function does in pseudo code:
352for (mSwizzleFactor)
353    create a swizzle set containing mSwizzleFactor blocks
354    apply PEXT to each block in the swizzle set
355    store the swizzleSet in PEXTedSwizzleSets vector
356
357for (each swizzle row i)
358    for (each swizzle set j)
359        processes row i in swizzle set j
360        store output in pendingData[j]
361*/
362
363void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {
364
365    // selectors marks the positions we want to keep
366    Value * const selectors = b->CreateNot(delMask);
367
368    const auto swizzleSets = makeSwizzleSets(b, selectors);
369
370    // Compress the PEXTedSwizzleSets
371    // Output is written and committed to the output buffer one swizzle at a time.
372    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
373    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
374    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
375    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
376    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);
377
378    // All output groups have the same count.
379    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
380    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
381    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
382    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
383
384    // There is a separate vector of pending data for each swizzle group.
385    std::vector<Value *> pendingData;
386    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
387        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
388    }
389
390    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);
391
392    // For each row i
393    for (unsigned i = 0; i < mSwizzleFactor; i++) {
394
395        // Generate code for each of the mSwizzleFactor fields making up a block.
396        // We load the count for the field and process all swizzle groups accordingly.
397        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
398        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
399        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
400        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);
401
402        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
403        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
404
405        // Data from the ith swizzle pack of each group is processed
406        // according to the same newItemCount, pendingSpace, ...
407        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
408            Value * const newItems = swizzleSets[j][i];
409            // Combine as many of the new items as possible into the pending group.
410            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
411            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
412            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
413            // To avoid an unpredictable branch, always store the combined group, whether full or not.
414            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
415            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
416            // Any items in excess of the space available in the current pending group overflow for the next group.
417            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
418            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
419            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
420        }
421
422        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
423        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);
424
425        outputProduced = b->CreateAdd(outputProduced, newItemCount);
426    }
427
428    if (flush) { // incase we selected the overflow group on the final iteration
429        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
430        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
431        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
432            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
433            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
434        }
435    } else {
436        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
437            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
438        }
439        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
440        b->setScalarField("pendingOffset", pendingOffset);
441        // unless this is our final stride, don't report partially written fields.
442        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
443    }
444
445    b->setProducedItemCount("outputSwizzle0", outputProduced);
446}
447
448/*
449Apply PEXT deletion to the blocks in strms and swizzle the result.
450
451Q: Why is it advantageous to swizzle the PEXTed streams?
452
453A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
454
455Example:
456Input:     11101101
457PEXT mask: 11110000
458Output:    00001110
459
460PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
461cleared. This has implications if we're working with multiple streams.
462
463For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
464(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
465Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
466Stream 2:   qrstuv00 wxy00000 z1000000 23456000
467Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
468Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
469
470If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
471should be compress by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
472time:
473
474The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
475
476Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
477Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
478Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
479Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
480
481Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
482compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
483
484Args:
485    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
486    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
487        sequentially to PEXT a full block.)
488
489Returns:
490    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
491*/
492
493std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {
494
495    Constant * pext = nullptr;
496    if (mPEXTWidth == 64) {
497        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
498    } else if (mPEXTWidth == 32) {
499        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
500    }
501
502    Value * const m = b->fwCast(mPEXTWidth, selectors);
503
504    std::vector<Value *> masks(mSwizzleFactor);
505    for (unsigned i = 0; i < mSwizzleFactor; i++) {
506        masks[i] = b->CreateExtractElement(m, i);
507
508    }
509
510    std::vector<std::vector<Value *>> swizzleSets;
511    swizzleSets.reserve(mSwizzleSetCount);
512
513    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
514
515    UndefValue * const outputInitializer = UndefValue::get(vecTy);
516
517    std::vector<Value *> input(mSwizzleFactor);
518    // For each of the k swizzle sets required to apply PEXT to all input streams
519    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
520
521        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
522            const unsigned k = (i * mSwizzleFactor) + j;
523            if (k < mStreamCount) {
524                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
525            } else {
526                input[j] = Constant::getNullValue(vecTy);
527            }
528        }
529
530        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
531        // then splat the pext mask and apply it to each output row
532
533        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
534        // For each of the input streams
535        for (unsigned j = 0; j < mSwizzleFactor; j++) {
536            for (unsigned k = 0; k < mSwizzleFactor; k++) {
537                // Load block j,k
538                Value * const field = b->CreateExtractElement(input[j], k);
539                // Apply PEXT deletion
540                Value * const selected = b->CreateCall(pext, {field, masks[k]});
541                // Then store it as our k,j-th output
542                output[k] = b->CreateInsertElement(output[k], selected, j);
543            }
544        }
545
546        swizzleSets.emplace_back(output);
547    }
548
549    return swizzleSets;
550}
551
552// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
553// Kernel inputs: stream_count data streams plus one del_mask stream
554// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
555
556void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
557    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
558    generateProcessingLoop(kb, delMask);
559}
560
561void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
562    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
563    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
564    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
565    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
566    generateProcessingLoop(kb, delMask);
567}
568
569void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
570    Constant * PEXT_func = nullptr;
571    if (mPEXTWidth == 64) {
572        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
573    } else if (mPEXTWidth == 32) {
574        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
575    }
576    std::vector<Value *> masks(mSwizzleFactor);
577    Value * const m = kb->fwCast(mSwizzleFactor, kb->simd_not(delMask));
578    for (unsigned i = 0; i < mSwizzleFactor; i++) {
579        masks.push_back(kb->CreateExtractElement(m, i));
580    }
581
582    for (unsigned i = 0; i < mStreamCount; ++i) {
583        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
584        Value * value = kb->fwCast(mPEXTWidth, input);
585        Value * output = UndefValue::get(value->getType());
586        for (unsigned j = 0; j < mSwizzleFactor; j++) {
587            Value * field = kb->CreateExtractElement(value, j);
588            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
589            output = kb->CreateInsertElement(output, compressed, j);
590        }
591        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
592    }
593    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
594    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
595}
596
597DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
598: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
599              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
600                  Binding{b->getStreamSetTy(), "delMaskSet"}},
601              {}, {}, {}, {})
602, mDelCountFieldWidth(fw)
603, mStreamCount(streamCount)
604, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
605, mPEXTWidth(PEXT_width) {
606    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
607    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
608}
609
610   
611//
612// This kernel performs final stream compression for a set of N bitstreams, given
613// (a) a set of bitstreams partially compressed within K-bit fields and stored
614//     in K-bit swizzled form, and
615// (b) a stream of deletion/extraction counts per K-bit stride.
616//
617// Restrictions:  At present, only K=64 is supported.
618//                At present, N must be an exact multiple of BLOCK_SIZE/K.
619//
620// The kernel always consumes full blocks of input and emits data into the output
621// buffer in swizzles of K items at a time.   Upon completion of a segment,
622// up to K-1 pending output items per stream may be stored in the kernel state.
623//
624// Note: that both input streams and output streams are stored in swizzled form.
625//
626
627SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
628: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
629                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
630, mBitStreamCount(bitStreamCount)
631    , mFieldWidth(fieldWidth)
632    , mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
633    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
634        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
635        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
636        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
637        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
638        addScalar(kb->getBitBlockType(), "pendingSwizzleData0");
639        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
640            mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
641            mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
642            addScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
643        }
644        addScalar(kb->getSizeTy(), "pendingOffset");
645}
646   
647void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
648       
649    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
650    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());
651   
652    // Output is written and committed to the output buffer one swizzle at a time.
653    //
654    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
655    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
656   
657    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
658    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
659    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
660
661    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
662    Value * pendingOffset = kb->getScalarField("pendingOffset");
663    // There is a separate vector of pending data for each swizzle group.
664    std::vector<Value *> pendingData;
665    std::vector<Value *> outputStreamPtr;
666    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
667        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
668        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
669    }
670   
671    // Generate code for each of the mSwizzleFactor fields making up a block.
672    // We load the count for the field and process all swizzle groups accordingly.
673    for (unsigned i = 0; i < mSwizzleFactor; i++) {
674        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
675        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
676        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);
677       
678        // Data from the ith swizzle pack of each group is processed
679        // according to the same newItemCount, pendingSpace, ...
680        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
681            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
682            // Combine as many of the new items as possible into the pending group.
683            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, kb->simd_fill(mFieldWidth, pendingOffset)));
684            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
685            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
686            // Any items in excess of the space available in the current pending group overflow for the next group.
687            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
688            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
689            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
690        }
691        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
692        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
693    }
694    kb->setScalarField("pendingOffset", pendingOffset);
695    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
696    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
697    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
698        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
699    }
700    kb->setProducedItemCount("outputSwizzle0", produced);
701}
702
703void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
704    CreateDoBlockMethodCall(kb);
705    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
706    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
707   
708    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
709    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
710    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
711    Value * pendingOffset = kb->getScalarField("pendingOffset");
712
713    // Write the pending data.
714    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
715        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
716        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
717        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
718    }
719    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
720}
721}
Note: See TracBrowser for help on using the repository browser.