source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6013

Last change on this file since 6013 was 6013, checked in by cameron, 18 months ago

StreamCompressKernel? initial check-in

File size: 37.7 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
// Build the log2(fw) movement masks needed to delete (compress away) the bit
// positions marked in del_mask within each fw-bit field, following the
// parallel-prefix "compress" mask-generation algorithm (cf. Hacker's Delight,
// section 7-1).  The returned vector holds one mask per step; mask i selects
// the kept bits that must move right by 2^i during that step.
inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
    Value * m = kb->simd_not(del_mask);          // bits to be kept
    Value * mk = kb->simd_slli(fw, del_mask, 1); // bits immediately left of a deleted bit
    std::vector<Value *> move_masks;
    for (unsigned shift = 1; shift < fw; shift *= 2) {
        // mp: parallel prefix (XOR scan) of mk within each fw-bit field.
        Value * mp = mk;
        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
        }
        // mv: the kept bits that move right by `shift` in this step.
        Value * mv = kb->simd_and(mp, m);
        // Update the kept-bit mask to reflect the movement just recorded.
        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
        mk = kb->simd_and(mk, kb->simd_not(mp));
        move_masks.push_back(mv);
    }
    return move_masks;
}
30
31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
32    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
33    for (unsigned i = 0; i < mv.size(); i++) {
34        unsigned shift = 1 << i;
35        Value * t = kb->simd_and(s, mv[i]);
36        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
37    }
38    return s;
39}
40
41// Apply deletion to a set of stream_count input streams to produce a set of output streams.
42// Kernel inputs: stream_count data streams plus one del_mask stream
43// Outputs: the deleted streams, plus a partial sum popcount
44
45void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
46    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
47    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
48    for (unsigned j = 0; j < mStreamCount; ++j) {
49        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
50        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
51        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
52    }
53    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
54    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
55}
56
57void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
58    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
59    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
60    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
61    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
62    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
63    for (unsigned j = 0; j < mStreamCount; ++j) {
64        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
65        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
66        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
67    }
68    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
69    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
70}
71
// Constructor for the block-oriented deletion kernel.  The kernel name encodes
// the field width and stream count so distinct configurations get distinct kernels.
// Inputs:  "inputStreamSet" - streamCount parallel bit streams to compress;
//          "delMaskSet"     - a single stream marking positions to delete.
// Outputs: "outputStreamSet" - the compressed streams;
//          "unitCounts"      - per-field popcounts of retained units
//                              (fixed rate, rounded up to a full bit block).
DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                      {}, {}, {})
, mDeletionFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
}
82
83void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
84    BasicBlock * entry = kb->GetInsertBlock();
85    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
86    BasicBlock * done = kb->CreateBasicBlock("done");
87    Constant * const ZERO = kb->getSize(0);
88    kb->CreateBr(processBlock);
89    kb->SetInsertPoint(processBlock);
90    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
91    blockOffsetPhi->addIncoming(ZERO, entry);
92    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
93    Value * delMask = kb->simd_not(extractionMask);
94    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
95    for (unsigned j = 0; j < mStreamCount; ++j) {
96        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
97        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
98        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
99    }
100    Value * unitCount = kb->simd_popcount(mCompressFieldWidth, extractionMask);
101    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), blockOffsetPhi, kb->bitCast(unitCount));
102    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
103    blockOffsetPhi->addIncoming(nextBlk, processBlock);
104    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
105    kb->CreateCondBr(moreToDo, processBlock, done);
106    kb->SetInsertPoint(done);
107}
108
// Constructor for the multi-block field-compression kernel.
// Inputs:  "inputStreamSet"  - streamCount parallel bit streams;
//          "extractionMask"  - a single stream whose 1-bits mark positions to keep.
// Outputs: "outputStreamSet" - the field-compressed streams;
//          "unitCounts"      - per-field popcounts of retained units
//                              (fixed rate, rounded up to a full bit block).
FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("fieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
                          Binding{kb->getStreamSetTy(), "extractionMask"}},
                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
                      {}, {}, {})
, mCompressFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
}
119
// Constructor for the stream-compression kernel, which packs field-compressed
// data (from unitCounts + sourceStreamSet) into fully compressed output.
// The output rate is bounded (0..1 per input item) since the amount produced
// depends on the data.
// Kernel state: "pendingItemCount" - items buffered but not yet written out;
// one "pendingOutputBlock_<i>" carry-over block per stream.
StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
: MultiBlockKernel("streamCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
                   {Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet"},
                       Binding{kb->getStreamSetTy(), "unitCounts"}},
                   {Binding{kb->getStreamSetTy(streamCount), "compressedOutput", BoundedRate(0, 1)}},
                   {}, {}, {})
, mCompressedFieldWidth(fieldWidth)
, mStreamCount(streamCount) {
    addScalar(kb->getSizeTy(), "pendingItemCount");
    for (unsigned i = 0; i < streamCount; i++) {
        addScalar(kb->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
    }
}
134   
135void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
136    const unsigned fw = mCompressedFieldWidth;
137    Type * fwTy = b->getIntNTy(fw);
138    Type * sizeTy = b->getSizeTy();
139    const unsigned numFields = b->getBitBlockWidth()/fw;
140    Constant * zeroSplat = Constant::getNullValue(b->fwVectorType(fw));
141    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
142    Constant * numFieldConst = ConstantInt::get(sizeTy, numFields);
143    Constant * fwMaskSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
144    Constant * bitBlockWidthConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
145    BasicBlock * entry = b->GetInsertBlock();
146    BasicBlock * segmentLoop = b->CreateBasicBlock("segmentLoop");
147    BasicBlock * segmentDone = b->CreateBasicBlock("segmentDone");
148    Constant * const ZERO = b->getSize(0);
149   
150    Value * pendingItemCount = b->getScalarField("pendingItemCount");
151    std::vector<Value *> pendingData(mStreamCount);
152    for (unsigned i = 0; i < mStreamCount; i++) {
153        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
154    }
155   
156    b->CreateBr(segmentLoop);
157    // Main Loop
158    b->SetInsertPoint(segmentLoop);
159    PHINode * blockOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
160    PHINode * outputBlockPhi = b->CreatePHI(b->getSizeTy(), 2);
161    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
162    PHINode * pendingDataPhi[mStreamCount];
163    blockOffsetPhi->addIncoming(ZERO, entry);
164    outputBlockPhi->addIncoming(ZERO, entry);
165    pendingItemsPhi->addIncoming(pendingItemCount, entry);
166    for (unsigned i = 0; i < mStreamCount; i++) {
167        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
168        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
169    }
170    Value * fieldPopCounts = b->loadInputStreamBlock("unitCounts", ZERO, blockOffsetPhi);
171    // For each field determine the (partial) sum popcount of all fields up to and
172    // including the current field.
173    Value * partialSum = fieldPopCounts;
174    for (unsigned i = 1; i < numFields; i *= 2) {
175        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
176    }
177    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
178    //
179    // Now determine for each source field the output offset of the first bit.
180    // Note that this depends on the number of pending bits.
181    //
182    Value * pendingOffset = b->CreateURem(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
183    Value * splatPending = b->simd_fill(fw, b->CreateZExtOrTrunc(pendingOffset, fwTy));
184    Value * pendingFieldIdx = b->CreateUDiv(pendingItemsPhi, ConstantInt::get(sizeTy, fw));
185    Value * offsets = b->simd_add(fw, b->mvmd_slli(fw, partialSum, 1), splatPending);
186    offsets = b->simd_and(offsets, fwMaskSplat); // parallel URem fw
187   //
188    // Determine the relative field number for each output field.   Note that the total
189    // number of fields involved is numFields + 1.   However, the first field always
190    // be immediately combined into the current pending data field, so we calculate
191    // field numbers for all subsequent fields, (the fields that receive overflow bits).
192    Value * fieldNo = b->simd_srli(fw, b->simd_add(fw, partialSum, splatPending), std::log2(fw));
193  //
194    // Now process the input data block of each stream in the input stream set.
195    //
196    // First load all the stream set blocks and the pending data.
197    std::vector<Value *> sourceBlock(mStreamCount);
198    for (unsigned i = 0; i < mStreamCount; i++) {
199        sourceBlock[i] = b->loadInputStreamBlock("sourceStreamSet", b->getInt32(i), blockOffsetPhi);
200
201    }
202    // Now separate the bits of each field into ones that go into the current field
203    // and ones that go into the overflow field.   Extract the first field separately,
204    // and then shift and combine subsequent fields.
205    std::vector<Value *> pendingOutput(mStreamCount);
206    std::vector<Value *> outputFields(mStreamCount);
207    for (unsigned i = 0; i < mStreamCount; i++) {
208        Value * currentFieldBits = b->simd_sllv(fw, sourceBlock[i], offsets);
209        Value * nextFieldBits = b->simd_srlv(fw, sourceBlock[i], b->simd_sub(fw, fwSplat, offsets));
210        Value * firstField = b->mvmd_extract(fw, currentFieldBits, 0);
211        Value * vec1 = b->CreateInsertElement(zeroSplat, firstField, pendingFieldIdx);
212        pendingOutput[i] = b->simd_or(pendingDataPhi[i], vec1);
213        // shift back currentFieldBits to combine with nextFieldBits.
214        outputFields[i] = b->simd_or(b->mvmd_srli(fw, currentFieldBits, 1), nextFieldBits);
215    }
216    // We may have filled the current field of the pendingOutput update the
217    // pendingFieldIndex.
218    Value * newPendingTotal = b->CreateZExtOrTrunc(b->mvmd_extract(fw, fieldPopCounts, 0), sizeTy);
219    pendingFieldIdx = b->CreateUDiv(newPendingTotal, ConstantInt::get(sizeTy, fw));
220    // Now combine forward all fields with the same field number.  This may require
221    // up to log2 numFields steps.
222    for (unsigned j = 1; j < numFields; j*=2) {
223        Value * select = b->simd_eq(fw, fieldNo, b->mvmd_slli(fw, fieldNo, j));
224        for (unsigned i = 0; i < mStreamCount; i++) {
225            Value * fields_fwd = b->mvmd_slli(fw, outputFields[i], j);
226            outputFields[i] = b->simd_or(outputFields[i], b->simd_and(select, fields_fwd));
227        }
228    }
229    // Now compress the data fields, eliminating all but the last field from
230    // each run of consecutive field having the same field number.
231    // same field number as a subsequent field.
232    Value * eqNext = b->simd_eq(fw, fieldNo, b->mvmd_srli(fw, fieldNo, 1));
233    Value * compressMask = b->hsimd_signmask(fw, b->simd_not(eqNext));
234    for (unsigned i = 0; i < mStreamCount; i++) {
235        outputFields[i] = b->mvmd_compress(fw, outputFields[i], compressMask);
236   }
237    //
238    // Finally combine the pendingOutput and outputField data.
239    // (a) shift forward outputField data to fill the pendingOutput values.
240    // (b) shift back outputField data to clear data added to pendingOutput.
241    Value * shftBack = b->CreateSub(numFieldConst, pendingFieldIdx);
242    for (unsigned i = 0; i < mStreamCount; i++) {
243        Value * outputFwd = b->mvmd_sll(fw, outputFields[i], pendingFieldIdx);
244        pendingOutput[i] = b->simd_or(pendingOutput[i], outputFwd);
245        outputFields[i] = b->mvmd_srl(fw, outputFields[i], shftBack);
246  }
247    //
248    // Write the pendingOutput data to outputStream.
249    // Note: this data may be overwritten later, but we avoid branching.
250    for (unsigned i = 0; i < mStreamCount; i++) {
251        b->storeOutputStreamBlock("compressedOutput", b->getInt32(i), outputBlockPhi, pendingOutput[i]);
252    }
253    // Now determine the total amount of pending items and whether
254    // the pending data all fits within the pendingOutput.
255    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
256    Value * doesFit = b->CreateICmpULT(newPending, bitBlockWidthConst);
257    newPending = b->CreateSelect(doesFit, newPending, b->CreateSub(newPending, bitBlockWidthConst));
258    //
259    // Prepare Phi nodes for the next iteration.
260    //
261    Value * nextBlk = b->CreateAdd(blockOffsetPhi, b->getSize(1));
262    blockOffsetPhi->addIncoming(nextBlk, segmentLoop);
263    Value * nextOutputBlk = b->CreateAdd(outputBlockPhi, b->getSize(1));
264    // But don't advance the output if all the data does fit into pendingOutput.
265    nextOutputBlk = b->CreateSelect(doesFit, outputBlockPhi, nextOutputBlk);
266    outputBlockPhi->addIncoming(nextOutputBlk, segmentLoop);
267    pendingItemsPhi->addIncoming(newPending, segmentLoop);
268
269    for (unsigned i = 0; i < mStreamCount; i++) {
270        pendingOutput[i] = b->CreateSelect(doesFit, b->fwCast(fw, pendingOutput[i]), b->fwCast(fw, outputFields[i]));
271        pendingDataPhi[i]->addIncoming(b->bitCast(pendingOutput[i]), segmentLoop);
272    }
273    //
274    // Now continue the loop if there are more blocks to process.
275    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
276    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);
277   
278    b->SetInsertPoint(segmentDone);
279    // Save kernel state.
280    b->setScalarField("pendingItemCount", newPending);
281    for (unsigned i = 0; i < mStreamCount; i++) {
282        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
283    }
284    Value * produced = b->getProducedItemCount("compressedOutput");
285    produced = b->CreateAdd(produced, b->CreateMul(nextOutputBlk, bitBlockWidthConst));
286    produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, blockPopCount), produced);
287    b->setProducedItemCount("compressedOutput", produced);
288}
289   
290   
291   
292
293SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
294: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
295                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
296                  {}, {}, {}, {})
297, mStreamCount(streamCount)
298, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
299// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
300// to prevent rounding errors.
301, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
302, mPEXTWidth(PEXT_width)
303{
304    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
305        && "mDelCountFieldWidth must be a power of 2");
306    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
307    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
308
309    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
310    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);
311
312    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
313    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
314    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
315        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
316        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
317    }
318    addScalar(b->getSizeTy(), "pendingOffset");
319}
320
321void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
322    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
323    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
324    generateProcessingLoop(b, delMask, false);
325}
326
327void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
328    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
329    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
330    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
331    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
332    generateProcessingLoop(b, delMask, true);
333}
334
335/*
336What this function does in pseudo code:
337for (mSwizzleFactor)
338    create a swizzle set containing mSwizzleFactor blocks
339    apply PEXT to each block in the swizzle set
340    store the swizzleSet in PEXTedSwizzleSets vector
341
342for (each swizzle row i)
343    for (each swizzle set j)
344        processes row i in swizzle set j
345        store output in pendingData[j]
346*/
347
// Compress the swizzled, PEXTed data produced by makeSwizzleSets into the
// output swizzle streams, one PEXT-width field at a time, carrying partially
// filled fields in the pendingSwizzleData scalars between strides.
// When `flush` is true (final block), the pending data is force-written and
// the full produced count is reported; otherwise only whole fields are
// reported and the remainder is saved in kernel state.
void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {

    // selectors marks the positions we want to keep
    Value * const selectors = b->CreateNot(delMask);

    const auto swizzleSets = makeSwizzleSets(b, selectors);

    // Compress the PEXTedSwizzleSets
    // Output is written and committed to the output buffer one swizzle at a time.
    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);

    // All output groups have the same count.
    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
    // outputIndex counts whole PEXT-width fields within the current block.
    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);

    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
    }

    // Per-field popcount of the keep mask: items produced by each field.
    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);

    // For each row i
    for (unsigned i = 0; i < mSwizzleFactor; i++) {

        // Generate code for each of the mSwizzleFactor fields making up a block.
        // We load the count for the field and process all swizzle groups accordingly.
        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);

        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);

        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * const newItems = swizzleSets[j][i];
            // Combine as many of the new items as possible into the pending group.
            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
            // To avoid an unpredictable branch, always store the combined group, whether full or not.
            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }

        // Advance to the next output field only when the current one was filled.
        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);

        outputProduced = b->CreateAdd(outputProduced, newItemCount);
    }

    if (flush) { // incase we selected the overflow group on the final iteration
        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
        }
    } else {
        // Save the pending data and the intra-field offset in kernel state.
        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
        }
        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
        b->setScalarField("pendingOffset", pendingOffset);
        // unless this is our final stride, don't report partially written fields.
        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
    }

    b->setProducedItemCount("outputSwizzle0", outputProduced);
}
432
433/*
434Apply PEXT deletion to the blocks in strms and swizzle the result.
435
436Q: Why is it advantageous to swizzle the PEXTed streams?
437
438A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
439
440Example:
441Input:     11101101
442PEXT mask: 11110000
443Output:    00001110
444
445PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
446cleared. This has implications if we're working with multiple streams.
447
448For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
449(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
450Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
451Stream 2:   qrstuv00 wxy00000 z1000000 23456000
452Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
453Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
454
455If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
should be compressed by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
457time:
458
459The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
460
461Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
462Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
463Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
464Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
465
466Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
467compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
468
469Args:
470    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
471    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
472        sequentially to PEXT a full block.)
473
474Returns:
475    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
476*/
477
// Apply PEXT deletion to every input stream block and return the results in
// swizzled form: swizzleSets[i][k] holds, in its j-th element, the PEXTed
// k-th field of input stream (i * mSwizzleFactor + j).  See the commentary
// above for why swizzling makes subsequent compression easier.
// `selectors` is the keep mask (1-bits are retained by PEXT).
std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {

    // Note: pext stays null if mPEXTWidth is neither 64 nor 32; the
    // constructor asserts that only these two widths are accepted.
    Constant * pext = nullptr;
    if (mPEXTWidth == 64) {
        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
    } else if (mPEXTWidth == 32) {
        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
    }

    // View the selector block as mSwizzleFactor fields of mPEXTWidth bits and
    // extract one PEXT mask per field.
    Value * const m = b->fwCast(mPEXTWidth, selectors);

    std::vector<Value *> masks(mSwizzleFactor);
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        masks[i] = b->CreateExtractElement(m, i);
    }

    std::vector<std::vector<Value *>> swizzleSets;
    swizzleSets.reserve(mSwizzleSetCount);

    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);

    UndefValue * const outputInitializer = UndefValue::get(vecTy);

    std::vector<Value *> input(mSwizzleFactor);
    // For each of the k swizzle sets required to apply PEXT to all input streams
    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {

        // Load the next mSwizzleFactor stream blocks; pad with zero blocks
        // when the stream count is not a multiple of the swizzle factor.
        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
            const unsigned k = (i * mSwizzleFactor) + j;
            if (k < mStreamCount) {
                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
            } else {
                input[j] = Constant::getNullValue(vecTy);
            }
        }

        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
        // then splat the pext mask and apply it to each output row

        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
        // For each of the input streams
        for (unsigned j = 0; j < mSwizzleFactor; j++) {
            for (unsigned k = 0; k < mSwizzleFactor; k++) {
                // Load block j,k
                Value * const field = b->CreateExtractElement(input[j], k);
                // Apply PEXT deletion
                Value * const selected = b->CreateCall(pext, {field, masks[k]});
                // Then store it as our k,j-th output (the transpose is the swizzle)
                output[k] = b->CreateInsertElement(output[k], selected, j);
            }
        }

        swizzleSets.emplace_back(output);
    }

    return swizzleSets;
}
536
537// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
538// Kernel inputs: stream_count data streams plus one del_mask stream
539// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
540
541void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
542    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
543    generateProcessingLoop(kb, delMask);
544}
545
546void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
547    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
548    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
549    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
550    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
551    generateProcessingLoop(kb, delMask);
552}
553
554void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
555    Constant * PEXT_func = nullptr;
556    if (mPEXTWidth == 64) {
557        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
558    } else if (mPEXTWidth == 32) {
559        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
560    }
561    std::vector<Value *> masks(mSwizzleFactor);
562    Value * const m = kb->fwCast(mSwizzleFactor, kb->simd_not(delMask));
563    for (unsigned i = 0; i < mSwizzleFactor; i++) {
564        masks.push_back(kb->CreateExtractElement(m, i));
565    }
566
567    for (unsigned i = 0; i < mStreamCount; ++i) {
568        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
569        Value * value = kb->fwCast(mPEXTWidth, input);
570        Value * output = UndefValue::get(value->getType());
571        for (unsigned j = 0; j < mSwizzleFactor; j++) {
572            Value * field = kb->CreateExtractElement(value, j);
573            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
574            output = kb->CreateInsertElement(output, compressed, j);
575        }
576        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
577    }
578    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
579    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
580}
581
582DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
583: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
584              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
585                  Binding{b->getStreamSetTy(), "delMaskSet"}},
586              {}, {}, {}, {})
587, mDelCountFieldWidth(fw)
588, mStreamCount(streamCount)
589, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
590, mPEXTWidth(PEXT_width) {
591    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
592    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
593}
594
595   
596//
597// This kernel performs final stream compression for a set of N bitstreams, given
598// (a) a set of bitstreams partially compressed within K-bit fields and stored
599//     in K-bit swizzled form, and
600// (b) a stream of deletion/extraction counts per K-bit stride.
601//
602// Restrictions:  At present, only K=64 is supported.
603//                At present, N must be an exact multiple of BLOCK_SIZE/K.
604//
605// The kernel always consumes full blocks of input and emits data into the output
606// buffer in swizzles of K items at a time.   Upon completion of a segment,
607// up to K-1 pending output items per stream may be stored in the kernel state.
608//
609// Note that both input streams and output streams are stored in swizzled form.
610//
611
612SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
613: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
614                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
615, mBitStreamCount(bitStreamCount)
616    , mFieldWidth(fieldWidth)
617    , mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
618    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
619        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
620        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
621        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
622        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
623        addScalar(kb->getBitBlockType(), "pendingSwizzleData0");
624        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
625            mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
626            mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
627            addScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
628        }
629        addScalar(kb->getSizeTy(), "pendingOffset");
630}
631   
// Process one full block of swizzled, partially-compressed input.  For each of the
// mSwizzleFactor fields in the stride, "countsPerStride" gives the number of items
// to be kept from that field (assumed <= mFieldWidth).  Items are packed end-to-end
// into the pending swizzle of each group; whenever a pending swizzle fills, it is
// committed to the output buffer and the overflow becomes the new pending data.
void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
       
    // View the counts block as an array of mFieldWidth-bit integers, one per field.
    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());
   
    // Output is written and committed to the output buffer one swizzle at a time.
    //
    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
   
    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    // Position within the current output block, and the swizzle index within it.
    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);

    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
    Value * pendingOffset = kb->getScalarField("pendingOffset");
    // There is a separate vector of pending data for each swizzle group.
    std::vector<Value *> pendingData;
    std::vector<Value *> outputStreamPtr;
    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
    }
   
    // Generate code for each of the mSwizzleFactor fields making up a block.
    // We load the count for the field and process all swizzle groups accordingly.
    for (unsigned i = 0; i < mSwizzleFactor; i++) {
        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
        // Room remaining in the pending swizzle before it must be committed.
        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);
       
        // Data from the ith swizzle pack of each group is processed
        // according to the same newItemCount, pendingSpace, ...
        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
            // Combine as many of the new items as possible into the pending group.
            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, kb->simd_fill(mFieldWidth, pendingOffset)));
            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
            // Any items in excess of the space available in the current pending group overflow for the next group.
            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
        }
        // Advance the output index only when the pending swizzle was filled (branch-free).
        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
        // New pending offset, modulo mFieldWidth (assumes newItemCount <= mFieldWidth).
        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
    }
    // Persist the carry state and commit the produced item count (pending items excluded).
    kb->setScalarField("pendingOffset", pendingOffset);
    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    }
    kb->setProducedItemCount("outputSwizzle0", produced);
}
687
688void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
689    CreateDoBlockMethodCall(kb);
690    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
691    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
692   
693    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
694    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
695    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
696    Value * pendingOffset = kb->getScalarField("pendingOffset");
697
698    // Write the pending data.
699    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
700        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
701        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
702        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
703    }
704    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
705}
706}
Note: See TracBrowser for help on using the repository browser.