source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6006

Last change on this file since 6006 was 6006, checked in by cameron, 13 months ago

Multiblock field compress kernel, used in u8u16

File size: 28.4 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask) {
15    Value * m = kb->simd_not(del_mask);
16    Value * mk = kb->simd_slli(fw, del_mask, 1);
17    std::vector<Value *> move_masks;
18    for (unsigned shift = 1; shift < fw; shift *= 2) {
19        Value * mp = mk;
20        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
21            mp = kb->simd_xor(mp, kb->simd_slli(fw, mp, lookright));
22        }
23        Value * mv = kb->simd_and(mp, m);
24        m = kb->simd_or(kb->simd_xor(m, mv), kb->simd_srli(fw, mv, shift));
25        mk = kb->simd_and(mk, kb->simd_not(mp));
26        move_masks.push_back(mv);
27    }
28    return move_masks;
29}
30
31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & kb, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
32    Value * s = kb->simd_and(strm, kb->simd_not(del_mask));
33    for (unsigned i = 0; i < mv.size(); i++) {
34        unsigned shift = 1 << i;
35        Value * t = kb->simd_and(s, mv[i]);
36        s = kb->simd_or(kb->simd_xor(s, t), kb->simd_srli(fw, t, shift));
37    }
38    return s;
39}
40
41// Apply deletion to a set of stream_count input streams to produce a set of output streams.
42// Kernel inputs: stream_count data streams plus one del_mask stream
43// Outputs: the deleted streams, plus a partial sum popcount
44
45void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
46    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
47    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
48    for (unsigned j = 0; j < mStreamCount; ++j) {
49        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
50        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
51        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
52    }
53    Value * unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
54    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
55}
56
57void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * remainingBytes) {
58    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
59    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
60    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
61    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
62    const auto move_masks = parallel_prefix_deletion_masks(kb, mDeletionFieldWidth, delMask);
63    for (unsigned j = 0; j < mStreamCount; ++j) {
64        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j));
65        Value * output = apply_parallel_prefix_deletion(kb, mDeletionFieldWidth, delMask, move_masks, input);
66        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), output);
67    }
68    Value * const unitCount = kb->simd_popcount(mDeletionFieldWidth, kb->simd_not(delMask));
69    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), kb->bitCast(unitCount));
70}
71
72DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
73: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
74                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
75                          Binding{kb->getStreamSetTy(), "delMaskSet"}},
76                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
77                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
78                      {}, {}, {})
79, mDeletionFieldWidth(fieldWidth)
80, mStreamCount(streamCount) {
81}
82
83void FieldCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
84    BasicBlock * entry = kb->GetInsertBlock();
85    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
86    BasicBlock * done = kb->CreateBasicBlock("done");
87    Constant * const ZERO = kb->getSize(0);
88    kb->CreateBr(processBlock);
89    kb->SetInsertPoint(processBlock);
90    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
91    blockOffsetPhi->addIncoming(ZERO, entry);
92    Value * extractionMask = kb->loadInputStreamBlock("extractionMask", ZERO, blockOffsetPhi);
93    Value * delMask = kb->simd_not(extractionMask);
94    const auto move_masks = parallel_prefix_deletion_masks(kb, mCompressFieldWidth, delMask);
95    for (unsigned j = 0; j < mStreamCount; ++j) {
96        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
97        Value * output = apply_parallel_prefix_deletion(kb, mCompressFieldWidth, delMask, move_masks, input);
98        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
99    }
100    Value * unitCount = kb->simd_popcount(mCompressFieldWidth, extractionMask);
101    kb->storeOutputStreamBlock("unitCounts", kb->getInt32(0), blockOffsetPhi, kb->bitCast(unitCount));
102    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
103    blockOffsetPhi->addIncoming(nextBlk, processBlock);
104    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
105    kb->CreateCondBr(moreToDo, processBlock, done);
106    kb->SetInsertPoint(done);
107}
108
109FieldCompressKernel::FieldCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
110: MultiBlockKernel("fieldCompress" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
111                      {Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"},
112                          Binding{kb->getStreamSetTy(), "extractionMask"}},
113                      {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"},
114                          Binding{kb->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(kb->getBitBlockWidth())}},
115                      {}, {}, {})
116, mCompressFieldWidth(fieldWidth)
117, mStreamCount(streamCount) {
118}
119
120SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
121: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
122                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
123                  {}, {}, {}, {})
124, mStreamCount(streamCount)
125, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
126// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
127// to prevent rounding errors.
128, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
129, mPEXTWidth(PEXT_width)
130{
131    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
132        && "mDelCountFieldWidth must be a power of 2");
133    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
134    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
135
136    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
137    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);
138
139    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
140    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
141    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
142        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
143        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
144    }
145    addScalar(b->getSizeTy(), "pendingOffset");
146}
147
148void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
149    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
150    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
151    generateProcessingLoop(b, delMask, false);
152}
153
154void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
155    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
156    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
157    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
158    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
159    generateProcessingLoop(b, delMask, true);
160}
161
162/*
163What this function does in pseudo code:
164for (mSwizzleFactor)
165    create a swizzle set containing mSwizzleFactor blocks
166    apply PEXT to each block in the swizzle set
167    store the swizzleSet in PEXTedSwizzleSets vector
168
169for (each swizzle row i)
170    for (each swizzle set j)
171        processes row i in swizzle set j
172        store output in pendingData[j]
173*/
174
175void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {
176
177    // selectors marks the positions we want to keep
178    Value * const selectors = b->CreateNot(delMask);
179
180    const auto swizzleSets = makeSwizzleSets(b, selectors);
181
182    // Compress the PEXTedSwizzleSets
183    // Output is written and committed to the output buffer one swizzle at a time.
184    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
185    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
186    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
187    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
188    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);
189
190    // All output groups have the same count.
191    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
192    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
193    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
194    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
195
196    // There is a separate vector of pending data for each swizzle group.
197    std::vector<Value *> pendingData;
198    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
199        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
200    }
201
202    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);
203
204    // For each row i
205    for (unsigned i = 0; i < mSwizzleFactor; i++) {
206
207        // Generate code for each of the mSwizzleFactor fields making up a block.
208        // We load the count for the field and process all swizzle groups accordingly.
209        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
210        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
211        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
212        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);
213
214        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
215        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
216
217        // Data from the ith swizzle pack of each group is processed
218        // according to the same newItemCount, pendingSpace, ...
219        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
220            Value * const newItems = swizzleSets[j][i];
221            // Combine as many of the new items as possible into the pending group.
222            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
223            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
224            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
225            // To avoid an unpredictable branch, always store the combined group, whether full or not.
226            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
227            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
228            // Any items in excess of the space available in the current pending group overflow for the next group.
229            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
230            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
231            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
232        }
233
234        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
235        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);
236
237        outputProduced = b->CreateAdd(outputProduced, newItemCount);
238    }
239
240    if (flush) { // incase we selected the overflow group on the final iteration
241        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
242        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
243        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
244            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
245            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
246        }
247    } else {
248        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
249            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
250        }
251        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
252        b->setScalarField("pendingOffset", pendingOffset);
253        // unless this is our final stride, don't report partially written fields.
254        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
255    }
256
257    b->setProducedItemCount("outputSwizzle0", outputProduced);
258}
259
260/*
261Apply PEXT deletion to the blocks in strms and swizzle the result.
262
263Q: Why is it advantageous to swizzle the PEXTed streams?
264
265A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
266
267Example:
268Input:     11101101
269PEXT mask: 11110000
270Output:    00001110
271
272PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
273cleared. This has implications if we're working with multiple streams.
274
275For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
276(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
277Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
278Stream 2:   qrstuv00 wxy00000 z1000000 23456000
279Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
280Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
281
282If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
283should be compress by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
284time:
285
286The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
287
288Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
289Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
290Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
291Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
292
293Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
294compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
295
296Args:
297    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
298    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
299        sequentially to PEXT a full block.)
300
301Returns:
302    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
303*/
304
305std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {
306
307    Constant * pext = nullptr;
308    if (mPEXTWidth == 64) {
309        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
310    } else if (mPEXTWidth == 32) {
311        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
312    }
313
314    Value * const m = b->fwCast(mPEXTWidth, selectors);
315
316    std::vector<Value *> masks(mSwizzleFactor);
317    for (unsigned i = 0; i < mSwizzleFactor; i++) {
318        masks[i] = b->CreateExtractElement(m, i);
319
320    }
321
322    std::vector<std::vector<Value *>> swizzleSets;
323    swizzleSets.reserve(mSwizzleSetCount);
324
325    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
326
327    UndefValue * const outputInitializer = UndefValue::get(vecTy);
328
329    std::vector<Value *> input(mSwizzleFactor);
330    // For each of the k swizzle sets required to apply PEXT to all input streams
331    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
332
333        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
334            const unsigned k = (i * mSwizzleFactor) + j;
335            if (k < mStreamCount) {
336                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
337            } else {
338                input[j] = Constant::getNullValue(vecTy);
339            }
340        }
341
342        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
343        // then splat the pext mask and apply it to each output row
344
345        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
346        // For each of the input streams
347        for (unsigned j = 0; j < mSwizzleFactor; j++) {
348            for (unsigned k = 0; k < mSwizzleFactor; k++) {
349                // Load block j,k
350                Value * const field = b->CreateExtractElement(input[j], k);
351                // Apply PEXT deletion
352                Value * const selected = b->CreateCall(pext, {field, masks[k]});
353                // Then store it as our k,j-th output
354                output[k] = b->CreateInsertElement(output[k], selected, j);
355            }
356        }
357
358        swizzleSets.emplace_back(output);
359    }
360
361    return swizzleSets;
362}
363
364// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
365// Kernel inputs: stream_count data streams plus one del_mask stream
366// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
367
368void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
369    Value * delMask = kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0));
370    generateProcessingLoop(kb, delMask);
371}
372
373void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &kb, Value * remainingBytes) {
374    IntegerType * vecTy = kb->getIntNTy(kb->getBitBlockWidth());
375    Value * remaining = kb->CreateZExt(remainingBytes, vecTy);
376    Value * EOF_del = kb->bitCast(kb->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
377    Value * delMask = kb->CreateOr(EOF_del, kb->loadInputStreamBlock("delMaskSet", kb->getInt32(0)));
378    generateProcessingLoop(kb, delMask);
379}
380
381void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & kb, Value * delMask) {
382    Constant * PEXT_func = nullptr;
383    if (mPEXTWidth == 64) {
384        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_64);
385    } else if (mPEXTWidth == 32) {
386        PEXT_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pext_32);
387    }
388    std::vector<Value *> masks(mSwizzleFactor);
389    Value * const m = kb->fwCast(mSwizzleFactor, kb->simd_not(delMask));
390    for (unsigned i = 0; i < mSwizzleFactor; i++) {
391        masks.push_back(kb->CreateExtractElement(m, i));
392    }
393
394    for (unsigned i = 0; i < mStreamCount; ++i) {
395        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(i));
396        Value * value = kb->fwCast(mPEXTWidth, input);
397        Value * output = UndefValue::get(value->getType());
398        for (unsigned j = 0; j < mSwizzleFactor; j++) {
399            Value * field = kb->CreateExtractElement(value, j);
400            Value * compressed = kb->CreateCall(PEXT_func, {field, masks[j]});
401            output = kb->CreateInsertElement(output, compressed, j);
402        }
403        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(i), output);
404    }
405    Value * delCount = kb->simd_popcount(mDelCountFieldWidth, kb->simd_not(delMask));
406    kb->storeOutputStreamBlock("deletionCounts", kb->getInt32(0), kb->bitCast(delCount));
407}
408
409DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
410: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
411              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
412                  Binding{b->getStreamSetTy(), "delMaskSet"}},
413              {}, {}, {}, {})
414, mDelCountFieldWidth(fw)
415, mStreamCount(streamCount)
416, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
417, mPEXTWidth(PEXT_width) {
418    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
419    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
420}
421
422   
423//
424// This kernel performs final stream compression for a set of N bitstreams, given
425// (a) a set of bitstreams partially compressed within K-bit fields and stored
426//     in K-bit swizzled form, and
427// (b) a stream of deletion/extraction counts per K-bit stride.
428//
429// Restrictions:  At present, only K=64 is supported.
430//                At present, N must be an exact multiple of BLOCK_SIZE/K.
431//
432// The kernel always consumes full blocks of input and emits data into the output
433// buffer in swizzles of K items at a time.   Upon completion of a segment,
434// up to K-1 pending output items per stream may be stored in the kernel state.
435//
436// Note: that both input streams and output streams are stored in swizzled form.
437//
438
439SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned bitStreamCount, unsigned fieldWidth)
440: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
441                     {Binding{kb->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
442, mBitStreamCount(bitStreamCount)
443    , mFieldWidth(fieldWidth)
444    , mSwizzleFactor(kb->getBitBlockWidth() / fieldWidth)
445    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
446        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
447        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
448        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
449        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
450        addScalar(kb->getBitBlockType(), "pendingSwizzleData0");
451        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
452            mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
453            mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
454            addScalar(kb->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
455        }
456        addScalar(kb->getSizeTy(), "pendingOffset");
457}
458   
459void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
460       
461    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
462    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());
463   
464    // Output is written and committed to the output buffer one swizzle at a time.
465    //
466    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
467    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
468   
469    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
470    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
471    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
472
473    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
474    Value * pendingOffset = kb->getScalarField("pendingOffset");
475    // There is a separate vector of pending data for each swizzle group.
476    std::vector<Value *> pendingData;
477    std::vector<Value *> outputStreamPtr;
478    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
479        pendingData.push_back(kb->getScalarField("pendingSwizzleData" + std::to_string(i)));
480        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
481    }
482   
483    // Generate code for each of the mSwizzleFactor fields making up a block.
484    // We load the count for the field and process all swizzle groups accordingly.
485    for (unsigned i = 0; i < mSwizzleFactor; i++) {
486        Value * newItemCount = kb->CreateLoad(kb->CreateGEP(countStreamPtr, kb->getInt32(i)));
487        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
488        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);
489       
490        // Data from the ith swizzle pack of each group is processed
491        // according to the same newItemCount, pendingSpace, ...
492        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
493            Value * newItems = kb->loadInputStreamBlock("inputSwizzle" + std::to_string(j), kb->getInt32(i));
494            // Combine as many of the new items as possible into the pending group.
495            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, kb->simd_fill(mFieldWidth, pendingOffset)));
496            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
497            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
498            // Any items in excess of the space available in the current pending group overflow for the next group.
499            Value * overFlowGroup = kb->CreateLShr(newItems, kb->simd_fill(mFieldWidth, pendingSpace));
500            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
501            pendingData[j] = kb->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
502        }
503        outputIndex = kb->CreateSelect(pendingSpaceFilled, kb->CreateAdd(outputIndex, kb->getSize(1)), outputIndex);
504        pendingOffset = kb->CreateAnd(kb->CreateAdd(newItemCount, pendingOffset), kb->getSize(mFieldWidth-1));
505    }
506    kb->setScalarField("pendingOffset", pendingOffset);
507    Value * newlyProduced = kb->CreateSub(kb->CreateShl(outputIndex, outputIndexShift), producedOffset);
508    Value * produced = kb->CreateAdd(outputProduced, newlyProduced);
509    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
510        kb->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
511    }
512    kb->setProducedItemCount("outputSwizzle0", produced);
513}
514
515void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & kb, Value * /* remainingBytes */) {
516    CreateDoBlockMethodCall(kb);
517    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
518    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
519   
520    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
521    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
522    Value * outputIndex = kb->CreateLShr(producedOffset, outputIndexShift);
523    Value * pendingOffset = kb->getScalarField("pendingOffset");
524
525    // Write the pending data.
526    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
527        Value * pendingData = kb->getScalarField("pendingSwizzleData" + std::to_string(i));
528        Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0));
529        kb->CreateBlockAlignedStore(pendingData, kb->CreateGEP(outputStreamPtr, outputIndex));
530    }
531    kb->setProducedItemCount("outputSwizzle0", kb->CreateAdd(pendingOffset, outputProduced));
532}
533}
Note: See TracBrowser for help on using the repository browser.