source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 5985

Last change on this file since 5985 was 5985, checked in by nmedfort, 13 months ago

Restructured MultiBlock? kernel. Removal of Swizzled buffers. Inclusion of PopCount? rates / non-linear access. Modifications to several kernels to better align them with the kernel and pipeline changes.

File size: 27.4 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask) {
15    Value * m = iBuilder->simd_not(del_mask);
16    Value * mk = iBuilder->simd_slli(fw, del_mask, 1);
17    std::vector<Value *> move_masks;
18    for (unsigned shift = 1; shift < fw; shift *= 2) {
19        Value * mp = mk;
20        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
21            mp = iBuilder->simd_xor(mp, iBuilder->simd_slli(fw, mp, lookright));
22        }
23        Value * mv = iBuilder->simd_and(mp, m);
24        m = iBuilder->simd_or(iBuilder->simd_xor(m, mv), iBuilder->simd_srli(fw, mv, shift));
25        mk = iBuilder->simd_and(mk, iBuilder->simd_not(mp));
26        move_masks.push_back(mv);
27    }
28    return move_masks;
29}
30
31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
32    Value * s = iBuilder->simd_and(strm, iBuilder->simd_not(del_mask));
33    for (unsigned i = 0; i < mv.size(); i++) {
34        unsigned shift = 1 << i;
35        Value * t = iBuilder->simd_and(s, mv[i]);
36        s = iBuilder->simd_or(iBuilder->simd_xor(s, t), iBuilder->simd_srli(fw, t, shift));
37    }
38    return s;
39}
40
41inline Value * partial_sum_popcount(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * mask) {
42    Value * field = iBuilder->simd_popcount(fw, mask);
43    const auto count = iBuilder->getBitBlockWidth() / fw;
44    for (unsigned move = 1; move < count; move *= 2) {
45        field = iBuilder->simd_add(fw, field, iBuilder->mvmd_slli(fw, field, move));
46    }
47    return field;
48}
49
50SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
51: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
52                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
53                  {}, {}, {}, {})
54, mStreamCount(streamCount)
55, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
56// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
57// to prevent rounding errors.
58, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
59, mPEXTWidth(PEXT_width)
60{
61    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
62        && "mDelCountFieldWidth must be a power of 2");
63    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
64    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
65
66    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
67    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);
68
69    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
70    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
71    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
72        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
73        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
74    }
75    addScalar(b->getSizeTy(), "pendingOffset");
76}
77
78void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
79    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
80    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
81    generateProcessingLoop(b, delMask, false);
82}
83
84void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
85    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
86    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
87    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
88    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
89    generateProcessingLoop(b, delMask, true);
90}
91
92/*
93What this function does in pseudo code:
94for (mSwizzleFactor)
95    create a swizzle set containing mSwizzleFactor blocks
96    apply PEXT to each block in the swizzle set
97    store the swizzleSet in PEXTedSwizzleSets vector
98
99for (each swizzle row i)
100    for (each swizzle set j)
101        processes row i in swizzle set j
102        store output in pendingData[j]
103*/
104
105void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {
106
107    // selectors marks the positions we want to keep
108    Value * const selectors = b->CreateNot(delMask);
109
110    const auto swizzleSets = makeSwizzleSets(b, selectors);
111
112    // Compress the PEXTedSwizzleSets
113    // Output is written and committed to the output buffer one swizzle at a time.
114    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
115    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
116    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
117    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
118    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);
119
120    // All output groups have the same count.
121    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
122    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
123    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
124    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
125
126    // There is a separate vector of pending data for each swizzle group.
127    std::vector<Value *> pendingData;
128    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
129        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
130    }
131
132    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);
133
134    // For each row i
135    for (unsigned i = 0; i < mSwizzleFactor; i++) {
136
137        // Generate code for each of the mSwizzleFactor fields making up a block.
138        // We load the count for the field and process all swizzle groups accordingly.
139        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
140        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
141        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
142        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);
143
144        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
145        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
146
147        // Data from the ith swizzle pack of each group is processed
148        // according to the same newItemCount, pendingSpace, ...
149        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
150            Value * const newItems = swizzleSets[j][i];
151            // Combine as many of the new items as possible into the pending group.
152            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
153            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
154            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
155            // To avoid an unpredictable branch, always store the combined group, whether full or not.
156            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
157            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
158            // Any items in excess of the space available in the current pending group overflow for the next group.
159            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
160            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
161            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
162        }
163
164        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
165        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);
166
167        outputProduced = b->CreateAdd(outputProduced, newItemCount);
168    }
169
170    if (flush) { // incase we selected the overflow group on the final iteration
171        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
172        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
173        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
174            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
175            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
176        }
177    } else {
178        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
179            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
180        }
181        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
182        b->setScalarField("pendingOffset", pendingOffset);
183        // unless this is our final stride, don't report partially written fields.
184        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
185    }
186
187    b->setProducedItemCount("outputSwizzle0", outputProduced);
188}
189
190/*
191Apply PEXT deletion to the blocks in strms and swizzle the result.
192
193Q: Why is it advantageous to swizzle the PEXTed streams?
194
195A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
196
197Example:
198Input:     11101101
199PEXT mask: 11110000
200Output:    00001110
201
202PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
203cleared. This has implications if we're working with multiple streams.
204
205For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
206(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
207Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
208Stream 2:   qrstuv00 wxy00000 z1000000 23456000
209Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
210Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
211
212If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
213should be compress by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
214time:
215
216The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
217
218Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
219Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
220Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
221Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
222
223Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
224compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
225
226Args:
227    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
228    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
229        sequentially to PEXT a full block.)
230
231Returns:
232    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
233*/
234
235std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {
236
237    Constant * pext = nullptr;
238    if (mPEXTWidth == 64) {
239        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
240    } else if (mPEXTWidth == 32) {
241        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
242    }
243
244    Value * const m = b->fwCast(mPEXTWidth, selectors);
245
246    std::vector<Value *> masks(mSwizzleFactor);
247    for (unsigned i = 0; i < mSwizzleFactor; i++) {
248        masks[i] = b->CreateExtractElement(m, i);
249
250    }
251
252    std::vector<std::vector<Value *>> swizzleSets;
253    swizzleSets.reserve(mSwizzleSetCount);
254
255    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
256
257    UndefValue * const outputInitializer = UndefValue::get(vecTy);
258
259    std::vector<Value *> input(mSwizzleFactor);
260    // For each of the k swizzle sets required to apply PEXT to all input streams
261    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
262
263        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
264            const unsigned k = (i * mSwizzleFactor) + j;
265            if (k < mStreamCount) {
266                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
267            } else {
268                input[j] = Constant::getNullValue(vecTy);
269            }
270        }
271
272        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
273        // then splat the pext mask and apply it to each output row
274
275        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
276        // For each of the input streams
277        for (unsigned j = 0; j < mSwizzleFactor; j++) {
278            for (unsigned k = 0; k < mSwizzleFactor; k++) {
279                // Load block j,k
280                Value * const field = b->CreateExtractElement(input[j], k);
281                // Apply PEXT deletion
282                Value * const selected = b->CreateCall(pext, {field, masks[k]});
283                // Then store it as our k,j-th output
284                output[k] = b->CreateInsertElement(output[k], selected, j);
285            }
286        }
287
288        swizzleSets.emplace_back(output);
289    }
290
291    return swizzleSets;
292}
293
294// Apply deletion to a set of stream_count input streams to produce a set of output streams.
295// Kernel inputs: stream_count data streams plus one del_mask stream
296// Outputs: the deleted streams, plus a partial sum popcount
297
298void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
299    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
300    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
301    for (unsigned j = 0; j < mStreamCount; ++j) {
302        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
303        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
304        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
305    }
306    Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
307    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
308}
309
310void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
311    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
312    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
313    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
314    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
315    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
316    for (unsigned j = 0; j < mStreamCount; ++j) {
317        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
318        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
319        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
320    }
321    Value * const delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
322    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
323}
324
325DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned fieldWidth, const unsigned streamCount)
326: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
327              {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
328               Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
329              {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
330               Binding{iBuilder->getStreamSetTy(), "deletionCounts", FixedRate(), RoundUpTo(iBuilder->getBitBlockWidth())}},
331              {}, {}, {})
332, mDeletionFieldWidth(fieldWidth)
333, mStreamCount(streamCount) {
334}
335
336// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
337// Kernel inputs: stream_count data streams plus one del_mask stream
338// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
339
340void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
341    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
342    generateProcessingLoop(iBuilder, delMask);
343}
344
345void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
346    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
347    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
348    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
349    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
350    generateProcessingLoop(iBuilder, delMask);
351}
352
353void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, Value * delMask) {
354    Constant * PEXT_func = nullptr;
355    if (mPEXTWidth == 64) {
356        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
357    } else if (mPEXTWidth == 32) {
358        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
359    }
360    std::vector<Value *> masks(mSwizzleFactor);
361    Value * const m = iBuilder->fwCast(mSwizzleFactor, iBuilder->simd_not(delMask));
362    for (unsigned i = 0; i < mSwizzleFactor; i++) {
363        masks.push_back(iBuilder->CreateExtractElement(m, i));
364    }
365
366    for (unsigned i = 0; i < mStreamCount; ++i) {
367        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i));
368        Value * value = iBuilder->fwCast(mPEXTWidth, input);
369        Value * output = UndefValue::get(value->getType());
370        for (unsigned j = 0; j < mSwizzleFactor; j++) {
371            Value * field = iBuilder->CreateExtractElement(value, j);
372            Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[j]});
373            output = iBuilder->CreateInsertElement(output, compressed, j);
374        }
375        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(i), output);
376    }
377    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
378    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
379}
380
381DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
382: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
383              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
384                  Binding{b->getStreamSetTy(), "delMaskSet"}},
385              {}, {}, {}, {})
386, mDelCountFieldWidth(fw)
387, mStreamCount(streamCount)
388, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
389, mPEXTWidth(PEXT_width) {
390    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
391    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
392}
393
394   
395//
396// This kernel performs final stream compression for a set of N bitstreams, given
397// (a) a set of bitstreams partially compressed within K-bit fields and stored
398//     in K-bit swizzled form, and
399// (b) a stream of deletion/extraction counts per K-bit stride.
400//
401// Restrictions:  At present, only K=64 is supported.
402//                At present, N must be an exact multiple of BLOCK_SIZE/K.
403//
404// The kernel always consumes full blocks of input and emits data into the output
405// buffer in swizzles of K items at a time.   Upon completion of a segment,
406// up to K-1 pending output items per stream may be stored in the kernel state.
407//
408// Note: that both input streams and output streams are stored in swizzled form.
409//
410
411SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned bitStreamCount, unsigned fieldWidth)
412: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
413                     {Binding{iBuilder->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
414, mBitStreamCount(bitStreamCount)
415    , mFieldWidth(fieldWidth)
416    , mSwizzleFactor(iBuilder->getBitBlockWidth() / fieldWidth)
417    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
418        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
419        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
420        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
421        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
422        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
423        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
424            mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
425            mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
426            addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
427        }
428        addScalar(iBuilder->getSizeTy(), "pendingOffset");
429}
430   
431void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
432       
433    Value * countsPerStridePtr = iBuilder->getInputStreamBlockPtr("countsPerStride", iBuilder->getInt32(0));
434    Value * countStreamPtr = iBuilder->CreatePointerCast(countsPerStridePtr, iBuilder->getIntNTy(mFieldWidth)->getPointerTo());
435   
436    // Output is written and committed to the output buffer one swizzle at a time.
437    //
438    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
439    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
440   
441    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
442    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
443    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
444
445    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
446    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
447    // There is a separate vector of pending data for each swizzle group.
448    std::vector<Value *> pendingData;
449    std::vector<Value *> outputStreamPtr;
450    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
451        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
452        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
453    }
454   
455    // Generate code for each of the mSwizzleFactor fields making up a block.
456    // We load the count for the field and process all swizzle groups accordingly.
457    for (unsigned i = 0; i < mSwizzleFactor; i++) {
458        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
459        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
460        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
461       
462        // Data from the ith swizzle pack of each group is processed
463        // according to the same newItemCount, pendingSpace, ...
464        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
465            Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
466            // Combine as many of the new items as possible into the pending group.
467            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
468            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
469            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
470            // Any items in excess of the space available in the current pending group overflow for the next group.
471            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mFieldWidth, pendingSpace));
472            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
473            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
474        }
475        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
476        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
477    }
478    iBuilder->setScalarField("pendingOffset", pendingOffset);   
479    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
480    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
481    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
482        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
483    }
484    iBuilder->setProducedItemCount("outputSwizzle0", produced);
485}
486
487void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * /* remainingBytes */) {
488    CreateDoBlockMethodCall(iBuilder);
489    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
490    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
491   
492    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
493    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
494    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
495    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
496
497    // Write the pending data.
498    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
499        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
500        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
501        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
502    }
503    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
504}
505}
Note: See TracBrowser for help on using the repository browser.