source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 6004

Last change on this file since 6004 was 6004, checked in by cameron, 13 months ago

Restructuring step for DeletionKernel?: move partial sum popcount in p2sWithCompress

File size: 27.1 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask) {
15    Value * m = iBuilder->simd_not(del_mask);
16    Value * mk = iBuilder->simd_slli(fw, del_mask, 1);
17    std::vector<Value *> move_masks;
18    for (unsigned shift = 1; shift < fw; shift *= 2) {
19        Value * mp = mk;
20        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
21            mp = iBuilder->simd_xor(mp, iBuilder->simd_slli(fw, mp, lookright));
22        }
23        Value * mv = iBuilder->simd_and(mp, m);
24        m = iBuilder->simd_or(iBuilder->simd_xor(m, mv), iBuilder->simd_srli(fw, mv, shift));
25        mk = iBuilder->simd_and(mk, iBuilder->simd_not(mp));
26        move_masks.push_back(mv);
27    }
28    return move_masks;
29}
30
31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
32    Value * s = iBuilder->simd_and(strm, iBuilder->simd_not(del_mask));
33    for (unsigned i = 0; i < mv.size(); i++) {
34        unsigned shift = 1 << i;
35        Value * t = iBuilder->simd_and(s, mv[i]);
36        s = iBuilder->simd_or(iBuilder->simd_xor(s, t), iBuilder->simd_srli(fw, t, shift));
37    }
38    return s;
39}
40
41// Apply deletion to a set of stream_count input streams to produce a set of output streams.
42// Kernel inputs: stream_count data streams plus one del_mask stream
43// Outputs: the deleted streams, plus a partial sum popcount
44
45void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
46    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
47    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
48    for (unsigned j = 0; j < mStreamCount; ++j) {
49        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
50        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
51        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
52    }
53    Value * unitCount = iBuilder->simd_popcount(mDeletionFieldWidth, iBuilder->simd_not(delMask));
54    iBuilder->storeOutputStreamBlock("unitCounts", iBuilder->getInt32(0), iBuilder->bitCast(unitCount));
55}
56
57void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
58    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
59    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
60    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
61    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
62    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
63    for (unsigned j = 0; j < mStreamCount; ++j) {
64        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
65        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
66        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
67    }
68    Value * const unitCount = iBuilder->simd_popcount(mDeletionFieldWidth, iBuilder->simd_not(delMask));
69    iBuilder->storeOutputStreamBlock("unitCounts", iBuilder->getInt32(0), iBuilder->bitCast(unitCount));
70}
71
72DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned fieldWidth, const unsigned streamCount)
73: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
74                      {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
75                          Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
76                      {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
77                          Binding{iBuilder->getStreamSetTy(), "unitCounts", FixedRate(), RoundUpTo(iBuilder->getBitBlockWidth())}},
78                      {}, {}, {})
79, mDeletionFieldWidth(fieldWidth)
80, mStreamCount(streamCount) {
81}
82
83   
84   
85SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
86: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
87                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
88                  {}, {}, {}, {})
89, mStreamCount(streamCount)
90, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
91// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
92// to prevent rounding errors.
93, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
94, mPEXTWidth(PEXT_width)
95{
96    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
97        && "mDelCountFieldWidth must be a power of 2");
98    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
99    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
100
101    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
102    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);
103
104    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
105    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
106    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
107        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
108        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
109    }
110    addScalar(b->getSizeTy(), "pendingOffset");
111}
112
113void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
114    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
115    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
116    generateProcessingLoop(b, delMask, false);
117}
118
119void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
120    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
121    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
122    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
123    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
124    generateProcessingLoop(b, delMask, true);
125}
126
127/*
128What this function does in pseudo code:
129for (mSwizzleFactor)
130    create a swizzle set containing mSwizzleFactor blocks
131    apply PEXT to each block in the swizzle set
132    store the swizzleSet in PEXTedSwizzleSets vector
133
134for (each swizzle row i)
135    for (each swizzle set j)
136        processes row i in swizzle set j
137        store output in pendingData[j]
138*/
139
140void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {
141
142    // selectors marks the positions we want to keep
143    Value * const selectors = b->CreateNot(delMask);
144
145    const auto swizzleSets = makeSwizzleSets(b, selectors);
146
147    // Compress the PEXTedSwizzleSets
148    // Output is written and committed to the output buffer one swizzle at a time.
149    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
150    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
151    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
152    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
153    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);
154
155    // All output groups have the same count.
156    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
157    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
158    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
159    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
160
161    // There is a separate vector of pending data for each swizzle group.
162    std::vector<Value *> pendingData;
163    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
164        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
165    }
166
167    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);
168
169    // For each row i
170    for (unsigned i = 0; i < mSwizzleFactor; i++) {
171
172        // Generate code for each of the mSwizzleFactor fields making up a block.
173        // We load the count for the field and process all swizzle groups accordingly.
174        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
175        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
176        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
177        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);
178
179        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
180        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
181
182        // Data from the ith swizzle pack of each group is processed
183        // according to the same newItemCount, pendingSpace, ...
184        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
185            Value * const newItems = swizzleSets[j][i];
186            // Combine as many of the new items as possible into the pending group.
187            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
188            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
189            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
190            // To avoid an unpredictable branch, always store the combined group, whether full or not.
191            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
192            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
193            // Any items in excess of the space available in the current pending group overflow for the next group.
194            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
195            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
196            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
197        }
198
199        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
200        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);
201
202        outputProduced = b->CreateAdd(outputProduced, newItemCount);
203    }
204
205    if (flush) { // incase we selected the overflow group on the final iteration
206        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
207        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
208        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
209            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
210            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
211        }
212    } else {
213        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
214            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
215        }
216        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
217        b->setScalarField("pendingOffset", pendingOffset);
218        // unless this is our final stride, don't report partially written fields.
219        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
220    }
221
222    b->setProducedItemCount("outputSwizzle0", outputProduced);
223}
224
225/*
226Apply PEXT deletion to the blocks in strms and swizzle the result.
227
228Q: Why is it advantageous to swizzle the PEXTed streams?
229
230A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
231
232Example:
233Input:     11101101
234PEXT mask: 11110000
235Output:    00001110
236
237PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
238cleared. This has implications if we're working with multiple streams.
239
240For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
241(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
242Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
243Stream 2:   qrstuv00 wxy00000 z1000000 23456000
244Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
245Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
246
247If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
248should be compress by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
249time:
250
251The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
252
253Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
254Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
255Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
256Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
257
258Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
259compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
260
261Args:
262    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
263    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
264        sequentially to PEXT a full block.)
265
266Returns:
267    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
268*/
269
270std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {
271
272    Constant * pext = nullptr;
273    if (mPEXTWidth == 64) {
274        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
275    } else if (mPEXTWidth == 32) {
276        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
277    }
278
279    Value * const m = b->fwCast(mPEXTWidth, selectors);
280
281    std::vector<Value *> masks(mSwizzleFactor);
282    for (unsigned i = 0; i < mSwizzleFactor; i++) {
283        masks[i] = b->CreateExtractElement(m, i);
284
285    }
286
287    std::vector<std::vector<Value *>> swizzleSets;
288    swizzleSets.reserve(mSwizzleSetCount);
289
290    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
291
292    UndefValue * const outputInitializer = UndefValue::get(vecTy);
293
294    std::vector<Value *> input(mSwizzleFactor);
295    // For each of the k swizzle sets required to apply PEXT to all input streams
296    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
297
298        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
299            const unsigned k = (i * mSwizzleFactor) + j;
300            if (k < mStreamCount) {
301                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
302            } else {
303                input[j] = Constant::getNullValue(vecTy);
304            }
305        }
306
307        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
308        // then splat the pext mask and apply it to each output row
309
310        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
311        // For each of the input streams
312        for (unsigned j = 0; j < mSwizzleFactor; j++) {
313            for (unsigned k = 0; k < mSwizzleFactor; k++) {
314                // Load block j,k
315                Value * const field = b->CreateExtractElement(input[j], k);
316                // Apply PEXT deletion
317                Value * const selected = b->CreateCall(pext, {field, masks[k]});
318                // Then store it as our k,j-th output
319                output[k] = b->CreateInsertElement(output[k], selected, j);
320            }
321        }
322
323        swizzleSets.emplace_back(output);
324    }
325
326    return swizzleSets;
327}
328
329// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
330// Kernel inputs: stream_count data streams plus one del_mask stream
331// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
332
333void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
334    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
335    generateProcessingLoop(iBuilder, delMask);
336}
337
338void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
339    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
340    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
341    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
342    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
343    generateProcessingLoop(iBuilder, delMask);
344}
345
346void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, Value * delMask) {
347    Constant * PEXT_func = nullptr;
348    if (mPEXTWidth == 64) {
349        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
350    } else if (mPEXTWidth == 32) {
351        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
352    }
353    std::vector<Value *> masks(mSwizzleFactor);
354    Value * const m = iBuilder->fwCast(mSwizzleFactor, iBuilder->simd_not(delMask));
355    for (unsigned i = 0; i < mSwizzleFactor; i++) {
356        masks.push_back(iBuilder->CreateExtractElement(m, i));
357    }
358
359    for (unsigned i = 0; i < mStreamCount; ++i) {
360        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i));
361        Value * value = iBuilder->fwCast(mPEXTWidth, input);
362        Value * output = UndefValue::get(value->getType());
363        for (unsigned j = 0; j < mSwizzleFactor; j++) {
364            Value * field = iBuilder->CreateExtractElement(value, j);
365            Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[j]});
366            output = iBuilder->CreateInsertElement(output, compressed, j);
367        }
368        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(i), output);
369    }
370    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
371    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
372}
373
374DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
375: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
376              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
377                  Binding{b->getStreamSetTy(), "delMaskSet"}},
378              {}, {}, {}, {})
379, mDelCountFieldWidth(fw)
380, mStreamCount(streamCount)
381, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
382, mPEXTWidth(PEXT_width) {
383    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
384    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
385}
386
387   
388//
389// This kernel performs final stream compression for a set of N bitstreams, given
390// (a) a set of bitstreams partially compressed within K-bit fields and stored
391//     in K-bit swizzled form, and
392// (b) a stream of deletion/extraction counts per K-bit stride.
393//
394// Restrictions:  At present, only K=64 is supported.
395//                At present, N must be an exact multiple of BLOCK_SIZE/K.
396//
397// The kernel always consumes full blocks of input and emits data into the output
398// buffer in swizzles of K items at a time.   Upon completion of a segment,
399// up to K-1 pending output items per stream may be stored in the kernel state.
400//
401// Note: that both input streams and output streams are stored in swizzled form.
402//
403
404SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned bitStreamCount, unsigned fieldWidth)
405: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
406                     {Binding{iBuilder->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
407, mBitStreamCount(bitStreamCount)
408    , mFieldWidth(fieldWidth)
409    , mSwizzleFactor(iBuilder->getBitBlockWidth() / fieldWidth)
410    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
411        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
412        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
413        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
414        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
415        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
416        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
417            mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
418            mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
419            addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
420        }
421        addScalar(iBuilder->getSizeTy(), "pendingOffset");
422}
423   
424void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
425       
426    Value * countsPerStridePtr = iBuilder->getInputStreamBlockPtr("countsPerStride", iBuilder->getInt32(0));
427    Value * countStreamPtr = iBuilder->CreatePointerCast(countsPerStridePtr, iBuilder->getIntNTy(mFieldWidth)->getPointerTo());
428   
429    // Output is written and committed to the output buffer one swizzle at a time.
430    //
431    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
432    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
433   
434    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
435    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
436    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
437
438    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
439    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
440    // There is a separate vector of pending data for each swizzle group.
441    std::vector<Value *> pendingData;
442    std::vector<Value *> outputStreamPtr;
443    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
444        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
445        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
446    }
447   
448    // Generate code for each of the mSwizzleFactor fields making up a block.
449    // We load the count for the field and process all swizzle groups accordingly.
450    for (unsigned i = 0; i < mSwizzleFactor; i++) {
451        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
452        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
453        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
454       
455        // Data from the ith swizzle pack of each group is processed
456        // according to the same newItemCount, pendingSpace, ...
457        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
458            Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
459            // Combine as many of the new items as possible into the pending group.
460            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
461            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
462            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
463            // Any items in excess of the space available in the current pending group overflow for the next group.
464            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mFieldWidth, pendingSpace));
465            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
466            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
467        }
468        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
469        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
470    }
471    iBuilder->setScalarField("pendingOffset", pendingOffset);   
472    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
473    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
474    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
475        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
476    }
477    iBuilder->setProducedItemCount("outputSwizzle0", produced);
478}
479
480void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * /* remainingBytes */) {
481    CreateDoBlockMethodCall(iBuilder);
482    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
483    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
484   
485    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
486    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
487    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
488    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
489
490    // Write the pending data.
491    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
492        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
493        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
494        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
495    }
496    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
497}
498}
Note: See TracBrowser for help on using the repository browser.