source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 5848

Last change on this file since 5848 was 5848, checked in by xwa163, 16 months ago
  1. Fix bug of SwizzledDeleteByPEXTKernel
  2. Add character_deletion pipeline and character_deletion_test to make sure SwizzledDeleteByPEXTKernel related logic work correctly
File size: 36.6 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask) {
15    Value * m = iBuilder->simd_not(del_mask);
16    Value * mk = iBuilder->simd_slli(fw, del_mask, 1);
17    std::vector<Value *> move_masks;
18    for (unsigned shift = 1; shift < fw; shift *= 2) {
19        Value * mp = mk;
20        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
21            mp = iBuilder->simd_xor(mp, iBuilder->simd_slli(fw, mp, lookright));
22        }
23        Value * mv = iBuilder->simd_and(mp, m);
24        m = iBuilder->simd_or(iBuilder->simd_xor(m, mv), iBuilder->simd_srli(fw, mv, shift));
25        mk = iBuilder->simd_and(mk, iBuilder->simd_not(mp));
26        move_masks.push_back(mv);
27    }
28    return move_masks;
29}
30
31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
32    Value * s = iBuilder->simd_and(strm, iBuilder->simd_not(del_mask));
33    for (unsigned i = 0; i < mv.size(); i++) {
34        unsigned shift = 1 << i;
35        Value * t = iBuilder->simd_and(s, mv[i]);
36        s = iBuilder->simd_or(iBuilder->simd_xor(s, t), iBuilder->simd_srli(fw, t, shift));
37    }
38    return s;
39}
40
41inline Value * partial_sum_popcount(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * mask) {
42    Value * field = iBuilder->simd_popcount(fw, mask);
43    const auto count = iBuilder->getBitBlockWidth() / fw;
44    for (unsigned move = 1; move < count; move *= 2) {
45        field = iBuilder->simd_add(fw, field, iBuilder->mvmd_slli(fw, field, move));
46    }
47    return field;
48}
49
50SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, unsigned PEXT_width)
51: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount),
52                  {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"}, Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
53                  {}, {}, {}, {})
54, mDelCountFieldWidth(fw)
55, mStreamCount(streamCount)
56, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
57// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
58// to prevent rounding errors.
59, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
60, mPEXTWidth(PEXT_width)
61{
62    assert((mDelCountFieldWidth > 0) && ((mDelCountFieldWidth & (mDelCountFieldWidth - 1)) == 0)
63        && "mDelCountFieldWidth must be a power of 2");
64    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
65    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
66
67    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
68    mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
69    addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
70    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
71        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1),
72            "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
73        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
74    }
75    addScalar(iBuilder->getSizeTy(), "pendingOffset");
76}
77
78void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
79    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
80    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
81    const auto masks = get_PEXT_masks(iBuilder, delMask);
82    generateProcessingLoop(iBuilder, masks, delMask);
83}
84
85void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
86    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
87    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
88    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
89    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
90    const auto masks = get_PEXT_masks(iBuilder, delMask);
91    generateProcessingLoop(iBuilder, masks, delMask);
92    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
93    Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
94   
95    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
96    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
97    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
98    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
99
100    // Write the pending data.
101    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
102        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
103        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
104                // TODO it seems that we do not need to store pending data here
105        // iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
106    }
107    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
108}
109
110std::vector<Value *> SwizzledDeleteByPEXTkernel::get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
111    // Del mask marks locations of bits we want to delete with 1 bits. Delete marked bits by extracting only the bits not marked in this way.
112    // Apply the PEXT operation mPEXTWidth bits at a time (e.g. if block is 256 bits and mPEXTWidth is 64, apply 4 PEXT ops to full process block.
113    Value * m = iBuilder->fwCast(mPEXTWidth, iBuilder->simd_not(del_mask)); 
114    std::vector<Value *> masks;
115    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
116        masks.push_back(iBuilder->CreateExtractElement(m, i));
117    }
118    return masks;
119}
120
121void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
122                                                Value * delMask) {
123    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask)); // delMask marks the positions we want to extract
124    std::vector<Value *> counts;
125    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/ mPEXTWidth; i++) {
126        // Store the deletion counts for each PEXT field
127        counts.push_back(iBuilder->CreateExtractElement(delCount, i)); // Extract field i from SIMD register delCount
128    }
129
130    generatePEXTAndSwizzleLoop(iBuilder, masks, counts);
131}
132
133/*
134What this function does in pseudo code:
135for (mSwizzleFactor)
136        create a swizzle set containing mSwizzleFactor blocks
137        apply PEXT to each block in the swizzle set
138        store the swizzleSet in PEXTedSwizzleSets vector
139       
140for (each swizzle row i)
141        for (each swizzle set j)
142                processes row i in swizzle set j
143                store output in pendingData[j]
144*/
145void SwizzledDeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
146                                                    std::vector<Value *> counts) {
147    // For each of the k swizzle sets required to apply PEXT to all input streams
148    std::vector<std::vector<Value *>> PEXTedSwizzleSets;
149    for (unsigned j = 0; j < mSwizzleSetCount; ++j) {
150    // Group input blocks together into input swizzle set. Input set should contain mSwizzleSetCount blocks (e.g. for U8U16 16/4=4).
151    // Each block belongs to a different input stream.
152        std::vector<Value *> input;
153        unsigned streamSelectionIndex = j * mSwizzleFactor;
154        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
155                // Check if i > mStreamCount. If it is, add null streams until we get mSwizzleSetCount streams in the input vector
156            if ( i >= mStreamCount) {
157                                input.push_back(iBuilder->allZeroes());
158            } else {
159                input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
160            }
161        }
162        // each partiallyCompressedSwizzleSet is obtained by applying PEXT to each of the blocks in input
163        PEXTedSwizzleSets.push_back(apply_PEXT_deletion_with_swizzle(iBuilder, masks, input));
164    }
165        // Compress the PEXTedSwizzleSets
166    // Output is written and committed to the output buffer one swizzle at a time.
167    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
168    Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
169   
170    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
171    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
172    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
173
174    // There may be pending data in the kernel state, for up to mDelCountFieldWidth-1 bits per stream.
175    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
176    // There is a separate vector of pending data for each swizzle group.
177    std::vector<Value *> pendingData;
178    std::vector<Value *> outputStreamPtr;
179
180    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
181        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
182        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
183    }
184
185    // For each row i
186    for (unsigned i = 0; i < mSwizzleFactor; i++) {
187        // Generate code for each of the mSwizzleFactor fields making up a block.
188        // We load the count for the field and process all swizzle groups accordingly.
189        Value * newItemCount = counts[i];
190        //iBuilder->CallPrintInt("NeW ITeM COUNT!", newItemCount); //TODO remove
191        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mDelCountFieldWidth), pendingOffset);
192        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
193       
194        // Data from the ith swizzle pack of each group is processed
195        // according to the same newItemCount, pendingSpace, ...
196        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
197            Value * newItems = PEXTedSwizzleSets[j][i];
198            //iBuilder->CallPrintRegister("NeW ITeMS!", newItems); //TODO remove
199            // Combine as many of the new items as possible into the pending group.
200            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mDelCountFieldWidth,
201                pendingOffset)));
202            //iBuilder->CallPrintRegister("ComBineDGROUP", combinedGroup);
203            // To avoid an unpredictable branch, always store the combined group, whether full or not.             
204            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
205           
206            // Any items in excess of the space available in the current pending group overflow for the next group.
207            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mDelCountFieldWidth, pendingSpace));
208            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
209            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
210        }
211        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
212        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mDelCountFieldWidth-1));
213    }
214   
215    iBuilder->setScalarField("pendingOffset", pendingOffset);
216    //iBuilder->CallPrintInt("pendingOffset", pendingOffset);
217   
218    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
219    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
220    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
221        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
222        //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
223    }
224    iBuilder->setProducedItemCount("outputSwizzle0", produced);
225}
226
227/*
228Apply PEXT deletion to the blocks in strms and swizzle the result.
229
230Q: Why is it advantageous to swizzle the PEXTed streams?
231
232A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
233
234Example:
235Input:     11101101
236PEXT mask: 11110000
237Output:    00001110
238
239PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
240cleared. This has implications if we're working with multiple streams.
241
242For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
243(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
244Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
245Stream 2:   qrstuv00 wxy00000 z1000000 23456000
246Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
247Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
248
249If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
250should be compress by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
251time:
252
253The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
254
255Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
256Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
257Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
258Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
259
260Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
261compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
262
263Args:
264    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
265    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
266        sequentially to PEXT a full block.)
267
268Returns:
269    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
270*/
271std::vector<Value *> SwizzledDeleteByPEXTkernel::apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, 
272                                                             const std::vector<Value *> & masks, std::vector<Value *> strms) {
273    Value * PEXT_func = nullptr;
274    if (mPEXTWidth == 64) {
275        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
276    } else if (mPEXTWidth == 32) {
277        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
278    }
279   
280    std::vector<Value *> output;     
281    for (unsigned i = 0; i < strms.size(); i++) {
282        Value * v = iBuilder->fwCast(mPEXTWidth, strms[i]);
283        output.push_back(Constant::getNullValue(v->getType())); 
284    }
285
286    // For each of the input streams
287    for (unsigned j = 0; j < strms.size(); j++) {
288        Value * v = iBuilder->fwCast(mPEXTWidth, strms[j]); // load stream j
289        // Process the stream's block mPEXTWidth bits at a time (a PEXT operation can't do more than 64 bits at a time)
290        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
291            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (load mPEXTWidth bits)
292            Value * PEXTed_field = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the segment we just loaded
293            /*
294             We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks.
295             E.g.:
296
297               *i*
298            *j* a b c d strms[0]
299                e f g h
300                i j k l
301                m n o p
302
303             Apply pext deletion at each position, then swizzle results:
304               *j*
305            *i* a` e` i` m` output[0]
306                b` f` j` n`
307                c` g` k` o` 
308                d` i` l` p`         
309            */   
310            output[i] = iBuilder->CreateInsertElement(output[i], PEXTed_field, j); 
311            /*
312            numCompressedBits = 0
313
314            for (each swizzleField position j)
315                for (each input swizzle i)
316                    get PEXTed_field
317                    Shift PEXTed_field left by "numCompressedBits" (in output[i])
318                    OR PEXTed_field into output[i] (output[i] is output swizzle buffer for input swizzle i)
319                numCompressedBits += popcount(mask[i])
320            */
321        }
322    }
323   
324    return output;
325}
326
327Value * SwizzledDeleteByPEXTkernel::apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
328    Value * PEXT_func = nullptr;
329    if (mPEXTWidth == 64) {
330        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
331    } else if (mPEXTWidth == 32) {
332        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
333    }
334       
335    Value * v = iBuilder->fwCast(mPEXTWidth, strm);
336    Value * output = Constant::getNullValue(v->getType());
337    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
338        Value * field = iBuilder->CreateExtractElement(v, i);
339        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
340        output = iBuilder->CreateInsertElement(output, compressed, i);
341    }
342    return output;
343}
344
345// Apply deletion to a set of stream_count input streams to produce a set of output streams.
346// Kernel inputs: stream_count data streams plus one del_mask stream
347// Outputs: the deleted streams, plus a partial sum popcount
348
349void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
350    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
351    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
352    for (unsigned j = 0; j < mStreamCount; ++j) {
353        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
354        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
355        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
356    }
357    Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
358    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
359}
360
361void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
362    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
363    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
364    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
365    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
366    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
367    for (unsigned j = 0; j < mStreamCount; ++j) {
368        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
369        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
370        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
371    }
372    Value * const delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
373    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
374}
375
376DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned fieldWidth, const unsigned streamCount)
377: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
378              {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
379               Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
380              {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
381               Binding{iBuilder->getStreamSetTy(), "deletionCounts", FixedRate(), RoundUpTo(iBuilder->getBitBlockWidth())}},
382              {}, {}, {})
383, mDeletionFieldWidth(fieldWidth)
384, mStreamCount(streamCount) {
385}
386
387const unsigned PEXT_width = 64;
388
389inline std::vector<Value *> get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
390    Value * m = iBuilder->fwCast(PEXT_width, iBuilder->simd_not(del_mask));
391    std::vector<Value *> masks;
392    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
393        masks.push_back(iBuilder->CreateExtractElement(m, i));
394    }
395    return masks;
396}
397
398// Apply PEXT deletion to a collection of blocks and swizzle the result.
399// strms contains the blocks to process
400inline std::vector<Value *> apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, std::vector<Value *> strms) {
401    Value * PEXT_func = nullptr;
402    if (PEXT_width == 64) {
403        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
404    } else if (PEXT_width == 32) {
405        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
406    }
407   
408    std::vector<Value *> output;     
409    for (unsigned i = 0; i < strms.size(); i++) {
410        Value * v = iBuilder->fwCast(PEXT_width, strms[i]);
411        output.push_back(Constant::getNullValue(v->getType())); 
412    }
413
414    // For each of the input streams
415    for (unsigned j = 0; j < strms.size(); j++) {
416        Value * v = iBuilder->fwCast(PEXT_width, strms[j]); // load stream j
417        // Process the stream's block in PEXT_width chunks (PEXT operation can't do more than 64 bits at a time)
418        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
419            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (fw of j is PEXT_width)
420            Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the block segment we just loaded
421            /*
422             We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks . E.g.:
423
424             a b c d
425             e f g h
426             i j k l
427             m n o p
428
429             Apply pext deletion at each position, then swizzle results:
430
431             a` e` i` m`
432             b` f` j` n`
433             c` g` k` o` 
434             d` i` l` p`         
435            */     
436            output[i] = iBuilder->CreateInsertElement(output[i], compressed, j); 
437        }
438    }
439   
440    return output;
441}
442
443inline Value * apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
444    Value * PEXT_func = nullptr;
445    if (PEXT_width == 64) {
446        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
447    } else if (PEXT_width == 32) {
448        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
449    }
450       
451    Value * v = iBuilder->fwCast(PEXT_width, strm);
452    Value * output = Constant::getNullValue(v->getType());
453    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
454        Value * field = iBuilder->CreateExtractElement(v, i);
455        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
456        output = iBuilder->CreateInsertElement(output, compressed, i);
457    }
458    return output;
459}
460
461// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
462// Kernel inputs: stream_count data streams plus one del_mask stream
463// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
464
465void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
466    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
467    const auto masks = get_PEXT_masks(iBuilder, delMask);
468    generateProcessingLoop(iBuilder, masks, delMask);
469}
470
471void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
472    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
473    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
474    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
475    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
476    const auto masks = get_PEXT_masks(iBuilder, delMask);
477    generateProcessingLoop(iBuilder, masks, delMask);
478}
479
480void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * delMask) {
481    if (mShouldSwizzle) {
482        generatePEXTAndSwizzleLoop(iBuilder, masks);
483    } else {
484        generatePEXTLoop(iBuilder, masks);
485    }
486    //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
487    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
488    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
489}
490
491void DeleteByPEXTkernel::generatePEXTLoop(const std::unique_ptr<KernelBuilder> &iBuilder, const std::vector<Value *> & masks) {
492    for (unsigned j = 0; j < mStreamCount; ++j) {
493        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
494        Value * output = apply_PEXT_deletion(iBuilder, masks, input);
495        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
496    }
497}
498
499void DeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks) {
500    // Group blocks together into input vector. Input should contain mStreamCount/mSwizzleFactor blocks (e.g. for U8U16 16/4=4)
501    // mStreamCount/mSwizzleFactor -> (mStreamCount + mSwizzleFactor - 1) / mSwizzleFactor
502    for (unsigned j = 0; j < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; ++j) {
503        std::vector<Value *> input;
504        unsigned streamSelectionIndex = j * mSwizzleFactor;
505        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
506                // Check if i > mStreamCount. If it is, add null streams until we get mStreamCount/mSwizzleFactor streams in the input vector
507            if ( i >= mStreamCount) {
508                                input.push_back(iBuilder->allZeroes());
509            } else {
510                input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
511            }
512        }
513        std::vector<Value *> output = apply_PEXT_deletion_with_swizzle(iBuilder, masks, input);
514        for (unsigned i = 0; i < mSwizzleFactor; i++) { 
515             iBuilder->storeOutputStreamBlock(std::string(mOutputSwizzleNameBase) + std::to_string(j), iBuilder->getInt32(i), output[i]);
516        }
517    }
518}
519
520DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, bool shouldSwizzle)
521: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + (shouldSwizzle ? "swiz" : "noswiz"),
522                  {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
523                      Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
524                  {}, {}, {}, {})
525, mDelCountFieldWidth(fw)
526, mStreamCount(streamCount)
527, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
528, mShouldSwizzle(shouldSwizzle)
529{
530    if(mShouldSwizzle) {       
531        for (unsigned i = 0; i < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; i++) { 
532            mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mSwizzleFactor), std::string(mOutputSwizzleNameBase) + std::to_string(i));
533        }
534    } else {
535        // No swizzling. Output results as single stream set
536        mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mStreamCount), "outputStreamSet");
537    }
538    mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(), "deletionCounts");
539}
540
541   
542//
543// This kernel performs final stream compression for a set of N bitstreams, given
544// (a) a set of bitstreams partially compressed within K-bit fields and stored
545//     in K-bit swizzled form, and
546// (b) a stream of deletion/extraction counts per K-bit stride.
547//
548// Restrictions:  At present, only K=64 is supported.
549//                At present, N must be an exact multiple of BLOCK_SIZE/K.
550//
551// The kernel always consumes full blocks of input and emits data into the output
552// buffer in swizzles of K items at a time.   Upon completion of a segment,
553// up to K-1 pending output items per stream may be stored in the kernel state.
554//
555// Note: that both input streams and output streams are stored in swizzled form.
556//
557
558SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned bitStreamCount, unsigned fieldWidth)
559: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
560                     {Binding{iBuilder->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
561, mBitStreamCount(bitStreamCount)
562    , mFieldWidth(fieldWidth)
563    , mSwizzleFactor(iBuilder->getBitBlockWidth() / fieldWidth)
564    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
565        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
566        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
567        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
568        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
569        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
570        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
571            mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
572            mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
573            addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
574        }
575        addScalar(iBuilder->getSizeTy(), "pendingOffset");
576}
577   
578void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
579       
580    Value * countsPerStridePtr = iBuilder->getInputStreamBlockPtr("countsPerStride", iBuilder->getInt32(0));
581    Value * countStreamPtr = iBuilder->CreatePointerCast(countsPerStridePtr, iBuilder->getIntNTy(mFieldWidth)->getPointerTo());
582   
583    // Output is written and committed to the output buffer one swizzle at a time.
584    //
585    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
586    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
587   
588    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
589    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
590    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
591
592    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
593    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
594    // There is a separate vector of pending data for each swizzle group.
595    std::vector<Value *> pendingData;
596    std::vector<Value *> outputStreamPtr;
597    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
598        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
599        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
600    }
601   
602    // Generate code for each of the mSwizzleFactor fields making up a block.
603    // We load the count for the field and process all swizzle groups accordingly.
604    for (unsigned i = 0; i < mSwizzleFactor; i++) {
605        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
606    //iBuilder->CallPrintInt("newItemCount", newItemCount);
607        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
608        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
609       
610        // Data from the ith swizzle pack of each group is processed
611        // according to the same newItemCount, pendingSpace, ...
612        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
613            Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
614        //iBuilder->CallPrintRegister("newItems", newItems);
615            // Combine as many of the new items as possible into the pending group.
616            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
617            //iBuilder->CallPrintRegister("combinedGroup", combinedGroup);
618            // To avoid an unpredictable branch, always store the combined group, whether full or not.
619               
620            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
621            // Any items in excess of the space available in the current pending group overflow for the next group.
622            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mFieldWidth, pendingSpace));
623            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
624            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
625        }
626        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
627        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
628    }
629    iBuilder->setScalarField("pendingOffset", pendingOffset);   
630    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
631    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
632    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
633        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
634        //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
635
636    }
637    iBuilder->setProducedItemCount("outputSwizzle0", produced);
638}
639
640void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * /* remainingBytes */) {
641    CreateDoBlockMethodCall(iBuilder);
642    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
643    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
644   
645    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
646    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
647    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
648    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
649
650    // Write the pending data.
651    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
652        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
653        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
654        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
655    }
656    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
657}
658}
Note: See TracBrowser for help on using the repository browser.