source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 5822

Last change on this file since 5822 was 5755, checked in by nmedfort, 23 months ago

Bug fixes and simplified MultiBlockKernel? logic

File size: 36.6 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask) {
15    Value * m = iBuilder->simd_not(del_mask);
16    Value * mk = iBuilder->simd_slli(fw, del_mask, 1);
17    std::vector<Value *> move_masks;
18    for (unsigned shift = 1; shift < fw; shift *= 2) {
19        Value * mp = mk;
20        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
21            mp = iBuilder->simd_xor(mp, iBuilder->simd_slli(fw, mp, lookright));
22        }
23        Value * mv = iBuilder->simd_and(mp, m);
24        m = iBuilder->simd_or(iBuilder->simd_xor(m, mv), iBuilder->simd_srli(fw, mv, shift));
25        mk = iBuilder->simd_and(mk, iBuilder->simd_not(mp));
26        move_masks.push_back(mv);
27    }
28    return move_masks;
29}
30
31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
32    Value * s = iBuilder->simd_and(strm, iBuilder->simd_not(del_mask));
33    for (unsigned i = 0; i < mv.size(); i++) {
34        unsigned shift = 1 << i;
35        Value * t = iBuilder->simd_and(s, mv[i]);
36        s = iBuilder->simd_or(iBuilder->simd_xor(s, t), iBuilder->simd_srli(fw, t, shift));
37    }
38    return s;
39}
40
41inline Value * partial_sum_popcount(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * mask) {
42    Value * field = iBuilder->simd_popcount(fw, mask);
43    const auto count = iBuilder->getBitBlockWidth() / fw;
44    for (unsigned move = 1; move < count; move *= 2) {
45        field = iBuilder->simd_add(fw, field, iBuilder->mvmd_slli(fw, field, move));
46    }
47    return field;
48}
49
50SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, unsigned PEXT_width)
51: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount),
52                  {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"}, Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
53                  {}, {}, {}, {})
54, mDelCountFieldWidth(fw)
55, mStreamCount(streamCount)
56, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
57// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
58// to prevent rounding errors.
59, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
60, mPEXTWidth(PEXT_width)
61{
62    assert((mDelCountFieldWidth > 0) && ((mDelCountFieldWidth & (mDelCountFieldWidth - 1)) == 0)
63        && "mDelCountFieldWidth must be a power of 2");
64    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
65    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
66
67    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
68    mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
69    addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
70    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
71        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1),
72            "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
73        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
74    }
75    addScalar(iBuilder->getSizeTy(), "pendingOffset");
76}
77
78void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
79    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
80    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
81    const auto masks = get_PEXT_masks(iBuilder, delMask);
82    generateProcessingLoop(iBuilder, masks, delMask);
83}
84
85void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
86    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
87    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
88    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
89    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
90    const auto masks = get_PEXT_masks(iBuilder, delMask);
91    generateProcessingLoop(iBuilder, masks, delMask);
92    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
93    Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
94   
95    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
96    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
97    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
98    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
99
100    // Write the pending data.
101    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
102        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
103        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
104        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
105    }
106    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
107}
108
109std::vector<Value *> SwizzledDeleteByPEXTkernel::get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
110    // Del mask marks locations of bits we want to delete with 1 bits. Delete marked bits by extracting only the bits not marked in this way.
111    // Apply the PEXT operation mPEXTWidth bits at a time (e.g. if block is 256 bits and mPEXTWidth is 64, apply 4 PEXT ops to full process block.
112    Value * m = iBuilder->fwCast(mPEXTWidth, iBuilder->simd_not(del_mask)); 
113    std::vector<Value *> masks;
114    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
115        masks.push_back(iBuilder->CreateExtractElement(m, i));
116    }
117    return masks;
118}
119
120void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
121                                                Value * delMask) {
122    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask)); // delMask marks the positions we want to extract
123    std::vector<Value *> counts;
124    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/ mPEXTWidth; i++) {
125        // Store the deletion counts for each PEXT field
126        counts.push_back(iBuilder->CreateExtractElement(delCount, i)); // Extract field i from SIMD register delCount
127    }
128
129    generatePEXTAndSwizzleLoop(iBuilder, masks, counts);
130}
131
132/*
133What this function does in pseudo code:
134for (mSwizzleFactor)
135        create a swizzle set containing mSwizzleFactor blocks
136        apply PEXT to each block in the swizzle set
137        store the swizzleSet in PEXTedSwizzleSets vector
138       
139for (each swizzle row i)
140        for (each swizzle set j)
141                processes row i in swizzle set j
142                store output in pendingData[j]
143*/
144void SwizzledDeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
145                                                    std::vector<Value *> counts) {
146    // For each of the k swizzle sets required to apply PEXT to all input streams
147    std::vector<std::vector<Value *>> PEXTedSwizzleSets;
148    for (unsigned j = 0; j < mSwizzleSetCount; ++j) {
149    // Group input blocks together into input swizzle set. Input set should contain mSwizzleSetCount blocks (e.g. for U8U16 16/4=4).
150    // Each block belongs to a different input stream.
151        std::vector<Value *> input;
152        unsigned streamSelectionIndex = j * mSwizzleFactor;
153        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
154                // Check if i > mStreamCount. If it is, add null streams until we get mSwizzleSetCount streams in the input vector
155            if ( i >= mStreamCount) {
156                                input.push_back(iBuilder->allZeroes());
157            } else {
158                input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
159            }
160        }
161        // each partiallyCompressedSwizzleSet is obtained by applying PEXT to each of the blocks in input
162        PEXTedSwizzleSets.push_back(apply_PEXT_deletion_with_swizzle(iBuilder, masks, input));
163    }
164        // Compress the PEXTedSwizzleSets
165    // Output is written and committed to the output buffer one swizzle at a time.
166    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
167    Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
168   
169    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
170    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
171    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
172
173    // There may be pending data in the kernel state, for up to mDelCountFieldWidth-1 bits per stream.
174    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
175    // There is a separate vector of pending data for each swizzle group.
176    std::vector<Value *> pendingData;
177    std::vector<Value *> outputStreamPtr;
178
179    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
180        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
181        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
182    }
183
184    // For each row i
185    for (unsigned i = 0; i < mSwizzleFactor; i++) {
186        // Generate code for each of the mSwizzleFactor fields making up a block.
187        // We load the count for the field and process all swizzle groups accordingly.
188        Value * newItemCount = counts[i];
189        //iBuilder->CallPrintInt("NeW ITeM COUNT!", newItemCount); //TODO remove
190        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mDelCountFieldWidth), pendingOffset);
191        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
192       
193        // Data from the ith swizzle pack of each group is processed
194        // according to the same newItemCount, pendingSpace, ...
195        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
196            Value * newItems = PEXTedSwizzleSets[j][i];
197            //iBuilder->CallPrintRegister("NeW ITeMS!", newItems); //TODO remove
198            // Combine as many of the new items as possible into the pending group.
199            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mDelCountFieldWidth,
200                pendingOffset)));
201            //iBuilder->CallPrintRegister("ComBineDGROUP", combinedGroup);
202            // To avoid an unpredictable branch, always store the combined group, whether full or not.             
203            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
204           
205            // Any items in excess of the space available in the current pending group overflow for the next group.
206            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mDelCountFieldWidth, pendingSpace));
207            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
208            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
209        }
210        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
211        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mDelCountFieldWidth-1));
212    }
213   
214    iBuilder->setScalarField("pendingOffset", pendingOffset);
215    //iBuilder->CallPrintInt("pendingOffset", pendingOffset);
216   
217    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
218    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
219    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
220        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
221        //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
222    }
223    iBuilder->setProducedItemCount("outputSwizzle0", produced);
224}
225
226/*
227Apply PEXT deletion to the blocks in strms and swizzle the result.
228
229Q: Why is it advantageous to swizzle the PEXTed streams?
230
231A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
232
233Example:
234Input:     11101101
235PEXT mask: 11110000
236Output:    00001110
237
238PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
239cleared. This has implications if we're working with multiple streams.
240
241For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
242(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
243Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
244Stream 2:   qrstuv00 wxy00000 z1000000 23456000
245Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
246Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
247
248If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
249should be compress by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
250time:
251
252The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
253
254Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
255Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
256Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
257Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
258
259Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
260compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
261
262Args:
263    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
264    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
265        sequentially to PEXT a full block.)
266
267Returns:
268    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
269*/
270std::vector<Value *> SwizzledDeleteByPEXTkernel::apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, 
271                                                             const std::vector<Value *> & masks, std::vector<Value *> strms) {
272    Value * PEXT_func = nullptr;
273    if (mPEXTWidth == 64) {
274        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
275    } else if (mPEXTWidth == 32) {
276        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
277    }
278   
279    std::vector<Value *> output;     
280    for (unsigned i = 0; i < strms.size(); i++) {
281        Value * v = iBuilder->fwCast(mPEXTWidth, strms[i]);
282        output.push_back(Constant::getNullValue(v->getType())); 
283    }
284
285    // For each of the input streams
286    for (unsigned j = 0; j < strms.size(); j++) {
287        Value * v = iBuilder->fwCast(mPEXTWidth, strms[j]); // load stream j
288        // Process the stream's block mPEXTWidth bits at a time (a PEXT operation can't do more than 64 bits at a time)
289        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
290            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (load mPEXTWidth bits)
291            Value * PEXTed_field = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the segment we just loaded
292            /*
293             We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks.
294             E.g.:
295
296               *i*
297            *j* a b c d strms[0]
298                e f g h
299                i j k l
300                m n o p
301
302             Apply pext deletion at each position, then swizzle results:
303               *j*
304            *i* a` e` i` m` output[0]
305                b` f` j` n`
306                c` g` k` o` 
307                d` i` l` p`         
308            */   
309            output[i] = iBuilder->CreateInsertElement(output[i], PEXTed_field, j); 
310            /*
311            numCompressedBits = 0
312
313            for (each swizzleField position j)
314                for (each input swizzle i)
315                    get PEXTed_field
316                    Shift PEXTed_field left by "numCompressedBits" (in output[i])
317                    OR PEXTed_field into output[i] (output[i] is output swizzle buffer for input swizzle i)
318                numCompressedBits += popcount(mask[i])
319            */
320        }
321    }
322   
323    return output;
324}
325
326Value * SwizzledDeleteByPEXTkernel::apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
327    Value * PEXT_func = nullptr;
328    if (mPEXTWidth == 64) {
329        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
330    } else if (mPEXTWidth == 32) {
331        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
332    }
333       
334    Value * v = iBuilder->fwCast(mPEXTWidth, strm);
335    Value * output = Constant::getNullValue(v->getType());
336    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
337        Value * field = iBuilder->CreateExtractElement(v, i);
338        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
339        output = iBuilder->CreateInsertElement(output, compressed, i);
340    }
341    return output;
342}
343
344// Apply deletion to a set of stream_count input streams to produce a set of output streams.
345// Kernel inputs: stream_count data streams plus one del_mask stream
346// Outputs: the deleted streams, plus a partial sum popcount
347
348void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
349    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
350    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
351    for (unsigned j = 0; j < mStreamCount; ++j) {
352        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
353        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
354        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
355    }
356    Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
357    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
358}
359
360void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
361    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
362    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
363    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
364    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
365    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
366    for (unsigned j = 0; j < mStreamCount; ++j) {
367        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
368        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
369        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
370    }
371    Value * const delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
372    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
373}
374
375DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned fieldWidth, const unsigned streamCount)
376: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
377              {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
378               Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
379              {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
380               Binding{iBuilder->getStreamSetTy(), "deletionCounts", FixedRate(), RoundUpTo(iBuilder->getBitBlockWidth())}},
381              {}, {}, {})
382, mDeletionFieldWidth(fieldWidth)
383, mStreamCount(streamCount) {
384}
385
386const unsigned PEXT_width = 64;
387
388inline std::vector<Value *> get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
389    Value * m = iBuilder->fwCast(PEXT_width, iBuilder->simd_not(del_mask));
390    std::vector<Value *> masks;
391    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
392        masks.push_back(iBuilder->CreateExtractElement(m, i));
393    }
394    return masks;
395}
396
397// Apply PEXT deletion to a collection of blocks and swizzle the result.
398// strms contains the blocks to process
399inline std::vector<Value *> apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, std::vector<Value *> strms) {
400    Value * PEXT_func = nullptr;
401    if (PEXT_width == 64) {
402        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
403    } else if (PEXT_width == 32) {
404        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
405    }
406   
407    std::vector<Value *> output;     
408    for (unsigned i = 0; i < strms.size(); i++) {
409        Value * v = iBuilder->fwCast(PEXT_width, strms[i]);
410        output.push_back(Constant::getNullValue(v->getType())); 
411    }
412
413    // For each of the input streams
414    for (unsigned j = 0; j < strms.size(); j++) {
415        Value * v = iBuilder->fwCast(PEXT_width, strms[j]); // load stream j
416        // Process the stream's block in PEXT_width chunks (PEXT operation can't do more than 64 bits at a time)
417        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
418            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (fw of j is PEXT_width)
419            Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the block segment we just loaded
420            /*
421             We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks . E.g.:
422
423             a b c d
424             e f g h
425             i j k l
426             m n o p
427
428             Apply pext deletion at each position, then swizzle results:
429
430             a` e` i` m`
431             b` f` j` n`
432             c` g` k` o` 
433             d` i` l` p`         
434            */     
435            output[i] = iBuilder->CreateInsertElement(output[i], compressed, j); 
436        }
437    }
438   
439    return output;
440}
441
442inline Value * apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
443    Value * PEXT_func = nullptr;
444    if (PEXT_width == 64) {
445        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
446    } else if (PEXT_width == 32) {
447        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
448    }
449       
450    Value * v = iBuilder->fwCast(PEXT_width, strm);
451    Value * output = Constant::getNullValue(v->getType());
452    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
453        Value * field = iBuilder->CreateExtractElement(v, i);
454        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
455        output = iBuilder->CreateInsertElement(output, compressed, i);
456    }
457    return output;
458}
459
460// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
461// Kernel inputs: stream_count data streams plus one del_mask stream
462// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
463
464void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
465    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
466    const auto masks = get_PEXT_masks(iBuilder, delMask);
467    generateProcessingLoop(iBuilder, masks, delMask);
468}
469
470void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
471    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
472    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
473    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
474    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
475    const auto masks = get_PEXT_masks(iBuilder, delMask);
476    generateProcessingLoop(iBuilder, masks, delMask);
477}
478
479void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * delMask) {
480    if (mShouldSwizzle) {
481        generatePEXTAndSwizzleLoop(iBuilder, masks);
482    } else {
483        generatePEXTLoop(iBuilder, masks);
484    }
485    //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
486    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
487    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
488}
489
490void DeleteByPEXTkernel::generatePEXTLoop(const std::unique_ptr<KernelBuilder> &iBuilder, const std::vector<Value *> & masks) {
491    for (unsigned j = 0; j < mStreamCount; ++j) {
492        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
493        Value * output = apply_PEXT_deletion(iBuilder, masks, input);
494        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
495    }
496}
497
498void DeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks) {
499    // Group blocks together into input vector. Input should contain mStreamCount/mSwizzleFactor blocks (e.g. for U8U16 16/4=4)
500    // mStreamCount/mSwizzleFactor -> (mStreamCount + mSwizzleFactor - 1) / mSwizzleFactor
501    for (unsigned j = 0; j < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; ++j) {
502        std::vector<Value *> input;
503        unsigned streamSelectionIndex = j * mSwizzleFactor;
504        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
505                // Check if i > mStreamCount. If it is, add null streams until we get mStreamCount/mSwizzleFactor streams in the input vector
506            if ( i >= mStreamCount) {
507                                input.push_back(iBuilder->allZeroes());
508            } else {
509                input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
510            }
511        }
512        std::vector<Value *> output = apply_PEXT_deletion_with_swizzle(iBuilder, masks, input);
513        for (unsigned i = 0; i < mSwizzleFactor; i++) { 
514             iBuilder->storeOutputStreamBlock(std::string(mOutputSwizzleNameBase) + std::to_string(j), iBuilder->getInt32(i), output[i]);
515        }
516    }
517}
518
519DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, bool shouldSwizzle)
520: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + (shouldSwizzle ? "swiz" : "noswiz"),
521                  {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
522                      Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
523                  {}, {}, {}, {})
524, mDelCountFieldWidth(fw)
525, mStreamCount(streamCount)
526, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
527, mShouldSwizzle(shouldSwizzle)
528{
529    if(mShouldSwizzle) {       
530        for (unsigned i = 0; i < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; i++) { 
531            mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mSwizzleFactor), std::string(mOutputSwizzleNameBase) + std::to_string(i));
532        }
533    } else {
534        // No swizzling. Output results as single stream set
535        mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mStreamCount), "outputStreamSet");
536    }
537    mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(), "deletionCounts");
538}
539
540   
541//
542// This kernel performs final stream compression for a set of N bitstreams, given
543// (a) a set of bitstreams partially compressed within K-bit fields and stored
544//     in K-bit swizzled form, and
545// (b) a stream of deletion/extraction counts per K-bit stride.
546//
547// Restrictions:  At present, only K=64 is supported.
548//                At present, N must be an exact multiple of BLOCK_SIZE/K.
549//
550// The kernel always consumes full blocks of input and emits data into the output
551// buffer in swizzles of K items at a time.   Upon completion of a segment,
552// up to K-1 pending output items per stream may be stored in the kernel state.
553//
554// Note: that both input streams and output streams are stored in swizzled form.
555//
556
557SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned bitStreamCount, unsigned fieldWidth)
558: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
559                     {Binding{iBuilder->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
560, mBitStreamCount(bitStreamCount)
561    , mFieldWidth(fieldWidth)
562    , mSwizzleFactor(iBuilder->getBitBlockWidth() / fieldWidth)
563    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
564        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
565        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
566        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
567        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
568        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
569        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
570            mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
571            mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
572            addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
573        }
574        addScalar(iBuilder->getSizeTy(), "pendingOffset");
575}
576   
577void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
578       
579    Value * countsPerStridePtr = iBuilder->getInputStreamBlockPtr("countsPerStride", iBuilder->getInt32(0));
580    Value * countStreamPtr = iBuilder->CreatePointerCast(countsPerStridePtr, iBuilder->getIntNTy(mFieldWidth)->getPointerTo());
581   
582    // Output is written and committed to the output buffer one swizzle at a time.
583    //
584    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
585    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
586   
587    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
588    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
589    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
590
591    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
592    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
593    // There is a separate vector of pending data for each swizzle group.
594    std::vector<Value *> pendingData;
595    std::vector<Value *> outputStreamPtr;
596    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
597        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
598        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
599    }
600   
601    // Generate code for each of the mSwizzleFactor fields making up a block.
602    // We load the count for the field and process all swizzle groups accordingly.
603    for (unsigned i = 0; i < mSwizzleFactor; i++) {
604        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
605    //iBuilder->CallPrintInt("newItemCount", newItemCount);
606        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
607        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
608       
609        // Data from the ith swizzle pack of each group is processed
610        // according to the same newItemCount, pendingSpace, ...
611        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
612            Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
613        //iBuilder->CallPrintRegister("newItems", newItems);
614            // Combine as many of the new items as possible into the pending group.
615            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
616            //iBuilder->CallPrintRegister("combinedGroup", combinedGroup);
617            // To avoid an unpredictable branch, always store the combined group, whether full or not.
618               
619            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
620            // Any items in excess of the space available in the current pending group overflow for the next group.
621            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mFieldWidth, pendingSpace));
622            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
623            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
624        }
625        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
626        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
627    }
628    iBuilder->setScalarField("pendingOffset", pendingOffset);   
629    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
630    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
631    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
632        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
633        //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
634
635    }
636    iBuilder->setProducedItemCount("outputSwizzle0", produced);
637}
638
639void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * /* remainingBytes */) {
640    CreateDoBlockMethodCall(iBuilder);
641    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
642    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
643   
644    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
645    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
646    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
647    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
648
649    // Write the pending data.
650    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
651        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
652        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
653        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
654    }
655    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
656}
657}
Note: See TracBrowser for help on using the repository browser.