source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 5902

Last change on this file since 5902 was 5853, checked in by xwa163, 18 months ago

Use delMaskSet as principal input stream in SwizzledDeleteByPEXTkernel

File size: 37.2 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask) {
15    Value * m = iBuilder->simd_not(del_mask);
16    Value * mk = iBuilder->simd_slli(fw, del_mask, 1);
17    std::vector<Value *> move_masks;
18    for (unsigned shift = 1; shift < fw; shift *= 2) {
19        Value * mp = mk;
20        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
21            mp = iBuilder->simd_xor(mp, iBuilder->simd_slli(fw, mp, lookright));
22        }
23        Value * mv = iBuilder->simd_and(mp, m);
24        m = iBuilder->simd_or(iBuilder->simd_xor(m, mv), iBuilder->simd_srli(fw, mv, shift));
25        mk = iBuilder->simd_and(mk, iBuilder->simd_not(mp));
26        move_masks.push_back(mv);
27    }
28    return move_masks;
29}
30
31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
32    Value * s = iBuilder->simd_and(strm, iBuilder->simd_not(del_mask));
33    for (unsigned i = 0; i < mv.size(); i++) {
34        unsigned shift = 1 << i;
35        Value * t = iBuilder->simd_and(s, mv[i]);
36        s = iBuilder->simd_or(iBuilder->simd_xor(s, t), iBuilder->simd_srli(fw, t, shift));
37    }
38    return s;
39}
40
41inline Value * partial_sum_popcount(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * mask) {
42    Value * field = iBuilder->simd_popcount(fw, mask);
43    const auto count = iBuilder->getBitBlockWidth() / fw;
44    for (unsigned move = 1; move < count; move *= 2) {
45        field = iBuilder->simd_add(fw, field, iBuilder->mvmd_slli(fw, field, move));
46    }
47    return field;
48}
49
50SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, unsigned PEXT_width)
51: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount),
52                  {Binding{iBuilder->getStreamSetTy(), "delMaskSet"}, Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"}},
53                  {}, {}, {}, {})
54, mDelCountFieldWidth(fw)
55, mStreamCount(streamCount)
56, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
57// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
58// to prevent rounding errors.
59, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
60, mPEXTWidth(PEXT_width)
61{
62    assert((mDelCountFieldWidth > 0) && ((mDelCountFieldWidth & (mDelCountFieldWidth - 1)) == 0)
63        && "mDelCountFieldWidth must be a power of 2");
64    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
65    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
66
67    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
68    mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
69    addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
70    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
71        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1),
72            "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
73        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
74    }
75    addScalar(iBuilder->getSizeTy(), "pendingOffset");
76}
77
78void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
79    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
80    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
81    const auto masks = get_PEXT_masks(iBuilder, delMask);
82    generateProcessingLoop(iBuilder, masks, delMask);
83}
84
85void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
86    const auto originalProducedItemCount = iBuilder->getProducedItemCount("outputSwizzle0");
87    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
88    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
89    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
90    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
91    const auto masks = get_PEXT_masks(iBuilder, delMask);
92    generateProcessingLoop(iBuilder, masks, delMask);
93
94    const auto newProducedItemCount = iBuilder->getProducedItemCount("outputSwizzle0");
95    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
96    Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
97   
98    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
99    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
100    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
101
102    const auto deltaOutputIndex = iBuilder->CreateSub(
103            iBuilder->CreateUDiv(newProducedItemCount, iBuilder->getSize(iBuilder->getBitBlockWidth())),
104            iBuilder->CreateUDiv(originalProducedItemCount, iBuilder->getSize(iBuilder->getBitBlockWidth()))
105    );
106    outputIndex = iBuilder->CreateAdd(outputIndex, iBuilder->CreateMul(deltaOutputIndex, iBuilder->getSize(iBuilder->getBitBlockWidth() / mDelCountFieldWidth)));
107
108    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
109
110    // Write the pending data.
111    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
112        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
113        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
114        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
115    }
116    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
117}
118
119std::vector<Value *> SwizzledDeleteByPEXTkernel::get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
120    // Del mask marks locations of bits we want to delete with 1 bits. Delete marked bits by extracting only the bits not marked in this way.
121    // Apply the PEXT operation mPEXTWidth bits at a time (e.g. if block is 256 bits and mPEXTWidth is 64, apply 4 PEXT ops to full process block.
122    Value * m = iBuilder->fwCast(mPEXTWidth, iBuilder->simd_not(del_mask)); 
123    std::vector<Value *> masks;
124    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
125        masks.push_back(iBuilder->CreateExtractElement(m, i));
126    }
127    return masks;
128}
129
130void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
131                                                Value * delMask) {
132    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask)); // delMask marks the positions we want to extract
133    std::vector<Value *> counts;
134    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/ mPEXTWidth; i++) {
135        // Store the deletion counts for each PEXT field
136        counts.push_back(iBuilder->CreateExtractElement(delCount, i)); // Extract field i from SIMD register delCount
137    }
138
139    generatePEXTAndSwizzleLoop(iBuilder, masks, counts);
140}
141
142/*
143What this function does in pseudo code:
144for (mSwizzleFactor)
145        create a swizzle set containing mSwizzleFactor blocks
146        apply PEXT to each block in the swizzle set
147        store the swizzleSet in PEXTedSwizzleSets vector
148       
149for (each swizzle row i)
150        for (each swizzle set j)
151                processes row i in swizzle set j
152                store output in pendingData[j]
153*/
154void SwizzledDeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
155                                                    std::vector<Value *> counts) {
156    // For each of the k swizzle sets required to apply PEXT to all input streams
157    std::vector<std::vector<Value *>> PEXTedSwizzleSets;
158    for (unsigned j = 0; j < mSwizzleSetCount; ++j) {
159    // Group input blocks together into input swizzle set. Input set should contain mSwizzleSetCount blocks (e.g. for U8U16 16/4=4).
160    // Each block belongs to a different input stream.
161        std::vector<Value *> input;
162        unsigned streamSelectionIndex = j * mSwizzleFactor;
163        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
164                // Check if i > mStreamCount. If it is, add null streams until we get mSwizzleSetCount streams in the input vector
165            if ( i >= mStreamCount) {
166                                input.push_back(iBuilder->allZeroes());
167            } else {
168                input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
169            }
170        }
171        // each partiallyCompressedSwizzleSet is obtained by applying PEXT to each of the blocks in input
172        PEXTedSwizzleSets.push_back(apply_PEXT_deletion_with_swizzle(iBuilder, masks, input));
173    }
174        // Compress the PEXTedSwizzleSets
175    // Output is written and committed to the output buffer one swizzle at a time.
176    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
177    Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
178   
179    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
180    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
181    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
182
183    // There may be pending data in the kernel state, for up to mDelCountFieldWidth-1 bits per stream.
184    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
185    // There is a separate vector of pending data for each swizzle group.
186    std::vector<Value *> pendingData;
187    std::vector<Value *> outputStreamPtr;
188
189    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
190        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
191        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
192    }
193
194    // For each row i
195    for (unsigned i = 0; i < mSwizzleFactor; i++) {
196        // Generate code for each of the mSwizzleFactor fields making up a block.
197        // We load the count for the field and process all swizzle groups accordingly.
198        Value * newItemCount = counts[i];
199        //iBuilder->CallPrintInt("NeW ITeM COUNT!", newItemCount); //TODO remove
200        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mDelCountFieldWidth), pendingOffset);
201        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
202       
203        // Data from the ith swizzle pack of each group is processed
204        // according to the same newItemCount, pendingSpace, ...
205        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
206            Value * newItems = PEXTedSwizzleSets[j][i];
207            //iBuilder->CallPrintRegister("NeW ITeMS!", newItems); //TODO remove
208            // Combine as many of the new items as possible into the pending group.
209            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mDelCountFieldWidth,
210                pendingOffset)));
211            //iBuilder->CallPrintRegister("ComBineDGROUP", combinedGroup);
212            // To avoid an unpredictable branch, always store the combined group, whether full or not.             
213            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
214           
215            // Any items in excess of the space available in the current pending group overflow for the next group.
216            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mDelCountFieldWidth, pendingSpace));
217            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
218            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
219        }
220        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
221        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mDelCountFieldWidth-1));
222    }
223   
224    iBuilder->setScalarField("pendingOffset", pendingOffset);
225    //iBuilder->CallPrintInt("pendingOffset", pendingOffset);
226   
227    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
228    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
229    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
230        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
231        //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
232    }
233    iBuilder->setProducedItemCount("outputSwizzle0", produced);
234}
235
236/*
237Apply PEXT deletion to the blocks in strms and swizzle the result.
238
239Q: Why is it advantageous to swizzle the PEXTed streams?
240
241A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
242
243Example:
244Input:     11101101
245PEXT mask: 11110000
246Output:    00001110
247
248PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
249cleared. This has implications if we're working with multiple streams.
250
251For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
252(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
253Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
254Stream 2:   qrstuv00 wxy00000 z1000000 23456000
255Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
256Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
257
258If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
259should be compress by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
260time:
261
262The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
263
264Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
265Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
266Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
267Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
268
269Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
270compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
271
272Args:
273    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
274    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
275        sequentially to PEXT a full block.)
276
277Returns:
278    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
279*/
280std::vector<Value *> SwizzledDeleteByPEXTkernel::apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, 
281                                                             const std::vector<Value *> & masks, std::vector<Value *> strms) {
282    Value * PEXT_func = nullptr;
283    if (mPEXTWidth == 64) {
284        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
285    } else if (mPEXTWidth == 32) {
286        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
287    }
288   
289    std::vector<Value *> output;     
290    for (unsigned i = 0; i < strms.size(); i++) {
291        Value * v = iBuilder->fwCast(mPEXTWidth, strms[i]);
292        output.push_back(Constant::getNullValue(v->getType())); 
293    }
294
295    // For each of the input streams
296    for (unsigned j = 0; j < strms.size(); j++) {
297        Value * v = iBuilder->fwCast(mPEXTWidth, strms[j]); // load stream j
298        // Process the stream's block mPEXTWidth bits at a time (a PEXT operation can't do more than 64 bits at a time)
299        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
300            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (load mPEXTWidth bits)
301            Value * PEXTed_field = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the segment we just loaded
302            /*
303             We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks.
304             E.g.:
305
306               *i*
307            *j* a b c d strms[0]
308                e f g h
309                i j k l
310                m n o p
311
312             Apply pext deletion at each position, then swizzle results:
313               *j*
314            *i* a` e` i` m` output[0]
315                b` f` j` n`
316                c` g` k` o` 
317                d` i` l` p`         
318            */   
319            output[i] = iBuilder->CreateInsertElement(output[i], PEXTed_field, j); 
320            /*
321            numCompressedBits = 0
322
323            for (each swizzleField position j)
324                for (each input swizzle i)
325                    get PEXTed_field
326                    Shift PEXTed_field left by "numCompressedBits" (in output[i])
327                    OR PEXTed_field into output[i] (output[i] is output swizzle buffer for input swizzle i)
328                numCompressedBits += popcount(mask[i])
329            */
330        }
331    }
332   
333    return output;
334}
335
336Value * SwizzledDeleteByPEXTkernel::apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
337    Value * PEXT_func = nullptr;
338    if (mPEXTWidth == 64) {
339        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
340    } else if (mPEXTWidth == 32) {
341        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
342    }
343       
344    Value * v = iBuilder->fwCast(mPEXTWidth, strm);
345    Value * output = Constant::getNullValue(v->getType());
346    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
347        Value * field = iBuilder->CreateExtractElement(v, i);
348        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
349        output = iBuilder->CreateInsertElement(output, compressed, i);
350    }
351    return output;
352}
353
354// Apply deletion to a set of stream_count input streams to produce a set of output streams.
355// Kernel inputs: stream_count data streams plus one del_mask stream
356// Outputs: the deleted streams, plus a partial sum popcount
357
358void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
359    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
360    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
361    for (unsigned j = 0; j < mStreamCount; ++j) {
362        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
363        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
364        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
365    }
366    Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
367    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
368}
369
370void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
371    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
372    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
373    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
374    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
375    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
376    for (unsigned j = 0; j < mStreamCount; ++j) {
377        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
378        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
379        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
380    }
381    Value * const delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
382    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
383}
384
385DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned fieldWidth, const unsigned streamCount)
386: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
387              {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
388               Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
389              {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
390               Binding{iBuilder->getStreamSetTy(), "deletionCounts", FixedRate(), RoundUpTo(iBuilder->getBitBlockWidth())}},
391              {}, {}, {})
392, mDeletionFieldWidth(fieldWidth)
393, mStreamCount(streamCount) {
394}
395
396const unsigned PEXT_width = 64;
397
398inline std::vector<Value *> get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
399    Value * m = iBuilder->fwCast(PEXT_width, iBuilder->simd_not(del_mask));
400    std::vector<Value *> masks;
401    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
402        masks.push_back(iBuilder->CreateExtractElement(m, i));
403    }
404    return masks;
405}
406
407// Apply PEXT deletion to a collection of blocks and swizzle the result.
408// strms contains the blocks to process
409inline std::vector<Value *> apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, std::vector<Value *> strms) {
410    Value * PEXT_func = nullptr;
411    if (PEXT_width == 64) {
412        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
413    } else if (PEXT_width == 32) {
414        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
415    }
416   
417    std::vector<Value *> output;     
418    for (unsigned i = 0; i < strms.size(); i++) {
419        Value * v = iBuilder->fwCast(PEXT_width, strms[i]);
420        output.push_back(Constant::getNullValue(v->getType())); 
421    }
422
423    // For each of the input streams
424    for (unsigned j = 0; j < strms.size(); j++) {
425        Value * v = iBuilder->fwCast(PEXT_width, strms[j]); // load stream j
426        // Process the stream's block in PEXT_width chunks (PEXT operation can't do more than 64 bits at a time)
427        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
428            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (fw of j is PEXT_width)
429            Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the block segment we just loaded
430            /*
431             We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks . E.g.:
432
433             a b c d
434             e f g h
435             i j k l
436             m n o p
437
438             Apply pext deletion at each position, then swizzle results:
439
440             a` e` i` m`
441             b` f` j` n`
442             c` g` k` o` 
443             d` i` l` p`         
444            */     
445            output[i] = iBuilder->CreateInsertElement(output[i], compressed, j); 
446        }
447    }
448   
449    return output;
450}
451
452inline Value * apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
453    Value * PEXT_func = nullptr;
454    if (PEXT_width == 64) {
455        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
456    } else if (PEXT_width == 32) {
457        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
458    }
459       
460    Value * v = iBuilder->fwCast(PEXT_width, strm);
461    Value * output = Constant::getNullValue(v->getType());
462    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
463        Value * field = iBuilder->CreateExtractElement(v, i);
464        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
465        output = iBuilder->CreateInsertElement(output, compressed, i);
466    }
467    return output;
468}
469
470// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
471// Kernel inputs: stream_count data streams plus one del_mask stream
472// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
473
474void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
475    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
476    const auto masks = get_PEXT_masks(iBuilder, delMask);
477    generateProcessingLoop(iBuilder, masks, delMask);
478}
479
480void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
481    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
482    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
483    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
484    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
485    const auto masks = get_PEXT_masks(iBuilder, delMask);
486    generateProcessingLoop(iBuilder, masks, delMask);
487}
488
489void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * delMask) {
490    if (mShouldSwizzle) {
491        generatePEXTAndSwizzleLoop(iBuilder, masks);
492    } else {
493        generatePEXTLoop(iBuilder, masks);
494    }
495    //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
496    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
497    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
498}
499
500void DeleteByPEXTkernel::generatePEXTLoop(const std::unique_ptr<KernelBuilder> &iBuilder, const std::vector<Value *> & masks) {
501    for (unsigned j = 0; j < mStreamCount; ++j) {
502        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
503        Value * output = apply_PEXT_deletion(iBuilder, masks, input);
504        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
505    }
506}
507
508void DeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks) {
509    // Group blocks together into input vector. Input should contain mStreamCount/mSwizzleFactor blocks (e.g. for U8U16 16/4=4)
510    // mStreamCount/mSwizzleFactor -> (mStreamCount + mSwizzleFactor - 1) / mSwizzleFactor
511    for (unsigned j = 0; j < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; ++j) {
512        std::vector<Value *> input;
513        unsigned streamSelectionIndex = j * mSwizzleFactor;
514        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
515                // Check if i > mStreamCount. If it is, add null streams until we get mStreamCount/mSwizzleFactor streams in the input vector
516            if ( i >= mStreamCount) {
517                                input.push_back(iBuilder->allZeroes());
518            } else {
519                input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
520            }
521        }
522        std::vector<Value *> output = apply_PEXT_deletion_with_swizzle(iBuilder, masks, input);
523        for (unsigned i = 0; i < mSwizzleFactor; i++) { 
524             iBuilder->storeOutputStreamBlock(std::string(mOutputSwizzleNameBase) + std::to_string(j), iBuilder->getInt32(i), output[i]);
525        }
526    }
527}
528
529DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, bool shouldSwizzle)
530: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + (shouldSwizzle ? "swiz" : "noswiz"),
531                  {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
532                      Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
533                  {}, {}, {}, {})
534, mDelCountFieldWidth(fw)
535, mStreamCount(streamCount)
536, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
537, mShouldSwizzle(shouldSwizzle)
538{
539    if(mShouldSwizzle) {       
540        for (unsigned i = 0; i < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; i++) { 
541            mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mSwizzleFactor), std::string(mOutputSwizzleNameBase) + std::to_string(i));
542        }
543    } else {
544        // No swizzling. Output results as single stream set
545        mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mStreamCount), "outputStreamSet");
546    }
547    mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(), "deletionCounts");
548}
549
550   
551//
552// This kernel performs final stream compression for a set of N bitstreams, given
553// (a) a set of bitstreams partially compressed within K-bit fields and stored
554//     in K-bit swizzled form, and
555// (b) a stream of deletion/extraction counts per K-bit stride.
556//
557// Restrictions:  At present, only K=64 is supported.
558//                At present, N must be an exact multiple of BLOCK_SIZE/K.
559//
560// The kernel always consumes full blocks of input and emits data into the output
561// buffer in swizzles of K items at a time.   Upon completion of a segment,
562// up to K-1 pending output items per stream may be stored in the kernel state.
563//
564// Note: that both input streams and output streams are stored in swizzled form.
565//
566
567SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned bitStreamCount, unsigned fieldWidth)
568: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
569                     {Binding{iBuilder->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
570, mBitStreamCount(bitStreamCount)
571    , mFieldWidth(fieldWidth)
572    , mSwizzleFactor(iBuilder->getBitBlockWidth() / fieldWidth)
573    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
574        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
575        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
576        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
577        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
578        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
579        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
580            mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
581            mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
582            addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
583        }
584        addScalar(iBuilder->getSizeTy(), "pendingOffset");
585}
586   
587void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
588       
589    Value * countsPerStridePtr = iBuilder->getInputStreamBlockPtr("countsPerStride", iBuilder->getInt32(0));
590    Value * countStreamPtr = iBuilder->CreatePointerCast(countsPerStridePtr, iBuilder->getIntNTy(mFieldWidth)->getPointerTo());
591   
592    // Output is written and committed to the output buffer one swizzle at a time.
593    //
594    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
595    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
596   
597    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
598    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
599    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
600
601    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
602    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
603    // There is a separate vector of pending data for each swizzle group.
604    std::vector<Value *> pendingData;
605    std::vector<Value *> outputStreamPtr;
606    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
607        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
608        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
609    }
610   
611    // Generate code for each of the mSwizzleFactor fields making up a block.
612    // We load the count for the field and process all swizzle groups accordingly.
613    for (unsigned i = 0; i < mSwizzleFactor; i++) {
614        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
615    //iBuilder->CallPrintInt("newItemCount", newItemCount);
616        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
617        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
618       
619        // Data from the ith swizzle pack of each group is processed
620        // according to the same newItemCount, pendingSpace, ...
621        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
622            Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
623        //iBuilder->CallPrintRegister("newItems", newItems);
624            // Combine as many of the new items as possible into the pending group.
625            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
626            //iBuilder->CallPrintRegister("combinedGroup", combinedGroup);
627            // To avoid an unpredictable branch, always store the combined group, whether full or not.
628               
629            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
630            // Any items in excess of the space available in the current pending group overflow for the next group.
631            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mFieldWidth, pendingSpace));
632            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
633            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
634        }
635        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
636        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
637    }
638    iBuilder->setScalarField("pendingOffset", pendingOffset);   
639    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
640    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
641    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
642        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
643        //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
644
645    }
646    iBuilder->setProducedItemCount("outputSwizzle0", produced);
647}
648
649void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * /* remainingBytes */) {
650    CreateDoBlockMethodCall(iBuilder);
651    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
652    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
653   
654    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
655    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
656    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
657    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
658
659    // Write the pending data.
660    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
661        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
662        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
663        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
664    }
665    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
666}
667}
Note: See TracBrowser for help on using the repository browser.