source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 5650

Last change on this file since 5650 was 5540, checked in by cameron, 2 years ago

Integrated AVX deletion kernel

File size: 36.4 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask) {
15    Value * m = iBuilder->simd_not(del_mask);
16    Value * mk = iBuilder->simd_slli(fw, del_mask, 1);
17    std::vector<Value *> move_masks;
18    for (unsigned shift = 1; shift < fw; shift *= 2) {
19        Value * mp = mk;
20        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
21            mp = iBuilder->simd_xor(mp, iBuilder->simd_slli(fw, mp, lookright));
22        }
23        Value * mv = iBuilder->simd_and(mp, m);
24        m = iBuilder->simd_or(iBuilder->simd_xor(m, mv), iBuilder->simd_srli(fw, mv, shift));
25        mk = iBuilder->simd_and(mk, iBuilder->simd_not(mp));
26        move_masks.push_back(mv);
27    }
28    return move_masks;
29}
30
31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
32    Value * s = iBuilder->simd_and(strm, iBuilder->simd_not(del_mask));
33    for (unsigned i = 0; i < mv.size(); i++) {
34        unsigned shift = 1 << i;
35        Value * t = iBuilder->simd_and(s, mv[i]);
36        s = iBuilder->simd_or(iBuilder->simd_xor(s, t), iBuilder->simd_srli(fw, t, shift));
37    }
38    return s;
39}
40
41inline Value * partial_sum_popcount(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * mask) {
42    Value * field = iBuilder->simd_popcount(fw, mask);
43    const auto count = iBuilder->getBitBlockWidth() / fw;
44    for (unsigned move = 1; move < count; move *= 2) {
45        field = iBuilder->simd_add(fw, field, iBuilder->mvmd_slli(fw, field, move));
46    }
47    return field;
48}
49
50SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, unsigned PEXT_width)
51: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount),
52                  {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"}, Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
53                  {}, {}, {}, {})
54, mDelCountFieldWidth(fw)
55, mStreamCount(streamCount)
56, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
57// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
58// to prevent rounding errors.
59, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
60, mPEXTWidth(PEXT_width)
61{
62    assert((mDelCountFieldWidth > 0) && ((mDelCountFieldWidth & (mDelCountFieldWidth - 1)) == 0)
63        && "mDelCountFieldWidth must be a power of 2");
64    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
65    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
66   
67    mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", MaxRatio(1)});
68    addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
69    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
70        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1),
71            "outputSwizzle" + std::to_string(i), FixedRatio(1, 1, "outputSwizzle0")});
72        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
73    }
74    addScalar(iBuilder->getSizeTy(), "pendingOffset");
75}
76
77void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
78    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
79    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
80    const auto masks = get_PEXT_masks(iBuilder, delMask);
81    generateProcessingLoop(iBuilder, masks, delMask);
82}
83
84void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
85    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
86    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
87    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
88    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
89    const auto masks = get_PEXT_masks(iBuilder, delMask);
90    generateProcessingLoop(iBuilder, masks, delMask);
91    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
92    Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
93   
94    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
95    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
96    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
97    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
98
99    // Write the pending data.
100    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
101        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
102        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
103        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
104    }
105    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
106}
107
108std::vector<Value *> SwizzledDeleteByPEXTkernel::get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
109    // Del mask marks locations of bits we want to delete with 1 bits. Delete marked bits by extracting only the bits not marked in this way.
110    // Apply the PEXT operation mPEXTWidth bits at a time (e.g. if block is 256 bits and mPEXTWidth is 64, apply 4 PEXT ops to full process block.
111    Value * m = iBuilder->fwCast(mPEXTWidth, iBuilder->simd_not(del_mask)); 
112    std::vector<Value *> masks;
113    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
114        masks.push_back(iBuilder->CreateExtractElement(m, i));
115    }
116    return masks;
117}
118
119void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
120                                                Value * delMask) {
121    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask)); // delMask marks the positions we want to extract
122    std::vector<Value *> counts;
123    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/ mPEXTWidth; i++) {
124        // Store the deletion counts for each PEXT field
125        counts.push_back(iBuilder->CreateExtractElement(delCount, i)); // Extract field i from SIMD register delCount
126    }
127
128    generatePEXTAndSwizzleLoop(iBuilder, masks, counts);
129}
130
131/*
132What this function does in pseudo code:
133for (mSwizzleFactor)
134        create a swizzle set containing mSwizzleFactor blocks
135        apply PEXT to each block in the swizzle set
136        store the swizzleSet in PEXTedSwizzleSets vector
137       
138for (each swizzle row i)
139        for (each swizzle set j)
140                processes row i in swizzle set j
141                store output in pendingData[j]
142*/
143void SwizzledDeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
144                                                    std::vector<Value *> counts) {
145    // For each of the k swizzle sets required to apply PEXT to all input streams
146    std::vector<std::vector<Value *>> PEXTedSwizzleSets;
147    for (unsigned j = 0; j < mSwizzleSetCount; ++j) {
148    // Group input blocks together into input swizzle set. Input set should contain mSwizzleSetCount blocks (e.g. for U8U16 16/4=4).
149    // Each block belongs to a different input stream.
150        std::vector<Value *> input;
151        unsigned streamSelectionIndex = j * mSwizzleFactor;
152        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
153                // Check if i > mStreamCount. If it is, add null streams until we get mSwizzleSetCount streams in the input vector
154            if ( i >= mStreamCount) {
155                                input.push_back(iBuilder->allZeroes());
156            } else {
157                input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
158            }
159        }
160        // each partiallyCompressedSwizzleSet is obtained by applying PEXT to each of the blocks in input
161        PEXTedSwizzleSets.push_back(apply_PEXT_deletion_with_swizzle(iBuilder, masks, input));
162    }
163        // Compress the PEXTedSwizzleSets
164    // Output is written and committed to the output buffer one swizzle at a time.
165    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
166    Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
167   
168    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
169    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
170    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
171
172    // There may be pending data in the kernel state, for up to mDelCountFieldWidth-1 bits per stream.
173    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
174    // There is a separate vector of pending data for each swizzle group.
175    std::vector<Value *> pendingData;
176    std::vector<Value *> outputStreamPtr;
177
178    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
179        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
180        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
181    }
182
183    // For each row i
184    for (unsigned i = 0; i < mSwizzleFactor; i++) {
185        // Generate code for each of the mSwizzleFactor fields making up a block.
186        // We load the count for the field and process all swizzle groups accordingly.
187        Value * newItemCount = counts[i];
188        //iBuilder->CallPrintInt("NeW ITeM COUNT!", newItemCount); //TODO remove
189        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mDelCountFieldWidth), pendingOffset);
190        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
191       
192        // Data from the ith swizzle pack of each group is processed
193        // according to the same newItemCount, pendingSpace, ...
194        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
195            Value * newItems = PEXTedSwizzleSets[j][i];
196            //iBuilder->CallPrintRegister("NeW ITeMS!", newItems); //TODO remove
197            // Combine as many of the new items as possible into the pending group.
198            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mDelCountFieldWidth,
199                pendingOffset)));
200            //iBuilder->CallPrintRegister("ComBineDGROUP", combinedGroup);
201            // To avoid an unpredictable branch, always store the combined group, whether full or not.             
202            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
203           
204            // Any items in excess of the space available in the current pending group overflow for the next group.
205            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mDelCountFieldWidth, pendingSpace));
206            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
207            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
208        }
209        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
210        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mDelCountFieldWidth-1));
211    }
212   
213    iBuilder->setScalarField("pendingOffset", pendingOffset);
214    //iBuilder->CallPrintInt("pendingOffset", pendingOffset);
215   
216    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
217    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
218    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
219        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
220        //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
221    }
222    iBuilder->setProducedItemCount("outputSwizzle0", produced);
223}
224
225/*
226Apply PEXT deletion to the blocks in strms and swizzle the result.
227
228Q: Why is it advantageous to swizzle the PEXTed streams?
229
230A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
231
232Example:
233Input:     11101101
234PEXT mask: 11110000
235Output:    00001110
236
237PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
238cleared. This has implications if we're working with multiple streams.
239
240For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
241(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
242Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
243Stream 2:   qrstuv00 wxy00000 z1000000 23456000
244Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
245Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
246
247If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
248should be compress by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
249time:
250
251The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
252
253Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
254Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
255Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
256Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
257
258Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
259compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
260
261Args:
262    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
263    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
264        sequentially to PEXT a full block.)
265
266Returns:
267    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
268*/
269std::vector<Value *> SwizzledDeleteByPEXTkernel::apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, 
270                                                             const std::vector<Value *> & masks, std::vector<Value *> strms) {
271    Value * PEXT_func = nullptr;
272    if (mPEXTWidth == 64) {
273        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
274    } else if (mPEXTWidth == 32) {
275        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
276    }
277   
278    std::vector<Value *> output;     
279    for (unsigned i = 0; i < strms.size(); i++) {
280        Value * v = iBuilder->fwCast(mPEXTWidth, strms[i]);
281        output.push_back(Constant::getNullValue(v->getType())); 
282    }
283
284    // For each of the input streams
285    for (unsigned j = 0; j < strms.size(); j++) {
286        Value * v = iBuilder->fwCast(mPEXTWidth, strms[j]); // load stream j
287        // Process the stream's block mPEXTWidth bits at a time (a PEXT operation can't do more than 64 bits at a time)
288        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
289            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (load mPEXTWidth bits)
290            Value * PEXTed_field = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the segment we just loaded
291            /*
292             We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks.
293             E.g.:
294
295               *i*
296            *j* a b c d strms[0]
297                e f g h
298                i j k l
299                m n o p
300
301             Apply pext deletion at each position, then swizzle results:
302               *j*
303            *i* a` e` i` m` output[0]
304                b` f` j` n`
305                c` g` k` o` 
306                d` i` l` p`         
307            */   
308            output[i] = iBuilder->CreateInsertElement(output[i], PEXTed_field, j); 
309            /*
310            numCompressedBits = 0
311
312            for (each swizzleField position j)
313                for (each input swizzle i)
314                    get PEXTed_field
315                    Shift PEXTed_field left by "numCompressedBits" (in output[i])
316                    OR PEXTed_field into output[i] (output[i] is output swizzle buffer for input swizzle i)
317                numCompressedBits += popcount(mask[i])
318            */
319        }
320    }
321   
322    return output;
323}
324
325Value * SwizzledDeleteByPEXTkernel::apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
326    Value * PEXT_func = nullptr;
327    if (mPEXTWidth == 64) {
328        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
329    } else if (mPEXTWidth == 32) {
330        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
331    }
332       
333    Value * v = iBuilder->fwCast(mPEXTWidth, strm);
334    Value * output = Constant::getNullValue(v->getType());
335    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
336        Value * field = iBuilder->CreateExtractElement(v, i);
337        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
338        output = iBuilder->CreateInsertElement(output, compressed, i);
339    }
340    return output;
341}
342
343// Apply deletion to a set of stream_count input streams to produce a set of output streams.
344// Kernel inputs: stream_count data streams plus one del_mask stream
345// Outputs: the deleted streams, plus a partial sum popcount
346
347void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
348    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
349    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
350    for (unsigned j = 0; j < mStreamCount; ++j) {
351        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
352        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
353        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
354    }
355    Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
356    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
357}
358
359void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
360    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
361    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
362    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
363    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
364    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
365    for (unsigned j = 0; j < mStreamCount; ++j) {
366        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
367        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
368        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
369    }
370    Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
371    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
372}
373
374DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount)
375: BlockOrientedKernel("del" + std::to_string(fw) + "_" + std::to_string(streamCount),
376              {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
377               Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
378              {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
379               Binding{iBuilder->getStreamSetTy(), "deletionCounts"}},
380              {}, {}, {})
381, mDeletionFieldWidth(fw)
382, mStreamCount(streamCount) {
383}
384
385const unsigned PEXT_width = 64;
386
387inline std::vector<Value *> get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
388    Value * m = iBuilder->fwCast(PEXT_width, iBuilder->simd_not(del_mask));
389    std::vector<Value *> masks;
390    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
391        masks.push_back(iBuilder->CreateExtractElement(m, i));
392    }
393    return masks;
394}
395
396// Apply PEXT deletion to a collection of blocks and swizzle the result.
397// strms contains the blocks to process
398inline std::vector<Value *> apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, std::vector<Value *> strms) {
399    Value * PEXT_func = nullptr;
400    if (PEXT_width == 64) {
401        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
402    } else if (PEXT_width == 32) {
403        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
404    }
405   
406    std::vector<Value *> output;     
407    for (unsigned i = 0; i < strms.size(); i++) {
408        Value * v = iBuilder->fwCast(PEXT_width, strms[i]);
409        output.push_back(Constant::getNullValue(v->getType())); 
410    }
411
412    // For each of the input streams
413    for (unsigned j = 0; j < strms.size(); j++) {
414        Value * v = iBuilder->fwCast(PEXT_width, strms[j]); // load stream j
415        // Process the stream's block in PEXT_width chunks (PEXT operation can't do more than 64 bits at a time)
416        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
417            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (fw of j is PEXT_width)
418            Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the block segment we just loaded
419            /*
420             We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks . E.g.:
421
422             a b c d
423             e f g h
424             i j k l
425             m n o p
426
427             Apply pext deletion at each position, then swizzle results:
428
429             a` e` i` m`
430             b` f` j` n`
431             c` g` k` o` 
432             d` i` l` p`         
433            */     
434            output[i] = iBuilder->CreateInsertElement(output[i], compressed, j); 
435        }
436    }
437   
438    return output;
439}
440
441inline Value * apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
442    Value * PEXT_func = nullptr;
443    if (PEXT_width == 64) {
444        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
445    } else if (PEXT_width == 32) {
446        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
447    }
448       
449    Value * v = iBuilder->fwCast(PEXT_width, strm);
450    Value * output = Constant::getNullValue(v->getType());
451    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
452        Value * field = iBuilder->CreateExtractElement(v, i);
453        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
454        output = iBuilder->CreateInsertElement(output, compressed, i);
455    }
456    return output;
457}
458
459// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
460// Kernel inputs: stream_count data streams plus one del_mask stream
461// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
462
463void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
464    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
465    const auto masks = get_PEXT_masks(iBuilder, delMask);
466    generateProcessingLoop(iBuilder, masks, delMask);
467}
468
469void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
470    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
471    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
472    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
473    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
474    const auto masks = get_PEXT_masks(iBuilder, delMask);
475    generateProcessingLoop(iBuilder, masks, delMask);
476}
477
478void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * delMask) {
479    if (mShouldSwizzle) {
480        generatePEXTAndSwizzleLoop(iBuilder, masks);
481    } else {
482        generatePEXTLoop(iBuilder, masks);
483    }
484    //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
485    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
486    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
487}
488
489void DeleteByPEXTkernel::generatePEXTLoop(const std::unique_ptr<KernelBuilder> &iBuilder, const std::vector<Value *> & masks) {
490    for (unsigned j = 0; j < mStreamCount; ++j) {
491        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
492        Value * output = apply_PEXT_deletion(iBuilder, masks, input);
493        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
494    }
495}
496
497void DeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks) {
498    // Group blocks together into input vector. Input should contain mStreamCount/mSwizzleFactor blocks (e.g. for U8U16 16/4=4)
499    // mStreamCount/mSwizzleFactor -> (mStreamCount + mSwizzleFactor - 1) / mSwizzleFactor
500    for (unsigned j = 0; j < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; ++j) {
501        std::vector<Value *> input;
502        unsigned streamSelectionIndex = j * mSwizzleFactor;
503        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
504                // Check if i > mStreamCount. If it is, add null streams until we get mStreamCount/mSwizzleFactor streams in the input vector
505            if ( i >= mStreamCount) {
506                                input.push_back(iBuilder->allZeroes());
507            } else {
508                input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
509            }
510        }
511        std::vector<Value *> output = apply_PEXT_deletion_with_swizzle(iBuilder, masks, input);
512        for (unsigned i = 0; i < mSwizzleFactor; i++) { 
513             iBuilder->storeOutputStreamBlock(std::string(mOutputSwizzleNameBase) + std::to_string(j), iBuilder->getInt32(i), output[i]);
514        }
515    }
516}
517
518DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, bool shouldSwizzle)
519: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + (shouldSwizzle ? "swiz" : "noswiz"),
520                  {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
521                      Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
522                  {}, {}, {}, {})
523, mDelCountFieldWidth(fw)
524, mStreamCount(streamCount)
525, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
526, mShouldSwizzle(shouldSwizzle)
527{
528    if(mShouldSwizzle) {       
529        for (unsigned i = 0; i < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; i++) { 
530            mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mSwizzleFactor), std::string(mOutputSwizzleNameBase) + std::to_string(i));
531        }
532    } else {
533        // No swizzling. Output results as single stream set
534        mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mStreamCount), "outputStreamSet");
535    }
536    mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(), "deletionCounts");
537}
538
539   
540//
541// This kernel performs final stream compression for a set of N bitstreams, given
542// (a) a set of bitstreams partially compressed within K-bit fields and stored
543//     in K-bit swizzled form, and
544// (b) a stream of deletion/extraction counts per K-bit stride.
545//
546// Restrictions:  At present, only K=64 is supported.
547//                At present, N must be an exact multiple of BLOCK_SIZE/K.
548//
549// The kernel always consumes full blocks of input and emits data into the output
550// buffer in swizzles of K items at a time.   Upon completion of a segment,
551// up to K-1 pending output items per stream may be stored in the kernel state.
552//
553// Note: that both input streams and output streams are stored in swizzled form.
554//
555
556SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned bitStreamCount, unsigned fieldWidth)
557: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
558                     {Binding{iBuilder->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
559, mBitStreamCount(bitStreamCount)
560    , mFieldWidth(fieldWidth)
561    , mSwizzleFactor(iBuilder->getBitBlockWidth() / fieldWidth)
562    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
563        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
564        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
565        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
566        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", MaxRatio(1)});
567        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
568        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
569            mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
570            mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), FixedRatio(1, 1, "outputSwizzle0")});
571            addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
572        }
573        addScalar(iBuilder->getSizeTy(), "pendingOffset");
574}
575   
576void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
577       
578    Value * countsPerStridePtr = iBuilder->getInputStreamBlockPtr("countsPerStride", iBuilder->getInt32(0));
579    Value * countStreamPtr = iBuilder->CreatePointerCast(countsPerStridePtr, iBuilder->getIntNTy(mFieldWidth)->getPointerTo());
580   
581    // Output is written and committed to the output buffer one swizzle at a time.
582    //
583    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
584    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
585   
586    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
587    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
588    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
589
590    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
591    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
592    // There is a separate vector of pending data for each swizzle group.
593    std::vector<Value *> pendingData;
594    std::vector<Value *> outputStreamPtr;
595    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
596        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
597        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
598    }
599   
600    // Generate code for each of the mSwizzleFactor fields making up a block.
601    // We load the count for the field and process all swizzle groups accordingly.
602    for (unsigned i = 0; i < mSwizzleFactor; i++) {
603        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
604        iBuilder->CallPrintInt("newItemCount", newItemCount);
605        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
606        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
607       
608        // Data from the ith swizzle pack of each group is processed
609        // according to the same newItemCount, pendingSpace, ...
610        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
611            Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
612            iBuilder->CallPrintRegister("newItems", newItems);
613            // Combine as many of the new items as possible into the pending group.
614            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
615            //iBuilder->CallPrintRegister("combinedGroup", combinedGroup);
616            // To avoid an unpredictable branch, always store the combined group, whether full or not.
617               
618            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
619            // Any items in excess of the space available in the current pending group overflow for the next group.
620            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mFieldWidth, pendingSpace));
621            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
622            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
623        }
624        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
625        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
626    }
627    iBuilder->setScalarField("pendingOffset", pendingOffset);
628    iBuilder->CallPrintInt("pendingOffset", pendingOffset);
629
630   
631    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
632    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
633    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
634        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
635        //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
636
637    }
638    iBuilder->setProducedItemCount("outputSwizzle0", produced);
639}
640
641void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * /* remainingBytes */) {
642    CreateDoBlockMethodCall(iBuilder);
643    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
644    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
645   
646    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
647    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
648    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
649    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
650
651    // Write the pending data.
652    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
653        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
654        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
655        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
656    }
657    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
658}
659}
Note: See TracBrowser for help on using the repository browser.