source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 5941

Last change on this file since 5941 was 5926, checked in by xwa163, 14 months ago

Fix lz4 related GEP instructions and TODO

File size: 37.0 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask) {
15    Value * m = iBuilder->simd_not(del_mask);
16    Value * mk = iBuilder->simd_slli(fw, del_mask, 1);
17    std::vector<Value *> move_masks;
18    for (unsigned shift = 1; shift < fw; shift *= 2) {
19        Value * mp = mk;
20        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
21            mp = iBuilder->simd_xor(mp, iBuilder->simd_slli(fw, mp, lookright));
22        }
23        Value * mv = iBuilder->simd_and(mp, m);
24        m = iBuilder->simd_or(iBuilder->simd_xor(m, mv), iBuilder->simd_srli(fw, mv, shift));
25        mk = iBuilder->simd_and(mk, iBuilder->simd_not(mp));
26        move_masks.push_back(mv);
27    }
28    return move_masks;
29}
30
31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
32    Value * s = iBuilder->simd_and(strm, iBuilder->simd_not(del_mask));
33    for (unsigned i = 0; i < mv.size(); i++) {
34        unsigned shift = 1 << i;
35        Value * t = iBuilder->simd_and(s, mv[i]);
36        s = iBuilder->simd_or(iBuilder->simd_xor(s, t), iBuilder->simd_srli(fw, t, shift));
37    }
38    return s;
39}
40
41inline Value * partial_sum_popcount(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * mask) {
42    Value * field = iBuilder->simd_popcount(fw, mask);
43    const auto count = iBuilder->getBitBlockWidth() / fw;
44    for (unsigned move = 1; move < count; move *= 2) {
45        field = iBuilder->simd_add(fw, field, iBuilder->mvmd_slli(fw, field, move));
46    }
47    return field;
48}
49
50SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, unsigned PEXT_width)
51: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount),
52                  {Binding{iBuilder->getStreamSetTy(), "delMaskSet"}, Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"}},
53                  {}, {}, {}, {})
54, mDelCountFieldWidth(fw)
55, mStreamCount(streamCount)
56, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
57// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
58// to prevent rounding errors.
59, mSwizzleSetCount((mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor)
60, mPEXTWidth(PEXT_width)
61{
62    assert((mDelCountFieldWidth > 0) && ((mDelCountFieldWidth & (mDelCountFieldWidth - 1)) == 0)
63        && "mDelCountFieldWidth must be a power of 2");
64    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
65    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
66
67    // why, if we have 1 input stream, are there n output swizzle streams rather 1 of n?
68    mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
69    addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
70    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
71        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1),
72            "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
73        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
74    }
75    addScalar(iBuilder->getSizeTy(), "pendingOffset");
76}
77
78void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
79    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
80    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
81    const auto masks = get_PEXT_masks(iBuilder, delMask);
82    generateProcessingLoop(iBuilder, masks, delMask);
83}
84
85void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
86    const auto originalProducedItemCount = iBuilder->getProducedItemCount("outputSwizzle0");
87    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
88    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
89    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
90    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
91    const auto masks = get_PEXT_masks(iBuilder, delMask);
92    generateProcessingLoop(iBuilder, masks, delMask);
93
94    const auto newProducedItemCount = iBuilder->getProducedItemCount("outputSwizzle0");
95    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
96    Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
97   
98    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
99    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
100    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
101
102    const auto deltaOutputIndex = iBuilder->CreateSub(
103            iBuilder->CreateUDiv(newProducedItemCount, iBuilder->getSize(iBuilder->getBitBlockWidth())),
104            iBuilder->CreateUDiv(originalProducedItemCount, iBuilder->getSize(iBuilder->getBitBlockWidth()))
105    );
106    outputIndex = iBuilder->CreateAdd(outputIndex, iBuilder->CreateMul(deltaOutputIndex, iBuilder->getSize(iBuilder->getBitBlockWidth() / mDelCountFieldWidth)));
107
108    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
109
110    // Write the pending data.
111    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
112        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
113        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0), outputIndex);
114        iBuilder->CreateBlockAlignedStore(pendingData, outputStreamPtr);
115    }
116    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
117}
118
119std::vector<Value *> SwizzledDeleteByPEXTkernel::get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
120    // Del mask marks locations of bits we want to delete with 1 bits. Delete marked bits by extracting only the bits not marked in this way.
121    // Apply the PEXT operation mPEXTWidth bits at a time (e.g. if block is 256 bits and mPEXTWidth is 64, apply 4 PEXT ops to full process block.
122    Value * m = iBuilder->fwCast(mPEXTWidth, iBuilder->simd_not(del_mask)); 
123    std::vector<Value *> masks;
124    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
125        masks.push_back(iBuilder->CreateExtractElement(m, i));
126    }
127    return masks;
128}
129
130void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
131                                                Value * delMask) {
132    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask)); // delMask marks the positions we want to extract
133    std::vector<Value *> counts;
134    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/ mPEXTWidth; i++) {
135        // Store the deletion counts for each PEXT field
136        counts.push_back(iBuilder->CreateExtractElement(delCount, i)); // Extract field i from SIMD register delCount
137    }
138
139    generatePEXTAndSwizzleLoop(iBuilder, masks, counts);
140}
141
142/*
143What this function does in pseudo code:
144for (mSwizzleFactor)
145        create a swizzle set containing mSwizzleFactor blocks
146        apply PEXT to each block in the swizzle set
147        store the swizzleSet in PEXTedSwizzleSets vector
148       
149for (each swizzle row i)
150        for (each swizzle set j)
151                processes row i in swizzle set j
152                store output in pendingData[j]
153*/
154void SwizzledDeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
155                                                    std::vector<Value *> counts) {
156    // For each of the k swizzle sets required to apply PEXT to all input streams
157    std::vector<std::vector<Value *>> PEXTedSwizzleSets;
158    for (unsigned j = 0; j < mSwizzleSetCount; ++j) {
159    // Group input blocks together into input swizzle set. Input set should contain mSwizzleSetCount blocks (e.g. for U8U16 16/4=4).
160    // Each block belongs to a different input stream.
161        std::vector<Value *> input;
162        unsigned streamSelectionIndex = j * mSwizzleFactor;
163        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
164                // Check if i > mStreamCount. If it is, add null streams until we get mSwizzleSetCount streams in the input vector
165            if ( i >= mStreamCount) {
166                                input.push_back(iBuilder->allZeroes());
167            } else {
168                input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
169            }
170        }
171        // each partiallyCompressedSwizzleSet is obtained by applying PEXT to each of the blocks in input
172        PEXTedSwizzleSets.push_back(apply_PEXT_deletion_with_swizzle(iBuilder, masks, input));
173    }
174        // Compress the PEXTedSwizzleSets
175    // Output is written and committed to the output buffer one swizzle at a time.
176    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
177    Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
178   
179    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
180    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
181    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
182
183    // There may be pending data in the kernel state, for up to mDelCountFieldWidth-1 bits per stream.
184    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
185    // There is a separate vector of pending data for each swizzle group.
186    std::vector<Value *> pendingData;
187
188    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
189        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
190    }
191
192    // For each row i
193    for (unsigned i = 0; i < mSwizzleFactor; i++) {
194        // Generate code for each of the mSwizzleFactor fields making up a block.
195        // We load the count for the field and process all swizzle groups accordingly.
196        Value * newItemCount = counts[i];
197        //iBuilder->CallPrintInt("NeW ITeM COUNT!", newItemCount); //TODO remove
198        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mDelCountFieldWidth), pendingOffset);
199        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
200       
201        // Data from the ith swizzle pack of each group is processed
202        // according to the same newItemCount, pendingSpace, ...
203        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
204            Value * newItems = PEXTedSwizzleSets[j][i];
205            //iBuilder->CallPrintRegister("NeW ITeMS!", newItems); //TODO remove
206            // Combine as many of the new items as possible into the pending group.
207            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mDelCountFieldWidth,
208                pendingOffset)));
209            //iBuilder->CallPrintRegister("ComBineDGROUP", combinedGroup);
210            // To avoid an unpredictable branch, always store the combined group, whether full or not.
211            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), outputIndex));
212           
213            // Any items in excess of the space available in the current pending group overflow for the next group.
214            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mDelCountFieldWidth, pendingSpace));
215            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
216            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
217        }
218        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
219        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mDelCountFieldWidth-1));
220    }
221   
222    iBuilder->setScalarField("pendingOffset", pendingOffset);
223    //iBuilder->CallPrintInt("pendingOffset", pendingOffset);
224   
225    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
226    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
227    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
228        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
229        //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
230    }
231    iBuilder->setProducedItemCount("outputSwizzle0", produced);
232}
233
234/*
235Apply PEXT deletion to the blocks in strms and swizzle the result.
236
237Q: Why is it advantageous to swizzle the PEXTed streams?
238
239A: PEXT doesn't compress streams, if the input to a PEXT operation is 64 bits wide, the output is also 64 bits wide.
240
241Example:
242Input:     11101101
243PEXT mask: 11110000
244Output:    00001110
245
246PEXT selects the bits we tell it to and stores them at contiguous lower-order bits. Higher-order bits are
247cleared. This has implications if we're working with multiple streams.
248
249For example, say we've applied PEXT on the following 4 streams using this deletion mask (inverse of PEXT mask): 00000011 00011111 00111111 00000111
250(I think this diagram is backwards, PEXTed bits should be stored in lower-order bits, not higher.)
251Stream 1:   abcdef00 ghi00000 jk000000 lmnop000
252Stream 2:   qrstuv00 wxy00000 z1000000 23456000
253Stream 3:   ABCDEF00 GHI00000 JK000000 LMNOP000
254Stream 4:   QRSTUV00 WZY00000 Z1000000 23456000
255
256If we wanted to compress each stream to remove the sequences of 0s, it's tricky. The first 32 bits of each stream
257should be compress by 2 bits, the second 32 bits by 5, etc. If we swizzle the streams with a swizzle factor of 4 we have a much easier
258time:
259
260The swizzled output using a field width of 8 produces the following swizzles (swizzle factor = block width / pext field width = 4).
261
262Swizzle 1:  abcdef00 qrstuv00 ABCDEF00 QRSTUV00
263Swizzle 2:  ghi00000 wxy00000 GHI00000 WZY00000
264Swizzle 3:  jk000000 z1000000 JK000000 Z1000000
265Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
266
267Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
268compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
269
270Args:
271    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
272    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
273        sequentially to PEXT a full block.)
274
275Returns:
276    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
277*/
278std::vector<Value *> SwizzledDeleteByPEXTkernel::apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, 
279                                                             const std::vector<Value *> & masks, std::vector<Value *> strms) {
280    Value * PEXT_func = nullptr;
281    if (mPEXTWidth == 64) {
282        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
283    } else if (mPEXTWidth == 32) {
284        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
285    }
286   
287    std::vector<Value *> output;     
288    for (unsigned i = 0; i < strms.size(); i++) {
289        Value * v = iBuilder->fwCast(mPEXTWidth, strms[i]);
290        output.push_back(Constant::getNullValue(v->getType())); 
291    }
292
293    // For each of the input streams
294    for (unsigned j = 0; j < strms.size(); j++) {
295        Value * v = iBuilder->fwCast(mPEXTWidth, strms[j]); // load stream j
296        // Process the stream's block mPEXTWidth bits at a time (a PEXT operation can't do more than 64 bits at a time)
297        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
298            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (load mPEXTWidth bits)
299            Value * PEXTed_field = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the segment we just loaded
300            /*
301             We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks.
302             E.g.:
303
304               *i*
305            *j* a b c d strms[0]
306                e f g h
307                i j k l
308                m n o p
309
310             Apply pext deletion at each position, then swizzle results:
311               *j*
312            *i* a` e` i` m` output[0]
313                b` f` j` n`
314                c` g` k` o` 
315                d` i` l` p`         
316            */   
317            output[i] = iBuilder->CreateInsertElement(output[i], PEXTed_field, j); 
318            /*
319            numCompressedBits = 0
320
321            for (each swizzleField position j)
322                for (each input swizzle i)
323                    get PEXTed_field
324                    Shift PEXTed_field left by "numCompressedBits" (in output[i])
325                    OR PEXTed_field into output[i] (output[i] is output swizzle buffer for input swizzle i)
326                numCompressedBits += popcount(mask[i])
327            */
328        }
329    }
330   
331    return output;
332}
333
334Value * SwizzledDeleteByPEXTkernel::apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
335    Value * PEXT_func = nullptr;
336    if (mPEXTWidth == 64) {
337        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
338    } else if (mPEXTWidth == 32) {
339        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
340    }
341       
342    Value * v = iBuilder->fwCast(mPEXTWidth, strm);
343    Value * output = Constant::getNullValue(v->getType());
344    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
345        Value * field = iBuilder->CreateExtractElement(v, i);
346        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
347        output = iBuilder->CreateInsertElement(output, compressed, i);
348    }
349    return output;
350}
351
352// Apply deletion to a set of stream_count input streams to produce a set of output streams.
353// Kernel inputs: stream_count data streams plus one del_mask stream
354// Outputs: the deleted streams, plus a partial sum popcount
355
356void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
357    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
358    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
359    for (unsigned j = 0; j < mStreamCount; ++j) {
360        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
361        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
362        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
363    }
364    Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
365    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
366}
367
368void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
369    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
370    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
371    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
372    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
373    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
374    for (unsigned j = 0; j < mStreamCount; ++j) {
375        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
376        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
377        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
378    }
379    Value * const delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
380    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
381}
382
383DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned fieldWidth, const unsigned streamCount)
384: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
385              {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
386               Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
387              {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
388               Binding{iBuilder->getStreamSetTy(), "deletionCounts", FixedRate(), RoundUpTo(iBuilder->getBitBlockWidth())}},
389              {}, {}, {})
390, mDeletionFieldWidth(fieldWidth)
391, mStreamCount(streamCount) {
392}
393
394const unsigned PEXT_width = 64;
395
396inline std::vector<Value *> get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
397    Value * m = iBuilder->fwCast(PEXT_width, iBuilder->simd_not(del_mask));
398    std::vector<Value *> masks;
399    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
400        masks.push_back(iBuilder->CreateExtractElement(m, i));
401    }
402    return masks;
403}
404
405// Apply PEXT deletion to a collection of blocks and swizzle the result.
406// strms contains the blocks to process
407inline std::vector<Value *> apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, std::vector<Value *> strms) {
408    Value * PEXT_func = nullptr;
409    if (PEXT_width == 64) {
410        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
411    } else if (PEXT_width == 32) {
412        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
413    }
414   
415    std::vector<Value *> output;     
416    for (unsigned i = 0; i < strms.size(); i++) {
417        Value * v = iBuilder->fwCast(PEXT_width, strms[i]);
418        output.push_back(Constant::getNullValue(v->getType())); 
419    }
420
421    // For each of the input streams
422    for (unsigned j = 0; j < strms.size(); j++) {
423        Value * v = iBuilder->fwCast(PEXT_width, strms[j]); // load stream j
424        // Process the stream's block in PEXT_width chunks (PEXT operation can't do more than 64 bits at a time)
425        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
426            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (fw of j is PEXT_width)
427            Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the block segment we just loaded
428            /*
429             We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks . E.g.:
430
431             a b c d
432             e f g h
433             i j k l
434             m n o p
435
436             Apply pext deletion at each position, then swizzle results:
437
438             a` e` i` m`
439             b` f` j` n`
440             c` g` k` o` 
441             d` i` l` p`         
442            */     
443            output[i] = iBuilder->CreateInsertElement(output[i], compressed, j); 
444        }
445    }
446   
447    return output;
448}
449
450inline Value * apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
451    Value * PEXT_func = nullptr;
452    if (PEXT_width == 64) {
453        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
454    } else if (PEXT_width == 32) {
455        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
456    }
457       
458    Value * v = iBuilder->fwCast(PEXT_width, strm);
459    Value * output = Constant::getNullValue(v->getType());
460    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
461        Value * field = iBuilder->CreateExtractElement(v, i);
462        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
463        output = iBuilder->CreateInsertElement(output, compressed, i);
464    }
465    return output;
466}
467
468// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
469// Kernel inputs: stream_count data streams plus one del_mask stream
470// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
471
472void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
473    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
474    const auto masks = get_PEXT_masks(iBuilder, delMask);
475    generateProcessingLoop(iBuilder, masks, delMask);
476}
477
478void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
479    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
480    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
481    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
482    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
483    const auto masks = get_PEXT_masks(iBuilder, delMask);
484    generateProcessingLoop(iBuilder, masks, delMask);
485}
486
487void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * delMask) {
488    if (mShouldSwizzle) {
489        generatePEXTAndSwizzleLoop(iBuilder, masks);
490    } else {
491        generatePEXTLoop(iBuilder, masks);
492    }
493    //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
494    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
495    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
496}
497
498void DeleteByPEXTkernel::generatePEXTLoop(const std::unique_ptr<KernelBuilder> &iBuilder, const std::vector<Value *> & masks) {
499    for (unsigned j = 0; j < mStreamCount; ++j) {
500        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
501        Value * output = apply_PEXT_deletion(iBuilder, masks, input);
502        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
503    }
504}
505
506void DeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks) {
507    // Group blocks together into input vector. Input should contain mStreamCount/mSwizzleFactor blocks (e.g. for U8U16 16/4=4)
508    // mStreamCount/mSwizzleFactor -> (mStreamCount + mSwizzleFactor - 1) / mSwizzleFactor
509    for (unsigned j = 0; j < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; ++j) {
510        std::vector<Value *> input;
511        unsigned streamSelectionIndex = j * mSwizzleFactor;
512        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
513                // Check if i > mStreamCount. If it is, add null streams until we get mStreamCount/mSwizzleFactor streams in the input vector
514            if ( i >= mStreamCount) {
515                                input.push_back(iBuilder->allZeroes());
516            } else {
517                input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
518            }
519        }
520        std::vector<Value *> output = apply_PEXT_deletion_with_swizzle(iBuilder, masks, input);
521        for (unsigned i = 0; i < mSwizzleFactor; i++) { 
522             iBuilder->storeOutputStreamBlock(std::string(mOutputSwizzleNameBase) + std::to_string(j), iBuilder->getInt32(i), output[i]);
523        }
524    }
525}
526
527DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, bool shouldSwizzle)
528: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + (shouldSwizzle ? "swiz" : "noswiz"),
529                  {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
530                      Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
531                  {}, {}, {}, {})
532, mDelCountFieldWidth(fw)
533, mStreamCount(streamCount)
534, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
535, mShouldSwizzle(shouldSwizzle)
536{
537    if(mShouldSwizzle) {       
538        for (unsigned i = 0; i < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; i++) { 
539            mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mSwizzleFactor), std::string(mOutputSwizzleNameBase) + std::to_string(i));
540        }
541    } else {
542        // No swizzling. Output results as single stream set
543        mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mStreamCount), "outputStreamSet");
544    }
545    mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(), "deletionCounts");
546}
547
548   
549//
550// This kernel performs final stream compression for a set of N bitstreams, given
551// (a) a set of bitstreams partially compressed within K-bit fields and stored
552//     in K-bit swizzled form, and
553// (b) a stream of deletion/extraction counts per K-bit stride.
554//
555// Restrictions:  At present, only K=64 is supported.
556//                At present, N must be an exact multiple of BLOCK_SIZE/K.
557//
558// The kernel always consumes full blocks of input and emits data into the output
559// buffer in swizzles of K items at a time.   Upon completion of a segment,
560// up to K-1 pending output items per stream may be stored in the kernel state.
561//
562// Note: that both input streams and output streams are stored in swizzled form.
563//
564
565SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned bitStreamCount, unsigned fieldWidth)
566: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
567                     {Binding{iBuilder->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
568, mBitStreamCount(bitStreamCount)
569    , mFieldWidth(fieldWidth)
570    , mSwizzleFactor(iBuilder->getBitBlockWidth() / fieldWidth)
571    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
572        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
573        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
574        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
575        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
576        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
577        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
578            mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
579            mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
580            addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
581        }
582        addScalar(iBuilder->getSizeTy(), "pendingOffset");
583}
584   
585void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
586       
587    Value * countsPerStridePtr = iBuilder->getInputStreamBlockPtr("countsPerStride", iBuilder->getInt32(0));
588    Value * countStreamPtr = iBuilder->CreatePointerCast(countsPerStridePtr, iBuilder->getIntNTy(mFieldWidth)->getPointerTo());
589   
590    // Output is written and committed to the output buffer one swizzle at a time.
591    //
592    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
593    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
594   
595    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
596    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
597    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
598
599    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
600    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
601    // There is a separate vector of pending data for each swizzle group.
602    std::vector<Value *> pendingData;
603    std::vector<Value *> outputStreamPtr;
604    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
605        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
606        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
607    }
608   
609    // Generate code for each of the mSwizzleFactor fields making up a block.
610    // We load the count for the field and process all swizzle groups accordingly.
611    for (unsigned i = 0; i < mSwizzleFactor; i++) {
612        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
613    //iBuilder->CallPrintInt("newItemCount", newItemCount);
614        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
615        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
616       
617        // Data from the ith swizzle pack of each group is processed
618        // according to the same newItemCount, pendingSpace, ...
619        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
620            Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
621        //iBuilder->CallPrintRegister("newItems", newItems);
622            // Combine as many of the new items as possible into the pending group.
623            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
624            //iBuilder->CallPrintRegister("combinedGroup", combinedGroup);
625            // To avoid an unpredictable branch, always store the combined group, whether full or not.
626               
627            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
628            // Any items in excess of the space available in the current pending group overflow for the next group.
629            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mFieldWidth, pendingSpace));
630            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
631            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
632        }
633        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
634        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
635    }
636    iBuilder->setScalarField("pendingOffset", pendingOffset);   
637    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
638    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
639    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
640        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
641        //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
642
643    }
644    iBuilder->setProducedItemCount("outputSwizzle0", produced);
645}
646
647void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * /* remainingBytes */) {
648    CreateDoBlockMethodCall(iBuilder);
649    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
650    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
651   
652    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
653    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
654    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
655    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
656
657    // Write the pending data.
658    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
659        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
660        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
661        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
662    }
663    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
664}
665}
Note: See TracBrowser for help on using the repository browser.