source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 5385

Last change on this file since 5385 was 5376, checked in by cameron, 2 years ago

Bug fix for SwizzledBitstreamCompressByCount? final block

File size: 18.9 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <IR_Gen/idisa_builder.h>
8#include <llvm/IR/Value.h>
9#include <llvm/IR/Module.h>
10#include <llvm/Support/raw_ostream.h>
11
12using namespace llvm;
13
14namespace kernel {
15
16inline std::vector<Value *> parallel_prefix_deletion_masks(IDISA::IDISA_Builder * iBuilder, const unsigned fw, Value * del_mask) {
17    Value * m = iBuilder->simd_not(del_mask);
18    Value * mk = iBuilder->simd_slli(fw, del_mask, 1);
19    std::vector<Value *> move_masks;
20    for (unsigned shift = 1; shift < fw; shift *= 2) {
21        Value * mp = mk;
22        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
23            mp = iBuilder->simd_xor(mp, iBuilder->simd_slli(fw, mp, lookright));
24        }
25        Value * mv = iBuilder->simd_and(mp, m);
26        m = iBuilder->simd_or(iBuilder->simd_xor(m, mv), iBuilder->simd_srli(fw, mv, shift));
27        mk = iBuilder->simd_and(mk, iBuilder->simd_not(mp));
28        move_masks.push_back(mv);
29    }
30    return move_masks;
31}
32
33inline Value * apply_parallel_prefix_deletion(IDISA::IDISA_Builder * iBuilder, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
34    Value * s = iBuilder->simd_and(strm, iBuilder->simd_not(del_mask));
35    for (unsigned i = 0; i < mv.size(); i++) {
36        unsigned shift = 1 << i;
37        Value * t = iBuilder->simd_and(s, mv[i]);
38        s = iBuilder->simd_or(iBuilder->simd_xor(s, t), iBuilder->simd_srli(fw, t, shift));
39    }
40    return s;
41}
42
43inline Value * partial_sum_popcount(IDISA::IDISA_Builder * iBuilder, const unsigned fw, Value * mask) {
44    Value * field = iBuilder->simd_popcount(fw, mask);
45    const auto count = iBuilder->getBitBlockWidth() / fw;
46    for (unsigned move = 1; move < count; move *= 2) {
47        field = iBuilder->simd_add(fw, field, iBuilder->mvmd_slli(fw, field, move));
48    }
49    return field;
50}
51
52// Apply deletion to a set of stream_count input streams to produce a set of output streams.
53// Kernel inputs: stream_count data streams plus one del_mask stream
54// Outputs: the deleted streams, plus a partial sum popcount
55
56void DeletionKernel::generateDoBlockMethod() {
57    Value * delMask = loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
58    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
59    for (unsigned j = 0; j < mStreamCount; ++j) {
60        Value * input = loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
61        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
62        storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
63    }
64    Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
65    storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
66}
67
68void DeletionKernel::generateFinalBlockMethod(Value * remainingBytes) {
69    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
70    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
71    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
72    Value * delMask = iBuilder->CreateOr(EOF_del, loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
73    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
74    for (unsigned j = 0; j < mStreamCount; ++j) {
75        Value * input = loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
76        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
77        storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
78    }
79    Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
80    storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
81}
82
83DeletionKernel::DeletionKernel(IDISA::IDISA_Builder * iBuilder, unsigned fw, unsigned streamCount)
84: BlockOrientedKernel(iBuilder, "del",
85              {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
86               Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
87              {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
88               Binding{iBuilder->getStreamSetTy(), "deletionCounts"}},
89              {}, {}, {})
90, mDeletionFieldWidth(fw)
91, mStreamCount(streamCount) {
92}
93
94const unsigned PEXT_width = 64;
95
96inline std::vector<Value *> get_PEXT_masks(IDISA::IDISA_Builder * iBuilder, Value * del_mask) {
97    Value * m = iBuilder->fwCast(PEXT_width, iBuilder->simd_not(del_mask));
98    std::vector<Value *> masks;
99    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
100        masks.push_back(iBuilder->CreateExtractElement(m, i));
101    }
102    return masks;
103}
104
105// Apply PEXT deletion to a collection of blocks and swizzle the result.
106// strms contains the blocks to process
107inline std::vector<Value *> apply_PEXT_deletion_with_swizzle(IDISA::IDISA_Builder * iBuilder, const std::vector<Value *> & masks, std::vector<Value *> strms) {   
108    Value * PEXT_func = nullptr;
109    if (PEXT_width == 64) {
110        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
111    } else if (PEXT_width == 32) {
112        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
113    }
114   
115    std::vector<Value *> output;     
116    for (unsigned i = 0; i < strms.size(); i++) {
117        Value * v = iBuilder->fwCast(PEXT_width, strms[i]);
118        output.push_back(Constant::getNullValue(v->getType())); 
119    }
120
121    // For each of the input streams
122    for (unsigned j = 0; j < strms.size(); j++) {
123        Value * v = iBuilder->fwCast(PEXT_width, strms[j]); // load stream j
124        // Process the stream's block in PEXT_width chunks (PEXT operation can't do more than 64 bits at a time)
125        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
126            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (fw of j is PEXT_width)
127            Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the block segment we just loaded
128            /*
129             We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks . E.g.:
130
131             a b c d
132             e f g h
133             i j k l
134             m n o p
135
136             Apply pext deletion at each position, then swizzle results:
137
138             a` e` i` m`
139             b` f` j` n`
140             c` g` k` o` 
141             d` i` l` p`         
142            */     
143            output[i] = iBuilder->CreateInsertElement(output[i], compressed, j); 
144        }
145    }
146   
147    return output;
148}
149
150inline Value * apply_PEXT_deletion(IDISA::IDISA_Builder * iBuilder, const std::vector<Value *> & masks, Value * strm) { 
151    Value * PEXT_func = nullptr;
152    if (PEXT_width == 64) {
153        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
154    } else if (PEXT_width == 32) {
155        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
156    }
157       
158    Value * v = iBuilder->fwCast(PEXT_width, strm);
159    Value * output = Constant::getNullValue(v->getType());
160    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
161        Value * field = iBuilder->CreateExtractElement(v, i);
162        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
163        output = iBuilder->CreateInsertElement(output, compressed, i);
164    }
165    return output;
166}
167
168// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
169// Kernel inputs: stream_count data streams plus one del_mask stream
170// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
171
172void DeleteByPEXTkernel::generateDoBlockMethod() {
173    Value * delMask = loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
174    const auto masks = get_PEXT_masks(iBuilder, delMask);
175    generateProcessingLoop(masks, delMask);
176}
177
178void DeleteByPEXTkernel::generateFinalBlockMethod(Value * remainingBytes) {
179    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
180    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
181    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
182    Value * delMask = iBuilder->CreateOr(EOF_del, loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
183    const auto masks = get_PEXT_masks(iBuilder, delMask);
184    generateProcessingLoop(masks, delMask);
185}
186
187void DeleteByPEXTkernel::generateProcessingLoop(const std::vector<Value *> & masks, Value * delMask) {
188    if (mShouldSwizzle)   
189        generatePEXTAndSwizzleLoop(masks);
190    else
191        generatePEXTLoop(masks);   
192   
193    //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
194    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
195    storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
196}
197
198void DeleteByPEXTkernel::generatePEXTLoop(const std::vector<Value *> & masks) {
199    for (unsigned j = 0; j < mStreamCount; ++j) {
200        Value * input = loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
201        Value * output = apply_PEXT_deletion(iBuilder, masks, input);
202        storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
203    }
204}
205
206void DeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::vector<Value *> & masks) {
207    // Group blocks together into input vector. Input should contain mStreamCount/mSwizzleFactor blocks (e.g. for U8U16 16/4=4)
208    // mStreamCount/mSwizzleFactor -> (mStreamCount + mSwizzleFactor - 1) / mSwizzleFactor
209    for (unsigned j = 0; j < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; ++j) {
210        std::vector<Value *> input;
211        unsigned streamSelectionIndex = j * mSwizzleFactor;
212        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
213                // Check if i > mStreamCount. If it is, add null streams until we get mStreamCount/mSwizzleFactor streams in the input vector
214            if ( i >= mStreamCount)
215                                input.push_back(iBuilder->allZeroes());
216                        else
217                        input.push_back(loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
218        }
219        std::vector<Value *> output = apply_PEXT_deletion_with_swizzle(iBuilder, masks, input);
220        for (unsigned i = 0; i < mSwizzleFactor; i++) { 
221             storeOutputStreamBlock(std::string(mOutputSwizzleNameBase) + std::to_string(j), iBuilder->getInt32(i), output[i]);
222        }
223    }
224}
225
226DeleteByPEXTkernel::DeleteByPEXTkernel(IDISA::IDISA_Builder * iBuilder, unsigned fw, unsigned streamCount, bool shouldSwizzle)
227: BlockOrientedKernel(iBuilder, "PEXTdel",
228                      {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
229                          Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
230                      {}, {}, {}, {})
231, mDelCountFieldWidth(fw)
232, mStreamCount(streamCount)
233, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
234, mShouldSwizzle(shouldSwizzle)
235{
236    if(mShouldSwizzle) {       
237        for (unsigned i = 0; i < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; i++) { 
238            mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mSwizzleFactor), std::string(mOutputSwizzleNameBase) + std::to_string(i));
239        }
240    } else {
241        // No swizzling. Output results as single stream set
242        mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mStreamCount), "outputStreamSet");
243    }
244    mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(), "deletionCounts");
245}
246
247   
248//
249// This kernel performs final stream compression for a set of N bitstreams, given
250// (a) a set of bitstreams partially compressed within K-bit fields and stored
251//     in K-bit swizzled form, and
252// (b) a stream of deletion/extraction counts per K-bit stride.
253//
254// Restrictions:  At present, only K=64 is supported.
255//                At present, N must be an exact multiple of BLOCK_SIZE/K.
256//
257// The kernel always consumes full blocks of input and emits data into the output
258// buffer in swizzles of K items at a time.   Upon completion of a segment,
259// up to K-1 pending output items per stream may be stored in the kernel state.
260//
261// Note: that both input streams and output streams are stored in swizzled form.
262//
263
264   
265
266SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(IDISA::IDISA_Builder * iBuilder, unsigned bitStreamCount, unsigned fieldWidth)
267    : BlockOrientedKernel(iBuilder, "swizzled_compress", {Binding{iBuilder->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
268, mBitStreamCount(bitStreamCount)
269    , mFieldWidth(fieldWidth)
270    , mSwizzleFactor(iBuilder->getBitBlockWidth() / fieldWidth)
271    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
272        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
273        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
274        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
275        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", MaxRatio(1)});
276        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
277        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
278            mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
279            mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), FixedRatio(1, 1, "outputSwizzle0")});
280            addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
281        }
282        addScalar(iBuilder->getSizeTy(), "pendingOffset");
283}
284
285   
286void SwizzledBitstreamCompressByCount::generateDoBlockMethod() {
287       
288    Value * countStreamPtr = iBuilder->CreateBitCast(getInputStreamBlockPtr("countsPerStride", iBuilder->getInt32(0)), iBuilder->getIntNTy(mFieldWidth)->getPointerTo());
289   
290    // Output is written and committed to the output buffer one swizzle at a time.
291    //
292    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
293    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
294   
295    Value * outputProduced = getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
296    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
297    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
298
299    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
300    Value * pendingOffset = getScalarField("pendingOffset");
301    // There is a separate vector of pending data for each swizzle group.
302    std::vector<Value *> pendingData;
303    std::vector<Value *> outputStreamPtr;
304    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
305        pendingData.push_back(getScalarField("pendingSwizzleData" + std::to_string(i)));
306        outputStreamPtr.push_back(getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
307    }
308   
309    // Generate code for each of the mSwizzleFactor fields making up a block.
310    // We load the count for the field and process all swizzle groups accordingly.
311    for (unsigned i = 0; i < mSwizzleFactor; i++) {
312        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
313        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
314        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
315       
316        // Data from the ith swizzle pack of each group is processed
317        // according to the same newItemCount, pendingSpace, ...
318        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
319            Value * newItems = loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
320            // Combine as many of the new items as possible into the pending group.
321            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
322            // To avoid an unpredictable branch, always store the combined group, whether full or not.
323               
324            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
325            // Any items in excess of the space available in the current pending group overflow for the next group.
326            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mFieldWidth, pendingSpace));
327            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
328            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
329        }
330        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
331        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
332    }
333    setScalarField("pendingOffset", pendingOffset);
334   
335    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
336    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
337    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
338        setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
339    }
340    setProducedItemCount("outputSwizzle0", produced);
341}
342
343void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(Value * remainingBytes) {
344    CreateDoBlockMethodCall();
345    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
346    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
347   
348    Value * outputProduced = getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
349    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
350    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
351    Value * pendingOffset = getScalarField("pendingOffset");
352
353    // Write the pending data.
354    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
355        Value * pendingData = getScalarField("pendingSwizzleData" + std::to_string(i));
356        Value * outputStreamPtr = getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
357        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
358    }
359    setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
360}
361}
Note: See TracBrowser for help on using the repository browser.