source: icGREP/icgrep-devel/icgrep/kernels/deletion.cpp @ 5440

Last change on this file since 5440 was 5440, checked in by nmedfort, 2 years ago

Large refactoring step. Removed IR generation code from Kernel (formally KernelBuilder?) and moved it into the new KernelBuilder? class.

File size: 20.1 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "deletion.h"
7#include <kernels/kernel_builder.h>
8#include <llvm/Support/raw_ostream.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14inline std::vector<Value *> parallel_prefix_deletion_masks(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask) {
15    Value * m = iBuilder->simd_not(del_mask);
16    Value * mk = iBuilder->simd_slli(fw, del_mask, 1);
17    std::vector<Value *> move_masks;
18    for (unsigned shift = 1; shift < fw; shift *= 2) {
19        Value * mp = mk;
20        for (unsigned lookright = 1; lookright < fw; lookright *= 2) {
21            mp = iBuilder->simd_xor(mp, iBuilder->simd_slli(fw, mp, lookright));
22        }
23        Value * mv = iBuilder->simd_and(mp, m);
24        m = iBuilder->simd_or(iBuilder->simd_xor(m, mv), iBuilder->simd_srli(fw, mv, shift));
25        mk = iBuilder->simd_and(mk, iBuilder->simd_not(mp));
26        move_masks.push_back(mv);
27    }
28    return move_masks;
29}
30
31inline Value * apply_parallel_prefix_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * del_mask, const std::vector<Value *> & mv, Value * strm) {
32    Value * s = iBuilder->simd_and(strm, iBuilder->simd_not(del_mask));
33    for (unsigned i = 0; i < mv.size(); i++) {
34        unsigned shift = 1 << i;
35        Value * t = iBuilder->simd_and(s, mv[i]);
36        s = iBuilder->simd_or(iBuilder->simd_xor(s, t), iBuilder->simd_srli(fw, t, shift));
37    }
38    return s;
39}
40
41inline Value * partial_sum_popcount(const std::unique_ptr<KernelBuilder> & iBuilder, const unsigned fw, Value * mask) {
42    Value * field = iBuilder->simd_popcount(fw, mask);
43    const auto count = iBuilder->getBitBlockWidth() / fw;
44    for (unsigned move = 1; move < count; move *= 2) {
45        field = iBuilder->simd_add(fw, field, iBuilder->mvmd_slli(fw, field, move));
46    }
47    return field;
48}
49
50// Apply deletion to a set of stream_count input streams to produce a set of output streams.
51// Kernel inputs: stream_count data streams plus one del_mask stream
52// Outputs: the deleted streams, plus a partial sum popcount
53
54void DeletionKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
55    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
56    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
57    for (unsigned j = 0; j < mStreamCount; ++j) {
58        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
59        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
60        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
61    }
62    Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
63    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
64}
65
66void DeletionKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * remainingBytes) {
67    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
68    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
69    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
70    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
71    const auto move_masks = parallel_prefix_deletion_masks(iBuilder, mDeletionFieldWidth, delMask);
72    for (unsigned j = 0; j < mStreamCount; ++j) {
73        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
74        Value * output = apply_parallel_prefix_deletion(iBuilder, mDeletionFieldWidth, delMask, move_masks, input);
75        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
76    }
77    Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
78    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
79}
80
81DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount)
82: BlockOrientedKernel("del" + std::to_string(fw) + "_" + std::to_string(streamCount),
83              {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
84               Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
85              {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
86               Binding{iBuilder->getStreamSetTy(), "deletionCounts"}},
87              {}, {}, {})
88, mDeletionFieldWidth(fw)
89, mStreamCount(streamCount) {
90}
91
92const unsigned PEXT_width = 64;
93
94inline std::vector<Value *> get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
95    Value * m = iBuilder->fwCast(PEXT_width, iBuilder->simd_not(del_mask));
96    std::vector<Value *> masks;
97    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
98        masks.push_back(iBuilder->CreateExtractElement(m, i));
99    }
100    return masks;
101}
102
103// Apply PEXT deletion to a collection of blocks and swizzle the result.
104// strms contains the blocks to process
105inline std::vector<Value *> apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, std::vector<Value *> strms) {
106    Value * PEXT_func = nullptr;
107    if (PEXT_width == 64) {
108        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
109    } else if (PEXT_width == 32) {
110        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
111    }
112   
113    std::vector<Value *> output;     
114    for (unsigned i = 0; i < strms.size(); i++) {
115        Value * v = iBuilder->fwCast(PEXT_width, strms[i]);
116        output.push_back(Constant::getNullValue(v->getType())); 
117    }
118
119    // For each of the input streams
120    for (unsigned j = 0; j < strms.size(); j++) {
121        Value * v = iBuilder->fwCast(PEXT_width, strms[j]); // load stream j
122        // Process the stream's block in PEXT_width chunks (PEXT operation can't do more than 64 bits at a time)
123        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
124            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (fw of j is PEXT_width)
125            Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the block segment we just loaded
126            /*
127             We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks . E.g.:
128
129             a b c d
130             e f g h
131             i j k l
132             m n o p
133
134             Apply pext deletion at each position, then swizzle results:
135
136             a` e` i` m`
137             b` f` j` n`
138             c` g` k` o` 
139             d` i` l` p`         
140            */     
141            output[i] = iBuilder->CreateInsertElement(output[i], compressed, j); 
142        }
143    }
144   
145    return output;
146}
147
148inline Value * apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
149    Value * PEXT_func = nullptr;
150    if (PEXT_width == 64) {
151        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
152    } else if (PEXT_width == 32) {
153        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
154    }
155       
156    Value * v = iBuilder->fwCast(PEXT_width, strm);
157    Value * output = Constant::getNullValue(v->getType());
158    for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
159        Value * field = iBuilder->CreateExtractElement(v, i);
160        Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
161        output = iBuilder->CreateInsertElement(output, compressed, i);
162    }
163    return output;
164}
165
166// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
167// Kernel inputs: stream_count data streams plus one del_mask stream
168// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
169
170void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
171    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
172    const auto masks = get_PEXT_masks(iBuilder, delMask);
173    generateProcessingLoop(iBuilder, masks, delMask);
174}
175
176void DeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
177    IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
178    Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
179    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
180    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
181    const auto masks = get_PEXT_masks(iBuilder, delMask);
182    generateProcessingLoop(iBuilder, masks, delMask);
183}
184
185void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * delMask) {
186    if (mShouldSwizzle) {
187        generatePEXTAndSwizzleLoop(iBuilder, masks);
188    } else {
189        generatePEXTLoop(iBuilder, masks);
190    }
191    //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
192    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
193    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
194}
195
196void DeleteByPEXTkernel::generatePEXTLoop(const std::unique_ptr<KernelBuilder> &iBuilder, const std::vector<Value *> & masks) {
197    for (unsigned j = 0; j < mStreamCount; ++j) {
198        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
199        Value * output = apply_PEXT_deletion(iBuilder, masks, input);
200        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
201    }
202}
203
204void DeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks) {
205    // Group blocks together into input vector. Input should contain mStreamCount/mSwizzleFactor blocks (e.g. for U8U16 16/4=4)
206    // mStreamCount/mSwizzleFactor -> (mStreamCount + mSwizzleFactor - 1) / mSwizzleFactor
207    for (unsigned j = 0; j < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; ++j) {
208        std::vector<Value *> input;
209        unsigned streamSelectionIndex = j * mSwizzleFactor;
210        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
211                // Check if i > mStreamCount. If it is, add null streams until we get mStreamCount/mSwizzleFactor streams in the input vector
212            if ( i >= mStreamCount) {
213                                input.push_back(iBuilder->allZeroes());
214            } else {
215                input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
216            }
217        }
218        std::vector<Value *> output = apply_PEXT_deletion_with_swizzle(iBuilder, masks, input);
219        for (unsigned i = 0; i < mSwizzleFactor; i++) { 
220             iBuilder->storeOutputStreamBlock(std::string(mOutputSwizzleNameBase) + std::to_string(j), iBuilder->getInt32(i), output[i]);
221        }
222    }
223}
224
225DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, bool shouldSwizzle)
226: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + (shouldSwizzle ? "swiz" : "noswiz"),
227                  {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
228                      Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
229                  {}, {}, {}, {})
230, mDelCountFieldWidth(fw)
231, mStreamCount(streamCount)
232, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
233, mShouldSwizzle(shouldSwizzle)
234{
235    if(mShouldSwizzle) {       
236        for (unsigned i = 0; i < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; i++) { 
237            mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mSwizzleFactor), std::string(mOutputSwizzleNameBase) + std::to_string(i));
238        }
239    } else {
240        // No swizzling. Output results as single stream set
241        mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mStreamCount), "outputStreamSet");
242    }
243    mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(), "deletionCounts");
244}
245
246   
247//
248// This kernel performs final stream compression for a set of N bitstreams, given
249// (a) a set of bitstreams partially compressed within K-bit fields and stored
250//     in K-bit swizzled form, and
251// (b) a stream of deletion/extraction counts per K-bit stride.
252//
253// Restrictions:  At present, only K=64 is supported.
254//                At present, N must be an exact multiple of BLOCK_SIZE/K.
255//
256// The kernel always consumes full blocks of input and emits data into the output
257// buffer in swizzles of K items at a time.   Upon completion of a segment,
258// up to K-1 pending output items per stream may be stored in the kernel state.
259//
260// Note: that both input streams and output streams are stored in swizzled form.
261//
262
263SwizzledBitstreamCompressByCount::SwizzledBitstreamCompressByCount(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned bitStreamCount, unsigned fieldWidth)
264: BlockOrientedKernel("swizzled_compress" + std::to_string(fieldWidth) + "_" + std::to_string(bitStreamCount),
265                     {Binding{iBuilder->getStreamSetTy(), "countsPerStride"}}, {}, {}, {}, {})
266, mBitStreamCount(bitStreamCount)
267    , mFieldWidth(fieldWidth)
268    , mSwizzleFactor(iBuilder->getBitBlockWidth() / fieldWidth)
269    , mSwizzleSetCount((mBitStreamCount + mSwizzleFactor - 1)/mSwizzleFactor) {
270        assert((fieldWidth > 0) && ((fieldWidth & (fieldWidth - 1)) == 0) && "fieldWidth must be a power of 2");
271        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
272        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
273        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", MaxRatio(1)});
274        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
275        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
276            mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
277            mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), FixedRatio(1, 1, "outputSwizzle0")});
278            addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
279        }
280        addScalar(iBuilder->getSizeTy(), "pendingOffset");
281}
282   
283void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
284       
285    Value * countsPerStridePtr = iBuilder->getInputStreamBlockPtr("countsPerStride", iBuilder->getInt32(0));
286    Value * countStreamPtr = iBuilder->CreatePointerCast(countsPerStridePtr, iBuilder->getIntNTy(mFieldWidth)->getPointerTo());
287   
288    // Output is written and committed to the output buffer one swizzle at a time.
289    //
290    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
291    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
292   
293    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
294    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
295    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
296
297    // There may be pending data in the kernel state, for up to mFieldWidth-1 bits per stream.
298    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
299    // There is a separate vector of pending data for each swizzle group.
300    std::vector<Value *> pendingData;
301    std::vector<Value *> outputStreamPtr;
302    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
303        pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
304        outputStreamPtr.push_back(iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0)));
305    }
306   
307    // Generate code for each of the mSwizzleFactor fields making up a block.
308    // We load the count for the field and process all swizzle groups accordingly.
309    for (unsigned i = 0; i < mSwizzleFactor; i++) {
310        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
311        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
312        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
313       
314        // Data from the ith swizzle pack of each group is processed
315        // according to the same newItemCount, pendingSpace, ...
316        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
317            Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
318            // Combine as many of the new items as possible into the pending group.
319            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
320            // To avoid an unpredictable branch, always store the combined group, whether full or not.
321               
322            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
323            // Any items in excess of the space available in the current pending group overflow for the next group.
324            Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mFieldWidth, pendingSpace));
325            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
326            pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
327        }
328        outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
329        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
330    }
331    iBuilder->setScalarField("pendingOffset", pendingOffset);
332   
333    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
334    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
335    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
336        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
337    }
338    iBuilder->setProducedItemCount("outputSwizzle0", produced);
339}
340
341void SwizzledBitstreamCompressByCount::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, Value * /* remainingBytes */) {
342    CreateDoBlockMethodCall(iBuilder);
343    Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
344    Constant * outputIndexShift = iBuilder->getSize(std::log2(mFieldWidth));
345   
346    Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
347    Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
348    Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
349    Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
350
351    // Write the pending data.
352    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
353        Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
354        Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0));
355        iBuilder->CreateBlockAlignedStore(pendingData, iBuilder->CreateGEP(outputStreamPtr, outputIndex));
356    }
357    iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
358}
359}
Note: See TracBrowser for help on using the repository browser.