source: icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp @ 6079

Last change on this file since 6079 was 6079, checked in by cameron, 12 months ago

StreamExpandKernel: eliminate kernel state: rely on PopcountOf attribute

File size: 21.0 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <toolchain/toolchain.h>
9#include <toolchain/driver.h>
10#include <toolchain/cpudriver.h>
11#include <IR_Gen/idisa_target.h>
12#include <llvm/IR/Intrinsics.h>
13#include <llvm/IR/Module.h>
14
15
16using namespace llvm;
17
18namespace kernel {
19
20PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned swizzleFactor, std::string name)
21: MultiBlockKernel(std::move(name),
22// input stream sets
23{Binding{b->getStreamSetTy(), "marker", FixedRate(), Principal()},
24Binding{b->getStreamSetTy(swizzleFactor), "source", PopcountOf("marker"), BlockSize(b->getBitBlockWidth() / swizzleFactor) }},
25// output stream set
26{Binding{b->getStreamSetTy(swizzleFactor), "output", FixedRate(), BlockSize(b->getBitBlockWidth() / swizzleFactor)}},
27{}, {}, {})
28, mSwizzleFactor(swizzleFactor) {
29
30}
31
32void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
33    BasicBlock * const entry = b->GetInsertBlock();
34    BasicBlock * const processBlock = b->CreateBasicBlock("processBlock");
35    BasicBlock * const finishedStrides = b->CreateBasicBlock("finishedStrides");
36    const auto pdepWidth = b->getBitBlockWidth() / mSwizzleFactor;
37    ConstantInt * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
38    ConstantInt * const PDEP_WIDTH = b->getSize(pdepWidth);
39
40    Constant * const ZERO = b->getSize(0);
41    Value * const sourceItemCount = b->getProcessedItemCount("source");
42
43    Value * const initialSourceOffset = b->CreateURem(sourceItemCount, BLOCK_WIDTH);
44    b->CreateBr(processBlock);
45
46    b->SetInsertPoint(processBlock);
47    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
48    strideIndex->addIncoming(ZERO, entry);
49    PHINode * const bufferPhi = b->CreatePHI(b->getBitBlockType(), 2);
50    bufferPhi->addIncoming(Constant::getNullValue(b->getBitBlockType()), entry);
51    PHINode * const sourceOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
52    sourceOffsetPhi->addIncoming(initialSourceOffset, entry);
53    PHINode * const bufferSizePhi = b->CreatePHI(b->getSizeTy(), 2);
54    bufferSizePhi->addIncoming(ZERO, entry);
55
56    // Extract the values we will use in the main processing loop
57    Value * const markerStream = b->getInputStreamBlockPtr("marker", ZERO, strideIndex);
58    Value * const markerValue = b->CreateBlockAlignedLoad(markerStream);
59    Value * const selectors = b->fwCast(pdepWidth, markerValue);
60    Value * const numOfSelectors = b->simd_popcount(pdepWidth, selectors);
61
62    // For each element of the marker block
63    Value * bufferSize = bufferSizePhi;
64    Value * sourceOffset = sourceOffsetPhi;
65    Value * buffer = bufferPhi;
66    for (unsigned i = 0; i < mSwizzleFactor; i++) {
67
68        // How many bits will we deposit?
69        Value * const required = b->CreateExtractElement(numOfSelectors, b->getSize(i));
70
71        // Aggressively enqueue any additional bits
72        BasicBlock * const entry = b->GetInsertBlock();
73        BasicBlock * const enqueueBits = b->CreateBasicBlock();
74        b->CreateBr(enqueueBits);
75
76        b->SetInsertPoint(enqueueBits);
77        PHINode * const updatedBufferSize = b->CreatePHI(bufferSize->getType(), 2);
78        updatedBufferSize->addIncoming(bufferSize, entry);
79        PHINode * const updatedSourceOffset = b->CreatePHI(sourceOffset->getType(), 2);
80        updatedSourceOffset->addIncoming(sourceOffset, entry);
81        PHINode * const updatedBuffer = b->CreatePHI(buffer->getType(), 2);
82        updatedBuffer->addIncoming(buffer, entry);
83
84        // Calculate the block and swizzle index of the current swizzle row
85        Value * const blockOffset = b->CreateUDiv(updatedSourceOffset, BLOCK_WIDTH);
86        Value * const swizzleIndex = b->CreateUDiv(b->CreateURem(updatedSourceOffset, BLOCK_WIDTH), PDEP_WIDTH);
87        Value * const swizzle = b->CreateBlockAlignedLoad(b->getInputStreamBlockPtr("source", swizzleIndex, blockOffset));
88        Value * const swizzleOffset = b->CreateURem(updatedSourceOffset, PDEP_WIDTH);
89
90        // Shift the swizzle to the right to clear off any used bits ...
91        Value * const swizzleShift = b->simd_fill(pdepWidth, swizzleOffset);
92        Value * const unreadBits = b->CreateLShr(swizzle, swizzleShift);
93
94        // ... then to the left to align the bits with the buffer and combine them.
95        Value * const bufferShift = b->simd_fill(pdepWidth, updatedBufferSize);
96        Value * const pendingBits = b->CreateShl(unreadBits, bufferShift);
97
98        buffer = b->CreateOr(updatedBuffer, pendingBits);
99        updatedBuffer->addIncoming(buffer, enqueueBits);
100
101        // Update the buffer size with the number of bits we have actually enqueued
102        Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), updatedBufferSize);
103        bufferSize = b->CreateUMin(maxBufferSize, PDEP_WIDTH);
104        updatedBufferSize->addIncoming(bufferSize, enqueueBits);
105
106        // ... and increment the source offset by the number we actually inserted
107        Value * const inserted = b->CreateSub(bufferSize, updatedBufferSize);
108        sourceOffset = b->CreateAdd(updatedSourceOffset, inserted);
109        updatedSourceOffset->addIncoming(sourceOffset, enqueueBits);
110
111        // INVESTIGATE: we can branch at most once here. I'm not sure whether the potential
112        // branch misprediction is better or worse than always filling from two swizzles to
113        // ensure that we have enough bits to deposit.
114        BasicBlock * const depositBits = b->CreateBasicBlock();
115        b->CreateUnlikelyCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
116
117        b->SetInsertPoint(depositBits);
118
119        // Apply PDEP to each element of the combined swizzle using the current PDEP mask
120        Value * const mask = b->CreateExtractElement(selectors, i);
121        Value* result = b->simd_pdep(pdepWidth, buffer, b->simd_fill(pdepWidth, mask));
122
123        // Store the result
124        Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output", b->getSize(i), strideIndex);
125        b->CreateBlockAlignedStore(result, outputStreamPtr);
126
127        // Shift away any used bits from the buffer and decrement our buffer size by the number we used
128        Value * const usedShift = b->simd_fill(pdepWidth, required);
129        buffer = b->CreateLShr(buffer, usedShift);
130        bufferSize = b->CreateSub(bufferSize, required);
131    }
132
133    BasicBlock * const finishedBlock = b->GetInsertBlock();
134    sourceOffsetPhi->addIncoming(sourceOffset, finishedBlock);
135    bufferSizePhi->addIncoming(bufferSize, finishedBlock);
136    bufferPhi->addIncoming(buffer, finishedBlock);
137    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
138    strideIndex->addIncoming(nextStrideIndex, finishedBlock);
139    b->CreateLikelyCondBr(b->CreateICmpNE(nextStrideIndex, numOfBlocks), processBlock, finishedStrides);
140
141    b->SetInsertPoint(finishedStrides);
142}
143   
144StreamExpandKernel::StreamExpandKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, unsigned sourceStreamCount, unsigned selectedStreamBase, unsigned selectedStreamCount)
145: MultiBlockKernel("streamExpand" + std::to_string(fieldWidth) + "_" + std::to_string(sourceStreamCount) + "_" + std::to_string(selectedStreamBase) + "_" + std::to_string(selectedStreamCount),
146                   {Binding{kb->getStreamSetTy(), "marker", FixedRate(), Principal()},
147                       Binding{kb->getStreamSetTy(sourceStreamCount), "source", PopcountOf("marker")}},
148                   {Binding{kb->getStreamSetTy(selectedStreamCount), "output", FixedRate()}},
149                   {}, {}, {})
150, mFieldWidth(fieldWidth)
151, mSelectedStreamBase(selectedStreamBase)
152, mSelectedStreamCount(selectedStreamCount) {
153}
154
155void StreamExpandKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
156    const unsigned fw = mFieldWidth;
157    Type * fwTy = b->getIntNTy(fw);
158    Type * sizeTy = b->getSizeTy();
159    const unsigned numFields = b->getBitBlockWidth()/fw;
160   
161    Constant * const ZERO = b->getSize(0);
162    Constant * bwConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
163    Constant * bw_sub1Const = ConstantInt::get(sizeTy, b->getBitBlockWidth() -1);
164    Constant * fwConst = ConstantInt::get(sizeTy, fw);
165    Constant * fw_sub1Const = ConstantInt::get(sizeTy, fw-1);
166    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
167    Constant * fw_sub1Splat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
168   
169    BasicBlock * entry = b->GetInsertBlock();
170    BasicBlock * expandLoop = b->CreateBasicBlock("expandLoop");
171    BasicBlock * expansionDone = b->CreateBasicBlock("expansionDone");
172   
173    Value * processedSourceItems = b->getProcessedItemCount("source");
174    Value * sourceOffset = b->CreateURem(processedSourceItems, bwConst);
175    std::vector<Value *> pendingData(mSelectedStreamCount);
176    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
177        pendingData[i] = b->loadInputStreamBlock("source", b->getInt32(mSelectedStreamBase + i), ZERO);
178    }
179   
180    b->CreateBr(expandLoop);
181    // Main Loop
182    b->SetInsertPoint(expandLoop);
183    PHINode * blockNoPhi = b->CreatePHI(b->getSizeTy(), 2);
184    PHINode * pendingOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
185    PHINode * pendingDataPhi[mSelectedStreamCount];
186    blockNoPhi->addIncoming(ZERO, entry);
187    pendingOffsetPhi->addIncoming(sourceOffset, entry);
188    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
189        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
190        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
191    }
192    Value * deposit_mask = b->loadInputStreamBlock("marker", ZERO, blockNoPhi);
193    // The source stream may not be positioned at a block boundary.  Partial data
194    // has been saved in the kernel state, determine the next full block number
195    // for loading source streams.
196    Value * pendingBlockEnd = b->CreateAdd(pendingOffsetPhi, bw_sub1Const);
197    Value * srcBlockNo = b->CreateUDiv(pendingBlockEnd, bwConst);
198   
199    // Calculate the field values and offsets we need for assembling a
200    // a full block of source bits.  Assembly will use the following operations.
201    // A = b->simd_srlv(fw, b->mvmd_dsll(fw, source, pending, field_offset_lo), bit_offset);
202    // B = b->simd_sllv(fw, b->mvmd_dsll(fw, source, pending, field_offset_hi), shift_fwd);
203    // all_source_bits = simd_or(A, B);
204    Value * pendingOffset = b->CreateURem(pendingOffsetPhi, bwConst);
205    Value * pendingItems = b->CreateURem(b->CreateSub(bwConst, pendingOffset), bwConst);
206    Value * field_offset_lo = b->CreateUDiv(b->CreateAdd(pendingItems, fw_sub1Const), fwConst);
207    Value * bit_offset = b->simd_fill(fw, b->CreateURem(pendingOffset, fwConst));
208    // Carefully avoid a shift by the full fieldwith (which gives a poison value).
209    // field_offset_lo + 1 unless the bit_offset is 0, in which case it is just field_offset_lo.
210    Value * field_offset_hi =  b->CreateUDiv(pendingItems, fwConst);
211    // fw - bit_offset, unless bit_offset is 0, in which case, the shift_fwd is 0.
212    Value * shift_fwd = b->CreateURem(b->CreateSub(fwSplat, bit_offset), fwSplat);
213
214    // Once all source bits are assembled, they need to be distributed to the
215    // output fields in accord with the popcounts of the deposit mask fields.
216    // The bits for each output field will typically come from (at most) two
217    // source fields, with offsets.  Calculate the field numbers and offsets.
218   
219    Value * fieldPopCounts = b->simd_popcount(fw, deposit_mask);
220    // For each field determine the (partial) sum popcount of all fields prior to
221    // the current field.
222    Value * partialSum = fieldPopCounts;
223    for (unsigned i = 1; i < numFields; i *= 2) {
224        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
225    }
226    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
227    partialSum = b->mvmd_slli(fw, partialSum, 1);
228   
229    Value * source_field_lo = b->CreateUDiv(partialSum, fwSplat);
230    Value * source_field_hi = b->CreateUDiv(b->CreateAdd(partialSum, fw_sub1Splat), fwSplat);
231    Value * source_shift_lo = b->CreateAnd(partialSum, fw_sub1Splat);  // parallel URem
232    Value * source_shift_hi = b->CreateAnd(b->CreateSub(fwSplat, source_shift_lo), fw_sub1Splat);
233
234    // Now load and process source streams.
235    Value * source[mSelectedStreamCount];
236    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
237        source[i] = b->loadInputStreamBlock("source", b->getInt32(mSelectedStreamBase + i), srcBlockNo);
238        Value * A = b->simd_srlv(fw, b->mvmd_dsll(fw, source[i], pendingDataPhi[i], field_offset_lo), bit_offset);
239        Value * B = b->simd_sllv(fw, b->mvmd_dsll(fw, source[i], pendingDataPhi[i], field_offset_hi), shift_fwd);
240        Value * full_source_block = b->simd_or(A, B);
241        Value * C = b->simd_srlv(fw, b->mvmd_shuffle(fw, full_source_block, source_field_lo), source_shift_lo);
242        Value * D = b->simd_sllv(fw, b->mvmd_shuffle(fw, full_source_block, source_field_hi), source_shift_hi);
243        Value * output = b->bitCast(b->simd_or(C, D));
244        b->storeOutputStreamBlock("output", b->getInt32(i), blockNoPhi, output);
245        pendingDataPhi[i]->addIncoming(source[i], expandLoop);
246    }
247    //
248    // Update loop control Phis for the next iteration.
249    //
250    Value * nextBlk = b->CreateAdd(blockNoPhi, b->getSize(1));
251    blockNoPhi->addIncoming(nextBlk, expandLoop);
252    Value * newPending = b->CreateAdd(pendingOffsetPhi, blockPopCount);
253    pendingOffsetPhi->addIncoming(newPending, expandLoop);
254    //
255    // Now continue the loop if there are more blocks to process.
256    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
257    b->CreateCondBr(moreToDo, expandLoop, expansionDone);
258   
259    b->SetInsertPoint(expansionDone);
260}
261
262FieldDepositKernel::FieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
263: MultiBlockKernel("FieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
264                   {Binding{kb->getStreamSetTy(1), "depositMask"}, Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"}},
265                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
266                   {}, {}, {})
267, mFieldWidth(fieldWidth)
268, mStreamCount(streamCount) {
269}
270   
271void FieldDepositKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
272    BasicBlock * entry = kb->GetInsertBlock();
273    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
274    BasicBlock * done = kb->CreateBasicBlock("done");
275    Constant * const ZERO = kb->getSize(0);
276    kb->CreateBr(processBlock);
277    kb->SetInsertPoint(processBlock);
278    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
279    blockOffsetPhi->addIncoming(ZERO, entry);
280    Value * depositMask = kb->loadInputStreamBlock("depositMask", ZERO, blockOffsetPhi);
281    for (unsigned j = 0; j < mStreamCount; ++j) {
282        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
283        Value * output = kb->simd_pdep(mFieldWidth, input, depositMask);
284        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
285    }
286    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
287    blockOffsetPhi->addIncoming(nextBlk, processBlock);
288    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
289    kb->CreateCondBr(moreToDo, processBlock, done);
290    kb->SetInsertPoint(done);
291}
292
293PDEPFieldDepositKernel::PDEPFieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount, std::string suffix)
294: MultiBlockKernel("PDEPFieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount) + suffix,
295                   {Binding{kb->getStreamSetTy(), "depositMask"}, Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"}},
296                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
297                   {}, {}, {})
298, mPDEPWidth(fieldWidth)
299, mStreamCount(streamCount) {
300    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PDEP width for PDEPFieldDepositKernel");
301}
302
303void PDEPFieldDepositKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
304    Type * fieldTy = kb->getIntNTy(mPDEPWidth);
305    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
306    Constant * PDEP_func = nullptr;
307    if (mPDEPWidth == 64) {
308        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
309    } else if (mPDEPWidth == 32) {
310        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
311    }
312    BasicBlock * entry = kb->GetInsertBlock();
313    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
314    BasicBlock * done = kb->CreateBasicBlock("done");
315    Constant * const ZERO = kb->getSize(0);
316    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPDEPWidth;
317    kb->CreateBr(processBlock);
318    kb->SetInsertPoint(processBlock);
319    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
320    blockOffsetPhi->addIncoming(ZERO, entry);
321    std::vector<Value *> mask(fieldsPerBlock);
322//  When operating on fields individually, we can use vector load/store with
323//  extract/insert element operations, or we can use individual field load
324//  and stores.   Individual field operations require fewer total operations,
325//  but more memory instructions.   It may be that vector load/extract is better,
326//  while field store is better.   Vector insert then store creates long dependence
327//  chains.
328//
329#define PREFER_FIELD_STORES_OVER_INSERT_ELEMENT
330#ifdef PREFER_FIELD_LOADS_OVER_EXTRACT_ELEMENT
331    Value * depositMaskPtr = kb->getInputStreamBlockPtr("depositMask", ZERO, blockOffsetPhi);
332    depositMaskPtr = kb->CreatePointerCast(depositMaskPtr, fieldPtrTy);
333    for (unsigned i = 0; i < fieldsPerBlock; i++) {
334        mask[i] = kb->CreateLoad(kb->CreateGEP(depositMaskPtr, kb->getInt32(i)));
335    }
336#else
337    Value * depositMask = kb->fwCast(mPDEPWidth, kb->loadInputStreamBlock("depositMask", ZERO, blockOffsetPhi));
338    for (unsigned i = 0; i < fieldsPerBlock; i++) {
339        mask[i] = kb->CreateExtractElement(depositMask, kb->getInt32(i));
340    }
341#endif
342    for (unsigned j = 0; j < mStreamCount; ++j) {
343#ifdef PREFER_FIELD_LOADS_OVER_EXTRACT_ELEMENT
344        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
345        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
346#else
347        Value * inputStrm = kb->fwCast(mPDEPWidth, kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi));
348#endif
349#ifdef PREFER_FIELD_STORES_OVER_INSERT_ELEMENT
350        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
351        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
352#else
353        Value * outputStrm = kb->fwCast(mPDEPWidth, kb->allZeroes());
354#endif
355        for (unsigned i = 0; i < fieldsPerBlock; i++) {
356#ifdef PREFER_FIELD_LOADS_OVER_EXTRACT_ELEMENT
357            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
358#else
359            Value * field = kb->CreateExtractElement(inputStrm, kb->getInt32(i));
360#endif
361            Value * compressed = kb->CreateCall(PDEP_func, {field, mask[i]});
362#ifdef PREFER_FIELD_STORES_OVER_INSERT_ELEMENT
363            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
364        }
365#else
366            outputStrm = kb->CreateInsertElement(outputStrm, compressed, kb->getInt32(i));
367        }
368        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, outputStrm);
369#endif
370    }
371    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
372    blockOffsetPhi->addIncoming(nextBlk, processBlock);
373    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
374    kb->CreateCondBr(moreToDo, processBlock, done);
375    kb->SetInsertPoint(done);
376}
377
378void StreamDepositCompiler::makeCall(parabix::StreamSetBuffer * depositMask, parabix::StreamSetBuffer * inputs, parabix::StreamSetBuffer * outputs) {
379    if (mBufferBlocks == 0) {
380        llvm::report_fatal_error("StreamDepositCompiler needs a non-zero bufferBlocks parameter (for now).");
381    }
382    auto & b = mDriver.getBuilder();
383    unsigned N = mSelectedStreamCount;
384    parabix::StreamSetBuffer * expandedStreams = mDriver.addBuffer<parabix::StaticBuffer>(b, b->getStreamSetTy(N), mBufferBlocks);
385    Kernel * streamK = mDriver.addKernelInstance<StreamExpandKernel>(b, mFieldWidth, mSourceStreamCount, mSelectedStreamBase, N);
386    mDriver.makeKernelCall(streamK, {depositMask, inputs}, {expandedStreams});
387
388    Kernel * depositK = nullptr;
389    if (AVX2_available()) {
390        depositK = mDriver.addKernelInstance<PDEPFieldDepositKernel>(b, mFieldWidth, N, std::to_string(mSelectedStreamBase));
391    } else {
392        depositK = mDriver.addKernelInstance<FieldDepositKernel>(b, mFieldWidth, N);
393    }
394    mDriver.makeKernelCall(depositK, {depositMask, expandedStreams}, {outputs});
395}
396
397}
398
Note: See TracBrowser for help on using the repository browser.