source: icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp @ 6086

Last change on this file since 6086 was 6086, checked in by cameron, 10 months ago

Bug fix for StreamCompress

File size: 21.2 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <toolchain/toolchain.h>
9#include <toolchain/driver.h>
10#include <toolchain/cpudriver.h>
11#include <IR_Gen/idisa_target.h>
12#include <llvm/IR/Intrinsics.h>
13#include <llvm/IR/Module.h>
14
15
16using namespace llvm;
17
18namespace kernel {
19
20PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned swizzleFactor, std::string name)
21: MultiBlockKernel(std::move(name),
22// input stream sets
23{Binding{b->getStreamSetTy(), "marker", FixedRate(), Principal()},
24Binding{b->getStreamSetTy(swizzleFactor), "source", PopcountOf("marker"), BlockSize(b->getBitBlockWidth() / swizzleFactor) }},
25// output stream set
26{Binding{b->getStreamSetTy(swizzleFactor), "output", FixedRate(), BlockSize(b->getBitBlockWidth() / swizzleFactor)}},
27{}, {}, {})
28, mSwizzleFactor(swizzleFactor) {
29
30}
31
32void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
33    BasicBlock * const entry = b->GetInsertBlock();
34    BasicBlock * const processBlock = b->CreateBasicBlock("processBlock");
35    BasicBlock * const finishedStrides = b->CreateBasicBlock("finishedStrides");
36    const auto pdepWidth = b->getBitBlockWidth() / mSwizzleFactor;
37    ConstantInt * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
38    ConstantInt * const PDEP_WIDTH = b->getSize(pdepWidth);
39
40    Constant * const ZERO = b->getSize(0);
41    Value * const sourceItemCount = b->getProcessedItemCount("source");
42
43    Value * const initialSourceOffset = b->CreateURem(sourceItemCount, BLOCK_WIDTH);
44    b->CreateBr(processBlock);
45
46    b->SetInsertPoint(processBlock);
47    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
48    strideIndex->addIncoming(ZERO, entry);
49    PHINode * const bufferPhi = b->CreatePHI(b->getBitBlockType(), 2);
50    bufferPhi->addIncoming(Constant::getNullValue(b->getBitBlockType()), entry);
51    PHINode * const sourceOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
52    sourceOffsetPhi->addIncoming(initialSourceOffset, entry);
53    PHINode * const bufferSizePhi = b->CreatePHI(b->getSizeTy(), 2);
54    bufferSizePhi->addIncoming(ZERO, entry);
55
56    // Extract the values we will use in the main processing loop
57    Value * const markerStream = b->getInputStreamBlockPtr("marker", ZERO, strideIndex);
58    Value * const markerValue = b->CreateBlockAlignedLoad(markerStream);
59    Value * const selectors = b->fwCast(pdepWidth, markerValue);
60    Value * const numOfSelectors = b->simd_popcount(pdepWidth, selectors);
61
62    // For each element of the marker block
63    Value * bufferSize = bufferSizePhi;
64    Value * sourceOffset = sourceOffsetPhi;
65    Value * buffer = bufferPhi;
66    for (unsigned i = 0; i < mSwizzleFactor; i++) {
67
68        // How many bits will we deposit?
69        Value * const required = b->CreateExtractElement(numOfSelectors, b->getSize(i));
70
71        // Aggressively enqueue any additional bits
72        BasicBlock * const entry = b->GetInsertBlock();
73        BasicBlock * const enqueueBits = b->CreateBasicBlock();
74        b->CreateBr(enqueueBits);
75
76        b->SetInsertPoint(enqueueBits);
77        PHINode * const updatedBufferSize = b->CreatePHI(bufferSize->getType(), 2);
78        updatedBufferSize->addIncoming(bufferSize, entry);
79        PHINode * const updatedSourceOffset = b->CreatePHI(sourceOffset->getType(), 2);
80        updatedSourceOffset->addIncoming(sourceOffset, entry);
81        PHINode * const updatedBuffer = b->CreatePHI(buffer->getType(), 2);
82        updatedBuffer->addIncoming(buffer, entry);
83
84        // Calculate the block and swizzle index of the current swizzle row
85        Value * const blockOffset = b->CreateUDiv(updatedSourceOffset, BLOCK_WIDTH);
86        Value * const swizzleIndex = b->CreateUDiv(b->CreateURem(updatedSourceOffset, BLOCK_WIDTH), PDEP_WIDTH);
87        Value * const swizzle = b->CreateBlockAlignedLoad(b->getInputStreamBlockPtr("source", swizzleIndex, blockOffset));
88        Value * const swizzleOffset = b->CreateURem(updatedSourceOffset, PDEP_WIDTH);
89
90        // Shift the swizzle to the right to clear off any used bits ...
91        Value * const swizzleShift = b->simd_fill(pdepWidth, swizzleOffset);
92        Value * const unreadBits = b->CreateLShr(swizzle, swizzleShift);
93
94        // ... then to the left to align the bits with the buffer and combine them.
95        Value * const bufferShift = b->simd_fill(pdepWidth, updatedBufferSize);
96        Value * const pendingBits = b->CreateShl(unreadBits, bufferShift);
97
98        buffer = b->CreateOr(updatedBuffer, pendingBits);
99        updatedBuffer->addIncoming(buffer, enqueueBits);
100
101        // Update the buffer size with the number of bits we have actually enqueued
102        Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), updatedBufferSize);
103        bufferSize = b->CreateUMin(maxBufferSize, PDEP_WIDTH);
104        updatedBufferSize->addIncoming(bufferSize, enqueueBits);
105
106        // ... and increment the source offset by the number we actually inserted
107        Value * const inserted = b->CreateSub(bufferSize, updatedBufferSize);
108        sourceOffset = b->CreateAdd(updatedSourceOffset, inserted);
109        updatedSourceOffset->addIncoming(sourceOffset, enqueueBits);
110
111        // INVESTIGATE: we can branch at most once here. I'm not sure whether the potential
112        // branch misprediction is better or worse than always filling from two swizzles to
113        // ensure that we have enough bits to deposit.
114        BasicBlock * const depositBits = b->CreateBasicBlock();
115        b->CreateUnlikelyCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
116
117        b->SetInsertPoint(depositBits);
118
119        // Apply PDEP to each element of the combined swizzle using the current PDEP mask
120        Value * const mask = b->CreateExtractElement(selectors, i);
121        Value* result = b->simd_pdep(pdepWidth, buffer, b->simd_fill(pdepWidth, mask));
122
123        // Store the result
124        Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output", b->getSize(i), strideIndex);
125        b->CreateBlockAlignedStore(result, outputStreamPtr);
126
127        // Shift away any used bits from the buffer and decrement our buffer size by the number we used
128        Value * const usedShift = b->simd_fill(pdepWidth, required);
129        buffer = b->CreateLShr(buffer, usedShift);
130        bufferSize = b->CreateSub(bufferSize, required);
131    }
132
133    BasicBlock * const finishedBlock = b->GetInsertBlock();
134    sourceOffsetPhi->addIncoming(sourceOffset, finishedBlock);
135    bufferSizePhi->addIncoming(bufferSize, finishedBlock);
136    bufferPhi->addIncoming(buffer, finishedBlock);
137    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
138    strideIndex->addIncoming(nextStrideIndex, finishedBlock);
139    b->CreateLikelyCondBr(b->CreateICmpNE(nextStrideIndex, numOfBlocks), processBlock, finishedStrides);
140
141    b->SetInsertPoint(finishedStrides);
142}
143   
144StreamExpandKernel::StreamExpandKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, unsigned sourceStreamCount, unsigned selectedStreamBase, unsigned selectedStreamCount)
145: MultiBlockKernel("streamExpand" + std::to_string(fieldWidth) + "_" + std::to_string(sourceStreamCount) + "_" + std::to_string(selectedStreamBase) + "_" + std::to_string(selectedStreamCount),
146                   {Binding{kb->getStreamSetTy(), "marker", FixedRate(), Principal()},
147                       Binding{kb->getStreamSetTy(sourceStreamCount), "source", PopcountOf("marker")}},
148                   {Binding{kb->getStreamSetTy(selectedStreamCount), "output", FixedRate()}},
149                   {}, {}, {})
150, mFieldWidth(fieldWidth)
151, mSelectedStreamBase(selectedStreamBase)
152, mSelectedStreamCount(selectedStreamCount) {
153}
154
155void StreamExpandKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
156    const unsigned fw = mFieldWidth;
157    Type * fwTy = b->getIntNTy(fw);
158    Type * sizeTy = b->getSizeTy();
159    const unsigned numFields = b->getBitBlockWidth()/fw;
160   
161    Constant * const ZERO = b->getSize(0);
162    Constant * bwConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
163    Constant * bw_sub1Const = ConstantInt::get(sizeTy, b->getBitBlockWidth() -1);
164    Constant * fwConst = ConstantInt::get(sizeTy, fw);
165    Constant * fw_sub1Const = ConstantInt::get(sizeTy, fw-1);
166    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
167    Constant * fw_sub1Splat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
168   
169    BasicBlock * entry = b->GetInsertBlock();
170    BasicBlock * expandLoop = b->CreateBasicBlock("expandLoop");
171    BasicBlock * expansionDone = b->CreateBasicBlock("expansionDone");
172   
173    Value * processedSourceItems = b->getProcessedItemCount("source");
174    Value * sourceOffset = b->CreateURem(processedSourceItems, bwConst);
175    std::vector<Value *> pendingData(mSelectedStreamCount);
176    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
177        pendingData[i] = b->loadInputStreamBlock("source", b->getInt32(mSelectedStreamBase + i), ZERO);
178    }
179   
180    b->CreateBr(expandLoop);
181    // Main Loop
182    b->SetInsertPoint(expandLoop);
183    PHINode * blockNoPhi = b->CreatePHI(b->getSizeTy(), 2);
184    PHINode * pendingOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
185    PHINode * pendingDataPhi[mSelectedStreamCount];
186    blockNoPhi->addIncoming(ZERO, entry);
187    pendingOffsetPhi->addIncoming(sourceOffset, entry);
188    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
189        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
190        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
191    }
192    Value * deposit_mask = b->loadInputStreamBlock("marker", ZERO, blockNoPhi);
193    // The source stream may not be positioned at a block boundary.  Partial data
194    // has been saved in the kernel state, determine the next full block number
195    // for loading source streams.
196    Value * pendingBlockEnd = b->CreateAdd(pendingOffsetPhi, bw_sub1Const);
197    Value * srcBlockNo = b->CreateUDiv(pendingBlockEnd, bwConst);
198    // Calculate the field values and offsets we need for assembling a
199    // a full block of source bits.  Assembly will use the following operations.
200    // A = b->simd_srlv(fw, b->mvmd_dsll(fw, source, pending, field_offset_lo), bit_offset);
201    // B = b->simd_sllv(fw, b->mvmd_dsll(fw, source, pending, field_offset_hi), shift_fwd);
202    // all_source_bits = simd_or(A, B);
203    Value * pendingOffset = b->CreateURem(pendingOffsetPhi, bwConst);
204    Value * pendingItems = b->CreateURem(b->CreateSub(bwConst, pendingOffset), bwConst);
205    Value * field_offset_lo = b->CreateUDiv(b->CreateAdd(pendingItems, fw_sub1Const), fwConst);
206    Value * bit_offset = b->simd_fill(fw, b->CreateURem(pendingOffset, fwConst));
207    // Carefully avoid a shift by the full fieldwith (which gives a poison value).
208    // field_offset_lo + 1 unless the bit_offset is 0, in which case it is just field_offset_lo.
209    Value * field_offset_hi =  b->CreateUDiv(pendingItems, fwConst);
210    // fw - bit_offset, unless bit_offset is 0, in which case, the shift_fwd is 0.
211    Value * shift_fwd = b->CreateURem(b->CreateSub(fwSplat, bit_offset), fwSplat);
212
213    // Once all source bits are assembled, they need to be distributed to the
214    // output fields in accord with the popcounts of the deposit mask fields.
215    // The bits for each output field will typically come from (at most) two
216    // source fields, with offsets.  Calculate the field numbers and offsets.
217   
218    Value * fieldPopCounts = b->simd_popcount(fw, deposit_mask);
219    // For each field determine the (partial) sum popcount of all fields prior to
220    // the current field.
221    Value * partialSum = fieldPopCounts;
222    for (unsigned i = 1; i < numFields; i *= 2) {
223        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
224    }
225    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
226    partialSum = b->mvmd_slli(fw, partialSum, 1);
227   
228    Value * source_field_lo = b->CreateUDiv(partialSum, fwSplat);
229    Value * source_field_hi = b->CreateUDiv(b->CreateAdd(partialSum, fw_sub1Splat), fwSplat);
230    Value * source_shift_lo = b->CreateAnd(partialSum, fw_sub1Splat);  // parallel URem
231    Value * source_shift_hi = b->CreateAnd(b->CreateSub(fwSplat, source_shift_lo), fw_sub1Splat);
232
233    // Now load and process source streams.
234    Value * source[mSelectedStreamCount];
235    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
236        source[i] = b->loadInputStreamBlock("source", b->getInt32(mSelectedStreamBase + i), srcBlockNo);
237        Value * A = b->simd_srlv(fw, b->mvmd_dsll(fw, source[i], pendingDataPhi[i], field_offset_lo), bit_offset);
238        Value * B = b->simd_sllv(fw, b->mvmd_dsll(fw, source[i], pendingDataPhi[i], field_offset_hi), shift_fwd);
239        Value * full_source_block = b->simd_or(A, B);
240        Value * C = b->simd_srlv(fw, b->mvmd_shuffle(fw, full_source_block, source_field_lo), source_shift_lo);
241        Value * D = b->simd_sllv(fw, b->mvmd_shuffle(fw, full_source_block, source_field_hi), source_shift_hi);
242        Value * output = b->bitCast(b->simd_or(C, D));
243        b->storeOutputStreamBlock("output", b->getInt32(i), blockNoPhi, output);
244    }
245    //
246    // Update loop control Phis for the next iteration.
247    //
248    Value * nextBlk = b->CreateAdd(blockNoPhi, b->getSize(1));
249    blockNoPhi->addIncoming(nextBlk, expandLoop);
250    Value * newPending = b->CreateAdd(pendingOffsetPhi, blockPopCount);
251    Value * isNewBlock = b->CreateICmpNE(srcBlockNo, b->CreateUDiv(b->CreateAdd(newPending, bw_sub1Const), bwConst));
252
253    pendingOffsetPhi->addIncoming(newPending, expandLoop);
254    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
255        pendingDataPhi[i]->addIncoming(b->CreateSelect(isNewBlock, source[i], pendingDataPhi[i]), expandLoop);
256    }
257    //
258    // Now continue the loop if there are more blocks to process.
259    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
260    b->CreateCondBr(moreToDo, expandLoop, expansionDone);
261   
262    b->SetInsertPoint(expansionDone);
263}
264
265FieldDepositKernel::FieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
266: MultiBlockKernel("FieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
267                   {Binding{kb->getStreamSetTy(1), "depositMask"}, Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"}},
268                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
269                   {}, {}, {})
270, mFieldWidth(fieldWidth)
271, mStreamCount(streamCount) {
272}
273   
274void FieldDepositKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
275    BasicBlock * entry = kb->GetInsertBlock();
276    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
277    BasicBlock * done = kb->CreateBasicBlock("done");
278    Constant * const ZERO = kb->getSize(0);
279    kb->CreateBr(processBlock);
280    kb->SetInsertPoint(processBlock);
281    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
282    blockOffsetPhi->addIncoming(ZERO, entry);
283    Value * depositMask = kb->loadInputStreamBlock("depositMask", ZERO, blockOffsetPhi);
284    for (unsigned j = 0; j < mStreamCount; ++j) {
285        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
286        Value * output = kb->simd_pdep(mFieldWidth, input, depositMask);
287        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
288    }
289    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
290    blockOffsetPhi->addIncoming(nextBlk, processBlock);
291    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
292    kb->CreateCondBr(moreToDo, processBlock, done);
293    kb->SetInsertPoint(done);
294}
295
296PDEPFieldDepositKernel::PDEPFieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount, std::string suffix)
297: MultiBlockKernel("PDEPFieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount) + suffix,
298                   {Binding{kb->getStreamSetTy(), "depositMask"}, Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"}},
299                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
300                   {}, {}, {})
301, mPDEPWidth(fieldWidth)
302, mStreamCount(streamCount) {
303    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PDEP width for PDEPFieldDepositKernel");
304}
305
306void PDEPFieldDepositKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
307    Type * fieldTy = kb->getIntNTy(mPDEPWidth);
308    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
309    Constant * PDEP_func = nullptr;
310    if (mPDEPWidth == 64) {
311        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
312    } else if (mPDEPWidth == 32) {
313        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
314    }
315    BasicBlock * entry = kb->GetInsertBlock();
316    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
317    BasicBlock * done = kb->CreateBasicBlock("done");
318    Constant * const ZERO = kb->getSize(0);
319    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPDEPWidth;
320    kb->CreateBr(processBlock);
321    kb->SetInsertPoint(processBlock);
322    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
323    blockOffsetPhi->addIncoming(ZERO, entry);
324    std::vector<Value *> mask(fieldsPerBlock);
325//  When operating on fields individually, we can use vector load/store with
326//  extract/insert element operations, or we can use individual field load
327//  and stores.   Individual field operations require fewer total operations,
328//  but more memory instructions.   It may be that vector load/extract is better,
329//  while field store is better.   Vector insert then store creates long dependence
330//  chains.
331//
332#define PREFER_FIELD_STORES_OVER_INSERT_ELEMENT
333#ifdef PREFER_FIELD_LOADS_OVER_EXTRACT_ELEMENT
334    Value * depositMaskPtr = kb->getInputStreamBlockPtr("depositMask", ZERO, blockOffsetPhi);
335    depositMaskPtr = kb->CreatePointerCast(depositMaskPtr, fieldPtrTy);
336    for (unsigned i = 0; i < fieldsPerBlock; i++) {
337        mask[i] = kb->CreateLoad(kb->CreateGEP(depositMaskPtr, kb->getInt32(i)));
338    }
339#else
340    Value * depositMask = kb->fwCast(mPDEPWidth, kb->loadInputStreamBlock("depositMask", ZERO, blockOffsetPhi));
341    for (unsigned i = 0; i < fieldsPerBlock; i++) {
342        mask[i] = kb->CreateExtractElement(depositMask, kb->getInt32(i));
343    }
344#endif
345    for (unsigned j = 0; j < mStreamCount; ++j) {
346#ifdef PREFER_FIELD_LOADS_OVER_EXTRACT_ELEMENT
347        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
348        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
349#else
350        Value * inputStrm = kb->fwCast(mPDEPWidth, kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi));
351#endif
352#ifdef PREFER_FIELD_STORES_OVER_INSERT_ELEMENT
353        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
354        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
355#else
356        Value * outputStrm = kb->fwCast(mPDEPWidth, kb->allZeroes());
357#endif
358        for (unsigned i = 0; i < fieldsPerBlock; i++) {
359#ifdef PREFER_FIELD_LOADS_OVER_EXTRACT_ELEMENT
360            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
361#else
362            Value * field = kb->CreateExtractElement(inputStrm, kb->getInt32(i));
363#endif
364            Value * compressed = kb->CreateCall(PDEP_func, {field, mask[i]});
365#ifdef PREFER_FIELD_STORES_OVER_INSERT_ELEMENT
366            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
367        }
368#else
369            outputStrm = kb->CreateInsertElement(outputStrm, compressed, kb->getInt32(i));
370        }
371        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, outputStrm);
372#endif
373    }
374    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
375    blockOffsetPhi->addIncoming(nextBlk, processBlock);
376    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
377    kb->CreateCondBr(moreToDo, processBlock, done);
378    kb->SetInsertPoint(done);
379}
380
381void StreamDepositCompiler::makeCall(parabix::StreamSetBuffer * depositMask, parabix::StreamSetBuffer * inputs, parabix::StreamSetBuffer * outputs) {
382    if (mBufferBlocks == 0) {
383        llvm::report_fatal_error("StreamDepositCompiler needs a non-zero bufferBlocks parameter (for now).");
384    }
385    auto & b = mDriver.getBuilder();
386    unsigned N = mSelectedStreamCount;
387    parabix::StreamSetBuffer * expandedStreams = mDriver.addBuffer<parabix::StaticBuffer>(b, b->getStreamSetTy(N), mBufferBlocks);
388    Kernel * streamK = mDriver.addKernelInstance<StreamExpandKernel>(b, mFieldWidth, mSourceStreamCount, mSelectedStreamBase, N);
389    mDriver.makeKernelCall(streamK, {depositMask, inputs}, {expandedStreams});
390
391    Kernel * depositK = nullptr;
392    if (AVX2_available()) {
393        depositK = mDriver.addKernelInstance<PDEPFieldDepositKernel>(b, mFieldWidth, N, std::to_string(mSelectedStreamBase));
394    } else {
395        depositK = mDriver.addKernelInstance<FieldDepositKernel>(b, mFieldWidth, N);
396    }
397    mDriver.makeKernelCall(depositK, {depositMask, expandedStreams}, {outputs});
398}
399
400}
401
Note: See TracBrowser for help on using the repository browser.