source: icGREP/icgrep-devel/icgrep/kernels/kernel.cpp @ 5442

Last change on this file since 5442 was 5442, checked in by cameron, 2 years ago

Bug fixes for MultiBlockKernel, StdOutKernel

File size: 52.3 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "kernel.h"
7#include <toolchain/toolchain.h>
8#include <kernels/streamset.h>
9#include <llvm/IR/Constants.h>
10#include <llvm/IR/Function.h>
11#include <llvm/IR/Instructions.h>
12#include <llvm/IR/MDBuilder.h>
13#include <llvm/IR/Module.h>
14#include <llvm/Support/raw_ostream.h>
15#include <llvm/Bitcode/ReaderWriter.h>
16#include <llvm/Transforms/Utils/Local.h>
17#include <kernels/streamset.h>
18#include <sstream>
19#include <kernels/kernel_builder.h>
20
21using namespace llvm;
22using namespace parabix;
23
24namespace kernel {
25
26const std::string Kernel::DO_BLOCK_SUFFIX = "_DoBlock";
27const std::string Kernel::FINAL_BLOCK_SUFFIX = "_FinalBlock";
28const std::string Kernel::MULTI_BLOCK_SUFFIX = "_MultiBlock";
29const std::string Kernel::LOGICAL_SEGMENT_NO_SCALAR = "logicalSegNo";
30const std::string Kernel::PROCESSED_ITEM_COUNT_SUFFIX = "_processedItemCount";
31const std::string Kernel::CONSUMED_ITEM_COUNT_SUFFIX = "_consumedItemCount";
32const std::string Kernel::PRODUCED_ITEM_COUNT_SUFFIX = "_producedItemCount";
33const std::string Kernel::TERMINATION_SIGNAL = "terminationSignal";
34const std::string Kernel::BUFFER_PTR_SUFFIX = "_bufferPtr";
35const std::string Kernel::CONSUMER_SUFFIX = "_consumerLocks";
36
37unsigned Kernel::addScalar(Type * const type, const std::string & name) {
38    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
39        report_fatal_error("Cannot add field " + name + " to " + getName() + " after kernel state finalized");
40    }
41    if (LLVM_UNLIKELY(mKernelMap.count(name))) {
42        report_fatal_error(getName() + " already contains scalar field " + name);
43    }
44    const auto index = mKernelFields.size();
45    mKernelMap.emplace(name, index);
46    mKernelFields.push_back(type);
47    return index;
48}
49
50unsigned Kernel::addUnnamedScalar(Type * const type) {
51    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
52        report_fatal_error("Cannot add unnamed field  to " + getName() + " after kernel state finalized");
53    }
54    const auto index = mKernelFields.size();
55    mKernelFields.push_back(type);
56    return index;
57}
58
59void Kernel::prepareStreamSetNameMap() {
60    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
61        mStreamMap.emplace(mStreamSetInputs[i].name, std::make_pair(Port::Input, i));
62    }
63    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
64        mStreamMap.emplace(mStreamSetOutputs[i].name, std::make_pair(Port::Output, i));
65    }
66}
67   
68void Kernel::createKernelStub(const std::unique_ptr<KernelBuilder> & idb, const StreamSetBuffers & inputs, const StreamSetBuffers & outputs) {
69    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
70    assert ("IDISA Builder does not have a valid Module" && idb->getModule());
71    std::stringstream cacheName;   
72    cacheName << getName() << '_' << idb->getBuilderUniqueName();
73    for (const StreamSetBuffer * b: inputs) {
74        cacheName <<  ':' <<  b->getUniqueID();
75    }
76    for (const StreamSetBuffer * b: outputs) {
77        cacheName <<  ':' <<  b->getUniqueID();
78    }
79    Module * const kernelModule = new Module(cacheName.str(), idb->getContext());
80    createKernelStub(idb, inputs, outputs, kernelModule);
81}
82
83void Kernel::createKernelStub(const std::unique_ptr<KernelBuilder> & idb, const StreamSetBuffers & inputs, const StreamSetBuffers & outputs, Module * const kernelModule) {
84    assert (mModule == nullptr);
85    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
86    assert (mStreamSetInputBuffers.empty());
87    assert (mStreamSetOutputBuffers.empty());
88
89    if (LLVM_UNLIKELY(mStreamSetInputs.size() != inputs.size())) {
90        report_fatal_error(getName() + ": expected " + std::to_string(mStreamSetInputs.size()) +
91                           " input stream sets but was given "
92                           + std::to_string(inputs.size()));
93    }
94
95    for (unsigned i = 0; i < inputs.size(); ++i) {
96        StreamSetBuffer * const buf = inputs[i];
97        if (LLVM_UNLIKELY(buf == nullptr)) {
98            report_fatal_error(getName() + ": input stream set " + std::to_string(i)
99                               + " cannot be null");
100        }
101        buf->addConsumer(this);
102    }
103
104    if (LLVM_UNLIKELY(mStreamSetOutputs.size() != outputs.size())) {
105        report_fatal_error(getName() + ": expected " + std::to_string(mStreamSetOutputs.size())
106                           + " output stream sets but was given "
107                           + std::to_string(outputs.size()));
108    }
109
110    for (unsigned i = 0; i < outputs.size(); ++i) {
111        StreamSetBuffer * const buf = outputs[i];
112        if (LLVM_UNLIKELY(buf == nullptr)) {
113            report_fatal_error(getName() + ": output stream set " + std::to_string(i) + " cannot be null");
114        }
115        if (LLVM_LIKELY(buf->getProducer() == nullptr)) {
116            buf->setProducer(this);
117        } else {
118            report_fatal_error(getName() + ": output stream set " + std::to_string(i)
119                               + " is already produced by kernel " + buf->getProducer()->getName());
120        }
121    }
122
123    mModule = kernelModule;
124    mStreamSetInputBuffers.assign(inputs.begin(), inputs.end());
125    mStreamSetOutputBuffers.assign(outputs.begin(), outputs.end());
126    prepareKernel(idb);
127}
128
129void Kernel::prepareKernel(const std::unique_ptr<KernelBuilder> & idb) {
130    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
131    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
132        report_fatal_error("Cannot prepare kernel after kernel state finalized");
133    }
134    if (mStreamSetInputs.size() != mStreamSetInputBuffers.size()) {
135        std::string tmp;
136        raw_string_ostream out(tmp);
137        out << "kernel contains " << mStreamSetInputBuffers.size() << " input buffers for "
138            << mStreamSetInputs.size() << " input stream sets.";
139        report_fatal_error(out.str());
140    }
141    if (mStreamSetOutputs.size() != mStreamSetOutputBuffers.size()) {
142        std::string tmp;
143        raw_string_ostream out(tmp);
144        out << "kernel contains " << mStreamSetOutputBuffers.size() << " output buffers for "
145            << mStreamSetOutputs.size() << " output stream sets.";
146        report_fatal_error(out.str());
147    }
148    const auto blockSize = idb->getBitBlockWidth();
149    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
150        if ((mStreamSetInputBuffers[i]->getBufferBlocks() > 0) && (mStreamSetInputBuffers[i]->getBufferBlocks() < codegen::SegmentSize + (blockSize + mLookAheadPositions - 1)/blockSize)) {
151            report_fatal_error("Kernel preparation: Buffer size too small " + mStreamSetInputs[i].name);
152        }
153        mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getPointerType(), mStreamSetInputs[i].name + BUFFER_PTR_SUFFIX);
154        if ((i == 0) || !mStreamSetInputs[i].rate.isExact()) {
155            addScalar(idb->getSizeTy(), mStreamSetInputs[i].name + PROCESSED_ITEM_COUNT_SUFFIX);
156        }
157    }
158
159    IntegerType * const sizeTy = idb->getSizeTy();
160    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
161        mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getPointerType(), mStreamSetOutputs[i].name + BUFFER_PTR_SUFFIX);
162        if ((mStreamSetInputs.empty() && (i == 0)) || !mStreamSetOutputs[i].rate.isExact()) {
163            addScalar(sizeTy, mStreamSetOutputs[i].name + PRODUCED_ITEM_COUNT_SUFFIX);
164        }
165    }
166    for (const auto binding : mScalarInputs) {
167        addScalar(binding.type, binding.name);
168    }
169    for (const auto binding : mScalarOutputs) {
170        addScalar(binding.type, binding.name);
171    }
172    if (mStreamMap.empty()) {
173        prepareStreamSetNameMap();
174    }
175    for (auto binding : mInternalScalars) {
176        addScalar(binding.type, binding.name);
177    }
178
179    Type * const consumerSetTy = StructType::get(sizeTy, sizeTy->getPointerTo()->getPointerTo(), nullptr)->getPointerTo();
180    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
181        addScalar(consumerSetTy, mStreamSetOutputs[i].name + CONSUMER_SUFFIX);
182    }
183
184    addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
185    addScalar(idb->getInt1Ty(), TERMINATION_SIGNAL);
186
187    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
188        addScalar(sizeTy, mStreamSetOutputs[i].name + CONSUMED_ITEM_COUNT_SUFFIX);
189    }
190
191    mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
192}
193
194// Default kernel signature: generate the IR and emit as byte code.
195std::string Kernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> & idb) {
196    assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
197    if (LLVM_LIKELY(moduleIDisSignature())) {
198        return getModule()->getModuleIdentifier();
199    } else {
200        generateKernel(idb);
201        std::string signature;
202        raw_string_ostream OS(signature);
203        WriteBitcodeToFile(getModule(), OS);
204        return signature;
205    }
206}
207
208void Kernel::generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb) {
209    assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
210    // If the module id cannot uniquely identify this kernel, "generateKernelSignature()" will have already
211    // generated the unoptimized IR.
212    if (!mIsGenerated) {
213        const auto m = idb->getModule();
214        const auto ip = idb->saveIP();
215        const auto saveInstance = getInstance();
216        idb->setModule(mModule);
217        addKernelDeclarations(idb);
218        callGenerateInitializeMethod(idb);
219        callGenerateDoSegmentMethod(idb);
220        callGenerateFinalizeMethod(idb);
221        setInstance(saveInstance);
222        idb->setModule(m);
223        idb->restoreIP(ip);
224        mIsGenerated = true;
225    }
226}
227
228inline void Kernel::callGenerateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
229    mCurrentMethod = getInitFunction(idb->getModule());
230    idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
231    Function::arg_iterator args = mCurrentMethod->arg_begin();
232    setInstance(&*(args++));
233    idb->CreateStore(ConstantAggregateZero::get(mKernelStateType), getInstance());
234    for (const auto & binding : mScalarInputs) {
235        idb->setScalarField(binding.name, &*(args++));
236    }
237    for (const auto & binding : mStreamSetOutputs) {
238        idb->setConsumerLock(binding.name, &*(args++));
239    }
240    generateInitializeMethod(idb);
241    idb->CreateRetVoid();
242}
243
244inline void Kernel::callGenerateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
245    mCurrentMethod = getDoSegmentFunction(idb->getModule());
246    idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
247    auto args = mCurrentMethod->arg_begin();
248    setInstance(&*(args++));
249    mIsFinal = &*(args++);
250    const auto n = mStreamSetInputs.size();
251    mAvailableItemCount.resize(n, nullptr);
252    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
253        mAvailableItemCount[i] = &*(args++);
254    }
255    generateDoSegmentMethod(idb); // must be overridden by the KernelBuilder subtype
256    mIsFinal = nullptr;
257    mAvailableItemCount.clear();
258    idb->CreateRetVoid();
259    //CurrentMethod->dump();
260}
261
262inline void Kernel::callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb) {
263    mCurrentMethod = getTerminateFunction(idb->getModule());
264    idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
265    auto args = mCurrentMethod->arg_begin();
266    setInstance(&*(args++));
267    generateFinalizeMethod(idb); // may be overridden by the KernelBuilder subtype
268    const auto n = mScalarOutputs.size();
269    if (n == 0) {
270        idb->CreateRetVoid();
271    } else {
272        Value * outputs[n];
273        for (unsigned i = 0; i < n; ++i) {
274            outputs[i] = idb->getScalarField(mScalarOutputs[i].name);
275        }
276        if (n == 1) {
277            idb->CreateRet(outputs[0]);
278        } else {
279            idb->CreateAggregateRet(outputs, n);
280        }
281    }
282}
283
284unsigned Kernel::getScalarIndex(const std::string & name) const {
285    const auto f = mKernelMap.find(name);
286    if (LLVM_UNLIKELY(f == mKernelMap.end())) {
287        assert (false);
288        report_fatal_error(getName() + " does not contain scalar: " + name);
289    }
290    return f->second;
291}
292
293Value * Kernel::createInstance(const std::unique_ptr<KernelBuilder> & idb) {
294    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
295    if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
296        report_fatal_error("Cannot instantiate " + getName() + " before calling prepareKernel()");
297    }
298    setInstance(idb->CreateCacheAlignedAlloca(mKernelStateType));
299    return getInstance();
300}
301
302void Kernel::initializeInstance(const std::unique_ptr<KernelBuilder> & idb) {
303    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
304    if (LLVM_UNLIKELY(getInstance() == nullptr)) {
305        report_fatal_error("Cannot initialize " + getName() + " before calling createInstance()");
306    }
307    std::vector<Value *> args;
308    args.reserve(1 + mInitialArguments.size() + mStreamSetInputBuffers.size() + (mStreamSetOutputBuffers.size() * 2));
309    args.push_back(getInstance());
310    for (unsigned i = 0; i < mInitialArguments.size(); ++i) {
311        Value * arg = mInitialArguments[i];
312        if (LLVM_UNLIKELY(arg == nullptr)) {
313            report_fatal_error(getName() + ": initial argument " + std::to_string(i)
314                               + " cannot be null when calling createInstance()");
315        }
316        args.push_back(arg);
317    }
318    for (unsigned i = 0; i < mStreamSetInputBuffers.size(); ++i) {
319        assert (mStreamSetInputBuffers[i]);
320        Value * arg = mStreamSetInputBuffers[i]->getStreamSetBasePtr();
321        if (LLVM_UNLIKELY(arg == nullptr)) {
322            report_fatal_error(getName() + ": input stream set " + std::to_string(i)
323                               + " was not allocated prior to calling createInstance()");
324        }
325        args.push_back(arg);
326    }
327    assert (mStreamSetInputs.size() == mStreamSetInputBuffers.size());
328    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
329        assert (mStreamSetOutputBuffers[i]);
330        Value * arg = mStreamSetOutputBuffers[i]->getStreamSetBasePtr();
331        if (LLVM_UNLIKELY(arg == nullptr)) {
332            report_fatal_error(getName() + ": output stream set " + std::to_string(i)
333                               + " was not allocated prior to calling createInstance()");
334        }
335        args.push_back(arg);
336    }
337    assert (mStreamSetOutputs.size() == mStreamSetOutputBuffers.size());
338    IntegerType * const sizeTy = idb->getSizeTy();
339    PointerType * const sizePtrTy = sizeTy->getPointerTo();
340    PointerType * const sizePtrPtrTy = sizePtrTy->getPointerTo();
341    StructType * const consumerTy = StructType::get(sizeTy, sizePtrPtrTy, nullptr);
342    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
343        const auto output = mStreamSetOutputBuffers[i];
344        const auto & consumers = output->getConsumers();
345        const auto n = consumers.size();
346        AllocaInst * const outputConsumers = idb->CreateAlloca(consumerTy);
347        Value * const consumerSegNoArray = idb->CreateAlloca(ArrayType::get(sizePtrTy, n));
348        for (unsigned i = 0; i < n; ++i) {
349            Kernel * const consumer = consumers[i];
350            assert ("all instances must be created prior to initialization of any instance" && consumer->getInstance());
351            idb->setKernel(consumer);
352            Value * const segmentNoPtr = idb->getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR);
353            idb->CreateStore(segmentNoPtr, idb->CreateGEP(consumerSegNoArray, { idb->getInt32(0), idb->getInt32(i) }));
354        }
355        idb->setKernel(this);
356        Value * const consumerCountPtr = idb->CreateGEP(outputConsumers, {idb->getInt32(0), idb->getInt32(0)});
357        idb->CreateStore(idb->getSize(n), consumerCountPtr);
358        Value * const consumerSegNoArrayPtr = idb->CreateGEP(outputConsumers, {idb->getInt32(0), idb->getInt32(1)});
359        idb->CreateStore(idb->CreatePointerCast(consumerSegNoArray, sizePtrPtrTy), consumerSegNoArrayPtr);
360        args.push_back(outputConsumers);
361    }
362    idb->CreateCall(getInitFunction(idb->getModule()), args);
363}
364
365//  The default doSegment method dispatches to the doBlock routine for
366//  each block of the given number of blocksToDo, and then updates counts.
367
368void BlockOrientedKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & idb) {
369    BasicBlock * const entryBlock = idb->GetInsertBlock();
370    BasicBlock * const strideLoopCond = idb->CreateBasicBlock(getName() + "_strideLoopCond");
371    mStrideLoopBody = idb->CreateBasicBlock(getName() + "_strideLoopBody");
372    BasicBlock * const stridesDone = idb->CreateBasicBlock(getName() + "_stridesDone");
373    BasicBlock * const doFinalBlock = idb->CreateBasicBlock(getName() + "_doFinalBlock");
374    BasicBlock * const segmentDone = idb->CreateBasicBlock(getName() + "_segmentDone");
375
376    Value * baseTarget = nullptr;
377    if (idb->supportsIndirectBr()) {
378        baseTarget = idb->CreateSelect(mIsFinal, BlockAddress::get(doFinalBlock), BlockAddress::get(segmentDone));
379    }
380
381    ConstantInt * stride = idb->getSize(idb->getStride());
382    Value * availablePos = mAvailableItemCount[0];
383    Value * processed = idb->getProcessedItemCount(mStreamSetInputs[0].name);
384    Value * itemsAvail = idb->CreateSub(availablePos, processed);
385    Value * stridesToDo = idb->CreateUDiv(itemsAvail, stride);
386
387    idb->CreateBr(strideLoopCond);
388
389    idb->SetInsertPoint(strideLoopCond);
390
391    PHINode * branchTarget = nullptr;
392    if (idb->supportsIndirectBr()) {
393        branchTarget = idb->CreatePHI(baseTarget->getType(), 2, "branchTarget");
394        branchTarget->addIncoming(baseTarget, entryBlock);
395    }
396
397    PHINode * const stridesRemaining = idb->CreatePHI(idb->getSizeTy(), 2, "stridesRemaining");
398    stridesRemaining->addIncoming(stridesToDo, entryBlock);
399    // NOTE: stridesRemaining may go to a negative number in the final block if the generateFinalBlockMethod(...)
400    // calls CreateDoBlockMethodCall(). Do *not* replace the comparator with an unsigned one!
401    Value * notDone = idb->CreateICmpSGT(stridesRemaining, idb->getSize(0));
402    idb->CreateLikelyCondBr(notDone, mStrideLoopBody, stridesDone);
403
404    idb->SetInsertPoint(mStrideLoopBody);
405
406    if (idb->supportsIndirectBr()) {
407        mStrideLoopTarget = idb->CreatePHI(baseTarget->getType(), 2, "strideTarget");
408        mStrideLoopTarget->addIncoming(branchTarget, strideLoopCond);
409    }
410
411    /// GENERATE DO BLOCK METHOD
412
413    writeDoBlockMethod(idb);
414
415    /// UPDATE PROCESSED COUNTS
416
417    processed = idb->getProcessedItemCount(mStreamSetInputs[0].name);
418    Value * itemsDone = idb->CreateAdd(processed, stride);
419    idb->setProcessedItemCount(mStreamSetInputs[0].name, itemsDone);
420
421    stridesRemaining->addIncoming(idb->CreateSub(stridesRemaining, idb->getSize(1)), idb->GetInsertBlock());
422
423    BasicBlock * bodyEnd = idb->GetInsertBlock();
424    if (idb->supportsIndirectBr()) {
425        branchTarget->addIncoming(mStrideLoopTarget, bodyEnd);
426    }
427    idb->CreateBr(strideLoopCond);
428
429    stridesDone->moveAfter(bodyEnd);
430
431    idb->SetInsertPoint(stridesDone);
432
433    // Now conditionally perform the final block processing depending on the doFinal parameter.
434    if (idb->supportsIndirectBr()) {
435        mStrideLoopBranch = idb->CreateIndirectBr(branchTarget, 3);
436        mStrideLoopBranch->addDestination(doFinalBlock);
437        mStrideLoopBranch->addDestination(segmentDone);
438    } else {
439        idb->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone);
440    }
441
442    doFinalBlock->moveAfter(stridesDone);
443
444    idb->SetInsertPoint(doFinalBlock);
445
446    Value * remainingItems = idb->CreateSub(mAvailableItemCount[0], idb->getProcessedItemCount(mStreamSetInputs[0].name));
447
448    writeFinalBlockMethod(idb, remainingItems);
449
450    itemsDone = mAvailableItemCount[0];
451    idb->setProcessedItemCount(mStreamSetInputs[0].name, itemsDone);
452    idb->setTerminationSignal();
453    idb->CreateBr(segmentDone);
454
455    segmentDone->moveAfter(idb->GetInsertBlock());
456
457    idb->SetInsertPoint(segmentDone);
458
459    // Update the branch prediction metadata to indicate that the likely target will be segmentDone
460    if (idb->supportsIndirectBr()) {
461        MDBuilder mdb(idb->getContext());
462        const auto destinations = mStrideLoopBranch->getNumDestinations();
463        uint32_t weights[destinations];
464        for (unsigned i = 0; i < destinations; ++i) {
465            weights[i] = (mStrideLoopBranch->getDestination(i) == segmentDone) ? 100 : 1;
466        }
467        ArrayRef<uint32_t> bw(weights, destinations);
468        mStrideLoopBranch->setMetadata(LLVMContext::MD_prof, mdb.createBranchWeights(bw));
469    }
470
471}
472
473inline void BlockOrientedKernel::writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb) {
474
475    Value * const self = getInstance();
476    Function * const cp = mCurrentMethod;
477    auto ip = idb->saveIP();
478
479    /// Check if the do block method is called and create the function if necessary   
480    if (!idb->supportsIndirectBr()) {
481        FunctionType * const type = FunctionType::get(idb->getVoidTy(), {self->getType()}, false);
482        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, idb->getModule());
483        mCurrentMethod->setCallingConv(CallingConv::C);
484        mCurrentMethod->setDoesNotThrow();
485        mCurrentMethod->setDoesNotCapture(1);
486        auto args = mCurrentMethod->arg_begin();
487        args->setName("self");
488        setInstance(&*args);
489        idb->SetInsertPoint(idb->CreateBasicBlock("entry"));
490    }
491
492    std::vector<Value *> priorProduced;
493    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
494        if (isa<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]) || isa<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
495            priorProduced.push_back(idb->getProducedItemCount(mStreamSetOutputs[i].name));
496        }
497    }
498
499    generateDoBlockMethod(idb); // must be implemented by the BlockOrientedKernelBuilder subtype
500
501    unsigned priorIdx = 0;
502    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
503        Value * log2BlockSize = idb->getSize(std::log2(idb->getBitBlockWidth()));
504        if (SwizzledCopybackBuffer * const cb = dyn_cast<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
505            BasicBlock * copyBack = idb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
506            BasicBlock * done = idb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
507            Value * newlyProduced = idb->CreateSub(idb->getProducedItemCount(mStreamSetOutputs[i].name), priorProduced[priorIdx]);
508            Value * priorBlock = idb->CreateLShr(priorProduced[priorIdx], log2BlockSize);
509            Value * priorOffset = idb->CreateAnd(priorProduced[priorIdx], idb->getSize(idb->getBitBlockWidth() - 1));
510            Value * instance = idb->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
511            Value * accessibleBlocks = cb->getLinearlyAccessibleBlocks(idb.get(), priorBlock);
512            Value * accessible = idb->CreateSub(idb->CreateShl(accessibleBlocks, log2BlockSize), priorOffset);
513            Value * wraparound = idb->CreateICmpULT(accessible, newlyProduced);
514            idb->CreateCondBr(wraparound, copyBack, done);
515            idb->SetInsertPoint(copyBack);
516            Value * copyItems = idb->CreateSub(newlyProduced, accessible);
517            cb->createCopyBack(idb.get(), instance, copyItems);
518            idb->CreateBr(done);
519            idb->SetInsertPoint(done);
520            priorIdx++;
521        }
522        if (CircularCopybackBuffer * const cb = dyn_cast<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
523            BasicBlock * copyBack = idb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
524            BasicBlock * done = idb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
525            Value * instance = idb->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
526            Value * newlyProduced = idb->CreateSub(idb->getProducedItemCount(mStreamSetOutputs[i].name), priorProduced[priorIdx]);
527            Value * accessible = cb->getLinearlyAccessibleItems(idb.get(), priorProduced[priorIdx]);
528            Value * wraparound = idb->CreateICmpULT(accessible, newlyProduced);
529            idb->CreateCondBr(wraparound, copyBack, done);
530            idb->SetInsertPoint(copyBack);
531            Value * copyItems = idb->CreateSub(newlyProduced, accessible);
532            cb->createCopyBack(idb.get(), instance, copyItems);
533            idb->CreateBr(done);
534            idb->SetInsertPoint(done);
535            priorIdx++;
536        }
537    }
538
539
540    /// Call the do block method if necessary then restore the current function state to the do segement method
541    if (!idb->supportsIndirectBr()) {
542        idb->CreateRetVoid();
543        mDoBlockMethod = mCurrentMethod;
544        idb->restoreIP(ip);
545        idb->CreateCall(mCurrentMethod, self);
546        setInstance(self);
547        mCurrentMethod = cp;
548    }
549
550}
551
552inline void BlockOrientedKernel::writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * remainingItems) {
553
554    Value * const self = getInstance();
555    Function * const cp = mCurrentMethod;
556    Value * const remainingItemCount = remainingItems;
557    auto ip = idb->saveIP();
558
559    if (!idb->supportsIndirectBr()) {
560        FunctionType * const type = FunctionType::get(idb->getVoidTy(), {self->getType(), idb->getSizeTy()}, false);
561        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, idb->getModule());
562        mCurrentMethod->setCallingConv(CallingConv::C);
563        mCurrentMethod->setDoesNotThrow();
564        mCurrentMethod->setDoesNotCapture(1);
565        auto args = mCurrentMethod->arg_begin();
566        args->setName("self");
567        setInstance(&*args);
568        remainingItems = &*(++args);
569        remainingItems->setName("remainingItems");
570        idb->SetInsertPoint(idb->CreateBasicBlock("entry"));
571    }
572
573    generateFinalBlockMethod(idb, remainingItems); // may be implemented by the BlockOrientedKernel subtype
574
575    RecursivelyDeleteTriviallyDeadInstructions(remainingItems); // if remainingItems was not used, this will eliminate it.
576
577    if (!idb->supportsIndirectBr()) {
578        idb->CreateRetVoid();
579        idb->restoreIP(ip);
580        idb->CreateCall(mCurrentMethod, {self, remainingItemCount});
581        mCurrentMethod = cp;
582        setInstance(self);
583    }
584
585}
586
587//  The default finalBlock method simply dispatches to the doBlock routine.
588void BlockOrientedKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * /* remainingItems */) {
589    CreateDoBlockMethodCall(idb);
590}
591
592void BlockOrientedKernel::CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & idb) {
593    if (idb->supportsIndirectBr()) {
594        BasicBlock * bb = idb->CreateBasicBlock("resume");
595        mStrideLoopBranch->addDestination(bb);
596        mStrideLoopTarget->addIncoming(BlockAddress::get(bb), idb->GetInsertBlock());
597        idb->CreateBr(mStrideLoopBody);
598        bb->moveAfter(idb->GetInsertBlock());
599        idb->SetInsertPoint(bb);
600    } else {
601        idb->CreateCall(mDoBlockMethod, getInstance());
602    }
603}
604
605void MultiBlockKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) {
606
607    KernelBuilder * const iBuilder = kb.get();
608    auto ip = iBuilder->saveIP();
609    Function * const cp = mCurrentMethod;
610   
611    // First prepare the multi-block method that will be used.
612
613    std::vector<Type *> multiBlockParmTypes;
614    multiBlockParmTypes.push_back(mKernelStateType->getPointerTo());
615    multiBlockParmTypes.push_back(iBuilder->getSizeTy());
616    for (auto buffer : mStreamSetInputBuffers) {
617        multiBlockParmTypes.push_back(buffer->getPointerType());
618    }
619    for (auto buffer : mStreamSetOutputBuffers) {
620        multiBlockParmTypes.push_back(buffer->getPointerType());
621    }
622   
623    FunctionType * const type = FunctionType::get(iBuilder->getVoidTy(), multiBlockParmTypes, false);
624    Function * multiBlockFunction = Function::Create(type, GlobalValue::InternalLinkage, getName() + MULTI_BLOCK_SUFFIX, iBuilder->getModule());
625    multiBlockFunction->setCallingConv(CallingConv::C);
626    multiBlockFunction->setDoesNotThrow();
627    auto args = multiBlockFunction->arg_begin();
628    args->setName("self");
629    (++args)->setName("itemsToDo");
630    for (auto binding : mStreamSetInputs) {
631        (++args)->setName(binding.name + "BufPtr");
632    }
633    for (auto binding : mStreamSetOutputs) {
634        (++args)->setName(binding.name + "BufPtr");
635    }
636
637    // Now use the generateMultiBlockLogic method of the MultiBlockKernelBuilder subtype to
638    // provide the required multi-block kernel logic.
639    mCurrentMethod = multiBlockFunction;
640    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "multiBlockEntry", multiBlockFunction, 0));
641    generateMultiBlockLogic(kb);
642
643    iBuilder->CreateRetVoid();
644   
645    iBuilder->restoreIP(ip);
646    mCurrentMethod = cp;
647   
648    // Now proceed with creation of the doSegment method.
649
650    BasicBlock * const entry = iBuilder->GetInsertBlock();
651    BasicBlock * const doSegmentOuterLoop = iBuilder->CreateBasicBlock(getName() + "_doSegmentOuterLoop");
652    BasicBlock * const doMultiBlockCall = iBuilder->CreateBasicBlock(getName() + "_doMultiBlockCall");
653    BasicBlock * const tempBlockCheck = iBuilder->CreateBasicBlock(getName() + "_tempBlockCheck");
654    BasicBlock * const doTempBufferBlock = iBuilder->CreateBasicBlock(getName() + "_doTempBufferBlock");
655    BasicBlock * const segmentDone = iBuilder->CreateBasicBlock(getName() + "_segmentDone");
656
657    Value * blockBaseMask = iBuilder->CreateNot(iBuilder->getSize(iBuilder->getBitBlockWidth() - 1));
658
659    //
660    //  A. Temporary Buffer Area Determination
661    //
662    // For final block processing and for processing near the end of physical buffer
663    // boundaries, we need to allocate temporary space for processing a full block of input.
664    // Compute the size requirements to store stream set data at the declared processing
665    // rates in reference to one block of the principal input stream.
666    //
667
668    unsigned bitBlockWidth = iBuilder->getBitBlockWidth();
669    std::vector<Type *> tempBuffers;
670    std::vector<unsigned> itemsPerPrincipalBlock;
671    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
672        auto & rate = mStreamSetInputs[i].rate;
673        std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
674        if (refSet.empty()) {
675            itemsPerPrincipalBlock.push_back(rate.calculateRatio(bitBlockWidth));
676        }
677        else {
678            Port port; unsigned ssIdx;
679            std::tie(port, ssIdx) = getStreamPort(mStreamSetInputs[i].name);
680            assert (port == Port::Input && ssIdx < i);
681            itemsPerPrincipalBlock.push_back(rate.calculateRatio(itemsPerPrincipalBlock[ssIdx]));
682        }
683        unsigned blocks = (itemsPerPrincipalBlock.back() + bitBlockWidth - 1)/bitBlockWidth +2;
684        if (blocks > 1) {
685            tempBuffers.push_back(ArrayType::get(mStreamSetInputBuffers[i]->getType(), blocks));
686        }
687        else {
688            tempBuffers.push_back(mStreamSetInputBuffers[i]->getType());
689        }
690    }
691    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
692        auto & rate = mStreamSetOutputs[i].rate;
693        std::string refSet = mStreamSetOutputs[i].rate.referenceStreamSet();
694        if (refSet.empty()) {
695            itemsPerPrincipalBlock.push_back(rate.calculateRatio(bitBlockWidth));
696        }
697        else {
698            Port port; unsigned ssIdx;
699            std::tie(port, ssIdx) = getStreamPort(mStreamSetOutputs[i].name);
700            if (port == Port::Output) ssIdx += mStreamSetInputs.size();
701            itemsPerPrincipalBlock.push_back(rate.calculateRatio(itemsPerPrincipalBlock[ssIdx]));
702        }
703        unsigned blocks = (itemsPerPrincipalBlock.back() + bitBlockWidth - 1)/bitBlockWidth +2;
704        if (blocks > 1) {
705            tempBuffers.push_back(ArrayType::get(mStreamSetOutputBuffers[i]->getType(), blocks));
706        }
707        else {
708            tempBuffers.push_back(mStreamSetOutputBuffers[i]->getType());
709        }
710    }
711    Type * tempParameterStructType = StructType::create(iBuilder->getContext(), tempBuffers);
712    Value * tempParameterArea = iBuilder->CreateCacheAlignedAlloca(tempParameterStructType);
713
714    ConstantInt * blockSize = iBuilder->getSize(iBuilder->getBitBlockWidth());
715
716    Value * availablePos = mAvailableItemCount[0];
717    Value * itemsAvail = availablePos;
718    //  Make sure that corresponding data is available depending on processing rate
719    //  for all input stream sets.
720    for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
721        Value * a = mAvailableItemCount[i];
722        auto & rate = mStreamSetInputs[i].rate;
723        assert (((rate.referenceStreamSet().empty()) || (rate.referenceStreamSet() == mStreamSetInputs[0].name)) && "Multiblock kernel input rate not with respect to principal stream.");
724        Value * maxItems = rate.CreateMaxReferenceItemsCalculation(iBuilder, a);
725        itemsAvail = iBuilder->CreateSelect(iBuilder->CreateICmpULT(itemsAvail, maxItems), itemsAvail, maxItems);
726    }
727
728    Value * processed = iBuilder->getProcessedItemCount(mStreamSetInputs[0].name);
729    Value * itemsToDo = iBuilder->CreateSub(itemsAvail, processed);
730    Value * fullBlocksToDo = iBuilder->CreateUDiv(itemsToDo, blockSize);
731    Value * excessItems = iBuilder->CreateURem(itemsToDo, blockSize);
732
733    //  Now we iteratively process these blocks using the doMultiBlock method.
734    //  In each iteration, we process the maximum number of linearly accessible
735    //  blocks on the principal input, reduced to ensure that the corresponding
736    //  data is linearly available at the specified processing rates for the other inputs,
737    //  and that each of the output buffers has sufficient linearly available space
738    //  (using overflow areas, if necessary) for the maximum output that can be
739    //  produced.
740
741    //iBuilder->CreateCondBr(iBuilder->CreateICmpUGT(fullBlocksToDo, iBuilder->getSize(0)), doSegmentOuterLoop, finalBlockCheck);
742   
743    iBuilder->CreateBr(doSegmentOuterLoop);
744    iBuilder->SetInsertPoint(doSegmentOuterLoop);
745    PHINode * const blocksRemaining = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2, "blocksRemaining");
746    blocksRemaining->addIncoming(fullBlocksToDo, entry);
747    // For each input buffer, determine the processedItemCount, the block pointer for the
748    // buffer block containing the next item, and the number of linearly available items.
749    //
750    std::vector<Value *> processedItemCount;
751    std::vector<Value *> inputBlockPtr;
752    std::vector<Value *> producedItemCount;
753    std::vector<Value *> outputBlockPtr;
754
755    //  Now determine the linearly available blocks, based on blocks remaining reduced
756    //  by limitations of linearly available input buffer space.
757    Value * linearlyAvailBlocks = blocksRemaining;
758    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
759        Value * p = iBuilder->getProcessedItemCount(mStreamSetInputs[i].name);
760        Value * blkNo = iBuilder->CreateUDiv(p, blockSize);
761        Value * b = iBuilder->getInputStreamBlockPtr(mStreamSetInputs[i].name, iBuilder->getInt32(0));
762        processedItemCount.push_back(p);
763        inputBlockPtr.push_back(b);
764        auto & rate = mStreamSetInputs[i].rate;
765        Value * blocks = nullptr;
766        if ((rate.isFixedRatio()) && (rate.getRatioNumerator() == rate.getRatioDenominator()) && (rate.referenceStreamSet() == "")) {
767            blocks = mStreamSetInputBuffers[i]->getLinearlyAccessibleBlocks(iBuilder, blkNo);
768        } else {
769            Value * linearlyAvailItems = mStreamSetInputBuffers[i]->getLinearlyAccessibleItems(iBuilder, p);
770            Value * items = rate.CreateMaxReferenceItemsCalculation(iBuilder, linearlyAvailItems);
771            blocks = iBuilder->CreateUDiv(items, blockSize);
772        }
773        linearlyAvailBlocks = iBuilder->CreateSelect(iBuilder->CreateICmpULT(blocks, linearlyAvailBlocks), blocks, linearlyAvailBlocks);
774    }
775    //  Now determine the linearly writeable blocks, based on available blocks reduced
776    //  by limitations of output buffer space.
777    Value * linearlyWritableBlocks = linearlyAvailBlocks;
778
779    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
780        Value * p = iBuilder->getProducedItemCount(mStreamSetOutputs[i].name);
781        Value * blkNo = iBuilder->CreateUDiv(p, blockSize);
782        Value * b = iBuilder->getOutputStreamBlockPtr(mStreamSetOutputs[i].name, iBuilder->getInt32(0));
783        producedItemCount.push_back(p);
784        outputBlockPtr.push_back(b);
785        auto & rate = mStreamSetOutputs[i].rate;
786        Value * blocks = nullptr;
787        if ((rate.isFixedRatio()) && (rate.getRatioNumerator() == rate.getRatioDenominator())) {
788            blocks = mStreamSetOutputBuffers[0]->getLinearlyWritableBlocks(iBuilder, blkNo);
789        } else {
790            Value * writableItems = mStreamSetOutputBuffers[0]->getLinearlyWritableItems(iBuilder, p);
791            blocks = iBuilder->CreateUDiv(writableItems, blockSize);
792        }
793        linearlyWritableBlocks = iBuilder->CreateSelect(iBuilder->CreateICmpULT(blocks, linearlyWritableBlocks), blocks, linearlyWritableBlocks);
794    }
795    Value * haveBlocks = iBuilder->CreateICmpUGT(linearlyWritableBlocks, iBuilder->getSize(0));
796    iBuilder->CreateCondBr(haveBlocks, doMultiBlockCall, tempBlockCheck);
797
798    //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
799    //  Now prepare the doMultiBlock call.
800    iBuilder->SetInsertPoint(doMultiBlockCall);
801
802    Value * linearlyAvailItems = iBuilder->CreateMul(linearlyWritableBlocks, blockSize);
803
804    std::vector<Value *> doMultiBlockArgs;
805    doMultiBlockArgs.push_back(getInstance());
806    doMultiBlockArgs.push_back(linearlyAvailItems);
807    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
808        Value * bufPtr = iBuilder->getRawInputPointer(mStreamSetInputs[i].name, iBuilder->getInt32(0), processedItemCount[i]);
809        bufPtr = iBuilder->CreatePointerCast(bufPtr, mStreamSetInputBuffers[i]->getPointerType());
810        doMultiBlockArgs.push_back(bufPtr);
811    }
812    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
813        Value * bufPtr = iBuilder->getRawOutputPointer(mStreamSetOutputs[i].name, iBuilder->getInt32(0), producedItemCount[i]);
814        bufPtr = iBuilder->CreatePointerCast(bufPtr, mStreamSetOutputBuffers[i]->getPointerType());
815        doMultiBlockArgs.push_back(bufPtr);
816    }
817
818    iBuilder->CreateCall(multiBlockFunction, doMultiBlockArgs);
819    // Do copybacks if necessary.
820    unsigned priorIdx = 0;
821    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
822        Value * log2BlockSize = iBuilder->getSize(std::log2(iBuilder->getBitBlockWidth()));
823        if (auto cb = dyn_cast<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
824            BasicBlock * copyBack = iBuilder->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
825            BasicBlock * done = iBuilder->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
826            Value * newlyProduced = iBuilder->CreateSub(iBuilder->getProducedItemCount(mStreamSetOutputs[i].name), producedItemCount[i]);
827            Value * priorBlock = iBuilder->CreateLShr(producedItemCount[i], log2BlockSize);
828            Value * priorOffset = iBuilder->CreateAnd(producedItemCount[i], iBuilder->getSize(iBuilder->getBitBlockWidth() - 1));
829            Value * instance = iBuilder->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
830            Value * accessibleBlocks = cb->getLinearlyAccessibleBlocks(iBuilder, priorBlock);
831            Value * accessible = iBuilder->CreateSub(iBuilder->CreateShl(accessibleBlocks, log2BlockSize), priorOffset);
832            Value * wraparound = iBuilder->CreateICmpULT(accessible, newlyProduced);
833            iBuilder->CreateCondBr(wraparound, copyBack, done);
834            iBuilder->SetInsertPoint(copyBack);
835            Value * copyItems = iBuilder->CreateSub(newlyProduced, accessible);
836            cb->createCopyBack(iBuilder, instance, copyItems);
837            iBuilder->CreateBr(done);
838            iBuilder->SetInsertPoint(done);
839            priorIdx++;
840        }
841        if (auto cb = dyn_cast<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
842            BasicBlock * copyBack = iBuilder->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
843            BasicBlock * done = iBuilder->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
844            Value * instance = iBuilder->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
845            Value * newlyProduced = iBuilder->CreateSub(iBuilder->getProducedItemCount(mStreamSetOutputs[i].name), producedItemCount[i]);
846            Value * accessible = cb->getLinearlyAccessibleItems(iBuilder, producedItemCount[i]);
847            Value * wraparound = iBuilder->CreateICmpULT(accessible, newlyProduced);
848            iBuilder->CreateCondBr(wraparound, copyBack, done);
849            iBuilder->SetInsertPoint(copyBack);
850            Value * copyItems = iBuilder->CreateSub(newlyProduced, accessible);
851            cb->createCopyBack(iBuilder, instance, copyItems);
852            iBuilder->CreateBr(done);
853            iBuilder->SetInsertPoint(done);
854            priorIdx++;
855        }
856    }
857    iBuilder->setProcessedItemCount(mStreamSetInputs[0].name, iBuilder->CreateAdd(processed, linearlyAvailItems));
858    Value * reducedBlocksToDo = iBuilder->CreateSub(blocksRemaining, linearlyWritableBlocks);
859    Value * fullBlocksRemain = iBuilder->CreateICmpUGT(reducedBlocksToDo, iBuilder->getSize(0));
860    BasicBlock * multiBlockFinal = iBuilder->GetInsertBlock();
861    blocksRemaining->addIncoming(reducedBlocksToDo, multiBlockFinal);
862    iBuilder->CreateCondBr(fullBlocksRemain, doSegmentOuterLoop, tempBlockCheck);
863    //iBuilder->CreateBr(doSegmentOuterLoop);
864    //
865    // We use temporary buffers in 3 different cases that preclude full block processing.
866    // (a) One or more input buffers does not have a sufficient number of input items linearly available.
867    // (b) One or more output buffers does not have sufficient linearly available buffer space.
868    // (c) We have processed all the full blocks of input and only the excessItems remain.
869    // In each case we set up temporary buffers for input and output and then
870    // call the Multiblock routine.
871    //
872
873    iBuilder->SetInsertPoint(tempBlockCheck);
874    PHINode * const tempBlocksRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2, "tempBlocksRemain");
875    tempBlocksRemain->addIncoming(blocksRemaining, doSegmentOuterLoop);
876    tempBlocksRemain->addIncoming(reducedBlocksToDo, multiBlockFinal);
877   
878    haveBlocks = iBuilder->CreateICmpUGT(tempBlocksRemain, iBuilder->getSize(0));
879    iBuilder->CreateCondBr(iBuilder->CreateOr(mIsFinal, haveBlocks), doTempBufferBlock, segmentDone);
880
881    //
882    // We use temporary buffers in 3 different cases that preclude full block processing.
883    // (a) One or more input buffers does not have a sufficient number of input items linearly available.
884    // (b) One or more output buffers does not have sufficient linearly available buffer space.
885    // (c) We have processed all the full blocks of input and only the excessItems remain.
886    // In each case we set up temporary buffers for input and output and then
887    // call the Multiblock routine.
888    //
889    iBuilder->SetInsertPoint(doTempBufferBlock);
890    Value * tempBlockItems = iBuilder->CreateSelect(haveBlocks, blockSize, excessItems);
891
892    // Begin constructing the doMultiBlock args.
893    std::vector<Value *> tempArgs;
894    tempArgs.push_back(getInstance());
895    tempArgs.push_back(tempBlockItems);
896
897    // Prepare the temporary buffer area.
898    //
899    // First zero it out.
900    Constant * const tempAreaSize = ConstantExpr::getIntegerCast(ConstantExpr::getSizeOf(tempParameterStructType), iBuilder->getSizeTy(), false);
901    iBuilder->CreateMemZero(tempParameterArea, tempAreaSize);
902   
903    // For each input and output buffer, copy over necessary data starting from the last
904    // block boundary.
905    std::vector<Value *> finalItemPos;
906    finalItemPos.push_back(iBuilder->CreateAdd(processedItemCount[0], tempBlockItems));
907
908    for (unsigned i = 0; i < mStreamSetInputBuffers.size(); i++) {
909        Value * tempBufPtr = iBuilder->CreateGEP(tempParameterArea, iBuilder->getInt32(i));
910        tempBufPtr = iBuilder->CreatePointerCast(tempBufPtr, mStreamSetInputBuffers[i]->getPointerType());
911
912        Value * blockItemPos = iBuilder->CreateAnd(processedItemCount[i], blockBaseMask);
913
914        // The number of items to copy is determined by the processing rate requirements.
915        if (i > 1) {
916            auto & rate = mStreamSetInputs[i].rate;
917            std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
918            if (refSet.empty()) {
919                finalItemPos.push_back(rate.CreateRatioCalculation(iBuilder, finalItemPos[0], iBuilder->CreateNot(haveBlocks)));
920            }
921            else {
922                Port port; unsigned ssIdx;
923                std::tie(port, ssIdx) = getStreamPort(mStreamSetInputs[i].name);
924                assert (port == Port::Input && ssIdx < i);
925                finalItemPos.push_back(rate.CreateRatioCalculation(iBuilder, finalItemPos[ssIdx], iBuilder->CreateNot(haveBlocks)));
926            }
927        }
928        Value * neededItems = iBuilder->CreateSub(finalItemPos[i], blockItemPos);
929        Value * availFromBase = mStreamSetInputBuffers[i]->getLinearlyAccessibleItems(iBuilder, blockItemPos);
930        Value * copyItems1 = iBuilder->CreateSelect(iBuilder->CreateICmpULT(neededItems, availFromBase), neededItems, availFromBase);
931        Value * copyItems2 = iBuilder->CreateSub(neededItems, copyItems1);
932        Value * inputPtr = iBuilder->getInputStreamBlockPtr(mStreamSetInputs[i].name, iBuilder->getInt32(0));
933        mStreamSetInputBuffers[i]->createBlockAlignedCopy(iBuilder, tempBufPtr, inputPtr, copyItems1);
934        Value * nextBufPtr = iBuilder->CreateGEP(tempBufPtr, iBuilder->CreateUDiv(availFromBase, blockSize));
935        mStreamSetInputBuffers[i]->createBlockAlignedCopy(iBuilder, nextBufPtr, iBuilder->getStreamSetBufferPtr(mStreamSetInputs[i].name), copyItems2);
936        Value * itemAddress = iBuilder->CreatePtrToInt(iBuilder->getRawInputPointer(mStreamSetInputs[i].name, iBuilder->getInt32(0), processedItemCount[i]), iBuilder->getSizeTy());
937        Value * baseAddress = iBuilder->CreatePtrToInt(inputBlockPtr[i], iBuilder->getSizeTy());
938        Value * tempAddress = iBuilder->CreateAdd(iBuilder->CreatePtrToInt(tempBufPtr, iBuilder->getSizeTy()), iBuilder->CreateSub(itemAddress, baseAddress));
939        tempArgs.push_back(iBuilder->CreateIntToPtr(tempAddress, mStreamSetInputBuffers[i]->getPointerType()));
940    }
941
942    std::vector<Value *> blockItemPos;
943    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
944        Value * tempBufPtr = iBuilder->CreateGEP(tempParameterArea, iBuilder->getInt32(mStreamSetInputs.size() + i));
945        tempBufPtr = iBuilder->CreatePointerCast(tempBufPtr, mStreamSetOutputBuffers[i]->getPointerType());
946        blockItemPos.push_back(iBuilder->CreateAnd(producedItemCount[i], blockBaseMask));
947        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(iBuilder, tempBufPtr, outputBlockPtr[i], iBuilder->CreateSub(producedItemCount[i], blockItemPos[i]));
948        Value * itemAddress = iBuilder->CreatePtrToInt(iBuilder->getRawOutputPointer(mStreamSetInputs[i].name, iBuilder->getInt32(0), producedItemCount[i]), iBuilder->getSizeTy());
949        Value * outputPtr = iBuilder->getOutputStreamBlockPtr(mStreamSetOutputs[i].name, iBuilder->getInt32(0));
950        Value * baseAddress = iBuilder->CreatePtrToInt(outputPtr, iBuilder->getSizeTy());
951        Value * tempAddress = iBuilder->CreateAdd(iBuilder->CreatePtrToInt(tempBufPtr, iBuilder->getSizeTy()), iBuilder->CreateSub(itemAddress, baseAddress));
952        tempArgs.push_back(iBuilder->CreateIntToPtr(tempAddress, mStreamSetOutputBuffers[i]->getPointerType()));
953    }
954
955   
956    iBuilder->CreateCall(multiBlockFunction, tempArgs);
957
958    // Copy back data to the actual output buffers.
959
960    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
961        Value * tempBufPtr = iBuilder->CreateGEP(tempParameterArea, iBuilder->getInt32(mStreamSetInputs.size() + i));
962        tempBufPtr = iBuilder->CreatePointerCast(tempBufPtr, mStreamSetOutputBuffers[i]->getPointerType());
963        Value * final_items = iBuilder->getProducedItemCount(mStreamSetOutputs[i].name);
964        Value * copyItems = iBuilder->CreateSub(final_items, blockItemPos[i]);
965        Value * copyItems1 = mStreamSetOutputBuffers[i]->getLinearlyWritableItems(iBuilder, blockItemPos[i]); // must be a whole number of blocks.
966        Value * outputPtr = iBuilder->getOutputStreamBlockPtr(mStreamSetOutputs[i].name, iBuilder->getInt32(0));
967        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(iBuilder, outputPtr, tempBufPtr, copyItems1);
968        Value * copyItems2 = iBuilder->CreateSelect(iBuilder->CreateICmpULT(copyItems, copyItems), iBuilder->getSize(0), iBuilder->CreateSub(copyItems, copyItems1));
969        tempBufPtr = iBuilder->CreateGEP(tempBufPtr, iBuilder->CreateUDiv(copyItems1, blockSize));
970        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(iBuilder, iBuilder->getStreamSetBufferPtr(mStreamSetOutputs[i].name), tempBufPtr, copyItems2);
971    }
972
973    iBuilder->setProcessedItemCount(mStreamSetInputs[0].name, finalItemPos[0]);
974
975    //  We've dealt with the partial block processing and copied information back into the
976    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
977    //
978    blocksRemaining->addIncoming(iBuilder->CreateSub(tempBlocksRemain, iBuilder->CreateZExt(haveBlocks, iBuilder->getSizeTy())), iBuilder->GetInsertBlock());
979    iBuilder->CreateCondBr(haveBlocks, doSegmentOuterLoop, segmentDone);
980    iBuilder->SetInsertPoint(segmentDone);
981}
982
983void Kernel::finalizeInstance(const std::unique_ptr<KernelBuilder> & idb) {
984    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
985    mOutputScalarResult = idb->CreateCall(getTerminateFunction(idb->getModule()), { getInstance() });
986}
987
988Kernel::StreamPort Kernel::getStreamPort(const std::string & name) const {
989    const auto f = mStreamMap.find(name);
990    if (LLVM_UNLIKELY(f == mStreamMap.end())) {
991        report_fatal_error(getName() + " does not contain stream set " + name);
992    }
993    return f->second;
994}
995
996// CONSTRUCTOR
997Kernel::Kernel(std::string && kernelName,
998                             std::vector<Binding> && stream_inputs,
999                             std::vector<Binding> && stream_outputs,
1000                             std::vector<Binding> && scalar_parameters,
1001                             std::vector<Binding> && scalar_outputs,
1002                             std::vector<Binding> && internal_scalars)
1003: KernelInterface(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
1004, mCurrentMethod(nullptr)
1005, mNoTerminateAttribute(false)
1006, mIsGenerated(false)
1007, mIsFinal(nullptr)
1008, mOutputScalarResult(nullptr) {
1009
1010}
1011
1012Kernel::~Kernel() {
1013
1014}
1015
1016// CONSTRUCTOR
1017BlockOrientedKernel::BlockOrientedKernel(std::string && kernelName,
1018                                         std::vector<Binding> && stream_inputs,
1019                                         std::vector<Binding> && stream_outputs,
1020                                         std::vector<Binding> && scalar_parameters,
1021                                         std::vector<Binding> && scalar_outputs,
1022                                         std::vector<Binding> && internal_scalars)
1023: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
1024, mDoBlockMethod(nullptr)
1025, mStrideLoopBody(nullptr)
1026, mStrideLoopBranch(nullptr)
1027, mStrideLoopTarget(nullptr) {
1028
1029}
1030
1031// CONSTRUCTOR
1032MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
1033                                     std::vector<Binding> && stream_inputs,
1034                                     std::vector<Binding> && stream_outputs,
1035                                     std::vector<Binding> && scalar_parameters,
1036                                     std::vector<Binding> && scalar_outputs,
1037                                             std::vector<Binding> && internal_scalars)
1038: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
1039   
1040}
1041
1042// CONSTRUCTOR
1043SegmentOrientedKernel::SegmentOrientedKernel(std::string && kernelName,
1044                                             std::vector<Binding> && stream_inputs,
1045                                             std::vector<Binding> && stream_outputs,
1046                                             std::vector<Binding> && scalar_parameters,
1047                                             std::vector<Binding> && scalar_outputs,
1048                                             std::vector<Binding> && internal_scalars)
1049: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
1050   
1051}
1052   
1053}
Note: See TracBrowser for help on using the repository browser.