source: icGREP/icgrep-devel/icgrep/kernels/kernel.cpp @ 5761

Last change on this file since 5761 was 5761, checked in by nmedfort, 15 months ago

Cache signature is now written into .kernel bitcode file. Minor bug fix and revision of GrepEngine::DoGrepThreadMethod.

File size: 71.5 KB
Line 
1/*
2 *  Copyright (c) 2016-7 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "kernel.h"
7#include <toolchain/toolchain.h>
8#include <kernels/streamset.h>
9#include <llvm/IR/Constants.h>
10#include <llvm/IR/Function.h>
11#include <llvm/IR/Instructions.h>
12#include <llvm/IR/MDBuilder.h>
13#include <llvm/IR/Module.h>
14#include <llvm/Support/raw_ostream.h>
15#if LLVM_VERSION_INTEGER < LLVM_4_0_0
16#include <llvm/Bitcode/ReaderWriter.h>
17#else
18#include <llvm/Bitcode/BitcodeWriter.h>
19#endif
20#include <llvm/Transforms/Utils/Local.h>
21#include <kernels/streamset.h>
22#include <sstream>
23#include <kernels/kernel_builder.h>
24#include <boost/math/common_factor.hpp>
25#include <llvm/Support/Debug.h>
26
27using namespace llvm;
28using namespace parabix;
29using namespace boost::math;
30
31namespace kernel {
32
33const std::string Kernel::DO_BLOCK_SUFFIX = "_DoBlock"; // not referenced in this part of the file; presumably a function-name suffix -- TODO confirm
34const std::string Kernel::FINAL_BLOCK_SUFFIX = "_FinalBlock"; // not referenced in this part of the file; presumably a function-name suffix -- TODO confirm
35const std::string Kernel::MULTI_BLOCK_SUFFIX = "_MultiBlock"; // not referenced in this part of the file; presumably a function-name suffix -- TODO confirm
36const std::string Kernel::LOGICAL_SEGMENT_NO_SCALAR = "logicalSegNo"; // name of a size-typed state scalar added by addBaseKernelProperties
37const std::string Kernel::PROCESSED_ITEM_COUNT_SUFFIX = "_processedItemCount"; // appended to each input stream set name to name a state scalar
38const std::string Kernel::CONSUMED_ITEM_COUNT_SUFFIX = "_consumedItemCount"; // appended to each output stream set name to name a state scalar
39const std::string Kernel::PRODUCED_ITEM_COUNT_SUFFIX = "_producedItemCount"; // appended to each output stream set name to name a state scalar
40const std::string Kernel::TERMINATION_SIGNAL = "terminationSignal"; // name of an i1 state scalar added by addBaseKernelProperties
41const std::string Kernel::BUFFER_PTR_SUFFIX = "_bufferPtr"; // appended to each stream set name to name its buffer-handle scalar input
42const std::string Kernel::CONSUMER_SUFFIX = "_consumerLocks"; // appended to each output stream set name to name its consumer-set scalar
43const std::string Kernel::CYCLECOUNT_SCALAR = "CPUcycles"; // name of the 64-bit cycle counter scalar compiled into every kernel
44
45/** ------------------------------------------------------------------------------------------------------------- *
46 * @brief addScalar
47 ** ------------------------------------------------------------------------------------------------------------- */
48unsigned Kernel::addScalar(Type * const type, const std::string & name) {
49    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
50        report_fatal_error("Cannot add field " + name + " to " + getName() + " after kernel state finalized");
51    }
52    if (LLVM_UNLIKELY(mKernelFieldMap.count(name))) {
53        report_fatal_error(getName() + " already contains scalar field " + name);
54    }
55    const auto index = mKernelFields.size();
56    mKernelFieldMap.emplace(name, index);
57    mKernelFields.push_back(type);
58    return index;
59}
60
61
62/** ------------------------------------------------------------------------------------------------------------- *
63 * @brief addUnnamedScalar
64 ** ------------------------------------------------------------------------------------------------------------- */
65unsigned Kernel::addUnnamedScalar(Type * const type) {
66    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
67        report_fatal_error("Cannot add unnamed field  to " + getName() + " after kernel state finalized");
68    }
69    const auto index = mKernelFields.size();
70    mKernelFields.push_back(type);
71    return index;
72}
73
74
75/** ------------------------------------------------------------------------------------------------------------- *
76 * @brief prepareStreamSetNameMap
77 ** ------------------------------------------------------------------------------------------------------------- */
78void Kernel::prepareStreamSetNameMap() {
79    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
80        mStreamMap.emplace(mStreamSetInputs[i].getName(), std::make_pair(Port::Input, i));
81    }
82    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
83        mStreamMap.emplace(mStreamSetOutputs[i].getName(), std::make_pair(Port::Output, i));
84    }
85}
86
87
88/** ------------------------------------------------------------------------------------------------------------- *
89 * @brief bindPorts
90 ** ------------------------------------------------------------------------------------------------------------- */
91void Kernel::bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs) {
92    assert (mModule == nullptr);
93    assert (mStreamSetInputBuffers.empty());
94    assert (mStreamSetOutputBuffers.empty());
95
96    if (LLVM_UNLIKELY(mStreamSetInputs.size() != inputs.size())) {
97        report_fatal_error(getName() + ": expected " + std::to_string(mStreamSetInputs.size()) +
98                           " input stream sets but was given "
99                           + std::to_string(inputs.size()));
100    }
101
102    for (unsigned i = 0; i < inputs.size(); ++i) {
103        StreamSetBuffer * const buf = inputs[i];
104        if (LLVM_UNLIKELY(buf == nullptr)) {
105            report_fatal_error(getName() + ": input stream set " + std::to_string(i)
106                               + " cannot be null");
107        }
108        buf->addConsumer(this);
109    }
110
111    if (LLVM_UNLIKELY(mStreamSetOutputs.size() != outputs.size())) {
112        report_fatal_error(getName() + ": expected " + std::to_string(mStreamSetOutputs.size())
113                           + " output stream sets but was given "
114                           + std::to_string(outputs.size()));
115    }
116
117    for (unsigned i = 0; i < outputs.size(); ++i) {
118        StreamSetBuffer * const buf = outputs[i];
119        if (LLVM_UNLIKELY(buf == nullptr)) {
120            report_fatal_error(getName() + ": output stream set " + std::to_string(i) + " cannot be null");
121        }
122        if (LLVM_LIKELY(buf->getProducer() == nullptr)) {
123            buf->setProducer(this);
124        } else {
125            report_fatal_error(getName() + ": output stream set " + std::to_string(i)
126                               + " is already produced by kernel " + buf->getProducer()->getName());
127        }
128    }
129
130    mStreamSetInputBuffers.assign(inputs.begin(), inputs.end());
131    mStreamSetOutputBuffers.assign(outputs.begin(), outputs.end());
132}
133
134
135/** ------------------------------------------------------------------------------------------------------------- *
136 * @brief getCacheName
137 ** ------------------------------------------------------------------------------------------------------------- */
138std::string Kernel::getCacheName(const std::unique_ptr<KernelBuilder> & idb) const {
139    std::stringstream cacheName;
140    cacheName << getName() << '_' << idb->getBuilderUniqueName();
141    for (const StreamSetBuffer * b: mStreamSetInputBuffers) {
142        cacheName <<  ':' <<  b->getUniqueID();
143    }
144    for (const StreamSetBuffer * b: mStreamSetOutputBuffers) {
145        cacheName <<  ':' <<  b->getUniqueID();
146    }
147    return cacheName.str();
148}
149
150
151/** ------------------------------------------------------------------------------------------------------------- *
152 * @brief setModule
153 ** ------------------------------------------------------------------------------------------------------------- */
154Module * Kernel::setModule(Module * const module) {
155    assert (mModule == nullptr || mModule == module);
156    assert (module != nullptr);
157    mModule = module;
158    return mModule;
159}
160
161
162/** ------------------------------------------------------------------------------------------------------------- *
163 * @brief makeModule
164 ** ------------------------------------------------------------------------------------------------------------- */
165Module * Kernel::makeModule(const std::unique_ptr<kernel::KernelBuilder> & idb) {
166    Module * m = new Module(getCacheName(idb), idb->getContext());
167    m->setTargetTriple(idb->getModule()->getTargetTriple());
168    m->setDataLayout(idb->getModule()->getDataLayout());
169    return setModule(m);
170}
171
172
173/** ------------------------------------------------------------------------------------------------------------- *
174 * @brief prepareKernel
175 ** ------------------------------------------------------------------------------------------------------------- */
176void Kernel::prepareKernel(const std::unique_ptr<KernelBuilder> & idb) {
177    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
178    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
179        report_fatal_error(getName() + ": cannot prepare kernel after kernel state finalized");
180    }
181    addBaseKernelProperties(idb);
182    addInternalKernelProperties(idb);
183    // NOTE: StructType::create always creates a new type even if an identical one exists.
184    if (LLVM_UNLIKELY(mModule == nullptr)) {
185        makeModule(idb);
186    }
187    mKernelStateType = mModule->getTypeByName(getName());
188    if (LLVM_LIKELY(mKernelStateType == nullptr)) {
189        mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
190        assert (mKernelStateType);
191    }
192}
193
194
195/** ------------------------------------------------------------------------------------------------------------- *
196 * @brief prepareCachedKernel
197 ** ------------------------------------------------------------------------------------------------------------- */
198void Kernel::prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb) {
199    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
200    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
201        report_fatal_error(getName() + ": cannot prepare kernel after kernel state finalized");
202    }
203    assert (getModule());
204    addBaseKernelProperties(idb);
205    mKernelStateType = getModule()->getTypeByName(getName());
206    if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
207        report_fatal_error("Kernel definition for " + getName() + " could not be found in the cache object");
208    }
209}
210
211/** ------------------------------------------------------------------------------------------------------------- *
212 * @brief addBaseKernelProperties
213 ** ------------------------------------------------------------------------------------------------------------- */
214void Kernel::addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb) {
215
216    if (mStreamMap.empty()) {
217        prepareStreamSetNameMap();
218    }
219
220    normalizeStreamProcessingRates();
221
222    const unsigned inputSetCount = mStreamSetInputs.size();
223    const unsigned outputSetCount = mStreamSetOutputs.size();
224
225    assert (inputSetCount == mStreamSetInputBuffers.size());
226    assert (outputSetCount == mStreamSetOutputBuffers.size());
227
228    if (mStride == 0) {
229        // Set the default kernel stride.
230        mStride = idb->getBitBlockWidth();
231    }
232
233    IntegerType * const sizeTy = idb->getSizeTy();
234
235    for (unsigned i = 0; i < inputSetCount; i++) {
236        const Binding & b = mStreamSetInputs[i];
237        //const ProcessingRate & rate = b.getRate();
238        //if (rate.isBounded() || rate.isUnknown()) {
239            addScalar(sizeTy, b.getName() + PROCESSED_ITEM_COUNT_SUFFIX);
240        //}
241    }
242
243    for (unsigned i = 0; i < outputSetCount; i++) {
244        const Binding & b = mStreamSetOutputs[i];
245        //const ProcessingRate & rate = b.getRate();
246        //if (rate.isBounded() || rate.isUnknown()) {
247            addScalar(sizeTy, b.getName() + PRODUCED_ITEM_COUNT_SUFFIX);
248        //}
249    }
250
251    for (unsigned i = 0; i < inputSetCount; i++) {
252        mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetInputs[i].getName() + BUFFER_PTR_SUFFIX);
253    }
254    for (unsigned i = 0; i < outputSetCount; i++) {
255        mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetOutputs[i].getName() + BUFFER_PTR_SUFFIX);
256    }
257    for (const auto & binding : mScalarInputs) {
258        addScalar(binding.getType(), binding.getName());
259    }
260    for (const auto & binding : mScalarOutputs) {
261        addScalar(binding.getType(), binding.getName());
262    }
263    for (const auto & binding : mInternalScalars) {
264        addScalar(binding.getType(), binding.getName());
265    }
266    Type * const consumerSetTy = StructType::get(idb->getContext(), {sizeTy, sizeTy->getPointerTo()->getPointerTo()})->getPointerTo();
267    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
268        addScalar(consumerSetTy, mStreamSetOutputs[i].getName() + CONSUMER_SUFFIX);
269    }
270    addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
271    addScalar(idb->getInt1Ty(), TERMINATION_SIGNAL);
272    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
273        addScalar(sizeTy, mStreamSetOutputs[i].getName() + CONSUMED_ITEM_COUNT_SUFFIX);
274    }
275    // We compile in a 64-bit CPU cycle counter into every kernel.   It will remain unused
276    // in normal execution, but when codegen::EnableCycleCounter is specified, pipelines
277    // will be able to add instrumentation to cached modules without recompilation.
278    addScalar(idb->getInt64Ty(), CYCLECOUNT_SCALAR);
279
280}
281
282
283/** ------------------------------------------------------------------------------------------------------------- *
284 * @brief makeSignature
285 *
286 * Default kernel signature: generate the IR and emit as byte code.
287 ** ------------------------------------------------------------------------------------------------------------- */
288std::string Kernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> & idb) {
289    assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
290    if (LLVM_UNLIKELY(hasSignature())) {
291        generateKernel(idb);
292        std::string signature;
293        raw_string_ostream OS(signature);
294        WriteBitcodeToFile(getModule(), OS);
295        return signature;
296    } else {
297        return getModule()->getModuleIdentifier();
298    }
299}
300
301
302/** ------------------------------------------------------------------------------------------------------------- *
303 * @brief generateKernel
304 ** ------------------------------------------------------------------------------------------------------------- */
305void Kernel::generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb) {
306    assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
307    // If the module id cannot uniquely identify this kernel, "generateKernelSignature()" will have already
308    // generated the unoptimized IR.
309    if (!mIsGenerated) {
310        const auto m = idb->getModule();
311        const auto ip = idb->saveIP();
312        // const auto saveInstance = getInstance();
313        idb->setModule(mModule);
314        addKernelDeclarations(idb);
315        callGenerateInitializeMethod(idb);
316        callGenerateDoSegmentMethod(idb);
317        callGenerateFinalizeMethod(idb);
318        // setInstance(saveInstance);
319        idb->setModule(m);
320        idb->restoreIP(ip);
321        mIsGenerated = true;
322    }
323}
324
325
326/** ------------------------------------------------------------------------------------------------------------- *
327 * @brief callGenerateInitializeMethod
328 ** ------------------------------------------------------------------------------------------------------------- */
329inline void Kernel::callGenerateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
330    mCurrentMethod = getInitFunction(idb->getModule());
331    idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
332    Function::arg_iterator args = mCurrentMethod->arg_begin();
333    setInstance(&*(args++));
334    idb->CreateStore(ConstantAggregateZero::get(mKernelStateType), getInstance());
335    for (const auto & binding : mScalarInputs) {
336        idb->setScalarField(binding.getName(), &*(args++));
337    }
338    for (const auto & binding : mStreamSetOutputs) {
339        idb->setConsumerLock(binding.getName(), &*(args++));
340    }
341    generateInitializeMethod(idb);
342    idb->CreateRetVoid();
343}
344
345/** ------------------------------------------------------------------------------------------------------------- *
346 * @brief callGenerateDoSegmentMethod
347 ** ------------------------------------------------------------------------------------------------------------- */
348inline void Kernel::callGenerateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
349    mCurrentMethod = getDoSegmentFunction(idb->getModule());
350    idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
351    auto args = mCurrentMethod->arg_begin();
352    setInstance(&*(args++));
353    mIsFinal = &*(args++);
354    mAvailablePrincipalItemCount = nullptr;
355    const auto n = mStreamSetInputs.size();
356    mAvailableItemCount.resize(n, nullptr);
357    for (unsigned i = 0; i < n; i++) {
358        assert (args != mCurrentMethod->arg_end());
359        mAvailableItemCount[i] = &*(args++);
360    }
361    assert (args == mCurrentMethod->arg_end());
362    generateKernelMethod(idb); // must be overridden by the Kernel subtype
363    mIsFinal = nullptr;
364    mAvailableItemCount.clear();
365    idb->CreateRetVoid();
366}
367
368
369/** ------------------------------------------------------------------------------------------------------------- *
370 * @brief callGenerateFinalizeMethod
371 ** ------------------------------------------------------------------------------------------------------------- */
372inline void Kernel::callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb) {
373    mCurrentMethod = getTerminateFunction(idb->getModule());
374    idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
375    auto args = mCurrentMethod->arg_begin();
376    setInstance(&*(args++));
377    generateFinalizeMethod(idb); // may be overridden by the Kernel subtype
378    const auto n = mScalarOutputs.size();
379    if (n == 0) {
380        idb->CreateRetVoid();
381    } else {
382        Value * outputs[n];
383        for (unsigned i = 0; i < n; ++i) {
384            outputs[i] = idb->getScalarField(mScalarOutputs[i].getName());
385        }
386        if (n == 1) {
387            idb->CreateRet(outputs[0]);
388        } else {
389            idb->CreateAggregateRet(outputs, n);
390        }
391    }
392}
393
394
395/** ------------------------------------------------------------------------------------------------------------- *
396 * @brief getScalarIndex
397 ** ------------------------------------------------------------------------------------------------------------- */
398unsigned Kernel::getScalarIndex(const std::string & name) const {
399    const auto f = mKernelFieldMap.find(name);
400    if (LLVM_UNLIKELY(f == mKernelFieldMap.end())) {
401        assert (false);
402        report_fatal_error(getName() + " does not contain scalar: " + name);
403    }
404    return f->second;
405}
406
407
408/** ------------------------------------------------------------------------------------------------------------- *
409 * @brief createInstance
410 ** ------------------------------------------------------------------------------------------------------------- */
411Value * Kernel::createInstance(const std::unique_ptr<KernelBuilder> & idb) {
412    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
413    if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
414        report_fatal_error("Cannot instantiate " + getName() + " before calling prepareKernel()");
415    }
416    setInstance(idb->CreateCacheAlignedAlloca(mKernelStateType));
417    return getInstance();
418}
419
420
421/** ------------------------------------------------------------------------------------------------------------- *
422 * @brief initializeInstance
423 ** ------------------------------------------------------------------------------------------------------------- */
424void Kernel::initializeInstance(const std::unique_ptr<KernelBuilder> & idb) {
425    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
426    if (LLVM_UNLIKELY(getInstance() == nullptr)) {
427        report_fatal_error("Cannot initialize " + getName() + " before calling createInstance()");
428    }
429    std::vector<Value *> args;
430    args.reserve(1 + mInitialArguments.size() + mStreamSetInputBuffers.size() + (mStreamSetOutputBuffers.size() * 2));
431    args.push_back(getInstance());
432    for (unsigned i = 0; i < mInitialArguments.size(); ++i) {
433        Value * arg = mInitialArguments[i];
434        if (LLVM_UNLIKELY(arg == nullptr)) {
435            report_fatal_error(getName() + ": initial argument " + std::to_string(i)
436                               + " cannot be null when calling createInstance()");
437        }
438        args.push_back(arg);
439    }
440    for (unsigned i = 0; i < mStreamSetInputBuffers.size(); ++i) {
441        assert (mStreamSetInputBuffers[i]);
442        Value * arg = mStreamSetInputBuffers[i]->getStreamSetHandle();
443        if (LLVM_UNLIKELY(arg == nullptr)) {
444            report_fatal_error(getName() + ": input stream set " + std::to_string(i)
445                               + " was not allocated prior to calling createInstance()");
446        }
447        args.push_back(arg);
448    }
449    assert (mStreamSetInputs.size() == mStreamSetInputBuffers.size());
450    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
451        assert (mStreamSetOutputBuffers[i]);
452        Value * arg = mStreamSetOutputBuffers[i]->getStreamSetHandle();
453        if (LLVM_UNLIKELY(arg == nullptr)) {
454            report_fatal_error(getName() + ": output stream set " + std::to_string(i)
455                               + " was not allocated prior to calling createInstance()");
456        }
457        args.push_back(arg);
458    }
459    assert (mStreamSetOutputs.size() == mStreamSetOutputBuffers.size());
460    IntegerType * const sizeTy = idb->getSizeTy();
461    PointerType * const sizePtrTy = sizeTy->getPointerTo();
462    PointerType * const sizePtrPtrTy = sizePtrTy->getPointerTo();
463    StructType * const consumerTy = StructType::get(idb->getContext(), {sizeTy, sizePtrPtrTy});
464    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
465        const auto output = mStreamSetOutputBuffers[i];
466        const auto & consumers = output->getConsumers();
467        const auto n = consumers.size();
468        AllocaInst * const outputConsumers = idb->CreateAlloca(consumerTy);
469        Value * const consumerSegNoArray = idb->CreateAlloca(ArrayType::get(sizePtrTy, n));
470        for (unsigned i = 0; i < n; ++i) {
471            Kernel * const consumer = consumers[i];
472            assert ("all instances must be created prior to initialization of any instance" && consumer->getInstance());
473            idb->setKernel(consumer);
474            Value * const segmentNoPtr = idb->getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR);
475            idb->CreateStore(segmentNoPtr, idb->CreateGEP(consumerSegNoArray, { idb->getInt32(0), idb->getInt32(i) }));
476        }
477        idb->setKernel(this);
478        Value * const consumerCountPtr = idb->CreateGEP(outputConsumers, {idb->getInt32(0), idb->getInt32(0)});
479        idb->CreateStore(idb->getSize(n), consumerCountPtr);
480        Value * const consumerSegNoArrayPtr = idb->CreateGEP(outputConsumers, {idb->getInt32(0), idb->getInt32(1)});
481        idb->CreateStore(idb->CreatePointerCast(consumerSegNoArray, sizePtrPtrTy), consumerSegNoArrayPtr);
482        args.push_back(outputConsumers);
483    }
484    idb->CreateCall(getInitFunction(idb->getModule()), args);
485}
486
487/** ------------------------------------------------------------------------------------------------------------- *
488 * @brief finalizeInstance
489 ** ------------------------------------------------------------------------------------------------------------- */
490void Kernel::finalizeInstance(const std::unique_ptr<KernelBuilder> & idb) {
491    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
492    mOutputScalarResult = idb->CreateCall(getTerminateFunction(idb->getModule()), { getInstance() });
493}
494
495/** ------------------------------------------------------------------------------------------------------------- *
496 * @brief getStreamPort
497 ** ------------------------------------------------------------------------------------------------------------- */
498Kernel::StreamPort Kernel::getStreamPort(const std::string & name) const {
499    const auto f = mStreamMap.find(name);
500    if (LLVM_UNLIKELY(f == mStreamMap.end())) {
501        report_fatal_error(getName() + " does not contain stream set " + name);
502    }
503    return f->second;
504}
505
506/** ------------------------------------------------------------------------------------------------------------- *
507 * @brief getStreamPort
508 ** ------------------------------------------------------------------------------------------------------------- */
509const Binding & Kernel::getBinding(const std::string & name) const {
510    Port port; unsigned index;
511    std::tie(port, index) = getStreamPort(name);
512    return (port == Port::Input) ? getStreamInput(index) : getStreamOutput(index);
513}
514
515/** ------------------------------------------------------------------------------------------------------------- *
516 * @brief getLowerBound
517 ** ------------------------------------------------------------------------------------------------------------- */
518ProcessingRate::RateValue Kernel::getLowerBound(const ProcessingRate & rate) const {
519    if (rate.isFixed() || rate.isBounded()) {
520        return rate.getLowerBound();
521    } else if (rate.isRelative()) {
522        return rate.getRate() * getLowerBound(getBinding(rate.getReference()).getRate());
523    } else { // if (rate.isUnknown())
524        return 0;
525    }
526}
527
528/** ------------------------------------------------------------------------------------------------------------- *
529 * @brief getUpperBound
530 ** ------------------------------------------------------------------------------------------------------------- */
531ProcessingRate::RateValue Kernel::getUpperBound(const ProcessingRate &rate) const {
532    if (rate.isFixed() || rate.isBounded()) {
533        return rate.getUpperBound();
534    } else if (rate.isRelative()) {
535        return rate.getRate() * getUpperBound(getBinding(rate.getReference()).getRate());
536    } else { // if (rate.isUnknown())
537        return 0;
538    }
539}
540
541/** ------------------------------------------------------------------------------------------------------------- *
542 * @brief normalizeRelativeToFixedProcessingRate
543 ** ------------------------------------------------------------------------------------------------------------- */
544bool Kernel::normalizeRelativeToFixedProcessingRate(const ProcessingRate & base, ProcessingRate & toUpdate) {
545    if (base.isFixed()) {
546        return true;
547    } else if (LLVM_UNLIKELY(base.isRelative())) {
548        const auto & ref = getBinding(base.getReference()).getRate();
549        if (normalizeRelativeToFixedProcessingRate(ref, toUpdate)) {
550            toUpdate.getRate() *= ref.getRate();
551            return true;
552        }
553    }
554    return false;
555}
556
557/** ------------------------------------------------------------------------------------------------------------- *
558 * @brief normalizeStreamProcessingRates
559 *
560 * If we allow a stream to be transitively relative to a fixed rate stream, it complicates detection of fixed
561 * rate streams later. Find any such occurrence and transform them. This implies, however, that a fixed rate
562 * stream could have a rational processing rate (which should not occur normally.)
563 ** ------------------------------------------------------------------------------------------------------------- */
564inline void Kernel::normalizeStreamProcessingRates() {
565    for (Binding & input : mStreamSetInputs) {
566        normalizeRelativeToFixedProcessingRate(input.getRate(), input.getRate());
567    }
568    for (Binding & output : mStreamSetOutputs) {
569        normalizeRelativeToFixedProcessingRate(output.getRate(), output.getRate());
570    }
571    // TODO: we want to consume whole units. Once the pipeline is able to schedule kernels based on their stride
572    // and input/output rates, modify them here.
573}
574
575/** ------------------------------------------------------------------------------------------------------------- *
576 * @brief generateKernelMethod
577 ** ------------------------------------------------------------------------------------------------------------- */
578void SegmentOrientedKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
579    const auto inputSetCount = mStreamSetInputs.size();
580    mStreamSetInputBaseAddress.resize(inputSetCount);
581    for (unsigned i = 0; i < inputSetCount; ++i) {
582        mStreamSetInputBaseAddress[i] = nullptr;
583    }
584    const auto outputSetCount = mStreamSetOutputs.size();
585    mStreamSetOutputBaseAddress.resize(outputSetCount);
586    for (unsigned i = 0; i < outputSetCount; ++i) {
587        mStreamSetOutputBaseAddress[i] = nullptr;
588    }
589    generateDoSegmentMethod(b);
590}
591
592/** ------------------------------------------------------------------------------------------------------------- *
593 * @brief requiresBufferedFinalStride
594 ** ------------------------------------------------------------------------------------------------------------- */
595inline bool requiresBufferedFinalStride(const Binding & binding) {
596    if (LLVM_LIKELY(isa<ArrayType>(binding.getType()))) {
597        return binding.getType()->getArrayNumElements() == 1;
598    }
599    return true;
600}
601
602/** ------------------------------------------------------------------------------------------------------------- *
603 * @brief getItemWidth
604 ** ------------------------------------------------------------------------------------------------------------- */
605inline unsigned getItemWidth(const Binding & b) {
606    Type * ty = b.getType();
607    if (LLVM_LIKELY(isa<ArrayType>(ty))) {
608        ty = ty->getArrayElementType();
609    }
610    return cast<IntegerType>(ty->getVectorElementType())->getBitWidth();
611}
612
613/** ------------------------------------------------------------------------------------------------------------- *
614 * @brief getUpperBound
615 ** ------------------------------------------------------------------------------------------------------------- */
616bool MultiBlockKernel::isTransitivelyUnknownRate(const ProcessingRate & rate) const {
617    if (rate.isUnknown()) {
618        return true;
619    } else if (rate.isDerived()) {
620        return isTransitivelyUnknownRate(getBinding(rate.getReference()).getRate());
621    }
622    return false;
623}
624
625/** ------------------------------------------------------------------------------------------------------------- *
626 * @brief roundUp
627 ** ------------------------------------------------------------------------------------------------------------- */
628unsigned roundUp(const ProcessingRate::RateValue & r) {
629    if (LLVM_LIKELY(r.denominator() == 1)) {
630        return r.numerator();
631    } else {
632        return (r.numerator() + r.denominator() - 1) / r.denominator();
633    }
634}
635
636/** ------------------------------------------------------------------------------------------------------------- *
637 * @brief getItemAlignment
638 ** ------------------------------------------------------------------------------------------------------------- */
639inline unsigned MultiBlockKernel::getItemAlignment(const Binding & binding) const {
640    const auto & rate = binding.getRate();
641    if (rate.isFixed()) {
642        const auto & r = rate.getRate();
643        const auto n = (r.numerator() * mStride);
644        if (LLVM_LIKELY(r.denominator() == 1)) {
645            return n;
646        } else if (LLVM_LIKELY((n % r.denominator()) == 0)) {
647            return n / r.denominator();
648        }
649    }
650    return 1; // ∀x GCD(x, x + 1) = 1
651}
652
653/** ------------------------------------------------------------------------------------------------------------- *
654 * @brief getStrideSize
655 ** ------------------------------------------------------------------------------------------------------------- */
656llvm::Value * MultiBlockKernel::getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate) {
657    // NOTE: if we ever support feedback loops, using upper bound could lead to a deadlock due to data starvation
658    const auto r = getUpperBound(rate);
659    if (r.numerator() == 0) {
660        return nullptr;
661    } else {
662        assert ((r.numerator() * mStride) % r.denominator() == 0);
663        return b->getSize((r.numerator() * mStride) / r.denominator());
664    }
665}
666
667/** ------------------------------------------------------------------------------------------------------------- *
668 * @brief generateKernelMethod
 *
 * Emits, through the builder b, the segment loop that drives generateMultiBlockLogic for this kernel.
 *
 * In order, the emitted code:
 *   1. allocates one temporary buffer per input stream set and per output stream set (outputs whose rate is
 *      transitively unknown get none);
 *   2. on each pass of "SegmentLoop", computes for every input the unprocessed and linearly accessible item
 *      counts and the number of accessible strides, copying into the temporary input buffer when no whole
 *      stride is linearly accessible;
 *   3. computes for every output with a temporary buffer the number of linearly writable strides, redirecting
 *      the output to the temporary buffer when no whole stride is writable;
 *   4. calls generateMultiBlockLogic(b, numOfStrides);
 *   5. updates the processed / produced item counts of Fixed-rate streams, revises the produced counts on the
 *      final stride, and copies temporary output buffers back to the real output buffers;
 *   6. either terminates (final stride) or decides whether to loop for another stride.
 *
 * Aborts code generation with report_fatal_error when mStride is not a multiple of the bit block width, or when
 * an input stream's rate has an upper bound with a zero numerator.
 *
 * @param b  the kernel builder used to emit all instructions and basic blocks
669 ** ------------------------------------------------------------------------------------------------------------- */
670void MultiBlockKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
671
672    if (LLVM_UNLIKELY((mStride % b->getBitBlockWidth()) != 0)) {
673        report_fatal_error(getName() + ": the Stride (" + std::to_string(mStride) + ") of MultiBlockKernel "
674                           "must be a multiple of the BitBlockWidth (" + std::to_string(b->getBitBlockWidth()) + ")");
675    }
676
677    const auto inputSetCount = mStreamSetInputs.size();
678    const auto outputSetCount = mStreamSetOutputs.size();
679
680    // Define and allocate the temporary buffer area in the prolog.
681    const auto blockAlignment = b->getBitBlockWidth() / 8;
    // NOTE(review): this and the other arrays below are sized by a runtime value (a variable-length array),
    // which relies on a compiler extension rather than standard C++.
682    Value * temporaryInputBuffer[inputSetCount];
683    for (unsigned i = 0; i < inputSetCount; ++i) {
684
685        // TODO: if this is a fixed rate input stream and the pipeline guarantees it will not call the kernel unless
686        // there is sufficient input and all buffers will be sized sufficiently for the input, we ought to be able to
687        // avoid the temporary buffer checks.
688
689        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
690        Type * const ty = mStreamSetInputBuffers[i]->getStreamSetBlockType();
691        const auto ub = getUpperBound(rate);
692        if (ub.numerator() == 0) {
693            report_fatal_error("MultiBlock kernels do not support unknown rate input streams or streams relative to an unknown rate input.");
694        } else {
            // Allocate roundUp(ub) elements of the stream set block type, then store a null value of the
            // pointee type through the returned pointer.
            // NOTE(review): whether that single store covers the whole allocation when roundUp(ub) > 1 is
            // not evident from here -- confirm.
695            temporaryInputBuffer[i] = b->CreateAlignedAlloca(ty, blockAlignment, b->getSize(roundUp(ub)));
696            Type * const sty = temporaryInputBuffer[i]->getType()->getPointerElementType();
697            b->CreateStore(Constant::getNullValue(sty), temporaryInputBuffer[i]);
698        }
699    }
700
    // Same allocation for the outputs. A nullptr entry marks an output with a transitively unknown rate;
    // later loops test temporaryOutputBuffer[i] to skip such outputs.
701    Value * temporaryOutputBuffer[outputSetCount];
702    for (unsigned i = 0; i < outputSetCount; i++) {
703        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
704        Type * const ty = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
705        if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate))) {
706            temporaryOutputBuffer[i] = nullptr;
707        } else {
708            auto ub = getUpperBound(rate);
            // Outputs that support and require copy-back get extra room equal to the buffer's overflow size.
709            if (LLVM_UNLIKELY(mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate))) {
710                ub += mStreamSetOutputBuffers[i]->overflowSize();
711            }
712            temporaryOutputBuffer[i] = b->CreateAlignedAlloca(ty, blockAlignment, b->getSize(roundUp(ub)));
713            Type * const sty = temporaryOutputBuffer[i]->getType()->getPointerElementType();
714            b->CreateStore(Constant::getNullValue(sty), temporaryOutputBuffer[i]);
715        }
716    }
717
718    // Now we iteratively process these blocks using the doMultiBlock method.
719    // In each iteration, we check how many linearly accessible / writable
720    // items can be processed with our current input / output buffers. If we
721    // cannot support a full stride, we check whether (a) there is enough
722    // input data to process but it is not linearly accessible, in which case
723    // we move the data into temporary buffers or (b) there is not enough data
724    // to process, in which case we abort unless IsFinal was set.
725
726    Constant * const ZERO = b->getSize(0);
727    Constant * const ONE = b->getSize(1);
728    Constant * const LOG_2_BLOCK_WIDTH = b->getSize(std::log2(b->getBitBlockWidth()));
729    Constant * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
730
731    // Now proceed with creation of the doSegment method.
732    BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
733
734    b->CreateBr(segmentLoop);
735
736    /// DO SEGMENT LOOP
737
738    b->SetInsertPoint(segmentLoop);
739
740    // For each input buffer, get the initial processed item count, base input pointer, and the number of
741    // linearly available strides.
    // NOTE(review): numOfStrides starts as nullptr and is passed as the first operand of CreateUMin below;
    // presumably CreateUMin returns the other operand when one is null -- confirm.
742    Value * numOfStrides = nullptr;
743    mInitialAvailableItemCount.resize(inputSetCount);
744    mInitialProcessedItemCount.resize(inputSetCount);
745    mStreamSetInputBaseAddress.resize(inputSetCount);
746    Value * inputStrideSize[inputSetCount];
747    for (unsigned i = 0; i < inputSetCount; i++) {
748        const auto & input = mStreamSetInputs[i];
749        const auto & name = input.getName();
750        const ProcessingRate & rate = input.getRate();
751        Value * const ic = b->getProcessedItemCount(name);
752        mInitialProcessedItemCount[i] = ic;
753        b->CreateAssert(b->CreateICmpUGE(mAvailableItemCount[i], ic), "processed item count cannot exceed the available item count");
754        assert (ic->getType() == mAvailableItemCount[i]->getType());
755        Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], ic);
756
        // Remember the incoming available count before mAvailableItemCount[i] is overwritten with the
        // linearly accessible count.
757        mStreamSetInputBaseAddress[i]  = b->getBlockAddress(name, b->CreateLShr(ic, LOG_2_BLOCK_WIDTH));
758        mInitialAvailableItemCount[i] = mAvailableItemCount[i];
759        mAvailableItemCount[i] = b->getLinearlyAccessibleItems(name, ic, unprocessed);
760
761        // Are our linearly accessible items sufficient for a stride?
762        inputStrideSize[i] = getStrideSize(b, rate);
763
764        Value * accessibleStrides = b->CreateUDiv(mAvailableItemCount[i], inputStrideSize[i]);
765        if (!rate.isFixed() || (requiresBufferedFinalStride(input) && input.nonDeferred())) {
766
767            // Since we trust that the pipeline won't call this kernel unless there is enough data to process a stride, whenever
768            // we discover that there isn't enough linearly available data, optimistically copy the data to the temporary buffer.
769
770            BasicBlock * const entry = b->GetInsertBlock();
771            BasicBlock * const copyFromBack = b->CreateBasicBlock(name + "CopyFromBack");
772            BasicBlock * const copyFromFront = b->CreateBasicBlock(name + "CopyFromFront");
773            BasicBlock * const resume = b->CreateBasicBlock(name + "Resume");
774
775            b->CreateUnlikelyCondBr(b->CreateICmpEQ(accessibleStrides, ZERO), copyFromBack, resume);
776
            // copyFromBack: copy the linearly accessible items into the temporary buffer.
777            b->SetInsertPoint(copyFromBack);
778            Value * const temporaryAvailable = b->CreateUMin(unprocessed, inputStrideSize[i]);
779
780            b->CreateAssert(b->CreateICmpULE(mAvailableItemCount[i], temporaryAvailable), "linearly available cannot be greater than temporarily available");
781            Value * const tempBufferPtr = temporaryInputBuffer[i];
782            Value * const offset = b->CreateAnd(ic, BLOCK_WIDTH_MASK);
783            const auto copyAlignment = getItemAlignment(mStreamSetInputs[i]);
            // NOTE(review): tempBufferPtr->getType() is the alloca's pointer type, so getSizeOf here yields the
            // size of a pointer rather than of the temporary buffer; it looks like only that many bytes are
            // zeroed -- confirm whether the pointee type (times the element count) was intended.
784            b->CreateMemZero(tempBufferPtr, ConstantExpr::getSizeOf(tempBufferPtr->getType()), blockAlignment);
785            b->CreateStreamCpy(name, tempBufferPtr, ZERO, mStreamSetInputBaseAddress[i] , offset, mAvailableItemCount[i], copyAlignment);
            // The temporary buffer holds one stride if at least a stride's worth is unprocessed, otherwise none.
786            Value * const temporaryStrides = b->CreateSelect(b->CreateICmpULT(unprocessed, inputStrideSize[i]), ZERO, ONE);
787            BasicBlock * const copyToBackEnd = b->GetInsertBlock();
788            b->CreateCondBr(b->CreateICmpNE(mAvailableItemCount[i], temporaryAvailable), copyFromFront, resume);
789
            // copyFromFront: the remainder is copied starting from the buffer's base address.
790            b->SetInsertPoint(copyFromFront);
791            Value * const remaining = b->CreateSub(temporaryAvailable, mAvailableItemCount[i]);
792            Value * const baseAddress = b->getBaseAddress(name);
793            b->CreateStreamCpy(name, tempBufferPtr, mAvailableItemCount[i], baseAddress, ZERO, remaining, copyAlignment);
794            BasicBlock * const copyToFrontEnd = b->GetInsertBlock();
795            b->CreateBr(resume);
796
            // resume: merge the three incoming paths (no copy, back copy only, back + front copy).
797            b->SetInsertPoint(resume);
798            PHINode * const bufferPtr = b->CreatePHI(mStreamSetInputBaseAddress[i] ->getType(), 3);
799            bufferPtr->addIncoming(mStreamSetInputBaseAddress[i] , entry);
800            bufferPtr->addIncoming(tempBufferPtr, copyToBackEnd);
801            bufferPtr->addIncoming(tempBufferPtr, copyToFrontEnd);
802            mStreamSetInputBaseAddress[i] = bufferPtr;
803
804            PHINode * const phiAvailItemCount = b->CreatePHI(b->getSizeTy(), 3);
805            phiAvailItemCount->addIncoming(mAvailableItemCount[i], entry);
806            phiAvailItemCount->addIncoming(temporaryAvailable, copyToBackEnd);
807            phiAvailItemCount->addIncoming(temporaryAvailable, copyToFrontEnd);
808            mAvailableItemCount[i] = phiAvailItemCount;
809
            // NOTE(review): reserved for 2 incoming values but 3 are added below (the sibling PHIs reserve 3).
810            PHINode * const phiNumOfStrides = b->CreatePHI(b->getSizeTy(), 2);
811            phiNumOfStrides->addIncoming(accessibleStrides, entry);
812            phiNumOfStrides->addIncoming(temporaryStrides, copyToBackEnd);
813            phiNumOfStrides->addIncoming(temporaryStrides, copyToFrontEnd);
814            accessibleStrides = phiNumOfStrides;
815        }
816        numOfStrides = b->CreateUMin(numOfStrides, accessibleStrides);
817    }
818
819    // Now determine the linearly writeable strides
820    Value * linearlyWritable[outputSetCount];
821    Value * baseOutputBuffer[outputSetCount];
822    Value * outputStrideSize[outputSetCount];
823    mInitialProducedItemCount.resize(outputSetCount);
824    mStreamSetOutputBaseAddress.resize(outputSetCount);
825    for (unsigned i = 0; i < outputSetCount; i++) {
826        const auto & output = mStreamSetOutputs[i];
827        const auto & name = output.getName();
828        const ProcessingRate & rate = output.getRate();
829        Value * const ic = b->getProducedItemCount(name);
830        baseOutputBuffer[i] = b->getBlockAddress(name, b->CreateLShr(ic, LOG_2_BLOCK_WIDTH));
831        assert (baseOutputBuffer[i]->getType()->isPointerTy());
832        linearlyWritable[i] = b->getLinearlyWritableItems(name, ic);
833        mInitialProducedItemCount[i] = ic;
        // outputStrideSize[i] stays nullptr for outputs without a temporary buffer; the "room for another
        // stride" check near the end of this function tests for that.
834        outputStrideSize[i] = nullptr;
835        if (temporaryOutputBuffer[i]) {
836            outputStrideSize[i] = getStrideSize(b, rate);
837            // Is the number of linearly writable items sufficient for a stride?
838            Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);
839            if (!rate.isFixed() || requiresBufferedFinalStride(output)) {
                // With no whole writable stride, write into the temporary buffer and count it as one stride.
840                Value * const requiresCopy = b->CreateICmpEQ(writableStrides, ZERO);
841                assert (temporaryOutputBuffer[i]->getType() == baseOutputBuffer[i]->getType());
842                baseOutputBuffer[i] = b->CreateSelect(requiresCopy, temporaryOutputBuffer[i], baseOutputBuffer[i]);
843                writableStrides = b->CreateSelect(requiresCopy, ONE, writableStrides);
844            }
845            numOfStrides = b->CreateUMin(numOfStrides, writableStrides);
846            assert (temporaryOutputBuffer[i]->getType() == baseOutputBuffer[i]->getType());
847        }
848        mStreamSetOutputBaseAddress[i] = baseOutputBuffer[i];
849    }
850
851    BasicBlock * const segmentDone = b->CreateBasicBlock("SegmentDone");
852
    // Keep the incoming final flag; mIsFinal is narrowed below to "final AND no whole stride available".
853    Value * const initiallyFinal = mIsFinal;
854    if (LLVM_LIKELY(numOfStrides != nullptr)) {
855        mIsFinal = b->CreateAnd(mIsFinal, b->CreateICmpEQ(numOfStrides, ZERO));
856        Value * const hasStride = b->CreateOr(b->CreateICmpNE(numOfStrides, ZERO), mIsFinal);
857        b->CreateAssert(hasStride, getName() + " has insufficient input data or output space for one stride");
858        for (unsigned i = 0; i < inputSetCount; ++i) {
859            const ProcessingRate & rate = mStreamSetInputs[i].getRate();
            // Fixed-rate, non-deferred inputs: unless final, the available count becomes exactly
            // numOfStrides * stride size.
860            if (rate.isFixed() && mStreamSetInputs[i].nonDeferred()) {
861                mAvailableItemCount[i] = b->CreateSelect(mIsFinal, mAvailableItemCount[i], b->CreateMul(numOfStrides, inputStrideSize[i]));
862            }
863        }
864    }
865
866    //  We have one or more blocks of input data and output buffer space for all stream sets.
867    generateMultiBlockLogic(b, numOfStrides);
868
    // Advance the processed item count of every Fixed-rate, non-deferred input by its available item count.
869    for (unsigned i = 0; i < inputSetCount; ++i) {
870        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
871        if (rate.isFixed() && mStreamSetInputs[i].nonDeferred()) {
872            Value * const ic = b->CreateAdd(mInitialProcessedItemCount[i], mAvailableItemCount[i]);
873            b->setProcessedItemCount(mStreamSetInputs[i].getName(), ic);
874        }
875    }
876
    // Advance the produced item count of every Fixed-rate output by numOfStrides * stride size.
877    for (unsigned i = 0; i < outputSetCount; ++i) {
878        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
879        if (rate.isFixed()) {
880            assert (mStreamSetOutputs[i].nonDeferred());
881            Value * const produced = b->CreateMul(numOfStrides, outputStrideSize[i]);
882            Value * const ic = b->CreateAdd(mInitialProducedItemCount[i], produced);
883            b->setProducedItemCount(mStreamSetOutputs[i].getName(), ic);
884        }
885    }
886
887    BasicBlock * const handleFinalBlock = b->CreateBasicBlock("HandleFinalBlock");
888    BasicBlock * const temporaryBufferCopyBack = b->CreateBasicBlock("TemporaryBufferCopyBack");
889    BasicBlock * const strideDone = b->CreateBasicBlock("MultiBlockDone");
890
891    b->CreateUnlikelyCondBr(mIsFinal, handleFinalBlock, temporaryBufferCopyBack);
892
893
894    /// FINAL STRIDE ADJUSTMENT
895    b->SetInsertPoint(handleFinalBlock);
896
897    // If this is our final stride, adjust the Fixed output item counts. The main loop assumes that
898    // the ITEM COUNT % FIXED RATE = 0 for all Fixed Input and Output streams. We correct that here
899    // to calculate them based on the actual input item counts.
900
901    reviseFinalProducedItemCounts(b);
902
903    b->CreateBr(temporaryBufferCopyBack);
904
905    /// TEMPORARY BUFFER COPY BACK
906    b->SetInsertPoint(temporaryBufferCopyBack);
907
908    // Copy back data to the actual output buffers.
909    for (unsigned i = 0; i < outputSetCount; i++) {
910        Value * const tempBuffer = temporaryOutputBuffer[i];
911        if (LLVM_UNLIKELY(tempBuffer == nullptr)) {
912            continue;
913        }
914        Value * const baseBuffer = baseOutputBuffer[i];
915        assert ("stack corruption likely" && (tempBuffer->getType() == baseBuffer->getType()));
916        const auto & name = mStreamSetOutputs[i].getName();
917        BasicBlock * const copyToBack = b->CreateBasicBlock(name + "CopyToBack");
918        BasicBlock * const copyToFront = b->CreateBasicBlock(name + "CopyToFront");
919        BasicBlock * const clearBuffer = b->CreateBasicBlock(name + "ClearBuffer");
920        BasicBlock * const resume = b->CreateBasicBlock(name + "ResumeCopyBack");
921        // If we used a temporary buffer, copy it back to the original output buffer
        // (baseBuffer equals tempBuffer exactly when the select above chose the temporary buffer).
922        b->CreateCondBr(b->CreateICmpEQ(tempBuffer, baseBuffer), copyToBack, resume);
923
924        b->SetInsertPoint(copyToBack);
925        Value * const offset = b->CreateAnd(mInitialProducedItemCount[i], BLOCK_WIDTH_MASK);
926        Value * const newProducedItemCount = b->getProducedItemCount(name);
927        Value * const newlyProduced = b->CreateSub(newProducedItemCount, mInitialProducedItemCount[i]);
928        Value * const toWrite = b->CreateUMin(newlyProduced, linearlyWritable[i]);
929        const auto alignment = getItemAlignment(mStreamSetOutputs[i]);
930        b->CreateStreamCpy(name, baseBuffer, offset, tempBuffer, ZERO, toWrite, alignment);
931        // If we required a temporary output buffer, we will probably need to write to the beginning of the buffer as well.
932        b->CreateLikelyCondBr(b->CreateICmpULT(toWrite, newlyProduced), copyToFront, clearBuffer);
933
934        b->SetInsertPoint(copyToFront);
935        Value * const remaining = b->CreateSub(newlyProduced, toWrite);
936        Value * const baseAddress = b->getBaseAddress(name);
937        b->CreateStreamCpy(name, baseAddress, ZERO, tempBuffer, toWrite, remaining, alignment);
938        b->CreateBr(clearBuffer);
939        // Clear the buffer after use since we may end up reusing it within the same stride
        // NOTE(review): no clearing instruction is actually emitted into this block; it only branches to
        // resume -- confirm whether a memzero of the temporary buffer is missing here.
940        b->SetInsertPoint(clearBuffer);
941
942        b->CreateBr(resume);
943
944        b->SetInsertPoint(resume);
945    }
946
947    //  We've dealt with the partial block processing and copied information back into the
948    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
949    if (hasNoTerminateAttribute()) {
950        b->CreateCondBr(mIsFinal, segmentDone, strideDone);
951    } else {
        // On the final stride, set the termination signal before leaving the segment.
952        BasicBlock * const setTermination = b->CreateBasicBlock("setTermination");
953        b->CreateCondBr(mIsFinal, setTermination, strideDone);
954
955        b->SetInsertPoint(setTermination);
956        b->setTerminationSignal();
957        b->CreateBr(segmentDone);
958    }
959
960    /// STRIDE DONE
961    strideDone->moveAfter(b->GetInsertBlock());
962    b->SetInsertPoint(strideDone);
963
964    b->CreateAssertZero(mIsFinal, "stride done cannot process the final block");
965
966    // do we have enough data for another stride?
    // (measured against the initial available count saved at the top of the loop, using the current
    // processed count)
967    Value * hasMoreStrides = b->getTrue();
968    for (unsigned i = 0; i < inputSetCount; ++i) {
969        const auto & name = mStreamSetInputs[i].getName();
970        Value * const avail = mInitialAvailableItemCount[i];
971        Value * const processed = b->getProcessedItemCount(name);
972        b->CreateAssert(b->CreateICmpULE(processed, avail), name + ": processed data cannot exceed available data");
973        Value * const remaining = b->CreateSub(avail, processed);
974        Value * const remainingStrides = b->CreateUDiv(remaining, inputStrideSize[i]);
975        Value * const hasRemainingStrides = b->CreateICmpNE(remainingStrides, ZERO);
976        hasMoreStrides = b->CreateAnd(hasMoreStrides, hasRemainingStrides);
977    }
978    // even if we do not have enough input data for a full stride, if this is our final stride, allow it ...
979    hasMoreStrides = b->CreateOr(hasMoreStrides, initiallyFinal);
980
981    // do we have enough room for another stride?
982    for (unsigned i = 0; i < outputSetCount; ++i) {
983        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
984        const auto & name = mStreamSetOutputs[i].getName();
985        Value * const produced = b->getProducedItemCount(name);
986        // If this output has a Fixed/Bounded rate, determine whether we have room for another stride.
987        if (LLVM_LIKELY(outputStrideSize[i] != nullptr)) {
988            Value * const consumed = b->getConsumedItemCount(name);
989            b->CreateAssert(b->CreateICmpULE(consumed, produced), name + ": consumed data cannot exceed produced data");
990            Value * const unconsumed = b->CreateSub(produced, consumed);
991            Value * const capacity = b->getCapacity(name);
992            b->CreateAssert(b->CreateICmpULE(unconsumed, capacity), name + ": unconsumed data cannot exceed capacity");
993            Value * const remaining = b->CreateSub(capacity, unconsumed);
994            Value * const remainingStrides = b->CreateUDiv(remaining, outputStrideSize[i]);
995            Value * const hasRemainingStrides = b->CreateICmpNE(remainingStrides, ZERO);
996            hasMoreStrides = b->CreateAnd(hasMoreStrides, hasRemainingStrides);
997        }
998        // Do copybacks if necessary.
999        if (mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate)) {
1000            b->CreateCopyBack(name, mInitialProducedItemCount[i], produced);
1001        }
1002    }
1003
1004    // b->CreateAssertZero(b->CreateOr(b->CreateNot(initiallyFinal), hasMoreStrides), getName() + " does not have enough output space for the final stride");
1005
    // Loop back to SegmentLoop for another pass, or fall out to SegmentDone.
1006    b->CreateCondBr(hasMoreStrides, segmentLoop, segmentDone);
1007
1008    /// SEGMENT DONE
1009    segmentDone->moveAfter(b->GetInsertBlock());
1010    b->SetInsertPoint(segmentDone);
1011
1012}
1013
1014/** ------------------------------------------------------------------------------------------------------------- *
1015 * @brief requiresCopyBack
1016 ** ------------------------------------------------------------------------------------------------------------- */
1017bool MultiBlockKernel::requiresCopyBack(const ProcessingRate & rate) const {
1018    if (rate.isBounded() || rate.isUnknown()) {
1019        return true;
1020    } else if (rate.isRelative()) {
1021        return requiresCopyBack(getBinding(rate.getReference()).getRate());
1022    }
1023    return false;
1024}
1025
1026/** ------------------------------------------------------------------------------------------------------------- *
1027 * @brief CreateUDivCeil
1028 ** ------------------------------------------------------------------------------------------------------------- */
1029inline Value * CreateUDivCeil(const std::unique_ptr<KernelBuilder> & b, Value * const number, const ProcessingRate::RateValue divisor, const Twine & Name = "") {
1030    Constant * const n = ConstantInt::get(number->getType(), divisor.numerator());
1031    if (LLVM_LIKELY(divisor.denominator() == 1)) {
1032        return b->CreateUDivCeil(number, n, Name);
1033    } else {
1034        //   âŒŠ(num + ratio - 1) / ratio⌋
1035        // = ⌊(num - 1) / (n/d)⌋ + (ratio/ratio)
1036        // = ⌊(d * (num - 1)) / n⌋ + 1
1037        Constant * const ONE = ConstantInt::get(number->getType(), 1);
1038        Constant * const d = ConstantInt::get(number->getType(), divisor.denominator());
1039        return b->CreateAdd(b->CreateUDiv(b->CreateMul(b->CreateSub(number, ONE), d), n), ONE, Name);
1040    }
1041}
1042
1043
1044/** ------------------------------------------------------------------------------------------------------------- *
1045 * @brief reviseFinalProducedItemCounts
 *
 * Emits code that recomputes the produced item counts of the output streams from the actual (initial) input
 * item counts, then applies each output's Add / RoundUpTo attributes.
 *
 * Nothing is emitted when the kernel has no input stream sets or no Fixed-rate output. Fixed-rate,
 * non-deferred outputs are recalculated from the inputs; other outputs are only touched when they carry an
 * Add or RoundUpTo attribute, in which case their current produced count is adjusted.
 *
 * @param b  the kernel builder used to emit the instructions
1046 ** ------------------------------------------------------------------------------------------------------------- */
1047void MultiBlockKernel::reviseFinalProducedItemCounts(const std::unique_ptr<KernelBuilder> & b) {
1048
1049    if (LLVM_UNLIKELY(mStreamSetInputs.empty())) {
1050        return;
1051    }
1052
1053    const auto inputSetCount = mStreamSetInputs.size();
1054
    // rateLCM accumulates the lcm of every Fixed input and output rate. [first, last) is the range of
    // inputs considered below: all of them by default, or only the principal input when one exists.
1055    ProcessingRate::RateValue rateLCM(1);
1056    unsigned first = 0;
1057    unsigned last = inputSetCount;
1058
1059    for (unsigned i = 0; i < inputSetCount; ++i) {
1060        const ProcessingRate & pr = mStreamSetInputs[i].getRate();
1061        if (pr.isFixed()) {
1062            rateLCM = lcm(rateLCM, pr.getRate());
1063            if (mStreamSetInputs[i].isPrincipal()) {
1064                assert ("A kernel cannot have multiple principle input streams" && (first == 0 && last == inputSetCount));
1065                first = i;
1066                last = i + 1;
1067            }
1068        }
1069    }
1070
1071    bool noFixedRateOutput = true;
1072
1073    for (const Binding & output : mStreamSetOutputs) {
1074        const ProcessingRate & pr = output.getRate();
1075        if (pr.isFixed()) {
1076            rateLCM = lcm(rateLCM, pr.getRate());
1077            noFixedRateOutput = false;
1078        }
1079    }
1080
    // NOTE(review): this early return also skips the Add / RoundUpTo attribute handling that the final loop
    // applies to non-Fixed outputs -- confirm that is intended when there is no Fixed-rate output.
1081    if (noFixedRateOutput) {
1082        return;
1083    }
1084
1085    Value * baseInitialProcessedItemCount = nullptr;
1086    Value * scaledInverseOfAvailItemCount = nullptr;
1087
1088    // For each Fixed output stream, this calculates:
1089
1090    //    CEILING(MIN(Available Item Count / Fixed Input Rate) * Fixed Output Rate)
1091
1092    // But avoids the possibility of overflow errors (assuming that each processed item count does not overflow)
1093
1094    for (unsigned i = first; i < last; ++i) {
1095        const ProcessingRate & pr = mStreamSetInputs[i].getRate();
1096        if (pr.isFixed()) {
            // p: initial processed count; a: items that were available but unprocessed on entry.
1097            Value * p = mInitialProcessedItemCount[i];
1098            Value * a = b->CreateSub(mInitialAvailableItemCount[i], p);
1099            const auto & rate = pr.getRate();
            // Scale a by (rateLCM / rate): multiply by the factor's numerator, then divide by its denominator.
1100            if (LLVM_UNLIKELY(rateLCM != rate)) {
1101                const auto factor = rateLCM / rate;
1102                if (LLVM_UNLIKELY(factor.numerator() > 1)) {
1103                    a = b->CreateMul(a, b->getSize(factor.numerator()));
1104                }
1105                if (LLVM_UNLIKELY(factor.denominator() > 1)) {
1106                    a = b->CreateUDiv(a, b->getSize(factor.denominator()));
1107                }
1108            }
            // Divide p by this input's rate: multiply by the denominator, then divide by the numerator.
1109            if (LLVM_UNLIKELY(rate.denominator() > 1)) {
1110                p = b->CreateMul(p, b->getSize(rate.denominator()));
1111            }
1112            if (LLVM_UNLIKELY(rate.numerator() > 1)) {
1113                p = b->CreateUDiv(p, b->getSize(rate.numerator()));
1114            }
            // Keep the minimum of each quantity across the considered Fixed inputs.
1115            if (scaledInverseOfAvailItemCount) {
1116                scaledInverseOfAvailItemCount = b->CreateUMin(scaledInverseOfAvailItemCount, a);
1117                baseInitialProcessedItemCount = b->CreateUMin(baseInitialProcessedItemCount, p);
1118            } else {
1119                scaledInverseOfAvailItemCount = a;
1120                baseInitialProcessedItemCount = p;
1121            }
1122        }
1123    }
1124
1125    for (const Binding & output : mStreamSetOutputs) {
1126        const auto name = output.getName();
1127        const ProcessingRate & pr = output.getRate();
1128        Value * produced = nullptr;
1129        if (pr.isFixed() && output.nonDeferred()) {
            // NOTE(review): both values remain nullptr when no input in [first, last) has a Fixed rate; that
            // case is guarded only by this assert -- confirm it cannot occur in release builds.
1130            assert (baseInitialProcessedItemCount && scaledInverseOfAvailItemCount);
1131            const auto rate = pr.getRate();
            // Multiply the base processed count by this output's rate.
1132            Value * p = baseInitialProcessedItemCount;
1133            if (LLVM_UNLIKELY(rate.numerator() != 1)) {
1134                p = b->CreateMul(p, b->getSize(rate.numerator()));
1135            }
1136            if (LLVM_UNLIKELY(rate.denominator() != 1)) {
1137                p = b->CreateUDiv(p, b->getSize(rate.denominator()));
1138            }
            // Add the scaled remaining input divided by (rateLCM / output rate), rounded up.
1139            Value * const ic = CreateUDivCeil(b, scaledInverseOfAvailItemCount, rateLCM / pr.getRate());
1140            produced = b->CreateAdd(p, ic);
1141        } else { // check if we have an attribute; if so, get the current produced count and adjust it
1142            bool noAttributes = true;
1143            for (const Attribute & attr : output.getAttributes()) {
1144                if (attr.isAdd() || attr.isRoundUpTo()) {
1145                    noAttributes = false;
1146                    break;
1147                }
1148            }
1149            if (noAttributes) {
1150                continue;
1151            }
1152            produced = b->getProducedItemCount(name);
1153        }
        // Apply the Add and RoundUpTo attributes in the order they are listed on the binding.
1154        for (const Attribute & attr : output.getAttributes()) {
1155            if (attr.isAdd()) {
1156                produced = b->CreateAdd(produced, b->getSize(attr.getAmount()));
1157            } else if (attr.isRoundUpTo()) {
1158                produced = b->CreateRoundUp(produced, b->getSize(attr.getAmount()));
1159            }
1160        }
1161        b->setProducedItemCount(name, produced);
1162    }
1163
1164}
1165
1166/** ------------------------------------------------------------------------------------------------------------- *
1167 * @brief generateMultiBlockLogic
1168 ** ------------------------------------------------------------------------------------------------------------- */
1169Value * BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
1170
1171    if (LLVM_UNLIKELY(mStride != b->getBitBlockWidth())) {
1172        report_fatal_error(getName() + ": the Stride (" + std::to_string(mStride) + ") of BlockOrientedKernel "
1173                           "equal to the BitBlockWidth (" + std::to_string(b->getBitBlockWidth()) + ")");
1174    }
1175
1176    Constant * const LOG_2_BLOCK_WIDTH = b->getSize(std::log2(b->getBitBlockWidth()));
1177
1178    BasicBlock * const entryBlock = b->GetInsertBlock();
1179    mStrideLoopBody = b->CreateBasicBlock(getName() + "_strideLoopBody");
1180    BasicBlock * const stridesDone = b->CreateBasicBlock(getName() + "_stridesDone");
1181    BasicBlock * const doFinalBlock = b->CreateBasicBlock(getName() + "_doFinalBlock");
1182    BasicBlock * const segmentDone = b->CreateBasicBlock(getName() + "_segmentDone");
1183    b->CreateAssert(b->CreateXor(b->CreateIsNotNull(numOfBlocks), mIsFinal),
1184                    "numOfStrides cannot be 0 unless this is the final stride and must be 0 if it is");
1185    const auto inputSetCount = mStreamSetInputs.size();
1186    Value * baseProcessedIndex[inputSetCount];
1187    Value * baseInputAddress[inputSetCount];
1188    for (unsigned i = 0; i < inputSetCount; i++) {
1189        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
1190        if (LLVM_UNLIKELY(!rate.isFixed())) {
1191            Value * const ic = mInitialProcessedItemCount[i];
1192            baseProcessedIndex[i] = b->CreateLShr(ic, LOG_2_BLOCK_WIDTH);
1193        }
1194        baseInputAddress[i] = mStreamSetInputBaseAddress[i];
1195    }
1196
1197    const auto outputSetCount = mStreamSetOutputs.size();
1198    Value * baseProducedIndex[outputSetCount];
1199    Value * baseOutputAddress[inputSetCount];
1200    for (unsigned i = 0; i < outputSetCount; i++) {
1201        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
1202        if (LLVM_UNLIKELY(!rate.isFixed())) {
1203            Value * const ic = b->getProducedItemCount(mStreamSetOutputs[i].getName());
1204            baseProducedIndex[i] = b->CreateLShr(ic, LOG_2_BLOCK_WIDTH);
1205        }
1206        baseOutputAddress[i] = mStreamSetOutputBaseAddress[i];
1207    }
1208
1209    b->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, mStrideLoopBody);
1210
1211    /// BLOCK BODY
1212
1213    b->SetInsertPoint(mStrideLoopBody);
1214
1215    if (b->supportsIndirectBr()) {
1216        Value * const baseTarget = BlockAddress::get(segmentDone);
1217        mStrideLoopTarget = b->CreatePHI(baseTarget->getType(), 2, "strideTarget");
1218        mStrideLoopTarget->addIncoming(baseTarget, entryBlock);
1219    }
1220
1221    mStrideBlockIndex = b->CreatePHI(b->getSizeTy(), 2);
1222    mStrideBlockIndex->addIncoming(b->getSize(0), entryBlock);
1223
1224    /// GENERATE DO BLOCK METHOD
1225
1226    for (unsigned i = 0; i < inputSetCount; ++i) {
1227        Value * index = mStrideBlockIndex;
1228        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
1229        if (LLVM_UNLIKELY(!rate.isFixed())) {
1230            Value * ic = b->getProcessedItemCount(mStreamSetInputs[i].getName());
1231            index = b->CreateSub(b->CreateLShr(ic, LOG_2_BLOCK_WIDTH), baseProcessedIndex[i]);
1232        }
1233        mStreamSetInputBaseAddress[i] = b->CreateGEP(mStreamSetInputBaseAddress[i], index);
1234    }
1235
1236    for (unsigned i = 0; i < outputSetCount; ++i) {
1237        Value * index = mStrideBlockIndex;
1238        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
1239        if (LLVM_UNLIKELY(!rate.isFixed())) {
1240            Value * ic = b->getProducedItemCount(mStreamSetOutputs[i].getName());
1241            index = b->CreateSub(b->CreateLShr(ic, LOG_2_BLOCK_WIDTH), baseProducedIndex[i]);
1242        }
1243        mStreamSetOutputBaseAddress[i] = b->CreateGEP(mStreamSetOutputBaseAddress[i], index);
1244    }
1245
1246    writeDoBlockMethod(b);
1247
1248    BasicBlock * const bodyEnd = b->GetInsertBlock();
1249    if (mStrideLoopTarget) {
1250        mStrideLoopTarget->addIncoming(mStrideLoopTarget, bodyEnd);
1251    }
1252
1253    Value * const nextIndex = b->CreateAdd(mStrideBlockIndex, b->getSize(1));
1254    mStrideBlockIndex->addIncoming(nextIndex, bodyEnd);
1255    Value * const notDone = b->CreateICmpULT(nextIndex, numOfBlocks);
1256    b->CreateCondBr(notDone, mStrideLoopBody, stridesDone);
1257
1258    stridesDone->moveAfter(bodyEnd);
1259
1260    /// STRIDE DONE
1261
1262    b->SetInsertPoint(stridesDone);
1263
1264    // Now conditionally perform the final block processing depending on the doFinal parameter.
1265    if (mStrideLoopTarget) {
1266        mStrideLoopBranch = b->CreateIndirectBr(mStrideLoopTarget, 3);
1267        mStrideLoopBranch->addDestination(doFinalBlock);
1268        mStrideLoopBranch->addDestination(segmentDone);
1269    } else {
1270        b->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone);
1271    }
1272
1273    doFinalBlock->moveAfter(stridesDone);
1274
1275    /// DO FINAL BLOCK
1276
1277    b->SetInsertPoint(doFinalBlock);
1278    for (unsigned i = 0; i < inputSetCount; ++i) {
1279        mStreamSetInputBaseAddress[i] = baseInputAddress[i];
1280    }
1281
1282    for (unsigned i = 0; i < outputSetCount; ++i) {
1283        mStreamSetOutputBaseAddress[i] = baseOutputAddress[i];
1284    }
1285
1286    writeFinalBlockMethod(b, getRemainingItems(b));
1287
1288    b->CreateBr(segmentDone);
1289
1290    segmentDone->moveAfter(b->GetInsertBlock());
1291
1292    b->SetInsertPoint(segmentDone);
1293
1294    // Update the branch prediction metadata to indicate that the likely target will be segmentDone
1295    if (mStrideLoopTarget) {
1296        MDBuilder mdb(b->getContext());
1297        const auto destinations = mStrideLoopBranch->getNumDestinations();
1298        uint32_t weights[destinations];
1299        for (unsigned i = 0; i < destinations; ++i) {
1300            weights[i] = (mStrideLoopBranch->getDestination(i) == segmentDone) ? 100 : 1;
1301        }
1302        ArrayRef<uint32_t> bw(weights, destinations);
1303        mStrideLoopBranch->setMetadata(LLVMContext::MD_prof, mdb.createBranchWeights(bw));
1304    }
1305
1306    return numOfBlocks;
1307}
1308
1309/** ------------------------------------------------------------------------------------------------------------- *
1310 * @brief getRemainingItems
1311 ** ------------------------------------------------------------------------------------------------------------- */
1312Value * BlockOrientedKernel::getRemainingItems(const std::unique_ptr<KernelBuilder> & b) {
1313    Value * remainingItems = nullptr;
1314    const auto count = mStreamSetInputs.size();
1315    if (count == 1) {
1316        return mAvailableItemCount[0];
1317    } else {
1318        for (unsigned i = 0; i < count; i++) {
1319            if (mStreamSetInputs[i].isPrincipal()) {
1320                return mAvailableItemCount[i];
1321            }
1322        }
1323        for (unsigned i = 0; i < count; ++i) {
1324            const ProcessingRate & r = mStreamSetInputs[i].getRate();
1325            if (r.isFixed()) {
1326                Value * ic = CreateUDivCeil(b, mAvailableItemCount[i], r.getRate());
1327                if (remainingItems) {
1328                    remainingItems = b->CreateUMin(remainingItems, ic);
1329                } else {
1330                    remainingItems = ic;
1331                }
1332            }
1333        }
1334    }
1335    return remainingItems;
1336}
1337
1338/** ------------------------------------------------------------------------------------------------------------- *
1339 * @brief writeDoBlockMethod
1340 ** ------------------------------------------------------------------------------------------------------------- */
1341inline void BlockOrientedKernel::writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
1342
1343    Value * const self = getInstance();
1344    Function * const cp = mCurrentMethod;
1345    auto ip = b->saveIP();
1346    std::vector<Value *> availableItemCount(0);
1347
1348    /// Check if the do block method is called and create the function if necessary
1349    if (!b->supportsIndirectBr()) {
1350
1351        std::vector<Type *> params;
1352        params.reserve(1 + mAvailableItemCount.size());
1353        params.push_back(self->getType());
1354        for (Value * avail : mAvailableItemCount) {
1355            params.push_back(avail->getType());
1356        }
1357
1358        FunctionType * const type = FunctionType::get(b->getVoidTy(), params, false);
1359        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, b->getModule());
1360        mCurrentMethod->setCallingConv(CallingConv::C);
1361        mCurrentMethod->setDoesNotThrow();
1362        auto args = mCurrentMethod->arg_begin();
1363        args->setName("self");
1364        setInstance(&*args);
1365        availableItemCount.reserve(mAvailableItemCount.size());
1366        while (++args != mCurrentMethod->arg_end()) {
1367            availableItemCount.push_back(&*args);
1368        }
1369        assert (availableItemCount.size() == mAvailableItemCount.size());
1370        mAvailableItemCount.swap(availableItemCount);
1371        b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
1372    }
1373
1374    generateDoBlockMethod(b); // must be implemented by the BlockOrientedKernelBuilder subtype
1375
1376    if (!b->supportsIndirectBr()) {
1377        // Restore the DoSegment function state then call the DoBlock method
1378        b->CreateRetVoid();
1379        mDoBlockMethod = mCurrentMethod;
1380        b->restoreIP(ip);
1381        setInstance(self);
1382        mCurrentMethod = cp;
1383        mAvailableItemCount.swap(availableItemCount);
1384        CreateDoBlockMethodCall(b);
1385    }
1386
1387}
1388
1389/** ------------------------------------------------------------------------------------------------------------- *
1390 * @brief writeFinalBlockMethod
1391 ** ------------------------------------------------------------------------------------------------------------- */
1392inline void BlockOrientedKernel::writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingItems) {
1393
1394    Value * const self = getInstance();
1395    Function * const cp = mCurrentMethod;
1396    Value * const remainingItemCount = remainingItems;
1397    auto ip = b->saveIP();
1398    std::vector<Value *> availableItemCount(0);
1399
1400    if (!b->supportsIndirectBr()) {
1401        std::vector<Type *> params;
1402        params.reserve(2 + mAvailableItemCount.size());
1403        params.push_back(self->getType());
1404        params.push_back(b->getSizeTy());
1405        for (Value * avail : mAvailableItemCount) {
1406            params.push_back(avail->getType());
1407        }
1408        FunctionType * const type = FunctionType::get(b->getVoidTy(), params, false);
1409        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, b->getModule());
1410        mCurrentMethod->setCallingConv(CallingConv::C);
1411        mCurrentMethod->setDoesNotThrow();
1412        auto args = mCurrentMethod->arg_begin();
1413        args->setName("self");
1414        setInstance(&*args);
1415        remainingItems = &*(++args);
1416        remainingItems->setName("remainingItems");
1417        availableItemCount.reserve(mAvailableItemCount.size());
1418        while (++args != mCurrentMethod->arg_end()) {
1419            availableItemCount.push_back(&*args);
1420        }
1421        assert (availableItemCount.size() == mAvailableItemCount.size());
1422        mAvailableItemCount.swap(availableItemCount);
1423        b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
1424    }
1425
1426    generateFinalBlockMethod(b, remainingItems); // may be implemented by the BlockOrientedKernel subtype
1427
1428    if (!b->supportsIndirectBr()) {
1429        b->CreateRetVoid();
1430        b->restoreIP(ip);
1431        setInstance(self);
1432        mAvailableItemCount.swap(availableItemCount);
1433        // Restore the DoSegment function state then call the DoFinal method
1434        std::vector<Value *> args;
1435        args.reserve(2 + mAvailableItemCount.size());
1436        args.push_back(self);
1437        args.push_back(remainingItemCount);
1438        args.insert(args.end(), mAvailableItemCount.begin(), mAvailableItemCount.end());
1439        b->CreateCall(mCurrentMethod, args);
1440        mCurrentMethod = cp;
1441    }
1442
1443}
1444
1445/** ------------------------------------------------------------------------------------------------------------- *
1446 * @brief generateFinalBlockMethod
1447 ** ------------------------------------------------------------------------------------------------------------- */
1448void BlockOrientedKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * /* remainingItems */) {
1449    //  The default finalBlock method simply dispatches to the doBlock routine.
1450    CreateDoBlockMethodCall(b);
1451}
1452
1453void BlockOrientedKernel::CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & b) {
1454    if (b->supportsIndirectBr()) {
1455        BasicBlock * const bb = b->CreateBasicBlock("resume");
1456        mStrideLoopBranch->addDestination(bb);
1457        BasicBlock * const current = b->GetInsertBlock();
1458        mStrideLoopTarget->addIncoming(BlockAddress::get(bb), current);
1459        mStrideBlockIndex->addIncoming(b->getSize(0), current);
1460        b->CreateBr(mStrideLoopBody);
1461        bb->moveAfter(current);
1462        b->SetInsertPoint(bb);
1463    } else {
1464        std::vector<Value *> args;
1465        args.reserve(1 + mAvailableItemCount.size());
1466        args.push_back(getInstance());
1467        args.insert(args.end(), mAvailableItemCount.begin(), mAvailableItemCount.end());
1468        b->CreateCall(mDoBlockMethod, args);
1469    }
1470}
1471
1472static inline std::string annotateKernelNameWithDebugFlags(std::string && name) {
1473    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
1474        name += "_EA";
1475    }
1476    name += "_O" + std::to_string((int)codegen::OptLevel);
1477    return name;
1478}
1479
1480// CONSTRUCTOR
1481Kernel::Kernel(std::string && kernelName,
1482               Bindings && stream_inputs,
1483               Bindings && stream_outputs,
1484               Bindings && scalar_parameters,
1485               Bindings && scalar_outputs,
1486               Bindings && internal_scalars)
1487: KernelInterface(annotateKernelNameWithDebugFlags(std::move(kernelName))
1488                  , std::move(stream_inputs), std::move(stream_outputs)
1489                  , std::move(scalar_parameters), std::move(scalar_outputs)
1490                  , std::move(internal_scalars))
1491, mCurrentMethod(nullptr)
1492, mAvailablePrincipalItemCount(nullptr)
1493, mNoTerminateAttribute(false)
1494, mIsGenerated(false)
1495, mStride(0)
1496, mIsFinal(nullptr)
1497, mOutputScalarResult(nullptr) {
1498
1499}
1500
1501Kernel::~Kernel() {
1502
1503}
1504
1505// MULTI-BLOCK KERNEL CONSTRUCTOR
1506MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
1507                                   Bindings && stream_inputs,
1508                                   Bindings && stream_outputs,
1509                                   Bindings && scalar_parameters,
1510                                   Bindings && scalar_outputs,
1511                                   Bindings && internal_scalars)
1512: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
1513
1514}
1515
1516// CONSTRUCTOR
1517BlockOrientedKernel::BlockOrientedKernel(std::string && kernelName,
1518                                         Bindings && stream_inputs,
1519                                         Bindings && stream_outputs,
1520                                         Bindings && scalar_parameters,
1521                                         Bindings && scalar_outputs,
1522                                         Bindings && internal_scalars)
1523: MultiBlockKernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
1524, mDoBlockMethod(nullptr)
1525, mStrideLoopBody(nullptr)
1526, mStrideLoopBranch(nullptr)
1527, mStrideLoopTarget(nullptr)
1528, mStrideBlockIndex(nullptr) {
1529
1530}
1531
1532// CONSTRUCTOR
1533SegmentOrientedKernel::SegmentOrientedKernel(std::string && kernelName,
1534                                             Bindings && stream_inputs,
1535                                             Bindings && stream_outputs,
1536                                             Bindings && scalar_parameters,
1537                                             Bindings && scalar_outputs,
1538                                             Bindings && internal_scalars)
1539: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
1540
1541}
1542
1543
1544}
Note: See TracBrowser for help on using the repository browser.