Ignore:
Timestamp:
Dec 3, 2017, 12:40:40 PM (18 months ago)
Author:
nmedfort
Message:

Bug fixes and simplified MultiBlockKernel logic

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5743 r5755  
    2222#include <sstream>
    2323#include <kernels/kernel_builder.h>
    24 #include <boost/math/common_factor_rt.hpp>
     24#include <boost/math/common_factor.hpp>
    2525#include <llvm/Support/Debug.h>
    2626
     
    5050        report_fatal_error("Cannot add field " + name + " to " + getName() + " after kernel state finalized");
    5151    }
    52     if (LLVM_UNLIKELY(mKernelMap.count(name))) {
     52    if (LLVM_UNLIKELY(mKernelFieldMap.count(name))) {
    5353        report_fatal_error(getName() + " already contains scalar field " + name);
    5454    }
    5555    const auto index = mKernelFields.size();
    56     mKernelMap.emplace(name, index);
     56    mKernelFieldMap.emplace(name, index);
    5757    mKernelFields.push_back(type);
    5858    return index;
     
    189189        mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
    190190        assert (mKernelStateType);
    191     }   
     191    }
    192192}
    193193
     
    206206    if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
    207207        report_fatal_error("Kernel definition for " + getName() + " could not be found in the cache object");
    208     }   
    209 }
    210 
    211 /** ------------------------------------------------------------------------------------------------------------- *
    212  * @brief getItemsPerStride
    213  ** ------------------------------------------------------------------------------------------------------------- */
    214 std::pair<unsigned, unsigned> Kernel::getStreamRate(const Port p, const unsigned i) const {
    215     const ProcessingRate & rate = (p == Port::Input) ? mStreamSetInputs[i].getRate() : mStreamSetOutputs[i].getRate();
    216     unsigned min = 0, max = 0;
    217     if (rate.isFixed()) {
    218         min = max = rate.getRate();
    219     } else if (rate.isBounded()) {
    220         min = rate.getLowerBound();
    221         max = rate.getUpperBound();
    222     } else if (rate.isUnknown()) {
    223         min = rate.getLowerBound();
    224         max = 0;
    225     } else if (rate.isExactlyRelative()) {
    226         for (unsigned j = 0; j < mStreamSetInputs.size(); ++j) {
    227             if (mStreamSetInputs[j].getName() == rate.getReference()) {
    228                 std::tie(min, max) = getStreamRate(Port::Input, j);
    229                 min = (min * rate.getNumerator()) / rate.getDenominator();
    230                 assert (max == 0 || (max * rate.getNumerator()) % rate.getDenominator() == 0);
    231                 max = (max * rate.getNumerator()) / rate.getDenominator();
    232                 return std::make_pair(min, max);
    233             }
    234         }
    235         for (unsigned j = 0; j < mStreamSetOutputs.size(); ++j) {
    236             if (mStreamSetOutputs[j].getName() == rate.getReference()) {
    237                 assert (p == Port::Output);
    238                 std::tie(min, max) = getStreamRate(Port::Output, j);
    239                 min = (min * rate.getNumerator()) / rate.getDenominator();
    240                 assert (max == 0 || (max * rate.getNumerator()) % rate.getDenominator() == 0);
    241                 max = (max * rate.getNumerator()) / rate.getDenominator();
    242                 return std::make_pair(min, max);
    243             }
    244         }
    245         llvm_unreachable("Reference rate must be associated with an input or output!");
    246     }
    247     return std::make_pair(min, max);
     208    }
    248209}
    249210
     
    252213 ** ------------------------------------------------------------------------------------------------------------- */
    253214void Kernel::addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb) {
    254    
     215
     216    if (mStreamMap.empty()) {
     217        prepareStreamSetNameMap();
     218    }
     219
     220    normalizeStreamProcessingRates();
     221
    255222    const unsigned inputSetCount = mStreamSetInputs.size();
    256223    const unsigned outputSetCount = mStreamSetOutputs.size();
    257    
     224
    258225    assert (inputSetCount == mStreamSetInputBuffers.size());
    259226    assert (outputSetCount == mStreamSetOutputBuffers.size());
     
    293260    for (const auto & binding : mScalarOutputs) {
    294261        addScalar(binding.getType(), binding.getName());
    295     }
    296     if (mStreamMap.empty()) {
    297         prepareStreamSetNameMap();
    298262    }
    299263    for (const auto & binding : mInternalScalars) {
     
    388352    setInstance(&*(args++));
    389353    mIsFinal = &*(args++);
    390     mAvailablePrincipleItemCount = nullptr;
    391 //    if (mHasPrincipleItemCount) {
    392 //        mAvailablePrincipleItemCount = &*(args++);
    393 //    }
     354    mAvailablePrincipalItemCount = nullptr;
    394355    const auto n = mStreamSetInputs.size();
    395356    mAvailableItemCount.resize(n, nullptr);
    396357    for (unsigned i = 0; i < n; i++) {
    397 //        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    398 //        Value * itemCount = nullptr;
    399 //        if (rate.isFixed()) {
    400 //            itemCount = mAvailablePrincipleItemCount;
    401 //            if (rate.getRate() != 1) {
    402 //                itemCount = idb->CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getRate()));
    403 //            }
    404 //        } else if (rate.isBounded() || rate.isUnknown()) {
    405 //            itemCount = &*(args++);
    406 //        } else if (rate.isRelative()) {
    407 //            for (unsigned j = 0; j < i; ++j) {
    408 //                if (mStreamSetInputs[j].getName() == rate.getReference()) {
    409 //                    itemCount = mAvailableItemCount[j];
    410 //                    break;
    411 //                }
    412 //            }
    413 //            if (LLVM_UNLIKELY(itemCount == nullptr)) {
    414 //                report_fatal_error(mStreamSetInputs[i].getName() + " is declared before " + rate.getReference());
    415 //            }
    416 //            if (rate.getNumerator() != 1) {
    417 //                itemCount = idb->CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getNumerator()));
    418 //            }
    419 //            if (rate.getDenominator() != 1) {
    420 //                itemCount = idb->CreateUDiv(itemCount, ConstantInt::get(itemCount->getType(), rate.getDenominator()));
    421 //            }
    422 //        }
    423 //        assert (itemCount);
    424 //        mAvailableItemCount[i] = itemCount;
    425 
    426358        assert (args != mCurrentMethod->arg_end());
    427359        mAvailableItemCount[i] = &*(args++);
    428360    }
    429361    assert (args == mCurrentMethod->arg_end());
    430 
    431362    generateKernelMethod(idb); // must be overridden by the Kernel subtype
    432363    mIsFinal = nullptr;
     
    466397 ** ------------------------------------------------------------------------------------------------------------- */
    467398unsigned Kernel::getScalarIndex(const std::string & name) const {
    468     const auto f = mKernelMap.find(name);
    469     if (LLVM_UNLIKELY(f == mKernelMap.end())) {
     399    const auto f = mKernelFieldMap.find(name);
     400    if (LLVM_UNLIKELY(f == mKernelFieldMap.end())) {
    470401        assert (false);
    471402        report_fatal_error(getName() + " does not contain scalar: " + name);
     
    574505
    575506/** ------------------------------------------------------------------------------------------------------------- *
     507 * @brief getBinding
     508 ** ------------------------------------------------------------------------------------------------------------- */
     509const Binding & Kernel::getBinding(const std::string & name) const {
     510    Port port; unsigned index;
     511    std::tie(port, index) = getStreamPort(name);
     512    return (port == Port::Input) ? getStreamInput(index) : getStreamOutput(index);
     513}
     514
     515/** ------------------------------------------------------------------------------------------------------------- *
     516 * @brief normalizeRelativeToFixedProcessingRate
     517 ** ------------------------------------------------------------------------------------------------------------- */
     518bool Kernel::normalizeRelativeToFixedProcessingRate(const ProcessingRate & base, ProcessingRate & toUpdate) {
     519    if (base.isFixed()) {
     520        return true;
     521    } else if (LLVM_UNLIKELY(base.isRelative())) {
     522        const auto & ref = getBinding(base.getReference()).getRate();
     523        if (normalizeRelativeToFixedProcessingRate(ref, toUpdate)) {
     524            toUpdate.getRate() *= ref.getRate();
     525            return true;
     526        }
     527    }
     528    return false;
     529}
     530
     531/** ------------------------------------------------------------------------------------------------------------- *
     532 * @brief normalizeStreamProcessingRates
     533 *
     534 * If we allow a stream to be transitively relative to a fixed rate stream, it complicates detection of fixed
     535 * rate streams later. Find any such occurrences and transform them. This implies, however, that a fixed rate
     536 * stream could have a rational processing rate (which should not occur normally.)
     537 ** ------------------------------------------------------------------------------------------------------------- */
     538inline void Kernel::normalizeStreamProcessingRates() {
     539    for (Binding & input : mStreamSetInputs) {
     540        normalizeRelativeToFixedProcessingRate(input.getRate(), input.getRate());
     541    }
     542    for (Binding & output : mStreamSetOutputs) {
     543        normalizeRelativeToFixedProcessingRate(output.getRate(), output.getRate());
     544    }
     545    // TODO: we want to consume whole units. Once the pipeline is able to schedule kernels based on their stride
     546    // and input/output rates, modify them here.
     547}
     548
     549/** ------------------------------------------------------------------------------------------------------------- *
    576550 * @brief generateKernelMethod
    577551 ** ------------------------------------------------------------------------------------------------------------- */
    578552void SegmentOrientedKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
    579 
    580     Constant * const log2BlockWidth = b->getSize(std::log2(b->getBitBlockWidth()));
    581 
    582553    const auto inputSetCount = mStreamSetInputs.size();
    583     mStreamSetInputBufferPtr.resize(inputSetCount);
     554    mStreamSetInputBaseAddress.resize(inputSetCount);
    584555    for (unsigned i = 0; i < inputSetCount; ++i) {
    585         const auto & name = mStreamSetInputs[i].getName();
    586         Value * ic = b->getProcessedItemCount(name);
    587         Value * const blockIndex = b->CreateLShr(ic, log2BlockWidth);
    588         mStreamSetInputBufferPtr[i] = b->getInputStreamPtr(name, blockIndex);
    589     }
    590 
     556        mStreamSetInputBaseAddress[i] = nullptr;
     557    }
    591558    const auto outputSetCount = mStreamSetOutputs.size();
    592     mStreamSetOutputBufferPtr.resize(outputSetCount);
     559    mStreamSetOutputBaseAddress.resize(outputSetCount);
    593560    for (unsigned i = 0; i < outputSetCount; ++i) {
    594         const auto & name = mStreamSetOutputs[i].getName();
    595         Value * ic = b->getProducedItemCount(name);
    596         Value * const blockIndex = b->CreateLShr(ic, log2BlockWidth);
    597         mStreamSetOutputBufferPtr[i] = b->getOutputStreamPtr(name, blockIndex);
    598     }
    599 
     561        mStreamSetOutputBaseAddress[i] = nullptr;
     562    }
    600563    generateDoSegmentMethod(b);
    601 
     564}
     565
     566/** ------------------------------------------------------------------------------------------------------------- *
     567 * @brief requiresBufferedFinalStride
     568 ** ------------------------------------------------------------------------------------------------------------- */
     569inline bool requiresBufferedFinalStride(const Binding & b) {
     570    if (LLVM_LIKELY(isa<ArrayType>(b.getType()))) {
     571        return b.getType()->getArrayNumElements() == 1;
     572    }
     573    return true;
     574}
     575
     576/** ------------------------------------------------------------------------------------------------------------- *
     577 * @brief getItemWidth
     578 ** ------------------------------------------------------------------------------------------------------------- */
     579inline unsigned getItemWidth(const Binding & b) {
     580    Type * ty = b.getType();
     581    if (LLVM_LIKELY(isa<ArrayType>(ty))) {
     582        ty = ty->getArrayElementType();
     583    }
     584    return cast<IntegerType>(ty->getVectorElementType())->getBitWidth();
     585}
     586
     587/** ------------------------------------------------------------------------------------------------------------- *
     588 * @brief getLowerBound
     589 ** ------------------------------------------------------------------------------------------------------------- */
     590ProcessingRate::RateValue MultiBlockKernel::getLowerBound(const ProcessingRate & rate) const {
     591    if (rate.isFixed() || rate.isBounded()) {
     592        return rate.getLowerBound();
     593    } else if (rate.isRelative()) {
     594        return rate.getRate() * getLowerBound(getBinding(rate.getReference()).getRate());
     595    } else { // if (rate.isUnknown())
     596        return 0;
     597    }
     598}
     599
     600/** ------------------------------------------------------------------------------------------------------------- *
     601 * @brief getUpperBound
     602 ** ------------------------------------------------------------------------------------------------------------- */
     603ProcessingRate::RateValue MultiBlockKernel::getUpperBound(const ProcessingRate &rate) const {
     604    if (rate.isFixed() || rate.isBounded()) {
     605        return rate.getUpperBound();
     606    } else if (rate.isRelative()) {
     607        return rate.getRate() * getUpperBound(getBinding(rate.getReference()).getRate());
     608    } else { // if (rate.isUnknown())
     609        return 0;
     610    }
     611}
     612
     613/** ------------------------------------------------------------------------------------------------------------- *
     614 * @brief isTransitivelyUnknownRate
     615 ** ------------------------------------------------------------------------------------------------------------- */
     616bool MultiBlockKernel::isTransitivelyUnknownRate(const ProcessingRate & rate) const {
     617    if (rate.isUnknown()) {
     618        return true;
     619    } else if (rate.isDerived()) {
     620        return isTransitivelyUnknownRate(getBinding(rate.getReference()).getRate());
     621    }
     622    return false;
     623}
     624
     625/** ------------------------------------------------------------------------------------------------------------- *
     626 * @brief roundUp
     627 ** ------------------------------------------------------------------------------------------------------------- */
     628unsigned roundUp(const ProcessingRate::RateValue & r) {
     629    if (LLVM_LIKELY(r.denominator() == 1)) {
     630        return r.numerator();
     631    } else {
     632        return (r.numerator() + r.denominator() - 1) / r.denominator();
     633    }
     634}
     635
     636/** ------------------------------------------------------------------------------------------------------------- *
     637 * @brief getItemAlignment
     638 ** ------------------------------------------------------------------------------------------------------------- */
     639inline unsigned MultiBlockKernel::getItemAlignment(const Binding & binding) const {
     640    const auto & rate = binding.getRate();
     641    if (rate.isFixed()) {
     642        const auto & r = rate.getRate();
     643        const auto n = (r.numerator() * mStride);
     644        if (LLVM_LIKELY(r.denominator() == 1)) {
     645            return n;
     646        } else if (LLVM_LIKELY((n % r.denominator()) == 0)) {
     647            return n / r.denominator();
     648        }
     649    }
     650    return 1; // ∀x GCD(x, x + 1) = 1
     651}
     652
     653/** ------------------------------------------------------------------------------------------------------------- *
     654 * @brief getStrideSize
     655 ** ------------------------------------------------------------------------------------------------------------- */
     656llvm::Value * MultiBlockKernel::getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate) {
     657    // NOTE: if we ever support feedback loops, using upper bound could lead to a deadlock due to data starvation
     658    const auto r = getUpperBound(rate);
     659    if (r.numerator() == 0) {
     660        return nullptr;
     661    } else {
     662        assert ((r.numerator() * mStride) % r.denominator() == 0);
     663        return b->getSize((r.numerator() * mStride) / r.denominator());
     664    }
    602665}
    603666
     
    605668 * @brief generateKernelMethod
    606669 ** ------------------------------------------------------------------------------------------------------------- */
    607 void MultiBlockKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & kb) {
     670void MultiBlockKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
     671
     672    if (LLVM_UNLIKELY((mStride % b->getBitBlockWidth()) != 0)) {
     673        report_fatal_error(getName() + ": the Stride (" + std::to_string(mStride) + ") of MultiBlockKernel "
     674                           "must be a multiple of the BitBlockWidth (" + std::to_string(b->getBitBlockWidth()) + ")");
     675    }
    608676
    609677    const auto inputSetCount = mStreamSetInputs.size();
    610678    const auto outputSetCount = mStreamSetOutputs.size();
    611     const auto totalSetCount = inputSetCount + outputSetCount;
    612 
    613     // Scan through and see if any of our input streams is marked as the principle
    614 
    615     bool hasPrinciple = false;
    616     unsigned principleInput = 0;
    617 
    618     for (unsigned i = 0; i < inputSetCount; i++) {
    619         for (const auto attr : mStreamSetInputs[i].getAttributes()) {
    620             if (attr.isPrinciple()) {
    621                 hasPrinciple = true;
    622                 principleInput = i;
    623                 break;
     679
     680    // Define and allocate the temporary buffer area in the prolog.
     681    const auto alignment = b->getBitBlockWidth() / 8;
     682    Value * temporaryInputBuffer[inputSetCount];
     683    for (unsigned i = 0; i < inputSetCount; ++i) {
     684
     685        // TODO: if this is a fixed rate input stream and the pipeline guarantees it will not call the kernel unless
     686        // there is sufficient input and all buffers will be sized sufficiently for the input, we ought to be able to
     687        // avoid the temporary buffer checks.
     688
     689        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     690        Type * const ty = mStreamSetInputBuffers[i]->getStreamSetBlockType();
     691        const auto ub = getUpperBound(rate);
     692        if (ub.numerator() == 0) {
     693            report_fatal_error("MultiBlock kernels do not support unknown rate input streams or streams relative to an unknown rate input.");
     694        } else {           
     695            temporaryInputBuffer[i] = b->CreateAlignedAlloca(ty, alignment, b->getSize(roundUp(ub)));
     696            Type * const sty = temporaryInputBuffer[i]->getType()->getPointerElementType();
     697            b->CreateStore(Constant::getNullValue(sty), temporaryInputBuffer[i]);
     698        }       
     699    }
     700
     701    Value * temporaryOutputBuffer[outputSetCount];
     702    for (unsigned i = 0; i < outputSetCount; i++) {
     703        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     704        Type * const ty = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
     705        if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate))) {
     706            temporaryOutputBuffer[i] = nullptr;
     707        } else {           
     708            auto ub = getUpperBound(rate);
     709            if (LLVM_UNLIKELY(mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate))) {
     710                ub += mStreamSetOutputBuffers[i]->overflowSize();
    624711            }
     712            temporaryOutputBuffer[i] = b->CreateAlignedAlloca(ty, alignment, b->getSize(roundUp(ub)));
     713            Type * const sty = temporaryOutputBuffer[i]->getType()->getPointerElementType();
     714            b->CreateStore(Constant::getNullValue(sty), temporaryOutputBuffer[i]);
    625715        }
    626716    }
     
    634724    // to process, in which case we abort unless IsFinal was set.
    635725
     726    Constant * const ZERO = b->getSize(0);
     727    Constant * const ONE = b->getSize(1);
     728    Constant * const LOG_2_BLOCK_WIDTH = b->getSize(std::log2(b->getBitBlockWidth()));
     729    Constant * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
     730
    636731    // Now proceed with creation of the doSegment method.
    637     BasicBlock * const doSegmentLoop = kb->CreateBasicBlock("DoSegmentLoop");
    638     kb->CreateBr(doSegmentLoop);
     732    BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
     733
     734    b->CreateBr(segmentLoop);
    639735
    640736    /// DO SEGMENT LOOP
    641737
    642     kb->SetInsertPoint(doSegmentLoop);
    643 
    644     // For each input buffer, determine the processedItemCount, the block pointer for the
    645     // buffer block containing the next item, and the number of linearly available items.
    646 
    647     Value * processedItemCount[inputSetCount];
    648     Value * baseInputBuffer[inputSetCount];
    649     Value * unprocessed[inputSetCount];
    650     Value * linearlyAvailable[inputSetCount];
    651     Value * readableStrides[inputSetCount];
    652 
    653     Constant * const log2BlockWidth = kb->getSize(std::log2(kb->getBitBlockWidth()));
    654 
     738    b->SetInsertPoint(segmentLoop);
     739
     740    // For each input buffer, get the initial processed item count, base input pointer, and the number of
     741    // linearly available strides.
    655742    Value * numOfStrides = nullptr;
    656 
     743    mInitialAvailableItemCount.resize(inputSetCount);
     744    mInitialProcessedItemCount.resize(inputSetCount);
     745    mStreamSetInputBaseAddress.resize(inputSetCount);
     746    Value * inputStrideSize[inputSetCount];
    657747    for (unsigned i = 0; i < inputSetCount; i++) {
    658         const auto name = mStreamSetInputs[i].getName();
    659         const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    660 
    661         processedItemCount[i] = kb->getProcessedItemCount(name);
    662 
    663         assert (processedItemCount[i]->getType() == mAvailableItemCount[i]->getType());
    664 
    665         Value * const blockIndex = kb->CreateLShr(processedItemCount[i], log2BlockWidth);
    666         baseInputBuffer[i] = kb->getInputStreamPtr(name, blockIndex);
    667 
    668         if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
    669             kb->CreateAssert(kb->CreateICmpUGE(mAvailableItemCount[i], processedItemCount[i]),
    670                              "Processed item count cannot exceed the available item count");
    671         }
    672 
    673         unprocessed[i] = kb->CreateSub(mAvailableItemCount[i], processedItemCount[i]);
    674 
    675         //kb->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed[i]);
    676 
    677         // INVESTIGATE: If the input rate of this stream is constant and known a priori, we could
    678         // avoid checking whether it is linearly accessible. Should we have an attribute for this?
    679 
    680         linearlyAvailable[i] = kb->getLinearlyAccessibleItems(name, processedItemCount[i], unprocessed[i]);
    681 
    682         //kb->CallPrintInt(getName() + "_" + name + "_linearlyAvailable", linearlyAvailable[i]);
    683 
    684         readableStrides[i] = nullptr;
    685 
    686         if (rate.isFixed() || rate.isBounded()) {
    687             Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
    688             readableStrides[i] = kb->CreateUDiv(linearlyAvailable[i], maxStrideSize);
    689             if (numOfStrides) {
    690                 numOfStrides = kb->CreateUMin(numOfStrides, readableStrides[i]);
    691             } else {
    692                 numOfStrides = readableStrides[i];
     748        const auto & input = mStreamSetInputs[i];
     749        const auto & name = input.getName();
     750        const ProcessingRate & rate = input.getRate();
     751        Value * const ic = b->getProcessedItemCount(name);
     752        mInitialProcessedItemCount[i] = ic;
     753        b->CreateAssert(b->CreateICmpUGE(mAvailableItemCount[i], ic), "processed item count cannot exceed the available item count");
     754        assert (ic->getType() == mAvailableItemCount[i]->getType());
     755        Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], ic);
     756        mStreamSetInputBaseAddress[i]  = b->getBlockAddress(name, b->CreateLShr(ic, LOG_2_BLOCK_WIDTH));
     757        mInitialAvailableItemCount[i] = mAvailableItemCount[i];
     758        mAvailableItemCount[i] = b->getLinearlyAccessibleItems(name, ic, unprocessed);
     759        // Are our linearly accessible items sufficient for a stride?
     760        inputStrideSize[i] = getStrideSize(b, rate);
     761        Value * accessibleStrides = b->CreateUDiv(mAvailableItemCount[i], inputStrideSize[i]);
     762        if (!rate.isFixed() || requiresBufferedFinalStride(input)) {
     763
     764            // Since we trust that the pipeline won't call this kernel unless there is enough data to process a stride, whenever
     765            // we discover that there isn't enough linearly available data, optimistically copy the data to the temporary buffer.
     766
     767            BasicBlock * const entry = b->GetInsertBlock();
     768            BasicBlock * const copyFromBack = b->CreateBasicBlock(name + "CopyFromBack");
     769            BasicBlock * const copyFromFront = b->CreateBasicBlock(name + "CopyFromFront");
     770            BasicBlock * const resume = b->CreateBasicBlock(name + "Resume");
     771
     772            b->CreateUnlikelyCondBr(b->CreateICmpEQ(accessibleStrides, ZERO), copyFromBack, resume);
     773
     774            b->SetInsertPoint(copyFromBack);
     775            Value * const temporaryAvailable = b->CreateUMin(unprocessed, inputStrideSize[i]);
     776            b->CreateAssert(b->CreateICmpULE(mAvailableItemCount[i], temporaryAvailable), "linearly available cannot be greater than temporarily available");
     777            Value * const tempBufferPtr = temporaryInputBuffer[i];
     778            Value * const offset = b->CreateAnd(ic, BLOCK_WIDTH_MASK);
     779            const auto alignment = getItemAlignment(mStreamSetInputs[i]);
     780            b->CreateStreamCpy(name, tempBufferPtr, ZERO, mStreamSetInputBaseAddress[i] , offset, mAvailableItemCount[i], alignment);
     781            Value * const temporaryStrides = b->CreateSelect(b->CreateICmpULT(unprocessed, inputStrideSize[i]), ZERO, ONE);
     782            BasicBlock * const copyToBackEnd = b->GetInsertBlock();
     783            b->CreateCondBr(b->CreateICmpNE(mAvailableItemCount[i], temporaryAvailable), copyFromFront, resume);
     784
     785            b->SetInsertPoint(copyFromFront);
     786            Value * const remaining = b->CreateSub(temporaryAvailable, mAvailableItemCount[i]);
     787            Value * const baseAddress = b->getBaseAddress(name);
     788            b->CreateStreamCpy(name, tempBufferPtr, mAvailableItemCount[i], baseAddress, ZERO, remaining, alignment);
     789            BasicBlock * const copyToFrontEnd = b->GetInsertBlock();
     790            b->CreateBr(resume);
     791
     792            b->SetInsertPoint(resume);
     793            PHINode * const bufferPtr = b->CreatePHI(mStreamSetInputBaseAddress[i] ->getType(), 3);
     794            bufferPtr->addIncoming(mStreamSetInputBaseAddress[i] , entry);
     795            bufferPtr->addIncoming(tempBufferPtr, copyToBackEnd);
     796            bufferPtr->addIncoming(tempBufferPtr, copyToFrontEnd);
     797            mStreamSetInputBaseAddress[i] = bufferPtr;
     798
     799            PHINode * const phiAvailItemCount = b->CreatePHI(b->getSizeTy(), 3);
     800            phiAvailItemCount->addIncoming(mAvailableItemCount[i], entry);
     801            phiAvailItemCount->addIncoming(temporaryAvailable, copyToBackEnd);
     802            phiAvailItemCount->addIncoming(temporaryAvailable, copyToFrontEnd);
     803            mAvailableItemCount[i] = phiAvailItemCount;
     804
     805            PHINode * const phiNumOfStrides = b->CreatePHI(b->getSizeTy(), 2);
     806            phiNumOfStrides->addIncoming(accessibleStrides, entry);
     807            phiNumOfStrides->addIncoming(temporaryStrides, copyToBackEnd);
     808            phiNumOfStrides->addIncoming(temporaryStrides, copyToFrontEnd);
     809            accessibleStrides = phiNumOfStrides;
     810        }
     811        numOfStrides = b->CreateUMin(numOfStrides, accessibleStrides);
     812    }
     813
     814    // Now determine the linearly writeable strides
     815    Value * linearlyWritable[outputSetCount];
     816    Value * baseOutputBuffer[outputSetCount];
     817    Value * outputStrideSize[outputSetCount];
     818    mInitialProducedItemCount.resize(outputSetCount);
     819    mStreamSetOutputBaseAddress.resize(outputSetCount);
     820    for (unsigned i = 0; i < outputSetCount; i++) {
     821        const auto & output = mStreamSetOutputs[i];
     822        const auto & name = output.getName();
     823        const ProcessingRate & rate = output.getRate();
     824        Value * const ic = b->getProducedItemCount(name);
     825        baseOutputBuffer[i] = b->getBlockAddress(name, b->CreateLShr(ic, LOG_2_BLOCK_WIDTH));
     826        assert (baseOutputBuffer[i]->getType()->isPointerTy());
     827        linearlyWritable[i] = b->getLinearlyWritableItems(name, ic);
     828        mInitialProducedItemCount[i] = ic;
     829        outputStrideSize[i] = nullptr;
     830        if (temporaryOutputBuffer[i]) {
     831            outputStrideSize[i] = getStrideSize(b, rate);
     832            // Is the number of linearly writable items sufficient for a stride?
     833            Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);
     834            if (!rate.isFixed() || requiresBufferedFinalStride(output)) {
     835                Value * const requiresCopy = b->CreateICmpEQ(writableStrides, ZERO);
     836                assert (temporaryOutputBuffer[i]->getType() == baseOutputBuffer[i]->getType());
     837                baseOutputBuffer[i] = b->CreateSelect(requiresCopy, temporaryOutputBuffer[i], baseOutputBuffer[i]);
     838                writableStrides = b->CreateSelect(requiresCopy, ONE, writableStrides);
    693839            }
    694         }
    695     }
    696 
    697     //kb->CallPrintInt(getName() + "_numOfStrides", numOfStrides);
    698 
    699     // Now determine the linearly writeable blocks, based on available blocks reduced
    700     // by limitations of output buffer space.
    701 
    702     Value * producedItemCount[outputSetCount];
    703     Value * baseOutputBuffer[outputSetCount];
    704     Value * writableStrides[outputSetCount];
    705     Value * linearlyWritable[outputSetCount];
    706 
    707     for (unsigned i = 0; i < outputSetCount; i++) {
    708         const auto & name = mStreamSetOutputs[i].getName();
    709         const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    710         producedItemCount[i] = kb->getProducedItemCount(name);
    711 
    712         //kb->CallPrintInt(getName() + "_" + name + "_producedItemCount", producedItemCount[i]);
    713 
    714         Value * const blockIndex = kb->CreateLShr(producedItemCount[i], log2BlockWidth);
    715         baseOutputBuffer[i] = kb->getOutputStreamPtr(name, blockIndex);
    716         linearlyWritable[i] = nullptr;
    717         writableStrides[i] = nullptr;
    718         if (rate.isFixed() || rate.isBounded()) {
    719             linearlyWritable[i] = kb->getLinearlyWritableItems(name, producedItemCount[i]);
    720 
    721             //kb->CallPrintInt(getName() + "_" + name + "_linearlyWritable", linearlyWritable[i]);
    722 
    723             Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
    724             writableStrides[i] = kb->CreateUDiv(linearlyWritable[i], maxStrideSize);
    725             if (numOfStrides) {
    726                 numOfStrides = kb->CreateUMin(numOfStrides, writableStrides[i]);
    727             } else {
    728                 numOfStrides = writableStrides[i];
     840            numOfStrides = b->CreateUMin(numOfStrides, writableStrides);
     841            assert (temporaryOutputBuffer[i]->getType() == baseOutputBuffer[i]->getType());
     842        }
     843        mStreamSetOutputBaseAddress[i] = baseOutputBuffer[i];
     844    }
     845
     846    Value * const initiallyFinal = mIsFinal;
     847    if (LLVM_LIKELY(numOfStrides != nullptr)) {
     848        mIsFinal = b->CreateAnd(mIsFinal, b->CreateICmpEQ(numOfStrides, ZERO));
     849        Value * const processStride = b->CreateOr(b->CreateICmpNE(numOfStrides, ZERO), mIsFinal);
     850        b->CreateAssert(processStride, getName() + " does not have sufficient input data or output space for one stride");
     851        for (unsigned i = 0; i < inputSetCount; ++i) {
     852            const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     853            if (rate.isFixed() && mStreamSetInputs[i].notDeferred()) {
     854                mAvailableItemCount[i] = b->CreateSelect(mIsFinal, mAvailableItemCount[i], b->CreateMul(numOfStrides, inputStrideSize[i]));
    729855            }
    730856        }
    731857    }
    732858
    733     //kb->CallPrintInt(getName() + "_numOfStrides'", numOfStrides);
    734 
    735     for (unsigned i = 0; i < inputSetCount; i++) {
    736         const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    737         if (rate.isFixed()) {
    738             mAvailableItemCount[i] = kb->CreateMul(numOfStrides, kb->getSize(rate.getRate() * mStride));
    739         } else {
    740             mAvailableItemCount[i] = linearlyAvailable[i];
    741         }
    742 
    743         //kb->CallPrintInt(getName() + "_" + mStreamSetInputs[i].getName() + "_avail", mAvailableItemCount[i]);
    744     }
    745 
    746     // Define and allocate the temporary buffer area.
    747     Type * tempBuffers[totalSetCount];
    748     for (unsigned i = 0; i < inputSetCount; ++i) {
    749         Type * bufType = baseInputBuffer[i]->getType()->getPointerElementType();
    750         assert (baseInputBuffer[i]->getType()->getPointerAddressSpace() == 0);
    751         const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    752         unsigned count = 0;
    753         if (rate.isFixed()) {
    754             count = rate.getRate();
    755         } else if (rate.isBounded()) {
    756             count = rate.getUpperBound() + 2;
    757         }
    758         tempBuffers[i] = ArrayType::get(bufType, count);
    759     }
    760     for (unsigned i = 0; i < outputSetCount; i++) {
    761         Type * const bufType = baseOutputBuffer[i]->getType()->getPointerElementType();
    762         assert (baseOutputBuffer[i]->getType()->getPointerAddressSpace() == 0);
    763         const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    764         unsigned count = 0;
    765         if (rate.isFixed()) {
    766             count = rate.getRate();
    767         } else if (rate.isBounded()) {
    768             count = rate.getUpperBound() + 2;
    769         }
    770         tempBuffers[i + inputSetCount] = ArrayType::get(bufType, count);
    771     }
    772 
    773     Type * const tempParameterStructType = StructType::create(kb->getContext(), ArrayRef<Type *>(tempBuffers, totalSetCount));
    774 
    775     Value * const tempBufferArea = kb->CreateCacheAlignedAlloca(tempParameterStructType);
    776 
    777     BasicBlock * const temporaryBufferCheck = kb->CreateBasicBlock("temporaryBufferCheck");
    778     BasicBlock * const doMultiBlock = kb->CreateBasicBlock("doMultiBlock");
    779     BasicBlock * const copyToTemporaryBuffers = kb->CreateBasicBlock("copyToTemporaryBuffers");
    780     BasicBlock * const segmentDone = kb->CreateBasicBlock("segmentDone");
    781 
    782     Value * const hasFullStride = numOfStrides ? kb->CreateICmpNE(numOfStrides, kb->getSize(0)) : kb->getTrue();
    783     kb->CreateCondBr(hasFullStride, doMultiBlock, temporaryBufferCheck);
    784 
    785     // We use temporary buffers in 3 different cases that preclude full stride processing.
    786 
    787     //  (a) One or more input buffers does not have a sufficient number of input items linearly available.
    788     //  (b) One or more output buffers does not have sufficient linearly available buffer space.
    789     //  (c) We have processed all the full strides of input and only the final block remains.
    790 
    791     kb->SetInsertPoint(temporaryBufferCheck);
    792 
    793     // Even if we copy the input data into a linear arrays, is there enough data to perform this stride?
    794     // If not, proceed only if this is our final block.
    795     Value * hasFullFragmentedStride = nullptr;
    796     for (unsigned i = 0; i < inputSetCount; i++) {
    797         const ProcessingRate & r = mStreamSetInputs[i].getRate();
    798         if (r.isBounded() || (r.isUnknown() && r.getLowerBound() > 0)) {
    799             const auto l = r.isBounded() ? r.getUpperBound() : r.getLowerBound();
    800             Constant * const strideSize = kb->getSize(l * mStride);
    801             Value * enoughAvail = kb->CreateICmpUGE(unprocessed[i], strideSize);
    802             if (hasFullFragmentedStride) {
    803                 hasFullFragmentedStride = kb->CreateAnd(hasFullFragmentedStride, enoughAvail);
    804             } else {
    805                 hasFullFragmentedStride = enoughAvail;
    806             }
    807         }
    808     }
    809 
    810     Value * hasFragmentedOrFinalStride = nullptr;
    811     if (hasFullFragmentedStride) {
    812         hasFragmentedOrFinalStride = kb->CreateOr(hasFullFragmentedStride, mIsFinal);
    813         // Although this might be the final segment, we may have a full fragmented stride to process prior
    814         // to the actual final stride.
    815         mIsFinal = kb->CreateAnd(mIsFinal, kb->CreateNot(hasFullFragmentedStride));
    816     } else {
    817         hasFragmentedOrFinalStride = mIsFinal;
    818     }
    819     kb->CreateCondBr(hasFragmentedOrFinalStride, copyToTemporaryBuffers, segmentDone);
    820 
    821     /// COPY TO TEMPORARY BUFFERS
    822     kb->SetInsertPoint(copyToTemporaryBuffers);
    823 
    824     kb->CreateAlignedStore(Constant::getNullValue(tempParameterStructType), tempBufferArea, kb->getCacheAlignment());
    825 
    826     // For each input and output buffer, copy over necessary data starting from the last block boundary.
    827 
    828     Value * temporaryInputBuffer[inputSetCount];
    829     Value * temporaryAvailable[inputSetCount];
    830 
    831     for (unsigned i = 0; i < inputSetCount; i++) {
    832         temporaryInputBuffer[i] = baseInputBuffer[i];
    833         if (readableStrides[i]) {
    834             const auto name = mStreamSetInputs[i].getName();
    835             const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    836             assert (rate.getUpperBound() > 0);
    837             Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
    838             temporaryAvailable[i] = kb->CreateUMin(unprocessed[i], maxStrideSize);
    839 
    840             BasicBlock * entry = kb->GetInsertBlock();
    841             BasicBlock * copy = kb->CreateBasicBlock(name + "Copy");
    842             BasicBlock * resume = kb->CreateBasicBlock(name + "ResumeCopy");
    843             Value * const test = kb->CreateOr(kb->CreateICmpNE(readableStrides[i], kb->getSize(0)), mIsFinal);
    844             kb->CreateCondBr(test, resume, copy);
    845 
    846             kb->SetInsertPoint(copy);
    847             Value * const tempBufferPtr = kb->CreateGEP(tempBufferArea, {kb->getInt32(0), kb->getInt32(i), kb->getInt32(0)});
    848             assert (tempBufferPtr->getType() == baseInputBuffer[i]->getType());
    849             Value * const neededItems = linearlyAvailable[i];
    850             Value * const bytesCopied = kb->copy(name, tempBufferPtr, baseInputBuffer[i], neededItems);
    851             Value * const nextInputPtr = kb->getRawInputPointer(name, kb->getSize(0));
    852             Value * const remaining = kb->CreateSub(temporaryAvailable[i], neededItems);
    853             Value * nextBufPtr = kb->CreatePointerCast(tempBufferPtr, kb->getInt8PtrTy());
    854             nextBufPtr = kb->CreateGEP(nextBufPtr, bytesCopied);
    855             kb->copy(name, nextBufPtr, nextInputPtr, remaining);
    856 
    857             kb->CreateBr(resume);
    858 
    859             kb->SetInsertPoint(resume);
    860             PHINode * bufferPtr = kb->CreatePHI(baseInputBuffer[i]->getType(), 2);
    861             bufferPtr->addIncoming(baseInputBuffer[i], entry);
    862             bufferPtr->addIncoming(tempBufferPtr, copy);
    863             temporaryInputBuffer[i] = bufferPtr;
    864         }
    865     }
    866 
    867     Value * temporaryOutputBuffer[outputSetCount];
    868     for (unsigned i = 0; i < outputSetCount; i++) {
    869         temporaryOutputBuffer[i] = baseOutputBuffer[i];
    870         if (writableStrides[i]) {
    871             const auto name = mStreamSetOutputs[i].getName();
    872 
    873             BasicBlock * const entry = kb->GetInsertBlock();
    874             BasicBlock * const copy = kb->CreateBasicBlock(name + "Copy");
    875             BasicBlock * const resume = kb->CreateBasicBlock(name + "ResumeCopy");
    876 
    877             Value * const test = kb->CreateOr(kb->CreateICmpNE(writableStrides[i], kb->getSize(0)), mIsFinal);
    878             kb->CreateCondBr(test, resume, copy);
    879 
    880             kb->SetInsertPoint(copy);
    881             Value * const tempBufferPtr = kb->CreateGEP(tempBufferArea,  {kb->getInt32(0), kb->getInt32(inputSetCount + i), kb->getInt32(0)});
    882             assert (tempBufferPtr->getType() == baseOutputBuffer[i]->getType());
    883             Value * const itemsToCopy = kb->CreateAnd(producedItemCount[i], kb->getSize(kb->getBitBlockWidth() - 1));
    884             kb->copy(name, tempBufferPtr, baseOutputBuffer[i], itemsToCopy);
    885             kb->CreateBr(resume);
    886 
    887             kb->SetInsertPoint(resume);
    888             PHINode * bufferPtr = kb->CreatePHI(tempBufferPtr->getType(), 2);
    889             bufferPtr->addIncoming(baseOutputBuffer[i], entry);
    890             bufferPtr->addIncoming(tempBufferPtr, copy);
    891             temporaryOutputBuffer[i] = bufferPtr;
    892         }
    893     }
    894 
    895     kb->CreateBr(doMultiBlock);
    896     BasicBlock * const usingTemporaryBuffers = kb->GetInsertBlock();
    897     doMultiBlock->moveAfter(usingTemporaryBuffers);
    898 
    899     /// DO MULTI BLOCK
    900 
    901     //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
    902     //  Now prepare the doMultiBlock call.
    903     kb->SetInsertPoint(doMultiBlock);
    904 
    905     PHINode * const isFinal = kb->CreatePHI(mIsFinal->getType(), 2);
    906     isFinal->addIncoming(kb->getFalse(), doSegmentLoop);
    907     isFinal->addIncoming(mIsFinal, usingTemporaryBuffers);
    908     mIsFinal = isFinal;
    909 
    910     mStreamSetInputBufferPtr.resize(inputSetCount);
    911     for (unsigned i = 0; i < inputSetCount; ++i) {
    912         assert (baseInputBuffer[i] && temporaryInputBuffer[i]);
    913         if (baseInputBuffer[i] != temporaryInputBuffer[i]) {
    914             PHINode * const avail = kb->CreatePHI(kb->getSizeTy(), 2);
    915             avail->addIncoming(mAvailableItemCount[i], doSegmentLoop);
    916             avail->addIncoming(temporaryAvailable[i], usingTemporaryBuffers);
    917             mAvailableItemCount[i] = avail;
    918             PHINode * const bufferPtr = kb->CreatePHI(baseInputBuffer[i]->getType(), 2);
    919             bufferPtr->addIncoming(baseInputBuffer[i], doSegmentLoop);
    920             assert (baseInputBuffer[i]->getType() == temporaryInputBuffer[i]->getType());
    921             bufferPtr->addIncoming(temporaryInputBuffer[i], usingTemporaryBuffers);
    922             temporaryInputBuffer[i] = bufferPtr;
    923         }
    924         mStreamSetInputBufferPtr[i] = temporaryInputBuffer[i];
    925     }
    926 
    927     mStreamSetOutputBufferPtr.resize(outputSetCount);
    928     for (unsigned i = 0; i < outputSetCount; ++i) {
    929         assert (baseOutputBuffer[i] && temporaryOutputBuffer[i]);
    930         if (baseOutputBuffer[i] != temporaryOutputBuffer[i]) {
    931             PHINode * const bufferPtr = kb->CreatePHI(baseOutputBuffer[i]->getType(), 2);
    932             bufferPtr->addIncoming(baseOutputBuffer[i], doSegmentLoop);
    933             assert (baseOutputBuffer[i]->getType() == temporaryOutputBuffer[i]->getType());
    934             bufferPtr->addIncoming(temporaryOutputBuffer[i], usingTemporaryBuffers);
    935             temporaryOutputBuffer[i] = bufferPtr;
    936         }
    937         mStreamSetOutputBufferPtr[i] = temporaryOutputBuffer[i];
    938     }
    939 
    940     // Now use the generateMultiBlockLogic method of the MultiBlockKernelBuilder subtype to
    941     // provide the required multi-block kernel logic.
    942     generateMultiBlockLogic(kb, numOfStrides);
    943 
    944     // If we have no fixed rate inputs, we won't know when we're done parsing until we test
    945     // whether any input data was processed.
    946     bool mayMakeNoProgress = true;
    947 
    948     // Update the processed item count of any Fixed input or output stream. While doing so, also
    949     // calculate the LCM of their rates. The LCM is used to calculate the final item counts.
    950 
    951     unsigned rateLCM = 1;
     859    //  We have one or more blocks of input data and output buffer space for all stream sets.
     860    generateMultiBlockLogic(b, numOfStrides);
    952861
    953862    for (unsigned i = 0; i < inputSetCount; ++i) {
    954863        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    955         if (rate.isFixed()) {
    956             mayMakeNoProgress = false;
    957             rateLCM = lcm(rateLCM, rate.getRate());
    958             Value * const processed = mAvailableItemCount[i]; // kb->CreateMul(numOfStrides, kb->getSize(mStride * rate.getRate()));
    959             Value * const ic = kb->CreateAdd(processedItemCount[i], processed);
    960             kb->setProcessedItemCount(mStreamSetInputs[i].getName(), ic);
     864        if (rate.isFixed() && mStreamSetInputs[i].notDeferred()) {
     865            Value * const ic = b->CreateAdd(mInitialProcessedItemCount[i], mAvailableItemCount[i]);
     866            b->setProcessedItemCount(mStreamSetInputs[i].getName(), ic);
    961867        }
    962868    }
     
    965871        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    966872        if (rate.isFixed()) {
    967             rateLCM = lcm(rateLCM, rate.getRate());
    968             Value * const produced = kb->CreateMul(numOfStrides, kb->getSize(mStride * rate.getRate()));
    969             Value * const ic = kb->CreateAdd(producedItemCount[i], produced);
    970             kb->setProducedItemCount(mStreamSetOutputs[i].getName(), ic);
    971         }
    972     }
    973 
    974     BasicBlock * const finalStrideCheck = kb->CreateBasicBlock("finalStrideCheck");
    975     BasicBlock * const finalStrideAdjustment = kb->CreateBasicBlock("finalStrideAdjustment");
    976     BasicBlock * const standardCopyBack = kb->CreateBasicBlock("standardCopyBack");
    977     BasicBlock * const temporaryBufferCopyBack = kb->CreateBasicBlock("temporaryBufferCopyBack");
    978 
    979     kb->CreateLikelyCondBr(hasFullStride, standardCopyBack, finalStrideCheck);
    980 
    981 
    982     /// FINAL STRIDE CHECK
    983     kb->SetInsertPoint(finalStrideCheck);
    984     kb->CreateUnlikelyCondBr(mIsFinal, finalStrideAdjustment, temporaryBufferCopyBack);
     873            assert (mStreamSetOutputs[i].notDeferred());
     874            Value * const produced = b->CreateMul(numOfStrides, outputStrideSize[i]);
     875            Value * const ic = b->CreateAdd(mInitialProducedItemCount[i], produced);
     876            b->setProducedItemCount(mStreamSetOutputs[i].getName(), ic);
     877        }
     878    }
     879
     880    BasicBlock * const handleFinalBlock = b->CreateBasicBlock("HandleFinalBlock");
     881    BasicBlock * const temporaryBufferCopyBack = b->CreateBasicBlock("TemporaryBufferCopyBack");
     882    BasicBlock * const strideDone = b->CreateBasicBlock("MultiBlockDone");
     883
     884    b->CreateLikelyCondBr(b->CreateICmpNE(numOfStrides, ZERO), temporaryBufferCopyBack, handleFinalBlock);
     885
    985886
    986887    /// FINAL STRIDE ADJUSTMENT
    987     kb->SetInsertPoint(finalStrideAdjustment);
     888    b->SetInsertPoint(handleFinalBlock);
    988889
    989890    // If this is our final stride, adjust the Fixed output item counts. The main loop assumes that
     
    991892    // to calculate them based on the actual input item counts.
    992893
    993     // NOTE: This appears overly complex to avoid an integer overflow without reducing the maximum
    994     // integer size. For each Fixed output stream, this calculates:
    995 
    996     //       CEILING(MIN(Total Available Item Count / Fixed Input Rate) * Fixed Output Rate)
    997 
    998     Value * basePreviouslyProcessedItemCount = nullptr;
    999     Value * scaledInverseOfStrideItemCount = nullptr;
    1000 
     894    reviseFinalProducedItemCounts(b);
     895
     896    b->CreateBr(temporaryBufferCopyBack);
     897
     898    /// TEMPORARY BUFFER COPY BACK
     899    b->SetInsertPoint(temporaryBufferCopyBack);
     900
     901    // Copy back data to the actual output buffers.
     902    for (unsigned i = 0; i < outputSetCount; i++) {
     903        Value * const tempBuffer = temporaryOutputBuffer[i];
     904        if (LLVM_UNLIKELY(tempBuffer == nullptr)) {
     905            continue;
     906        }
     907        Value * const baseBuffer = baseOutputBuffer[i];
     908        assert ("stack overflow" && (tempBuffer->getType() == baseBuffer->getType()));
     909        const auto & name = mStreamSetOutputs[i].getName();
     910        BasicBlock * const copyToBack = b->CreateBasicBlock(name + "CopyToBack");
     911        BasicBlock * const copyToFront = b->CreateBasicBlock(name + "CopyToFront");
     912        BasicBlock * const resume = b->CreateBasicBlock(name + "ResumeCopyBack");
     913        // If we used a temporary buffer, copy it back to the original output buffer
     914        b->CreateCondBr(b->CreateICmpEQ(tempBuffer, baseBuffer), copyToBack, resume);
     915
     916        b->SetInsertPoint(copyToBack);       
     917        Value * const offset = b->CreateAnd(mInitialProducedItemCount[i], BLOCK_WIDTH_MASK);
     918        Value * const newProducedItemCount = b->getProducedItemCount(name);
     919        Value * const newlyProduced = b->CreateSub(newProducedItemCount, mInitialProducedItemCount[i]);
     920        Value * const toWrite = b->CreateUMin(newlyProduced, linearlyWritable[i]);
     921        const auto alignment = getItemAlignment(mStreamSetOutputs[i]);
     922        b->CreateStreamCpy(name, baseBuffer, offset, tempBuffer, ZERO, toWrite, alignment);
     923        // If we required a temporary output buffer, we will probably need to write to the beginning of the buffer as well.
     924        b->CreateLikelyCondBr(b->CreateICmpULT(toWrite, newlyProduced), copyToFront, resume);
     925
     926        b->SetInsertPoint(copyToFront);
     927        Value * const remaining = b->CreateSub(newlyProduced, toWrite);
     928        Value * const baseAddress = b->getBaseAddress(name);
     929        b->CreateStreamCpy(name, baseAddress, ZERO, tempBuffer, toWrite, remaining, alignment);
     930        b->CreateBr(resume);
     931
     932        b->SetInsertPoint(resume);
     933    }
     934
     935    strideDone->moveAfter(b->GetInsertBlock());
     936
     937    BasicBlock * const segmentDone = b->CreateBasicBlock("SegmentDone");
     938    //  We've dealt with the partial block processing and copied information back into the
     939    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
     940    if (hasNoTerminateAttribute()) {
     941        b->CreateCondBr(mIsFinal, segmentDone, strideDone);
     942    } else {
     943        BasicBlock * const setTermination = b->CreateBasicBlock("setTermination");
     944        b->CreateCondBr(mIsFinal, setTermination, strideDone);
     945
     946        b->SetInsertPoint(setTermination);
     947        b->setTerminationSignal();
     948        b->CreateBr(segmentDone);       
     949    }
     950
     951    /// STRIDE DONE
     952    b->SetInsertPoint(strideDone);
     953
     954    // do we have enough data for another stride?
     955    Value * pendingStrides = nullptr;
    1001956    for (unsigned i = 0; i < inputSetCount; ++i) {
    1002         const ProcessingRate & r = mStreamSetInputs[i].getRate();
    1003         if (r.isFixed()) {
    1004             assert (rateLCM % r.getRate() == 0);
    1005             Value * const a = kb->CreateMul(mAvailableItemCount[i], kb->getSize(rateLCM / r.getRate())); // unprocessed
    1006             Value * const p = kb->CreateUDiv(processedItemCount[i], kb->getSize(r.getRate()));
    1007             if (scaledInverseOfStrideItemCount) {
    1008                 scaledInverseOfStrideItemCount = kb->CreateUMin(scaledInverseOfStrideItemCount, a);
    1009                 basePreviouslyProcessedItemCount = kb->CreateUMin(basePreviouslyProcessedItemCount, p);
     957        Value * const processed = b->getProcessedItemCount(mStreamSetInputs[i].getName());
     958        Value * const remaining = b->CreateSub(mInitialAvailableItemCount[i], processed);
     959        Value * const remainingStrides = b->CreateUDiv(remaining, inputStrideSize[i]);
     960        pendingStrides = b->CreateUMin(pendingStrides, remainingStrides);
     961    }
     962
     963    // do we have enough room for another stride?
     964    for (unsigned i = 0; i < outputSetCount; ++i) {
     965        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     966        const auto & name = mStreamSetOutputs[i].getName();
     967        Value * const newProduced = b->getProducedItemCount(name);
     968        // If this output has a Fixed/Bounded rate, determine whether we have room for another stride.
     969        if (LLVM_LIKELY(outputStrideSize[i] != nullptr)) {
     970            Value * const unconsumed = b->CreateSub(newProduced, b->getConsumedItemCount(name));
     971            Value * const remaining = b->CreateSub(b->getCapacity(name), unconsumed);
     972            Value * const remainingStrides = b->CreateUDiv(remaining, outputStrideSize[i]);
     973            pendingStrides = b->CreateUMin(pendingStrides, remainingStrides);
     974        }
     975        // Do copybacks if necessary.
     976        if (mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate)) {
     977            b->CreateCopyBack(name, mInitialProducedItemCount[i], newProduced);
     978        }
     979    }
     980
     981    Value * const hasMoreStrides = b->CreateOr(b->CreateICmpNE(pendingStrides, ZERO), initiallyFinal);
     982    b->CreateCondBr(hasMoreStrides, segmentLoop, segmentDone);
     983
     984    /// SEGMENT DONE
     985    segmentDone->moveAfter(b->GetInsertBlock());
     986    b->SetInsertPoint(segmentDone);
     987
     988}
     989
     990/** ------------------------------------------------------------------------------------------------------------- *
     991 * @brief requiresCopyBack
     992 ** ------------------------------------------------------------------------------------------------------------- */
     993bool MultiBlockKernel::requiresCopyBack(const ProcessingRate & rate) const {
     994    if (rate.isBounded() || rate.isUnknown()) {
     995        return true;
     996    } else if (rate.isRelative()) {
     997        return requiresCopyBack(getBinding(rate.getReference()).getRate());
     998    }
     999    return false;
     1000}
     1001
     1002/** ------------------------------------------------------------------------------------------------------------- *
     1003 * @brief CreateUDivCeil
     1004 ** ------------------------------------------------------------------------------------------------------------- */
     1005inline Value * CreateUDivCeil(const std::unique_ptr<KernelBuilder> & b, Value * const number, const ProcessingRate::RateValue divisor, const Twine & Name = "") {
     1006    Constant * const n = ConstantInt::get(number->getType(), divisor.numerator());
     1007    if (LLVM_LIKELY(divisor.denominator() == 1)) {
     1008        return b->CreateUDivCeil(number, n, Name);
     1009    } else {
     1010        //   âŒŠ(num + ratio - 1) / ratio⌋
     1011        // = ⌊(num - 1) / (n/d)⌋ + (ratio/ratio)
     1012        // = ⌊(d * (num - 1)) / n⌋ + 1
     1013        Constant * const ONE = ConstantInt::get(number->getType(), 1);
     1014        Constant * const d = ConstantInt::get(number->getType(), divisor.denominator());
     1015        return b->CreateAdd(b->CreateUDiv(b->CreateMul(b->CreateSub(number, ONE), d), n), ONE, Name);
     1016    }
     1017}
     1018
     1019
     1020/** ------------------------------------------------------------------------------------------------------------- *
     1021 * @brief reviseFinalProducedItemCounts
     1022 ** ------------------------------------------------------------------------------------------------------------- */
     1023void MultiBlockKernel::reviseFinalProducedItemCounts(const std::unique_ptr<KernelBuilder> & b) {
     1024
     1025    if (LLVM_UNLIKELY(mStreamSetInputs.empty())) {
     1026        return;
     1027    }
     1028
     1029    const auto inputSetCount = mStreamSetInputs.size();
     1030
     1031    ProcessingRate::RateValue rateLCM(1);
     1032    unsigned first = 0;
     1033    unsigned last = inputSetCount;
     1034
     1035    for (unsigned i = 0; i < inputSetCount; ++i) {
     1036        const ProcessingRate & pr = mStreamSetInputs[i].getRate();
     1037        if (pr.isFixed()) {
     1038            rateLCM = lcm(rateLCM, pr.getRate());
     1039            if (mStreamSetInputs[i].isPrincipal()) {
     1040                assert ("A kernel cannot have multiple principle input streams" && (first == 0 && last == inputSetCount));
     1041                first = i;
     1042                last = i + 1;
     1043            }
     1044        }       
     1045    }
     1046
     1047    bool noFixedRateOutput = true;
     1048
     1049    for (const Binding & output : mStreamSetOutputs) {
     1050        const ProcessingRate & pr = output.getRate();
     1051        if (pr.isFixed()) {
     1052            rateLCM = lcm(rateLCM, pr.getRate());
     1053            noFixedRateOutput = false;
     1054        }
     1055    }
     1056
     1057    if (noFixedRateOutput) {
     1058        return;
     1059    }
     1060
     1061    Value * baseInitialProcessedItemCount = nullptr;
     1062    Value * scaledInverseOfAvailItemCount = nullptr;
     1063
     1064    // For each Fixed output stream, this calculates:
     1065
     1066    //    CEILING(MIN(Available Item Count / Fixed Input Rate) * Fixed Output Rate)
     1067
     1068    // But avoids the possibility of overflow errors (assuming that each processed item count does not overflow)
     1069
     1070    for (unsigned i = first; i < last; ++i) {
     1071        const ProcessingRate & pr = mStreamSetInputs[i].getRate();
     1072        if (pr.isFixed()) {
     1073            Value * p = mInitialProcessedItemCount[i];
     1074            Value * a = b->CreateSub(mInitialAvailableItemCount[i], p);
     1075            const auto & rate = pr.getRate();
     1076            if (LLVM_UNLIKELY(rateLCM != rate)) {
     1077                const auto factor = rateLCM / rate;
     1078                if (LLVM_UNLIKELY(factor.numerator() > 1)) {
     1079                    a = b->CreateMul(a, b->getSize(factor.numerator()));
     1080                }
     1081                if (LLVM_UNLIKELY(factor.denominator() > 1)) {
     1082                    a = b->CreateUDiv(a, b->getSize(factor.denominator()));
     1083                }
     1084            }
     1085            if (LLVM_UNLIKELY(rate.denominator() > 1)) {
     1086                p = b->CreateMul(p, b->getSize(rate.denominator()));
     1087            }
     1088            if (LLVM_UNLIKELY(rate.numerator() > 1)) {
     1089                p = b->CreateUDiv(p, b->getSize(rate.numerator()));
     1090            }
     1091            if (scaledInverseOfAvailItemCount) {
     1092                scaledInverseOfAvailItemCount = b->CreateUMin(scaledInverseOfAvailItemCount, a);
     1093                baseInitialProcessedItemCount = b->CreateUMin(baseInitialProcessedItemCount, p);
    10101094            } else {
    1011                 scaledInverseOfStrideItemCount = a;
    1012                 basePreviouslyProcessedItemCount = p;
     1095                scaledInverseOfAvailItemCount = a;
     1096                baseInitialProcessedItemCount = p;
    10131097            }
    10141098        }
    1015 //        const auto name = mStreamSetInputs[i].getName();
    1016 //        Value * const processed = kb->CreateAdd(processedItemCount[i], unprocessed[i]);
    1017 //        kb->setProcessedItemCount(name, processed);
    1018     }
    1019 
    1020     for (unsigned i = 0; i < outputSetCount; ++i) {
    1021         const auto name = mStreamSetOutputs[i].getName();
    1022         const ProcessingRate & r = mStreamSetOutputs[i].getRate();
     1099    }
     1100
     1101    for (const Binding & output : mStreamSetOutputs) {
     1102        const auto name = output.getName();
     1103        const ProcessingRate & pr = output.getRate();
    10231104        Value * produced = nullptr;
    1024         if (r.isFixed()) {
    1025             assert (rateLCM % r.getRate() == 0);
    1026             assert (basePreviouslyProcessedItemCount && scaledInverseOfStrideItemCount);
    1027             Value * const p = kb->CreateMul(basePreviouslyProcessedItemCount, kb->getSize(r.getRate()));
    1028             Value * const ic = kb->CreateUDivCeil(scaledInverseOfStrideItemCount, kb->getSize(rateLCM / r.getRate()));
    1029             produced = kb->CreateAdd(p, ic);
     1105        if (pr.isFixed() && output.notDeferred()) {
     1106            assert (baseInitialProcessedItemCount && scaledInverseOfAvailItemCount);
     1107            const auto rate = pr.getRate();
     1108            Value * p = baseInitialProcessedItemCount;
     1109            if (LLVM_UNLIKELY(rate.numerator() != 1)) {
     1110                p = b->CreateMul(p, b->getSize(rate.numerator()));
     1111            }
     1112            if (LLVM_UNLIKELY(rate.denominator() != 1)) {
     1113                p = b->CreateUDiv(p, b->getSize(rate.denominator()));
     1114            }
     1115            Value * const ic = CreateUDivCeil(b, scaledInverseOfAvailItemCount, rateLCM / pr.getRate());
     1116            produced = b->CreateAdd(p, ic);
    10301117        } else { // check if we have an attribute; if so, get the current produced count and adjust it
    10311118            bool noAttributes = true;
    1032             for (const Attribute & attr : mStreamSetOutputs[i].getAttributes()) {
     1119            for (const Attribute & attr : output.getAttributes()) {
    10331120                if (attr.isAdd() || attr.isRoundUpTo()) {
    10341121                    noAttributes = false;
     
    10391126                continue;
    10401127            }
    1041             produced = kb->getProducedItemCount(name);
    1042         }
    1043         for (const Attribute & attr : mStreamSetOutputs[i].getAttributes()) {
     1128            produced = b->getProducedItemCount(name);
     1129        }
     1130        for (const Attribute & attr : output.getAttributes()) {
    10441131            if (attr.isAdd()) {
    1045                 produced = kb->CreateAdd(produced, kb->getSize(attr.getAmount()));
     1132                produced = b->CreateAdd(produced, b->getSize(attr.getAmount()));
    10461133            } else if (attr.isRoundUpTo()) {
    1047                 produced = kb->CreateRoundUp(produced, kb->getSize(attr.getAmount()));
     1134                produced = b->CreateRoundUp(produced, b->getSize(attr.getAmount()));
    10481135            }
    10491136        }
    1050         kb->setProducedItemCount(name, produced);
    1051     }
    1052 
    1053     kb->CreateBr(temporaryBufferCopyBack);
    1054 
    1055     /// TEMPORARY BUFFER COPY BACK
    1056     kb->SetInsertPoint(temporaryBufferCopyBack);
    1057 
    1058     // Copy back data to the actual output buffers.
    1059     for (unsigned i = 0; i < outputSetCount; i++) {
    1060 
    1061         if (baseOutputBuffer[i] != temporaryOutputBuffer[i]) {
    1062 
    1063             const auto name = mStreamSetOutputs[i].getName();
    1064 
    1065             BasicBlock * const copy = kb->CreateBasicBlock(name + "CopyBack");
    1066             BasicBlock * const resume = kb->CreateBasicBlock(name + "ResumeCopyBack");
    1067             Value * const usedTemporary = kb->CreateICmpNE(temporaryOutputBuffer[i], baseOutputBuffer[i]);
    1068 
    1069             // If we used a temporary buffer ...
    1070             kb->CreateCondBr(usedTemporary, copy, resume);
    1071 
    1072             kb->SetInsertPoint(copy);
    1073             Value * bytesCopied = kb->copy(name, baseOutputBuffer[i], temporaryOutputBuffer[i], linearlyWritable[i]);
    1074             Value * nextOutputPtr = kb->getRawOutputPointer(name, kb->getSize(0));
    1075             Value * producedCount = kb->getProducedItemCount(name);
    1076 
    1077             Value * remaining = kb->CreateSub(producedCount, linearlyWritable[i]);
    1078             Value * nextBufPtr = kb->CreatePointerCast(temporaryOutputBuffer[i], kb->getInt8PtrTy());
    1079             nextBufPtr = kb->CreateGEP(nextBufPtr, bytesCopied);
    1080 
    1081             kb->copy(name, nextOutputPtr, nextBufPtr, remaining);
    1082             kb->CreateBr(resume);
    1083 
    1084             kb->SetInsertPoint(resume);
    1085         }
    1086     }
    1087 
    1088     //  We've dealt with the partial block processing and copied information back into the
    1089     //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
    1090     BasicBlock * setTermination = nullptr;
    1091     if (hasNoTerminateAttribute()) {
    1092         kb->CreateCondBr(mIsFinal, segmentDone, standardCopyBack);
    1093     } else {
    1094         setTermination = kb->CreateBasicBlock("setTermination");
    1095         kb->CreateCondBr(mIsFinal, setTermination, standardCopyBack);
    1096     }
    1097 
    1098     /// STANDARD COPY BACK
    1099     kb->SetInsertPoint(standardCopyBack);
    1100 
    1101     // Do copybacks if necessary.
    1102     for (unsigned i = 0; i < outputSetCount; i++) {
    1103         if (mStreamSetOutputBuffers[i]->supportsCopyBack()) {
    1104             const auto name = mStreamSetOutputs[i].getName();
    1105             Value * newProduced = kb->getProducedItemCount(name);
    1106             kb->CreateCopyBack(name, producedItemCount[i], newProduced);
    1107         }
    1108     }
    1109 
    1110     // If it is possible to make no progress, verify we processed some of the input. If we haven't,
    1111     // we're finished this segment.
    1112     if (mayMakeNoProgress) {
    1113         Value * madeProgress = nullptr;
    1114         for (unsigned i = 0; i < inputSetCount; ++i) {
    1115             Value * const processed = kb->getProcessedItemCount(mStreamSetInputs[i].getName());
    1116             Value * const progress = kb->CreateICmpNE(processed, processedItemCount[i]);
    1117             if (madeProgress) {
    1118                 madeProgress = kb->CreateOr(madeProgress, progress);
    1119             } else {
    1120                 madeProgress = progress;
    1121             }
    1122         }
    1123         assert (madeProgress);
    1124         kb->CreateCondBr(madeProgress, doSegmentLoop, segmentDone);
    1125     } else {
    1126         kb->CreateBr(doSegmentLoop);
    1127     }
    1128 
    1129     if (hasNoTerminateAttribute()) {
    1130         segmentDone->moveAfter(kb->GetInsertBlock());
    1131     } else {
    1132         /// SET TERMINATION
    1133         setTermination->moveAfter(kb->GetInsertBlock());
    1134         kb->SetInsertPoint(setTermination);
    1135         kb->setTerminationSignal();
    1136         kb->CreateBr(segmentDone);
    1137         segmentDone->moveAfter(setTermination);
    1138     }
    1139 
    1140     kb->SetInsertPoint(segmentDone);
    1141 
    1142 }
    1143 
    1144 //bool MultiBlockKernel::requiresCopyBack(const ProcessingRate & rate) const {
    1145 //    if (rate.isBounded() || rate.isUnknown()) {
    1146 //        return true;
    1147 //    } else if (rate.isDirectlyRelative()) {
    1148 //        Port port; unsigned i;
    1149 //        std::tie(port, i) = getStreamPort(rate.getReference());
    1150 //        const auto & binding = (port == Port::Input) ? mStreamSetInputs[i] : mStreamSetOutputs[i];
    1151 //        return requiresCopyBack(binding.getRate());
    1152 //    }
    1153 //    return false;
    1154 //}
    1155 
    1156 //  The default doSegment method dispatches to the doBlock routine for
    1157 //  each block of the given number of blocksToDo, and then updates counts.
    1158 
    1159 void BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * const numOfStrides) {
    1160 
    1161     BasicBlock * const entryBlock = idb->GetInsertBlock();
    1162     BasicBlock * const strideLoopCond = idb->CreateBasicBlock(getName() + "_strideLoopCond");
    1163     mStrideLoopBody = idb->CreateBasicBlock(getName() + "_strideLoopBody");
    1164     BasicBlock * const stridesDone = idb->CreateBasicBlock(getName() + "_stridesDone");
    1165     BasicBlock * const doFinalBlock = idb->CreateBasicBlock(getName() + "_doFinalBlock");
    1166     BasicBlock * const segmentDone = idb->CreateBasicBlock(getName() + "_segmentDone");
    1167 
    1168     Value * baseTarget = nullptr;
    1169     if (idb->supportsIndirectBr()) {
    1170         baseTarget = idb->CreateSelect(mIsFinal, BlockAddress::get(doFinalBlock), BlockAddress::get(segmentDone));
    1171     }
    1172 
    1173     Constant * const log2BlockSize = idb->getSize(std::log2(idb->getBitBlockWidth()));
    1174 
     1137        b->setProducedItemCount(name, produced);
     1138    }
     1139
     1140}
     1141
     1142/** ------------------------------------------------------------------------------------------------------------- *
     1143 * @brief generateMultiBlockLogic
     1144 ** ------------------------------------------------------------------------------------------------------------- */
     1145Value * BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
     1146
     1147    if (LLVM_UNLIKELY(mStride != b->getBitBlockWidth())) {
     1148        report_fatal_error(getName() + ": the Stride (" + std::to_string(mStride) + ") of BlockOrientedKernel "
     1149                           "equal to the BitBlockWidth (" + std::to_string(b->getBitBlockWidth()) + ")");
     1150    }
     1151
     1152    Constant * const LOG_2_BLOCK_WIDTH = b->getSize(std::log2(b->getBitBlockWidth()));
     1153
     1154    BasicBlock * const entryBlock = b->GetInsertBlock();
     1155    mStrideLoopBody = b->CreateBasicBlock(getName() + "_strideLoopBody");
     1156    BasicBlock * const stridesDone = b->CreateBasicBlock(getName() + "_stridesDone");
     1157    BasicBlock * const doFinalBlock = b->CreateBasicBlock(getName() + "_doFinalBlock");
     1158    BasicBlock * const segmentDone = b->CreateBasicBlock(getName() + "_segmentDone");
     1159    b->CreateAssert(b->CreateXor(b->CreateIsNotNull(numOfBlocks), mIsFinal),
     1160                    "numOfStrides cannot be 0 unless this is the final stride and must be 0 if it is");
    11751161    const auto inputSetCount = mStreamSetInputs.size();
    11761162    Value * baseProcessedIndex[inputSetCount];
    1177     for (unsigned i = 0; i < inputSetCount; ++i) {
     1163    Value * baseInputAddress[inputSetCount];
     1164    for (unsigned i = 0; i < inputSetCount; i++) {
    11781165        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    1179         if (rate.isFixed()) {
    1180             baseProcessedIndex[i] = nullptr;
    1181         } else {
    1182             Value * ic = idb->getProcessedItemCount(mStreamSetInputs[i].getName());
    1183             ic = idb->CreateLShr(ic, log2BlockSize);
    1184             baseProcessedIndex[i] = ic;
    1185         }
     1166        if (LLVM_UNLIKELY(!rate.isFixed())) {
     1167            Value * const ic = mInitialProcessedItemCount[i];
     1168            baseProcessedIndex[i] = b->CreateLShr(ic, LOG_2_BLOCK_WIDTH);
     1169        }
     1170        baseInputAddress[i] = mStreamSetInputBaseAddress[i];
    11861171    }
    11871172
    11881173    const auto outputSetCount = mStreamSetOutputs.size();
    11891174    Value * baseProducedIndex[outputSetCount];
     1175    Value * baseOutputAddress[inputSetCount];
     1176    for (unsigned i = 0; i < outputSetCount; i++) {
     1177        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     1178        if (LLVM_UNLIKELY(!rate.isFixed())) {
     1179            Value * const ic = b->getProducedItemCount(mStreamSetOutputs[i].getName());
     1180            baseProducedIndex[i] = b->CreateLShr(ic, LOG_2_BLOCK_WIDTH);
     1181        }
     1182        baseOutputAddress[i] = mStreamSetOutputBaseAddress[i];
     1183    }
     1184
     1185    b->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, mStrideLoopBody);
     1186
     1187    /// BLOCK BODY
     1188
     1189    b->SetInsertPoint(mStrideLoopBody);
     1190
     1191    if (b->supportsIndirectBr()) {
     1192        Value * const baseTarget = BlockAddress::get(segmentDone);
     1193        mStrideLoopTarget = b->CreatePHI(baseTarget->getType(), 2, "strideTarget");
     1194        mStrideLoopTarget->addIncoming(baseTarget, entryBlock);
     1195    }
     1196
     1197    mStrideBlockIndex = b->CreatePHI(b->getSizeTy(), 2);
     1198    mStrideBlockIndex->addIncoming(b->getSize(0), entryBlock);
     1199
     1200    /// GENERATE DO BLOCK METHOD
     1201
     1202    for (unsigned i = 0; i < inputSetCount; ++i) {
     1203        Value * index = mStrideBlockIndex;
     1204        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     1205        if (LLVM_UNLIKELY(!rate.isFixed())) {
     1206            Value * ic = b->getProcessedItemCount(mStreamSetInputs[i].getName());
     1207            index = b->CreateSub(b->CreateLShr(ic, LOG_2_BLOCK_WIDTH), baseProcessedIndex[i]);
     1208        }
     1209        mStreamSetInputBaseAddress[i] = b->CreateGEP(mStreamSetInputBaseAddress[i], index);
     1210    }
     1211
    11901212    for (unsigned i = 0; i < outputSetCount; ++i) {
     1213        Value * index = mStrideBlockIndex;
    11911214        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    1192         if (rate.isFixed()) {
    1193             baseProducedIndex[i] = nullptr;
    1194         } else {
    1195             Value * ic = idb->getProducedItemCount(mStreamSetOutputs[i].getName());
    1196             ic = idb->CreateLShr(ic, log2BlockSize);
    1197             baseProducedIndex[i] = ic;
    1198         }
    1199     }
    1200 
    1201     Value * const numOfBlocksToProcess = idb->CreateMul(numOfStrides, idb->getSize(mStride / idb->getBitBlockWidth()));
    1202 
    1203     idb->CreateBr(strideLoopCond);
    1204 
    1205     /// BLOCK COND
    1206 
    1207     idb->SetInsertPoint(strideLoopCond);
    1208 
    1209     PHINode * branchTarget = nullptr;
    1210     if (baseTarget) {
    1211         branchTarget = idb->CreatePHI(baseTarget->getType(), 2, "branchTarget");
    1212         branchTarget->addIncoming(baseTarget, entryBlock);
    1213     }
    1214 
    1215     PHINode * const blockIndex = idb->CreatePHI(idb->getSizeTy(), 2, "index");
    1216     blockIndex->addIncoming(idb->getSize(0), entryBlock);
    1217 
    1218     for (unsigned i = 0; i < inputSetCount; ++i) {
    1219         Value * offset = blockIndex;
    1220         if (baseProcessedIndex[i]) {
    1221             offset = idb->getProcessedItemCount(mStreamSetInputs[i].getName());
    1222             offset = idb->CreateLShr(offset, log2BlockSize);
    1223             offset = idb->CreateSub(offset, baseProcessedIndex[i]);
    1224         }
    1225         mStreamSetInputBufferPtr[i] = idb->CreateGEP(mStreamSetInputBufferPtr[i], offset);
    1226     }
    1227 
    1228     for (unsigned i = 0; i < outputSetCount; ++i) {
    1229         Value * offset = blockIndex;
    1230         if (baseProducedIndex[i]) {
    1231             offset = idb->getProducedItemCount(mStreamSetOutputs[i].getName());
    1232             offset = idb->CreateLShr(offset, log2BlockSize);
    1233             offset = idb->CreateSub(offset, baseProducedIndex[i]);
    1234         }
    1235         mStreamSetOutputBufferPtr[i] = idb->CreateGEP(mStreamSetOutputBufferPtr[i], offset);
    1236     }
    1237 
    1238     Value * const notDone = idb->CreateICmpULT(blockIndex, numOfBlocksToProcess);
    1239     idb->CreateLikelyCondBr(notDone, mStrideLoopBody, stridesDone);
    1240 
    1241     /// BLOCK BODY
    1242 
    1243     idb->SetInsertPoint(mStrideLoopBody);
    1244 
    1245     if (idb->supportsIndirectBr()) {
    1246         mStrideLoopTarget = idb->CreatePHI(baseTarget->getType(), 2, "strideTarget");
    1247         mStrideLoopTarget->addIncoming(branchTarget, strideLoopCond);
    1248     }
    1249 
    1250     /// GENERATE DO BLOCK METHOD
    1251 
    1252     writeDoBlockMethod(idb);
    1253 
    1254     BasicBlock * const bodyEnd = idb->GetInsertBlock();
    1255     blockIndex->addIncoming(idb->CreateAdd(blockIndex, idb->getSize(1)), bodyEnd);
    1256     if (branchTarget) {
    1257         branchTarget->addIncoming(mStrideLoopTarget, bodyEnd);
    1258     }
    1259     idb->CreateBr(strideLoopCond);
     1215        if (LLVM_UNLIKELY(!rate.isFixed())) {
     1216            Value * ic = b->getProducedItemCount(mStreamSetOutputs[i].getName());
     1217            index = b->CreateSub(b->CreateLShr(ic, LOG_2_BLOCK_WIDTH), baseProducedIndex[i]);
     1218        }
     1219        mStreamSetOutputBaseAddress[i] = b->CreateGEP(mStreamSetOutputBaseAddress[i], index);
     1220    }
     1221
     1222    writeDoBlockMethod(b);
     1223
     1224    BasicBlock * const bodyEnd = b->GetInsertBlock();
     1225    if (mStrideLoopTarget) {
     1226        mStrideLoopTarget->addIncoming(mStrideLoopTarget, bodyEnd);
     1227    }
     1228
     1229    Value * const nextIndex = b->CreateAdd(mStrideBlockIndex, b->getSize(1));
     1230    mStrideBlockIndex->addIncoming(nextIndex, bodyEnd);
     1231    Value * const notDone = b->CreateICmpULT(nextIndex, numOfBlocks);
     1232    b->CreateCondBr(notDone, mStrideLoopBody, stridesDone);
    12601233
    12611234    stridesDone->moveAfter(bodyEnd);
     
    12631236    /// STRIDE DONE
    12641237
    1265     idb->SetInsertPoint(stridesDone);
     1238    b->SetInsertPoint(stridesDone);
    12661239
    12671240    // Now conditionally perform the final block processing depending on the doFinal parameter.
    1268     if (branchTarget) {
    1269         mStrideLoopBranch = idb->CreateIndirectBr(branchTarget, 3);
     1241    if (mStrideLoopTarget) {
     1242        mStrideLoopBranch = b->CreateIndirectBr(mStrideLoopTarget, 3);
    12701243        mStrideLoopBranch->addDestination(doFinalBlock);
    12711244        mStrideLoopBranch->addDestination(segmentDone);
    12721245    } else {
    1273         idb->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone);
     1246        b->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone);
    12741247    }
    12751248
    12761249    doFinalBlock->moveAfter(stridesDone);
    12771250
    1278     idb->SetInsertPoint(doFinalBlock);
    1279 
    1280     Value * remainingItems = nullptr;
     1251    /// DO FINAL BLOCK
     1252
     1253    b->SetInsertPoint(doFinalBlock);
    12811254    for (unsigned i = 0; i < inputSetCount; ++i) {
    1282         const ProcessingRate & r = mStreamSetInputs[i].getRate();
    1283         if (r.isFixed()) {
    1284             Value * ic = idb->CreateUDiv(mAvailableItemCount[i], idb->getSize(r.getRate()));
    1285             if (remainingItems) {
    1286                 remainingItems = idb->CreateUMax(remainingItems, ic);
    1287             } else {
    1288                 remainingItems = ic;
    1289             }
    1290         }
    1291     }
    1292 
    1293     writeFinalBlockMethod(idb, remainingItems);
    1294 
    1295     idb->CreateBr(segmentDone);
    1296 
    1297     segmentDone->moveAfter(idb->GetInsertBlock());
    1298 
    1299     idb->SetInsertPoint(segmentDone);
     1255        mStreamSetInputBaseAddress[i] = baseInputAddress[i];
     1256    }
     1257
     1258    for (unsigned i = 0; i < outputSetCount; ++i) {
     1259        mStreamSetOutputBaseAddress[i] = baseOutputAddress[i];
     1260    }
     1261
     1262    writeFinalBlockMethod(b, getRemainingItems(b));
     1263
     1264    b->CreateBr(segmentDone);
     1265
     1266    segmentDone->moveAfter(b->GetInsertBlock());
     1267
     1268    b->SetInsertPoint(segmentDone);
    13001269
    13011270    // Update the branch prediction metadata to indicate that the likely target will be segmentDone
    1302     if (branchTarget) {
    1303         MDBuilder mdb(idb->getContext());
     1271    if (mStrideLoopTarget) {
     1272        MDBuilder mdb(b->getContext());
    13041273        const auto destinations = mStrideLoopBranch->getNumDestinations();
    13051274        uint32_t weights[destinations];
     
    13111280    }
    13121281
    1313 }
    1314 
    1315 inline void BlockOrientedKernel::writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb) {
     1282    return numOfBlocks;
     1283}
     1284
     1285/** ------------------------------------------------------------------------------------------------------------- *
     1286 * @brief getRemainingItems
     1287 ** ------------------------------------------------------------------------------------------------------------- */
     1288Value * BlockOrientedKernel::getRemainingItems(const std::unique_ptr<KernelBuilder> & b) {
     1289    Value * remainingItems = nullptr;
     1290    const auto count = mStreamSetInputs.size();
     1291    if (count == 1) {
     1292        return mAvailableItemCount[0];
     1293    } else {
     1294        for (unsigned i = 0; i < count; i++) {
     1295            if (mStreamSetInputs[i].isPrincipal()) {
     1296                return mAvailableItemCount[i];
     1297            }
     1298        }
     1299        for (unsigned i = 0; i < count; ++i) {
     1300            const ProcessingRate & r = mStreamSetInputs[i].getRate();
     1301            if (r.isFixed()) {
     1302                Value * ic = CreateUDivCeil(b, mAvailableItemCount[i], r.getRate());
     1303                if (remainingItems) {
     1304                    remainingItems = b->CreateUMin(remainingItems, ic);
     1305                } else {
     1306                    remainingItems = ic;
     1307                }
     1308            }
     1309        }
     1310    }
     1311    return remainingItems;
     1312}
     1313
     1314/** ------------------------------------------------------------------------------------------------------------- *
     1315 * @brief writeDoBlockMethod
     1316 ** ------------------------------------------------------------------------------------------------------------- */
     1317inline void BlockOrientedKernel::writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    13161318
    13171319    Value * const self = getInstance();
    13181320    Function * const cp = mCurrentMethod;
    1319     auto ip = idb->saveIP();
     1321    auto ip = b->saveIP();
    13201322    std::vector<Value *> availableItemCount(0);
    13211323
    13221324    /// Check if the do block method is called and create the function if necessary
    1323     if (!idb->supportsIndirectBr()) {
     1325    if (!b->supportsIndirectBr()) {
    13241326
    13251327        std::vector<Type *> params;
     
    13301332        }
    13311333
    1332         FunctionType * const type = FunctionType::get(idb->getVoidTy(), params, false);
    1333         mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, idb->getModule());
     1334        FunctionType * const type = FunctionType::get(b->getVoidTy(), params, false);
     1335        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, b->getModule());
    13341336        mCurrentMethod->setCallingConv(CallingConv::C);
    13351337        mCurrentMethod->setDoesNotThrow();
     
    13431345        assert (availableItemCount.size() == mAvailableItemCount.size());
    13441346        mAvailableItemCount.swap(availableItemCount);
    1345         idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
    1346     }
    1347 
    1348     generateDoBlockMethod(idb); // must be implemented by the BlockOrientedKernelBuilder subtype
    1349 
    1350     if (!idb->supportsIndirectBr()) {
     1347        b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
     1348    }
     1349
     1350    generateDoBlockMethod(b); // must be implemented by the BlockOrientedKernelBuilder subtype
     1351
     1352    if (!b->supportsIndirectBr()) {
    13511353        // Restore the DoSegment function state then call the DoBlock method
    1352         idb->CreateRetVoid();
     1354        b->CreateRetVoid();
    13531355        mDoBlockMethod = mCurrentMethod;
    1354         idb->restoreIP(ip);
     1356        b->restoreIP(ip);
    13551357        setInstance(self);
    13561358        mCurrentMethod = cp;
    13571359        mAvailableItemCount.swap(availableItemCount);
    1358         CreateDoBlockMethodCall(idb);
    1359     }
    1360 
    1361 }
    1362 
    1363 inline void BlockOrientedKernel::writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * remainingItems) {
     1360        CreateDoBlockMethodCall(b);
     1361    }
     1362
     1363}
     1364
     1365/** ------------------------------------------------------------------------------------------------------------- *
     1366 * @brief writeFinalBlockMethod
     1367 ** ------------------------------------------------------------------------------------------------------------- */
     1368inline void BlockOrientedKernel::writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingItems) {
    13641369
    13651370    Value * const self = getInstance();
    13661371    Function * const cp = mCurrentMethod;
    13671372    Value * const remainingItemCount = remainingItems;
    1368     auto ip = idb->saveIP();
     1373    auto ip = b->saveIP();
    13691374    std::vector<Value *> availableItemCount(0);
    13701375
    1371     if (!idb->supportsIndirectBr()) {
     1376    if (!b->supportsIndirectBr()) {
    13721377        std::vector<Type *> params;
    13731378        params.reserve(2 + mAvailableItemCount.size());
    13741379        params.push_back(self->getType());
    1375         params.push_back(idb->getSizeTy());
     1380        params.push_back(b->getSizeTy());
    13761381        for (Value * avail : mAvailableItemCount) {
    13771382            params.push_back(avail->getType());
    13781383        }
    1379         FunctionType * const type = FunctionType::get(idb->getVoidTy(), params, false);
    1380         mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, idb->getModule());
     1384        FunctionType * const type = FunctionType::get(b->getVoidTy(), params, false);
     1385        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, b->getModule());
    13811386        mCurrentMethod->setCallingConv(CallingConv::C);
    13821387        mCurrentMethod->setDoesNotThrow();
     
    13921397        assert (availableItemCount.size() == mAvailableItemCount.size());
    13931398        mAvailableItemCount.swap(availableItemCount);
    1394         idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
    1395     }
    1396 
    1397     generateFinalBlockMethod(idb, remainingItems); // may be implemented by the BlockOrientedKernel subtype
    1398 
    1399     if (!idb->supportsIndirectBr()) {
    1400         idb->CreateRetVoid();
    1401         idb->restoreIP(ip);
     1399        b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
     1400    }
     1401
     1402    generateFinalBlockMethod(b, remainingItems); // may be implemented by the BlockOrientedKernel subtype
     1403
     1404    if (!b->supportsIndirectBr()) {
     1405        b->CreateRetVoid();
     1406        b->restoreIP(ip);
    14021407        setInstance(self);
    14031408        mAvailableItemCount.swap(availableItemCount);
     
    14071412        args.push_back(self);
    14081413        args.push_back(remainingItemCount);
    1409         for (Value * avail : mAvailableItemCount) {
    1410             args.push_back(avail);
    1411         }
    1412         idb->CreateCall(mCurrentMethod, args);
     1414        args.insert(args.end(), mAvailableItemCount.begin(), mAvailableItemCount.end());
     1415        b->CreateCall(mCurrentMethod, args);
    14131416        mCurrentMethod = cp;
    14141417    }
     
    14161419}
    14171420
    1418 //  The default finalBlock method simply dispatches to the doBlock routine.
    1419 void BlockOrientedKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * /* remainingItems */) {
    1420     CreateDoBlockMethodCall(idb);
    1421 }
    1422 
    1423 void BlockOrientedKernel::CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & idb) {
    1424     if (idb->supportsIndirectBr()) {
    1425         BasicBlock * bb = idb->CreateBasicBlock("resume");
     1421/** ------------------------------------------------------------------------------------------------------------- *
     1422 * @brief generateFinalBlockMethod
     1423 ** ------------------------------------------------------------------------------------------------------------- */
     1424void BlockOrientedKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * /* remainingItems */) {
     1425    //  The default finalBlock method simply dispatches to the doBlock routine.
     1426    CreateDoBlockMethodCall(b);
     1427}
     1428
     1429void BlockOrientedKernel::CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & b) {
     1430    if (b->supportsIndirectBr()) {
     1431        BasicBlock * const bb = b->CreateBasicBlock("resume");
    14261432        mStrideLoopBranch->addDestination(bb);
    1427         mStrideLoopTarget->addIncoming(BlockAddress::get(bb), idb->GetInsertBlock());
    1428         idb->CreateBr(mStrideLoopBody);
    1429         bb->moveAfter(idb->GetInsertBlock());
    1430         idb->SetInsertPoint(bb);
     1433        BasicBlock * const current = b->GetInsertBlock();
     1434        mStrideLoopTarget->addIncoming(BlockAddress::get(bb), current);
     1435        mStrideBlockIndex->addIncoming(b->getSize(0), current);
     1436        b->CreateBr(mStrideLoopBody);
     1437        bb->moveAfter(current);
     1438        b->SetInsertPoint(bb);
    14311439    } else {
    14321440        std::vector<Value *> args;
    14331441        args.reserve(1 + mAvailableItemCount.size());
    14341442        args.push_back(getInstance());
    1435         for (Value * avail : mAvailableItemCount) {
    1436             args.push_back(avail);
    1437         }
    1438         idb->CreateCall(mDoBlockMethod, args);
     1443        args.insert(args.end(), mAvailableItemCount.begin(), mAvailableItemCount.end());
     1444        b->CreateCall(mDoBlockMethod, args);
    14391445    }
    14401446}
    14411447
    14421448static inline std::string annotateKernelNameWithDebugFlags(std::string && name) {
    1443     if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
     1449    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    14441450        name += "_EA";
    14451451    }
     
    14501456// CONSTRUCTOR
    14511457Kernel::Kernel(std::string && kernelName,
    1452                std::vector<Binding> && stream_inputs,
    1453                std::vector<Binding> && stream_outputs,
    1454                std::vector<Binding> && scalar_parameters,
    1455                std::vector<Binding> && scalar_outputs,
    1456                std::vector<Binding> && internal_scalars)
     1458               Bindings && stream_inputs,
     1459               Bindings && stream_outputs,
     1460               Bindings && scalar_parameters,
     1461               Bindings && scalar_outputs,
     1462               Bindings && internal_scalars)
    14571463: KernelInterface(annotateKernelNameWithDebugFlags(std::move(kernelName))
    14581464                  , std::move(stream_inputs), std::move(stream_outputs)
     
    14601466                  , std::move(internal_scalars))
    14611467, mCurrentMethod(nullptr)
    1462 , mAvailablePrincipleItemCount(nullptr)
     1468, mAvailablePrincipalItemCount(nullptr)
    14631469, mNoTerminateAttribute(false)
    14641470, mIsGenerated(false)
     
    14731479}
    14741480
     1481// MULTI-BLOCK KERNEL CONSTRUCTOR
     1482MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
     1483                                   Bindings && stream_inputs,
     1484                                   Bindings && stream_outputs,
     1485                                   Bindings && scalar_parameters,
     1486                                   Bindings && scalar_outputs,
     1487                                   Bindings && internal_scalars)
     1488: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
     1489
     1490}
     1491
    14751492// CONSTRUCTOR
    14761493BlockOrientedKernel::BlockOrientedKernel(std::string && kernelName,
    1477                                          std::vector<Binding> && stream_inputs,
    1478                                          std::vector<Binding> && stream_outputs,
    1479                                          std::vector<Binding> && scalar_parameters,
    1480                                          std::vector<Binding> && scalar_outputs,
    1481                                          std::vector<Binding> && internal_scalars)
     1494                                         Bindings && stream_inputs,
     1495                                         Bindings && stream_outputs,
     1496                                         Bindings && scalar_parameters,
     1497                                         Bindings && scalar_outputs,
     1498                                         Bindings && internal_scalars)
    14821499: MultiBlockKernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
    14831500, mDoBlockMethod(nullptr)
    14841501, mStrideLoopBody(nullptr)
    14851502, mStrideLoopBranch(nullptr)
    1486 , mStrideLoopTarget(nullptr) {
    1487 
    1488 }
    1489 
    1490 // MULTI-BLOCK KERNEL CONSTRUCTOR
    1491 MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
    1492                                    std::vector<Binding> && stream_inputs,
    1493                                    std::vector<Binding> && stream_outputs,
    1494                                    std::vector<Binding> && scalar_parameters,
    1495                                    std::vector<Binding> && scalar_outputs,
    1496                                    std::vector<Binding> && internal_scalars)
    1497 : Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
     1503, mStrideLoopTarget(nullptr)
     1504, mStrideBlockIndex(nullptr) {
    14981505
    14991506}
     
    15011508// CONSTRUCTOR
    15021509SegmentOrientedKernel::SegmentOrientedKernel(std::string && kernelName,
    1503                                              std::vector<Binding> && stream_inputs,
    1504                                              std::vector<Binding> && stream_outputs,
    1505                                              std::vector<Binding> && scalar_parameters,
    1506                                              std::vector<Binding> && scalar_outputs,
    1507                                              std::vector<Binding> && internal_scalars)
     1510                                             Bindings && stream_inputs,
     1511                                             Bindings && stream_outputs,
     1512                                             Bindings && scalar_parameters,
     1513                                             Bindings && scalar_outputs,
     1514                                             Bindings && internal_scalars)
    15081515: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
    15091516
Note: See TracChangeset for help on using the changeset viewer.