Ignore:
Timestamp:
Oct 25, 2017, 4:57:58 PM (2 years ago)
Author:
nmedfort
Message:

First stage of MultiBlockKernel and pipeline restructuring

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5688 r5706  
    1818#include <sstream>
    1919#include <kernels/kernel_builder.h>
     20#include <boost/math/common_factor_rt.hpp>
    2021#include <llvm/Support/Debug.h>
    2122
    2223using namespace llvm;
    2324using namespace parabix;
     25using namespace boost::math;
    2426
    2527namespace kernel {
     
    3739const std::string Kernel::CYCLECOUNT_SCALAR = "CPUcycles";
    3840
     41/** ------------------------------------------------------------------------------------------------------------- *
     42 * @brief addScalar
     43 ** ------------------------------------------------------------------------------------------------------------- */
    3944unsigned Kernel::addScalar(Type * const type, const std::string & name) {
    4045    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
     
    5055}
    5156
     57
     58/** ------------------------------------------------------------------------------------------------------------- *
     59 * @brief addUnnamedScalar
     60 ** ------------------------------------------------------------------------------------------------------------- */
    5261unsigned Kernel::addUnnamedScalar(Type * const type) {
    5362    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
     
    5968}
    6069
     70
     71/** ------------------------------------------------------------------------------------------------------------- *
     72 * @brief prepareStreamSetNameMap
     73 ** ------------------------------------------------------------------------------------------------------------- */
    6174void Kernel::prepareStreamSetNameMap() {
    6275    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
    63         mStreamMap.emplace(mStreamSetInputs[i].name, std::make_pair(Port::Input, i));
     76        mStreamMap.emplace(mStreamSetInputs[i].getName(), std::make_pair(Port::Input, i));
    6477    }
    6578    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    66         mStreamMap.emplace(mStreamSetOutputs[i].name, std::make_pair(Port::Output, i));
    67     }
    68 }
    69 
     79        mStreamMap.emplace(mStreamSetOutputs[i].getName(), std::make_pair(Port::Output, i));
     80    }
     81}
     82
     83
     84/** ------------------------------------------------------------------------------------------------------------- *
     85 * @brief bindPorts
     86 ** ------------------------------------------------------------------------------------------------------------- */
    7087void Kernel::bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs) {
    7188    assert (mModule == nullptr);
     
    111128}
    112129
     130
     131/** ------------------------------------------------------------------------------------------------------------- *
     132 * @brief getCacheName
     133 ** ------------------------------------------------------------------------------------------------------------- */
    113134std::string Kernel::getCacheName(const std::unique_ptr<KernelBuilder> & idb) const {
    114135    std::stringstream cacheName;
     
    123144}
    124145
     146
     147/** ------------------------------------------------------------------------------------------------------------- *
     148 * @brief setModule
     149 ** ------------------------------------------------------------------------------------------------------------- */
    125150Module * Kernel::setModule(Module * const module) {
    126151    assert (mModule == nullptr || mModule == module);
     
    130155}
    131156
     157
     158/** ------------------------------------------------------------------------------------------------------------- *
     159 * @brief makeModule
     160 ** ------------------------------------------------------------------------------------------------------------- */
    132161Module * Kernel::makeModule(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    133162    return setModule(new Module(getCacheName(idb), idb->getContext()));
    134163}
    135164
     165
     166/** ------------------------------------------------------------------------------------------------------------- *
     167 * @brief prepareKernel
     168 ** ------------------------------------------------------------------------------------------------------------- */
    136169void Kernel::prepareKernel(const std::unique_ptr<KernelBuilder> & idb) {
    137170    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     
    139172        report_fatal_error(getName() + ": cannot prepare kernel after kernel state finalized");
    140173    }
    141     const auto blockSize = idb->getBitBlockWidth();
     174    addBaseKernelProperties(idb);
     175    addInternalKernelProperties(idb);
     176    // NOTE: StructType::create always creates a new type even if an identical one exists.
     177    if (LLVM_UNLIKELY(mModule == nullptr)) {
     178        setModule(new Module(getCacheName(idb), idb->getContext()));
     179    }
     180    mKernelStateType = mModule->getTypeByName(getName());
     181    if (LLVM_LIKELY(mKernelStateType == nullptr)) {
     182        mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
     183        assert (mKernelStateType);
     184    }   
     185}
     186
     187
     188/** ------------------------------------------------------------------------------------------------------------- *
     189 * @brief prepareCachedKernel
     190 ** ------------------------------------------------------------------------------------------------------------- */
     191void Kernel::prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb) {
     192    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     193    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
     194        report_fatal_error(getName() + ": cannot prepare kernel after kernel state finalized");
     195    }
     196    assert (getModule());
     197    addBaseKernelProperties(idb);
     198    mKernelStateType = getModule()->getTypeByName(getName());
     199    if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
     200        report_fatal_error("Kernel definition for " + getName() + " could not be found in the cache object");
     201    }   
     202}
     203
     204/** ------------------------------------------------------------------------------------------------------------- *
     205 * @brief getStreamRate
     206 ** ------------------------------------------------------------------------------------------------------------- */
     207std::pair<unsigned, unsigned> Kernel::getStreamRate(const Port p, const unsigned i) const {
     208    const ProcessingRate & rate = (p == Port::Input) ? mStreamSetInputs[i].getRate() : mStreamSetOutputs[i].getRate();
     209    unsigned min = 0, max = 0;
     210    if (rate.isFixed()) {
     211        min = max = rate.getRate();
     212    } else if (rate.isBounded()) {
     213        min = rate.getLowerBound();
     214        max = rate.getUpperBound();
     215    } else if (rate.isUnknown()) {
     216        min = rate.getLowerBound();
     217        max = 0;
     218    } else if (rate.isExactlyRelative()) {
     219        for (unsigned j = 0; j < mStreamSetInputs.size(); ++j) {
     220            if (mStreamSetInputs[j].getName() == rate.getReference()) {
     221                std::tie(min, max) = getStreamRate(Port::Input, j);
     222                min = (min * rate.getNumerator()) / rate.getDenominator();
     223                assert (max == 0 || (max * rate.getNumerator()) % rate.getDenominator() == 0);
     224                max = (max * rate.getNumerator()) / rate.getDenominator();
     225                return std::make_pair(min, max);
     226            }
     227        }
     228        for (unsigned j = 0; j < mStreamSetOutputs.size(); ++j) {
     229            if (mStreamSetOutputs[j].getName() == rate.getReference()) {
     230                assert (p == Port::Output);
     231                std::tie(min, max) = getStreamRate(Port::Output, j);
     232                min = (min * rate.getNumerator()) / rate.getDenominator();
     233                assert (max == 0 || (max * rate.getNumerator()) % rate.getDenominator() == 0);
     234                max = (max * rate.getNumerator()) / rate.getDenominator();
     235                return std::make_pair(min, max);
     236            }
     237        }
     238        llvm_unreachable("Reference rate must be associated with an input or output!");
     239    }
     240    return std::make_pair(min, max);
     241}
     242
     243/** ------------------------------------------------------------------------------------------------------------- *
     244 * @brief addBaseKernelProperties
     245 ** ------------------------------------------------------------------------------------------------------------- */
     246void Kernel::addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb) {
     247   
     248    const unsigned inputSetCount = mStreamSetInputs.size();
     249    const unsigned outputSetCount = mStreamSetOutputs.size();
     250   
     251    assert (inputSetCount == mStreamSetInputBuffers.size());
     252    assert (outputSetCount == mStreamSetOutputBuffers.size());
     253
    142254    if (mStride == 0) {
    143255        // Set the default kernel stride.
    144         mStride = blockSize;
    145     }
     256        mStride = idb->getBitBlockWidth();
     257    }
     258
    146259    IntegerType * const sizeTy = idb->getSizeTy();
    147260
    148     assert (mStreamSetInputs.size() == mStreamSetInputBuffers.size());
    149 //    assert (mStreamSetInputs.size() == mStreamSetInputLookahead.size());
    150 
    151     for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
    152 //        const auto requiredBlocks = codegen::SegmentSize + ((mStreamSetInputLookahead[i] + blockSize - 1) / blockSize);
    153 //        if ((mStreamSetInputBuffers[i]->getBufferBlocks() != 0) && (mStreamSetInputBuffers[i]->getBufferBlocks() < requiredBlocks)) {
    154 //            report_fatal_error(getName() + ": " + mStreamSetInputs[i].name + " requires buffer size " + std::to_string(requiredBlocks));
    155 //        }
    156         mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetInputs[i].name + BUFFER_PTR_SUFFIX);
    157         if ((i == 0) || !mStreamSetInputs[i].rate.isExact()) {
    158             addScalar(sizeTy, mStreamSetInputs[i].name + PROCESSED_ITEM_COUNT_SUFFIX);
    159         }
    160     }
    161 
    162     assert (mStreamSetOutputs.size() == mStreamSetOutputBuffers.size());
    163 
    164     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    165         mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetOutputs[i].name + BUFFER_PTR_SUFFIX);
    166         if ((mStreamSetInputs.empty() && (i == 0)) || !mStreamSetOutputs[i].rate.isExact()) {
    167             addScalar(sizeTy, mStreamSetOutputs[i].name + PRODUCED_ITEM_COUNT_SUFFIX);
    168         }
     261    for (unsigned i = 0; i < inputSetCount; i++) {
     262        const Binding & b = mStreamSetInputs[i];
     263        //const ProcessingRate & rate = b.getRate();
     264        //if (rate.isBounded() || rate.isUnknown()) {
     265            addScalar(sizeTy, b.getName() + PROCESSED_ITEM_COUNT_SUFFIX);
     266        //}
     267    }
     268
     269    for (unsigned i = 0; i < outputSetCount; i++) {
     270        const Binding & b = mStreamSetOutputs[i];
     271        //const ProcessingRate & rate = b.getRate();
     272        //if (rate.isBounded() || rate.isUnknown()) {
     273            addScalar(sizeTy, b.getName() + PRODUCED_ITEM_COUNT_SUFFIX);
     274        //}
     275    }
     276
     277    for (unsigned i = 0; i < inputSetCount; i++) {
     278        mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetInputs[i].getName() + BUFFER_PTR_SUFFIX);
     279    }
     280    for (unsigned i = 0; i < outputSetCount; i++) {
     281        mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetOutputs[i].getName() + BUFFER_PTR_SUFFIX);
    169282    }
    170283    for (const auto & binding : mScalarInputs) {
    171         addScalar(binding.type, binding.name);
     284        addScalar(binding.getType(), binding.getName());
    172285    }
    173286    for (const auto & binding : mScalarOutputs) {
    174         addScalar(binding.type, binding.name);
     287        addScalar(binding.getType(), binding.getName());
    175288    }
    176289    if (mStreamMap.empty()) {
     
    178291    }
    179292    for (const auto & binding : mInternalScalars) {
    180         addScalar(binding.type, binding.name);
    181     }
    182 
     293        addScalar(binding.getType(), binding.getName());
     294    }
    183295    Type * const consumerSetTy = StructType::get(sizeTy, sizeTy->getPointerTo()->getPointerTo(), nullptr)->getPointerTo();
    184296    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    185         addScalar(consumerSetTy, mStreamSetOutputs[i].name + CONSUMER_SUFFIX);
    186     }
    187 
     297        addScalar(consumerSetTy, mStreamSetOutputs[i].getName() + CONSUMER_SUFFIX);
     298    }
    188299    addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
    189300    addScalar(idb->getInt1Ty(), TERMINATION_SIGNAL);
    190 
    191301    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    192         addScalar(sizeTy, mStreamSetOutputs[i].name + CONSUMED_ITEM_COUNT_SUFFIX);
    193     }
    194 
     302        addScalar(sizeTy, mStreamSetOutputs[i].getName() + CONSUMED_ITEM_COUNT_SUFFIX);
     303    }
    195304    // We compile in a 64-bit CPU cycle counter into every kernel.   It will remain unused
    196305    // in normal execution, but when codegen::EnableCycleCounter is specified, pipelines
    197306    // will be able to add instrumentation to cached modules without recompilation.
    198307    addScalar(idb->getInt64Ty(), CYCLECOUNT_SCALAR);
    199     addInternalKernelProperties(idb);
    200     // NOTE: StructType::create always creates a new type even if an identical one exists.
    201     if (LLVM_UNLIKELY(mModule == nullptr)) {
    202         setModule(new Module(getCacheName(idb), idb->getContext()));
    203     }
    204     mKernelStateType = mModule->getTypeByName(getName());
    205     if (LLVM_LIKELY(mKernelStateType == nullptr)) {
    206         mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
    207         assert (mKernelStateType);
    208     }
    209     processingRateAnalysis();
    210 }
    211 
    212 void Kernel::prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb) {
    213 
    214     assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
    215     if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
    216         report_fatal_error(getName() + ": cannot prepare kernel after kernel state finalized");
    217     }
    218     assert (getModule());
    219     const auto blockSize = idb->getBitBlockWidth();
    220     if (mStride == 0) {
    221         // Set the default kernel stride.
    222         mStride = blockSize;
    223     }
    224     IntegerType * const sizeTy = idb->getSizeTy();
    225 
    226     assert (mStreamSetInputs.size() == mStreamSetInputBuffers.size());
    227 //    assert (mStreamSetInputs.size() == mStreamSetInputLookahead.size());
    228 
    229     for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
    230 //        const auto requiredBlocks = codegen::SegmentSize + ((mStreamSetInputLookahead[i] + blockSize - 1) / blockSize);
    231 //        if ((mStreamSetInputBuffers[i]->getBufferBlocks() != 0) && (mStreamSetInputBuffers[i]->getBufferBlocks() < requiredBlocks)) {
    232 //            report_fatal_error(getName() + ": " + mStreamSetInputs[i].name + " requires buffer size " + std::to_string(requiredBlocks));
    233 //        }
    234         mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetInputs[i].name + BUFFER_PTR_SUFFIX);
    235         if ((i == 0) || !mStreamSetInputs[i].rate.isExact()) {
    236             addScalar(sizeTy, mStreamSetInputs[i].name + PROCESSED_ITEM_COUNT_SUFFIX);
    237         }
    238     }
    239 
    240     assert (mStreamSetOutputs.size() == mStreamSetOutputBuffers.size());
    241 
    242     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    243         mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetOutputs[i].name + BUFFER_PTR_SUFFIX);
    244         if ((mStreamSetInputs.empty() && (i == 0)) || !mStreamSetOutputs[i].rate.isExact()) {
    245             addScalar(sizeTy, mStreamSetOutputs[i].name + PRODUCED_ITEM_COUNT_SUFFIX);
    246         }
    247     }
    248     for (const auto & binding : mScalarInputs) {
    249         addScalar(binding.type, binding.name);
    250     }
    251     for (const auto & binding : mScalarOutputs) {
    252         addScalar(binding.type, binding.name);
    253     }
    254     if (mStreamMap.empty()) {
    255         prepareStreamSetNameMap();
    256     }
    257     for (const auto & binding : mInternalScalars) {
    258         addScalar(binding.type, binding.name);
    259     }
    260     Type * const consumerSetTy = StructType::get(sizeTy, sizeTy->getPointerTo()->getPointerTo(), nullptr)->getPointerTo();
    261     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    262         addScalar(consumerSetTy, mStreamSetOutputs[i].name + CONSUMER_SUFFIX);
    263     }
    264     addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
    265     addScalar(idb->getInt1Ty(), TERMINATION_SIGNAL);
    266     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    267         addScalar(sizeTy, mStreamSetOutputs[i].name + CONSUMED_ITEM_COUNT_SUFFIX);
    268     }
    269     // We compile in a 64-bit CPU cycle counter into every kernel.   It will remain unused
    270     // in normal execution, but when codegen::EnableCycleCounter is specified, pipelines
    271     // will be able to add instrumentation to cached modules without recompilation.
    272     addScalar(idb->getInt64Ty(), CYCLECOUNT_SCALAR);
    273 
    274     mKernelStateType = getModule()->getTypeByName(getName());
    275     if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
    276         report_fatal_error("Kernel " + getName() + " definition could not be found in the cache object");
    277     }
    278     processingRateAnalysis();
    279 }
    280    
    281 void Kernel::processingRateAnalysis() {
    282    
    283     const unsigned inputSetCount = mStreamSetInputs.size();
    284     const unsigned outputSetCount = mStreamSetOutputs.size();
    285     const unsigned totalSetCount = inputSetCount + outputSetCount;
    286    
    287     mItemsPerStride.resize(totalSetCount);
    288     mIsDerived.resize(totalSetCount);
    289 
    290     mItemsPerStride[0] = mStride;
    291     mIsDerived[0] = true;
    292    
    293     for (unsigned i = 0; i < inputSetCount; i++) {
    294         // Default reference stream set is the principal input stream set.
    295         auto & rate = mStreamSetInputs[i].rate;
    296         if (rate.referenceStreamSet() == "") {
    297             rate.setReferenceStreamSet(mStreamSetInputs[0].name);
    298         }
    299         Port port; unsigned ssIdx;
    300         std::tie(port, ssIdx) = getStreamPort(rate.referenceStreamSet());
    301         if ((port == Port::Output) || (ssIdx > i) || ((ssIdx == i) && (i > 0))) {
    302             report_fatal_error(getName() + ": input set " + mStreamSetInputs[i].name + ": forward or circular rate dependency");
    303         }
    304         if ((rate.isExact() || rate.isMaxRatio()) && mIsDerived[ssIdx]) {
    305             if ((mItemsPerStride[ssIdx] % rate.getRatioDenominator()) != 0) {
    306                 report_fatal_error(getName() + ": " + mStreamSetInputs[i].name + " processing rate denominator does not exactly divide items per stride.");
    307             }
    308             mItemsPerStride[i] = rate.calculateRatio(mItemsPerStride[ssIdx]);
    309             mIsDerived[i] = rate.isExact();
    310         }
    311         else {
    312             mIsDerived[i] = false;
    313             mItemsPerStride[i] = 0;  // For unknown input rate, no items will be copied to temp buffers.
    314         }
    315     }
    316    
    317     for (unsigned i = inputSetCount; i < totalSetCount; i++) {
    318         auto & rate = mStreamSetOutputs[i-inputSetCount].rate;
    319         // Default reference stream set is the principal input stream set for the principal output stream set.
    320         // Default reference stream set is the principal output stream set for other output stream sets.
    321         if (rate.referenceStreamSet() == "") {
    322             if ((mStreamSetInputs.size() > 0) && (i == inputSetCount)) {
    323                 rate.setReferenceStreamSet(mStreamSetInputs[0].name);
    324             }
    325             else {
    326                 rate.setReferenceStreamSet(mStreamSetOutputs[0].name);
    327             }
    328         }
    329         Port port; unsigned ssIdx;
    330         std::tie(port, ssIdx) = getStreamPort(rate.referenceStreamSet());
    331         if (port == Port::Output) ssIdx += inputSetCount;
    332         if ((ssIdx > i) || ((ssIdx == i) && (i > 0))) {
    333             report_fatal_error(getName() + ": output set " + mStreamSetOutputs[i].name + ": forward or circular rate dependency");
    334         }
    335         if ((rate.isExact() || rate.isMaxRatio()) && mIsDerived[ssIdx]) {
    336             if ((mItemsPerStride[ssIdx] % rate.getRatioDenominator()) != 0) {
    337                 report_fatal_error(getName() + ": " + mStreamSetOutputs[i-inputSetCount].name + " processing rate denominator does not exactly divide items per stride.");
    338             }
    339             mItemsPerStride[i] = rate.calculateRatio(mItemsPerStride[ssIdx]);
    340             mIsDerived[i] = rate.isExact();
    341         }
    342         else {
    343             mIsDerived[i] = false;
    344             mItemsPerStride[i] = 0;  // For unknown output rate, no items will be copied to temp buffers.
    345         }
    346     }
    347 }
    348 
    349    
    350 
    351 // Default kernel signature: generate the IR and emit as byte code.
     308
     309}
     310
     311
     312/** ------------------------------------------------------------------------------------------------------------- *
     313 * @brief makeSignature
     314 *
     315 * Default kernel signature: generate the IR and emit as byte code.
     316 ** ------------------------------------------------------------------------------------------------------------- */
    352317std::string Kernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    353318    assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
     
    363328}
    364329
     330
     331/** ------------------------------------------------------------------------------------------------------------- *
     332 * @brief generateKernel
     333 ** ------------------------------------------------------------------------------------------------------------- */
    365334void Kernel::generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    366335    assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
     
    383352}
    384353
     354
     355/** ------------------------------------------------------------------------------------------------------------- *
     356 * @brief callGenerateInitializeMethod
     357 ** ------------------------------------------------------------------------------------------------------------- */
    385358inline void Kernel::callGenerateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    386359    mCurrentMethod = getInitFunction(idb->getModule());
     
    390363    idb->CreateStore(ConstantAggregateZero::get(mKernelStateType), getInstance());
    391364    for (const auto & binding : mScalarInputs) {
    392         idb->setScalarField(binding.name, &*(args++));
     365        idb->setScalarField(binding.getName(), &*(args++));
    393366    }
    394367    for (const auto & binding : mStreamSetOutputs) {
    395         idb->setConsumerLock(binding.name, &*(args++));
     368        idb->setConsumerLock(binding.getName(), &*(args++));
    396369    }
    397370    generateInitializeMethod(idb);
     
    399372}
    400373
     374/** ------------------------------------------------------------------------------------------------------------- *
     375 * @brief callGenerateDoSegmentMethod
     376 ** ------------------------------------------------------------------------------------------------------------- */
    401377inline void Kernel::callGenerateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    402378    mCurrentMethod = getDoSegmentFunction(idb->getModule());
     
    405381    setInstance(&*(args++));
    406382    mIsFinal = &*(args++);
     383    mAvailablePrincipleItemCount = nullptr;
     384//    if (mHasPrincipleItemCount) {
     385//        mAvailablePrincipleItemCount = &*(args++);
     386//    }
    407387    const auto n = mStreamSetInputs.size();
    408388    mAvailableItemCount.resize(n, nullptr);
    409     for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
     389    for (unsigned i = 0; i < n; i++) {
     390//        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     391//        Value * itemCount = nullptr;
     392//        if (rate.isFixed()) {
     393//            itemCount = mAvailablePrincipleItemCount;
     394//            if (rate.getRate() != 1) {
     395//                itemCount = idb->CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getRate()));
     396//            }
     397//        } else if (rate.isBounded() || rate.isUnknown()) {
     398//            itemCount = &*(args++);
     399//        } else if (rate.isRelative()) {
     400//            for (unsigned j = 0; j < i; ++j) {
     401//                if (mStreamSetInputs[j].getName() == rate.getReference()) {
     402//                    itemCount = mAvailableItemCount[j];
     403//                    break;
     404//                }
     405//            }
     406//            if (LLVM_UNLIKELY(itemCount == nullptr)) {
     407//                report_fatal_error(mStreamSetInputs[i].getName() + " is declared before " + rate.getReference());
     408//            }
     409//            if (rate.getNumerator() != 1) {
     410//                itemCount = idb->CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getNumerator()));
     411//            }
     412//            if (rate.getDenominator() != 1) {
     413//                itemCount = idb->CreateUDiv(itemCount, ConstantInt::get(itemCount->getType(), rate.getDenominator()));
     414//            }
     415//        }
     416//        assert (itemCount);
     417//        mAvailableItemCount[i] = itemCount;
     418
     419        assert (args != mCurrentMethod->arg_end());
    410420        mAvailableItemCount[i] = &*(args++);
    411421    }
    412     generateDoSegmentMethod(idb); // must be overridden by the KernelBuilder subtype
     422    assert (args == mCurrentMethod->arg_end());
     423
     424    generateKernelMethod(idb); // must be overridden by the Kernel subtype
    413425    mIsFinal = nullptr;
    414426    mAvailableItemCount.clear();
     
    416428}
    417429
     430
     431/** ------------------------------------------------------------------------------------------------------------- *
     432 * @brief callGenerateFinalizeMethod
     433 ** ------------------------------------------------------------------------------------------------------------- */
    418434inline void Kernel::callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb) {
    419435    mCurrentMethod = getTerminateFunction(idb->getModule());
     
    421437    auto args = mCurrentMethod->arg_begin();
    422438    setInstance(&*(args++));
    423     generateFinalizeMethod(idb); // may be overridden by the KernelBuilder subtype
     439    generateFinalizeMethod(idb); // may be overridden by the Kernel subtype
    424440    const auto n = mScalarOutputs.size();
    425441    if (n == 0) {
     
    428444        Value * outputs[n];
    429445        for (unsigned i = 0; i < n; ++i) {
    430             outputs[i] = idb->getScalarField(mScalarOutputs[i].name);
     446            outputs[i] = idb->getScalarField(mScalarOutputs[i].getName());
    431447        }
    432448        if (n == 1) {
     
    438454}
    439455
     456
     457/** ------------------------------------------------------------------------------------------------------------- *
     458 * @brief getScalarIndex
     459 ** ------------------------------------------------------------------------------------------------------------- */
    440460unsigned Kernel::getScalarIndex(const std::string & name) const {
    441461    const auto f = mKernelMap.find(name);
    442462    if (LLVM_UNLIKELY(f == mKernelMap.end())) {
     463        assert (false);
    443464        report_fatal_error(getName() + " does not contain scalar: " + name);
    444465    }
     
    446467}
    447468
     469
     470/** ------------------------------------------------------------------------------------------------------------- *
     471 * @brief createInstance
     472 ** ------------------------------------------------------------------------------------------------------------- */
    448473Value * Kernel::createInstance(const std::unique_ptr<KernelBuilder> & idb) {
    449474    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     
    455480}
    456481
     482
     483/** ------------------------------------------------------------------------------------------------------------- *
     484 * @brief initializeInstance
     485 ** ------------------------------------------------------------------------------------------------------------- */
    457486void Kernel::initializeInstance(const std::unique_ptr<KernelBuilder> & idb) {
    458487    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     
    518547}
    519548
     549/** ------------------------------------------------------------------------------------------------------------- *
     550 * @brief finalizeInstance
     551 ** ------------------------------------------------------------------------------------------------------------- */
     552void Kernel::finalizeInstance(const std::unique_ptr<KernelBuilder> & idb) {
     553    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     554    mOutputScalarResult = idb->CreateCall(getTerminateFunction(idb->getModule()), { getInstance() });
     555}
     556
     557/** ------------------------------------------------------------------------------------------------------------- *
     558 * @brief getStreamPort
     559 ** ------------------------------------------------------------------------------------------------------------- */
     560Kernel::StreamPort Kernel::getStreamPort(const std::string & name) const {
     561    const auto f = mStreamMap.find(name);
     562    if (LLVM_UNLIKELY(f == mStreamMap.end())) {
     563        report_fatal_error(getName() + " does not contain stream set " + name);
     564    }
     565    return f->second;
     566}
     567
     568/** ------------------------------------------------------------------------------------------------------------- *
     569 * @brief generateKernelMethod
     570 ** ------------------------------------------------------------------------------------------------------------- */
     571void SegmentOrientedKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
     572
     573    Constant * const log2BlockWidth = b->getSize(std::log2(b->getBitBlockWidth()));
     574
     575    const auto inputSetCount = mStreamSetInputs.size();
     576    mStreamSetInputBufferPtr.resize(inputSetCount);
     577    for (unsigned i = 0; i < inputSetCount; ++i) {
     578        const auto & name = mStreamSetInputs[i].getName();
     579        Value * ic = b->getProcessedItemCount(name);
     580        Value * const blockIndex = b->CreateLShr(ic, log2BlockWidth);
     581        mStreamSetInputBufferPtr[i] = b->getInputStreamPtr(name, blockIndex);
     582    }
     583
     584    const auto outputSetCount = mStreamSetOutputs.size();
     585    mStreamSetOutputBufferPtr.resize(outputSetCount);
     586    for (unsigned i = 0; i < outputSetCount; ++i) {
     587        const auto & name = mStreamSetOutputs[i].getName();
     588        Value * ic = b->getProducedItemCount(name);
     589        Value * const blockIndex = b->CreateLShr(ic, log2BlockWidth);
     590        mStreamSetOutputBufferPtr[i] = b->getOutputStreamPtr(name, blockIndex);
     591    }
     592
     593    generateDoSegmentMethod(b);
     594
     595}
     596
     597/** ------------------------------------------------------------------------------------------------------------- *
     598 * @brief generateKernelMethod
     599 ** ------------------------------------------------------------------------------------------------------------- */
     600void MultiBlockKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & kb) {
     601
     602    const auto inputSetCount = mStreamSetInputs.size();
     603    const auto outputSetCount = mStreamSetOutputs.size();
     604    const auto totalSetCount = inputSetCount + outputSetCount;
     605
     606    // Scan through and see if any of our input streams is marked as the principle
     607
     608    bool hasPrinciple = false;
     609    unsigned principleInput = 0;
     610
     611    for (unsigned i = 0; i < inputSetCount; i++) {
     612        for (const auto attr : mStreamSetInputs[i].getAttributes()) {
     613            if (attr.isPrinciple()) {
     614                hasPrinciple = true;
     615                principleInput = i;
     616                break;
     617            }
     618        }
     619    }
     620
     621    // Now we iteratively process these blocks using the doMultiBlock method.
     622    // In each iteration, we check how many linearly accessible / writable
     623    // items can be processed with our current input / output buffers. If we
     624    // cannot support an full stride, we check whether (a) there is enough
     625    // input data to process but it is not linearly accessible, in which case
     626    // we move the data into temporary buffers or (b) there is not enough data
     627    // to process, in which case we abort unless IsFinal was set.
     628
     629    // Now proceed with creation of the doSegment method.
     630    BasicBlock * const doSegmentLoop = kb->CreateBasicBlock("DoSegmentLoop");
     631    kb->CreateBr(doSegmentLoop);
     632
     633    /// DO SEGMENT LOOP
     634
     635    kb->SetInsertPoint(doSegmentLoop);
     636
     637    // For each input buffer, determine the processedItemCount, the block pointer for the
     638    // buffer block containing the next item, and the number of linearly available items.
     639
     640    Value * processedItemCount[inputSetCount];
     641    Value * baseInputBuffer[inputSetCount];
     642    Value * unprocessed[inputSetCount];
     643    Value * linearlyAvailable[inputSetCount];
     644    Value * readableStrides[inputSetCount];
     645
     646    Constant * const log2BlockWidth = kb->getSize(std::log2(kb->getBitBlockWidth()));
     647
     648    Value * numOfStrides = nullptr;
     649
     650    for (unsigned i = 0; i < inputSetCount; i++) {
     651        const auto name = mStreamSetInputs[i].getName();
     652        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     653
     654        processedItemCount[i] = kb->getProcessedItemCount(name);
     655
     656        assert (processedItemCount[i]->getType() == mAvailableItemCount[i]->getType());
     657
     658        Value * const blockIndex = kb->CreateLShr(processedItemCount[i], log2BlockWidth);
     659        baseInputBuffer[i] = kb->getInputStreamPtr(name, blockIndex);
     660
     661        if (codegen::EnableAsserts) {
     662            kb->CreateAssert(kb->CreateICmpUGE(mAvailableItemCount[i], processedItemCount[i]),
     663                             "Processed item count cannot exceed the available item count");
     664        }
     665
     666        unprocessed[i] = kb->CreateSub(mAvailableItemCount[i], processedItemCount[i]);
     667
     668        //kb->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed[i]);
     669
     670        // INVESTIGATE: If the input rate of this stream is constant and known a priori, we could
     671        // avoid checking whether it is linearly accessible. Should we have an attribute for this?
     672
     673        linearlyAvailable[i] = kb->getLinearlyAccessibleItems(name, processedItemCount[i], unprocessed[i]);
     674
     675        //kb->CallPrintInt(getName() + "_" + name + "_linearlyAvailable", linearlyAvailable[i]);
     676
     677        readableStrides[i] = nullptr;
     678
     679        if (rate.isFixed() || rate.isBounded()) {
     680            Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
     681            readableStrides[i] = kb->CreateUDiv(linearlyAvailable[i], maxStrideSize);
     682            if (numOfStrides) {
     683                numOfStrides = kb->CreateUMin(numOfStrides, readableStrides[i]);
     684            } else {
     685                numOfStrides = readableStrides[i];
     686            }
     687        }
     688    }
     689
     690    //kb->CallPrintInt(getName() + "_numOfStrides", numOfStrides);
     691
     692    // Now determine the linearly writeable blocks, based on available blocks reduced
     693    // by limitations of output buffer space.
     694
     695    Value * producedItemCount[outputSetCount];
     696    Value * baseOutputBuffer[outputSetCount];
     697    Value * writableStrides[outputSetCount];
     698    Value * linearlyWritable[outputSetCount];
     699
     700    for (unsigned i = 0; i < outputSetCount; i++) {
     701        const auto & name = mStreamSetOutputs[i].getName();
     702        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     703        producedItemCount[i] = kb->getProducedItemCount(name);
     704
     705        //kb->CallPrintInt(getName() + "_" + name + "_producedItemCount", producedItemCount[i]);
     706
     707        Value * const blockIndex = kb->CreateLShr(producedItemCount[i], log2BlockWidth);
     708        baseOutputBuffer[i] = kb->getOutputStreamPtr(name, blockIndex);
     709        linearlyWritable[i] = nullptr;
     710        writableStrides[i] = nullptr;
     711        if (rate.isFixed() || rate.isBounded()) {
     712            linearlyWritable[i] = kb->getLinearlyWritableItems(name, producedItemCount[i]);
     713
     714            //kb->CallPrintInt(getName() + "_" + name + "_linearlyWritable", linearlyWritable[i]);
     715
     716            Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
     717            writableStrides[i] = kb->CreateUDiv(linearlyWritable[i], maxStrideSize);
     718            if (numOfStrides) {
     719                numOfStrides = kb->CreateUMin(numOfStrides, writableStrides[i]);
     720            } else {
     721                numOfStrides = writableStrides[i];
     722            }
     723        }
     724    }
     725
     726    //kb->CallPrintInt(getName() + "_numOfStrides'", numOfStrides);
     727
     728    for (unsigned i = 0; i < inputSetCount; i++) {
     729        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     730        if (rate.isFixed()) {
     731            mAvailableItemCount[i] = kb->CreateMul(numOfStrides, kb->getSize(rate.getRate() * mStride));
     732        } else {
     733            mAvailableItemCount[i] = linearlyAvailable[i];
     734        }
     735
     736        //kb->CallPrintInt(getName() + "_" + mStreamSetInputs[i].getName() + "_avail", mAvailableItemCount[i]);
     737    }
     738
     739    // Define and allocate the temporary buffer area.
     740    Type * tempBuffers[totalSetCount];
     741    for (unsigned i = 0; i < inputSetCount; ++i) {
     742        Type * bufType = baseInputBuffer[i]->getType()->getPointerElementType();
     743        assert (baseInputBuffer[i]->getType()->getPointerAddressSpace() == 0);
     744        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     745        unsigned count = 0;
     746        if (rate.isFixed()) {
     747            count = rate.getRate();
     748        } else if (rate.isBounded()) {
     749            count = rate.getUpperBound() + 2;
     750        }
     751        tempBuffers[i] = ArrayType::get(bufType, count);
     752    }
     753    for (unsigned i = 0; i < outputSetCount; i++) {
     754        Type * const bufType = baseOutputBuffer[i]->getType()->getPointerElementType();
     755        assert (baseOutputBuffer[i]->getType()->getPointerAddressSpace() == 0);
     756        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     757        unsigned count = 0;
     758        if (rate.isFixed()) {
     759            count = rate.getRate();
     760        } else if (rate.isBounded()) {
     761            count = rate.getUpperBound() + 2;
     762        }
     763        tempBuffers[i + inputSetCount] = ArrayType::get(bufType, count);
     764    }
     765
     766    Type * const tempParameterStructType = StructType::create(kb->getContext(), ArrayRef<Type *>(tempBuffers, totalSetCount));
     767
     768    Value * const tempBufferArea = kb->CreateCacheAlignedAlloca(tempParameterStructType);
     769
     770    BasicBlock * const temporaryBufferCheck = kb->CreateBasicBlock("temporaryBufferCheck");
     771    BasicBlock * const doMultiBlock = kb->CreateBasicBlock("doMultiBlock");
     772    BasicBlock * const copyToTemporaryBuffers = kb->CreateBasicBlock("copyToTemporaryBuffers");
     773    BasicBlock * const segmentDone = kb->CreateBasicBlock("segmentDone");
     774
     775    Value * const hasFullStride = numOfStrides ? kb->CreateICmpNE(numOfStrides, kb->getSize(0)) : kb->getTrue();
     776    kb->CreateCondBr(hasFullStride, doMultiBlock, temporaryBufferCheck);
     777
     778    // We use temporary buffers in 3 different cases that preclude full stride processing.
     779
     780    //  (a) One or more input buffers does not have a sufficient number of input items linearly available.
     781    //  (b) One or more output buffers does not have sufficient linearly available buffer space.
     782    //  (c) We have processed all the full strides of input and only the final block remains.
     783
     784    kb->SetInsertPoint(temporaryBufferCheck);
     785
     786    // Even if we copy the input data into a linear arrays, is there enough data to perform this stride?
     787    // If not, proceed only if this is our final block.
     788    Value * hasFullFragmentedStride = nullptr;
     789    for (unsigned i = 0; i < inputSetCount; i++) {
     790        const ProcessingRate & r = mStreamSetInputs[i].getRate();
     791        if (r.isBounded() || (r.isUnknown() && r.getLowerBound() > 0)) {
     792            const auto l = r.isBounded() ? r.getUpperBound() : r.getLowerBound();
     793            Constant * const strideSize = kb->getSize(l * mStride);
     794            Value * enoughAvail = kb->CreateICmpUGE(unprocessed[i], strideSize);
     795            if (hasFullFragmentedStride) {
     796                hasFullFragmentedStride = kb->CreateAnd(hasFullFragmentedStride, enoughAvail);
     797            } else {
     798                hasFullFragmentedStride = enoughAvail;
     799            }
     800        }
     801    }
     802
     803    Value * hasFragmentedOrFinalStride = nullptr;
     804    if (hasFullFragmentedStride) {
     805        hasFragmentedOrFinalStride = kb->CreateOr(hasFullFragmentedStride, mIsFinal);
     806        // Although this might be the final segment, we may have a full fragmented stride to process prior
     807        // to the actual final stride.
     808        mIsFinal = kb->CreateAnd(mIsFinal, kb->CreateNot(hasFullFragmentedStride));
     809    } else {
     810        hasFragmentedOrFinalStride = mIsFinal;
     811    }
     812    kb->CreateCondBr(hasFragmentedOrFinalStride, copyToTemporaryBuffers, segmentDone);
     813
     814    /// COPY TO TEMPORARY BUFFERS
     815    kb->SetInsertPoint(copyToTemporaryBuffers);
     816
     817    kb->CreateAlignedStore(Constant::getNullValue(tempParameterStructType), tempBufferArea, kb->getCacheAlignment());
     818
     819    // For each input and output buffer, copy over necessary data starting from the last block boundary.
     820
     821    Value * temporaryInputBuffer[inputSetCount];
     822    Value * temporaryAvailable[inputSetCount];
     823
     824    for (unsigned i = 0; i < inputSetCount; i++) {
     825        temporaryInputBuffer[i] = baseInputBuffer[i];
     826        if (readableStrides[i]) {
     827            const auto name = mStreamSetInputs[i].getName();
     828            const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     829            assert (rate.getUpperBound() > 0);
     830            Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
     831            temporaryAvailable[i] = kb->CreateUMin(unprocessed[i], maxStrideSize);
     832
     833            BasicBlock * entry = kb->GetInsertBlock();
     834            BasicBlock * copy = kb->CreateBasicBlock(name + "Copy");
     835            BasicBlock * resume = kb->CreateBasicBlock(name + "ResumeCopy");
     836            Value * const test = kb->CreateOr(kb->CreateICmpNE(readableStrides[i], kb->getSize(0)), mIsFinal);
     837            kb->CreateCondBr(test, resume, copy);
     838
     839            kb->SetInsertPoint(copy);
     840            Value * const tempBufferPtr = kb->CreateGEP(tempBufferArea, {kb->getInt32(0), kb->getInt32(i), kb->getInt32(0)});
     841            assert (tempBufferPtr->getType() == baseInputBuffer[i]->getType());
     842            Value * const neededItems = linearlyAvailable[i];
     843            Value * const bytesCopied = kb->copy(name, tempBufferPtr, baseInputBuffer[i], neededItems);
     844            Value * const nextInputPtr = kb->getRawInputPointer(name, kb->getSize(0));
     845            Value * const remaining = kb->CreateSub(temporaryAvailable[i], neededItems);
     846            Value * nextBufPtr = kb->CreatePointerCast(tempBufferPtr, kb->getInt8PtrTy());
     847            nextBufPtr = kb->CreateGEP(nextBufPtr, bytesCopied);
     848            kb->copy(name, nextBufPtr, nextInputPtr, remaining);
     849
     850            kb->CreateBr(resume);
     851
     852            kb->SetInsertPoint(resume);
     853            PHINode * bufferPtr = kb->CreatePHI(baseInputBuffer[i]->getType(), 2);
     854            bufferPtr->addIncoming(baseInputBuffer[i], entry);
     855            bufferPtr->addIncoming(tempBufferPtr, copy);
     856            temporaryInputBuffer[i] = bufferPtr;
     857        }
     858    }
     859
     860    Value * temporaryOutputBuffer[outputSetCount];
     861    for (unsigned i = 0; i < outputSetCount; i++) {
     862        temporaryOutputBuffer[i] = baseOutputBuffer[i];
     863        if (writableStrides[i]) {
     864            const auto name = mStreamSetOutputs[i].getName();
     865
     866            BasicBlock * const entry = kb->GetInsertBlock();
     867            BasicBlock * const copy = kb->CreateBasicBlock(name + "Copy");
     868            BasicBlock * const resume = kb->CreateBasicBlock(name + "ResumeCopy");
     869
     870            Value * const test = kb->CreateOr(kb->CreateICmpNE(writableStrides[i], kb->getSize(0)), mIsFinal);
     871            kb->CreateCondBr(test, resume, copy);
     872
     873            kb->SetInsertPoint(copy);
     874            Value * const tempBufferPtr = kb->CreateGEP(tempBufferArea,  {kb->getInt32(0), kb->getInt32(inputSetCount + i), kb->getInt32(0)});
     875            assert (tempBufferPtr->getType() == baseOutputBuffer[i]->getType());
     876            Value * const itemsToCopy = kb->CreateAnd(producedItemCount[i], kb->getSize(kb->getBitBlockWidth() - 1));
     877            kb->copy(name, tempBufferPtr, baseOutputBuffer[i], itemsToCopy);
     878            kb->CreateBr(resume);
     879
     880            kb->SetInsertPoint(resume);
     881            PHINode * bufferPtr = kb->CreatePHI(tempBufferPtr->getType(), 2);
     882            bufferPtr->addIncoming(baseOutputBuffer[i], entry);
     883            bufferPtr->addIncoming(tempBufferPtr, copy);
     884            temporaryOutputBuffer[i] = bufferPtr;
     885        }
     886    }
     887
     888    kb->CreateBr(doMultiBlock);
     889    BasicBlock * const usingTemporaryBuffers = kb->GetInsertBlock();
     890    doMultiBlock->moveAfter(usingTemporaryBuffers);
     891
     892    /// DO MULTI BLOCK
     893
     894    //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
     895    //  Now prepare the doMultiBlock call.
     896    kb->SetInsertPoint(doMultiBlock);
     897
     898    PHINode * const isFinal = kb->CreatePHI(mIsFinal->getType(), 2);
     899    isFinal->addIncoming(kb->getFalse(), doSegmentLoop);
     900    isFinal->addIncoming(mIsFinal, usingTemporaryBuffers);
     901    mIsFinal = isFinal;
     902
     903    mStreamSetInputBufferPtr.resize(inputSetCount);
     904    for (unsigned i = 0; i < inputSetCount; ++i) {
     905        assert (baseInputBuffer[i] && temporaryInputBuffer[i]);
     906        if (baseInputBuffer[i] != temporaryInputBuffer[i]) {
     907            PHINode * const avail = kb->CreatePHI(kb->getSizeTy(), 2);
     908            avail->addIncoming(mAvailableItemCount[i], doSegmentLoop);
     909            avail->addIncoming(temporaryAvailable[i], usingTemporaryBuffers);
     910            mAvailableItemCount[i] = avail;
     911            PHINode * const bufferPtr = kb->CreatePHI(baseInputBuffer[i]->getType(), 2);
     912            bufferPtr->addIncoming(baseInputBuffer[i], doSegmentLoop);
     913            assert (baseInputBuffer[i]->getType() == temporaryInputBuffer[i]->getType());
     914            bufferPtr->addIncoming(temporaryInputBuffer[i], usingTemporaryBuffers);
     915            temporaryInputBuffer[i] = bufferPtr;
     916        }
     917        mStreamSetInputBufferPtr[i] = temporaryInputBuffer[i];
     918    }
     919
     920    mStreamSetOutputBufferPtr.resize(outputSetCount);
     921    for (unsigned i = 0; i < outputSetCount; ++i) {
     922        assert (baseOutputBuffer[i] && temporaryOutputBuffer[i]);
     923        if (baseOutputBuffer[i] != temporaryOutputBuffer[i]) {
     924            PHINode * const bufferPtr = kb->CreatePHI(baseOutputBuffer[i]->getType(), 2);
     925            bufferPtr->addIncoming(baseOutputBuffer[i], doSegmentLoop);
     926            assert (baseOutputBuffer[i]->getType() == temporaryOutputBuffer[i]->getType());
     927            bufferPtr->addIncoming(temporaryOutputBuffer[i], usingTemporaryBuffers);
     928            temporaryOutputBuffer[i] = bufferPtr;
     929        }
     930        mStreamSetOutputBufferPtr[i] = temporaryOutputBuffer[i];
     931    }
     932
     933    // Now use the generateMultiBlockLogic method of the MultiBlockKernelBuilder subtype to
     934    // provide the required multi-block kernel logic.
     935    generateMultiBlockLogic(kb, numOfStrides);
     936
     937    // If we have no fixed rate inputs, we won't know when we're done parsing until we test
     938    // whether any input data was processed.
     939    bool mayMakeNoProgress = true;
     940
     941    // Update the processed item count of any Fixed input or output stream. While doing so, also
     942    // calculate the LCM of their rates. The LCM is used to calculate the final item counts.
     943
     944    unsigned rateLCM = 1;
     945
     946    for (unsigned i = 0; i < inputSetCount; ++i) {
     947        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     948        if (rate.isFixed()) {
     949            mayMakeNoProgress = false;
     950            rateLCM = lcm(rateLCM, rate.getRate());
     951            Value * const processed = mAvailableItemCount[i]; // kb->CreateMul(numOfStrides, kb->getSize(mStride * rate.getRate()));
     952            Value * const ic = kb->CreateAdd(processedItemCount[i], processed);
     953            kb->setProcessedItemCount(mStreamSetInputs[i].getName(), ic);
     954        }
     955    }
     956
     957    for (unsigned i = 0; i < outputSetCount; ++i) {
     958        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     959        if (rate.isFixed()) {
     960            rateLCM = lcm(rateLCM, rate.getRate());
     961            Value * const produced = kb->CreateMul(numOfStrides, kb->getSize(mStride * rate.getRate()));
     962            Value * const ic = kb->CreateAdd(producedItemCount[i], produced);
     963            kb->setProducedItemCount(mStreamSetOutputs[i].getName(), ic);
     964        }
     965    }
     966
     967    BasicBlock * const finalStrideCheck = kb->CreateBasicBlock("finalStrideCheck");
     968    BasicBlock * const finalStrideAdjustment = kb->CreateBasicBlock("finalStrideAdjustment");
     969    BasicBlock * const standardCopyBack = kb->CreateBasicBlock("standardCopyBack");
     970    BasicBlock * const temporaryBufferCopyBack = kb->CreateBasicBlock("temporaryBufferCopyBack");
     971
     972    kb->CreateLikelyCondBr(hasFullStride, standardCopyBack, finalStrideCheck);
     973
     974
     975    /// FINAL STRIDE CHECK
     976    kb->SetInsertPoint(finalStrideCheck);
     977    kb->CreateUnlikelyCondBr(mIsFinal, finalStrideAdjustment, temporaryBufferCopyBack);
     978
     979    /// FINAL STRIDE ADJUSTMENT
     980    kb->SetInsertPoint(finalStrideAdjustment);
     981
     982    // If this is our final stride, adjust the Fixed output item counts. The main loop assumes that
     983    // the ITEM COUNT % FIXED RATE = 0 for all Fixed Input and Output streams. We correct that here
     984    // to calculate them based on the actual input item counts.
     985
     986    // NOTE: This appears overly complex to avoid an integer overflow without reducing the maximum
     987    // integer size. For each Fixed output stream, this calculates:
     988
     989    //       CEILING(MIN(Total Available Item Count / Fixed Input Rate) * Fixed Output Rate)
     990
     991    Value * basePreviouslyProcessedItemCount = nullptr;
     992    Value * scaledInverseOfStrideItemCount = nullptr;
     993
     994    for (unsigned i = 0; i < inputSetCount; ++i) {
     995        const ProcessingRate & r = mStreamSetInputs[i].getRate();
     996        if (r.isFixed()) {
     997            assert (rateLCM % r.getRate() == 0);
     998            Value * const a = kb->CreateMul(mAvailableItemCount[i], kb->getSize(rateLCM / r.getRate())); // unprocessed
     999            Value * const p = kb->CreateUDiv(processedItemCount[i], kb->getSize(r.getRate()));
     1000            if (scaledInverseOfStrideItemCount) {
     1001                scaledInverseOfStrideItemCount = kb->CreateUMin(scaledInverseOfStrideItemCount, a);
     1002                basePreviouslyProcessedItemCount = kb->CreateUMin(basePreviouslyProcessedItemCount, p);
     1003            } else {
     1004                scaledInverseOfStrideItemCount = a;
     1005                basePreviouslyProcessedItemCount = p;
     1006            }
     1007        }
     1008//        const auto name = mStreamSetInputs[i].getName();
     1009//        Value * const processed = kb->CreateAdd(processedItemCount[i], unprocessed[i]);
     1010//        kb->setProcessedItemCount(name, processed);
     1011    }
     1012
     1013    for (unsigned i = 0; i < outputSetCount; ++i) {
     1014        const auto name = mStreamSetOutputs[i].getName();
     1015        const ProcessingRate & r = mStreamSetOutputs[i].getRate();
     1016        Value * produced = nullptr;
     1017        if (r.isFixed()) {
     1018            assert (rateLCM % r.getRate() == 0);
     1019            assert (basePreviouslyProcessedItemCount && scaledInverseOfStrideItemCount);
     1020            Value * const p = kb->CreateMul(basePreviouslyProcessedItemCount, kb->getSize(r.getRate()));
     1021            Value * const ic = kb->CreateUDivCeil(scaledInverseOfStrideItemCount, kb->getSize(rateLCM / r.getRate()));
     1022            produced = kb->CreateAdd(p, ic);
     1023        } else { // check if we have an attribute; if so, get the current produced count and adjust it
     1024            bool noAttributes = true;
     1025            for (const Attribute & attr : mStreamSetOutputs[i].getAttributes()) {
     1026                if (attr.isAdd() || attr.isRoundUpTo()) {
     1027                    noAttributes = false;
     1028                    break;
     1029                }
     1030            }
     1031            if (noAttributes) {
     1032                continue;
     1033            }
     1034            produced = kb->getProducedItemCount(name);
     1035        }
     1036        for (const Attribute & attr : mStreamSetOutputs[i].getAttributes()) {
     1037            if (attr.isAdd()) {
     1038                produced = kb->CreateAdd(produced, kb->getSize(attr.getAmount()));
     1039            } else if (attr.isRoundUpTo()) {
     1040                produced = kb->CreateRoundUp(produced, kb->getSize(attr.getAmount()));
     1041            }
     1042        }
     1043        kb->setProducedItemCount(name, produced);
     1044    }
     1045
     1046    kb->CreateBr(temporaryBufferCopyBack);
     1047
     1048    /// TEMPORARY BUFFER COPY BACK
     1049    kb->SetInsertPoint(temporaryBufferCopyBack);
     1050
     1051    // Copy back data to the actual output buffers.
     1052    for (unsigned i = 0; i < outputSetCount; i++) {
     1053
     1054        if (baseOutputBuffer[i] != temporaryOutputBuffer[i]) {
     1055
     1056            const auto name = mStreamSetOutputs[i].getName();
     1057
     1058            BasicBlock * const copy = kb->CreateBasicBlock(name + "CopyBack");
     1059            BasicBlock * const resume = kb->CreateBasicBlock(name + "ResumeCopyBack");
     1060            Value * const usedTemporary = kb->CreateICmpNE(temporaryOutputBuffer[i], baseOutputBuffer[i]);
     1061
     1062            // If we used a temporary buffer ...
     1063            kb->CreateCondBr(usedTemporary, copy, resume);
     1064
     1065            kb->SetInsertPoint(copy);
     1066            Value * bytesCopied = kb->copy(name, baseOutputBuffer[i], temporaryOutputBuffer[i], linearlyWritable[i]);
     1067            Value * nextOutputPtr = kb->getRawOutputPointer(name, kb->getSize(0));
     1068            Value * producedCount = kb->getProducedItemCount(name);
     1069
     1070            Value * remaining = kb->CreateSub(producedCount, linearlyWritable[i]);
     1071            Value * nextBufPtr = kb->CreatePointerCast(temporaryOutputBuffer[i], kb->getInt8PtrTy());
     1072            nextBufPtr = kb->CreateGEP(nextBufPtr, bytesCopied);
     1073
     1074            kb->copy(name, nextOutputPtr, nextBufPtr, remaining);
     1075            kb->CreateBr(resume);
     1076
     1077            kb->SetInsertPoint(resume);
     1078        }
     1079    }
     1080
     1081    //  We've dealt with the partial block processing and copied information back into the
     1082    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
     1083    BasicBlock * setTermination = nullptr;
     1084    if (hasNoTerminateAttribute()) {
     1085        kb->CreateCondBr(mIsFinal, segmentDone, standardCopyBack);
     1086    } else {
     1087        setTermination = kb->CreateBasicBlock("setTermination");
     1088        kb->CreateCondBr(mIsFinal, setTermination, standardCopyBack);
     1089    }
     1090
     1091    /// STANDARD COPY BACK
     1092    kb->SetInsertPoint(standardCopyBack);
     1093
     1094    // Do copybacks if necessary.
     1095    for (unsigned i = 0; i < outputSetCount; i++) {
     1096        if (mStreamSetOutputBuffers[i]->supportsCopyBack()) {
     1097            const auto name = mStreamSetOutputs[i].getName();
     1098            Value * newProduced = kb->getProducedItemCount(name);
     1099            kb->CreateCopyBack(name, producedItemCount[i], newProduced);
     1100        }
     1101    }
     1102
     1103    // If it is possible to make no progress, verify we processed some of the input. If we haven't,
     1104    // we're finished this segment.
     1105    if (mayMakeNoProgress) {
     1106        Value * madeProgress = nullptr;
     1107        for (unsigned i = 0; i < inputSetCount; ++i) {
     1108            Value * const processed = kb->getProcessedItemCount(mStreamSetInputs[i].getName());
     1109            Value * const progress = kb->CreateICmpNE(processed, processedItemCount[i]);
     1110            if (madeProgress) {
     1111                madeProgress = kb->CreateOr(madeProgress, progress);
     1112            } else {
     1113                madeProgress = progress;
     1114            }
     1115        }
     1116        assert (madeProgress);
     1117        kb->CreateCondBr(madeProgress, doSegmentLoop, segmentDone);
     1118    } else {
     1119        kb->CreateBr(doSegmentLoop);
     1120    }
     1121
     1122    if (hasNoTerminateAttribute()) {
     1123        segmentDone->moveAfter(kb->GetInsertBlock());
     1124    } else {
     1125        /// SET TERMINATION
     1126        setTermination->moveAfter(kb->GetInsertBlock());
     1127        kb->SetInsertPoint(setTermination);
     1128        kb->setTerminationSignal();
     1129        kb->CreateBr(segmentDone);
     1130        segmentDone->moveAfter(setTermination);
     1131    }
     1132
     1133    kb->SetInsertPoint(segmentDone);
     1134
     1135}
     1136
     1137//bool MultiBlockKernel::requiresCopyBack(const ProcessingRate & rate) const {
     1138//    if (rate.isBounded() || rate.isUnknown()) {
     1139//        return true;
     1140//    } else if (rate.isDirectlyRelative()) {
     1141//        Port port; unsigned i;
     1142//        std::tie(port, i) = getStreamPort(rate.getReference());
     1143//        const auto & binding = (port == Port::Input) ? mStreamSetInputs[i] : mStreamSetOutputs[i];
     1144//        return requiresCopyBack(binding.getRate());
     1145//    }
     1146//    return false;
     1147//}
     1148
    5201149//  The default doSegment method dispatches to the doBlock routine for
    5211150//  each block of the given number of blocksToDo, and then updates counts.
    5221151
    523 void BlockOrientedKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & idb) {
     1152void BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * const numOfStrides) {
     1153
    5241154    BasicBlock * const entryBlock = idb->GetInsertBlock();
    5251155    BasicBlock * const strideLoopCond = idb->CreateBasicBlock(getName() + "_strideLoopCond");
     
    5341164    }
    5351165
    536     ConstantInt * stride = idb->getSize(idb->getStride());
    537     Value * availablePos = mAvailableItemCount[0];
    538     Value * processed = idb->getProcessedItemCount(mStreamSetInputs[0].name);
    539     Value * itemsAvail = idb->CreateSub(availablePos, processed);
    540     Value * stridesToDo = idb->CreateUDiv(itemsAvail, stride);
     1166    Constant * const log2BlockSize = idb->getSize(std::log2(idb->getBitBlockWidth()));
     1167
     1168    const auto inputSetCount = mStreamSetInputs.size();
     1169    Value * baseProcessedIndex[inputSetCount];
     1170    for (unsigned i = 0; i < inputSetCount; ++i) {
     1171        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     1172        if (rate.isFixed()) {
     1173            baseProcessedIndex[i] = nullptr;
     1174        } else {
     1175            Value * ic = idb->getProcessedItemCount(mStreamSetInputs[i].getName());
     1176            ic = idb->CreateLShr(ic, log2BlockSize);
     1177            baseProcessedIndex[i] = ic;
     1178        }
     1179    }
     1180
     1181    const auto outputSetCount = mStreamSetOutputs.size();
     1182    Value * baseProducedIndex[outputSetCount];
     1183    for (unsigned i = 0; i < outputSetCount; ++i) {
     1184        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     1185        if (rate.isFixed()) {
     1186            baseProducedIndex[i] = nullptr;
     1187        } else {
     1188            Value * ic = idb->getProducedItemCount(mStreamSetOutputs[i].getName());
     1189            ic = idb->CreateLShr(ic, log2BlockSize);
     1190            baseProducedIndex[i] = ic;
     1191        }
     1192    }
     1193
     1194    Value * const numOfBlocksToProcess = idb->CreateMul(numOfStrides, idb->getSize(mStride / idb->getBitBlockWidth()));
    5411195
    5421196    idb->CreateBr(strideLoopCond);
    5431197
     1198    /// BLOCK COND
     1199
    5441200    idb->SetInsertPoint(strideLoopCond);
    5451201
    5461202    PHINode * branchTarget = nullptr;
    547     if (idb->supportsIndirectBr()) {
     1203    if (baseTarget) {
    5481204        branchTarget = idb->CreatePHI(baseTarget->getType(), 2, "branchTarget");
    5491205        branchTarget->addIncoming(baseTarget, entryBlock);
    5501206    }
    5511207
    552     PHINode * const stridesRemaining = idb->CreatePHI(idb->getSizeTy(), 2, "stridesRemaining");
    553     stridesRemaining->addIncoming(stridesToDo, entryBlock);
    554     // NOTE: stridesRemaining may go to a negative number in the final block if the generateFinalBlockMethod(...)
    555     // calls CreateDoBlockMethodCall(). Do *not* replace the comparator with an unsigned one!
    556     Value * notDone = idb->CreateICmpSGT(stridesRemaining, idb->getSize(0));
     1208    PHINode * const blockIndex = idb->CreatePHI(idb->getSizeTy(), 2, "index");
     1209    blockIndex->addIncoming(idb->getSize(0), entryBlock);
     1210
     1211    for (unsigned i = 0; i < inputSetCount; ++i) {
     1212        Value * offset = blockIndex;
     1213        if (baseProcessedIndex[i]) {
     1214            offset = idb->getProcessedItemCount(mStreamSetInputs[i].getName());
     1215            offset = idb->CreateLShr(offset, log2BlockSize);
     1216            offset = idb->CreateSub(offset, baseProcessedIndex[i]);
     1217        }
     1218        mStreamSetInputBufferPtr[i] = idb->CreateGEP(mStreamSetInputBufferPtr[i], offset);
     1219    }
     1220
     1221    for (unsigned i = 0; i < outputSetCount; ++i) {
     1222        Value * offset = blockIndex;
     1223        if (baseProducedIndex[i]) {
     1224            offset = idb->getProducedItemCount(mStreamSetOutputs[i].getName());
     1225            offset = idb->CreateLShr(offset, log2BlockSize);
     1226            offset = idb->CreateSub(offset, baseProducedIndex[i]);
     1227        }
     1228        mStreamSetOutputBufferPtr[i] = idb->CreateGEP(mStreamSetOutputBufferPtr[i], offset);
     1229    }
     1230
     1231    Value * const notDone = idb->CreateICmpULT(blockIndex, numOfBlocksToProcess);
    5571232    idb->CreateLikelyCondBr(notDone, mStrideLoopBody, stridesDone);
     1233
     1234    /// BLOCK BODY
    5581235
    5591236    idb->SetInsertPoint(mStrideLoopBody);
     
    5681245    writeDoBlockMethod(idb);
    5691246
    570     /// UPDATE PROCESSED COUNTS
    571 
    572     processed = idb->getProcessedItemCount(mStreamSetInputs[0].name);
    573     Value * itemsDone = idb->CreateAdd(processed, stride);
    574     idb->setProcessedItemCount(mStreamSetInputs[0].name, itemsDone);
    575 
    576     stridesRemaining->addIncoming(idb->CreateSub(stridesRemaining, idb->getSize(1)), idb->GetInsertBlock());
    577 
    578     BasicBlock * bodyEnd = idb->GetInsertBlock();
    579     if (idb->supportsIndirectBr()) {
     1247    BasicBlock * const bodyEnd = idb->GetInsertBlock();
     1248    blockIndex->addIncoming(idb->CreateAdd(blockIndex, idb->getSize(1)), bodyEnd);
     1249    if (branchTarget) {
    5801250        branchTarget->addIncoming(mStrideLoopTarget, bodyEnd);
    5811251    }
     
    5841254    stridesDone->moveAfter(bodyEnd);
    5851255
     1256    /// STRIDE DONE
     1257
    5861258    idb->SetInsertPoint(stridesDone);
    5871259
    5881260    // Now conditionally perform the final block processing depending on the doFinal parameter.
    589     if (idb->supportsIndirectBr()) {
     1261    if (branchTarget) {
    5901262        mStrideLoopBranch = idb->CreateIndirectBr(branchTarget, 3);
    5911263        mStrideLoopBranch->addDestination(doFinalBlock);
     
    5991271    idb->SetInsertPoint(doFinalBlock);
    6001272
    601     Value * remainingItems = idb->CreateSub(mAvailableItemCount[0], idb->getProcessedItemCount(mStreamSetInputs[0].name));
     1273    Value * remainingItems = nullptr;
     1274    for (unsigned i = 0; i < inputSetCount; ++i) {
     1275        const ProcessingRate & r = mStreamSetInputs[i].getRate();
     1276        if (r.isFixed()) {
     1277            Value * ic = idb->CreateUDiv(mAvailableItemCount[i], idb->getSize(r.getRate()));
     1278            if (remainingItems) {
     1279                remainingItems = idb->CreateUMax(remainingItems, ic);
     1280            } else {
     1281                remainingItems = ic;
     1282            }
     1283        }
     1284    }
    6021285
    6031286    writeFinalBlockMethod(idb, remainingItems);
    6041287
    605     itemsDone = mAvailableItemCount[0];
    606     idb->setProcessedItemCount(mStreamSetInputs[0].name, itemsDone);
    607     idb->setTerminationSignal();
    6081288    idb->CreateBr(segmentDone);
    6091289
     
    6131293
    6141294    // Update the branch prediction metadata to indicate that the likely target will be segmentDone
    615     if (idb->supportsIndirectBr()) {
     1295    if (branchTarget) {
    6161296        MDBuilder mdb(idb->getContext());
    6171297        const auto destinations = mStrideLoopBranch->getNumDestinations();
     
    6331313    std::vector<Value *> availableItemCount(0);
    6341314
    635     /// Check if the do block method is called and create the function if necessary   
     1315    /// Check if the do block method is called and create the function if necessary
    6361316    if (!idb->supportsIndirectBr()) {
    6371317
     
    6601340    }
    6611341
    662     std::vector<Value *> priorProduced;
    663     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    664         if (mStreamSetOutputBuffers[i]->supportsCopyBack())  {
    665             priorProduced.push_back(idb->getProducedItemCount(mStreamSetOutputs[i].name));
    666         }
    667     }
    668 
    6691342    generateDoBlockMethod(idb); // must be implemented by the BlockOrientedKernelBuilder subtype
    670 
    671     unsigned priorIdx = 0;
    672     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    673         if (mStreamSetOutputBuffers[i]->supportsCopyBack()) {
    674             Value * newProduced = idb->getProducedItemCount(mStreamSetOutputs[i].name);
    675             Value * handle = idb->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
    676             mStreamSetOutputBuffers[i]->genCopyBackLogic(idb.get(), handle, priorProduced[priorIdx], newProduced, mStreamSetOutputs[i].name);
    677             priorIdx++;
    678         }
    679     }
    6801343
    6811344    if (!idb->supportsIndirectBr()) {
     
    7281391
    7291392    generateFinalBlockMethod(idb, remainingItems); // may be implemented by the BlockOrientedKernel subtype
    730 
    731     RecursivelyDeleteTriviallyDeadInstructions(remainingItems); // if remainingItems was not used, this will eliminate it.
    7321393
    7331394    if (!idb->supportsIndirectBr()) {
     
    7741435}
    7751436
    776 void MultiBlockKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) {
    777 
    778     // Stream set and buffer analysis.  When near the end of buffers
    779     // or for final block processing, data for each streamset may need
    780     // to be copied into temporary buffers to ensure linear access.
    781     // Data is always copied as a number of whole blocks, dependent
    782     // on the stream set processing rate.
    783    
    784     const unsigned bitBlockWidth = kb->getBitBlockWidth();
    785     const unsigned inputSetCount = mStreamSetInputs.size();
    786     const unsigned outputSetCount = mStreamSetOutputs.size();
    787     const unsigned totalSetCount = inputSetCount + outputSetCount;
    788    
    789     int maxBlocksToCopy[totalSetCount];
    790     for (unsigned i = 0; i < totalSetCount; i++) {
    791         if (mIsDerived[i]) {
    792             if (mItemsPerStride[i] % bitBlockWidth == 0) {
    793                 maxBlocksToCopy[i] = mItemsPerStride[i] / bitBlockWidth;
    794             }
    795             else {
    796                 // May not be block aligned, can overlap partial blocks at both ends.
    797                 maxBlocksToCopy[i] = mItemsPerStride[i]/bitBlockWidth + 2;
    798             }
    799         }
    800         else {
    801             // For variable input stream sets, we make a single stride of items
    802             // available, if possible, but this stride could be nonaligned.
    803             maxBlocksToCopy[i] = mStride / bitBlockWidth + 2;
    804         }
    805     }
    806     auto ip = kb->saveIP();
    807     Function * const cp = mCurrentMethod;
    808     const auto saveInstance = getInstance();
    809 
    810     // First prepare the multi-block method that will be used.
    811 
    812     std::vector<Type *> multiBlockParmTypes;
    813     multiBlockParmTypes.push_back(mKernelStateType->getPointerTo());
    814     multiBlockParmTypes.push_back(kb->getSizeTy());
    815     for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
    816         if (!mIsDerived[i]) multiBlockParmTypes.push_back(kb->getSizeTy());
    817     }
    818     for (auto buffer : mStreamSetInputBuffers) {
    819         multiBlockParmTypes.push_back(buffer->getStreamSetBlockType()->getPointerTo());
    820     }
    821     for (auto buffer : mStreamSetOutputBuffers) {
    822         multiBlockParmTypes.push_back(buffer->getStreamSetBlockType()->getPointerTo());
    823     }
    824 
    825     FunctionType * const type = FunctionType::get(kb->getVoidTy(), multiBlockParmTypes, false);
    826     Function * multiBlockFunction = Function::Create(type, GlobalValue::InternalLinkage, getName() + MULTI_BLOCK_SUFFIX, kb->getModule());
    827     multiBlockFunction->setCallingConv(CallingConv::C);
    828     multiBlockFunction->setDoesNotThrow();
    829     mCurrentMethod = multiBlockFunction;
    830     kb->SetInsertPoint(BasicBlock::Create(kb->getContext(), "multiBlockEntry", multiBlockFunction, 0));
    831 
    832     auto args = multiBlockFunction->arg_begin();
    833     args->setName("self");
    834     setInstance(&*args);
    835     (++args)->setName("itemsToDo");
    836     for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
    837         if (!mIsDerived[i]) (++args)->setName(mStreamSetInputs[i].name + "_availItems");
    838     }
    839     for (auto binding : mStreamSetInputs) {
    840         (++args)->setName(binding.name + "BufPtr");
    841     }
    842     for (auto binding : mStreamSetOutputs) {
    843         (++args)->setName(binding.name + "BufPtr");
    844     }
    845 
    846     // Now use the generateMultiBlockLogic method of the MultiBlockKernelBuilder subtype to
    847     // provide the required multi-block kernel logic.
    848     generateMultiBlockLogic(kb);
    849 
    850     kb->CreateRetVoid();
    851 
    852     kb->restoreIP(ip);
    853     mCurrentMethod = cp;
    854     setInstance(saveInstance);
    855 
    856     // Now proceed with creation of the doSegment method.
    857 
    858     BasicBlock * const entry = kb->GetInsertBlock();
    859     BasicBlock * const doSegmentOuterLoop = kb->CreateBasicBlock(getName() + "_doSegmentOuterLoop");
    860     BasicBlock * const doMultiBlockCall = kb->CreateBasicBlock(getName() + "_doMultiBlockCall");
    861     BasicBlock * const tempBlockCheck = kb->CreateBasicBlock(getName() + "_tempBlockCheck");
    862     BasicBlock * const doTempBufferBlock = kb->CreateBasicBlock(getName() + "_doTempBufferBlock");
    863     BasicBlock * const segmentDone = kb->CreateBasicBlock(getName() + "_segmentDone");
    864 
    865     Value * blockBaseMask = kb->CreateNot(kb->getSize(kb->getBitBlockWidth() - 1));
    866     ConstantInt * blockSize = kb->getSize(kb->getBitBlockWidth());
    867     ConstantInt * strideSize = kb->getSize(mStride);
    868    
    869     Value * availablePos = mAvailableItemCount[0];
    870     Value * itemsAvail = availablePos;
    871 
    872     //  Make sure that corresponding data is available depending on processing rate
    873     //  for all derived input stream sets.
    874     for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
    875         Value * a = mAvailableItemCount[i];
    876         auto & rate = mStreamSetInputs[i].rate;
    877         if (mIsDerived[i]) {
    878             Value * maxItems = rate.CreateMaxReferenceItemsCalculation(kb.get(), a);
    879             itemsAvail = kb->CreateSelect(kb->CreateICmpULT(itemsAvail, maxItems), itemsAvail, maxItems);
    880         }
    881     }
    882 
    883     Value * processed = kb->getProcessedItemCount(mStreamSetInputs[0].name);
    884     Value * itemsToDo = kb->CreateSub(itemsAvail, processed);
    885     Value * fullStridesToDo = kb->CreateUDiv(itemsToDo, strideSize);
    886 
    887     //  Now we iteratively process these blocks using the doMultiBlock method.
    888     //  In each iteration, we process the maximum number of linearly accessible
    889     //  blocks on the principal input, reduced to ensure that the corresponding
    890     //  data is linearly available at the specified processing rates for the other inputs,
    891     //  and that each of the output buffers has sufficient linearly available space
    892     //  (using overflow areas, if necessary) for the maximum output that can be
    893     //  produced.
    894 
    895     kb->CreateBr(doSegmentOuterLoop);
    896     kb->SetInsertPoint(doSegmentOuterLoop);
    897     PHINode * const stridesRemaining = kb->CreatePHI(kb->getSizeTy(), 2, "stridesRemaining");
    898     stridesRemaining->addIncoming(fullStridesToDo, entry);
    899 
    900     // For each input buffer, determine the processedItemCount, the block pointer for the
    901     // buffer block containing the next item, and the number of linearly available items.
    902 
    903     Value * processedItemCount[inputSetCount];
    904     Value * inputBlockPtr[inputSetCount];
    905     Value * linearlyAvailItems[inputSetCount];
    906 
    907     Value * linearlyAvailStrides = stridesRemaining;
    908     for (unsigned i = 0; i < inputSetCount; i++) {
    909         processedItemCount[i] = kb->getProcessedItemCount(mStreamSetInputs[i].name);
    910         inputBlockPtr[i] = kb->getInputStreamBlockPtr(mStreamSetInputs[i].name, kb->getInt32(0));
    911         Value * avail = kb->CreateSub(mAvailableItemCount[i], processedItemCount[i]);
    912         linearlyAvailItems[i] = kb->getLinearlyAccessibleItems(mStreamSetInputs[i].name, processedItemCount[i], avail);
    913         auto & rate = mStreamSetInputs[i].rate;
    914         if (rate.isUnknownRate()) continue;  // No calculation possible for unknown rates.
    915         Value * maxReferenceItems = rate.CreateMaxReferenceItemsCalculation(kb.get(), linearlyAvailItems[i]);
    916         Value * maxStrides = kb->CreateUDiv(maxReferenceItems, strideSize);
    917         linearlyAvailStrides = kb->CreateSelect(kb->CreateICmpULT(maxStrides, linearlyAvailStrides), maxStrides, linearlyAvailStrides);
    918     }
    919 
    920     Value * producedItemCount[outputSetCount];
    921     Value * outputBlockPtr[outputSetCount];
    922     //  Now determine the linearly writable strides, based on available strides reduced
    923     //  by limitations of output buffer space.
    924     Value * linearlyWritableStrides = linearlyAvailStrides;
    925     for (unsigned i = 0; i < outputSetCount; i++) {
    926         producedItemCount[i] = kb->getProducedItemCount(mStreamSetOutputs[i].name);
    927         outputBlockPtr[i] = kb->getOutputStreamBlockPtr(mStreamSetOutputs[i].name, kb->getInt32(0));
    928        
    929         auto & rate = mStreamSetOutputs[i].rate;
    930         if (rate.isUnknownRate()) continue;  // No calculation possible for unknown rates.
    931         Value * writableItems = kb->getLinearlyWritableItems(mStreamSetOutputs[i].name, producedItemCount[i]);
    932         Value * maxReferenceItems = rate.CreateMaxReferenceItemsCalculation(kb.get(), writableItems);
    933         Value * maxStrides = kb->CreateUDiv(maxReferenceItems, strideSize);
    934         linearlyWritableStrides = kb->CreateSelect(kb->CreateICmpULT(maxStrides, linearlyWritableStrides), maxStrides, linearlyWritableStrides);
    935     }
    936     Value * const haveFullStrides = kb->CreateICmpUGT(linearlyWritableStrides, kb->getSize(0));
    937     kb->CreateCondBr(haveFullStrides, doMultiBlockCall, tempBlockCheck);
    938 
    939     //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
    940     //  Now prepare the doMultiBlock call.
    941     kb->SetInsertPoint(doMultiBlockCall);
    942 
    943     Value * principalItemsToDo = kb->CreateMul(linearlyWritableStrides, strideSize);
    944 
    945     std::vector<Value *> doMultiBlockArgs;
    946     doMultiBlockArgs.push_back(getInstance());
    947     doMultiBlockArgs.push_back(principalItemsToDo);
    948     for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
    949         if (!mIsDerived[i]) {
    950             doMultiBlockArgs.push_back(linearlyAvailItems[i]);
    951         }
    952     }
    953     for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
    954         Value * bufPtr = kb->CreatePointerCast(inputBlockPtr[i], mStreamSetInputBuffers[i]->getStreamSetBlockType()->getPointerTo());
    955         doMultiBlockArgs.push_back(bufPtr);
    956     }
    957     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    958         Value * bufPtr = kb->CreatePointerCast(outputBlockPtr[i], mStreamSetOutputBuffers[i]->getStreamSetBlockType()->getPointerTo());
    959         doMultiBlockArgs.push_back(bufPtr);
    960     }
    961 
    962     kb->CreateCall(multiBlockFunction, doMultiBlockArgs);
    963 
    964     // Do copybacks if necessary.
    965     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    966         if (mStreamSetOutputBuffers[i]->supportsCopyBack()) {
    967             Value * newProduced = kb->getProducedItemCount(mStreamSetOutputs[i].name);
    968             Value * handle = mStreamSetOutputBuffers[i]->getStreamSetHandle();
    969             mStreamSetOutputBuffers[i]->genCopyBackLogic(kb.get(), handle, producedItemCount[i], newProduced, mStreamSetOutputs[i].name);
    970         }
    971     }
    972 
    973     if (mIsDerived[0]) {
    974         Value * reducedStridesToDo = kb->CreateSub(stridesRemaining, linearlyWritableStrides);
    975         stridesRemaining->addIncoming(reducedStridesToDo, kb->GetInsertBlock());
    976         Value * nowProcessed = kb->CreateAdd(processedItemCount[0], principalItemsToDo);
    977         kb->setProcessedItemCount(mStreamSetInputs[0].name, nowProcessed);
    978         kb->CreateBr(doSegmentOuterLoop);
    979     }
    980     else {
    981         // Processed item count updated by the kernel itself.
    982         Value * nowProcessed = kb->getProcessedItemCount(mStreamSetInputs[0].name);
    983         Value * remainingItemsToDo = kb->CreateSub(itemsAvail, nowProcessed);
    984         Value * reducedStridesToDo = kb->CreateUDiv(remainingItemsToDo, nowProcessed);
    985         stridesRemaining->addIncoming(reducedStridesToDo, kb->GetInsertBlock());
    986         // If we didn't make progress, we have gone as far as we can in this segment.
    987         kb->CreateCondBr(kb->CreateICmpUGT(nowProcessed, processedItemCount[0]), doSegmentOuterLoop, segmentDone);
    988     }
    989 
    990     //
    991     // We use temporary buffers in 3 different cases that preclude full block processing.
    992     // (a) One or more input buffers do not have a sufficient number of input items linearly available.
    993     // (b) One or more output buffers do not have sufficient linearly available buffer space.
    994     // (c) We have processed all the full blocks of input and only the excessItems remain.
    995     // In each case we set up temporary buffers for input and output and then
    996     // call the Multiblock routine.
    997     //
    998 
    999     kb->SetInsertPoint(tempBlockCheck);
    1000     Value * const haveStrides = kb->CreateICmpUGT(stridesRemaining, kb->getSize(0));
    1001     kb->CreateCondBr(kb->CreateOr(mIsFinal, haveStrides), doTempBufferBlock, segmentDone);
    1002 
    1003     kb->SetInsertPoint(doTempBufferBlock);
    1004     Value * excessItems = kb->CreateSub(itemsAvail, kb->getProcessedItemCount(mStreamSetInputs[0].name));
    1005     Value * tempBlockItems = kb->CreateSelect(haveStrides, strideSize, excessItems);
    1006     Value * doFinal = kb->CreateNot(haveStrides);
    1007 
    1008     // Begin constructing the doMultiBlock args.
    1009     std::vector<Value *> tempArgs;
    1010     tempArgs.push_back(getInstance());
    1011     tempArgs.push_back(tempBlockItems);
    1012     // For non-derived inputs, add the available items.
    1013     for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
    1014         if (!mIsDerived[i]) {
    1015             Value * avail = kb->CreateSub(mAvailableItemCount[i], processedItemCount[i]);
    1016             tempArgs.push_back(kb->CreateSelect(kb->CreateICmpULT(avail, strideSize), avail, strideSize));
    1017         }
    1018     }
    1019     //
    1020     // Define and allocate the temporary buffer area.
    1021     //
    1022     Type * tempBuffers[totalSetCount];
    1023     for (unsigned i = 0; i < inputSetCount; ++i) {
    1024         Type * bufType = mStreamSetInputBuffers[i]->getStreamSetBlockType();
    1025         tempBuffers[i] = ArrayType::get(bufType, maxBlocksToCopy[i]);
    1026     }
    1027     for (unsigned i = 0; i < outputSetCount; i++) {
    1028         Type * bufType = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
    1029         tempBuffers[i + inputSetCount] = ArrayType::get(bufType, maxBlocksToCopy[i + inputSetCount]);
    1030     }
    1031     Type * tempParameterStructType = StructType::create(kb->getContext(), ArrayRef<Type *>(tempBuffers, totalSetCount), "tempBuf");
    1032     // Prepare the temporary buffer area.
    1033     Value * tempParameterArea = kb->CreateCacheAlignedAlloca(tempParameterStructType);
    1034     kb->CreateMemZero(tempParameterArea, ConstantExpr::getSizeOf(tempParameterStructType));
    1035     // For each input and output buffer, copy over necessary data starting from the last block boundary.
    1036     Value * itemCountNeeded[inputSetCount];
    1037     itemCountNeeded[0] = tempBlockItems;
    1038     Value * finalItemCountNeeded[inputSetCount];
    1039 
    1040     for (unsigned i = 0; i < inputSetCount; i++) {
    1041         Type * bufPtrType = mStreamSetInputBuffers[i]->getStreamSetBlockType()->getPointerTo();
    1042         if (mItemsPerStride[i] != 0) {
    1043             Value * tempBufPtr = kb->CreateGEP(tempParameterArea, {kb->getInt32(0), kb->getInt32(i)});
    1044             tempBufPtr = kb->CreatePointerCast(tempBufPtr, bufPtrType);
    1045             ConstantInt * strideItems = kb->getSize(mItemsPerStride[i]);
    1046             Value * strideBasePos = kb->CreateSub(processedItemCount[i], kb->CreateURem(processedItemCount[i], strideItems));
    1047             Value * blockBasePos = strideBasePos;
    1048             if (mItemsPerStride[i] & (bitBlockWidth - 1)) {
    1049                 blockBasePos = kb->CreateAnd(strideBasePos, blockBaseMask);
    1050             }
    1051 
    1052             // The number of items to copy is determined by the processing rate requirements.
    1053             if (i >= 1) {
    1054                 auto & rate = mStreamSetInputs[i].rate;
    1055                 std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
    1056                 Port port; unsigned ssIdx;
    1057                 std::tie(port, ssIdx) = getStreamPort(refSet);
    1058                 itemCountNeeded[i] = rate.CreateRatioCalculation(kb.get(), itemCountNeeded[ssIdx], doFinal);
    1059             }
    1060             finalItemCountNeeded[i] = kb->CreateAdd(itemCountNeeded[i], processedItemCount[i]);
    1061 
    1062             Value * inputPtr = kb->CreatePointerCast(kb->getRawInputPointer(mStreamSetInputs[i].name, kb->getInt32(0), blockBasePos), bufPtrType);
    1063            
    1064             if (maxBlocksToCopy[i] == 1) {
    1065                 // copy one block
    1066                 mStreamSetInputBuffers[i]->createBlockCopy(kb.get(), tempBufPtr, inputPtr, kb->getSize(1));
    1067             }
    1068             else {
    1069                 Value * neededItems = kb->CreateSub(finalItemCountNeeded[i], blockBasePos);
    1070                 Value * copyItems1 = kb->getLinearlyAccessibleItems(mStreamSetInputs[i].name, blockBasePos, neededItems);
    1071                 Value * allAvail = kb->CreateICmpEQ(neededItems, copyItems1);
    1072                 Value * copyBlocks1 = kb->CreateUDivCeil(copyItems1, blockSize);
    1073                 mStreamSetInputBuffers[i]->createBlockCopy(kb.get(), tempBufPtr, inputPtr, copyBlocks1);
    1074                 BasicBlock * copyRemaining = kb->CreateBasicBlock("copyRemaining");
    1075                 BasicBlock * copyDone = kb->CreateBasicBlock("copyDone");
    1076                 kb->CreateCondBr(allAvail, copyDone, copyRemaining);
    1077                 kb->SetInsertPoint(copyRemaining);
    1078                 Value * copyItems2 = kb->CreateSub(neededItems, copyItems1);
    1079                 Value * copyBlocks2 = kb->CreateUDivCeil(copyItems2, blockSize);
    1080                 //Value * nextBasePos = kb->CreateAdd(blockBasePos, copyItems1);
    1081                 Value * nextBasePos = kb->CreateAdd(blockBasePos, kb->CreateMul(copyBlocks2, blockSize));
    1082                 Value * nextInputPtr = kb->CreatePointerCast(kb->getRawInputPointer(mStreamSetInputs[i].name, kb->getInt32(0), nextBasePos), bufPtrType);
    1083                 Value * nextBufPtr = kb->CreateGEP(tempBufPtr, kb->CreateUDiv(copyItems1, blockSize));
    1084                 //mStreamSetInputBuffers[i]->createBlockAlignedCopy(kb.get(), nextBufPtr, nextInputPtr, copyItems2);
    1085                 mStreamSetInputBuffers[i]->createBlockCopy(kb.get(), nextBufPtr, nextInputPtr, copyBlocks2);
    1086                 kb->CreateBr(copyDone);
    1087                 kb->SetInsertPoint(copyDone);
    1088             }
    1089             tempArgs.push_back(tempBufPtr);
    1090         } else {
    1091             Value * bufPtr = kb->getInputStreamBlockPtr(mStreamSetInputs[i].name, kb->getInt32(0));
    1092             bufPtr = kb->CreatePointerCast(bufPtr, mStreamSetInputBuffers[i]->getStreamSetBlockType()->getPointerTo());
    1093             tempArgs.push_back(bufPtr);           
    1094         }
    1095     }
    1096     Value * outputBasePos[outputSetCount];
    1097     for (unsigned i = 0; i < outputSetCount; i++) {
    1098         Value * tempBufPtr = kb->CreateGEP(tempParameterArea,  {kb->getInt32(0), kb->getInt32(inputSetCount + i)});
    1099         Type * bufPtrType = mStreamSetOutputBuffers[i]->getStreamSetBlockType()->getPointerTo();
    1100         tempBufPtr = kb->CreatePointerCast(tempBufPtr, bufPtrType);
    1101         producedItemCount[i] = kb->getProducedItemCount(mStreamSetOutputs[i].name);
    1102         outputBasePos[i] = kb->CreateAnd(producedItemCount[i], blockBaseMask);
    1103         //mStreamSetOutputBuffers[i]->createBlockAlignedCopy(kb.get(), tempBufPtr, outputBlockPtr[i], kb->CreateSub(producedItemCount[i], outputBasePos[i]));
    1104         Value * copyBlocks = kb->CreateUDivCeil(kb->CreateSub(producedItemCount[i], outputBasePos[i]), blockSize);
    1105         mStreamSetOutputBuffers[i]->createBlockCopy(kb.get(), tempBufPtr, outputBlockPtr[i], copyBlocks);
    1106         tempArgs.push_back(tempBufPtr);
    1107     }
    1108 
    1109     kb->CreateCall(multiBlockFunction, tempArgs);
    1110    
    1111     //  The items have been processed and output generated to the temporary areas.
    1112     //  Update the processed item count (and hence all the counts derived automatically
    1113     //  therefrom).
    1114     if (mIsDerived[0]) {
    1115         kb->setProcessedItemCount(mStreamSetInputs[0].name, finalItemCountNeeded[0]);
    1116     }
    1117    
    1118     // Copy back data to the actual output buffers.
    1119     for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
    1120         Value * tempBufPtr = kb->CreateGEP(tempParameterArea,  {kb->getInt32(0), kb->getInt32(mStreamSetInputs.size() + i)});
    1121         tempBufPtr = kb->CreatePointerCast(tempBufPtr, mStreamSetOutputBuffers[i]->getStreamSetBlockType()->getPointerTo());
    1122         Value * finalOutputItems = kb->getProducedItemCount(mStreamSetOutputs[i].name);
    1123         Value * copyItems = kb->CreateSub(finalOutputItems, outputBasePos[i]);
    1124         // Round up to exact multiple of block size.
    1125         //copyItems = kb->CreateAnd(kb->CreateAdd(copyItems, kb->getSize(bitBlockWidth - 1)), blockBaseMask);
    1126         Value * writableFromBase = kb->getLinearlyWritableItems(mStreamSetOutputs[i].name, outputBasePos[i]); // must be a whole number of blocks.
    1127         Value * allWritable = kb->CreateICmpULE(copyItems, writableFromBase);
    1128         Value * copyItems1 = kb->CreateSelect(allWritable, copyItems, writableFromBase);
    1129         //mStreamSetOutputBuffers[i]->createBlockAlignedCopy(kb.get(), outputBlockPtr[i], tempBufPtr, copyItems1);
    1130         Value * copyBlocks1 = kb->CreateUDivCeil(copyItems1, blockSize);
    1131         mStreamSetOutputBuffers[i]->createBlockCopy(kb.get(), outputBlockPtr[i], tempBufPtr, copyBlocks1);
    1132         BasicBlock * copyBackRemaining = kb->CreateBasicBlock("copyBackRemaining");
    1133         BasicBlock * copyBackDone = kb->CreateBasicBlock("copyBackDone");
    1134         kb->CreateCondBr(allWritable, copyBackDone, copyBackRemaining);
    1135         kb->SetInsertPoint(copyBackRemaining);
    1136         Value * copyItems2 = kb->CreateSub(copyItems, copyItems1);
    1137         Value * nextBasePos = kb->CreateAdd(outputBasePos[i], copyItems1);
    1138         Type * bufPtrType = mStreamSetOutputBuffers[i]->getStreamSetBlockType()->getPointerTo();
    1139         Value * nextOutputPtr = kb->CreatePointerCast(kb->getRawOutputPointer(mStreamSetOutputs[i].name, kb->getInt32(0), nextBasePos), bufPtrType);
    1140         tempBufPtr = kb->CreateGEP(tempBufPtr, kb->CreateUDiv(copyItems1, blockSize));
    1141         //mStreamSetOutputBuffers[i]->createBlockAlignedCopy(kb.get(), nextOutputPtr, tempBufPtr, copyItems2);
    1142         Value * copyBlocks2 = kb->CreateUDivCeil(copyItems2, blockSize);
    1143         mStreamSetOutputBuffers[i]->createBlockCopy(kb.get(), nextOutputPtr, tempBufPtr, copyBlocks2);
    1144         kb->CreateBr(copyBackDone);
    1145         kb->SetInsertPoint(copyBackDone);
    1146     }
    1147 
    1148     //  We've dealt with the partial block processing and copied information back into the
    1149     //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
    1150     //
    1151     BasicBlock * setTermination = kb->CreateBasicBlock("mBsetTermination");
    1152     if (mIsDerived[0]) {
    1153         stridesRemaining->addIncoming(kb->CreateSub(stridesRemaining, kb->CreateZExt(haveStrides, kb->getSizeTy())), kb->GetInsertBlock());
    1154         kb->CreateCondBr(haveStrides, doSegmentOuterLoop, setTermination);
    1155     }
    1156     else {
    1157         Value * nowProcessed = kb->getProcessedItemCount(mStreamSetInputs[0].name);
    1158         Value * remainingItemsToDo = kb->CreateSub(itemsAvail, nowProcessed);
    1159         Value * reducedStridesToDo = kb->CreateUDiv(remainingItemsToDo, nowProcessed);
    1160         stridesRemaining->addIncoming(reducedStridesToDo, kb->GetInsertBlock());
    1161         Value * haveStrides = kb->CreateICmpUGT(reducedStridesToDo, kb->getSize(0));
    1162         kb->CreateCondBr(haveStrides, doSegmentOuterLoop, setTermination);
    1163     }   
    1164     kb->SetInsertPoint(setTermination);
    1165     kb->setTerminationSignal();
    1166     kb->CreateBr(segmentDone);
    1167     kb->SetInsertPoint(segmentDone);
    1168 }
    1169 
    1170 void Kernel::finalizeInstance(const std::unique_ptr<KernelBuilder> & idb) {
    1171     assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
    1172     mOutputScalarResult = idb->CreateCall(getTerminateFunction(idb->getModule()), { getInstance() });
    1173 }
    1174 
    1175 Kernel::StreamPort Kernel::getStreamPort(const std::string & name) const {
    1176     const auto f = mStreamMap.find(name);
    1177     if (LLVM_UNLIKELY(f == mStreamMap.end())) {
    1178         report_fatal_error(getName() + " does not contain stream set " + name);
    1179     }
    1180     return f->second;
    1181 }
    1182 
    11831437static inline std::string annotateKernelNameWithDebugFlags(std::string && name) {
    11841438    if (codegen::EnableAsserts) {
     
    12011455                  , std::move(internal_scalars))
    12021456, mCurrentMethod(nullptr)
     1457, mAvailablePrincipleItemCount(nullptr)
    12031458, mNoTerminateAttribute(false)
    12041459, mIsGenerated(false)
     1460, mStride(0)
    12051461, mIsFinal(nullptr)
    1206 , mOutputScalarResult(nullptr)
    1207 , mStride(0) {
     1462, mOutputScalarResult(nullptr) {
    12081463
    12091464}
     
    12201475                                         std::vector<Binding> && scalar_outputs,
    12211476                                         std::vector<Binding> && internal_scalars)
    1222 : Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
     1477: MultiBlockKernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
    12231478, mDoBlockMethod(nullptr)
    12241479, mStrideLoopBody(nullptr)
     
    12281483}
    12291484
    1230 // CONSTRUCTOR
     1485// MULTI-BLOCK KERNEL CONSTRUCTOR
    12311486MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
    12321487                                   std::vector<Binding> && stream_inputs,
     
    12361491                                   std::vector<Binding> && internal_scalars)
    12371492: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
     1493
    12381494}
    12391495
     
    12461502                                             std::vector<Binding> && internal_scalars)
    12471503: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
    1248    
    1249 }
    1250  
    1251    
    1252 void applyOutputBufferExpansions(const std::unique_ptr<KernelBuilder> & kb,
    1253                                  std::vector<Value *> inputAvailable,
    1254                                  Value * doFinal) {
    1255     auto kernel = kb->getKernel();
    1256     const unsigned inputSetCount = inputAvailable.size();
    1257     if (inputSetCount == 0) return;  //  Cannot calculate buffer items expected from input.
    1258     auto & outputs = kernel->getStreamSetOutputBuffers();
    1259     const unsigned outputSetCount = outputs.size();
    1260 
    1261     Constant * blockSize = kb->getSize(kb->getBitBlockWidth());
    1262     Value * newlyAvailInputItems[inputSetCount];
    1263     Value * requiredOutputBufferSpace[outputSetCount];
    1264     for (unsigned i = 0; i < inputSetCount; i++) {
    1265         Value * processed = kb->getProcessedItemCount(kernel->getStreamInput(i).name);
    1266         newlyAvailInputItems[i] = kb->CreateSub(inputAvailable[i], processed);
    1267     }
    1268     //kb->GetInsertBlock()->dump();
    1269     for (unsigned i = 0; i < outputSetCount; i++) {
    1270         const auto & rate = kernel->getStreamOutput(i).rate;
    1271         if (rate.isUnknownRate()) continue;  // No calculations possible.
    1272         Kernel::Port port; unsigned ssIdx;
    1273         std::tie(port, ssIdx) = kernel->getStreamPort(rate.referenceStreamSet());
    1274         Value * base = nullptr;
    1275         if (port == Kernel::Port::Output) {
    1276             base = requiredOutputBufferSpace[ssIdx]; assert (base);
    1277         } else {
    1278             base = newlyAvailInputItems[ssIdx]; assert (base);
    1279         }
    1280         requiredOutputBufferSpace[i] = rate.CreateRatioCalculation(kb.get(), base, doFinal);
    1281         if (auto db = dyn_cast<DynamicBuffer>(outputs[i])) {
    1282             Value * handle = db->getStreamSetHandle();
    1283             // This buffer can be expanded.
    1284             Value * producedBlock = kb->CreateUDivCeil(kb->getProducedItemCount(kernel->getStreamOutput(i).name), blockSize);
    1285             Value * consumedBlock = kb->CreateUDiv(kb->getConsumedItemCount(kernel->getStreamOutput(i).name), blockSize);
    1286             Value * blocksInUse = kb->CreateSub(producedBlock, consumedBlock);
    1287             Value * blocksRequired = kb->CreateAdd(blocksInUse, kb->CreateUDivCeil(requiredOutputBufferSpace[i], blockSize));
    1288             Value * spaceRequired = kb->CreateMul(blocksRequired, blockSize);
    1289             Value * expansionNeeded = kb->CreateICmpUGT(spaceRequired, db->getBufferedSize(kb.get(), handle));
    1290             BasicBlock * doExpand = kb->CreateBasicBlock("doExpand");
    1291             BasicBlock * bufferReady = kb->CreateBasicBlock("bufferReady");
    1292             kb->CreateCondBr(expansionNeeded, doExpand, bufferReady);
    1293             kb->SetInsertPoint(doExpand);
    1294             db->doubleCapacity(kb.get(), handle);
    1295             // Ensure that capacity is sufficient by successive doubling, if necessary.
    1296             expansionNeeded = kb->CreateICmpUGT(spaceRequired, db->getBufferedSize(kb.get(), handle));
    1297             kb->CreateCondBr(expansionNeeded, doExpand, bufferReady);
    1298             kb->SetInsertPoint(bufferReady);
    1299         }
    1300     }
    1301 
    1302 }
    1303 
    1304 }
     1504
     1505}
     1506
     1507
     1508}
Note: See TracChangeset for help on using the changeset viewer.