Ignore:
Timestamp:
Dec 3, 2017, 12:40:40 PM (18 months ago)
Author:
nmedfort
Message:

Bug fixes and simplified MultiBlockKernel logic

Location:
icGREP/icgrep-devel/icgrep/kernels
Files:
2 added
30 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/attributes.h

    r5706 r5755  
    22#define ATTRIBUTES_H
    33
     4#include <vector>
     5
    46namespace kernel {
    57
    68struct Attribute {
    79
     10    friend struct AttributeSet;
     11
    812    friend struct Binding;
    913
     
    1216        /** INPUT STREAM ATTRIBUTES **/
    1317
    14         BlockSize,
    15 
    16         // A BlockSize(K) attribute, where K=2^k for some value of k>=4 declares
    17         // that the layout of stream data items within the corresponding input
    18         // or output buffer is arranged in blocks of K items each.   In each
    19         // block, the data buffer contains K items of the first stream in the
    20         // set, followed by K items of the next stream in the set and so on,
    21         // up to and including K items of the last stream in the set.
    22 
    23         // (Note: this replaces the concept of swizzling and anticipates that
    24         // the pipeline will take on the role of automatically inserting the
    25         // swizzling code necessary).
    26 
    27         LookAhead,
     18        LookAhead, /// NOT DONE
    2819
    2920        // A LookAhead(n) attribute on an input stream set S declares that the kernel
     
    4637        // that holds a copy of the data at the physical start of buffer).
    4738
    48         LookBehind,
     39        LookBehind, /// NOT DONE
    4940
    5041        // A LookBehind(n) attribute on an input stream S declares that the kernel
     
    6051        // (Example: lz4d lookbehind(65536)).
    6152
    62         Principle,
    63 
    64         // One input stream can be declared as the principle input buffer for a kernel.
    65         // If a kernel has a principle input stream, when processing the final stride,
     53        Principal,
     54
     55        // One input stream can be declared as the principal input buffer for a kernel.
     56        // If a kernel has a principal input stream, when processing the final stride,
    6657        // a MultiBlockKernel assumes the item count of the principal is the correct
    6758        // one and zero extends / truncates all other input streams to match it.
    6859
     60        Deferred,
     61
     62        // Normally, the processed item count of fixed rate streams is automatically
     63        // updated by the MultiBlock kernel. However, some streams behave like Fixed
     64        // rate streams (in that they will always eventually process a Fixed amount of
     65        // data) but the kernel processes the data in unpredictable chunks. Rather than
     66        // declaring those as Unknown or Bounded rates, marking their rate calculation
     67        // as Deferred provides the pipeline with a stronger guarantee when it comes to
     68        // buffer size calculations.
     69
     70        Greedy,
     71
     72        // Normally, the available item count of fixed rate streams is equal to the
     73        // number of strides processed by the MultiBlock times its stride size for all
     74        // strides except for the final stride. Some kernels consume
     75
    6976        /** OUTPUT STREAM ATTRIBUTES **/
    7077
    7178        Add,
    7279
    73         // An Add(K) attribute states that K bits will be added to this stream after
     80        // An Add(K) attribute states that K items will be added to this stream after
    7481        // processing the final block.
    7582
     
    7986        // be rounded up to the nearest multiple of k
    8087
     88        /** INPUT/OUTPUT STREAM ATTRIBUTES **/
     89
     90        BlockSize, /// NOT DONE
     91
     92        // A BlockSize(K) attribute, where K=2^k for some value of k>=4 declares
     93        // that the layout of stream data items within the corresponding input
     94        // or output buffer is arranged in blocks of K items each.   In each
     95        // block, the data buffer contains K items of the first stream in the
     96        // set, followed by K items of the next stream in the set and so on,
     97        // up to and including K items of the last stream in the set.
     98
     99        // (Note: this replaces the concept of swizzling and anticipates that
     100        // the pipeline will take on the role of automatically inserting the
     101        // swizzling code necessary).
     102
    81103        /** KERNEL ATTRIBUTES **/
    82104
    83         SelectMinimumInputLength,
     105        SelectMinimumInputLength, /// NOT DONE
    84106
    85107        // If a kernel has multiple input streams and their final item count differs,
     
    88110
    89111        // NOTE: this is the default if a kernel does not have SelectMaximumInputLength
    90         // set and no PrincipleInputStream was declared.
    91 
    92         SelectMaximumInputLength,
     112        // set and no PrincipalInputStream was declared.
     113
     114        SelectMaximumInputLength, /// NOT DONE
    93115
    94116        // If a kernel has multiple input streams and their final item count differs,
     
    96118        // principal item length and zero-extend the streams accordingly.
    97119
     120        CanTerminate,
     121
     122        // Informs the pipeline that this kernel can pass a "termination" message to it,
     123        // in which case the pipeline will propagate the message to the subsequent
     124        // kernels and end the program once the final kernel has returned its result.
     125
     126        IndependentRegions,
     127
     128        // Some kernels can divide their processing into concrete non-overlapping regions
     129        // between a start and end position in which the data produced by a kernel. If a
     130        // kernel K is processed simultaneously by two threads, K_0 and K_1, and K_1 is
     131        // waiting for K_0 to finish and update its kernel state for K_1 to resume at, K_1 can
     132        // compute what its state will be and begin processing before K_0 is finished. This
     133        // requires the pipeline to intervene and call an optimized "output-less" instance
     134        // of the kernel prior to calling B.
     135
    98136    };
    99137
     
    102140    }
    103141
    104     bool isPrinciple() const {
    105         return mKind == KindId::Principle;
     142    bool isPrincipal() const {
     143        return mKind == KindId::Principal;
    106144    }
    107145
     
    110148    }
    111149
     150    bool isBlockSize() const {
     151        return mKind == KindId::BlockSize;
     152    }
     153
    112154    unsigned getAmount() const {
    113155        return mK;
     
    129171
    130172    friend Attribute Add1();
    131     friend Attribute Principle();
     173    friend Attribute Principal();
    132174    friend Attribute RoundUpTo(const unsigned);
    133175    friend Attribute LookBehind(const unsigned);
     176    friend Attribute Deferred();
    134177
    135178    Attribute(const KindId kind, const unsigned k) : mKind(kind), mK(k) { }
     
    138181
    139182    const KindId    mKind;
    140     const unsigned  mK;
    141 
     183    unsigned        mK;
    142184};
     185
     186struct AttributeSet : public std::vector<Attribute> {
     187
     188    using AttributeId = Attribute::KindId;
     189
     190    const AttributeSet & getAttributes() const {
     191        return *this;
     192    }
     193
     194    const Attribute & getAttribute(const unsigned i) const {
     195        return getAttributes()[i];
     196    }
     197
     198    void addAttribute(Attribute attribute);
     199
     200    bool hasAttributes() const {
     201        return !empty();
     202    }
     203
     204    bool hasAttribute(const AttributeId id) const;
     205
     206    AttributeSet() = default;
     207
     208    AttributeSet(std::initializer_list<Attribute> attrs) : std::vector<Attribute>(attrs) { }
     209};
     210
    143211
    144212inline Attribute Add1() {
     
    150218}
    151219
    152 inline Attribute Principle() {
    153     return Attribute(Attribute::KindId::Principle, 0);
     220inline Attribute Principal() {
     221    return Attribute(Attribute::KindId::Principal, 0);
    154222}
    155223
     
    158226}
    159227
     228inline Attribute Deferred() {
     229    return Attribute(Attribute::KindId::Deferred, 0);
     230}
    160231
    161232}
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r5706 r5755  
    369369        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    370370    }
    371     Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
     371    Value * const delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
    372372    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
    373373}
    374374
    375 DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount)
    376 : BlockOrientedKernel("del" + std::to_string(fw) + "_" + std::to_string(streamCount),
     375DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned fieldWidth, const unsigned streamCount)
     376: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
    377377              {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
    378378               Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
    379379              {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
    380                Binding{iBuilder->getStreamSetTy(), "deletionCounts"}},
     380               Binding{iBuilder->getStreamSetTy(), "deletionCounts", FixedRate(), RoundUpTo(iBuilder->getBitBlockWidth())}},
    381381              {}, {}, {})
    382 , mDeletionFieldWidth(fw)
     382, mDeletionFieldWidth(fieldWidth)
    383383, mStreamCount(streamCount) {
    384384}
     
    626626        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
    627627    }
    628     iBuilder->setScalarField("pendingOffset", pendingOffset);
    629     iBuilder->CallPrintInt("pendingOffset", pendingOffset);
    630 
    631    
     628    iBuilder->setScalarField("pendingOffset", pendingOffset);   
    632629    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
    633630    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
  • icGREP/icgrep-devel/icgrep/kernels/interface.cpp

    r5733 r5755  
    6767    args->setName("self");
    6868    (++args)->setName("doFinal");
    69 //    if (mHasPrincipleItemCount) {
     69//    if (mHasPrincipalItemCount) {
    7070//        (++args)->setName("principleAvailableItemCount");
    7171//    }
     
    142142}
    143143
    144 void Binding::addAttribute(Attribute attribute) {
    145     for (Attribute & attr : attributes) {
    146         if (attr.getKind() == attribute.getKind()) {
    147             return;
    148         }
    149     }
    150     attributes.emplace_back(attribute);
    151144}
    152 
    153 void KernelInterface::normalizeStreamProcessingRates() {
    154 
    155 }
    156 
    157 }
  • icGREP/icgrep-devel/icgrep/kernels/interface.h

    r5706 r5755  
    2525namespace kernel {
    2626
    27 struct Binding {
    28 
    29     friend class KernelInterface;
     27struct Binding : public AttributeSet {
    3028
    3129    Binding(llvm::Type * type, const std::string & name, ProcessingRate r = FixedRate(1))
    32     : type(type), name(name), rate(r), attributes() { }
     30    : AttributeSet()
     31    , mType(type), mName(name), mRate(std::move(r)) { }
    3332
    3433
    3534    Binding(llvm::Type * type, const std::string & name, ProcessingRate r, Attribute && attribute)
    36     : type(type), name(name), rate(r), attributes({std::move(attribute)}) { }
     35    : AttributeSet({std::move(attribute)})
     36    , mType(type), mName(name), mRate(std::move(r)) { }
    3737
    3838
    3939    Binding(llvm::Type * type, const std::string & name, ProcessingRate r, std::initializer_list<Attribute> attributes)
    40     : type(type), name(name), rate(r), attributes(attributes) { }
     40    : AttributeSet(attributes)
     41    , mType(type), mName(name), mRate(std::move(r)) { }
    4142
    4243    llvm::Type * getType() const {
    43         return type;
     44        return mType;
    4445    }
    4546
    4647    const std::string & getName() const {
    47         return name;
     48        return mName;
    4849    }
    4950
    5051    const ProcessingRate & getRate() const {
    51         return rate;
    52     }
    53 
    54     const Attribute & getAttribute(const unsigned i) const {
    55         return attributes[i];
    56     }
    57 
    58     const std::vector<Attribute> & getAttributes() const {
    59         return attributes;
    60     }
    61 
    62     void addAttribute(Attribute attribute);
    63 
    64     bool hasAttributes() const {
    65         return !attributes.empty();
     52        return mRate;
     53    }
     54
     55    ProcessingRate & getRate() {
     56        return mRate;
     57    }
     58
     59    bool isPrincipal() const {
     60        return hasAttribute(Attribute::KindId::Principal);
     61    }
     62
     63    bool notDeferred() const {
     64        return !hasAttribute(Attribute::KindId::Deferred);
    6665    }
    6766
    6867private:
    69     llvm::Type * const          type;
    70     const std::string           name;
    71     ProcessingRate              rate;
    72     std::vector<Attribute>      attributes;
     68    llvm::Type * const          mType;
     69    const std::string           mName;
     70    ProcessingRate              mRate;
    7371};
    7472
    75 class KernelInterface {
     73using Bindings = std::vector<Binding>;
     74
     75class KernelInterface : public AttributeSet {
    7676public:
    7777    /*
     
    9797
    9898    const Binding & getStreamInput(const unsigned i) const {
     99        assert (i < getNumOfStreamInputs());
    99100        return mStreamSetInputs[i];
    100101    }
     
    113114
    114115    const Binding & getStreamOutput(const unsigned i) const {
     116        assert (i < getNumOfStreamOutputs());
    115117        return mStreamSetOutputs[i];
    116118    }
     
    153155    void setInstance(llvm::Value * const instance);
    154156
    155     bool hasPrincipleItemCount() const {
    156         return mHasPrincipleItemCount;
     157    bool hasPrincipalItemCount() const {
     158        return mHasPrincipalItemCount;
    157159    }
    158160
     
    184186    , mModule(nullptr)
    185187    , mKernelStateType(nullptr)
    186     , mHasPrincipleItemCount(false)
     188    , mHasPrincipalItemCount(false)
    187189    , mKernelName(kernelName)
    188190    , mStreamSetInputs(stream_inputs)
     
    191193    , mScalarOutputs(scalar_outputs)
    192194    , mInternalScalars(internal_scalars) {
    193         normalizeStreamProcessingRates();
     195
    194196    }
    195197   
    196 private:
    197 
    198     void normalizeStreamProcessingRates();
    199 
    200198protected:
    201199
     
    203201    llvm::Module *                          mModule;
    204202    llvm::StructType *                      mKernelStateType;
    205     bool                                    mHasPrincipleItemCount;
     203    bool                                    mHasPrincipalItemCount;
    206204    const std::string                       mKernelName;
    207205    std::vector<llvm::Value *>              mInitialArguments;
     
    211209    std::vector<Binding>                    mScalarOutputs;
    212210    std::vector<Binding>                    mInternalScalars;
    213 
    214211};
    215212
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5743 r5755  
    2222#include <sstream>
    2323#include <kernels/kernel_builder.h>
    24 #include <boost/math/common_factor_rt.hpp>
     24#include <boost/math/common_factor.hpp>
    2525#include <llvm/Support/Debug.h>
    2626
     
    5050        report_fatal_error("Cannot add field " + name + " to " + getName() + " after kernel state finalized");
    5151    }
    52     if (LLVM_UNLIKELY(mKernelMap.count(name))) {
     52    if (LLVM_UNLIKELY(mKernelFieldMap.count(name))) {
    5353        report_fatal_error(getName() + " already contains scalar field " + name);
    5454    }
    5555    const auto index = mKernelFields.size();
    56     mKernelMap.emplace(name, index);
     56    mKernelFieldMap.emplace(name, index);
    5757    mKernelFields.push_back(type);
    5858    return index;
     
    189189        mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
    190190        assert (mKernelStateType);
    191     }   
     191    }
    192192}
    193193
     
    206206    if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
    207207        report_fatal_error("Kernel definition for " + getName() + " could not be found in the cache object");
    208     }   
    209 }
    210 
    211 /** ------------------------------------------------------------------------------------------------------------- *
    212  * @brief getItemsPerStride
    213  ** ------------------------------------------------------------------------------------------------------------- */
    214 std::pair<unsigned, unsigned> Kernel::getStreamRate(const Port p, const unsigned i) const {
    215     const ProcessingRate & rate = (p == Port::Input) ? mStreamSetInputs[i].getRate() : mStreamSetOutputs[i].getRate();
    216     unsigned min = 0, max = 0;
    217     if (rate.isFixed()) {
    218         min = max = rate.getRate();
    219     } else if (rate.isBounded()) {
    220         min = rate.getLowerBound();
    221         max = rate.getUpperBound();
    222     } else if (rate.isUnknown()) {
    223         min = rate.getLowerBound();
    224         max = 0;
    225     } else if (rate.isExactlyRelative()) {
    226         for (unsigned j = 0; j < mStreamSetInputs.size(); ++j) {
    227             if (mStreamSetInputs[j].getName() == rate.getReference()) {
    228                 std::tie(min, max) = getStreamRate(Port::Input, j);
    229                 min = (min * rate.getNumerator()) / rate.getDenominator();
    230                 assert (max == 0 || (max * rate.getNumerator()) % rate.getDenominator() == 0);
    231                 max = (max * rate.getNumerator()) / rate.getDenominator();
    232                 return std::make_pair(min, max);
    233             }
    234         }
    235         for (unsigned j = 0; j < mStreamSetOutputs.size(); ++j) {
    236             if (mStreamSetOutputs[j].getName() == rate.getReference()) {
    237                 assert (p == Port::Output);
    238                 std::tie(min, max) = getStreamRate(Port::Output, j);
    239                 min = (min * rate.getNumerator()) / rate.getDenominator();
    240                 assert (max == 0 || (max * rate.getNumerator()) % rate.getDenominator() == 0);
    241                 max = (max * rate.getNumerator()) / rate.getDenominator();
    242                 return std::make_pair(min, max);
    243             }
    244         }
    245         llvm_unreachable("Reference rate must be associated with an input or output!");
    246     }
    247     return std::make_pair(min, max);
     208    }
    248209}
    249210
     
    252213 ** ------------------------------------------------------------------------------------------------------------- */
    253214void Kernel::addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb) {
    254    
     215
     216    if (mStreamMap.empty()) {
     217        prepareStreamSetNameMap();
     218    }
     219
     220    normalizeStreamProcessingRates();
     221
    255222    const unsigned inputSetCount = mStreamSetInputs.size();
    256223    const unsigned outputSetCount = mStreamSetOutputs.size();
    257    
     224
    258225    assert (inputSetCount == mStreamSetInputBuffers.size());
    259226    assert (outputSetCount == mStreamSetOutputBuffers.size());
     
    293260    for (const auto & binding : mScalarOutputs) {
    294261        addScalar(binding.getType(), binding.getName());
    295     }
    296     if (mStreamMap.empty()) {
    297         prepareStreamSetNameMap();
    298262    }
    299263    for (const auto & binding : mInternalScalars) {
     
    388352    setInstance(&*(args++));
    389353    mIsFinal = &*(args++);
    390     mAvailablePrincipleItemCount = nullptr;
    391 //    if (mHasPrincipleItemCount) {
    392 //        mAvailablePrincipleItemCount = &*(args++);
    393 //    }
     354    mAvailablePrincipalItemCount = nullptr;
    394355    const auto n = mStreamSetInputs.size();
    395356    mAvailableItemCount.resize(n, nullptr);
    396357    for (unsigned i = 0; i < n; i++) {
    397 //        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    398 //        Value * itemCount = nullptr;
    399 //        if (rate.isFixed()) {
    400 //            itemCount = mAvailablePrincipleItemCount;
    401 //            if (rate.getRate() != 1) {
    402 //                itemCount = idb->CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getRate()));
    403 //            }
    404 //        } else if (rate.isBounded() || rate.isUnknown()) {
    405 //            itemCount = &*(args++);
    406 //        } else if (rate.isRelative()) {
    407 //            for (unsigned j = 0; j < i; ++j) {
    408 //                if (mStreamSetInputs[j].getName() == rate.getReference()) {
    409 //                    itemCount = mAvailableItemCount[j];
    410 //                    break;
    411 //                }
    412 //            }
    413 //            if (LLVM_UNLIKELY(itemCount == nullptr)) {
    414 //                report_fatal_error(mStreamSetInputs[i].getName() + " is declared before " + rate.getReference());
    415 //            }
    416 //            if (rate.getNumerator() != 1) {
    417 //                itemCount = idb->CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getNumerator()));
    418 //            }
    419 //            if (rate.getDenominator() != 1) {
    420 //                itemCount = idb->CreateUDiv(itemCount, ConstantInt::get(itemCount->getType(), rate.getDenominator()));
    421 //            }
    422 //        }
    423 //        assert (itemCount);
    424 //        mAvailableItemCount[i] = itemCount;
    425 
    426358        assert (args != mCurrentMethod->arg_end());
    427359        mAvailableItemCount[i] = &*(args++);
    428360    }
    429361    assert (args == mCurrentMethod->arg_end());
    430 
    431362    generateKernelMethod(idb); // must be overridden by the Kernel subtype
    432363    mIsFinal = nullptr;
     
    466397 ** ------------------------------------------------------------------------------------------------------------- */
    467398unsigned Kernel::getScalarIndex(const std::string & name) const {
    468     const auto f = mKernelMap.find(name);
    469     if (LLVM_UNLIKELY(f == mKernelMap.end())) {
     399    const auto f = mKernelFieldMap.find(name);
     400    if (LLVM_UNLIKELY(f == mKernelFieldMap.end())) {
    470401        assert (false);
    471402        report_fatal_error(getName() + " does not contain scalar: " + name);
     
    574505
    575506/** ------------------------------------------------------------------------------------------------------------- *
     507 * @brief getBinding
     508 ** ------------------------------------------------------------------------------------------------------------- */
     509const Binding & Kernel::getBinding(const std::string & name) const {
     510    Port port; unsigned index;
     511    std::tie(port, index) = getStreamPort(name);
     512    return (port == Port::Input) ? getStreamInput(index) : getStreamOutput(index);
     513}
     514
     515/** ------------------------------------------------------------------------------------------------------------- *
     516 * @brief normalizeRelativeToFixedProcessingRate
     517 ** ------------------------------------------------------------------------------------------------------------- */
     518bool Kernel::normalizeRelativeToFixedProcessingRate(const ProcessingRate & base, ProcessingRate & toUpdate) {
     519    if (base.isFixed()) {
     520        return true;
     521    } else if (LLVM_UNLIKELY(base.isRelative())) {
     522        const auto & ref = getBinding(base.getReference()).getRate();
     523        if (normalizeRelativeToFixedProcessingRate(ref, toUpdate)) {
     524            toUpdate.getRate() *= ref.getRate();
     525            return true;
     526        }
     527    }
     528    return false;
     529}
     530
     531/** ------------------------------------------------------------------------------------------------------------- *
     532 * @brief normalizeStreamProcessingRates
     533 *
     534 * If we allow a stream to be transitively relative to a fixed rate stream, it complicates detection of fixed
     535 * rate streams later. Find any such occurrences and transform them. This implies, however, that a fixed rate
     536 * stream could have a rational processing rate (which should not occur normally.)
     537 ** ------------------------------------------------------------------------------------------------------------- */
     538inline void Kernel::normalizeStreamProcessingRates() {
     539    for (Binding & input : mStreamSetInputs) {
     540        normalizeRelativeToFixedProcessingRate(input.getRate(), input.getRate());
     541    }
     542    for (Binding & output : mStreamSetOutputs) {
     543        normalizeRelativeToFixedProcessingRate(output.getRate(), output.getRate());
     544    }
     545    // TODO: we want to consume whole units. Once the pipeline is able to schedule kernels based on their stride
     546    // and input/output rates, modify them here.
     547}
     548
     549/** ------------------------------------------------------------------------------------------------------------- *
    576550 * @brief generateKernelMethod
    577551 ** ------------------------------------------------------------------------------------------------------------- */
    578552void SegmentOrientedKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
    579 
    580     Constant * const log2BlockWidth = b->getSize(std::log2(b->getBitBlockWidth()));
    581 
    582553    const auto inputSetCount = mStreamSetInputs.size();
    583     mStreamSetInputBufferPtr.resize(inputSetCount);
     554    mStreamSetInputBaseAddress.resize(inputSetCount);
    584555    for (unsigned i = 0; i < inputSetCount; ++i) {
    585         const auto & name = mStreamSetInputs[i].getName();
    586         Value * ic = b->getProcessedItemCount(name);
    587         Value * const blockIndex = b->CreateLShr(ic, log2BlockWidth);
    588         mStreamSetInputBufferPtr[i] = b->getInputStreamPtr(name, blockIndex);
    589     }
    590 
     556        mStreamSetInputBaseAddress[i] = nullptr;
     557    }
    591558    const auto outputSetCount = mStreamSetOutputs.size();
    592     mStreamSetOutputBufferPtr.resize(outputSetCount);
     559    mStreamSetOutputBaseAddress.resize(outputSetCount);
    593560    for (unsigned i = 0; i < outputSetCount; ++i) {
    594         const auto & name = mStreamSetOutputs[i].getName();
    595         Value * ic = b->getProducedItemCount(name);
    596         Value * const blockIndex = b->CreateLShr(ic, log2BlockWidth);
    597         mStreamSetOutputBufferPtr[i] = b->getOutputStreamPtr(name, blockIndex);
    598     }
    599 
     561        mStreamSetOutputBaseAddress[i] = nullptr;
     562    }
    600563    generateDoSegmentMethod(b);
    601 
     564}
     565
     566/** ------------------------------------------------------------------------------------------------------------- *
     567 * @brief requiresBufferedFinalStride
     568 ** ------------------------------------------------------------------------------------------------------------- */
     569inline bool requiresBufferedFinalStride(const Binding & b) {
     570    if (LLVM_LIKELY(isa<ArrayType>(b.getType()))) {
     571        return b.getType()->getArrayNumElements() == 1;
     572    }
     573    return true;
     574}
     575
     576/** ------------------------------------------------------------------------------------------------------------- *
     577 * @brief getItemWidth
     578 ** ------------------------------------------------------------------------------------------------------------- */
     579inline unsigned getItemWidth(const Binding & b) {
     580    Type * ty = b.getType();
     581    if (LLVM_LIKELY(isa<ArrayType>(ty))) {
     582        ty = ty->getArrayElementType();
     583    }
     584    return cast<IntegerType>(ty->getVectorElementType())->getBitWidth();
     585}
     586
     587/** ------------------------------------------------------------------------------------------------------------- *
     588 * @brief getLowerBound
     589 ** ------------------------------------------------------------------------------------------------------------- */
     590ProcessingRate::RateValue MultiBlockKernel::getLowerBound(const ProcessingRate & rate) const {
     591    if (rate.isFixed() || rate.isBounded()) {
     592        return rate.getLowerBound();
     593    } else if (rate.isRelative()) {
     594        return rate.getRate() * getLowerBound(getBinding(rate.getReference()).getRate());
     595    } else { // if (rate.isUnknown())
     596        return 0;
     597    }
     598}
     599
     600/** ------------------------------------------------------------------------------------------------------------- *
     601 * @brief getUpperBound
     602 ** ------------------------------------------------------------------------------------------------------------- */
     603ProcessingRate::RateValue MultiBlockKernel::getUpperBound(const ProcessingRate &rate) const {
     604    if (rate.isFixed() || rate.isBounded()) {
     605        return rate.getUpperBound();
     606    } else if (rate.isRelative()) {
     607        return rate.getRate() * getUpperBound(getBinding(rate.getReference()).getRate());
     608    } else { // if (rate.isUnknown())
     609        return 0;
     610    }
     611}
     612
     613/** ------------------------------------------------------------------------------------------------------------- *
     614 * @brief isTransitivelyUnknownRate
     615 ** ------------------------------------------------------------------------------------------------------------- */
     616bool MultiBlockKernel::isTransitivelyUnknownRate(const ProcessingRate & rate) const {
     617    if (rate.isUnknown()) {
     618        return true;
     619    } else if (rate.isDerived()) {
     620        return isTransitivelyUnknownRate(getBinding(rate.getReference()).getRate());
     621    }
     622    return false;
     623}
     624
     625/** ------------------------------------------------------------------------------------------------------------- *
     626 * @brief roundUp
     627 ** ------------------------------------------------------------------------------------------------------------- */
     628unsigned roundUp(const ProcessingRate::RateValue & r) {
     629    if (LLVM_LIKELY(r.denominator() == 1)) {
     630        return r.numerator();
     631    } else {
     632        return (r.numerator() + r.denominator() - 1) / r.denominator();
     633    }
     634}
     635
     636/** ------------------------------------------------------------------------------------------------------------- *
     637 * @brief getItemAlignment
     638 ** ------------------------------------------------------------------------------------------------------------- */
     639inline unsigned MultiBlockKernel::getItemAlignment(const Binding & binding) const {
     640    const auto & rate = binding.getRate();
     641    if (rate.isFixed()) {
     642        const auto & r = rate.getRate();
     643        const auto n = (r.numerator() * mStride);
     644        if (LLVM_LIKELY(r.denominator() == 1)) {
     645            return n;
     646        } else if (LLVM_LIKELY((n % r.denominator()) == 0)) {
     647            return n / r.denominator();
     648        }
     649    }
     650    return 1; // ∀x GCD(x, x + 1) = 1
     651}
     652
     653/** ------------------------------------------------------------------------------------------------------------- *
     654 * @brief getStrideSize
     655 ** ------------------------------------------------------------------------------------------------------------- */
     656llvm::Value * MultiBlockKernel::getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate) {
     657    // NOTE: if we ever support feedback loops, using upper bound could lead to a deadlock due to data starvation
     658    const auto r = getUpperBound(rate);
     659    if (r.numerator() == 0) {
     660        return nullptr;
     661    } else {
     662        assert ((r.numerator() * mStride) % r.denominator() == 0);
     663        return b->getSize((r.numerator() * mStride) / r.denominator());
     664    }
    602665}
    603666
     
    605668 * @brief generateKernelMethod
    606669 ** ------------------------------------------------------------------------------------------------------------- */
    607 void MultiBlockKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & kb) {
     670void MultiBlockKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
     671
     672    if (LLVM_UNLIKELY((mStride % b->getBitBlockWidth()) != 0)) {
     673        report_fatal_error(getName() + ": the Stride (" + std::to_string(mStride) + ") of MultiBlockKernel "
     674                           "must be a multiple of the BitBlockWidth (" + std::to_string(b->getBitBlockWidth()) + ")");
     675    }
    608676
    609677    const auto inputSetCount = mStreamSetInputs.size();
    610678    const auto outputSetCount = mStreamSetOutputs.size();
    611     const auto totalSetCount = inputSetCount + outputSetCount;
    612 
    613     // Scan through and see if any of our input streams is marked as the principle
    614 
    615     bool hasPrinciple = false;
    616     unsigned principleInput = 0;
    617 
    618     for (unsigned i = 0; i < inputSetCount; i++) {
    619         for (const auto attr : mStreamSetInputs[i].getAttributes()) {
    620             if (attr.isPrinciple()) {
    621                 hasPrinciple = true;
    622                 principleInput = i;
    623                 break;
     679
     680    // Define and allocate the temporary buffer area in the prolog.
     681    const auto alignment = b->getBitBlockWidth() / 8;
     682    Value * temporaryInputBuffer[inputSetCount];
     683    for (unsigned i = 0; i < inputSetCount; ++i) {
     684
     685        // TODO: if this is a fixed rate input stream and the pipeline guarantees it will not call the kernel unless
     686        // there is sufficient input and all buffers will be sized sufficiently for the input, we ought to be able to
     687        // avoid the temporary buffer checks.
     688
     689        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     690        Type * const ty = mStreamSetInputBuffers[i]->getStreamSetBlockType();
     691        const auto ub = getUpperBound(rate);
     692        if (ub.numerator() == 0) {
     693            report_fatal_error("MultiBlock kernels do not support unknown rate input streams or streams relative to an unknown rate input.");
     694        } else {           
     695            temporaryInputBuffer[i] = b->CreateAlignedAlloca(ty, alignment, b->getSize(roundUp(ub)));
     696            Type * const sty = temporaryInputBuffer[i]->getType()->getPointerElementType();
     697            b->CreateStore(Constant::getNullValue(sty), temporaryInputBuffer[i]);
     698        }       
     699    }
     700
     701    Value * temporaryOutputBuffer[outputSetCount];
     702    for (unsigned i = 0; i < outputSetCount; i++) {
     703        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     704        Type * const ty = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
     705        if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate))) {
     706            temporaryOutputBuffer[i] = nullptr;
     707        } else {           
     708            auto ub = getUpperBound(rate);
     709            if (LLVM_UNLIKELY(mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate))) {
     710                ub += mStreamSetOutputBuffers[i]->overflowSize();
    624711            }
     712            temporaryOutputBuffer[i] = b->CreateAlignedAlloca(ty, alignment, b->getSize(roundUp(ub)));
     713            Type * const sty = temporaryOutputBuffer[i]->getType()->getPointerElementType();
     714            b->CreateStore(Constant::getNullValue(sty), temporaryOutputBuffer[i]);
    625715        }
    626716    }
     
    634724    // to process, in which case we abort unless IsFinal was set.
    635725
     726    Constant * const ZERO = b->getSize(0);
     727    Constant * const ONE = b->getSize(1);
     728    Constant * const LOG_2_BLOCK_WIDTH = b->getSize(std::log2(b->getBitBlockWidth()));
     729    Constant * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
     730
    636731    // Now proceed with creation of the doSegment method.
    637     BasicBlock * const doSegmentLoop = kb->CreateBasicBlock("DoSegmentLoop");
    638     kb->CreateBr(doSegmentLoop);
     732    BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
     733
     734    b->CreateBr(segmentLoop);
    639735
    640736    /// DO SEGMENT LOOP
    641737
    642     kb->SetInsertPoint(doSegmentLoop);
    643 
    644     // For each input buffer, determine the processedItemCount, the block pointer for the
    645     // buffer block containing the next item, and the number of linearly available items.
    646 
    647     Value * processedItemCount[inputSetCount];
    648     Value * baseInputBuffer[inputSetCount];
    649     Value * unprocessed[inputSetCount];
    650     Value * linearlyAvailable[inputSetCount];
    651     Value * readableStrides[inputSetCount];
    652 
    653     Constant * const log2BlockWidth = kb->getSize(std::log2(kb->getBitBlockWidth()));
    654 
     738    b->SetInsertPoint(segmentLoop);
     739
     740    // For each input buffer, get the initial processed item count, base input pointer, and the number of
     741    // linearly available strides.
    655742    Value * numOfStrides = nullptr;
    656 
     743    mInitialAvailableItemCount.resize(inputSetCount);
     744    mInitialProcessedItemCount.resize(inputSetCount);
     745    mStreamSetInputBaseAddress.resize(inputSetCount);
     746    Value * inputStrideSize[inputSetCount];
    657747    for (unsigned i = 0; i < inputSetCount; i++) {
    658         const auto name = mStreamSetInputs[i].getName();
    659         const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    660 
    661         processedItemCount[i] = kb->getProcessedItemCount(name);
    662 
    663         assert (processedItemCount[i]->getType() == mAvailableItemCount[i]->getType());
    664 
    665         Value * const blockIndex = kb->CreateLShr(processedItemCount[i], log2BlockWidth);
    666         baseInputBuffer[i] = kb->getInputStreamPtr(name, blockIndex);
    667 
    668         if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
    669             kb->CreateAssert(kb->CreateICmpUGE(mAvailableItemCount[i], processedItemCount[i]),
    670                              "Processed item count cannot exceed the available item count");
    671         }
    672 
    673         unprocessed[i] = kb->CreateSub(mAvailableItemCount[i], processedItemCount[i]);
    674 
    675         //kb->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed[i]);
    676 
    677         // INVESTIGATE: If the input rate of this stream is constant and known a priori, we could
    678         // avoid checking whether it is linearly accessible. Should we have an attribute for this?
    679 
    680         linearlyAvailable[i] = kb->getLinearlyAccessibleItems(name, processedItemCount[i], unprocessed[i]);
    681 
    682         //kb->CallPrintInt(getName() + "_" + name + "_linearlyAvailable", linearlyAvailable[i]);
    683 
    684         readableStrides[i] = nullptr;
    685 
    686         if (rate.isFixed() || rate.isBounded()) {
    687             Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
    688             readableStrides[i] = kb->CreateUDiv(linearlyAvailable[i], maxStrideSize);
    689             if (numOfStrides) {
    690                 numOfStrides = kb->CreateUMin(numOfStrides, readableStrides[i]);
    691             } else {
    692                 numOfStrides = readableStrides[i];
     748        const auto & input = mStreamSetInputs[i];
     749        const auto & name = input.getName();
     750        const ProcessingRate & rate = input.getRate();
     751        Value * const ic = b->getProcessedItemCount(name);
     752        mInitialProcessedItemCount[i] = ic;
     753        b->CreateAssert(b->CreateICmpUGE(mAvailableItemCount[i], ic), "processed item count cannot exceed the available item count");
     754        assert (ic->getType() == mAvailableItemCount[i]->getType());
     755        Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], ic);
     756        mStreamSetInputBaseAddress[i]  = b->getBlockAddress(name, b->CreateLShr(ic, LOG_2_BLOCK_WIDTH));
     757        mInitialAvailableItemCount[i] = mAvailableItemCount[i];
     758        mAvailableItemCount[i] = b->getLinearlyAccessibleItems(name, ic, unprocessed);
     759        // Are our linearly accessible items sufficient for a stride?
     760        inputStrideSize[i] = getStrideSize(b, rate);
     761        Value * accessibleStrides = b->CreateUDiv(mAvailableItemCount[i], inputStrideSize[i]);
     762        if (!rate.isFixed() || requiresBufferedFinalStride(input)) {
     763
     764            // Since we trust that the pipeline won't call this kernel unless there is enough data to process a stride, whenever
     765            // we discover that there isn't enough linearly available data, optimistically copy the data to the temporary buffer.
     766
     767            BasicBlock * const entry = b->GetInsertBlock();
     768            BasicBlock * const copyFromBack = b->CreateBasicBlock(name + "CopyFromBack");
     769            BasicBlock * const copyFromFront = b->CreateBasicBlock(name + "CopyFromFront");
     770            BasicBlock * const resume = b->CreateBasicBlock(name + "Resume");
     771
     772            b->CreateUnlikelyCondBr(b->CreateICmpEQ(accessibleStrides, ZERO), copyFromBack, resume);
     773
     774            b->SetInsertPoint(copyFromBack);
     775            Value * const temporaryAvailable = b->CreateUMin(unprocessed, inputStrideSize[i]);
     776            b->CreateAssert(b->CreateICmpULE(mAvailableItemCount[i], temporaryAvailable), "linearly available cannot be greater than temporarily available");
     777            Value * const tempBufferPtr = temporaryInputBuffer[i];
     778            Value * const offset = b->CreateAnd(ic, BLOCK_WIDTH_MASK);
     779            const auto alignment = getItemAlignment(mStreamSetInputs[i]);
     780            b->CreateStreamCpy(name, tempBufferPtr, ZERO, mStreamSetInputBaseAddress[i] , offset, mAvailableItemCount[i], alignment);
     781            Value * const temporaryStrides = b->CreateSelect(b->CreateICmpULT(unprocessed, inputStrideSize[i]), ZERO, ONE);
     782            BasicBlock * const copyToBackEnd = b->GetInsertBlock();
     783            b->CreateCondBr(b->CreateICmpNE(mAvailableItemCount[i], temporaryAvailable), copyFromFront, resume);
     784
     785            b->SetInsertPoint(copyFromFront);
     786            Value * const remaining = b->CreateSub(temporaryAvailable, mAvailableItemCount[i]);
     787            Value * const baseAddress = b->getBaseAddress(name);
     788            b->CreateStreamCpy(name, tempBufferPtr, mAvailableItemCount[i], baseAddress, ZERO, remaining, alignment);
     789            BasicBlock * const copyToFrontEnd = b->GetInsertBlock();
     790            b->CreateBr(resume);
     791
     792            b->SetInsertPoint(resume);
     793            PHINode * const bufferPtr = b->CreatePHI(mStreamSetInputBaseAddress[i] ->getType(), 3);
     794            bufferPtr->addIncoming(mStreamSetInputBaseAddress[i] , entry);
     795            bufferPtr->addIncoming(tempBufferPtr, copyToBackEnd);
     796            bufferPtr->addIncoming(tempBufferPtr, copyToFrontEnd);
     797            mStreamSetInputBaseAddress[i] = bufferPtr;
     798
     799            PHINode * const phiAvailItemCount = b->CreatePHI(b->getSizeTy(), 3);
     800            phiAvailItemCount->addIncoming(mAvailableItemCount[i], entry);
     801            phiAvailItemCount->addIncoming(temporaryAvailable, copyToBackEnd);
     802            phiAvailItemCount->addIncoming(temporaryAvailable, copyToFrontEnd);
     803            mAvailableItemCount[i] = phiAvailItemCount;
     804
     805            PHINode * const phiNumOfStrides = b->CreatePHI(b->getSizeTy(), 2);
     806            phiNumOfStrides->addIncoming(accessibleStrides, entry);
     807            phiNumOfStrides->addIncoming(temporaryStrides, copyToBackEnd);
     808            phiNumOfStrides->addIncoming(temporaryStrides, copyToFrontEnd);
     809            accessibleStrides = phiNumOfStrides;
     810        }
     811        numOfStrides = b->CreateUMin(numOfStrides, accessibleStrides);
     812    }
     813
     814    // Now determine the linearly writeable strides
     815    Value * linearlyWritable[outputSetCount];
     816    Value * baseOutputBuffer[outputSetCount];
     817    Value * outputStrideSize[outputSetCount];
     818    mInitialProducedItemCount.resize(outputSetCount);
     819    mStreamSetOutputBaseAddress.resize(outputSetCount);
     820    for (unsigned i = 0; i < outputSetCount; i++) {
     821        const auto & output = mStreamSetOutputs[i];
     822        const auto & name = output.getName();
     823        const ProcessingRate & rate = output.getRate();
     824        Value * const ic = b->getProducedItemCount(name);
     825        baseOutputBuffer[i] = b->getBlockAddress(name, b->CreateLShr(ic, LOG_2_BLOCK_WIDTH));
     826        assert (baseOutputBuffer[i]->getType()->isPointerTy());
     827        linearlyWritable[i] = b->getLinearlyWritableItems(name, ic);
     828        mInitialProducedItemCount[i] = ic;
     829        outputStrideSize[i] = nullptr;
     830        if (temporaryOutputBuffer[i]) {
     831            outputStrideSize[i] = getStrideSize(b, rate);
     832            // Is the number of linearly writable items sufficient for a stride?
     833            Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);
     834            if (!rate.isFixed() || requiresBufferedFinalStride(output)) {
     835                Value * const requiresCopy = b->CreateICmpEQ(writableStrides, ZERO);
     836                assert (temporaryOutputBuffer[i]->getType() == baseOutputBuffer[i]->getType());
     837                baseOutputBuffer[i] = b->CreateSelect(requiresCopy, temporaryOutputBuffer[i], baseOutputBuffer[i]);
     838                writableStrides = b->CreateSelect(requiresCopy, ONE, writableStrides);
    693839            }
    694         }
    695     }
    696 
    697     //kb->CallPrintInt(getName() + "_numOfStrides", numOfStrides);
    698 
    699     // Now determine the linearly writeable blocks, based on available blocks reduced
    700     // by limitations of output buffer space.
    701 
    702     Value * producedItemCount[outputSetCount];
    703     Value * baseOutputBuffer[outputSetCount];
    704     Value * writableStrides[outputSetCount];
    705     Value * linearlyWritable[outputSetCount];
    706 
    707     for (unsigned i = 0; i < outputSetCount; i++) {
    708         const auto & name = mStreamSetOutputs[i].getName();
    709         const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    710         producedItemCount[i] = kb->getProducedItemCount(name);
    711 
    712         //kb->CallPrintInt(getName() + "_" + name + "_producedItemCount", producedItemCount[i]);
    713 
    714         Value * const blockIndex = kb->CreateLShr(producedItemCount[i], log2BlockWidth);
    715         baseOutputBuffer[i] = kb->getOutputStreamPtr(name, blockIndex);
    716         linearlyWritable[i] = nullptr;
    717         writableStrides[i] = nullptr;
    718         if (rate.isFixed() || rate.isBounded()) {
    719             linearlyWritable[i] = kb->getLinearlyWritableItems(name, producedItemCount[i]);
    720 
    721             //kb->CallPrintInt(getName() + "_" + name + "_linearlyWritable", linearlyWritable[i]);
    722 
    723             Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
    724             writableStrides[i] = kb->CreateUDiv(linearlyWritable[i], maxStrideSize);
    725             if (numOfStrides) {
    726                 numOfStrides = kb->CreateUMin(numOfStrides, writableStrides[i]);
    727             } else {
    728                 numOfStrides = writableStrides[i];
     840            numOfStrides = b->CreateUMin(numOfStrides, writableStrides);
     841            assert (temporaryOutputBuffer[i]->getType() == baseOutputBuffer[i]->getType());
     842        }
     843        mStreamSetOutputBaseAddress[i] = baseOutputBuffer[i];
     844    }
     845
     846    Value * const initiallyFinal = mIsFinal;
     847    if (LLVM_LIKELY(numOfStrides != nullptr)) {
     848        mIsFinal = b->CreateAnd(mIsFinal, b->CreateICmpEQ(numOfStrides, ZERO));
     849        Value * const processStride = b->CreateOr(b->CreateICmpNE(numOfStrides, ZERO), mIsFinal);
     850        b->CreateAssert(processStride, getName() + " does not have sufficient input data or output space for one stride");
     851        for (unsigned i = 0; i < inputSetCount; ++i) {
     852            const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     853            if (rate.isFixed() && mStreamSetInputs[i].notDeferred()) {
     854                mAvailableItemCount[i] = b->CreateSelect(mIsFinal, mAvailableItemCount[i], b->CreateMul(numOfStrides, inputStrideSize[i]));
    729855            }
    730856        }
    731857    }
    732858
    733     //kb->CallPrintInt(getName() + "_numOfStrides'", numOfStrides);
    734 
    735     for (unsigned i = 0; i < inputSetCount; i++) {
    736         const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    737         if (rate.isFixed()) {
    738             mAvailableItemCount[i] = kb->CreateMul(numOfStrides, kb->getSize(rate.getRate() * mStride));
    739         } else {
    740             mAvailableItemCount[i] = linearlyAvailable[i];
    741         }
    742 
    743         //kb->CallPrintInt(getName() + "_" + mStreamSetInputs[i].getName() + "_avail", mAvailableItemCount[i]);
    744     }
    745 
    746     // Define and allocate the temporary buffer area.
    747     Type * tempBuffers[totalSetCount];
    748     for (unsigned i = 0; i < inputSetCount; ++i) {
    749         Type * bufType = baseInputBuffer[i]->getType()->getPointerElementType();
    750         assert (baseInputBuffer[i]->getType()->getPointerAddressSpace() == 0);
    751         const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    752         unsigned count = 0;
    753         if (rate.isFixed()) {
    754             count = rate.getRate();
    755         } else if (rate.isBounded()) {
    756             count = rate.getUpperBound() + 2;
    757         }
    758         tempBuffers[i] = ArrayType::get(bufType, count);
    759     }
    760     for (unsigned i = 0; i < outputSetCount; i++) {
    761         Type * const bufType = baseOutputBuffer[i]->getType()->getPointerElementType();
    762         assert (baseOutputBuffer[i]->getType()->getPointerAddressSpace() == 0);
    763         const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    764         unsigned count = 0;
    765         if (rate.isFixed()) {
    766             count = rate.getRate();
    767         } else if (rate.isBounded()) {
    768             count = rate.getUpperBound() + 2;
    769         }
    770         tempBuffers[i + inputSetCount] = ArrayType::get(bufType, count);
    771     }
    772 
    773     Type * const tempParameterStructType = StructType::create(kb->getContext(), ArrayRef<Type *>(tempBuffers, totalSetCount));
    774 
    775     Value * const tempBufferArea = kb->CreateCacheAlignedAlloca(tempParameterStructType);
    776 
    777     BasicBlock * const temporaryBufferCheck = kb->CreateBasicBlock("temporaryBufferCheck");
    778     BasicBlock * const doMultiBlock = kb->CreateBasicBlock("doMultiBlock");
    779     BasicBlock * const copyToTemporaryBuffers = kb->CreateBasicBlock("copyToTemporaryBuffers");
    780     BasicBlock * const segmentDone = kb->CreateBasicBlock("segmentDone");
    781 
    782     Value * const hasFullStride = numOfStrides ? kb->CreateICmpNE(numOfStrides, kb->getSize(0)) : kb->getTrue();
    783     kb->CreateCondBr(hasFullStride, doMultiBlock, temporaryBufferCheck);
    784 
    785     // We use temporary buffers in 3 different cases that preclude full stride processing.
    786 
    787     //  (a) One or more input buffers does not have a sufficient number of input items linearly available.
    788     //  (b) One or more output buffers does not have sufficient linearly available buffer space.
    789     //  (c) We have processed all the full strides of input and only the final block remains.
    790 
    791     kb->SetInsertPoint(temporaryBufferCheck);
    792 
    793     // Even if we copy the input data into a linear arrays, is there enough data to perform this stride?
    794     // If not, proceed only if this is our final block.
    795     Value * hasFullFragmentedStride = nullptr;
    796     for (unsigned i = 0; i < inputSetCount; i++) {
    797         const ProcessingRate & r = mStreamSetInputs[i].getRate();
    798         if (r.isBounded() || (r.isUnknown() && r.getLowerBound() > 0)) {
    799             const auto l = r.isBounded() ? r.getUpperBound() : r.getLowerBound();
    800             Constant * const strideSize = kb->getSize(l * mStride);
    801             Value * enoughAvail = kb->CreateICmpUGE(unprocessed[i], strideSize);
    802             if (hasFullFragmentedStride) {
    803                 hasFullFragmentedStride = kb->CreateAnd(hasFullFragmentedStride, enoughAvail);
    804             } else {
    805                 hasFullFragmentedStride = enoughAvail;
    806             }
    807         }
    808     }
    809 
    810     Value * hasFragmentedOrFinalStride = nullptr;
    811     if (hasFullFragmentedStride) {
    812         hasFragmentedOrFinalStride = kb->CreateOr(hasFullFragmentedStride, mIsFinal);
    813         // Although this might be the final segment, we may have a full fragmented stride to process prior
    814         // to the actual final stride.
    815         mIsFinal = kb->CreateAnd(mIsFinal, kb->CreateNot(hasFullFragmentedStride));
    816     } else {
    817         hasFragmentedOrFinalStride = mIsFinal;
    818     }
    819     kb->CreateCondBr(hasFragmentedOrFinalStride, copyToTemporaryBuffers, segmentDone);
    820 
    821     /// COPY TO TEMPORARY BUFFERS
    822     kb->SetInsertPoint(copyToTemporaryBuffers);
    823 
    824     kb->CreateAlignedStore(Constant::getNullValue(tempParameterStructType), tempBufferArea, kb->getCacheAlignment());
    825 
    826     // For each input and output buffer, copy over necessary data starting from the last block boundary.
    827 
    828     Value * temporaryInputBuffer[inputSetCount];
    829     Value * temporaryAvailable[inputSetCount];
    830 
    831     for (unsigned i = 0; i < inputSetCount; i++) {
    832         temporaryInputBuffer[i] = baseInputBuffer[i];
    833         if (readableStrides[i]) {
    834             const auto name = mStreamSetInputs[i].getName();
    835             const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    836             assert (rate.getUpperBound() > 0);
    837             Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
    838             temporaryAvailable[i] = kb->CreateUMin(unprocessed[i], maxStrideSize);
    839 
    840             BasicBlock * entry = kb->GetInsertBlock();
    841             BasicBlock * copy = kb->CreateBasicBlock(name + "Copy");
    842             BasicBlock * resume = kb->CreateBasicBlock(name + "ResumeCopy");
    843             Value * const test = kb->CreateOr(kb->CreateICmpNE(readableStrides[i], kb->getSize(0)), mIsFinal);
    844             kb->CreateCondBr(test, resume, copy);
    845 
    846             kb->SetInsertPoint(copy);
    847             Value * const tempBufferPtr = kb->CreateGEP(tempBufferArea, {kb->getInt32(0), kb->getInt32(i), kb->getInt32(0)});
    848             assert (tempBufferPtr->getType() == baseInputBuffer[i]->getType());
    849             Value * const neededItems = linearlyAvailable[i];
    850             Value * const bytesCopied = kb->copy(name, tempBufferPtr, baseInputBuffer[i], neededItems);
    851             Value * const nextInputPtr = kb->getRawInputPointer(name, kb->getSize(0));
    852             Value * const remaining = kb->CreateSub(temporaryAvailable[i], neededItems);
    853             Value * nextBufPtr = kb->CreatePointerCast(tempBufferPtr, kb->getInt8PtrTy());
    854             nextBufPtr = kb->CreateGEP(nextBufPtr, bytesCopied);
    855             kb->copy(name, nextBufPtr, nextInputPtr, remaining);
    856 
    857             kb->CreateBr(resume);
    858 
    859             kb->SetInsertPoint(resume);
    860             PHINode * bufferPtr = kb->CreatePHI(baseInputBuffer[i]->getType(), 2);
    861             bufferPtr->addIncoming(baseInputBuffer[i], entry);
    862             bufferPtr->addIncoming(tempBufferPtr, copy);
    863             temporaryInputBuffer[i] = bufferPtr;
    864         }
    865     }
    866 
    867     Value * temporaryOutputBuffer[outputSetCount];
    868     for (unsigned i = 0; i < outputSetCount; i++) {
    869         temporaryOutputBuffer[i] = baseOutputBuffer[i];
    870         if (writableStrides[i]) {
    871             const auto name = mStreamSetOutputs[i].getName();
    872 
    873             BasicBlock * const entry = kb->GetInsertBlock();
    874             BasicBlock * const copy = kb->CreateBasicBlock(name + "Copy");
    875             BasicBlock * const resume = kb->CreateBasicBlock(name + "ResumeCopy");
    876 
    877             Value * const test = kb->CreateOr(kb->CreateICmpNE(writableStrides[i], kb->getSize(0)), mIsFinal);
    878             kb->CreateCondBr(test, resume, copy);
    879 
    880             kb->SetInsertPoint(copy);
    881             Value * const tempBufferPtr = kb->CreateGEP(tempBufferArea,  {kb->getInt32(0), kb->getInt32(inputSetCount + i), kb->getInt32(0)});
    882             assert (tempBufferPtr->getType() == baseOutputBuffer[i]->getType());
    883             Value * const itemsToCopy = kb->CreateAnd(producedItemCount[i], kb->getSize(kb->getBitBlockWidth() - 1));
    884             kb->copy(name, tempBufferPtr, baseOutputBuffer[i], itemsToCopy);
    885             kb->CreateBr(resume);
    886 
    887             kb->SetInsertPoint(resume);
    888             PHINode * bufferPtr = kb->CreatePHI(tempBufferPtr->getType(), 2);
    889             bufferPtr->addIncoming(baseOutputBuffer[i], entry);
    890             bufferPtr->addIncoming(tempBufferPtr, copy);
    891             temporaryOutputBuffer[i] = bufferPtr;
    892         }
    893     }
    894 
    895     kb->CreateBr(doMultiBlock);
    896     BasicBlock * const usingTemporaryBuffers = kb->GetInsertBlock();
    897     doMultiBlock->moveAfter(usingTemporaryBuffers);
    898 
    899     /// DO MULTI BLOCK
    900 
    901     //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
    902     //  Now prepare the doMultiBlock call.
    903     kb->SetInsertPoint(doMultiBlock);
    904 
    905     PHINode * const isFinal = kb->CreatePHI(mIsFinal->getType(), 2);
    906     isFinal->addIncoming(kb->getFalse(), doSegmentLoop);
    907     isFinal->addIncoming(mIsFinal, usingTemporaryBuffers);
    908     mIsFinal = isFinal;
    909 
    910     mStreamSetInputBufferPtr.resize(inputSetCount);
    911     for (unsigned i = 0; i < inputSetCount; ++i) {
    912         assert (baseInputBuffer[i] && temporaryInputBuffer[i]);
    913         if (baseInputBuffer[i] != temporaryInputBuffer[i]) {
    914             PHINode * const avail = kb->CreatePHI(kb->getSizeTy(), 2);
    915             avail->addIncoming(mAvailableItemCount[i], doSegmentLoop);
    916             avail->addIncoming(temporaryAvailable[i], usingTemporaryBuffers);
    917             mAvailableItemCount[i] = avail;
    918             PHINode * const bufferPtr = kb->CreatePHI(baseInputBuffer[i]->getType(), 2);
    919             bufferPtr->addIncoming(baseInputBuffer[i], doSegmentLoop);
    920             assert (baseInputBuffer[i]->getType() == temporaryInputBuffer[i]->getType());
    921             bufferPtr->addIncoming(temporaryInputBuffer[i], usingTemporaryBuffers);
    922             temporaryInputBuffer[i] = bufferPtr;
    923         }
    924         mStreamSetInputBufferPtr[i] = temporaryInputBuffer[i];
    925     }
    926 
    927     mStreamSetOutputBufferPtr.resize(outputSetCount);
    928     for (unsigned i = 0; i < outputSetCount; ++i) {
    929         assert (baseOutputBuffer[i] && temporaryOutputBuffer[i]);
    930         if (baseOutputBuffer[i] != temporaryOutputBuffer[i]) {
    931             PHINode * const bufferPtr = kb->CreatePHI(baseOutputBuffer[i]->getType(), 2);
    932             bufferPtr->addIncoming(baseOutputBuffer[i], doSegmentLoop);
    933             assert (baseOutputBuffer[i]->getType() == temporaryOutputBuffer[i]->getType());
    934             bufferPtr->addIncoming(temporaryOutputBuffer[i], usingTemporaryBuffers);
    935             temporaryOutputBuffer[i] = bufferPtr;
    936         }
    937         mStreamSetOutputBufferPtr[i] = temporaryOutputBuffer[i];
    938     }
    939 
    940     // Now use the generateMultiBlockLogic method of the MultiBlockKernelBuilder subtype to
    941     // provide the required multi-block kernel logic.
    942     generateMultiBlockLogic(kb, numOfStrides);
    943 
    944     // If we have no fixed rate inputs, we won't know when we're done parsing until we test
    945     // whether any input data was processed.
    946     bool mayMakeNoProgress = true;
    947 
    948     // Update the processed item count of any Fixed input or output stream. While doing so, also
    949     // calculate the LCM of their rates. The LCM is used to calculate the final item counts.
    950 
    951     unsigned rateLCM = 1;
     859    //  We have one or more blocks of input data and output buffer space for all stream sets.
     860    generateMultiBlockLogic(b, numOfStrides);
    952861
    953862    for (unsigned i = 0; i < inputSetCount; ++i) {
    954863        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    955         if (rate.isFixed()) {
    956             mayMakeNoProgress = false;
    957             rateLCM = lcm(rateLCM, rate.getRate());
    958             Value * const processed = mAvailableItemCount[i]; // kb->CreateMul(numOfStrides, kb->getSize(mStride * rate.getRate()));
    959             Value * const ic = kb->CreateAdd(processedItemCount[i], processed);
    960             kb->setProcessedItemCount(mStreamSetInputs[i].getName(), ic);
     864        if (rate.isFixed() && mStreamSetInputs[i].notDeferred()) {
     865            Value * const ic = b->CreateAdd(mInitialProcessedItemCount[i], mAvailableItemCount[i]);
     866            b->setProcessedItemCount(mStreamSetInputs[i].getName(), ic);
    961867        }
    962868    }
     
    965871        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    966872        if (rate.isFixed()) {
    967             rateLCM = lcm(rateLCM, rate.getRate());
    968             Value * const produced = kb->CreateMul(numOfStrides, kb->getSize(mStride * rate.getRate()));
    969             Value * const ic = kb->CreateAdd(producedItemCount[i], produced);
    970             kb->setProducedItemCount(mStreamSetOutputs[i].getName(), ic);
    971         }
    972     }
    973 
    974     BasicBlock * const finalStrideCheck = kb->CreateBasicBlock("finalStrideCheck");
    975     BasicBlock * const finalStrideAdjustment = kb->CreateBasicBlock("finalStrideAdjustment");
    976     BasicBlock * const standardCopyBack = kb->CreateBasicBlock("standardCopyBack");
    977     BasicBlock * const temporaryBufferCopyBack = kb->CreateBasicBlock("temporaryBufferCopyBack");
    978 
    979     kb->CreateLikelyCondBr(hasFullStride, standardCopyBack, finalStrideCheck);
    980 
    981 
    982     /// FINAL STRIDE CHECK
    983     kb->SetInsertPoint(finalStrideCheck);
    984     kb->CreateUnlikelyCondBr(mIsFinal, finalStrideAdjustment, temporaryBufferCopyBack);
     873            assert (mStreamSetOutputs[i].notDeferred());
     874            Value * const produced = b->CreateMul(numOfStrides, outputStrideSize[i]);
     875            Value * const ic = b->CreateAdd(mInitialProducedItemCount[i], produced);
     876            b->setProducedItemCount(mStreamSetOutputs[i].getName(), ic);
     877        }
     878    }
     879
     880    BasicBlock * const handleFinalBlock = b->CreateBasicBlock("HandleFinalBlock");
     881    BasicBlock * const temporaryBufferCopyBack = b->CreateBasicBlock("TemporaryBufferCopyBack");
     882    BasicBlock * const strideDone = b->CreateBasicBlock("MultiBlockDone");
     883
     884    b->CreateLikelyCondBr(b->CreateICmpNE(numOfStrides, ZERO), temporaryBufferCopyBack, handleFinalBlock);
     885
    985886
    986887    /// FINAL STRIDE ADJUSTMENT
    987     kb->SetInsertPoint(finalStrideAdjustment);
     888    b->SetInsertPoint(handleFinalBlock);
    988889
    989890    // If this is our final stride, adjust the Fixed output item counts. The main loop assumes that
     
    991892    // to calculate them based on the actual input item counts.
    992893
    993     // NOTE: This appears overly complex to avoid an integer overflow without reducing the maximum
    994     // integer size. For each Fixed output stream, this calculates:
    995 
    996     //       CEILING(MIN(Total Available Item Count / Fixed Input Rate) * Fixed Output Rate)
    997 
    998     Value * basePreviouslyProcessedItemCount = nullptr;
    999     Value * scaledInverseOfStrideItemCount = nullptr;
    1000 
     894    reviseFinalProducedItemCounts(b);
     895
     896    b->CreateBr(temporaryBufferCopyBack);
     897
     898    /// TEMPORARY BUFFER COPY BACK
     899    b->SetInsertPoint(temporaryBufferCopyBack);
     900
     901    // Copy back data to the actual output buffers.
     902    for (unsigned i = 0; i < outputSetCount; i++) {
     903        Value * const tempBuffer = temporaryOutputBuffer[i];
     904        if (LLVM_UNLIKELY(tempBuffer == nullptr)) {
     905            continue;
     906        }
     907        Value * const baseBuffer = baseOutputBuffer[i];
     908        assert ("stack overflow" && (tempBuffer->getType() == baseBuffer->getType()));
     909        const auto & name = mStreamSetOutputs[i].getName();
     910        BasicBlock * const copyToBack = b->CreateBasicBlock(name + "CopyToBack");
     911        BasicBlock * const copyToFront = b->CreateBasicBlock(name + "CopyToFront");
     912        BasicBlock * const resume = b->CreateBasicBlock(name + "ResumeCopyBack");
     913        // If we used a temporary buffer, copy it back to the original output buffer
     914        b->CreateCondBr(b->CreateICmpEQ(tempBuffer, baseBuffer), copyToBack, resume);
     915
     916        b->SetInsertPoint(copyToBack);       
     917        Value * const offset = b->CreateAnd(mInitialProducedItemCount[i], BLOCK_WIDTH_MASK);
     918        Value * const newProducedItemCount = b->getProducedItemCount(name);
     919        Value * const newlyProduced = b->CreateSub(newProducedItemCount, mInitialProducedItemCount[i]);
     920        Value * const toWrite = b->CreateUMin(newlyProduced, linearlyWritable[i]);
     921        const auto alignment = getItemAlignment(mStreamSetOutputs[i]);
     922        b->CreateStreamCpy(name, baseBuffer, offset, tempBuffer, ZERO, toWrite, alignment);
     923        // If we required a temporary output buffer, we will probably need to write to the beginning of the buffer as well.
     924        b->CreateLikelyCondBr(b->CreateICmpULT(toWrite, newlyProduced), copyToFront, resume);
     925
     926        b->SetInsertPoint(copyToFront);
     927        Value * const remaining = b->CreateSub(newlyProduced, toWrite);
     928        Value * const baseAddress = b->getBaseAddress(name);
     929        b->CreateStreamCpy(name, baseAddress, ZERO, tempBuffer, toWrite, remaining, alignment);
     930        b->CreateBr(resume);
     931
     932        b->SetInsertPoint(resume);
     933    }
     934
     935    strideDone->moveAfter(b->GetInsertBlock());
     936
     937    BasicBlock * const segmentDone = b->CreateBasicBlock("SegmentDone");
     938    //  We've dealt with the partial block processing and copied information back into the
     939    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
     940    if (hasNoTerminateAttribute()) {
     941        b->CreateCondBr(mIsFinal, segmentDone, strideDone);
     942    } else {
     943        BasicBlock * const setTermination = b->CreateBasicBlock("setTermination");
     944        b->CreateCondBr(mIsFinal, setTermination, strideDone);
     945
     946        b->SetInsertPoint(setTermination);
     947        b->setTerminationSignal();
     948        b->CreateBr(segmentDone);       
     949    }
     950
     951    /// STRIDE DONE
     952    b->SetInsertPoint(strideDone);
     953
     954    // do we have enough data for another stride?
     955    Value * pendingStrides = nullptr;
    1001956    for (unsigned i = 0; i < inputSetCount; ++i) {
    1002         const ProcessingRate & r = mStreamSetInputs[i].getRate();
    1003         if (r.isFixed()) {
    1004             assert (rateLCM % r.getRate() == 0);
    1005             Value * const a = kb->CreateMul(mAvailableItemCount[i], kb->getSize(rateLCM / r.getRate())); // unprocessed
    1006             Value * const p = kb->CreateUDiv(processedItemCount[i], kb->getSize(r.getRate()));
    1007             if (scaledInverseOfStrideItemCount) {
    1008                 scaledInverseOfStrideItemCount = kb->CreateUMin(scaledInverseOfStrideItemCount, a);
    1009                 basePreviouslyProcessedItemCount = kb->CreateUMin(basePreviouslyProcessedItemCount, p);
     957        Value * const processed = b->getProcessedItemCount(mStreamSetInputs[i].getName());
     958        Value * const remaining = b->CreateSub(mInitialAvailableItemCount[i], processed);
     959        Value * const remainingStrides = b->CreateUDiv(remaining, inputStrideSize[i]);
     960        pendingStrides = b->CreateUMin(pendingStrides, remainingStrides);
     961    }
     962
     963    // do we have enough room for another stride?
     964    for (unsigned i = 0; i < outputSetCount; ++i) {
     965        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     966        const auto & name = mStreamSetOutputs[i].getName();
     967        Value * const newProduced = b->getProducedItemCount(name);
     968        // If this output has a Fixed/Bounded rate, determine whether we have room for another stride.
     969        if (LLVM_LIKELY(outputStrideSize[i] != nullptr)) {
     970            Value * const unconsumed = b->CreateSub(newProduced, b->getConsumedItemCount(name));
     971            Value * const remaining = b->CreateSub(b->getCapacity(name), unconsumed);
     972            Value * const remainingStrides = b->CreateUDiv(remaining, outputStrideSize[i]);
     973            pendingStrides = b->CreateUMin(pendingStrides, remainingStrides);
     974        }
     975        // Do copybacks if necessary.
     976        if (mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate)) {
     977            b->CreateCopyBack(name, mInitialProducedItemCount[i], newProduced);
     978        }
     979    }
     980
     981    Value * const hasMoreStrides = b->CreateOr(b->CreateICmpNE(pendingStrides, ZERO), initiallyFinal);
     982    b->CreateCondBr(hasMoreStrides, segmentLoop, segmentDone);
     983
     984    /// SEGMENT DONE
     985    segmentDone->moveAfter(b->GetInsertBlock());
     986    b->SetInsertPoint(segmentDone);
     987
     988}
     989
     990/** ------------------------------------------------------------------------------------------------------------- *
     991 * @brief requiresCopyBack
     992 ** ------------------------------------------------------------------------------------------------------------- */
     993bool MultiBlockKernel::requiresCopyBack(const ProcessingRate & rate) const {
     994    if (rate.isBounded() || rate.isUnknown()) {
     995        return true;
     996    } else if (rate.isRelative()) {
     997        return requiresCopyBack(getBinding(rate.getReference()).getRate());
     998    }
     999    return false;
     1000}
     1001
     1002/** ------------------------------------------------------------------------------------------------------------- *
     1003 * @brief CreateUDivCeil
     1004 ** ------------------------------------------------------------------------------------------------------------- */
     1005inline Value * CreateUDivCeil(const std::unique_ptr<KernelBuilder> & b, Value * const number, const ProcessingRate::RateValue divisor, const Twine & Name = "") {
     1006    Constant * const n = ConstantInt::get(number->getType(), divisor.numerator());
     1007    if (LLVM_LIKELY(divisor.denominator() == 1)) {
     1008        return b->CreateUDivCeil(number, n, Name);
     1009    } else {
     1010        //   âŒŠ(num + ratio - 1) / ratio⌋
     1011        // = ⌊(num - 1) / (n/d)⌋ + (ratio/ratio)
     1012        // = ⌊(d * (num - 1)) / n⌋ + 1
     1013        Constant * const ONE = ConstantInt::get(number->getType(), 1);
     1014        Constant * const d = ConstantInt::get(number->getType(), divisor.denominator());
     1015        return b->CreateAdd(b->CreateUDiv(b->CreateMul(b->CreateSub(number, ONE), d), n), ONE, Name);
     1016    }
     1017}
     1018
     1019
     1020/** ------------------------------------------------------------------------------------------------------------- *
     1021 * @brief reviseFinalProducedItemCounts
     1022 ** ------------------------------------------------------------------------------------------------------------- */
     1023void MultiBlockKernel::reviseFinalProducedItemCounts(const std::unique_ptr<KernelBuilder> & b) {
     1024
     1025    if (LLVM_UNLIKELY(mStreamSetInputs.empty())) {
     1026        return;
     1027    }
     1028
     1029    const auto inputSetCount = mStreamSetInputs.size();
     1030
     1031    ProcessingRate::RateValue rateLCM(1);
     1032    unsigned first = 0;
     1033    unsigned last = inputSetCount;
     1034
     1035    for (unsigned i = 0; i < inputSetCount; ++i) {
     1036        const ProcessingRate & pr = mStreamSetInputs[i].getRate();
     1037        if (pr.isFixed()) {
     1038            rateLCM = lcm(rateLCM, pr.getRate());
     1039            if (mStreamSetInputs[i].isPrincipal()) {
     1040                assert ("A kernel cannot have multiple principle input streams" && (first == 0 && last == inputSetCount));
     1041                first = i;
     1042                last = i + 1;
     1043            }
     1044        }       
     1045    }
     1046
     1047    bool noFixedRateOutput = true;
     1048
     1049    for (const Binding & output : mStreamSetOutputs) {
     1050        const ProcessingRate & pr = output.getRate();
     1051        if (pr.isFixed()) {
     1052            rateLCM = lcm(rateLCM, pr.getRate());
     1053            noFixedRateOutput = false;
     1054        }
     1055    }
     1056
     1057    if (noFixedRateOutput) {
     1058        return;
     1059    }
     1060
     1061    Value * baseInitialProcessedItemCount = nullptr;
     1062    Value * scaledInverseOfAvailItemCount = nullptr;
     1063
     1064    // For each Fixed output stream, this calculates:
     1065
     1066    //    CEILING(MIN(Available Item Count / Fixed Input Rate) * Fixed Output Rate)
     1067
     1068    // But avoids the possibility of overflow errors (assuming that each processed item count does not overflow)
     1069
     1070    for (unsigned i = first; i < last; ++i) {
     1071        const ProcessingRate & pr = mStreamSetInputs[i].getRate();
     1072        if (pr.isFixed()) {
     1073            Value * p = mInitialProcessedItemCount[i];
     1074            Value * a = b->CreateSub(mInitialAvailableItemCount[i], p);
     1075            const auto & rate = pr.getRate();
     1076            if (LLVM_UNLIKELY(rateLCM != rate)) {
     1077                const auto factor = rateLCM / rate;
     1078                if (LLVM_UNLIKELY(factor.numerator() > 1)) {
     1079                    a = b->CreateMul(a, b->getSize(factor.numerator()));
     1080                }
     1081                if (LLVM_UNLIKELY(factor.denominator() > 1)) {
     1082                    a = b->CreateUDiv(a, b->getSize(factor.denominator()));
     1083                }
     1084            }
     1085            if (LLVM_UNLIKELY(rate.denominator() > 1)) {
     1086                p = b->CreateMul(p, b->getSize(rate.denominator()));
     1087            }
     1088            if (LLVM_UNLIKELY(rate.numerator() > 1)) {
     1089                p = b->CreateUDiv(p, b->getSize(rate.numerator()));
     1090            }
     1091            if (scaledInverseOfAvailItemCount) {
     1092                scaledInverseOfAvailItemCount = b->CreateUMin(scaledInverseOfAvailItemCount, a);
     1093                baseInitialProcessedItemCount = b->CreateUMin(baseInitialProcessedItemCount, p);
    10101094            } else {
    1011                 scaledInverseOfStrideItemCount = a;
    1012                 basePreviouslyProcessedItemCount = p;
     1095                scaledInverseOfAvailItemCount = a;
     1096                baseInitialProcessedItemCount = p;
    10131097            }
    10141098        }
    1015 //        const auto name = mStreamSetInputs[i].getName();
    1016 //        Value * const processed = kb->CreateAdd(processedItemCount[i], unprocessed[i]);
    1017 //        kb->setProcessedItemCount(name, processed);
    1018     }
    1019 
    1020     for (unsigned i = 0; i < outputSetCount; ++i) {
    1021         const auto name = mStreamSetOutputs[i].getName();
    1022         const ProcessingRate & r = mStreamSetOutputs[i].getRate();
     1099    }
     1100
     1101    for (const Binding & output : mStreamSetOutputs) {
     1102        const auto name = output.getName();
     1103        const ProcessingRate & pr = output.getRate();
    10231104        Value * produced = nullptr;
    1024         if (r.isFixed()) {
    1025             assert (rateLCM % r.getRate() == 0);
    1026             assert (basePreviouslyProcessedItemCount && scaledInverseOfStrideItemCount);
    1027             Value * const p = kb->CreateMul(basePreviouslyProcessedItemCount, kb->getSize(r.getRate()));
    1028             Value * const ic = kb->CreateUDivCeil(scaledInverseOfStrideItemCount, kb->getSize(rateLCM / r.getRate()));
    1029             produced = kb->CreateAdd(p, ic);
     1105        if (pr.isFixed() && output.notDeferred()) {
     1106            assert (baseInitialProcessedItemCount && scaledInverseOfAvailItemCount);
     1107            const auto rate = pr.getRate();
     1108            Value * p = baseInitialProcessedItemCount;
     1109            if (LLVM_UNLIKELY(rate.numerator() != 1)) {
     1110                p = b->CreateMul(p, b->getSize(rate.numerator()));
     1111            }
     1112            if (LLVM_UNLIKELY(rate.denominator() != 1)) {
     1113                p = b->CreateUDiv(p, b->getSize(rate.denominator()));
     1114            }
     1115            Value * const ic = CreateUDivCeil(b, scaledInverseOfAvailItemCount, rateLCM / pr.getRate());
     1116            produced = b->CreateAdd(p, ic);
    10301117        } else { // check if we have an attribute; if so, get the current produced count and adjust it
    10311118            bool noAttributes = true;
    1032             for (const Attribute & attr : mStreamSetOutputs[i].getAttributes()) {
     1119            for (const Attribute & attr : output.getAttributes()) {
    10331120                if (attr.isAdd() || attr.isRoundUpTo()) {
    10341121                    noAttributes = false;
     
    10391126                continue;
    10401127            }
    1041             produced = kb->getProducedItemCount(name);
    1042         }
    1043         for (const Attribute & attr : mStreamSetOutputs[i].getAttributes()) {
     1128            produced = b->getProducedItemCount(name);
     1129        }
     1130        for (const Attribute & attr : output.getAttributes()) {
    10441131            if (attr.isAdd()) {
    1045                 produced = kb->CreateAdd(produced, kb->getSize(attr.getAmount()));
     1132                produced = b->CreateAdd(produced, b->getSize(attr.getAmount()));
    10461133            } else if (attr.isRoundUpTo()) {
    1047                 produced = kb->CreateRoundUp(produced, kb->getSize(attr.getAmount()));
     1134                produced = b->CreateRoundUp(produced, b->getSize(attr.getAmount()));
    10481135            }
    10491136        }
    1050         kb->setProducedItemCount(name, produced);
    1051     }
    1052 
    1053     kb->CreateBr(temporaryBufferCopyBack);
    1054 
    1055     /// TEMPORARY BUFFER COPY BACK
    1056     kb->SetInsertPoint(temporaryBufferCopyBack);
    1057 
    1058     // Copy back data to the actual output buffers.
    1059     for (unsigned i = 0; i < outputSetCount; i++) {
    1060 
    1061         if (baseOutputBuffer[i] != temporaryOutputBuffer[i]) {
    1062 
    1063             const auto name = mStreamSetOutputs[i].getName();
    1064 
    1065             BasicBlock * const copy = kb->CreateBasicBlock(name + "CopyBack");
    1066             BasicBlock * const resume = kb->CreateBasicBlock(name + "ResumeCopyBack");
    1067             Value * const usedTemporary = kb->CreateICmpNE(temporaryOutputBuffer[i], baseOutputBuffer[i]);
    1068 
    1069             // If we used a temporary buffer ...
    1070             kb->CreateCondBr(usedTemporary, copy, resume);
    1071 
    1072             kb->SetInsertPoint(copy);
    1073             Value * bytesCopied = kb->copy(name, baseOutputBuffer[i], temporaryOutputBuffer[i], linearlyWritable[i]);
    1074             Value * nextOutputPtr = kb->getRawOutputPointer(name, kb->getSize(0));
    1075             Value * producedCount = kb->getProducedItemCount(name);
    1076 
    1077             Value * remaining = kb->CreateSub(producedCount, linearlyWritable[i]);
    1078             Value * nextBufPtr = kb->CreatePointerCast(temporaryOutputBuffer[i], kb->getInt8PtrTy());
    1079             nextBufPtr = kb->CreateGEP(nextBufPtr, bytesCopied);
    1080 
    1081             kb->copy(name, nextOutputPtr, nextBufPtr, remaining);
    1082             kb->CreateBr(resume);
    1083 
    1084             kb->SetInsertPoint(resume);
    1085         }
    1086     }
    1087 
    1088     //  We've dealt with the partial block processing and copied information back into the
    1089     //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
    1090     BasicBlock * setTermination = nullptr;
    1091     if (hasNoTerminateAttribute()) {
    1092         kb->CreateCondBr(mIsFinal, segmentDone, standardCopyBack);
    1093     } else {
    1094         setTermination = kb->CreateBasicBlock("setTermination");
    1095         kb->CreateCondBr(mIsFinal, setTermination, standardCopyBack);
    1096     }
    1097 
    1098     /// STANDARD COPY BACK
    1099     kb->SetInsertPoint(standardCopyBack);
    1100 
    1101     // Do copybacks if necessary.
    1102     for (unsigned i = 0; i < outputSetCount; i++) {
    1103         if (mStreamSetOutputBuffers[i]->supportsCopyBack()) {
    1104             const auto name = mStreamSetOutputs[i].getName();
    1105             Value * newProduced = kb->getProducedItemCount(name);
    1106             kb->CreateCopyBack(name, producedItemCount[i], newProduced);
    1107         }
    1108     }
    1109 
    1110     // If it is possible to make no progress, verify we processed some of the input. If we haven't,
    1111     // we're finished this segment.
    1112     if (mayMakeNoProgress) {
    1113         Value * madeProgress = nullptr;
    1114         for (unsigned i = 0; i < inputSetCount; ++i) {
    1115             Value * const processed = kb->getProcessedItemCount(mStreamSetInputs[i].getName());
    1116             Value * const progress = kb->CreateICmpNE(processed, processedItemCount[i]);
    1117             if (madeProgress) {
    1118                 madeProgress = kb->CreateOr(madeProgress, progress);
    1119             } else {
    1120                 madeProgress = progress;
    1121             }
    1122         }
    1123         assert (madeProgress);
    1124         kb->CreateCondBr(madeProgress, doSegmentLoop, segmentDone);
    1125     } else {
    1126         kb->CreateBr(doSegmentLoop);
    1127     }
    1128 
    1129     if (hasNoTerminateAttribute()) {
    1130         segmentDone->moveAfter(kb->GetInsertBlock());
    1131     } else {
    1132         /// SET TERMINATION
    1133         setTermination->moveAfter(kb->GetInsertBlock());
    1134         kb->SetInsertPoint(setTermination);
    1135         kb->setTerminationSignal();
    1136         kb->CreateBr(segmentDone);
    1137         segmentDone->moveAfter(setTermination);
    1138     }
    1139 
    1140     kb->SetInsertPoint(segmentDone);
    1141 
    1142 }
    1143 
    1144 //bool MultiBlockKernel::requiresCopyBack(const ProcessingRate & rate) const {
    1145 //    if (rate.isBounded() || rate.isUnknown()) {
    1146 //        return true;
    1147 //    } else if (rate.isDirectlyRelative()) {
    1148 //        Port port; unsigned i;
    1149 //        std::tie(port, i) = getStreamPort(rate.getReference());
    1150 //        const auto & binding = (port == Port::Input) ? mStreamSetInputs[i] : mStreamSetOutputs[i];
    1151 //        return requiresCopyBack(binding.getRate());
    1152 //    }
    1153 //    return false;
    1154 //}
    1155 
    1156 //  The default doSegment method dispatches to the doBlock routine for
    1157 //  each block of the given number of blocksToDo, and then updates counts.
    1158 
    1159 void BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * const numOfStrides) {
    1160 
    1161     BasicBlock * const entryBlock = idb->GetInsertBlock();
    1162     BasicBlock * const strideLoopCond = idb->CreateBasicBlock(getName() + "_strideLoopCond");
    1163     mStrideLoopBody = idb->CreateBasicBlock(getName() + "_strideLoopBody");
    1164     BasicBlock * const stridesDone = idb->CreateBasicBlock(getName() + "_stridesDone");
    1165     BasicBlock * const doFinalBlock = idb->CreateBasicBlock(getName() + "_doFinalBlock");
    1166     BasicBlock * const segmentDone = idb->CreateBasicBlock(getName() + "_segmentDone");
    1167 
    1168     Value * baseTarget = nullptr;
    1169     if (idb->supportsIndirectBr()) {
    1170         baseTarget = idb->CreateSelect(mIsFinal, BlockAddress::get(doFinalBlock), BlockAddress::get(segmentDone));
    1171     }
    1172 
    1173     Constant * const log2BlockSize = idb->getSize(std::log2(idb->getBitBlockWidth()));
    1174 
     1137        b->setProducedItemCount(name, produced);
     1138    }
     1139
     1140}
     1141
     1142/** ------------------------------------------------------------------------------------------------------------- *
     1143 * @brief generateMultiBlockLogic
     1144 ** ------------------------------------------------------------------------------------------------------------- */
     1145Value * BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
     1146
     1147    if (LLVM_UNLIKELY(mStride != b->getBitBlockWidth())) {
     1148        report_fatal_error(getName() + ": the Stride (" + std::to_string(mStride) + ") of BlockOrientedKernel "
     1149                           "equal to the BitBlockWidth (" + std::to_string(b->getBitBlockWidth()) + ")");
     1150    }
     1151
     1152    Constant * const LOG_2_BLOCK_WIDTH = b->getSize(std::log2(b->getBitBlockWidth()));
     1153
     1154    BasicBlock * const entryBlock = b->GetInsertBlock();
     1155    mStrideLoopBody = b->CreateBasicBlock(getName() + "_strideLoopBody");
     1156    BasicBlock * const stridesDone = b->CreateBasicBlock(getName() + "_stridesDone");
     1157    BasicBlock * const doFinalBlock = b->CreateBasicBlock(getName() + "_doFinalBlock");
     1158    BasicBlock * const segmentDone = b->CreateBasicBlock(getName() + "_segmentDone");
     1159    b->CreateAssert(b->CreateXor(b->CreateIsNotNull(numOfBlocks), mIsFinal),
     1160                    "numOfStrides cannot be 0 unless this is the final stride and must be 0 if it is");
    11751161    const auto inputSetCount = mStreamSetInputs.size();
    11761162    Value * baseProcessedIndex[inputSetCount];
    1177     for (unsigned i = 0; i < inputSetCount; ++i) {
     1163    Value * baseInputAddress[inputSetCount];
     1164    for (unsigned i = 0; i < inputSetCount; i++) {
    11781165        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    1179         if (rate.isFixed()) {
    1180             baseProcessedIndex[i] = nullptr;
    1181         } else {
    1182             Value * ic = idb->getProcessedItemCount(mStreamSetInputs[i].getName());
    1183             ic = idb->CreateLShr(ic, log2BlockSize);
    1184             baseProcessedIndex[i] = ic;
    1185         }
     1166        if (LLVM_UNLIKELY(!rate.isFixed())) {
     1167            Value * const ic = mInitialProcessedItemCount[i];
     1168            baseProcessedIndex[i] = b->CreateLShr(ic, LOG_2_BLOCK_WIDTH);
     1169        }
     1170        baseInputAddress[i] = mStreamSetInputBaseAddress[i];
    11861171    }
    11871172
    11881173    const auto outputSetCount = mStreamSetOutputs.size();
    11891174    Value * baseProducedIndex[outputSetCount];
     1175    Value * baseOutputAddress[inputSetCount];
     1176    for (unsigned i = 0; i < outputSetCount; i++) {
     1177        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     1178        if (LLVM_UNLIKELY(!rate.isFixed())) {
     1179            Value * const ic = b->getProducedItemCount(mStreamSetOutputs[i].getName());
     1180            baseProducedIndex[i] = b->CreateLShr(ic, LOG_2_BLOCK_WIDTH);
     1181        }
     1182        baseOutputAddress[i] = mStreamSetOutputBaseAddress[i];
     1183    }
     1184
     1185    b->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, mStrideLoopBody);
     1186
     1187    /// BLOCK BODY
     1188
     1189    b->SetInsertPoint(mStrideLoopBody);
     1190
     1191    if (b->supportsIndirectBr()) {
     1192        Value * const baseTarget = BlockAddress::get(segmentDone);
     1193        mStrideLoopTarget = b->CreatePHI(baseTarget->getType(), 2, "strideTarget");
     1194        mStrideLoopTarget->addIncoming(baseTarget, entryBlock);
     1195    }
     1196
     1197    mStrideBlockIndex = b->CreatePHI(b->getSizeTy(), 2);
     1198    mStrideBlockIndex->addIncoming(b->getSize(0), entryBlock);
     1199
     1200    /// GENERATE DO BLOCK METHOD
     1201
     1202    for (unsigned i = 0; i < inputSetCount; ++i) {
     1203        Value * index = mStrideBlockIndex;
     1204        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     1205        if (LLVM_UNLIKELY(!rate.isFixed())) {
     1206            Value * ic = b->getProcessedItemCount(mStreamSetInputs[i].getName());
     1207            index = b->CreateSub(b->CreateLShr(ic, LOG_2_BLOCK_WIDTH), baseProcessedIndex[i]);
     1208        }
     1209        mStreamSetInputBaseAddress[i] = b->CreateGEP(mStreamSetInputBaseAddress[i], index);
     1210    }
     1211
    11901212    for (unsigned i = 0; i < outputSetCount; ++i) {
     1213        Value * index = mStrideBlockIndex;
    11911214        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    1192         if (rate.isFixed()) {
    1193             baseProducedIndex[i] = nullptr;
    1194         } else {
    1195             Value * ic = idb->getProducedItemCount(mStreamSetOutputs[i].getName());
    1196             ic = idb->CreateLShr(ic, log2BlockSize);
    1197             baseProducedIndex[i] = ic;
    1198         }
    1199     }
    1200 
    1201     Value * const numOfBlocksToProcess = idb->CreateMul(numOfStrides, idb->getSize(mStride / idb->getBitBlockWidth()));
    1202 
    1203     idb->CreateBr(strideLoopCond);
    1204 
    1205     /// BLOCK COND
    1206 
    1207     idb->SetInsertPoint(strideLoopCond);
    1208 
    1209     PHINode * branchTarget = nullptr;
    1210     if (baseTarget) {
    1211         branchTarget = idb->CreatePHI(baseTarget->getType(), 2, "branchTarget");
    1212         branchTarget->addIncoming(baseTarget, entryBlock);
    1213     }
    1214 
    1215     PHINode * const blockIndex = idb->CreatePHI(idb->getSizeTy(), 2, "index");
    1216     blockIndex->addIncoming(idb->getSize(0), entryBlock);
    1217 
    1218     for (unsigned i = 0; i < inputSetCount; ++i) {
    1219         Value * offset = blockIndex;
    1220         if (baseProcessedIndex[i]) {
    1221             offset = idb->getProcessedItemCount(mStreamSetInputs[i].getName());
    1222             offset = idb->CreateLShr(offset, log2BlockSize);
    1223             offset = idb->CreateSub(offset, baseProcessedIndex[i]);
    1224         }
    1225         mStreamSetInputBufferPtr[i] = idb->CreateGEP(mStreamSetInputBufferPtr[i], offset);
    1226     }
    1227 
    1228     for (unsigned i = 0; i < outputSetCount; ++i) {
    1229         Value * offset = blockIndex;
    1230         if (baseProducedIndex[i]) {
    1231             offset = idb->getProducedItemCount(mStreamSetOutputs[i].getName());
    1232             offset = idb->CreateLShr(offset, log2BlockSize);
    1233             offset = idb->CreateSub(offset, baseProducedIndex[i]);
    1234         }
    1235         mStreamSetOutputBufferPtr[i] = idb->CreateGEP(mStreamSetOutputBufferPtr[i], offset);
    1236     }
    1237 
    1238     Value * const notDone = idb->CreateICmpULT(blockIndex, numOfBlocksToProcess);
    1239     idb->CreateLikelyCondBr(notDone, mStrideLoopBody, stridesDone);
    1240 
    1241     /// BLOCK BODY
    1242 
    1243     idb->SetInsertPoint(mStrideLoopBody);
    1244 
    1245     if (idb->supportsIndirectBr()) {
    1246         mStrideLoopTarget = idb->CreatePHI(baseTarget->getType(), 2, "strideTarget");
    1247         mStrideLoopTarget->addIncoming(branchTarget, strideLoopCond);
    1248     }
    1249 
    1250     /// GENERATE DO BLOCK METHOD
    1251 
    1252     writeDoBlockMethod(idb);
    1253 
    1254     BasicBlock * const bodyEnd = idb->GetInsertBlock();
    1255     blockIndex->addIncoming(idb->CreateAdd(blockIndex, idb->getSize(1)), bodyEnd);
    1256     if (branchTarget) {
    1257         branchTarget->addIncoming(mStrideLoopTarget, bodyEnd);
    1258     }
    1259     idb->CreateBr(strideLoopCond);
     1215        if (LLVM_UNLIKELY(!rate.isFixed())) {
     1216            Value * ic = b->getProducedItemCount(mStreamSetOutputs[i].getName());
     1217            index = b->CreateSub(b->CreateLShr(ic, LOG_2_BLOCK_WIDTH), baseProducedIndex[i]);
     1218        }
     1219        mStreamSetOutputBaseAddress[i] = b->CreateGEP(mStreamSetOutputBaseAddress[i], index);
     1220    }
     1221
     1222    writeDoBlockMethod(b);
     1223
     1224    BasicBlock * const bodyEnd = b->GetInsertBlock();
     1225    if (mStrideLoopTarget) {
     1226        mStrideLoopTarget->addIncoming(mStrideLoopTarget, bodyEnd);
     1227    }
     1228
     1229    Value * const nextIndex = b->CreateAdd(mStrideBlockIndex, b->getSize(1));
     1230    mStrideBlockIndex->addIncoming(nextIndex, bodyEnd);
     1231    Value * const notDone = b->CreateICmpULT(nextIndex, numOfBlocks);
     1232    b->CreateCondBr(notDone, mStrideLoopBody, stridesDone);
    12601233
    12611234    stridesDone->moveAfter(bodyEnd);
     
    12631236    /// STRIDE DONE
    12641237
    1265     idb->SetInsertPoint(stridesDone);
     1238    b->SetInsertPoint(stridesDone);
    12661239
    12671240    // Now conditionally perform the final block processing depending on the doFinal parameter.
    1268     if (branchTarget) {
    1269         mStrideLoopBranch = idb->CreateIndirectBr(branchTarget, 3);
     1241    if (mStrideLoopTarget) {
     1242        mStrideLoopBranch = b->CreateIndirectBr(mStrideLoopTarget, 3);
    12701243        mStrideLoopBranch->addDestination(doFinalBlock);
    12711244        mStrideLoopBranch->addDestination(segmentDone);
    12721245    } else {
    1273         idb->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone);
     1246        b->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone);
    12741247    }
    12751248
    12761249    doFinalBlock->moveAfter(stridesDone);
    12771250
    1278     idb->SetInsertPoint(doFinalBlock);
    1279 
    1280     Value * remainingItems = nullptr;
     1251    /// DO FINAL BLOCK
     1252
     1253    b->SetInsertPoint(doFinalBlock);
    12811254    for (unsigned i = 0; i < inputSetCount; ++i) {
    1282         const ProcessingRate & r = mStreamSetInputs[i].getRate();
    1283         if (r.isFixed()) {
    1284             Value * ic = idb->CreateUDiv(mAvailableItemCount[i], idb->getSize(r.getRate()));
    1285             if (remainingItems) {
    1286                 remainingItems = idb->CreateUMax(remainingItems, ic);
    1287             } else {
    1288                 remainingItems = ic;
    1289             }
    1290         }
    1291     }
    1292 
    1293     writeFinalBlockMethod(idb, remainingItems);
    1294 
    1295     idb->CreateBr(segmentDone);
    1296 
    1297     segmentDone->moveAfter(idb->GetInsertBlock());
    1298 
    1299     idb->SetInsertPoint(segmentDone);
     1255        mStreamSetInputBaseAddress[i] = baseInputAddress[i];
     1256    }
     1257
     1258    for (unsigned i = 0; i < outputSetCount; ++i) {
     1259        mStreamSetOutputBaseAddress[i] = baseOutputAddress[i];
     1260    }
     1261
     1262    writeFinalBlockMethod(b, getRemainingItems(b));
     1263
     1264    b->CreateBr(segmentDone);
     1265
     1266    segmentDone->moveAfter(b->GetInsertBlock());
     1267
     1268    b->SetInsertPoint(segmentDone);
    13001269
    13011270    // Update the branch prediction metadata to indicate that the likely target will be segmentDone
    1302     if (branchTarget) {
    1303         MDBuilder mdb(idb->getContext());
     1271    if (mStrideLoopTarget) {
     1272        MDBuilder mdb(b->getContext());
    13041273        const auto destinations = mStrideLoopBranch->getNumDestinations();
    13051274        uint32_t weights[destinations];
     
    13111280    }
    13121281
    1313 }
    1314 
    1315 inline void BlockOrientedKernel::writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb) {
     1282    return numOfBlocks;
     1283}
     1284
     1285/** ------------------------------------------------------------------------------------------------------------- *
     1286 * @brief getRemainingItems
     1287 ** ------------------------------------------------------------------------------------------------------------- */
     1288Value * BlockOrientedKernel::getRemainingItems(const std::unique_ptr<KernelBuilder> & b) {
     1289    Value * remainingItems = nullptr;
     1290    const auto count = mStreamSetInputs.size();
     1291    if (count == 1) {
     1292        return mAvailableItemCount[0];
     1293    } else {
     1294        for (unsigned i = 0; i < count; i++) {
     1295            if (mStreamSetInputs[i].isPrincipal()) {
     1296                return mAvailableItemCount[i];
     1297            }
     1298        }
     1299        for (unsigned i = 0; i < count; ++i) {
     1300            const ProcessingRate & r = mStreamSetInputs[i].getRate();
     1301            if (r.isFixed()) {
     1302                Value * ic = CreateUDivCeil(b, mAvailableItemCount[i], r.getRate());
     1303                if (remainingItems) {
     1304                    remainingItems = b->CreateUMin(remainingItems, ic);
     1305                } else {
     1306                    remainingItems = ic;
     1307                }
     1308            }
     1309        }
     1310    }
     1311    return remainingItems;
     1312}
     1313
     1314/** ------------------------------------------------------------------------------------------------------------- *
     1315 * @brief writeDoBlockMethod
     1316 ** ------------------------------------------------------------------------------------------------------------- */
     1317inline void BlockOrientedKernel::writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    13161318
    13171319    Value * const self = getInstance();
    13181320    Function * const cp = mCurrentMethod;
    1319     auto ip = idb->saveIP();
     1321    auto ip = b->saveIP();
    13201322    std::vector<Value *> availableItemCount(0);
    13211323
    13221324    /// Check if the do block method is called and create the function if necessary
    1323     if (!idb->supportsIndirectBr()) {
     1325    if (!b->supportsIndirectBr()) {
    13241326
    13251327        std::vector<Type *> params;
     
    13301332        }
    13311333
    1332         FunctionType * const type = FunctionType::get(idb->getVoidTy(), params, false);
    1333         mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, idb->getModule());
     1334        FunctionType * const type = FunctionType::get(b->getVoidTy(), params, false);
     1335        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, b->getModule());
    13341336        mCurrentMethod->setCallingConv(CallingConv::C);
    13351337        mCurrentMethod->setDoesNotThrow();
     
    13431345        assert (availableItemCount.size() == mAvailableItemCount.size());
    13441346        mAvailableItemCount.swap(availableItemCount);
    1345         idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
    1346     }
    1347 
    1348     generateDoBlockMethod(idb); // must be implemented by the BlockOrientedKernelBuilder subtype
    1349 
    1350     if (!idb->supportsIndirectBr()) {
     1347        b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
     1348    }
     1349
     1350    generateDoBlockMethod(b); // must be implemented by the BlockOrientedKernelBuilder subtype
     1351
     1352    if (!b->supportsIndirectBr()) {
    13511353        // Restore the DoSegment function state then call the DoBlock method
    1352         idb->CreateRetVoid();
     1354        b->CreateRetVoid();
    13531355        mDoBlockMethod = mCurrentMethod;
    1354         idb->restoreIP(ip);
     1356        b->restoreIP(ip);
    13551357        setInstance(self);
    13561358        mCurrentMethod = cp;
    13571359        mAvailableItemCount.swap(availableItemCount);
    1358         CreateDoBlockMethodCall(idb);
    1359     }
    1360 
    1361 }
    1362 
    1363 inline void BlockOrientedKernel::writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * remainingItems) {
     1360        CreateDoBlockMethodCall(b);
     1361    }
     1362
     1363}
     1364
     1365/** ------------------------------------------------------------------------------------------------------------- *
     1366 * @brief writeFinalBlockMethod
     1367 ** ------------------------------------------------------------------------------------------------------------- */
     1368inline void BlockOrientedKernel::writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingItems) {
    13641369
    13651370    Value * const self = getInstance();
    13661371    Function * const cp = mCurrentMethod;
    13671372    Value * const remainingItemCount = remainingItems;
    1368     auto ip = idb->saveIP();
     1373    auto ip = b->saveIP();
    13691374    std::vector<Value *> availableItemCount(0);
    13701375
    1371     if (!idb->supportsIndirectBr()) {
     1376    if (!b->supportsIndirectBr()) {
    13721377        std::vector<Type *> params;
    13731378        params.reserve(2 + mAvailableItemCount.size());
    13741379        params.push_back(self->getType());
    1375         params.push_back(idb->getSizeTy());
     1380        params.push_back(b->getSizeTy());
    13761381        for (Value * avail : mAvailableItemCount) {
    13771382            params.push_back(avail->getType());
    13781383        }
    1379         FunctionType * const type = FunctionType::get(idb->getVoidTy(), params, false);
    1380         mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, idb->getModule());
     1384        FunctionType * const type = FunctionType::get(b->getVoidTy(), params, false);
     1385        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, b->getModule());
    13811386        mCurrentMethod->setCallingConv(CallingConv::C);
    13821387        mCurrentMethod->setDoesNotThrow();
     
    13921397        assert (availableItemCount.size() == mAvailableItemCount.size());
    13931398        mAvailableItemCount.swap(availableItemCount);
    1394         idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
    1395     }
    1396 
    1397     generateFinalBlockMethod(idb, remainingItems); // may be implemented by the BlockOrientedKernel subtype
    1398 
    1399     if (!idb->supportsIndirectBr()) {
    1400         idb->CreateRetVoid();
    1401         idb->restoreIP(ip);
     1399        b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
     1400    }
     1401
     1402    generateFinalBlockMethod(b, remainingItems); // may be implemented by the BlockOrientedKernel subtype
     1403
     1404    if (!b->supportsIndirectBr()) {
     1405        b->CreateRetVoid();
     1406        b->restoreIP(ip);
    14021407        setInstance(self);
    14031408        mAvailableItemCount.swap(availableItemCount);
     
    14071412        args.push_back(self);
    14081413        args.push_back(remainingItemCount);
    1409         for (Value * avail : mAvailableItemCount) {
    1410             args.push_back(avail);
    1411         }
    1412         idb->CreateCall(mCurrentMethod, args);
     1414        args.insert(args.end(), mAvailableItemCount.begin(), mAvailableItemCount.end());
     1415        b->CreateCall(mCurrentMethod, args);
    14131416        mCurrentMethod = cp;
    14141417    }
     
    14161419}
    14171420
    1418 //  The default finalBlock method simply dispatches to the doBlock routine.
    1419 void BlockOrientedKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * /* remainingItems */) {
    1420     CreateDoBlockMethodCall(idb);
    1421 }
    1422 
    1423 void BlockOrientedKernel::CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & idb) {
    1424     if (idb->supportsIndirectBr()) {
    1425         BasicBlock * bb = idb->CreateBasicBlock("resume");
     1421/** ------------------------------------------------------------------------------------------------------------- *
     1422 * @brief generateFinalBlockMethod
     1423 ** ------------------------------------------------------------------------------------------------------------- */
     1424void BlockOrientedKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * /* remainingItems */) {
     1425    //  The default finalBlock method simply dispatches to the doBlock routine.
     1426    CreateDoBlockMethodCall(b);
     1427}
     1428
     1429void BlockOrientedKernel::CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & b) {
     1430    if (b->supportsIndirectBr()) {
     1431        BasicBlock * const bb = b->CreateBasicBlock("resume");
    14261432        mStrideLoopBranch->addDestination(bb);
    1427         mStrideLoopTarget->addIncoming(BlockAddress::get(bb), idb->GetInsertBlock());
    1428         idb->CreateBr(mStrideLoopBody);
    1429         bb->moveAfter(idb->GetInsertBlock());
    1430         idb->SetInsertPoint(bb);
     1433        BasicBlock * const current = b->GetInsertBlock();
     1434        mStrideLoopTarget->addIncoming(BlockAddress::get(bb), current);
     1435        mStrideBlockIndex->addIncoming(b->getSize(0), current);
     1436        b->CreateBr(mStrideLoopBody);
     1437        bb->moveAfter(current);
     1438        b->SetInsertPoint(bb);
    14311439    } else {
    14321440        std::vector<Value *> args;
    14331441        args.reserve(1 + mAvailableItemCount.size());
    14341442        args.push_back(getInstance());
    1435         for (Value * avail : mAvailableItemCount) {
    1436             args.push_back(avail);
    1437         }
    1438         idb->CreateCall(mDoBlockMethod, args);
     1443        args.insert(args.end(), mAvailableItemCount.begin(), mAvailableItemCount.end());
     1444        b->CreateCall(mDoBlockMethod, args);
    14391445    }
    14401446}
    14411447
    14421448static inline std::string annotateKernelNameWithDebugFlags(std::string && name) {
    1443     if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
     1449    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    14441450        name += "_EA";
    14451451    }
     
    14501456// CONSTRUCTOR
    14511457Kernel::Kernel(std::string && kernelName,
    1452                std::vector<Binding> && stream_inputs,
    1453                std::vector<Binding> && stream_outputs,
    1454                std::vector<Binding> && scalar_parameters,
    1455                std::vector<Binding> && scalar_outputs,
    1456                std::vector<Binding> && internal_scalars)
     1458               Bindings && stream_inputs,
     1459               Bindings && stream_outputs,
     1460               Bindings && scalar_parameters,
     1461               Bindings && scalar_outputs,
     1462               Bindings && internal_scalars)
    14571463: KernelInterface(annotateKernelNameWithDebugFlags(std::move(kernelName))
    14581464                  , std::move(stream_inputs), std::move(stream_outputs)
     
    14601466                  , std::move(internal_scalars))
    14611467, mCurrentMethod(nullptr)
    1462 , mAvailablePrincipleItemCount(nullptr)
     1468, mAvailablePrincipalItemCount(nullptr)
    14631469, mNoTerminateAttribute(false)
    14641470, mIsGenerated(false)
     
    14731479}
    14741480
     1481// MULTI-BLOCK KERNEL CONSTRUCTOR
     1482MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
     1483                                   Bindings && stream_inputs,
     1484                                   Bindings && stream_outputs,
     1485                                   Bindings && scalar_parameters,
     1486                                   Bindings && scalar_outputs,
     1487                                   Bindings && internal_scalars)
     1488: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
     1489
     1490}
     1491
    14751492// CONSTRUCTOR
    14761493BlockOrientedKernel::BlockOrientedKernel(std::string && kernelName,
    1477                                          std::vector<Binding> && stream_inputs,
    1478                                          std::vector<Binding> && stream_outputs,
    1479                                          std::vector<Binding> && scalar_parameters,
    1480                                          std::vector<Binding> && scalar_outputs,
    1481                                          std::vector<Binding> && internal_scalars)
     1494                                         Bindings && stream_inputs,
     1495                                         Bindings && stream_outputs,
     1496                                         Bindings && scalar_parameters,
     1497                                         Bindings && scalar_outputs,
     1498                                         Bindings && internal_scalars)
    14821499: MultiBlockKernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
    14831500, mDoBlockMethod(nullptr)
    14841501, mStrideLoopBody(nullptr)
    14851502, mStrideLoopBranch(nullptr)
    1486 , mStrideLoopTarget(nullptr) {
    1487 
    1488 }
    1489 
    1490 // MULTI-BLOCK KERNEL CONSTRUCTOR
    1491 MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
    1492                                    std::vector<Binding> && stream_inputs,
    1493                                    std::vector<Binding> && stream_outputs,
    1494                                    std::vector<Binding> && scalar_parameters,
    1495                                    std::vector<Binding> && scalar_outputs,
    1496                                    std::vector<Binding> && internal_scalars)
    1497 : Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
     1503, mStrideLoopTarget(nullptr)
     1504, mStrideBlockIndex(nullptr) {
    14981505
    14991506}
     
    15011508// CONSTRUCTOR
    15021509SegmentOrientedKernel::SegmentOrientedKernel(std::string && kernelName,
    1503                                              std::vector<Binding> && stream_inputs,
    1504                                              std::vector<Binding> && stream_outputs,
    1505                                              std::vector<Binding> && scalar_parameters,
    1506                                              std::vector<Binding> && scalar_outputs,
    1507                                              std::vector<Binding> && internal_scalars)
     1510                                             Bindings && stream_inputs,
     1511                                             Bindings && stream_outputs,
     1512                                             Bindings && scalar_parameters,
     1513                                             Bindings && scalar_outputs,
     1514                                             Bindings && internal_scalars)
    15081515: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
    15091516
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r5706 r5755  
    1111
    1212namespace llvm { class BasicBlock; }
     13namespace llvm { class Constant; }
    1314namespace llvm { class Function; }
    1415namespace llvm { class IntegerType; }
     
    2627class Kernel : public KernelInterface {
    2728    friend class KernelBuilder;
    28 public:
    29     enum class Port { Input, Output };
    30 
    31     using StreamPort = std::pair<Port, unsigned>;
    32 
    33 protected:
    34 
    35     using KernelMap = boost::container::flat_map<std::string, unsigned>;
    36     using StreamMap = boost::container::flat_map<std::string, StreamPort>;
    37     using StreamSetBuffers = std::vector<parabix::StreamSetBuffer *>;
    38     using Kernels = std::vector<Kernel *>;
     29protected:
    3930
    4031    static const std::string DO_BLOCK_SUFFIX;
     
    5243public:
    5344   
     45    enum class Port { Input, Output };
     46    using StreamPort = std::pair<Port, unsigned>;
     47    using StreamMap = boost::container::flat_map<std::string, StreamPort>;
     48    using KernelFieldMap = boost::container::flat_map<std::string, unsigned>;
     49    using StreamSetBuffers = std::vector<parabix::StreamSetBuffer *>;
     50
    5451    // Kernel Signatures and Module IDs
    5552    //
    5653    // A kernel signature uniquely identifies a kernel and its full functionality.
    5754    // In the event that a particular kernel instance is to be generated and compiled
    58     // to produce object code, and we have a cached kernel object code instance with 
    59     // the same signature and targetting the same IDISA architecture, then the cached 
     55    // to produce object code, and we have a cached kernel object code instance with
     56    // the same signature and targetting the same IDISA architecture, then the cached
    6057    // object code may safely be used to avoid recompilation.
    6158    //
     
    6461    // Kernel developers should take responsibility for designing appropriate signature
    6562    // mechanisms that are short, inexpensive to compute and guarantee uniqueness
    66     // based on the semantics of the kernel. 
     63    // based on the semantics of the kernel.
    6764    //
    6865    // If no other mechanism is available, the default makeSignature() method uses the
     
    8784    // be added, the default method for preparing kernel state may be used.
    8885
    89        
    90     bool isCachable() const override { return false; }
    9186
    9287    std::string makeSignature(const std::unique_ptr<KernelBuilder> & idb) override;
     
    9590    virtual bool hasSignature() const { return true; }
    9691
    97     // Create a module stub for the kernel, populated only with its Module ID.     
    98     //
     92    bool isCachable() const override { return false; }
    9993
    10094    void bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs);
    101 
    102     StreamPort getStreamPort(const std::string & name) const;
    10395
    10496    llvm::Module * setModule(llvm::Module * const module);
     
    122114    }
    123115
     116    StreamPort getStreamPort(const std::string & name) const;
     117
     118    const Binding & getBinding(const std::string & name) const;
     119
    124120    const StreamSetBuffers & getStreamSetInputBuffers() const {
    125121        return mStreamSetInputBuffers;
     
    127123
    128124    const parabix::StreamSetBuffer * getStreamSetInputBuffer(const unsigned i) const {
     125        assert (i < mStreamSetInputBuffers.size());
     126        assert (mStreamSetInputBuffers[i]);
    129127        return mStreamSetInputBuffers[i];
     128    }
     129
     130    const parabix::StreamSetBuffer * getInputStreamSetBuffer(const std::string & name) const {
     131        const auto port = getStreamPort(name);
     132        assert (port.first == Port::Input);
     133        return getStreamSetInputBuffer(port.second);
    130134    }
    131135
     
    134138    }
    135139
     140    const Binding & getStreamInput(const unsigned i) const {
     141        return KernelInterface::getStreamInput(i);
     142    }
     143
     144    const Binding & getStreamInput(const std::string & name) const {
     145        const auto port = getStreamPort(name);
     146        assert (port.first == Port::Input);
     147        return KernelInterface::getStreamInput(port.second);
     148    }
     149
    136150    const parabix::StreamSetBuffer * getStreamSetOutputBuffer(const unsigned i) const {
     151        assert (i < mStreamSetOutputBuffers.size());
     152        assert (mStreamSetOutputBuffers[i]);
    137153        return mStreamSetOutputBuffers[i];
     154    }
     155
     156    const parabix::StreamSetBuffer * getOutputStreamSetBuffer(const std::string & name) const {
     157        const auto port = getStreamPort(name);
     158        assert (port.first == Port::Output);
     159        return getStreamSetOutputBuffer(port.second);
     160    }
     161
     162    const Binding & getStreamOutput(const unsigned i) const {
     163        return KernelInterface::getStreamOutput(i);
     164    }
     165
     166    const Binding & getStreamOutput(const std::string & name) const {
     167        const auto port = getStreamPort(name);
     168        assert (port.first == Port::Output);
     169        return KernelInterface::getStreamOutput(port.second);
    138170    }
    139171   
     
    144176    //
    145177   
    146     unsigned getKernelStride() const { return mStride; }
     178    unsigned getStride() const { return mStride; }
    147179   
    148180    virtual ~Kernel() = 0;
     
    156188protected:
    157189
    158     void setKernelStride(unsigned stride) { mStride = stride; }
    159 
    160190    virtual void addInternalKernelProperties(const std::unique_ptr<KernelBuilder> & idb) { }
    161191
     
    163193
    164194    // Constructor
    165     Kernel(std::string && kernelName,
    166                   std::vector<Binding> && stream_inputs,
    167                   std::vector<Binding> && stream_outputs,
    168                   std::vector<Binding> && scalar_parameters,
    169                   std::vector<Binding> && scalar_outputs,
    170                   std::vector<Binding> && internal_scalars);
     195    Kernel(std::string && kernelName, Bindings && stream_inputs,
     196          Bindings && stream_outputs,
     197          Bindings && scalar_parameters,
     198          Bindings && scalar_outputs,
     199          Bindings && internal_scalars);
    171200
    172201    void setNoTerminateAttribute(const bool noTerminate = true) {
     
    174203    }
    175204
    176     llvm::Value * getPrincipleItemCount() const {
    177         return mAvailablePrincipleItemCount;
     205    llvm::Value * getPrincipalItemCount() const {
     206        return mAvailablePrincipalItemCount;
    178207    }
    179208
     
    201230
    202231    void callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb);
    203 
    204 
    205     std::pair<unsigned, unsigned> getStreamRate(const Port p, const unsigned i) const;
    206 
    207     const parabix::StreamSetBuffer * getInputStreamSetBuffer(const std::string & name) const {
    208         const auto port = getStreamPort(name);
    209         assert (port.first == Port::Input);
    210         assert (port.second < mStreamSetInputBuffers.size());
    211         assert (mStreamSetInputBuffers[port.second]);
    212         return mStreamSetInputBuffers[port.second];
    213     }
    214 
    215     const parabix::StreamSetBuffer * getOutputStreamSetBuffer(const std::string & name) const {
    216         const auto port = getStreamPort(name);
    217         assert (port.first == Port::Output);
    218         assert (port.second < mStreamSetOutputBuffers.size());
    219         assert (mStreamSetOutputBuffers[port.second]);
    220         return mStreamSetOutputBuffers[port.second];
    221     }
    222232
    223233    const parabix::StreamSetBuffer * getAnyStreamSetBuffer(const std::string & name) const {
     
    235245    }
    236246
    237     llvm::Value * getStreamSetInputBufferPtr(const unsigned i) const {
    238         return mStreamSetInputBufferPtr[i];
    239     }
    240 
    241     llvm::Value * getStreamSetOutputBufferPtr(const unsigned i) const {
    242         return mStreamSetOutputBufferPtr[i];
    243     }
     247    void setStride(unsigned stride) { mStride = stride; }
    244248
    245249private:
    246250
    247251    void addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb);
     252
     253    llvm::Value * getStreamSetInputAddress(const std::string & name) const {
     254        const Kernel::StreamPort p = getStreamPort(name);
     255        assert (p.first == Port::Input);
     256        return mStreamSetInputBaseAddress[p.second];
     257    }
     258
     259    llvm::Value * getStreamSetOutputAddress(const std::string & name) const {
     260        const Kernel::StreamPort p = getStreamPort(name);
     261        assert (p.first == Port::Output);
     262        return mStreamSetOutputBaseAddress[p.second];
     263    }
    248264
    249265    llvm::Value * getAvailableItemCount(const unsigned i) const {
     
    251267    }
    252268
     269    void normalizeStreamProcessingRates();
     270
     271    bool normalizeRelativeToFixedProcessingRate(const ProcessingRate & base, ProcessingRate & toUpdate);
     272
    253273protected:
    254274
    255275    llvm::Function *                    mCurrentMethod;
    256     llvm::Value *                       mAvailablePrincipleItemCount;
     276    llvm::Value *                       mAvailablePrincipalItemCount;
    257277    bool                                mNoTerminateAttribute;
    258278    bool                                mIsGenerated;
     
    260280    llvm::Value *                       mIsFinal;
    261281    llvm::Value *                       mOutputScalarResult;
    262 
    263 
    264282    std::vector<llvm::Value *>          mAvailableItemCount;
    265283
     284    KernelFieldMap                      mKernelFieldMap;
    266285    std::vector<llvm::Type *>           mKernelFields;
    267     KernelMap                           mKernelMap;
     286
    268287    StreamMap                           mStreamMap;
     288
    269289    StreamSetBuffers                    mStreamSetInputBuffers;
    270     std::vector<llvm::Value *>          mStreamSetInputBufferPtr;
     290    std::vector<llvm::Value *>          mStreamSetInputBaseAddress;
    271291    StreamSetBuffers                    mStreamSetOutputBuffers;
    272     std::vector<llvm::Value *>          mStreamSetOutputBufferPtr;
    273 
     292    std::vector<llvm::Value *>          mStreamSetOutputBaseAddress;
    274293};
    275294
     295using Kernels = std::vector<Kernel *>;
     296
    276297class SegmentOrientedKernel : public Kernel {
    277298protected:
    278299
    279300    SegmentOrientedKernel(std::string && kernelName,
    280                           std::vector<Binding> && stream_inputs,
    281                           std::vector<Binding> && stream_outputs,
    282                           std::vector<Binding> && scalar_parameters,
    283                           std::vector<Binding> && scalar_outputs,
    284                           std::vector<Binding> && internal_scalars);
     301                          Bindings && stream_inputs,
     302                          Bindings && stream_outputs,
     303                          Bindings && scalar_parameters,
     304                          Bindings && scalar_outputs,
     305                          Bindings && internal_scalars);
    285306protected:
    286307
    287308    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
    288309
    289     virtual void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) = 0;
     310    virtual void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & b) = 0;
    290311
    291312};
     
    386407
    387408    MultiBlockKernel(std::string && kernelName,
    388                      std::vector<Binding> && stream_inputs,
    389                      std::vector<Binding> && stream_outputs,
    390                      std::vector<Binding> && scalar_parameters,
    391                      std::vector<Binding> && scalar_outputs,
    392                      std::vector<Binding> && internal_scalars);
     409                     Bindings && stream_inputs,
     410                     Bindings && stream_outputs,
     411                     Bindings && scalar_parameters,
     412                     Bindings && scalar_outputs,
     413                     Bindings && internal_scalars);
    393414
    394415    // Each multi-block kernel subtype must provide its own logic for handling
     
    399420    // exit the RetVoid instruction will be added to complete the method.
    400421    //
    401     virtual void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * const numOfStrides) = 0;
     422    virtual llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) = 0;
    402423
    403424private:
     
    406427    // method of the multi-block kernel builder makes all the necessary arrangements
    407428    // to translate doSegment calls into a minimal sequence of doMultiBlock calls.
    408     void generateKernelMethod(const std::unique_ptr<KernelBuilder> & kb) final;
     429    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
     430
     431    unsigned getItemAlignment(const Binding & binding) const;
     432
     433    ProcessingRate::RateValue getLowerBound(const ProcessingRate &rate) const;
     434
     435    ProcessingRate::RateValue getUpperBound(const ProcessingRate & rate) const;
     436
     437    bool isTransitivelyUnknownRate(const ProcessingRate & rate) const;
     438
     439    llvm::Value * getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate);
    409440
    410441    bool requiresCopyBack(const ProcessingRate & rate) const;
    411442
     443    void reviseFinalProducedItemCounts(const std::unique_ptr<KernelBuilder> & b);
     444
     445protected:
     446
     447    std::vector<llvm::Value *>      mInitialAvailableItemCount;
     448    std::vector<llvm::Value *>      mInitialProcessedItemCount;
     449    std::vector<llvm::Value *>      mInitialProducedItemCount;
     450
    412451};
    413452
     
    416455protected:
    417456
    418     void CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & idb);
     457    void CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & b);
    419458
    420459    // Each kernel builder subtype must provide its own logic for generating
    421460    // doBlock calls.
    422     virtual void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb) = 0;
     461    virtual void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) = 0;
    423462
    424463    // Each kernel builder subtype must also specify the logic for processing the
     
    429468    // not be overridden.
    430469
    431     virtual void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * remainingItems);
    432 
    433     void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * const numOfStrides) final;
     470    virtual void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
    434471
    435472    BlockOrientedKernel(std::string && kernelName,
    436                         std::vector<Binding> && stream_inputs,
    437                         std::vector<Binding> && stream_outputs,
    438                         std::vector<Binding> && scalar_parameters,
    439                         std::vector<Binding> && scalar_outputs,
    440                         std::vector<Binding> && internal_scalars);
     473                        Bindings && stream_inputs,
     474                        Bindings && stream_outputs,
     475                        Bindings && scalar_parameters,
     476                        Bindings && scalar_outputs,
     477                        Bindings && internal_scalars);
    441478
    442479private:
    443480
    444     void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb);
    445 
    446     void writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * remainingItems);
     481    llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
     482
     483    void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b);
     484
     485    void writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
     486
     487    llvm::Value * getRemainingItems(const std::unique_ptr<KernelBuilder> & b);
    447488
    448489private:
    449490
    450     llvm::Function *        mDoBlockMethod;
    451     llvm::BasicBlock *      mStrideLoopBody;
    452     llvm::IndirectBrInst *  mStrideLoopBranch;
    453     llvm::PHINode *         mStrideLoopTarget;
     491    llvm::Function *            mDoBlockMethod;
     492    llvm::BasicBlock *          mStrideLoopBody;
     493    llvm::IndirectBrInst *      mStrideLoopBranch;
     494    llvm::PHINode *             mStrideLoopTarget;
     495    llvm::PHINode *             mStrideBlockIndex;
    454496};
    455497
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r5706 r5755  
    44#include <kernels/streamset.h>
    55#include <llvm/Support/raw_ostream.h>
     6#include <llvm/IR/Module.h>
    67
    78using namespace llvm;
    89using namespace parabix;
    910
    10 using Value = Value;
     11inline static bool is_power_2(const uint64_t n) {
     12    return ((n & (n - 1)) == 0) && n;
     13}
    1114
    1215namespace kernel {
     
    1417using Port = Kernel::Port;
    1518
    16 Value * KernelBuilder::getScalarFieldPtr(llvm::Value * instance, Value * const index) {
    17     assert (instance);
    18     CreateAssert(instance, "getScalarFieldPtr: instance cannot be null!");
     19Value * KernelBuilder::getScalarFieldPtr(llvm::Value * const instance, Value * const index) {
     20    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     21        CreateAssert(instance, "getScalarFieldPtr: instance cannot be null!");
     22    }
    1923    return CreateGEP(instance, {getInt32(0), index});
    2024}
    2125
    22 Value * KernelBuilder::getScalarFieldPtr(llvm::Value * instance, const std::string & fieldName) {
    23     return getScalarFieldPtr(instance, getInt32(mKernel->getScalarIndex(fieldName)));
    24 }
    25 
    26 llvm::Value * KernelBuilder::getScalarFieldPtr(llvm::Value * index) {
     26Value * KernelBuilder::getScalarFieldPtr(llvm::Value * const handle, const std::string & fieldName) {
     27    return getScalarFieldPtr(handle, getInt32(mKernel->getScalarIndex(fieldName)));
     28}
     29
     30llvm::Value * KernelBuilder::getScalarFieldPtr(llvm::Value * const index) {
    2731    return getScalarFieldPtr(mKernel->getInstance(), index);
    2832}
     
    4246Value * KernelBuilder::getStreamHandle(const std::string & name) {
    4347    Value * const ptr = getScalarField(name + Kernel::BUFFER_PTR_SUFFIX);
    44     CreateAssert(ptr, name + " cannot be null!");
     48    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     49        CreateAssert(ptr, name + " handle cannot be null!");
     50    }
    4551    return ptr;
    4652}
     
    5864}
    5965
    60 inline const Binding & getBinding(const Kernel * k, const std::string & name) {
    61     Port port; unsigned index;
    62     std::tie(port, index) = k->getStreamPort(name);
    63     if (port == Port::Input) {
    64         return k->getStreamInput(index);
    65     } else {
    66         return k->getStreamOutput(index);
    67     }
    68 }
    69 
    7066Value * KernelBuilder::getInternalItemCount(const std::string & name, const std::string & suffix) {
    71     const ProcessingRate & rate = getBinding(mKernel, name).getRate();
     67    const ProcessingRate & rate = mKernel->getBinding(name).getRate();
    7268    Value * itemCount = nullptr;
    73     if (rate.isExactlyRelative()) {
     69    if (LLVM_UNLIKELY(rate.isRelative())) {
    7470        Port port; unsigned index;
    7571        std::tie(port, index) = mKernel->getStreamPort(rate.getReference());
     
    7975            itemCount = getProducedItemCount(rate.getReference());
    8076        }
    81         if (rate.getNumerator() != 1) {
    82             itemCount = CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getNumerator()));
    83         }
    84         if (rate.getDenominator() != 1) {
    85             itemCount = CreateExactUDiv(itemCount, ConstantInt::get(itemCount->getType(), rate.getDenominator()));
     77        const auto & r = rate.getRate();
     78        if (r.numerator() != 1) {
     79            itemCount = CreateMul(itemCount, ConstantInt::get(itemCount->getType(), r.numerator()));
     80        }
     81        if (r.denominator() != 1) {
     82            itemCount = CreateExactUDiv(itemCount, ConstantInt::get(itemCount->getType(), r.denominator()));
    8683        }
    8784    } else {
     
    9289
    9390void KernelBuilder::setInternalItemCount(const std::string & name, const std::string & suffix, llvm::Value * const value) {
    94     const ProcessingRate & rate = getBinding(mKernel, name).getRate();
     91    const ProcessingRate & rate = mKernel->getBinding(name).getRate();
    9592    if (LLVM_UNLIKELY(rate.isDerived())) {
    9693        report_fatal_error("Cannot set item count: " + name + " is a Derived rate");
     
    139136}
    140137
    141 Value * KernelBuilder::copy(const std::string & name, Value * target, Value * source, Value * itemsToCopy, const unsigned alignment) {
     138//Value * KernelBuilder::getLinearlyCopyableItems(const std::string & name, Value * fromPosition, bool reverse) {
     139//    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     140//    return buf->getLinearlyCopyableItems(this, getStreamHandle(name), fromPosition, reverse);
     141//}
     142
     143/** ------------------------------------------------------------------------------------------------------------- *
     144 * @brief isConstantZero
     145 ** ------------------------------------------------------------------------------------------------------------- */
     146inline bool isConstantZero(Value * const v) {
     147    return isa<ConstantInt>(v) && cast<ConstantInt>(v)->isNullValue();
     148}
     149
     150/** ------------------------------------------------------------------------------------------------------------- *
     151 * @brief isConstantOne
     152 ** ------------------------------------------------------------------------------------------------------------- */
     153inline bool isConstantOne(Value * const v) {
     154    return isa<ConstantInt>(v) && cast<ConstantInt>(v)->isOne();
     155}
     156
     157/** ------------------------------------------------------------------------------------------------------------- *
     158 * @brief getItemWidth
     159 ** ------------------------------------------------------------------------------------------------------------- */
     160inline unsigned getItemWidth(const Type * ty) {
     161    if (LLVM_LIKELY(isa<ArrayType>(ty))) {
     162        ty = ty->getArrayElementType();
     163    }
     164    return cast<IntegerType>(ty->getVectorElementType())->getBitWidth();
     165}
     166
     167/** ------------------------------------------------------------------------------------------------------------- *
     168 * @brief getFieldWidth
     169 ** ------------------------------------------------------------------------------------------------------------- */
     170inline unsigned getFieldWidth(const unsigned bitWidth, const unsigned blockWidth) {
     171    for (unsigned k = 16; k < blockWidth; k *= 2) {
     172        if ((bitWidth & (k - 1)) != 0) {
     173            return k / 2;
     174        }
     175    }
     176    return blockWidth;
     177}
     178
     179/** ------------------------------------------------------------------------------------------------------------- *
     180 * @brief CreateStreamCpy
     181 ** ------------------------------------------------------------------------------------------------------------- */
     182void KernelBuilder::CreateStreamCpy(const std::string & name, Value * target, Value * targetOffset, Value * source, Value * sourceOffset, Value * itemsToCopy, const unsigned itemAlignment) {
     183
     184    assert (target && targetOffset);
     185    assert (source && sourceOffset);
     186    assert (target->getType() == source->getType());
     187    assert (target->getType()->isPointerTy());
     188
    142189    const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
    143     return buf->copy(this, getStreamHandle(name), target, source, itemsToCopy, alignment);
     190
     191    const auto itemWidth = getItemWidth(buf->getBaseType());
     192    assert ("invalid item width" && is_power_2(itemWidth));
     193    const auto blockWidth = getBitBlockWidth();
     194
     195    const auto fieldWidth = getFieldWidth(itemWidth * itemAlignment, blockWidth);
     196    assert ("overflow error" && is_power_2(fieldWidth) && (itemWidth <= fieldWidth));
     197
     198    assert (isConstantZero(targetOffset) || isConstantZero(sourceOffset));
     199
     200    IntegerType * const fieldWidthTy = getIntNTy(fieldWidth / 8);
     201
     202    const auto alignment = fieldWidth / 8;
     203
     204    if (LLVM_LIKELY(itemWidth < fieldWidth)) {
     205        Constant * const factor = getSize(fieldWidth / itemWidth);
     206        CreateAssertZero(CreateURem(targetOffset, factor), "target offset is not a multiple of its field width");
     207        targetOffset = CreateUDiv(targetOffset, factor);
     208        CreateAssertZero(CreateURem(sourceOffset, factor), "source offset is not a multiple of its field width");
     209        sourceOffset = CreateUDiv(sourceOffset, factor);
     210    }
     211
     212    /*
     213
     214       Streams are conceptually modelled as:
     215
     216                                            BLOCKS
     217
     218                                      A     B     C     D
     219           STREAM SET ELEMENT   1  |aaaaa|bbbbb|ccccc|dddd |
     220                                2  |eeeee|fffff|ggggg|hhhh |
     221                                3  |iiiii|jjjjj|kkkkk|llll |
     222
     223       But the memory layout is actually:
     224
     225           A_1   A_2   A_3   B_1   B_2   B_3   C_1   C_2   C_3   D_1   D_2   D_3
     226
     227         |aaaaa|eeeee|iiiii|bbbbb|fffff|jjjjj|ccccc|ggggg|kkkkk|dddd |hhhh |llll |
     228
     229
     230       So if we're copying the entire stream set block or our stream set has one element, we can use memcpy.
     231
     232    */
     233
     234    Value * const n = buf->getStreamSetCount(this, getStreamHandle(name));
     235    if (fieldWidth == blockWidth || isConstantOne(n) || (isConstantZero(targetOffset) && isConstantZero(sourceOffset))) {
     236        PointerType * const fieldWidthPtrTy = fieldWidthTy->getPointerTo();
     237        if (isConstantOne(n)) {
     238            if (LLVM_LIKELY(itemWidth < 8)) {
     239                itemsToCopy = CreateUDivCeil(itemsToCopy, getSize(8 / itemWidth));
     240            } else if (LLVM_UNLIKELY(itemWidth > 8)) {
     241                itemsToCopy = CreateMul(itemsToCopy, getSize(itemWidth / 8));
     242            }
     243        } else {
     244            itemsToCopy = CreateMul(CreateUDivCeil(itemsToCopy, getSize(blockWidth / (8 * itemWidth))), n);
     245        }
     246        target = CreateGEP(CreatePointerCast(target, fieldWidthPtrTy), targetOffset);
     247        source = CreateGEP(CreatePointerCast(source, fieldWidthPtrTy), sourceOffset);
     248        CreateMemCpy(target, source, itemsToCopy, alignment);
     249
     250    } else { // either the target offset or source offset is non-zero but not both
     251
     252        VectorType * const blockTy = getBitBlockType();
     253        PointerType * const blockPtrTy = blockTy->getPointerTo();
     254
     255        target = CreatePointerCast(target, blockPtrTy);
     256        source = CreatePointerCast(source, blockPtrTy);
     257
     258        VectorType * const shiftTy = VectorType::get(fieldWidthTy, blockWidth / fieldWidth);
     259        Constant * const width = getSize(blockWidth / itemWidth);
     260        BasicBlock * const entry = GetInsertBlock();
     261
     262
     263        if (isConstantZero(targetOffset)) {
     264
     265            /*
     266                                                BLOCKS
     267
     268                                          A     B     C     D
     269               SOURCE STREAM        1  |aaa--|bbbBB|cccCC|  dDD|
     270                                    2  |eee--|fffFF|gggGG|  hHH|
     271                                    3  |iii--|jjjJJ|kkkKK|  lLL|
     272
     273
     274                                          A     B     C     D
     275               TARGET STREAM        1  |BBaaa|CCbbb|DDccc|    d|
     276                                    2  |FFeee|GGfff|HHggg|    h|
     277                                    3  |JJiii|KKjjj|LLkkk|    l|
     278             */
     279
     280            Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, width), n);
     281            Value * const offset = CreateURem(sourceOffset, width);
     282            Value * const remaining = CreateSub(width, offset);
     283            Value * const trailing = CreateURem(CreateAdd(sourceOffset, itemsToCopy), width);
     284
     285            BasicBlock * const streamCopy = CreateBasicBlock(name + "StreamCopy");
     286            BasicBlock * const streamCopyRemaining = CreateBasicBlock(name + "StreamCopyRemaining");
     287            BasicBlock * const streamCopyEnd = CreateBasicBlock(name + "StreamCopyEnd");
     288
     289            CreateCondBr(CreateICmpNE(blocksToCopy, getSize(0)), streamCopy, streamCopyRemaining);
     290
     291            SetInsertPoint(streamCopy);
     292            PHINode * const i = CreatePHI(getSizeTy(), 2);
     293            i->addIncoming(n, entry);
     294            Value * prior = CreateAlignedLoad(CreateGEP(source, CreateSub(i, n)), alignment);
     295            prior = CreateLShr(CreateBitCast(prior, shiftTy), offset);
     296            Value * value = CreateAlignedLoad(CreateGEP(source, i), alignment);
     297            value = CreateShl(CreateBitCast(value, shiftTy), remaining);
     298            Value * const result = CreateBitCast(CreateOr(value, prior), blockTy);
     299            CreateAlignedStore(result, CreateGEP(target, i), alignment);
     300            Value * const next_i = CreateAdd(i, getSize(1));
     301            i->addIncoming(next_i, streamCopy);
     302            CreateCondBr(CreateICmpNE(next_i, blocksToCopy), streamCopy, streamCopyRemaining);
     303
     304            SetInsertPoint(streamCopyRemaining);
     305            PHINode * const j = CreatePHI(getSizeTy(), 2);
     306            j->addIncoming(getSize(0), streamCopy);
     307            Value * k = CreateAdd(blocksToCopy, j);
     308            Value * final = CreateAlignedLoad(CreateGEP(source, k), alignment);
     309            final = CreateLShr(CreateBitCast(prior, shiftTy), trailing);
     310            CreateAlignedStore(final, CreateGEP(target, k), alignment);
     311            Value * const next_j = CreateAdd(i, getSize(1));
     312            i->addIncoming(next_j, streamCopyRemaining);
     313            CreateCondBr(CreateICmpNE(next_j, n), streamCopyRemaining, streamCopyEnd);
     314
     315            SetInsertPoint(streamCopyEnd);
     316
     317        } else if (isConstantZero(sourceOffset)) {
     318
     319            /*
     320                                                BLOCKS
     321
     322                                          A     B     C     D
     323               SOURCE STREAM        1  |AAAaa|BBBbb|CCCcc|    d|
     324                                    2  |EEEee|FFFff|GGGgg|    h|
     325                                    3  |IIIii|JJJjj|KKKkk|    l|
     326
     327
     328                                          A     B     C     D
     329               TARGET STREAM        1  |aa---|bbAAA|ccBBB| dCCC|
     330                                    2  |ee---|ffEEE|ggFFF| hGGG|
     331                                    3  |ii---|jjIII|kkJJJ| lKKK|
     332
     333            */
     334
     335            BasicBlock * const streamCopy = CreateBasicBlock(name + "StreamCopy");
     336            BasicBlock * const streamCopyRemainingCond = CreateBasicBlock(name + "StreamCopyRemainingCond");
     337            BasicBlock * const streamCopyRemaining = CreateBasicBlock(name + "StreamCopyRemaining");
     338            BasicBlock * const streamCopyEnd = CreateBasicBlock(name + "StreamCopyEnd");
     339
     340            Value * const offset = CreateURem(targetOffset, width);
     341            Value * const copied = CreateSub(width, offset);
     342            Value * const mask = CreateLShr(Constant::getAllOnesValue(shiftTy), copied);
     343
     344            SetInsertPoint(streamCopy);
     345            PHINode * const i = CreatePHI(getSizeTy(), 2);
     346            i->addIncoming(getSize(0), entry);
     347            Value * targetValue = CreateAlignedLoad(CreateGEP(target, i), alignment);
     348            targetValue = CreateAnd(CreateBitCast(targetValue, shiftTy), mask);
     349            Value * sourceValue = CreateAlignedLoad(CreateGEP(source, i), alignment);
     350            sourceValue = CreateShl(CreateBitCast(sourceValue, shiftTy), offset);
     351            CreateAlignedStore(CreateOr(sourceValue, targetValue), CreateGEP(source, i), alignment);
     352            Value * const next_i = CreateAdd(i, getSize(1));
     353            i->addIncoming(next_i, streamCopy);
     354            CreateCondBr(CreateICmpNE(next_i, n), streamCopy, streamCopyRemainingCond);
     355
     356            SetInsertPoint(streamCopyRemainingCond);
     357            Value * const blocksToCopy = CreateMul(CreateUDiv(CreateSub(itemsToCopy, copied), width), n);
     358            CreateCondBr(CreateICmpULT(copied, itemsToCopy), streamCopyRemaining, streamCopyEnd);
     359
     360            SetInsertPoint(streamCopyRemaining);
     361            PHINode * const j = CreatePHI(getSizeTy(), 2);
     362            j->addIncoming(n, entry);
     363            Value * prior = CreateAlignedLoad(CreateGEP(source, CreateSub(j, n)), alignment);
     364            prior = CreateShl(CreateBitCast(prior, shiftTy), offset);
     365            Value * value = CreateAlignedLoad(CreateGEP(source, j), alignment);
     366            value = CreateLShr(CreateBitCast(value, shiftTy), copied);
     367            Value * const result = CreateBitCast(CreateOr(value, prior), blockTy);
     368            CreateAlignedStore(result, CreateGEP(target, j), alignment);
     369            Value * const next_j = CreateAdd(j, getSize(1));
     370            j->addIncoming(next_j, streamCopy);
     371            CreateCondBr(CreateICmpNE(next_j, blocksToCopy), streamCopyRemaining, streamCopyEnd);
     372
     373            SetInsertPoint(streamCopyEnd);
     374        }
     375
     376    }
    144377}
    145378
    146379void KernelBuilder::CreateCopyBack(const std::string & name, llvm::Value * from, llvm::Value * to) {
    147380    const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
    148     return buf->genCopyBackLogic(this, getStreamHandle(name), from, to, name);
     381    buf->genCopyBackLogic(this, getStreamHandle(name), from, to, name);
    149382}
    150383
     
    157390}
    158391
    159 inline Value * KernelBuilder::computeBlockIndex(Value * itemCount) {
    160     const auto divisor = getBitBlockWidth();
    161     if (LLVM_LIKELY((divisor & (divisor - 1)) == 0)) {
    162         return CreateLShr(itemCount, std::log2(divisor));
    163     } else {
    164         return CreateUDiv(itemCount, getSize(divisor));
    165     }
    166 }
    167 
    168 Value * KernelBuilder::getInputStreamPtr(const std::string & name, Value * const blockIndex) {
    169 //    Value * const blockIndex = computeBlockIndex(getProcessedItemCount(name));
    170     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    171     return buf->getBlockAddress(this, getStreamHandle(name), blockIndex);
    172 }
    173 
    174392Value * KernelBuilder::getInputStreamBlockPtr(const std::string & name, Value * streamIndex) {
    175     const Kernel::StreamPort p = mKernel->getStreamPort(name);
    176     if (LLVM_UNLIKELY(p.first == Port::Output)) {
    177         report_fatal_error(name + " is not an input stream set");
    178     }
    179     Value * const addr = mKernel->getStreamSetInputBufferPtr(p.second);
    180     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    181     return buf->getStreamBlockPtr(this, getStreamHandle(name), addr, streamIndex, true);
     393    Value * const addr = mKernel->getStreamSetInputAddress(name);
     394    if (addr) {
     395        return CreateGEP(addr, {getInt32(0), streamIndex});
     396    } else {
     397        const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     398        Value * const blockIndex = CreateLShr(getProcessedItemCount(name), std::log2(getBitBlockWidth()));
     399        return buf->getStreamBlockPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, true);
     400    }
    182401}
    183402
     
    187406
    188407Value * KernelBuilder::getInputStreamPackPtr(const std::string & name, Value * streamIndex, Value * packIndex) {
    189     const Kernel::StreamPort p = mKernel->getStreamPort(name);
    190     if (LLVM_UNLIKELY(p.first == Port::Output)) {
    191         report_fatal_error(name + " is not an input stream set");
    192     }
    193     Value * const addr = mKernel->getStreamSetInputBufferPtr(p.second);
    194     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    195     return buf->getStreamPackPtr(this, getStreamHandle(name), addr, streamIndex, packIndex, true);
     408    Value * const addr = mKernel->getStreamSetInputAddress(name);
     409    if (addr) {
     410        return CreateGEP(addr, {getInt32(0), streamIndex, packIndex});
     411    } else {
     412        const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     413        Value * const blockIndex = CreateLShr(getProcessedItemCount(name), std::log2(getBitBlockWidth()));
     414        return buf->getStreamPackPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, packIndex, true);
     415    }
    196416}
    197417
    198418Value * KernelBuilder::loadInputStreamPack(const std::string & name, Value * streamIndex, Value * packIndex) {
     419
     420
     421
    199422    return CreateBlockAlignedLoad(getInputStreamPackPtr(name, streamIndex, packIndex));
    200423}
     
    206429
    207430Value * KernelBuilder::getAdjustedInputStreamBlockPtr(Value * blockAdjustment, const std::string & name, Value * streamIndex) {
    208     const Kernel::StreamPort p = mKernel->getStreamPort(name);
    209     if (LLVM_UNLIKELY(p.first == Port::Output)) {
    210         report_fatal_error(name + " is not an input stream set");
    211     }
    212     Value * const addr = mKernel->getStreamSetInputBufferPtr(p.second);
    213     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    214     return buf->getStreamBlockPtr(this, getStreamHandle(name), CreateGEP(addr, blockAdjustment), streamIndex, true);
    215 }
    216 
    217 Value * KernelBuilder::getOutputStreamPtr(const std::string & name, Value * const blockIndex) {
    218 //    Value * const blockIndex = computeBlockIndex(getProducedItemCount(name));
    219     const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    220     return buf->getBlockAddress(this, getStreamHandle(name), blockIndex);
     431    Value * const addr = mKernel->getStreamSetInputAddress(name);
     432    if (addr) {
     433        return CreateGEP(addr, {blockAdjustment, streamIndex});
     434    } else {
     435        const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     436        Value * blockIndex = CreateLShr(getProcessedItemCount(name), std::log2(getBitBlockWidth()));
     437        blockIndex = CreateAdd(blockIndex, blockAdjustment);
     438        return buf->getStreamBlockPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, true);
     439    }
    221440}
    222441
    223442Value * KernelBuilder::getOutputStreamBlockPtr(const std::string & name, Value * streamIndex) {
    224     const Kernel::StreamPort p = mKernel->getStreamPort(name);
    225     if (LLVM_UNLIKELY(p.first == Port::Input)) {
    226         report_fatal_error(name + " is not an output stream set");
    227     }
    228     Value * addr = mKernel->getStreamSetOutputBufferPtr(p.second);
    229     const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    230     return buf->getStreamBlockPtr(this, getStreamHandle(name), addr, streamIndex, true);
     443    Value * const addr = mKernel->getStreamSetOutputAddress(name);
     444    if (addr) {
     445        return CreateGEP(addr, {getInt32(0), streamIndex});
     446    } else {
     447        const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     448        Value * const blockIndex = CreateLShr(getProducedItemCount(name), std::log2(getBitBlockWidth()));
     449        return buf->getStreamBlockPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, false);
     450    }
    231451}
    232452
     
    236456
    237457Value * KernelBuilder::getOutputStreamPackPtr(const std::string & name, Value * streamIndex, Value * packIndex) {
    238     const Kernel::StreamPort p = mKernel->getStreamPort(name);
    239     if (LLVM_UNLIKELY(p.first == Port::Input)) {
    240         report_fatal_error(name + " is not an output stream set");
    241     }
    242     Value * addr = mKernel->getStreamSetOutputBufferPtr(p.second);
    243     const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    244     return buf->getStreamPackPtr(this, getStreamHandle(name), addr, streamIndex, packIndex, false);
     458    Value * const addr = mKernel->getStreamSetOutputAddress(name);
     459    if (addr) {
     460        return CreateGEP(addr, {getInt32(0), streamIndex, packIndex});
     461    } else {
     462        const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     463        Value * const blockIndex = CreateLShr(getProducedItemCount(name), std::log2(getBitBlockWidth()));
     464        return buf->getStreamPackPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, packIndex, false);
     465    }
    245466}
    246467
     
    280501}
    281502
    282 
    283503Value * KernelBuilder::getCapacity(const std::string & name) {
    284504    return mKernel->getAnyStreamSetBuffer(name)->getCapacity(this, getStreamHandle(name));
     
    289509}
    290510
     511Value * KernelBuilder::getBlockAddress(const std::string & name, Value * blockIndex) {
     512    const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
     513    return buf->getBlockAddress(this, getStreamHandle(name), blockIndex);
     514}
     515
     516void KernelBuilder::protectOutputStream(const std::string & name, const bool readOnly) {
     517    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     518    Value * const handle = getStreamHandle(name);
     519    Value * const base = buf->getBaseAddress(this, handle);
     520    Value * sz = ConstantExpr::getSizeOf(buf->getType());
     521    sz = CreateMul(sz, getInt64(buf->getBufferBlocks()));
     522    sz = CreateMul(sz, CreateZExt(buf->getStreamSetCount(this, handle), getInt64Ty()));
     523    CreateMProtect(base, sz, readOnly ? CBuilder::READ : (CBuilder::READ | CBuilder::WRITE));
     524}
    291525   
    292526CallInst * KernelBuilder::createDoSegmentCall(const std::vector<Value *> & args) {
    293 //    Function * const doSegment = mKernel->getDoSegmentFunction(getModule());
    294 //    assert (doSegment->getArgumentList().size() == args.size());
    295 //    return CreateCall(doSegment, args);
    296527    return mKernel->makeDoSegmentCall(*this, args);
    297528}
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.h

    r5706 r5755  
    4444    }
    4545
    46     llvm::Value * getProcessedItemCount(const std::string & name) {
     46    llvm::Value * getProcessedItemCount(const std::string & name) {       
    4747        return getInternalItemCount(name, Kernel::PROCESSED_ITEM_COUNT_SUFFIX);
    4848    }
     
    7171    // use in implementing kernels.
    7272
    73     llvm::Value * getInputStreamPtr(const std::string & name, llvm::Value * const blockIndex);
    74 
    7573    llvm::Value * getInputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex);
    7674
     
    8280
    8381    llvm::Value * getInputStreamSetCount(const std::string & name);
    84 
    85     llvm::Value * getOutputStreamPtr(const std::string & name, llvm::Value * const blockIndex);
    8682
    8783    llvm::Value * getOutputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex);
     
    10399    llvm::Value * getBaseAddress(const std::string & name);
    104100
     101    llvm::Value * getBlockAddress(const std::string & name, llvm::Value * const blockIndex);
     102
    105103    void CreateCopyBack(const std::string & name, llvm::Value * from, llvm::Value * to);
    106104
     
    121119    llvm::Value * getLinearlyWritableItems(const std::string & name, llvm::Value * fromPos, bool reverse = false);
    122120   
    123     llvm::Value * copy(const std::string & name, llvm::Value * target, llvm::Value * source, llvm::Value * itemsToCopy, const unsigned alignment = 0);
     121    void CreateStreamCpy(const std::string & name, llvm::Value * const target, llvm::Value * const targetOffset, llvm::Value * const source, llvm::Value * const sourceOffset, llvm::Value * const itemsToCopy, const unsigned itemAlignment);
    124122
    125123    llvm::BasicBlock * CreateConsumerWait();
     
    143141    }
    144142
     143    void protectOutputStream(const std::string & name, const bool readOnly);
     144
    145145protected:
    146146
     
    158158
    159159    void setInternalItemCount(const std::string & name, const std::string & suffix, llvm::Value * const value);
    160 
    161 private:
    162 
    163     llvm::Value * computeBlockIndex(llvm::Value * itemCount);
    164160
    165161protected:
  • icGREP/icgrep-devel/icgrep/kernels/linebreak_kernel.cpp

    r5706 r5755  
    2121
    2222LineBreakKernelBuilder::LineBreakKernelBuilder(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned basisBitsCount)
    23 : PabloKernel(b, "lb", {Binding{b->getStreamSetTy(basisBitsCount), "basis"}}, {Binding{b->getStreamSetTy(1), "linebreak", FixedRate(), Add1()}}) {
     23: PabloKernel(b, "lb",
     24    {Binding{b->getStreamSetTy(basisBitsCount), "basis", FixedRate(), Principal()}},
     25    {Binding{b->getStreamSetTy(1), "linebreak", FixedRate(), Add1()}}) {
    2426
    2527}
  • icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.cpp

    r5706 r5755  
    1111using namespace kernel;
    1212
    13 Value * getInputPtr(const std::unique_ptr<KernelBuilder> & iBuilder, Value * blockStartPtr, Value * offset) {
    14     return iBuilder->CreateGEP(
    15             iBuilder->CreatePointerCast(blockStartPtr, iBuilder->getInt32Ty()->getPointerTo()),
    16             offset
    17             );
    18 }
     13Value * LZ4ByteStreamDecoderKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * numOfStrides) {
    1914
    20 Value * selectMin(const std::unique_ptr<KernelBuilder> & iBuilder, Value * a, Value * b) {
    21     return iBuilder->CreateSelect(iBuilder->CreateICmpULT(a, b), a, b);
    22 }
     15    BasicBlock * entry_block = b->GetInsertBlock();
     16    BasicBlock * loopBody = b->CreateBasicBlock("bytestream_block_loop_body");
     17    BasicBlock * loopExit = b->CreateBasicBlock("bytestream_block_loop_exit");
     18    Type * const i32PtrTy = b->getInt32Ty()->getPointerTo();
     19    Type * const sizeTy = b->getSizeTy();
     20    assert (mBufferSize > 0);
     21    Value * bufferSize = b->getSize(mBufferSize);
     22    Value * bufferSizeMask = b->getSize(mBufferSize - 1);
     23    Value * const iterations = b->getAvailableItemCount("literalIndexes");
     24    Value * const inputBufferBasePtr = b->getRawInputPointer("inputStream", b->getInt32(0));
     25    Value * const outputBufferBasePtr = b->getRawOutputPointer("outputStream", b->getInt32(0));
     26    Value * baseLiteralStartPtr = b->getInputStreamBlockPtr("literalIndexes", b->getSize(0));
     27    baseLiteralStartPtr = b->CreatePointerCast(baseLiteralStartPtr, i32PtrTy);
     28    Value * baseLiteralLengthPtr = b->getInputStreamBlockPtr("literalIndexes", b->getSize(1));
     29    baseLiteralLengthPtr = b->CreatePointerCast(baseLiteralLengthPtr, i32PtrTy);
     30    Value * baseMatchOffsetPtr = b->getInputStreamBlockPtr("matchIndexes", b->getSize(0));
     31    baseMatchOffsetPtr = b->CreatePointerCast(baseMatchOffsetPtr, i32PtrTy);
     32    Value * baseMatchLengthPtr = b->getInputStreamBlockPtr("matchIndexes", b->getSize(1));
     33    baseMatchLengthPtr = b->CreatePointerCast(baseMatchLengthPtr, i32PtrTy);
     34    b->CreateBr(loopBody);
    2335
    24 void LZ4ByteStreamDecoderKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
    25     BasicBlock * entry_block = iBuilder->GetInsertBlock();
    26     BasicBlock * loopBody = iBuilder->CreateBasicBlock("bytestream_block_loop_body");
    27     BasicBlock * loopExit = iBuilder->CreateBasicBlock("bytestream_block_loop_exit");
    28 
    29     Value * bufferSize = iBuilder->getSize(mBufferSize);
    30     Value * bufferSizeMask = iBuilder->CreateSub(bufferSize, iBuilder->getSize(1));
    31     Value * iterations = selectMin(iBuilder,
    32             iBuilder->getSize(iBuilder->getBitBlockWidth()),
    33             iBuilder->CreateSub(iBuilder->getAvailableItemCount("literalIndexes"), iBuilder->getProcessedItemCount("literalIndexes")));
    34     Value * inputBufferBasePtr = iBuilder->getRawInputPointer("inputStream", iBuilder->getSize(0));
    35     Value * outputBufferBasePtr = iBuilder->getRawOutputPointer("outputStream", iBuilder->getSize(0));
    36     iBuilder->CreateBr(loopBody);
    37 
    38     iBuilder->SetInsertPoint(loopBody);
    39     PHINode * phiInputIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2, "inputIndex");
    40     phiInputIndex->addIncoming(iBuilder->getSize(0), entry_block);
     36    b->SetInsertPoint(loopBody);
     37    PHINode * phiInputIndex = b->CreatePHI(sizeTy, 2, "inputIndex");
     38    phiInputIndex->addIncoming(b->getSize(0), entry_block);
    4139
    4240    // =================================================
    4341    // Indexes extraction.
    44     Value * literalStartPtr = getInputPtr(iBuilder,
    45             iBuilder->getInputStreamBlockPtr("literalIndexes", iBuilder->getSize(0)), phiInputIndex);
    46     Value * literalLengthPtr = getInputPtr(iBuilder,
    47             iBuilder->getInputStreamBlockPtr("literalIndexes", iBuilder->getSize(1)), phiInputIndex);
    48     Value * matchOffsetPtr = getInputPtr(iBuilder,
    49             iBuilder->getInputStreamBlockPtr("matchIndexes", iBuilder->getSize(0)), phiInputIndex);
    50     Value * matchLengthPtr = getInputPtr(iBuilder,
    51             iBuilder->getInputStreamBlockPtr("matchIndexes", iBuilder->getSize(1)), phiInputIndex);
    52     Value * literalStart = iBuilder->CreateZExt(iBuilder->CreateLoad(literalStartPtr), iBuilder->getSizeTy());
    53     Value * literalLength = iBuilder->CreateZExt(iBuilder->CreateLoad(literalLengthPtr), iBuilder->getSizeTy());
    54     Value * matchOffset = iBuilder->CreateZExt(iBuilder->CreateLoad(matchOffsetPtr), iBuilder->getSizeTy());
    55     Value * matchLength = iBuilder->CreateZExt(iBuilder->CreateLoad(matchLengthPtr), iBuilder->getSizeTy());
    5642
    57 //    iBuilder->CallPrintInt(" ----- literalStart", literalStart);
    58 //    iBuilder->CallPrintInt(" ----- literalLength", literalLength);
    59 //    iBuilder->CallPrintInt(" ----- matchOffset", matchOffset);
    60 //    iBuilder->CallPrintInt(" ----- matchLength", matchLength);
    6143
    62 //#if 0
    63 //    Value * processedItem = iBuilder->CreateAdd(iBuilder->getProcessedItemCount("literalIndexes"), phiInputIndex);
    64 //    iBuilder->CallPrintInt("ProccessedItem", processedItem);
    65 //    iBuilder->CallPrintInt("LiteralStart", literalStart);
    66 //    iBuilder->CallPrintInt("LiteralLength", literalLength);
    67 //    iBuilder->CallPrintInt("MatchOffset", matchOffset);
    68 //    iBuilder->CallPrintInt("MatchLength", matchLength);
    69 //#endif
     44    Value * literalStartPtr = b->CreateGEP(baseLiteralStartPtr, phiInputIndex);
     45    Value * literalLengthPtr = b->CreateGEP(baseLiteralLengthPtr, phiInputIndex);
     46    Value * matchOffsetPtr = b->CreateGEP(baseMatchOffsetPtr, phiInputIndex);
     47    Value * matchLengthPtr = b->CreateGEP(baseMatchLengthPtr, phiInputIndex);
     48
     49    Value * literalStart = b->CreateZExt(b->CreateLoad(literalStartPtr), sizeTy);
     50    Value * literalLength = b->CreateZExt(b->CreateLoad(literalLengthPtr), sizeTy);
     51    Value * matchOffset = b->CreateZExt(b->CreateLoad(matchOffsetPtr), sizeTy);
     52    Value * matchLength = b->CreateZExt(b->CreateLoad(matchLengthPtr), sizeTy);
    7053
    7154    // =================================================
    7255    // Literals.
    73     Value * outputItems = iBuilder->getProducedItemCount("outputStream");
    74     Value * bufferOffset = iBuilder->CreateAnd(outputItems, bufferSizeMask);
    75     Value * remainingBuffer = iBuilder->CreateSub(bufferSize, bufferOffset);
    76     Value * copyLength1 = selectMin(iBuilder, remainingBuffer, literalLength);
    77     iBuilder->CreateMemCpy(
    78             iBuilder->CreateGEP(outputBufferBasePtr, bufferOffset),
    79             iBuilder->CreateGEP(inputBufferBasePtr, literalStart),
     56    Value * outputItems = b->getProducedItemCount("outputStream");
     57    Value * bufferOffset = b->CreateAnd(outputItems, bufferSizeMask);
     58    Value * remainingBuffer = b->CreateSub(bufferSize, bufferOffset);
     59    Value * copyLength1 = b->CreateUMin(remainingBuffer, literalLength);
     60    b->CreateMemCpy(
     61            b->CreateGEP(outputBufferBasePtr, bufferOffset),
     62            b->CreateGEP(inputBufferBasePtr, literalStart),
    8063            copyLength1, 1);    // no alignment guaranteed
    8164    // Potential wrap around.
    82     iBuilder->CreateMemCpy(
     65    b->CreateMemCpy(
    8366            outputBufferBasePtr,
    84             iBuilder->CreateGEP(inputBufferBasePtr, iBuilder->CreateAdd(literalStart, copyLength1)),
    85             iBuilder->CreateSub(literalLength, copyLength1), 1); // Buffer start is aligned.
     67            b->CreateGEP(inputBufferBasePtr, b->CreateAdd(literalStart, copyLength1)),
     68            b->CreateSub(literalLength, copyLength1), 1); // Buffer start is aligned.
    8669    // NOTE: Test case reported non-8-byte alignment
    87     outputItems = iBuilder->CreateAdd(outputItems, literalLength);
     70    outputItems = b->CreateAdd(outputItems, literalLength);
    8871
    8972    // =================================================
     
    9275    // [cur, cur+matchLength] sequentially, with two ranges potentially overlapping.
    9376    // If matchOffset is larger than 4, we copy 4 bytes at a time; otherwise, one byte a time.
    94     Value * matchStart = iBuilder->CreateSub(outputItems, matchOffset);
    95     Value * baseSrcOffset = iBuilder->CreateAnd(matchStart, bufferSizeMask);
    96     Value * baseDstOffset = iBuilder->CreateAnd(outputItems, bufferSizeMask);
    97     Value * copyStep = iBuilder->CreateSelect(
    98             iBuilder->CreateICmpULT(matchOffset, iBuilder->getSize(4)),
    99             iBuilder->getSize(1),
    100             iBuilder->getSize(4)
    101             );
    102     BasicBlock * cpyLoopCond = iBuilder->CreateBasicBlock("matchcopy_loop_cond");
    103     BasicBlock * cpyLoopBody = iBuilder->CreateBasicBlock("matchcopy_loop_body");
    104     BasicBlock * cpyLoopExit = iBuilder->CreateBasicBlock("matchcopy_loop_exit");
    105     iBuilder->CreateBr(cpyLoopCond);
     77    Value * matchStart = b->CreateSub(outputItems, matchOffset);
     78    Value * baseSrcOffset = b->CreateAnd(matchStart, bufferSizeMask);
     79    Value * baseDstOffset = b->CreateAnd(outputItems, bufferSizeMask);
     80    Value * const copyStep = b->CreateSelect(
     81            b->CreateICmpULT(matchOffset, b->getSize(4)),
     82            b->getSize(1),
     83            b->getSize(4));
     84    BasicBlock * cpyLoopCond = b->CreateBasicBlock("matchcopy_loop_cond");
     85    BasicBlock * cpyLoopBody = b->CreateBasicBlock("matchcopy_loop_body");
     86    BasicBlock * cpyLoopExit = b->CreateBasicBlock("matchcopy_loop_exit");
     87    b->CreateBr(cpyLoopCond);
    10688
    107     iBuilder->SetInsertPoint(cpyLoopCond);
    108     PHINode * phiSrcOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3, "srcOffset");
    109     PHINode * phiDstOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3, "dstOffset");
    110     PHINode * phiIter = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3, "iterator");
     89    b->SetInsertPoint(cpyLoopCond);
     90    PHINode * phiSrcOffset = b->CreatePHI(sizeTy, 3, "srcOffset");
     91    PHINode * phiDstOffset = b->CreatePHI(sizeTy, 3, "dstOffset");
     92    PHINode * phiIter = b->CreatePHI(sizeTy, 3, "iterator");
    11193    phiSrcOffset->addIncoming(baseSrcOffset, loopBody);
    11294    phiDstOffset->addIncoming(baseDstOffset, loopBody);
    113     phiIter->addIncoming(iBuilder->getSize(0), loopBody);
    114     iBuilder->CreateCondBr(
    115             iBuilder->CreateICmpUGE(phiIter, matchLength),
     95    phiIter->addIncoming(b->getSize(0), loopBody);
     96    b->CreateCondBr(
     97            b->CreateICmpUGE(phiIter, matchLength),
    11698            cpyLoopExit,
    11799            cpyLoopBody
    118100            );
    119101
    120     iBuilder->SetInsertPoint(cpyLoopBody);
     102    b->SetInsertPoint(cpyLoopBody);
    121103//#ifndef NDEBUG
    122104//    iBuilder->CallPrintIntToStderr("srcOffset", phiSrcOffset);
    123105//    iBuilder->CallPrintIntToStderr("dstOffset", phiDstOffset);
    124106//#endif
    125     BasicBlock * reachingBufferEnd_then = iBuilder->CreateBasicBlock("matchcopy_reaching_buf_end_then");
    126     BasicBlock * reachingBufferEnd_else = iBuilder->CreateBasicBlock("matchcopy_reaching_buf_end_else");
    127     Value * distSrcEnd = iBuilder->CreateSub(bufferSize, phiSrcOffset);
    128     Value * distDstEnd = iBuilder->CreateSub(bufferSize, phiDstOffset);
    129     Value * minDist = selectMin(iBuilder, distSrcEnd, distDstEnd);
    130     iBuilder->CreateUnlikelyCondBr(
    131             iBuilder->CreateICmpULE(minDist, iBuilder->getSize(4)),
     107    BasicBlock * reachingBufferEnd_then = b->CreateBasicBlock("matchcopy_reaching_buf_end_then");
     108    BasicBlock * reachingBufferEnd_else = b->CreateBasicBlock("matchcopy_reaching_buf_end_else");
     109    Value * distSrcEnd = b->CreateSub(bufferSize, phiSrcOffset);
     110    Value * distDstEnd = b->CreateSub(bufferSize, phiDstOffset);
     111    Value * minDist = b->CreateUMin(distSrcEnd, distDstEnd);
     112    b->CreateUnlikelyCondBr(
     113            b->CreateICmpULE(minDist, b->getSize(4)),
    132114            reachingBufferEnd_then,
    133115            reachingBufferEnd_else
    134116            );
    135117
    136     iBuilder->SetInsertPoint(reachingBufferEnd_then);
    137     Value * src8 = iBuilder->CreateGEP(outputBufferBasePtr, phiSrcOffset);
    138     Value * dst8 = iBuilder->CreateGEP(outputBufferBasePtr, phiDstOffset);
    139     iBuilder->CreateStore(iBuilder->CreateLoad(src8), dst8);
    140     Value * newSrcOffset = iBuilder->CreateAnd(
    141             iBuilder->CreateAdd(phiSrcOffset, iBuilder->getSize(1)),
     118    b->SetInsertPoint(reachingBufferEnd_then);
     119    Value * src8 = b->CreateGEP(outputBufferBasePtr, phiSrcOffset);
     120    Value * dst8 = b->CreateGEP(outputBufferBasePtr, phiDstOffset);
     121    b->CreateStore(b->CreateLoad(src8), dst8);
     122    Value * newSrcOffset = b->CreateAnd(
     123            b->CreateAdd(phiSrcOffset, b->getSize(1)),
    142124            bufferSizeMask
    143125            );
    144     Value * newDstOffset = iBuilder->CreateAnd(
    145             iBuilder->CreateAdd(phiDstOffset, iBuilder->getSize(1)),
     126    Value * newDstOffset = b->CreateAnd(
     127            b->CreateAdd(phiDstOffset, b->getSize(1)),
    146128            bufferSizeMask
    147129            );
    148130    phiSrcOffset->addIncoming(newSrcOffset, reachingBufferEnd_then);
    149131    phiDstOffset->addIncoming(newDstOffset, reachingBufferEnd_then);
    150     phiIter->addIncoming(iBuilder->CreateAdd(phiIter, iBuilder->getSize(1)), reachingBufferEnd_then);
    151     iBuilder->CreateBr(cpyLoopCond);
     132    phiIter->addIncoming(b->CreateAdd(phiIter, b->getSize(1)), reachingBufferEnd_then);
     133    b->CreateBr(cpyLoopCond);
    152134
    153     iBuilder->SetInsertPoint(reachingBufferEnd_else);
     135    b->SetInsertPoint(reachingBufferEnd_else);
    154136    // Copy 4 bytes at a time (regardless of step length).
    155     Value * src32 = iBuilder->CreatePointerCast(
    156             iBuilder->CreateGEP(outputBufferBasePtr, phiSrcOffset),
    157             iBuilder->getInt32Ty()->getPointerTo());
    158     Value * dst32 = iBuilder->CreatePointerCast(
    159             iBuilder->CreateGEP(outputBufferBasePtr, phiDstOffset),
    160             iBuilder->getInt32Ty()->getPointerTo());
     137    Value * src32 = b->CreatePointerCast(
     138            b->CreateGEP(outputBufferBasePtr, phiSrcOffset),
     139            b->getInt32Ty()->getPointerTo());
     140    Value * dst32 = b->CreatePointerCast(
     141            b->CreateGEP(outputBufferBasePtr, phiDstOffset),
     142            b->getInt32Ty()->getPointerTo());
    161143    // Force unaligned load/store of an int32.
    162     iBuilder->CreateAlignedStore(iBuilder->CreateAlignedLoad(src32, 1), dst32, 1);
    163     newSrcOffset = iBuilder->CreateAnd(
    164             iBuilder->CreateAdd(phiSrcOffset, copyStep),
     144    b->CreateAlignedStore(b->CreateAlignedLoad(src32, 1), dst32, 1);
     145    newSrcOffset = b->CreateAnd(
     146            b->CreateAdd(phiSrcOffset, copyStep),
    165147            bufferSizeMask
    166148            );
    167     newDstOffset = iBuilder->CreateAnd(
    168             iBuilder->CreateAdd(phiDstOffset, copyStep),
     149    newDstOffset = b->CreateAnd(
     150            b->CreateAdd(phiDstOffset, copyStep),
    169151            bufferSizeMask
    170152            );
    171153    phiSrcOffset->addIncoming(newSrcOffset, reachingBufferEnd_else);
    172154    phiDstOffset->addIncoming(newDstOffset, reachingBufferEnd_else);
    173     phiIter->addIncoming(iBuilder->CreateAdd(phiIter, copyStep), reachingBufferEnd_else);
    174     iBuilder->CreateBr(cpyLoopCond);
     155    phiIter->addIncoming(b->CreateAdd(phiIter, copyStep), reachingBufferEnd_else);
     156    b->CreateBr(cpyLoopCond);
    175157
    176     iBuilder->SetInsertPoint(cpyLoopExit);
    177     outputItems = iBuilder->CreateAdd(outputItems, matchLength);
    178     iBuilder->setProducedItemCount("outputStream", outputItems);
     158    b->SetInsertPoint(cpyLoopExit);
     159    outputItems = b->CreateAdd(outputItems, matchLength);
     160    b->setProducedItemCount("outputStream", outputItems);
    179161
    180     Value * newInputIndex = iBuilder->CreateAdd(phiInputIndex, iBuilder->getSize(1));
     162    Value * newInputIndex = b->CreateAdd(phiInputIndex, b->getSize(1));
    181163    phiInputIndex->addIncoming(newInputIndex, cpyLoopExit);
    182     iBuilder->CreateUnlikelyCondBr(
    183             iBuilder->CreateICmpEQ(newInputIndex, iterations),
     164    b->CreateUnlikelyCondBr(
     165            b->CreateICmpEQ(newInputIndex, iterations),
    184166            loopExit,
    185167            loopBody
    186168            );
    187169
    188     iBuilder->SetInsertPoint(loopExit);
    189 //#ifndef NDEBUG
    190 //    iBuilder->CallPrintInt("Decompressed bytes", iBuilder->getProducedItemCount("outputStream"));
    191 //#endif
     170    b->SetInsertPoint(loopExit);
     171    return numOfStrides;
    192172}
    193173
    194174
    195175LZ4ByteStreamDecoderKernel::LZ4ByteStreamDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, size_t bufferSize)
    196 : BlockOrientedKernel("lz4ByteStreamDecoder",
     176: MultiBlockKernel("lz4ByteStreamDecoder",
    197177    // Inputs
    198178    {Binding{iBuilder->getStreamSetTy(2, 32), "literalIndexes"},
    199179     Binding{iBuilder->getStreamSetTy(2, 32), "matchIndexes"},
    200      Binding{iBuilder->getStreamSetTy(1, 8), "inputStream", UnknownRate(), LookBehind(65536)}},
     180     Binding{iBuilder->getStreamSetTy(1, 8), "inputStream", FixedRate(), { Deferred(), LookBehind(65536) }}},
    201181    // Outputs
    202182    {Binding{iBuilder->getStreamSetTy(1, 8), "outputStream", UnknownRate()}},
  • icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.h

    r5440 r5755  
    1414namespace kernel {
    1515
    16 class LZ4ByteStreamDecoderKernel : public BlockOrientedKernel {
     16class LZ4ByteStreamDecoderKernel final : public MultiBlockKernel {
    1717public:
    1818    LZ4ByteStreamDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, size_t bufferSize);
    1919protected:
    20     void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
     20    llvm::Value * generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * numOfStrides) override;
    2121private:
    2222    size_t mBufferSize;
  • icGREP/icgrep-devel/icgrep/kernels/lz4_index_decoder.cpp

    r5706 r5755  
    1818
    1919#define printRTDebugMsg(MSG) \
    20     if (DEBUG_RT_PRINT) iBuilder->CallPrintMsgToStderr(MSG)
     20    if (DEBUG_RT_PRINT) b->CallPrintMsgToStderr(MSG)
    2121
    2222#define printRTDebugInt(NAME, X) \
    23     if (DEBUG_RT_PRINT) iBuilder->CallPrintIntToStderr(NAME, X)
     23    if (DEBUG_RT_PRINT) b->CallPrintIntToStderr(NAME, X)
    2424
    2525#define printGlobalPos() \
    26     printRTDebugInt("GlobalPos", iBuilder->CreateAdd(blockStartPos, iBuilder->CreateLoad(sOffset)))
     26    printRTDebugInt("GlobalPos", b->CreateAdd(blockStartPos, b->CreateLoad(sOffset)))
    2727
    2828namespace {
    2929
    30 Value * generateBitswap(const std::unique_ptr<KernelBuilder> & iBuilder, Value * v) {
    31     Value * bswapFunc = Intrinsic::getDeclaration(iBuilder->getModule(),
     30Value * generateBitswap(const std::unique_ptr<KernelBuilder> & b, Value * v) {
     31    Value * bswapFunc = Intrinsic::getDeclaration(b->getModule(),
    3232            Intrinsic::bswap, v->getType());
    33     return iBuilder->CreateCall(bswapFunc, {v});
    34 }
    35 
    36 Value * selectMin(const std::unique_ptr<KernelBuilder> & iBuilder, Value * a, Value * b) {
    37     return iBuilder->CreateSelect(iBuilder->CreateICmpULT(a, b), a, b);
    38 }
    39 
    40 Value * createStackVar(const std::unique_ptr<KernelBuilder> & iBuilder, Type * type, StringRef name, Value * initializer = nullptr) {
    41     Value * var = iBuilder->CreateAlloca(type, nullptr, name);
     33    return b->CreateCall(bswapFunc, {v});
     34}
     35
     36Value * createStackVar(const std::unique_ptr<KernelBuilder> & b, Type * type, StringRef name, Value * initializer = nullptr) {
     37    Value * var = b->CreateAlloca(type, nullptr, name);
    4238    if (initializer) {
    43         iBuilder->CreateStore(initializer, var);
     39        b->CreateStore(initializer, var);
    4440    } else {
    45         iBuilder->CreateStore(ConstantInt::get(type, 0), var);
     41        b->CreateStore(ConstantInt::get(type, 0), var);
    4642    }
    4743    return var;
    4844}
    4945
    50 void incStackVar(const std::unique_ptr<KernelBuilder> & iBuilder, Value * svar, Value * increment = nullptr) {
    51     Value * value = iBuilder->CreateLoad(svar);
     46void incStackVar(const std::unique_ptr<KernelBuilder> & b, Value * svar, Value * increment = nullptr) {
     47    Value * value = b->CreateLoad(svar);
    5248    if (increment) {
    53         value = iBuilder->CreateAdd(value, increment);
     49        value = b->CreateAdd(value, increment);
    5450    } else {
    55         value = iBuilder->CreateAdd(value, ConstantInt::get(value->getType(), 1));
     51        value = b->CreateAdd(value, ConstantInt::get(value->getType(), 1));
    5652    }
    57     iBuilder->CreateStore(value, svar);
    58 }
    59 
    60 Value * getOutputPtr(const std::unique_ptr<KernelBuilder> & iBuilder, Value * blockStartPtr, Value * offset) {
    61     return iBuilder->CreateGEP(
    62             iBuilder->CreatePointerCast(blockStartPtr, iBuilder->getInt32Ty()->getPointerTo()),
     53    b->CreateStore(value, svar);
     54}
     55
     56Value * getOutputPtr(const std::unique_ptr<KernelBuilder> & b, Value * blockStartPtr, Value * offset) {
     57    return b->CreateGEP(
     58            b->CreatePointerCast(blockStartPtr, b->getInt32Ty()->getPointerTo()),
    6359            offset
    6460            );
     
    7066 * Get the offset within the current word.
    7167 */
    72 Value * LZ4IndexDecoderKernel::getWordOffset(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
    73     Value * offset = iBuilder->CreateLoad(sOffset);
     68Value * LZ4IndexDecoderKernel::getWordOffset(const std::unique_ptr<kernel::KernelBuilder> & b) {
     69    Value * offset = b->CreateLoad(sOffset);
    7470    IntegerType * type = cast<IntegerType>(offset->getType());
    7571    Constant * mask = ConstantInt::get(type, wordWidth - 1);
    76     return iBuilder->CreateAnd(offset, mask);
     72    return b->CreateAnd(offset, mask);
    7773}
    7874
     
    8076 * Get the offset of the start of the current word.
    8177 */
    82 Value * LZ4IndexDecoderKernel::getWordStartOffset(const std::unique_ptr<KernelBuilder> & iBuilder) {
    83     Value * offset = iBuilder->CreateLoad(sOffset);
     78Value * LZ4IndexDecoderKernel::getWordStartOffset(const std::unique_ptr<KernelBuilder> & b) {
     79    Value * offset = b->CreateLoad(sOffset);
    8480    IntegerType * type = cast<IntegerType>(offset->getType());
    8581    Constant * mask = ConstantExpr::getNeg(ConstantInt::get(type, wordWidth));
    86     return iBuilder->CreateAnd(offset, mask);
     82    return b->CreateAnd(offset, mask);
    8783}
    8884
     
    9187 * If offset is not provided, load the current byte by default.
    9288 */
    93 Value * LZ4IndexDecoderKernel::loadRawByte(const std::unique_ptr<KernelBuilder> & iBuilder, Value * offset) {
    94     Value * blockStartPtr = iBuilder->CreatePointerCast(
    95             iBuilder->getInputStreamBlockPtr("byteStream", iBuilder->getInt32(0)),
    96             iBuilder->getInt8PtrTy()
     89Value * LZ4IndexDecoderKernel::loadRawByte(const std::unique_ptr<KernelBuilder> & b, Value * offset) {
     90    Value * blockStartPtr = b->CreatePointerCast(
     91            b->getInputStreamBlockPtr("byteStream", b->getInt32(0)),
     92            b->getInt8PtrTy()
    9793            );
    9894    if (offset == nullptr) {
    99         offset = iBuilder->CreateLoad(sOffset);
     95        offset = b->CreateLoad(sOffset);
    10096    }
    101     Value * ptr = iBuilder->CreateGEP(blockStartPtr, offset);
    102     return iBuilder->CreateLoad(ptr);
     97    Value * ptr = b->CreateGEP(blockStartPtr, offset);
     98    return b->CreateLoad(ptr);
    10399}
    104100
     
    110106 * cleared  = ....111
    111107 */
    112 void LZ4IndexDecoderKernel::setExtenderUntilOffset(const std::unique_ptr<KernelBuilder> & iBuilder) {
     108void LZ4IndexDecoderKernel::setExtenderUntilOffset(const std::unique_ptr<KernelBuilder> & b) {
    113109    // Little-endian, offset counts from LSB
    114110    // extender = extender | ((1 << offset) - 1)
    115     Value * extender = iBuilder->CreateLoad(sExtender);
    116     Value * wordOffset = iBuilder->CreateZExt(
    117             getWordOffset(iBuilder),
    118             iBuilder->getSizeTy()
    119             );
    120     Value * one = iBuilder->getSize(1);
    121     Value * mask = iBuilder->CreateSub(
    122             iBuilder->CreateShl(one, wordOffset),
     111    Value * extender = b->CreateLoad(sExtender);
     112    Value * wordOffset = b->CreateZExt(
     113            getWordOffset(b),
     114            b->getSizeTy()
     115            );
     116    Value * one = b->getSize(1);
     117    Value * mask = b->CreateSub(
     118            b->CreateShl(one, wordOffset),
    123119            one);
    124     extender = iBuilder->CreateOr(extender, mask);
    125     iBuilder->CreateStore(extender, sExtender);
     120    extender = b->CreateOr(extender, mask);
     121    b->CreateStore(extender, sExtender);
    126122}
    127123
     
    131127 * Called when we potentially reach a new word.  Usually followed by setExtenderUntilOffset.
    132128 */
    133 void LZ4IndexDecoderKernel::loadCurrentExtender(const std::unique_ptr<KernelBuilder> & iBuilder) {
    134     Value * offset = iBuilder->CreateLoad(sOffset);
     129void LZ4IndexDecoderKernel::loadCurrentExtender(const std::unique_ptr<KernelBuilder> & b) {
     130    Value * offset = b->CreateLoad(sOffset);
    135131    IntegerType * type = cast<IntegerType>(offset->getType());
    136132    ConstantInt * shift = ConstantInt::get(type, std::log2(wordWidth));
    137     Value * shiftedOffset = iBuilder->CreateLShr(offset, shift);
    138     Value * extender = iBuilder->CreateExtractElement(extenders, shiftedOffset);
    139     iBuilder->CreateStore(extender, sExtender);
    140 }
    141 
    142 
    143 void LZ4IndexDecoderKernel::generateProduceOutput(const std::unique_ptr<KernelBuilder> &iBuilder) {
    144     Value * producedItem = iBuilder->getProducedItemCount("literalIndexes");
     133    Value * shiftedOffset = b->CreateLShr(offset, shift);
     134    Value * extender = b->CreateExtractElement(extenders, shiftedOffset);
     135    b->CreateStore(extender, sExtender);
     136}
     137
     138
     139void LZ4IndexDecoderKernel::generateProduceOutput(const std::unique_ptr<KernelBuilder> &b) {
     140    Value * producedItem = b->getProducedItemCount("literalIndexes");
    145141
    146142//#ifndef NDEBUG
    147 //    iBuilder->CallPrintInt("ProducedItem", producedItem);
     143//    b->CallPrintInt("ProducedItem", producedItem);
    148144//    // LiteralStart is adjusted to be relative to the block start, so that
    149145//    // the output can be compared against that of the reference implementation.
    150 //    Value * literalStart = iBuilder->CreateSub(iBuilder->getScalarField("LiteralStart"), iBuilder->getScalarField("LZ4BlockStart"));
    151 //    iBuilder->CallPrintInt("LiteralStart", literalStart);
    152 //    iBuilder->CallPrintInt("LiteralLength", iBuilder->getScalarField("LiteralLength"));
    153 //    iBuilder->CallPrintInt("MatchOffset", iBuilder->getScalarField("MatchOffset"));
    154 //    iBuilder->CallPrintInt("MatchLength", iBuilder->getScalarField("MatchLength"));
     146//    Value * literalStart = b->CreateSub(b->getScalarField("LiteralStart"), b->getScalarField("LZ4BlockStart"));
     147//    b->CallPrintInt("LiteralStart", literalStart);
     148//    b->CallPrintInt("LiteralLength", b->getScalarField("LiteralLength"));
     149//    b->CallPrintInt("MatchOffset", b->getScalarField("MatchOffset"));
     150//    b->CallPrintInt("MatchLength", b->getScalarField("MatchLength"));
    155151//#endif
    156152    printRTDebugMsg("--------------");
    157153
    158     Value * outputOffset = iBuilder->CreateAnd(
    159             iBuilder->CreateTrunc(producedItem, iBuilder->getInt32Ty()),
    160             iBuilder->getInt32(iBuilder->getBitBlockWidth() - 1)
    161             );  // producedItem % blockWidth (as blockWidth is always a power of 2)
    162     Value * literalStartPtr = getOutputPtr(iBuilder,
    163             iBuilder->getOutputStreamBlockPtr("literalIndexes", iBuilder->getInt32(0)), outputOffset);
    164     Value * literalLengthPtr = getOutputPtr(iBuilder,
    165             iBuilder->getOutputStreamBlockPtr("literalIndexes", iBuilder->getInt32(1)), outputOffset);
    166     Value * matchOffsetPtr = getOutputPtr(iBuilder,
    167             iBuilder->getOutputStreamBlockPtr("matchIndexes", iBuilder->getInt32(0)), outputOffset);
    168     Value * matchLengthPtr = getOutputPtr(iBuilder,
    169             iBuilder->getOutputStreamBlockPtr("matchIndexes", iBuilder->getInt32(1)), outputOffset);
    170     iBuilder->CreateStore(iBuilder->getScalarField("LiteralStart"), literalStartPtr);
    171     iBuilder->CreateStore(iBuilder->getScalarField("LiteralLength"), literalLengthPtr);
    172     iBuilder->CreateStore(iBuilder->getScalarField("MatchOffset"), matchOffsetPtr);
    173     iBuilder->CreateStore(iBuilder->getScalarField("MatchLength"), matchLengthPtr);
    174     iBuilder->setProducedItemCount("literalIndexes", iBuilder->CreateAdd(producedItem, iBuilder->getSize(1)));
     154    Value * outputOffset = b->CreateAnd(b->CreateTrunc(producedItem, b->getInt32Ty()), b->getInt32(b->getBitBlockWidth() - 1));  // producedItem % blockWidth (as blockWidth is always a power of 2)
     155    Value * baseLiteralStartPtr = b->getOutputStreamBlockPtr("literalIndexes", b->getInt32(0));
     156
     157    Value * literalStartPtr = getOutputPtr(b, baseLiteralStartPtr, outputOffset);
     158    Value * literalLengthPtr = getOutputPtr(b,
     159            b->getOutputStreamBlockPtr("literalIndexes", b->getInt32(1)), outputOffset);
     160    Value * matchOffsetPtr = getOutputPtr(b,
     161            b->getOutputStreamBlockPtr("matchIndexes", b->getInt32(0)), outputOffset);
     162    Value * matchLengthPtr = getOutputPtr(b,
     163            b->getOutputStreamBlockPtr("matchIndexes", b->getInt32(1)), outputOffset);
     164
     165    b->CreateStore(b->getScalarField("LiteralStart"), literalStartPtr);
     166    b->CreateStore(b->getScalarField("LiteralLength"), literalLengthPtr);
     167    b->CreateStore(b->getScalarField("MatchOffset"), matchOffsetPtr);
     168    b->CreateStore(b->getScalarField("MatchLength"), matchLengthPtr);
     169    b->setProducedItemCount("literalIndexes", b->CreateAdd(producedItem, b->getSize(1)));
    175170    // matchIndexes has a fixed ratio of 1:1 w.r.t. literalIndexes.
    176171}
    177172
    178173
    179 void LZ4IndexDecoderKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
    180     BasicBlock * entry_block = iBuilder->GetInsertBlock();
    181     BasicBlock * exit_block = iBuilder->CreateBasicBlock("exit");
     174void LZ4IndexDecoderKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
     175    BasicBlock * entry_block = b->GetInsertBlock();
     176    BasicBlock * exit_block = b->CreateBasicBlock("exit");
    182177
    183178    // %entry
    184     iBuilder->SetInsertPoint(entry_block);
     179    b->SetInsertPoint(entry_block);
    185180    printRTDebugMsg("entry");
    186181    // Global positions in the byte stream.
    187     Value * blockNo = iBuilder->getScalarField("BlockNo");
    188     blockStartPos = iBuilder->CreateMul(blockNo, iBuilder->getInt32(iBuilder->getBitBlockWidth()), "blockStartPos");
    189     extenders = iBuilder->CreateBitCast(
    190             iBuilder->loadInputStreamBlock("extenders", iBuilder->getInt32(0)),
    191             VectorType::get(iBuilder->getSizeTy(), iBuilder->getBitBlockWidth() / iBuilder->getSizeTy()->getBitWidth()),
     182    Value * blockNo = b->getScalarField("BlockNo");
     183    blockStartPos = b->CreateMul(blockNo, b->getInt32(b->getBitBlockWidth()), "blockStartPos");
     184    extenders = b->CreateBitCast(
     185            b->loadInputStreamBlock("extenders", b->getInt32(0)),
     186            VectorType::get(b->getSizeTy(), b->getBitBlockWidth() / b->getSizeTy()->getBitWidth()),
    192187            "extenders");
    193188    // Create a series of stack variables which will be promoted by mem2reg.
    194     sOffset = createStackVar(iBuilder, iBuilder->getInt32Ty(), "offset");
     189    sOffset = createStackVar(b, b->getInt32Ty(), "offset");
    195190    // tempLength has different meanings in different states.
    196     sTempLength = createStackVar(iBuilder, iBuilder->getInt32Ty(), "tempLength", iBuilder->getScalarField("TempLength"));
    197     sTempCount = createStackVar(iBuilder, iBuilder->getInt32Ty(), "tempCount", iBuilder->getScalarField("TempCount"));
    198     sState = createStackVar(iBuilder, iBuilder->getInt8Ty(), "state", iBuilder->getScalarField("State"));
    199     sExtender = createStackVar(iBuilder, iBuilder->getSizeTy(), "extender",
    200             iBuilder->CreateExtractElement(extenders, iBuilder->getInt32(0)));
    201 
    202     BasicBlock * skippingBytes = iBuilder->CreateBasicBlock("skipping_bytes");
    203     BasicBlock * dispatch = iBuilder->CreateBasicBlock("dispatch");
    204 
    205     iBuilder->CreateCondBr(
    206             iBuilder->CreateICmpUGT(iBuilder->getScalarField("BytesToSkip"), iBuilder->getInt32(0)),
     191    sTempLength = createStackVar(b, b->getInt32Ty(), "tempLength", b->getScalarField("TempLength"));
     192    sTempCount = createStackVar(b, b->getInt32Ty(), "tempCount", b->getScalarField("TempCount"));
     193    sState = createStackVar(b, b->getInt8Ty(), "state", b->getScalarField("State"));
     194    sExtender = createStackVar(b, b->getSizeTy(), "extender",
     195            b->CreateExtractElement(extenders, b->getInt32(0)));
     196
     197    BasicBlock * skippingBytes = b->CreateBasicBlock("skipping_bytes");
     198    BasicBlock * dispatch = b->CreateBasicBlock("dispatch");
     199
     200    b->CreateCondBr(
     201            b->CreateICmpUGT(b->getScalarField("BytesToSkip"), b->getInt32(0)),
    207202            skippingBytes, dispatch
    208203            );
    209204
    210205    // %skipping_bytes
    211     generateSkippingBytes(iBuilder, skippingBytes, exit_block);
     206    generateSkippingBytes(b, skippingBytes, exit_block);
    212207    // Insert point is at the end of skippingBytes.
    213     iBuilder->CreateBr(dispatch);
     208    b->CreateBr(dispatch);
    214209
    215210    // %dispatch
     
    217212
    218213    // %at_block_checksum
    219     BasicBlock * atBlockChecksum = iBuilder->CreateBasicBlock("at_block_checksum");
    220     generateAtBlockChecksum(iBuilder, atBlockChecksum, skippingBytes);
     214    BasicBlock * atBlockChecksum = b->CreateBasicBlock("at_block_checksum");
     215    generateAtBlockChecksum(b, atBlockChecksum, skippingBytes);
    221216 
    222217    // %at_block_size
    223     BasicBlock * atBlockSize = iBuilder->CreateBasicBlock("at_block_size");
    224     generateAtBlockSize(iBuilder, atBlockSize, skippingBytes, exit_block);
     218    BasicBlock * atBlockSize = b->CreateBasicBlock("at_block_size");
     219    generateAtBlockSize(b, atBlockSize, skippingBytes, exit_block);
    225220
    226221    // %at_token
    227     BasicBlock * atToken = iBuilder->CreateBasicBlock("at_token");
    228     generateAtToken(iBuilder, atToken, exit_block);
     222    BasicBlock * atToken = b->CreateBasicBlock("at_token");
     223    generateAtToken(b, atToken, exit_block);
    229224
    230225    // %extending_literal_length
    231     BasicBlock * extendingLiteralLen = iBuilder->CreateBasicBlock("extending_literal_length");
    232     generateExtendingLiteralLen(iBuilder, extendingLiteralLen, exit_block);
     226    BasicBlock * extendingLiteralLen = b->CreateBasicBlock("extending_literal_length");
     227    generateExtendingLiteralLen(b, extendingLiteralLen, exit_block);
    233228
    234229    // %at_literals
    235     BasicBlock * atLiterals = iBuilder->CreateBasicBlock("at_literals");
    236     generateAtLiterals(iBuilder, atLiterals);
    237     iBuilder->CreateBr(skippingBytes);
     230    BasicBlock * atLiterals = b->CreateBasicBlock("at_literals");
     231    generateAtLiterals(b, atLiterals);
     232    b->CreateBr(skippingBytes);
    238233
    239234    // %at_first_offset
     
    241236    // If the whole LZ4 block is done, process the (optional) checksum.
    242237    // Otherwise, go around to process the next sequence.
    243     BasicBlock * atOffset1 = iBuilder->CreateBasicBlock("at_first_offset");
    244     iBuilder->SetInsertPoint(atOffset1);
    245     Value * nowGlobalPos = iBuilder->CreateAdd(blockStartPos, iBuilder->CreateLoad(sOffset));
    246     BasicBlock * blockEnd_else = iBuilder->CreateBasicBlock("block_end_else");
     238    BasicBlock * atOffset1 = b->CreateBasicBlock("at_first_offset");
     239    b->SetInsertPoint(atOffset1);
     240    Value * nowGlobalPos = b->CreateAdd(blockStartPos, b->CreateLoad(sOffset));
     241    BasicBlock * blockEnd_else = b->CreateBasicBlock("block_end_else");
    247242    // Conditional branch inserted at the end of the last block.
    248     iBuilder->CreateUnlikelyCondBr(
    249             iBuilder->CreateICmpEQ(nowGlobalPos, iBuilder->getScalarField("LZ4BlockEnd")),
     243    b->CreateUnlikelyCondBr(
     244            b->CreateICmpEQ(nowGlobalPos, b->getScalarField("LZ4BlockEnd")),
    250245            atBlockChecksum, blockEnd_else
    251246            );
    252     generateAtFirstOffset(iBuilder, blockEnd_else, exit_block);
     247    generateAtFirstOffset(b, blockEnd_else, exit_block);
    253248
    254249    // %at_second_offset
    255     BasicBlock * atOffset2 = iBuilder->CreateBasicBlock("at_second_offset");
    256     generateAtSecondOffset(iBuilder, atOffset2, exit_block);
     250    BasicBlock * atOffset2 = b->CreateBasicBlock("at_second_offset");
     251    generateAtSecondOffset(b, atOffset2, exit_block);
    257252
    258253    // %extending_match_length
    259     BasicBlock * extendingMatchLen = iBuilder->CreateBasicBlock("extending_match_length");
    260     generateExtendingMatchLen(iBuilder, extendingMatchLen, exit_block);
    261     iBuilder->CreateBr(atToken);
     254    BasicBlock * extendingMatchLen = b->CreateBasicBlock("extending_match_length");
     255    generateExtendingMatchLen(b, extendingMatchLen, exit_block);
     256    b->CreateBr(atToken);
    262257
    263258    // Indirect branching.
    264     iBuilder->SetInsertPoint(dispatch);
     259    b->SetInsertPoint(dispatch);
    265260    printRTDebugMsg("dispatch");
    266261    // The order must comply with enum State.
     
    269264             BlockAddress::get(atOffset1), BlockAddress::get(atOffset2), BlockAddress::get(extendingMatchLen), BlockAddress::get(atBlockChecksum)}
    270265            );
    271     Value * target = iBuilder->CreateExtractElement(labels, iBuilder->CreateLoad(sState));
    272     IndirectBrInst * indirectBr = iBuilder->CreateIndirectBr(target);
     266    Value * target = b->CreateExtractElement(labels, b->CreateLoad(sState));
     267    IndirectBrInst * indirectBr = b->CreateIndirectBr(target);
    273268    indirectBr->addDestination(atBlockSize);
    274269    indirectBr->addDestination(atToken);
     
    281276
    282277    // %exit
    283     iBuilder->SetInsertPoint(exit_block);
     278    b->SetInsertPoint(exit_block);
    284279    printRTDebugMsg("exit");
    285     iBuilder->setScalarField("State", iBuilder->CreateLoad(sState));
    286     iBuilder->setScalarField("TempLength", iBuilder->CreateLoad(sTempLength));
    287     iBuilder->setScalarField("TempCount", iBuilder->CreateLoad(sTempCount));
    288     iBuilder->setScalarField("BlockNo", iBuilder->CreateAdd(blockNo, iBuilder->getInt32(1)));
     280    b->setScalarField("State", b->CreateLoad(sState));
     281    b->setScalarField("TempLength", b->CreateLoad(sTempLength));
     282    b->setScalarField("TempCount", b->CreateLoad(sTempCount));
     283    b->setScalarField("BlockNo", b->CreateAdd(blockNo, b->getInt32(1)));
    289284    // When the kernel builder uses indirectbr, doBlock is not a separate function.
    290285    // Hence, we branch to a new basic block and fall through instead of returning.
    291     BasicBlock * end_block = iBuilder->CreateBasicBlock("end_of_block");
    292     iBuilder->CreateBr(end_block);
    293     iBuilder->SetInsertPoint(end_block);
    294 }
    295 
    296 
    297 void LZ4IndexDecoderKernel::generateBoundaryDetection(const std::unique_ptr<KernelBuilder> & iBuilder, State state, BasicBlock * exit_block, bool updateExtenderWord) {
     286    BasicBlock * end_block = b->CreateBasicBlock("end_of_block");
     287    b->CreateBr(end_block);
     288    b->SetInsertPoint(end_block);
     289}
     290
     291
     292void LZ4IndexDecoderKernel::generateBoundaryDetection(const std::unique_ptr<KernelBuilder> & b, State state, BasicBlock * exit_block, bool updateExtenderWord) {
    298293    if (updateExtenderWord) {
    299         BasicBlock * wordBoundary_then = iBuilder->CreateBasicBlock("word_boundary_then-" + StateLabels.at(state));
    300         BasicBlock * blockBoundary_else = iBuilder->CreateBasicBlock("block_boundary_else-" + StateLabels.at(state));
    301         BasicBlock * wordBoundary_cont = iBuilder->CreateBasicBlock("word_boundary_cont-" + StateLabels.at(state));
    302         iBuilder->CreateUnlikelyCondBr(
    303                 iBuilder->CreateICmpEQ(getWordOffset(iBuilder), iBuilder->getInt32(0)),
     294        BasicBlock * wordBoundary_then = b->CreateBasicBlock("word_boundary_then-" + StateLabels.at(state));
     295        BasicBlock * blockBoundary_else = b->CreateBasicBlock("block_boundary_else-" + StateLabels.at(state));
     296        BasicBlock * wordBoundary_cont = b->CreateBasicBlock("word_boundary_cont-" + StateLabels.at(state));
     297        b->CreateUnlikelyCondBr(
     298                b->CreateICmpEQ(getWordOffset(b), b->getInt32(0)),
    304299                wordBoundary_then, wordBoundary_cont
    305300                );
    306301
    307         iBuilder->SetInsertPoint(wordBoundary_then);
    308         iBuilder->CreateUnlikelyCondBr(
    309                 iBuilder->CreateICmpEQ(iBuilder->CreateLoad(sOffset), iBuilder->getInt32(iBuilder->getBitBlockWidth())),
     302        b->SetInsertPoint(wordBoundary_then);
     303        b->CreateUnlikelyCondBr(
     304                b->CreateICmpEQ(b->CreateLoad(sOffset), b->getInt32(b->getBitBlockWidth())),
    310305                exit_block, blockBoundary_else
    311306                );
    312307
    313308        // Reaching word boundary but not block boundary.  Update the extender word as requested.
    314         iBuilder->SetInsertPoint(blockBoundary_else);
    315         loadCurrentExtender(iBuilder);
    316         iBuilder->CreateBr(wordBoundary_cont);
     309        b->SetInsertPoint(blockBoundary_else);
     310        loadCurrentExtender(b);
     311        b->CreateBr(wordBoundary_cont);
    317312
    318313        // Leave the insert point at the end and return.
    319         iBuilder->SetInsertPoint(wordBoundary_cont);
     314        b->SetInsertPoint(wordBoundary_cont);
    320315    } else {
    321         BasicBlock * blockBoundary_cont = iBuilder->CreateBasicBlock("block_boundary_cont-" + StateLabels.at(state));
    322         iBuilder->CreateUnlikelyCondBr(
    323                 iBuilder->CreateICmpEQ(iBuilder->CreateLoad(sOffset), iBuilder->getInt32(iBuilder->getBitBlockWidth())),
     316        BasicBlock * blockBoundary_cont = b->CreateBasicBlock("block_boundary_cont-" + StateLabels.at(state));
     317        b->CreateUnlikelyCondBr(
     318                b->CreateICmpEQ(b->CreateLoad(sOffset), b->getInt32(b->getBitBlockWidth())),
    324319                exit_block, blockBoundary_cont
    325320                );
    326321        // Leave the insert point at the end and return.
    327         iBuilder->SetInsertPoint(blockBoundary_cont);
     322        b->SetInsertPoint(blockBoundary_cont);
    328323    }
    329324}
    330325
    331326
    332 void LZ4IndexDecoderKernel::generateSkippingBytes(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, BasicBlock * bb, BasicBlock * exit_block) {
    333     iBuilder->SetInsertPoint(bb);
     327void LZ4IndexDecoderKernel::generateSkippingBytes(const std::unique_ptr<kernel::KernelBuilder> & b, BasicBlock * bb, BasicBlock * exit_block) {
     328    b->SetInsertPoint(bb);
    334329    printRTDebugMsg("skipping bytes");
    335330
    336     Value * remainingBytesInBlock = iBuilder->CreateSub(
    337             iBuilder->getInt32(iBuilder->getBitBlockWidth()), iBuilder->CreateLoad(sOffset)
    338             );
    339     Value * remainingBytesToSkip = iBuilder->getScalarField("BytesToSkip");
    340     Value * advanceDist = selectMin(iBuilder, remainingBytesInBlock, remainingBytesToSkip);
    341     remainingBytesToSkip = iBuilder->CreateSub(remainingBytesToSkip, advanceDist);
    342     incStackVar(iBuilder, sOffset, advanceDist);
    343     iBuilder->setScalarField("BytesToSkip", remainingBytesToSkip);
    344 
    345     generateBoundaryDetection(iBuilder, State::SKIPPING_BYTES, exit_block);
     331    Value * remainingBytesInBlock = b->CreateSub(
     332            b->getInt32(b->getBitBlockWidth()), b->CreateLoad(sOffset)
     333            );
     334    Value * remainingBytesToSkip = b->getScalarField("BytesToSkip");
     335    Value * advanceDist = b->CreateUMin(remainingBytesInBlock, remainingBytesToSkip);
     336    remainingBytesToSkip = b->CreateSub(remainingBytesToSkip, advanceDist);
     337    incStackVar(b, sOffset, advanceDist);
     338    b->setScalarField("BytesToSkip", remainingBytesToSkip);
     339
     340    generateBoundaryDetection(b, State::SKIPPING_BYTES, exit_block);
    346341    // Falls through.
    347342}
    348343
    349344
    350 void LZ4IndexDecoderKernel::generateAtBlockSize(const std::unique_ptr<KernelBuilder> &iBuilder, BasicBlock * bb, BasicBlock * skippingBytes, BasicBlock * exit_block) {
    351     iBuilder->CreateBr(bb);
    352     iBuilder->SetInsertPoint(bb);
     345void LZ4IndexDecoderKernel::generateAtBlockSize(const std::unique_ptr<KernelBuilder> &b, BasicBlock * bb, BasicBlock * skippingBytes, BasicBlock * exit_block) {
     346    b->CreateBr(bb);
     347    b->SetInsertPoint(bb);
    353348    printRTDebugMsg("scanning block size");
    354349    printGlobalPos();
     
    360355
    361356    // A do-while loop.
    362     BasicBlock * loopBody = iBuilder->CreateBasicBlock("blocksize_loop_body");
    363     BasicBlock * loopExit = iBuilder->CreateBasicBlock("blocksize_loop_exit");
    364     iBuilder->CreateBr(loopBody);
    365 
    366     iBuilder->SetInsertPoint(loopBody);
    367     Value * byte = loadRawByte(iBuilder);
    368     Value * newTempLength = iBuilder->CreateAdd(
    369             iBuilder->CreateShl(iBuilder->CreateLoad(sTempLength), iBuilder->getInt32(8)),
    370             iBuilder->CreateZExt(byte, iBuilder->getInt32Ty())
    371             );
    372     iBuilder->CreateStore(newTempLength, sTempLength);
    373     incStackVar(iBuilder, sTempCount);
    374     incStackVar(iBuilder, sOffset);
     357    BasicBlock * loopBody = b->CreateBasicBlock("blocksize_loop_body");
     358    BasicBlock * loopExit = b->CreateBasicBlock("blocksize_loop_exit");
     359    b->CreateBr(loopBody);
     360
     361    b->SetInsertPoint(loopBody);
     362    Value * byte = loadRawByte(b);
     363    Value * newTempLength = b->CreateAdd(
     364            b->CreateShl(b->CreateLoad(sTempLength), b->getInt32(8)),
     365            b->CreateZExt(byte, b->getInt32Ty())
     366            );
     367    b->CreateStore(newTempLength, sTempLength);
     368    incStackVar(b, sTempCount);
     369    incStackVar(b, sOffset);
    375370    // Stop when we read all four bytes or reach the end of the block.
    376     iBuilder->CreateCondBr(
    377             iBuilder->CreateOr(
    378                 iBuilder->CreateICmpEQ(iBuilder->CreateLoad(sTempCount), iBuilder->getInt32(4)),
    379                 iBuilder->CreateICmpEQ(iBuilder->CreateLoad(sOffset), iBuilder->getInt32(iBuilder->getBitBlockWidth()))
     371    b->CreateCondBr(
     372            b->CreateOr(
     373                b->CreateICmpEQ(b->CreateLoad(sTempCount), b->getInt32(4)),
     374                b->CreateICmpEQ(b->CreateLoad(sOffset), b->getInt32(b->getBitBlockWidth()))
    380375                ),
    381376            loopExit, loopBody
    382377            );
    383378
    384     iBuilder->SetInsertPoint(loopExit);
    385     BasicBlock * blockSizeCompleted_then = iBuilder->CreateBasicBlock("blocksize_completed_then");
    386     BasicBlock * blockSizeCompleted_cont = iBuilder->CreateBasicBlock("blocksize_completed_cont");
    387     iBuilder->CreateLikelyCondBr(
    388             iBuilder->CreateICmpEQ(iBuilder->CreateLoad(sTempCount), iBuilder->getInt32(4)),
     379    b->SetInsertPoint(loopExit);
     380    BasicBlock * blockSizeCompleted_then = b->CreateBasicBlock("blocksize_completed_then");
     381    BasicBlock * blockSizeCompleted_cont = b->CreateBasicBlock("blocksize_completed_cont");
     382    b->CreateLikelyCondBr(
     383            b->CreateICmpEQ(b->CreateLoad(sTempCount), b->getInt32(4)),
    389384            blockSizeCompleted_then, blockSizeCompleted_cont
    390385            );
    391386
    392387    // All four bytes of the block size are read in.
    393     iBuilder->SetInsertPoint(blockSizeCompleted_then);
     388    b->SetInsertPoint(blockSizeCompleted_then);
    394389    // Remember to swap the block size back to little-endian.
    395     Value * blockSize = generateBitswap(iBuilder, iBuilder->CreateLoad(sTempLength));
    396     Value * currentPos = iBuilder->CreateAdd(blockStartPos, iBuilder->CreateLoad(sOffset));
    397     iBuilder->setScalarField("LZ4BlockStart", currentPos);
    398     iBuilder->setScalarField("LZ4BlockEnd", iBuilder->CreateAdd(currentPos, blockSize));
     390    Value * blockSize = generateBitswap(b, b->CreateLoad(sTempLength));
     391    Value * currentPos = b->CreateAdd(blockStartPos, b->CreateLoad(sOffset));
     392    b->setScalarField("LZ4BlockStart", currentPos);
     393    b->setScalarField("LZ4BlockEnd", b->CreateAdd(currentPos, blockSize));
    399394    printRTDebugInt("blockSize", blockSize);
    400395
    401     BasicBlock * uncompressedBlock_then = iBuilder->CreateBasicBlock("uncompressed_block_then");
    402     BasicBlock * uncompressedBlock_else = iBuilder->CreateBasicBlock("uncompressed_block_cont");
    403     iBuilder->CreateUnlikelyCondBr(
    404             iBuilder->CreateTrunc(
    405                 iBuilder->CreateLShr(blockSize, iBuilder->getInt32(31)),
    406                 iBuilder->getInt1Ty()
     396    BasicBlock * uncompressedBlock_then = b->CreateBasicBlock("uncompressed_block_then");
     397    BasicBlock * uncompressedBlock_else = b->CreateBasicBlock("uncompressed_block_cont");
     398    b->CreateUnlikelyCondBr(
     399            b->CreateTrunc(
     400                b->CreateLShr(blockSize, b->getInt32(31)),
     401                b->getInt1Ty()
    407402                ),
    408403            uncompressedBlock_then,
     
    410405            );
    411406
    412     iBuilder->SetInsertPoint(uncompressedBlock_then);
    413     Value * realBlockSize = iBuilder->CreateXor(blockSize, iBuilder->getInt32(1L << 31));
    414     iBuilder->setScalarField("LZ4BlockEnd", iBuilder->CreateAdd(currentPos, realBlockSize));
    415     iBuilder->setScalarField("BytesToSkip", realBlockSize);
    416     iBuilder->setScalarField("LiteralStart", currentPos);
    417     iBuilder->setScalarField("LiteralLength", realBlockSize);
     407    b->SetInsertPoint(uncompressedBlock_then);
     408    Value * realBlockSize = b->CreateXor(blockSize, b->getInt32(1L << 31));
     409    b->setScalarField("LZ4BlockEnd", b->CreateAdd(currentPos, realBlockSize));
     410    b->setScalarField("BytesToSkip", realBlockSize);
     411    b->setScalarField("LiteralStart", currentPos);
     412    b->setScalarField("LiteralLength", realBlockSize);
    418413    // No need to set MatchLength/MatchOffset to 0, nor to produce output,
    419414    // because %atBlockChecksum will do so as the last sequence.
    420     iBuilder->CreateStore(iBuilder->getInt8(State::AT_BLOCK_CHECKSUM), sState);
    421     iBuilder->CreateBr(skippingBytes);
    422 
    423     iBuilder->SetInsertPoint(uncompressedBlock_else);
     415    b->CreateStore(b->getInt8(State::AT_BLOCK_CHECKSUM), sState);
     416    b->CreateBr(skippingBytes);
     417
     418    b->SetInsertPoint(uncompressedBlock_else);
    424419    // Reset these temporary values for later use.
    425     iBuilder->CreateStore(iBuilder->getInt32(0), sTempLength);
    426     iBuilder->CreateStore(iBuilder->getInt32(0), sTempCount);
    427     iBuilder->CreateStore(iBuilder->getInt8(State::AT_TOKEN), sState);
     420    b->CreateStore(b->getInt32(0), sTempLength);
     421    b->CreateStore(b->getInt32(0), sTempCount);
     422    b->CreateStore(b->getInt8(State::AT_TOKEN), sState);
    428423    // A block size of 0 is the end mark of the frame. Exit.
    429     iBuilder->CreateUnlikelyCondBr(
    430             iBuilder->CreateICmpEQ(blockSize, ConstantInt::getNullValue(blockSize->getType())),
     424    b->CreateUnlikelyCondBr(
     425            b->CreateICmpEQ(blockSize, ConstantInt::getNullValue(blockSize->getType())),
    431426            exit_block,
    432427            blockSizeCompleted_cont
     
    434429
    435430    // We could be at the boundary whether or not the block size has been completely read.
    436     iBuilder->SetInsertPoint(blockSizeCompleted_cont);
    437     generateBoundaryDetection(iBuilder, State::AT_BLOCK_SIZE, exit_block);
     431    b->SetInsertPoint(blockSizeCompleted_cont);
     432    generateBoundaryDetection(b, State::AT_BLOCK_SIZE, exit_block);
    438433    // Falls through to %at_token.
    439434}
    440435
    441436
    442 void LZ4IndexDecoderKernel::generateAtToken(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, BasicBlock * bb, BasicBlock * exit_block) {
    443     iBuilder->CreateBr(bb);
    444     iBuilder->SetInsertPoint(bb);
     437void LZ4IndexDecoderKernel::generateAtToken(const std::unique_ptr<kernel::KernelBuilder> & b, BasicBlock * bb, BasicBlock * exit_block) {
     438    b->CreateBr(bb);
     439    b->SetInsertPoint(bb);
    445440    printRTDebugMsg("reading token");
    446441
    447     Value * token = loadRawByte(iBuilder);
    448     Value * literalLen = iBuilder->CreateZExt(
    449         iBuilder->CreateLShr(token, iBuilder->getInt8(4)),
    450         iBuilder->getInt32Ty()
     442    Value * token = loadRawByte(b);
     443    Value * literalLen = b->CreateZExt(
     444        b->CreateLShr(token, b->getInt8(4)),
     445        b->getInt32Ty()
    451446        );
    452     Value * matchLen = iBuilder->CreateZExt(
    453         iBuilder->CreateAnd(token, iBuilder->getInt8(0xf)),
    454         iBuilder->getInt32Ty()
     447    Value * matchLen = b->CreateZExt(
     448        b->CreateAnd(token, b->getInt8(0xf)),
     449        b->getInt32Ty()
    455450        );
    456     incStackVar(iBuilder, sOffset);
     451    incStackVar(b, sOffset);
    457452    // Prepare extender word for scanning.
    458     loadCurrentExtender(iBuilder);
    459     setExtenderUntilOffset(iBuilder);
     453    loadCurrentExtender(b);
     454    setExtenderUntilOffset(b);
    460455    // Store the (partial) match length to be extended later.
    461     iBuilder->setScalarField("MatchLength", matchLen);
     456    b->setScalarField("MatchLength", matchLen);
    462457    // Use tempLength to accumulate extended lengths (until at_literals).
    463     iBuilder->CreateStore(literalLen, sTempLength);
    464     iBuilder->CreateStore(iBuilder->getInt8(State::EXTENDING_LITERAL_LENGTH), sState);
    465 
    466     generateBoundaryDetection(iBuilder, State::AT_TOKEN, exit_block);
     458    b->CreateStore(literalLen, sTempLength);
     459    b->CreateStore(b->getInt8(State::EXTENDING_LITERAL_LENGTH), sState);
     460
     461    generateBoundaryDetection(b, State::AT_TOKEN, exit_block);
    467462    // Falls through to %extending_literal_length.
    468463}
    469464
    470465
    471 void LZ4IndexDecoderKernel::generateExtendingLiteralLen(const std::unique_ptr<KernelBuilder> & iBuilder, BasicBlock * bb, BasicBlock * exit_block) {
    472     iBuilder->CreateBr(bb);
    473     iBuilder->SetInsertPoint(bb);
     466void LZ4IndexDecoderKernel::generateExtendingLiteralLen(const std::unique_ptr<KernelBuilder> & b, BasicBlock * bb, BasicBlock * exit_block) {
     467    b->CreateBr(bb);
     468    b->SetInsertPoint(bb);
    474469    printRTDebugMsg("extending literal len");
    475470
    476     Value * wordOffset = getWordOffset(iBuilder);
    477     Value * blockOffset = getWordStartOffset(iBuilder);
    478     Value * literalLen = iBuilder->CreateLoad(sTempLength);
    479     Value * literalExtEnd = iBuilder->CreateTrunc(
    480                 iBuilder->CreateCountForwardZeroes(iBuilder->CreateNot(iBuilder->CreateLoad(sExtender))),
    481                 iBuilder->getInt32Ty());
     471    Value * wordOffset = getWordOffset(b);
     472    Value * blockOffset = getWordStartOffset(b);
     473    Value * literalLen = b->CreateLoad(sTempLength);
     474    Value * literalExtEnd = b->CreateTrunc(
     475                b->CreateCountForwardZeroes(b->CreateNot(b->CreateLoad(sExtender))),
     476                b->getInt32Ty());
    482477    printRTDebugInt("wordOffset", wordOffset);
    483478    printRTDebugInt("literalExtEnd", literalExtEnd);
    484479    // number of extender = literalExtEnd - wordOffset
    485     Value * numExtenders = iBuilder->CreateSub(literalExtEnd, wordOffset);
     480    Value * numExtenders = b->CreateSub(literalExtEnd, wordOffset);
    486481    Value * literalExtReachBoundary =
    487             iBuilder->CreateICmpEQ(literalExtEnd, iBuilder->getInt32(wordWidth));
     482            b->CreateICmpEQ(literalExtEnd, b->getInt32(wordWidth));
    488483    // There are literalExtEnd forward zeroes, we load bytes[literalExtEnd]
    489484    // which is the first non-extender.  If literalExtEnd == 64, we force the
    490485    // load index to be 0 to avoid out-of-bound access, and lastByte will be 0.
    491     Value * loadOffset = iBuilder->CreateSelect(literalExtReachBoundary,
     486    Value * loadOffset = b->CreateSelect(literalExtReachBoundary,
    492487            ConstantInt::getNullValue(literalExtEnd->getType()),
    493488            literalExtEnd);
    494     Value * lastByte = iBuilder->CreateSelect(literalExtReachBoundary,
    495             iBuilder->getInt8(0),
    496             loadRawByte(iBuilder, iBuilder->CreateAdd(blockOffset, loadOffset)));
    497     Value * literalLenExted = iBuilder->CreateICmpUGE(literalLen, iBuilder->getInt32(0xf));
    498     literalLen = iBuilder->CreateSelect(literalLenExted,
    499             iBuilder->CreateAdd(
     489    Value * lastByte = b->CreateSelect(literalExtReachBoundary,
     490            b->getInt8(0),
     491            loadRawByte(b, b->CreateAdd(blockOffset, loadOffset)));
     492    Value * literalLenExted = b->CreateICmpUGE(literalLen, b->getInt32(0xf));
     493    literalLen = b->CreateSelect(literalLenExted,
     494            b->CreateAdd(
    500495                literalLen,
    501                 iBuilder->CreateAdd(
    502                     iBuilder->CreateMul(numExtenders, iBuilder->getInt32(0xff)),
    503                     iBuilder->CreateZExt(lastByte, iBuilder->getInt32Ty())
     496                b->CreateAdd(
     497                    b->CreateMul(numExtenders, b->getInt32(0xff)),
     498                    b->CreateZExt(lastByte, b->getInt32Ty())
    504499                    )
    505500                ),      // literalLen + numExtenders * 255
    506501            literalLen);
    507     wordOffset = iBuilder->CreateSelect(literalLenExted,
     502    wordOffset = b->CreateSelect(literalLenExted,
    508503            literalExtEnd,
    509504            wordOffset);
    510505    // If lastByte is truly the last length byte, we need to advance the cursor by 1.
    511     wordOffset = iBuilder->CreateSelect(
    512             iBuilder->CreateAnd(literalLenExted, iBuilder->CreateNot(literalExtReachBoundary)),
    513             iBuilder->CreateAdd(wordOffset, iBuilder->getInt32(1)),
     506    wordOffset = b->CreateSelect(
     507            b->CreateAnd(literalLenExted, b->CreateNot(literalExtReachBoundary)),
     508            b->CreateAdd(wordOffset, b->getInt32(1)),
    514509            wordOffset
    515510            );
    516     iBuilder->CreateStore(literalLen, sTempLength);
    517     iBuilder->CreateStore(iBuilder->CreateAdd(blockOffset, wordOffset), sOffset);
    518     Value * unfinished = iBuilder->CreateAnd(literalExtReachBoundary, literalLenExted);
    519     Value * newState = iBuilder->CreateSelect(unfinished,
    520             iBuilder->getInt8(State::EXTENDING_LITERAL_LENGTH),
    521             iBuilder->getInt8(State::AT_LITERALS));
    522     iBuilder->CreateStore(newState, sState);
    523 
    524     generateBoundaryDetection(iBuilder, State::EXTENDING_LITERAL_LENGTH, exit_block, true);
    525     BasicBlock * cont_block = iBuilder->CreateBasicBlock("finished_" + StateLabels.at(State::EXTENDING_LITERAL_LENGTH));
     511    b->CreateStore(literalLen, sTempLength);
     512    b->CreateStore(b->CreateAdd(blockOffset, wordOffset), sOffset);
     513    Value * unfinished = b->CreateAnd(literalExtReachBoundary, literalLenExted);
     514    Value * newState = b->CreateSelect(unfinished,
     515            b->getInt8(State::EXTENDING_LITERAL_LENGTH),
     516            b->getInt8(State::AT_LITERALS));
     517    b->CreateStore(newState, sState);
     518
     519    generateBoundaryDetection(b, State::EXTENDING_LITERAL_LENGTH, exit_block, true);
     520    BasicBlock * cont_block = b->CreateBasicBlock("finished_" + StateLabels.at(State::EXTENDING_LITERAL_LENGTH));
    526521    // Insert point is still in wordBoundary block now.
    527522    // See if there are still more extenders.
    528     iBuilder->CreateUnlikelyCondBr(unfinished, bb, cont_block);
    529 
    530     iBuilder->SetInsertPoint(cont_block);
     523    b->CreateUnlikelyCondBr(unfinished, bb, cont_block);
     524
     525    b->SetInsertPoint(cont_block);
    531526    // Falls through to %at_literals.
    532527}
    533528
    534529
    535 void LZ4IndexDecoderKernel::generateAtLiterals(const std::unique_ptr<KernelBuilder> & iBuilder, BasicBlock * bb) {
    536     iBuilder->CreateBr(bb);
    537     iBuilder->SetInsertPoint(bb);
    538 
    539     iBuilder->setScalarField("LiteralStart", iBuilder->CreateAdd(blockStartPos, iBuilder->CreateLoad(sOffset)));
    540     iBuilder->setScalarField("LiteralLength", iBuilder->CreateLoad(sTempLength));
    541     iBuilder->setScalarField("BytesToSkip", iBuilder->CreateLoad(sTempLength));
    542     iBuilder->CreateStore(iBuilder->getInt8(State::AT_FIRST_OFFSET), sState);
     530void LZ4IndexDecoderKernel::generateAtLiterals(const std::unique_ptr<KernelBuilder> & b, BasicBlock * bb) {
     531    b->CreateBr(bb);
     532    b->SetInsertPoint(bb);
     533    b->setScalarField("LiteralStart", b->CreateAdd(blockStartPos, b->CreateLoad(sOffset)));
     534    b->setScalarField("LiteralLength", b->CreateLoad(sTempLength));
     535    b->setScalarField("BytesToSkip", b->CreateLoad(sTempLength));
     536    b->CreateStore(b->getInt8(State::AT_FIRST_OFFSET), sState);
    543537
    544538    // No boundary detection here as we do not advance the cursor.
     
    547541
    548542
    549 void LZ4IndexDecoderKernel::generateAtFirstOffset(const std::unique_ptr<KernelBuilder> &iBuilder, BasicBlock * bb, BasicBlock * exit_block) {
    550     iBuilder->SetInsertPoint(bb);
     543void LZ4IndexDecoderKernel::generateAtFirstOffset(const std::unique_ptr<KernelBuilder> &b, BasicBlock * bb, BasicBlock * exit_block) {
     544    b->SetInsertPoint(bb);
    551545    printRTDebugMsg("reading first offset");
    552546
    553     Value * byte = iBuilder->CreateZExt(loadRawByte(iBuilder), iBuilder->getInt32Ty());
     547    Value * byte = b->CreateZExt(loadRawByte(b), b->getInt32Ty());
    554548    // Use tempLength to store partial offset.
    555     iBuilder->CreateStore(byte, sTempLength);
    556     incStackVar(iBuilder, sOffset);
    557     iBuilder->CreateStore(iBuilder->getInt8(State::AT_SECOND_OFFSET), sState);
    558 
    559     generateBoundaryDetection(iBuilder, State::AT_FIRST_OFFSET, exit_block);
     549    b->CreateStore(byte, sTempLength);
     550    incStackVar(b, sOffset);
     551    b->CreateStore(b->getInt8(State::AT_SECOND_OFFSET), sState);
     552
     553    generateBoundaryDetection(b, State::AT_FIRST_OFFSET, exit_block);
    560554    // Falls through to %at_second_offset.
    561555}
    562556
    563557
    564 void LZ4IndexDecoderKernel::generateAtSecondOffset(const std::unique_ptr<KernelBuilder> & iBuilder, BasicBlock * bb, BasicBlock * exit_block) {
    565     iBuilder->CreateBr(bb);
    566     iBuilder->SetInsertPoint(bb);
     558void LZ4IndexDecoderKernel::generateAtSecondOffset(const std::unique_ptr<KernelBuilder> & b, BasicBlock * bb, BasicBlock * exit_block) {
     559    b->CreateBr(bb);
     560    b->SetInsertPoint(bb);
    567561    printRTDebugMsg("reading second offset");
    568562
    569     Value * byte1 = iBuilder->CreateLoad(sTempLength);
    570     Value * byte2 = iBuilder->CreateZExt(loadRawByte(iBuilder), iBuilder->getInt32Ty());
    571     Value * offset = iBuilder->CreateAdd(
    572             iBuilder->CreateShl(byte2, iBuilder->getInt32(8)),
     563    Value * byte1 = b->CreateLoad(sTempLength);
     564    Value * byte2 = b->CreateZExt(loadRawByte(b), b->getInt32Ty());
     565    Value * offset = b->CreateAdd(
     566            b->CreateShl(byte2, b->getInt32(8)),
    573567            byte1
    574568            );
    575     iBuilder->setScalarField("MatchOffset", offset);
    576     incStackVar(iBuilder, sOffset);
     569    b->setScalarField("MatchOffset", offset);
     570    incStackVar(b, sOffset);
    577571    // Prepare extender word and tempLength for extending.
    578     loadCurrentExtender(iBuilder);
    579     setExtenderUntilOffset(iBuilder);
    580     iBuilder->CreateStore(iBuilder->getScalarField("MatchLength"), sTempLength);
    581     iBuilder->CreateStore(iBuilder->getInt8(State::EXTENDING_MATCH_LENGTH), sState);
    582 
    583     generateBoundaryDetection(iBuilder, State::AT_SECOND_OFFSET, exit_block);
     572    loadCurrentExtender(b);
     573    setExtenderUntilOffset(b);
     574    b->CreateStore(b->getScalarField("MatchLength"), sTempLength);
     575    b->CreateStore(b->getInt8(State::EXTENDING_MATCH_LENGTH), sState);
     576
     577    generateBoundaryDetection(b, State::AT_SECOND_OFFSET, exit_block);
    584578    // Falls through to %extending_match_length.
    585579}
    586580
    587581
    588 void LZ4IndexDecoderKernel::generateExtendingMatchLen(const std::unique_ptr<KernelBuilder> & iBuilder, BasicBlock * bb, BasicBlock * exit_block) {
    589     iBuilder->CreateBr(bb);
    590     iBuilder->SetInsertPoint(bb);
     582void LZ4IndexDecoderKernel::generateExtendingMatchLen(const std::unique_ptr<KernelBuilder> & b, BasicBlock * bb, BasicBlock * exit_block) {
     583    b->CreateBr(bb);
     584    b->SetInsertPoint(bb);
    591585    printRTDebugMsg("extending match length");
    592586    printGlobalPos();
    593     printRTDebugInt("rawbyte", loadRawByte(iBuilder));
    594     printRTDebugInt("extword", iBuilder->CreateLoad(sExtender));
    595 
    596     Value * wordOffset = getWordOffset(iBuilder);
    597     Value * blockOffset = getWordStartOffset(iBuilder);
    598     Value * matchLen = iBuilder->CreateLoad(sTempLength);
    599     Value * matchExtEnd = iBuilder->CreateTrunc(
    600         iBuilder->CreateCountForwardZeroes(iBuilder->CreateNot(iBuilder->CreateLoad(sExtender))),
    601         iBuilder->getInt32Ty()
     587    printRTDebugInt("rawbyte", loadRawByte(b));
     588    printRTDebugInt("extword", b->CreateLoad(sExtender));
     589
     590    Value * wordOffset = getWordOffset(b);
     591    Value * blockOffset = getWordStartOffset(b);
     592    Value * matchLen = b->CreateLoad(sTempLength);
     593    Value * matchExtEnd = b->CreateTrunc(
     594        b->CreateCountForwardZeroes(b->CreateNot(b->CreateLoad(sExtender))),
     595        b->getInt32Ty()
    602596        );
    603597    printRTDebugInt("wordoffset", wordOffset);
    604598    printRTDebugInt("matchExtEnd", matchExtEnd);
    605599    // number of extender = matchExtEnd - wordOffset
    606     Value * numExtenders = iBuilder->CreateSub(matchExtEnd, wordOffset);
     600    Value * numExtenders = b->CreateSub(matchExtEnd, wordOffset);
    607601    Value * matchExtReachBoundary =
    608             iBuilder->CreateICmpEQ(matchExtEnd, iBuilder->getInt32(wordWidth));
     602            b->CreateICmpEQ(matchExtEnd, b->getInt32(wordWidth));
    609603    // There are matchExtEnd forward zeroes, we load bytes[matchExtEnd]
    610604    // which is the first non-extender.  If matchExtEnd == 64, we force the
    611605    // load index to be 0 to avoid out-of-bound access, and lastByte will be 0.
    612     Value * loadOffset = iBuilder->CreateSelect(matchExtReachBoundary,
     606    Value * loadOffset = b->CreateSelect(matchExtReachBoundary,
    613607            ConstantInt::getNullValue(matchExtEnd->getType()),
    614608            matchExtEnd);
    615     Value * lastByte = iBuilder->CreateSelect(matchExtReachBoundary,
    616             iBuilder->getInt8(0),
    617             loadRawByte(iBuilder, iBuilder->CreateAdd(blockOffset, loadOffset)));
    618     Value * matchLenExted = iBuilder->CreateICmpUGE(matchLen, iBuilder->getInt32(0xf));
    619     matchLen = iBuilder->CreateSelect(matchLenExted,
    620             iBuilder->CreateAdd(
     609    Value * lastByte = b->CreateSelect(matchExtReachBoundary,
     610            b->getInt8(0),
     611            loadRawByte(b, b->CreateAdd(blockOffset, loadOffset)));
     612    Value * matchLenExted = b->CreateICmpUGE(matchLen, b->getInt32(0xf));
     613    matchLen = b->CreateSelect(matchLenExted,
     614            b->CreateAdd(
    621615                matchLen,
    622                 iBuilder->CreateAdd(
    623                     iBuilder->CreateMul(numExtenders, iBuilder->getInt32(0xff)),
    624                     iBuilder->CreateZExt(lastByte, iBuilder->getInt32Ty())
     616                b->CreateAdd(
     617                    b->CreateMul(numExtenders, b->getInt32(0xff)),
     618                    b->CreateZExt(lastByte, b->getInt32Ty())
    625619                    )
    626620                ),      // matchLen + numExtenders * 255
    627621            matchLen);
    628     wordOffset = iBuilder->CreateSelect(matchLenExted,
     622    wordOffset = b->CreateSelect(matchLenExted,
    629623            matchExtEnd,
    630624            wordOffset);
    631625    // If lastByte is truly the last length byte, we need to advance the cursor by 1.
    632     wordOffset = iBuilder->CreateSelect(
    633             iBuilder->CreateAnd(matchLenExted, iBuilder->CreateNot(matchExtReachBoundary)),
    634             iBuilder->CreateAdd(wordOffset, iBuilder->getInt32(1)),
     626    wordOffset = b->CreateSelect(
     627            b->CreateAnd(matchLenExted, b->CreateNot(matchExtReachBoundary)),
     628            b->CreateAdd(wordOffset, b->getInt32(1)),
    635629            wordOffset
    636630            );
    637     iBuilder->CreateStore(matchLen, sTempLength);
    638     iBuilder->CreateStore(iBuilder->CreateAdd(blockOffset, wordOffset), sOffset);
    639 
    640     Value * unfinished = iBuilder->CreateAnd(matchExtReachBoundary, matchLenExted);
    641     BasicBlock * output_then = iBuilder->CreateBasicBlock("output_then");
    642     BasicBlock * output_cont = iBuilder->CreateBasicBlock("output_cont");
    643     iBuilder->CreateLikelyCondBr(
    644             iBuilder->CreateNot(unfinished),
     631    b->CreateStore(matchLen, sTempLength);
     632    b->CreateStore(b->CreateAdd(blockOffset, wordOffset), sOffset);
     633
     634    Value * unfinished = b->CreateAnd(matchExtReachBoundary, matchLenExted);
     635    BasicBlock * output_then = b->CreateBasicBlock("output_then");
     636    BasicBlock * output_cont = b->CreateBasicBlock("output_cont");
     637    b->CreateLikelyCondBr(
     638            b->CreateNot(unfinished),
    645639            output_then, output_cont
    646640            );
    647     iBuilder->SetInsertPoint(output_then);
    648     iBuilder->CreateStore(iBuilder->getInt8(State::AT_TOKEN), sState);
    649     matchLen = iBuilder->CreateAdd(matchLen, iBuilder->getInt32(4));    // Add the constant at the end.
    650     iBuilder->setScalarField("MatchLength", matchLen);
    651     generateProduceOutput(iBuilder);
    652     iBuilder->CreateBr(output_cont);
    653 
    654     iBuilder->SetInsertPoint(output_cont);
    655     generateBoundaryDetection(iBuilder, State::EXTENDING_MATCH_LENGTH, exit_block, true);
    656     BasicBlock * cont_block = iBuilder->CreateBasicBlock("finished_" + StateLabels.at(State::EXTENDING_MATCH_LENGTH));
     641    b->SetInsertPoint(output_then);
     642    b->CreateStore(b->getInt8(State::AT_TOKEN), sState);
     643    matchLen = b->CreateAdd(matchLen, b->getInt32(4));    // Add the constant at the end.
     644    b->setScalarField("MatchLength", matchLen);
     645    generateProduceOutput(b);
     646    b->CreateBr(output_cont);
     647
     648    b->SetInsertPoint(output_cont);
     649    generateBoundaryDetection(b, State::EXTENDING_MATCH_LENGTH, exit_block, true);
     650    BasicBlock * cont_block = b->CreateBasicBlock("finished_" + StateLabels.at(State::EXTENDING_MATCH_LENGTH));
    657651    // Insert point is still in wordBoundary block now.
    658652    // See if there are still more extenders.
    659     iBuilder->CreateUnlikelyCondBr(unfinished, bb, cont_block);
    660 
    661     iBuilder->SetInsertPoint(cont_block);
    662 }
    663 
    664 
    665 void LZ4IndexDecoderKernel::generateAtBlockChecksum(const std::unique_ptr<KernelBuilder> & iBuilder, BasicBlock * bb, BasicBlock * skippingBytes) {
     653    b->CreateUnlikelyCondBr(unfinished, bb, cont_block);
     654
     655    b->SetInsertPoint(cont_block);
     656}
     657
     658
     659void LZ4IndexDecoderKernel::generateAtBlockChecksum(const std::unique_ptr<KernelBuilder> & b, BasicBlock * bb, BasicBlock * skippingBytes) {
    666660    // No branch here as we have made a conditional branch outside.
    667     iBuilder->SetInsertPoint(bb);
     661    b->SetInsertPoint(bb);
    668662    printRTDebugMsg("processing block checksum");
    669663
    670664    // Produce the partial output (fill matchIndexes with 0).
    671     iBuilder->setScalarField("MatchOffset", iBuilder->getInt32(0));
    672     iBuilder->setScalarField("MatchLength", iBuilder->getInt32(0));
    673     generateProduceOutput(iBuilder);
    674 
    675     BasicBlock * hasChecksum_then = iBuilder->CreateBasicBlock("has_checksum_then");
    676     BasicBlock * hasChecksum_cont = iBuilder->CreateBasicBlock("has_checksum_cont");
    677 
    678     iBuilder->CreateStore(iBuilder->getInt8(State::AT_BLOCK_SIZE), sState);
    679     iBuilder->CreateCondBr(iBuilder->getScalarField("hasBlockChecksum"), hasChecksum_then, hasChecksum_cont);
    680 
    681     iBuilder->SetInsertPoint(hasChecksum_then);
    682     iBuilder->setScalarField("BytesToSkip", iBuilder->getInt32(4));
    683     iBuilder->CreateBr(skippingBytes);
     665    b->setScalarField("MatchOffset", b->getInt32(0));
     666    b->setScalarField("MatchLength", b->getInt32(0));
     667    generateProduceOutput(b);
     668
     669    BasicBlock * hasChecksum_then = b->CreateBasicBlock("has_checksum_then");
     670    BasicBlock * hasChecksum_cont = b->CreateBasicBlock("has_checksum_cont");
     671
     672    b->CreateStore(b->getInt8(State::AT_BLOCK_SIZE), sState);
     673    b->CreateCondBr(b->getScalarField("hasBlockChecksum"), hasChecksum_then, hasChecksum_cont);
     674
     675    b->SetInsertPoint(hasChecksum_then);
     676    b->setScalarField("BytesToSkip", b->getInt32(4));
     677    b->CreateBr(skippingBytes);
    684678    // Boundary detection will be done in skipping_bytes.
    685679
    686     iBuilder->SetInsertPoint(hasChecksum_cont);
     680    b->SetInsertPoint(hasChecksum_cont);
    687681    // No checksum, offset not advanced.  Falls through to the next block (block_size).
    688682}
    689683
    690 LZ4IndexDecoderKernel::LZ4IndexDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
     684LZ4IndexDecoderKernel::LZ4IndexDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & b)
    691685: BlockOrientedKernel("lz4IndexDecoder",
    692686    // Inputs
    693     {Binding{iBuilder->getStreamSetTy(1, 8), "byteStream"},
    694      Binding{iBuilder->getStreamSetTy(1, 1), "extenders"}},
     687    {Binding{b->getStreamSetTy(1, 8), "byteStream"},
     688     Binding{b->getStreamSetTy(1, 1), "extenders"}},
    695689    // Outputs: literal start, literal length, match offset, match length
    696     {Binding{iBuilder->getStreamSetTy(2, 32), "literalIndexes", UnknownRate()},
    697      Binding{iBuilder->getStreamSetTy(2, 32), "matchIndexes", RateEqualTo("literalIndexes")}},
     690    {Binding{b->getStreamSetTy(2, 32), "literalIndexes", UnknownRate()},
     691     Binding{b->getStreamSetTy(2, 32), "matchIndexes", RateEqualTo("literalIndexes")}},
    698692    // Arguments
    699     {Binding{iBuilder->getInt1Ty(), "hasBlockChecksum"}},
     693    {Binding{b->getInt1Ty(), "hasBlockChecksum"}},
    700694    {},
    701695    // Internal states:
    702     {Binding{iBuilder->getInt32Ty(), "BlockNo"},
    703      Binding{iBuilder->getInt8Ty(), "State"},
    704      Binding{iBuilder->getInt32Ty(), "LZ4BlockStart"},
    705      Binding{iBuilder->getInt32Ty(), "LZ4BlockEnd"},
    706      Binding{iBuilder->getInt32Ty(), "BytesToSkip"},
    707      Binding{iBuilder->getInt32Ty(), "TempLength"},
    708      Binding{iBuilder->getInt32Ty(), "TempCount"},
    709      Binding{iBuilder->getInt32Ty(), "LiteralStart"},
    710      Binding{iBuilder->getInt32Ty(), "LiteralLength"},
    711      Binding{iBuilder->getInt32Ty(), "MatchOffset"},
    712      Binding{iBuilder->getInt32Ty(), "MatchLength"}})
    713 , wordWidth{iBuilder->getSizeTy()->getBitWidth()} {
     696    {Binding{b->getInt32Ty(), "BlockNo"},
     697     Binding{b->getInt8Ty(), "State"},
     698     Binding{b->getInt32Ty(), "LZ4BlockStart"},
     699     Binding{b->getInt32Ty(), "LZ4BlockEnd"},
     700     Binding{b->getInt32Ty(), "BytesToSkip"},
     701     Binding{b->getInt32Ty(), "TempLength"},
     702     Binding{b->getInt32Ty(), "TempCount"},
     703     Binding{b->getInt32Ty(), "LiteralStart"},
     704     Binding{b->getInt32Ty(), "LiteralLength"},
     705     Binding{b->getInt32Ty(), "MatchOffset"},
     706     Binding{b->getInt32Ty(), "MatchLength"}})
     707, wordWidth{b->getSizeTy()->getBitWidth()} {
    714708    setNoTerminateAttribute(true);
    715709}
  • icGREP/icgrep-devel/icgrep/kernels/lz4_index_decoder.h

    r5440 r5755  
    2222namespace kernel {
    2323
    24 class LZ4IndexDecoderKernel : public BlockOrientedKernel {
     24class LZ4IndexDecoderKernel final : public BlockOrientedKernel {
    2525public:
    2626    LZ4IndexDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    2727protected:
    28     void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
     28    void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)override;
    2929private:
    3030
  • icGREP/icgrep-devel/icgrep/kernels/p2s_kernel.cpp

    r5706 r5755  
    3737}
    3838               
    39 void P2SKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
     39void P2SKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    4040    Value * p_bitblock[8];
    4141    for (unsigned i = 0; i < 8; i++) {
    42         p_bitblock[i] = iBuilder->loadInputStreamBlock("basisBits", iBuilder->getInt32(i));
     42        p_bitblock[i] = b->loadInputStreamBlock("basisBits", b->getInt32(i));
    4343    }
    4444    Value * s_bytepack[8];
    45     p2s(iBuilder, p_bitblock, s_bytepack);
     45    p2s(b, p_bitblock, s_bytepack);
    4646    for (unsigned j = 0; j < 8; ++j) {
    47         iBuilder->storeOutputStreamPack("byteStream", iBuilder->getInt32(0), iBuilder->getInt32(j), s_bytepack[j]);
     47        b->storeOutputStreamPack("byteStream", b->getInt32(0), b->getInt32(j), s_bytepack[j]);
    4848    }
    4949}
    5050
    51 void P2SKernelWithCompressedOutput::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
    52     IntegerType * i32 = iBuilder->getInt32Ty();
    53     PointerType * bitBlockPtrTy = PointerType::get(iBuilder->getBitBlockType(), 0);
     51void P2SKernelWithCompressedOutput::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
     52    IntegerType * i32 = b->getInt32Ty();
     53    PointerType * bitBlockPtrTy = PointerType::get(b->getBitBlockType(), 0);
    5454
    5555    Value * basisBits[8];
    5656    for (unsigned i = 0; i < 8; i++) {
    57         basisBits[i] = iBuilder->loadInputStreamBlock("basisBits", iBuilder->getInt32(i));
     57        basisBits[i] = b->loadInputStreamBlock("basisBits", b->getInt32(i));
    5858    }
    5959    Value * bytePack[8];
    60     p2s(iBuilder, basisBits, bytePack);
     60    p2s(b, basisBits, bytePack);
    6161
    62     unsigned units_per_register = iBuilder->getBitBlockWidth()/8;
    63     Value * delCountBlock_ptr = iBuilder->getInputStreamBlockPtr("deletionCounts", iBuilder->getInt32(0));
    64     Value * unit_counts = iBuilder->fwCast(units_per_register, iBuilder->CreateBlockAlignedLoad(delCountBlock_ptr));
     62    unsigned units_per_register = b->getBitBlockWidth()/8;
     63    Value * delCountBlock_ptr = b->getInputStreamBlockPtr("deletionCounts", b->getInt32(0));
     64    Value * unit_counts = b->fwCast(units_per_register, b->CreateBlockAlignedLoad(delCountBlock_ptr));
    6565
    66     Value * output_ptr = iBuilder->getOutputStreamBlockPtr("byteStream", iBuilder->getInt32(0));
    67     output_ptr = iBuilder->CreatePointerCast(output_ptr, iBuilder->getInt8PtrTy());
    68     Value * offset = iBuilder->getInt32(0);
     66    Value * output_ptr = b->getOutputStreamBlockPtr("byteStream", b->getInt32(0));
     67    output_ptr = b->CreatePointerCast(output_ptr, b->getInt8PtrTy());
     68    Value * offset = b->getInt32(0);
    6969    for (unsigned j = 0; j < 8; ++j) {
    70         iBuilder->CreateStore(bytePack[j], iBuilder->CreateBitCast(iBuilder->CreateGEP(output_ptr, offset), bitBlockPtrTy));
    71         offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unit_counts, iBuilder->getInt32(j)), i32);
     70        b->CreateStore(bytePack[j], b->CreateBitCast(b->CreateGEP(output_ptr, offset), bitBlockPtrTy));
     71        offset = b->CreateZExt(b->CreateExtractElement(unit_counts, b->getInt32(j)), i32);
    7272    }
    7373
    74     Value * unitsGenerated = iBuilder->getProducedItemCount("byteStream"); // units generated to buffer
    75     unitsGenerated = iBuilder->CreateAdd(unitsGenerated, iBuilder->CreateZExt(offset, iBuilder->getSizeTy()));
    76     iBuilder->setProducedItemCount("byteStream", unitsGenerated);
     74    Value * unitsGenerated = b->getProducedItemCount("byteStream"); // units generated to buffer
     75    unitsGenerated = b->CreateAdd(unitsGenerated, b->CreateZExt(offset, b->getSizeTy()));
     76    b->setProducedItemCount("byteStream", unitsGenerated);
    7777}
    7878
    79 void P2S16Kernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
     79void P2S16Kernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    8080    Value * hi_input[8];
    8181    for (unsigned j = 0; j < 8; ++j) {
    82         hi_input[j] = iBuilder->loadInputStreamBlock("basisBits", iBuilder->getInt32(j));
     82        hi_input[j] = b->loadInputStreamBlock("basisBits", b->getInt32(j));
    8383    }
    8484    Value * hi_bytes[8];
    85     p2s(iBuilder, hi_input, hi_bytes);   
     85    p2s(b, hi_input, hi_bytes);
    8686    Value * lo_input[8];
    8787    for (unsigned j = 0; j < 8; ++j) {
    88         lo_input[j] = iBuilder->loadInputStreamBlock("basisBits", iBuilder->getInt32(j + 8));
     88        lo_input[j] = b->loadInputStreamBlock("basisBits", b->getInt32(j + 8));
    8989    }
    9090    Value * lo_bytes[8];
    91     p2s(iBuilder, lo_input, lo_bytes);   
     91    p2s(b, lo_input, lo_bytes);
    9292    for (unsigned j = 0; j < 8; ++j) {
    93         Value * merge0 = iBuilder->bitCast(iBuilder->esimd_mergel(8, hi_bytes[j], lo_bytes[j]));
    94         Value * merge1 = iBuilder->bitCast(iBuilder->esimd_mergeh(8, hi_bytes[j], lo_bytes[j]));
    95         iBuilder->storeOutputStreamPack("i16Stream", iBuilder->getInt32(0), iBuilder->getInt32(2 * j), merge0);
    96         iBuilder->storeOutputStreamPack("i16Stream", iBuilder->getInt32(0), iBuilder->getInt32(2 * j + 1), merge1);
     93        Value * merge0 = b->bitCast(b->esimd_mergel(8, hi_bytes[j], lo_bytes[j]));
     94        Value * merge1 = b->bitCast(b->esimd_mergeh(8, hi_bytes[j], lo_bytes[j]));
     95        b->storeOutputStreamPack("i16Stream", b->getInt32(0), b->getInt32(2 * j), merge0);
     96        b->storeOutputStreamPack("i16Stream", b->getInt32(0), b->getInt32(2 * j + 1), merge1);
    9797    }
    9898}
    9999       
    100 void P2S16KernelWithCompressedOutput::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
    101     IntegerType * i32Ty = iBuilder->getInt32Ty();
    102     PointerType * int16PtrTy = iBuilder->getInt16Ty()->getPointerTo();
    103     PointerType * bitBlockPtrTy = iBuilder->getBitBlockType()->getPointerTo();
    104     ConstantInt * blockMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
     100void P2S16KernelWithCompressedOutput::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
     101    IntegerType * i32Ty = b->getInt32Ty();
     102    PointerType * int16PtrTy = b->getInt16Ty()->getPointerTo();
     103    PointerType * bitBlockPtrTy = b->getBitBlockType()->getPointerTo();
     104    ConstantInt * blockMask = b->getSize(b->getBitBlockWidth() - 1);
    105105
    106106    Value * hi_input[8];
    107107    for (unsigned j = 0; j < 8; ++j) {
    108         hi_input[j] = iBuilder->loadInputStreamBlock("basisBits", iBuilder->getInt32(j));
     108        hi_input[j] = b->loadInputStreamBlock("basisBits", b->getInt32(j));
    109109    }
    110110    Value * hi_bytes[8];
    111     p2s(iBuilder, hi_input, hi_bytes);
     111    p2s(b, hi_input, hi_bytes);
    112112
    113113    Value * lo_input[8];
    114114    for (unsigned j = 0; j < 8; ++j) {
    115         lo_input[j] = iBuilder->loadInputStreamBlock("basisBits", iBuilder->getInt32(j + 8));
     115        lo_input[j] = b->loadInputStreamBlock("basisBits", b->getInt32(j + 8));
    116116    }
    117117    Value * lo_bytes[8];
    118     p2s(iBuilder, lo_input, lo_bytes);
     118    p2s(b, lo_input, lo_bytes);
    119119
    120     Value * delCount = iBuilder->loadInputStreamBlock("deletionCounts", iBuilder->getInt32(0));
    121     Value * unitCounts = iBuilder->fwCast(iBuilder->getBitBlockWidth() / 16, delCount);
    122     Value * outputPtr = iBuilder->getOutputStreamBlockPtr("i16Stream", iBuilder->getInt32(0));
    123     outputPtr = iBuilder->CreatePointerCast(outputPtr, int16PtrTy);
    124     Value * i16UnitsGenerated = iBuilder->getProducedItemCount("i16Stream"); // units generated to buffer
    125     outputPtr = iBuilder->CreateGEP(outputPtr, iBuilder->CreateAnd(i16UnitsGenerated, blockMask));
     120    Value * const delCount = b->loadInputStreamBlock("deletionCounts", b->getInt32(0));
     121    Value * const unitCounts = b->fwCast(b->getBitBlockWidth() / 16, delCount);
     122    Value * outputPtr = b->getOutputStreamBlockPtr("i16Stream", b->getInt32(0));
     123    outputPtr = b->CreatePointerCast(outputPtr, int16PtrTy);
     124    Value * const i16UnitsGenerated = b->getProducedItemCount("i16Stream"); // units generated to buffer
     125    outputPtr = b->CreateGEP(outputPtr, b->CreateAnd(i16UnitsGenerated, blockMask));
    126126
    127     Value * offset = ConstantInt::get(i32Ty, 0);
     127    Value * offset = b->getInt32(0);
    128128
    129129    for (unsigned j = 0; j < 8; ++j) {
    130         Value * merge0 = iBuilder->bitCast(iBuilder->esimd_mergel(8, hi_bytes[j], lo_bytes[j]));
    131         iBuilder->CreateAlignedStore(merge0, iBuilder->CreateBitCast(iBuilder->CreateGEP(outputPtr, offset), bitBlockPtrTy), 1);
    132         offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unitCounts, iBuilder->getInt32(2 * j)), i32Ty);
     130        Value * const merge0 = b->bitCast(b->esimd_mergel(8, hi_bytes[j], lo_bytes[j]));
     131        b->CreateAlignedStore(merge0, b->CreateBitCast(b->CreateGEP(outputPtr, offset), bitBlockPtrTy), 1);
     132        Value * const nextOffset1 = b->CreateZExt(b->CreateExtractElement(unitCounts, b->getInt32(2 * j)), i32Ty);
     133        b->CreateAssert(b->CreateICmpULE(offset, nextOffset1), "deletion offset is not monotonically non-decreasing");
    133134
    134         Value * merge1 = iBuilder->bitCast(iBuilder->esimd_mergeh(8, hi_bytes[j], lo_bytes[j]));
    135         iBuilder->CreateAlignedStore(merge1, iBuilder->CreateBitCast(iBuilder->CreateGEP(outputPtr, offset), bitBlockPtrTy), 1);
    136         offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unitCounts, iBuilder->getInt32(2 * j + 1)), i32Ty);
     135        Value * const merge1 = b->bitCast(b->esimd_mergeh(8, hi_bytes[j], lo_bytes[j]));
     136        b->CreateAlignedStore(merge1, b->CreateBitCast(b->CreateGEP(outputPtr, nextOffset1), bitBlockPtrTy), 1);
     137        Value * const nextOffset2 = b->CreateZExt(b->CreateExtractElement(unitCounts, b->getInt32(2 * j + 1)), i32Ty);
     138        b->CreateAssert(b->CreateICmpULE(nextOffset1, nextOffset2), "deletion offset is not monotonically non-decreasing");
     139
     140        offset = nextOffset2;
    137141    }
    138142
    139     Value * i16UnitsFinal = iBuilder->CreateAdd(i16UnitsGenerated, iBuilder->CreateZExt(offset, iBuilder->getSizeTy()));
    140     iBuilder->setProducedItemCount("i16Stream", i16UnitsFinal);
     143    Value * const i16UnitsFinal = b->CreateAdd(i16UnitsGenerated, b->CreateZExt(offset, b->getSizeTy()));
     144    b->setProducedItemCount("i16Stream", i16UnitsFinal);
    141145}
    142146
    143 P2SKernel::P2SKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
     147P2SKernel::P2SKernel(const std::unique_ptr<kernel::KernelBuilder> & b)
    144148: BlockOrientedKernel("p2s",
    145               {Binding{iBuilder->getStreamSetTy(8, 1), "basisBits"}},
    146               {Binding{iBuilder->getStreamSetTy(1, 8), "byteStream"}},
     149              {Binding{b->getStreamSetTy(8, 1), "basisBits"}},
     150              {Binding{b->getStreamSetTy(1, 8), "byteStream"}},
    147151              {}, {}, {}) {
    148152}
    149153
    150 P2SKernelWithCompressedOutput::P2SKernelWithCompressedOutput(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
     154P2SKernelWithCompressedOutput::P2SKernelWithCompressedOutput(const std::unique_ptr<kernel::KernelBuilder> & b)
    151155: BlockOrientedKernel("p2s_compress",
    152               {Binding{iBuilder->getStreamSetTy(8, 1), "basisBits"}, Binding{iBuilder->getStreamSetTy(1, 1), "deletionCounts"}},
    153               {Binding{iBuilder->getStreamSetTy(1, 8), "byteStream", BoundedRate(0, 1)}},
     156              {Binding{b->getStreamSetTy(8, 1), "basisBits"}, Binding{b->getStreamSetTy(1, 1), "deletionCounts"}},
     157              {Binding{b->getStreamSetTy(1, 8), "byteStream", BoundedRate(0, 1)}},
    154158              {}, {}, {}) {
    155159}
    156160
    157 P2S16Kernel::P2S16Kernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
     161P2S16Kernel::P2S16Kernel(const std::unique_ptr<kernel::KernelBuilder> & b)
    158162: BlockOrientedKernel("p2s_16",
    159               {Binding{iBuilder->getStreamSetTy(16, 1), "basisBits"}},
    160               {Binding{iBuilder->getStreamSetTy(1, 16), "i16Stream"}},
     163              {Binding{b->getStreamSetTy(16, 1), "basisBits"}},
     164              {Binding{b->getStreamSetTy(1, 16), "i16Stream"}},
    161165              {}, {}, {}) {
    162166}
  • icGREP/icgrep-devel/icgrep/kernels/p2s_kernel.h

    r5464 r5755  
    1414class P2SKernel final : public BlockOrientedKernel {
    1515public:
    16     P2SKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     16    P2SKernel(const std::unique_ptr<kernel::KernelBuilder> & b);
    1717    bool isCachable() const override { return true; }
    1818    bool hasSignature() const override { return false; }
    1919private:
    20     void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
     20    void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & b) override;
    2121};
    2222
    2323class P2SKernelWithCompressedOutput final : public BlockOrientedKernel {
    2424public:
    25     P2SKernelWithCompressedOutput(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     25    P2SKernelWithCompressedOutput(const std::unique_ptr<kernel::KernelBuilder> & b);
    2626    bool isCachable() const override { return true; }
    2727    bool hasSignature() const override { return false; }
    2828private:
    29     void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
     29    void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & b) override;
    3030};
    3131
    3232class P2S16Kernel final : public BlockOrientedKernel {
    3333public:
    34     P2S16Kernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     34    P2S16Kernel(const std::unique_ptr<kernel::KernelBuilder> & b);
    3535    bool isCachable() const override { return true; }
    3636    bool hasSignature() const override { return false; }
    3737private:
    38     void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
     38    void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & b) override;
    3939};
    4040   
    4141class P2S16KernelWithCompressedOutput final : public BlockOrientedKernel {
    4242public:
    43     P2S16KernelWithCompressedOutput(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     43    P2S16KernelWithCompressedOutput(const std::unique_ptr<kernel::KernelBuilder> & b);
    4444    bool isCachable() const override { return true; }
    4545    bool hasSignature() const override { return false; }
    4646private:
    47     void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
     47    void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & b) override;
    4848};
    4949   
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r5718 r5755  
    2525}
    2626
    27 void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
     27Value * PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
    2828    BasicBlock * entry = kb->GetInsertBlock();
    2929    BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
     
    134134    itemsDone = kb->CreateSelect(kb->CreateICmpULT(itemsToDo, itemsDone), itemsToDo, itemsDone);
    135135    kb->setProcessedItemCount("PDEPmarkerStream", kb->CreateAdd(itemsDone, kb->getProcessedItemCount("PDEPmarkerStream")));   
    136     kb->setProcessedItemCount("sourceStreamSet", updatedProcessedBitsPhi);   
     136    kb->setProcessedItemCount("sourceStreamSet", updatedProcessedBitsPhi);
     137
     138    return numOfStrides;
    137139}
    138140
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.h

    r5706 r5755  
    7474    const unsigned mSwizzleFactor;
    7575    const unsigned mPDEPWidth;
    76     void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfStrides) override;
     76    llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfStrides) override;
    7777    std::vector<llvm::Value *> get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * PDEP_ms_blk,
    7878                                              const unsigned mask_width);
  • icGREP/icgrep-devel/icgrep/kernels/processing_rate.h

    r5706 r5755  
    44#include <string>
    55#include <assert.h>
     6#include <boost/rational.hpp>
    67
    78namespace kernel {
     
    2728struct ProcessingRate  {
    2829
     30    friend struct Binding;
     31
    2932    enum class KindId {
    30         Fixed, Bounded, Unknown, DirectlyRelative, PopCountRelative
     33        Fixed, Bounded, Unknown, Relative, PopCount
    3134    };
     35
     36    using RateValue = boost::rational<unsigned>;
    3237
    3338    KindId getKind() const { return mKind; }
    3439
    35     unsigned getRate() const {
    36         assert (isFixed());
    37         assert (mN > 0 && mN == mM);
    38         return mN;
     40    RateValue getRate() const {
     41        assert (isFixed() || isRelative());
     42        return mLowerBound;
    3943    }
    4044
    41     unsigned getLowerBound() const {
     45    RateValue getLowerBound() const {
    4246        assert (isFixed() || isBounded() || isUnknown());
    43         return mN;
     47        return mLowerBound;
    4448    }
    4549
    46     unsigned getUpperBound() const {
     50    RateValue getUpperBound() const {
    4751        assert (isFixed() || isBounded());
    48         assert (isFixed() ? mM == mN : mM > mN);
    49         return mM;
     52        assert (isFixed() ? mUpperBound == mLowerBound : mUpperBound > mLowerBound);
     53        return mUpperBound;
    5054    }
    5155
    5256    const std::string & getReference() const {
    53         assert (isExactlyRelative());
     57        assert (isRelative());
    5458        return mReference;
    55     }
    56 
    57     const unsigned getNumerator() const {
    58         assert (isExactlyRelative());
    59         assert (mM > 0);
    60         return mM;
    61     }
    62 
    63     const unsigned getDenominator() const {
    64         assert (isExactlyRelative());
    65         assert (mN > 0);
    66         return mN;
    6759    }
    6860
     
    7567    }
    7668
    77     bool isExactlyRelative() const {
    78         return mKind == KindId::DirectlyRelative;
     69    bool isRelative() const {
     70        return mKind == KindId::Relative;
     71    }
     72
     73    bool isPopCount() const {
     74        return mKind == KindId::PopCount;
    7975    }
    8076
     
    8480
    8581    bool isDerived() const {
    86         return isExactlyRelative(); // isFixed() ||
     82        return isRelative(); // isFixed() ||
    8783    }
    8884
    8985    bool operator == (const ProcessingRate & other) const {
    90         return mKind == other.mKind && mN == other.mN && mM == other.mM && mReference == other.mReference;
     86        return mKind == other.mKind && mLowerBound == other.mLowerBound && mUpperBound == other.mUpperBound && mReference == other.mReference;
    9187    }
    9288
     
    9591    }
    9692
    97     ProcessingRate & operator = (const ProcessingRate & other) {
    98         mKind = other.mKind;
    99         mN = other.mN;
    100         mM = other.mM;
    101         mReference = other.mReference;
    102         return *this;
    103     }
    104 
    10593    friend ProcessingRate FixedRate(const unsigned);
    10694    friend ProcessingRate BoundedRate(const unsigned, const unsigned);
    10795    friend ProcessingRate UnknownRate(const unsigned);
    10896    friend ProcessingRate RateEqualTo(std::string);
     97    friend ProcessingRate PopcountOf(std::string, const ProcessingRate::RateValue);
    10998
    110 protected:
     99    ProcessingRate(ProcessingRate &&) = default;
     100    ProcessingRate(const ProcessingRate &) = default;
     101    ProcessingRate & operator = (const ProcessingRate & other) = default;
    111102
    112     ProcessingRate(const KindId k, const unsigned n, const unsigned m, const std::string && ref = "") : mKind(k), mN(n), mM(m), mReference(ref) {}
     103protected:   
     104    ProcessingRate(const KindId k, const unsigned n, const unsigned m, const std::string && ref = "") : mKind(k), mLowerBound(n), mUpperBound(m), mReference(ref) {}
     105    ProcessingRate(const KindId k, const RateValue n, const RateValue m, const std::string && ref = "") : mKind(k), mLowerBound(n), mUpperBound(m), mReference(ref) {}
    113106private:
    114107    KindId mKind;
    115     unsigned mN;
    116     unsigned mM;
     108    RateValue mLowerBound;
     109    RateValue mUpperBound;
    117110    std::string mReference;
    118111};
     
    126119        return FixedRate(lower);
    127120    } else {
    128         return ProcessingRate(ProcessingRate::KindId::Bounded, lower, upper);
     121        return ProcessingRate(ProcessingRate::KindId::Bounded, {lower}, {upper});
    129122    }
    130123}
    131124
     125/**
     126 * @brief UnknownRate
     127 *
     128 * The produced item count per stride should never be dependent on an unknown rate input stream.
     129 */
    132130inline ProcessingRate UnknownRate(const unsigned lower = 0) {
    133131    return ProcessingRate(ProcessingRate::KindId::Unknown, lower, 0);
     
    135133
    136134inline ProcessingRate RateEqualTo(std::string ref) {
    137     return ProcessingRate(ProcessingRate::KindId::DirectlyRelative, 1, 1, std::move(ref));
     135    return ProcessingRate(ProcessingRate::KindId::Relative, 1, 0, std::move(ref));
    138136}
     137
     138inline ProcessingRate PopcountOf(std::string ref, const ProcessingRate::RateValue ratio = ProcessingRate::RateValue{1}) {
     139    return ProcessingRate(ProcessingRate::KindId::PopCount, ratio, ProcessingRate::RateValue{0}, std::move(ref));
     140}
     141
     142ProcessingRate::RateValue lcm(const ProcessingRate::RateValue & x, const ProcessingRate::RateValue & y);
     143
     144ProcessingRate::RateValue gcd(const ProcessingRate::RateValue & x, const ProcessingRate::RateValue & y);
    139145
    140146}
  • icGREP/icgrep-devel/icgrep/kernels/radix64.cpp

    r5706 r5755  
    3939// of bytes to the actual output stream.
    4040
    41 void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, Value * const numOfStrides) {
     41Value * expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
    4242
    4343    BasicBlock * expand2_3entry = iBuilder->GetInsertBlock();
     
    130130   
    131131    iBuilder->SetInsertPoint(expand3_4_exit);
     132
     133    return numOfStrides;
    132134}
    133135
  • icGREP/icgrep-devel/icgrep/kernels/radix64.h

    r5706 r5755  
    2525    bool hasSignature() const override { return false; }
    2626private:
    27     void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides) override;
     27    llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides) override;
    2828};
    2929
     
    3636    virtual void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) override;
    3737    virtual void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * remainingBytes) override;
    38     llvm::Value* processPackData(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value* packData) const;
     38    llvm::Value * processPackData(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value* packData) const;
    3939};
    4040
  • icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp

    r5738 r5755  
    205205        : BlockOrientedKernel(aligned ? "s2p" : "s2p_unaligned",
    206206#endif
    207     {Binding{b->getStreamSetTy(1, 8), "byteStream"}}, {Binding{b->getStreamSetTy(8, 1), "basisBits"}}, {}, {}, {}),
     207    {Binding{b->getStreamSetTy(1, 8), "byteStream", FixedRate(), Principal()}},
     208    {Binding{b->getStreamSetTy(8, 1), "basisBits"}}, {}, {}, {}),
    208209  mAligned(aligned) {
    209210    setNoTerminateAttribute(true);
  • icGREP/icgrep-devel/icgrep/kernels/scanmatchgen.cpp

    r5706 r5755  
    2121namespace kernel {
    2222
    23 void ScanMatchKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
     23Value * ScanMatchKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
    2424
    2525    Module * const m = iBuilder->getModule();
     
    201201
    202202    iBuilder->SetInsertPoint(scanReturn);
    203    
     203    return numOfStrides;
    204204}
    205205
    206206ScanMatchKernel::ScanMatchKernel(const std::unique_ptr<kernel::KernelBuilder> & b)
    207207: MultiBlockKernel("scanMatch",
    208     {Binding{b->getStreamSetTy(1, 1), "matchResult", FixedRate(), Principle()}, Binding{b->getStreamSetTy(1, 1), "lineBreak"}, Binding{b->getStreamSetTy(1, 8), "InputStream", UnknownRate()}},
     208    {Binding{b->getStreamSetTy(1, 1), "matchResult", FixedRate(), Principal()}, Binding{b->getStreamSetTy(1, 1), "lineBreak"}, Binding{b->getStreamSetTy(1, 8), "InputStream", FixedRate(), Deferred() }},
    209209    {},
    210210    {Binding{b->getIntAddrTy(), "accumulator_address"}},
  • icGREP/icgrep-devel/icgrep/kernels/scanmatchgen.h

    r5706 r5755  
    2020    bool hasSignature() const override { return false; }
    2121private:
    22     void generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
     22    llvm::Value * generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
    2323};
    2424
  • icGREP/icgrep-devel/icgrep/kernels/source_kernel.cpp

    r5706 r5755  
    406406
    407407void MemorySourceKernel::generateInitializeMethod(const std::unique_ptr<KernelBuilder> & kb) {
    408     kb->setBaseAddress("sourceBuffer", kb->CreatePointerCast(kb->getScalarField("fileSource"), kb->getVoidPtrTy()));
    409     kb->setBufferedSize("sourceBuffer", kb->getScalarField("fileSize"));
    410     kb->setCapacity("sourceBuffer", kb->getScalarField("fileSize"));
     408    Value * const fileSource = kb->getScalarField("fileSource");
     409    kb->setBaseAddress("sourceBuffer", fileSource);
     410    Value * const fileSize = kb->getScalarField("fileSize");
     411    kb->setBufferedSize("sourceBuffer", fileSize);
     412    kb->setCapacity("sourceBuffer", fileSize);
    411413}
    412414
  • icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.cpp

    r5706 r5755  
    1515namespace kernel {
    1616
    17 void StdOutKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const /* numOfStrides */) {
     17Value * StdOutKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) {
    1818    Value * codeUnitBuffer = iBuilder->getInputStreamBlockPtr("codeUnitBuffer", iBuilder->getInt32(0));
    1919    codeUnitBuffer = iBuilder->CreatePointerCast(codeUnitBuffer, iBuilder->getInt8PtrTy());
     
    2525    }
    2626    iBuilder->CreateWriteCall(iBuilder->getInt32(1), codeUnitBuffer, bytesToDo);
     27    return numOfStrides;
    2728}
    2829
     
    6263}
    6364
    64 void FileSink::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, Value * const /* numOfStrides */) {
     65Value * FileSink::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, Value * const numOfStrides) {
    6566    BasicBlock * const closeFile = iBuilder->CreateBasicBlock("closeFile");
    6667    BasicBlock * const fileOutExit = iBuilder->CreateBasicBlock("fileOutExit");
    6768
    6869    Value * const fileDes = iBuilder->getScalarField("fileDes");
    69     Value * const codeUnitBuffer = iBuilder->CreatePointerCast(getStreamSetInputBufferPtr(0), iBuilder->getInt8PtrTy());
     70    Value * codeUnitBuffer = iBuilder->getInputStreamBlockPtr("codeUnitBuffer", iBuilder->getInt32(0));
     71    codeUnitBuffer = iBuilder->CreatePointerCast(codeUnitBuffer, iBuilder->getInt8PtrTy());
    7072    Value * bytesToDo = mAvailableItemCount[0];
    7173    if (LLVM_UNLIKELY(mCodeUnitWidth > 8)) {
     
    8688   
    8789    iBuilder->SetInsertPoint(fileOutExit);
     90    return numOfStrides;
    8891}
    8992
  • icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.h

    r5706 r5755  
    1616    StdOutKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned codeUnitWidth);
    1717private:
    18     void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
     18    llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;