Ignore:
Timestamp:
Oct 25, 2017, 4:57:58 PM (19 months ago)
Author:
nmedfort
Message:

First stage of MultiBlockKernel and pipeline restructuring

Location:
icGREP/icgrep-devel/icgrep/kernels
Files:
2 added
26 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r5540 r5706  
    6464    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
    6565    assert((mPEXTWidth == 64 || mPEXTWidth == 32) && "PEXT width must be 32 or 64");
    66    
    67     mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", MaxRatio(1)});
     66
     67    // why, if we have 1 input stream, are there n output swizzle streams rather than 1 of n?
     68    mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
    6869    addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
    6970    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
    7071        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1),
    71             "outputSwizzle" + std::to_string(i), FixedRatio(1, 1, "outputSwizzle0")});
     72            "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
    7273        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    7374    }
     
    564565        assert(mSwizzleFactor > 1 && "fieldWidth must be less than the block width");
    565566        mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle0"});
    566         mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", MaxRatio(1)});
     567        mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
    567568        addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
    568569        for (unsigned i = 1; i < mSwizzleSetCount; i++) {
    569570            mStreamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "inputSwizzle" + std::to_string(i)});
    570             mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), FixedRatio(1, 1, "outputSwizzle0")});
     571            mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
    571572            addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    572573        }
     
    602603    for (unsigned i = 0; i < mSwizzleFactor; i++) {
    603604        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
    604         iBuilder->CallPrintInt("newItemCount", newItemCount);
     605    //iBuilder->CallPrintInt("newItemCount", newItemCount);
    605606        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
    606607        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
     
    610611        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    611612            Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
    612             iBuilder->CallPrintRegister("newItems", newItems);
     613        //iBuilder->CallPrintRegister("newItems", newItems);
    613614            // Combine as many of the new items as possible into the pending group.
    614615            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp

    r5705 r5706  
    137137: PabloKernel(kb, "RequiredStreams_UTF8",               
    138138              {Binding{kb->getStreamSetTy(8), "basis"}},
    139               {Binding{kb->getStreamSetTy(4), "required"}},
     139              {Binding{kb->getStreamSetTy(4), "required", FixedRate(), Add1()}},
    140140              {},
    141141              {}) {
     
    171171: PabloKernel(kb, "RequiredStreams_UTF16",               
    172172              {Binding{kb->getStreamSetTy(16), "basis"}},
    173               {Binding{kb->getStreamSetTy(4), "required"}},
     173              {Binding{kb->getStreamSetTy(4), "required", FixedRate(), Add1()}},
    174174              {},
    175175              {}) {
     
    184184               Binding{iBuilder->getStreamSetTy(1, 1), "linebreak"},
    185185               Binding{iBuilder->getStreamSetTy(4, 1), "required"}},
    186 
    187               {Binding{iBuilder->getStreamSetTy(1, 1), "matches", Add1()}}) {
     186              {Binding{iBuilder->getStreamSetTy(1, 1), "matches", FixedRate(), Add1()}}) {
    188187
    189188}
     
    227226
    228227InvertMatchesKernel::InvertMatchesKernel(const std::unique_ptr<kernel::KernelBuilder> & builder)
    229 : BlockOrientedKernel("Invert", {Binding{builder->getStreamSetTy(1, 1), "matchedLines"}, Binding{builder->getStreamSetTy(1, 1), "lineBreaks"}}, {Binding{builder->getStreamSetTy(1, 1), "nonMatches"}}, {}, {}, {}) {
     228: BlockOrientedKernel("Invert",
     229    // Inputs
     230    {Binding{builder->getStreamSetTy(1, 1), "matchedLines"}, Binding{builder->getStreamSetTy(1, 1), "lineBreaks"}},
     231    // Outputs
     232    {Binding{builder->getStreamSetTy(1, 1), "nonMatches"}},
     233    // Input/Output Scalars and internal state
     234    {}, {}, {}) {
    230235    setNoTerminateAttribute(true);   
    231236}
     
    246251              {Binding{iBuilder->getSizeTy(), "countResult"}}) {
    247252}
    248 
  • icGREP/icgrep-devel/icgrep/kernels/interface.cpp

    r5646 r5706  
    2020using namespace llvm;
    2121
    22 ProcessingRate FixedRatio(unsigned strmItems, unsigned referenceItems, std::string && referenceStreamSet) {
    23     return ProcessingRate(ProcessingRate::ProcessingRateKind::FixedRatio, strmItems, referenceItems, std::move(referenceStreamSet));
    24 }
    25 
    26 ProcessingRate MaxRatio(unsigned strmItems, unsigned referenceItems, std::string && referenceStreamSet) {
    27     return ProcessingRate(ProcessingRate::ProcessingRateKind::MaxRatio, strmItems, referenceItems, std::move(referenceStreamSet));
    28 }
    29 
    30 ProcessingRate RoundUpToMultiple(unsigned itemMultiple, std::string && referenceStreamSet) {
    31     return ProcessingRate(ProcessingRate::ProcessingRateKind::RoundUp, itemMultiple, itemMultiple, std::move(referenceStreamSet));
    32 }
    33 
    34 ProcessingRate Add1(std::string && referenceStreamSet) {
    35     return ProcessingRate(ProcessingRate::ProcessingRateKind::Add1, 0, 1, std::move(referenceStreamSet));
    36 }
    37 
    38 ProcessingRate UnknownRate() {
    39     return ProcessingRate(ProcessingRate::ProcessingRateKind::Unknown, 0, 1, "");
    40 }
    41 
    42 unsigned ProcessingRate::calculateRatio(unsigned referenceItems, bool doFinal) const {
    43     if (mKind == ProcessingRate::ProcessingRateKind::FixedRatio || mKind == ProcessingRate::ProcessingRateKind::MaxRatio) {
    44         if (mRatioNumerator == mRatioDenominator) {
    45             return referenceItems;
    46         }
    47         unsigned strmItems = referenceItems * mRatioNumerator;
    48         return (strmItems + mRatioDenominator - 1) / mRatioDenominator;
    49     }
    50     if (mKind == ProcessingRate::ProcessingRateKind::RoundUp) {
    51         return ((referenceItems + mRatioDenominator - 1) / mRatioDenominator) * mRatioDenominator;
    52     }
    53     if (mKind == ProcessingRate::ProcessingRateKind::Add1) {
    54         return doFinal ? referenceItems + 1 : referenceItems;
    55     }
    56     report_fatal_error("Processing rate calculation attempted for variable or unknown rate.");
    57 }
    58 
    59 Value * ProcessingRate::CreateRatioCalculation(IDISA::IDISA_Builder * const b, Value * referenceItems, Value * doFinal) const {
    60     if (mKind == ProcessingRate::ProcessingRateKind::FixedRatio || mKind == ProcessingRate::ProcessingRateKind::MaxRatio) {
    61         if (mRatioNumerator == mRatioDenominator) {
    62             return referenceItems;
    63         }
    64         Type * const T = referenceItems->getType();
    65         Constant * const numerator = ConstantInt::get(T, mRatioNumerator);
    66         Constant * const denominator = ConstantInt::get(T, mRatioDenominator);
    67         Constant * const denominatorLess1 = ConstantInt::get(T, mRatioDenominator - 1);
    68         Value * strmItems = b->CreateMul(referenceItems, numerator);
    69         return b->CreateUDiv(b->CreateAdd(denominatorLess1, strmItems), denominator);
    70     }
    71     if (mKind == ProcessingRate::ProcessingRateKind::RoundUp) {
    72         Type * const T = referenceItems->getType();
    73         Constant * const denominator = ConstantInt::get(T, mRatioDenominator);
    74         Constant * const denominatorLess1 = ConstantInt::get(T, mRatioDenominator - 1);
    75         return b->CreateMul(b->CreateUDiv(b->CreateAdd(referenceItems, denominatorLess1), denominator), denominator);
    76     }
    77     if (mKind == ProcessingRate::ProcessingRateKind::Add1) {
    78         if (doFinal) {
    79             Type * const T = referenceItems->getType();
    80             referenceItems = b->CreateAdd(referenceItems, b->CreateZExt(doFinal, T));
    81         }
    82         return referenceItems;
    83     }
    84     report_fatal_error("Processing rate calculation attempted for variable or unknown rate.");
    85 }
    86 
    87 unsigned ProcessingRate::calculateMaxReferenceItems(const unsigned outputItems, const bool doFinal) const {
    88     if (mKind == ProcessingRate::ProcessingRateKind::FixedRatio || mKind == ProcessingRate::ProcessingRateKind::MaxRatio) {
    89         if (mRatioNumerator == mRatioDenominator) {
    90             return outputItems;
    91         }
    92         return (outputItems / mRatioNumerator) * mRatioDenominator;
    93     }
    94     if (mKind == ProcessingRate::ProcessingRateKind::RoundUp) {
    95         return (outputItems / mRatioDenominator) * mRatioDenominator;
    96     }
    97     if (mKind == ProcessingRate::ProcessingRateKind::Add1) {
    98         return outputItems - (doFinal ? 1 : 0);
    99     }
    100     report_fatal_error("Inverse processing rate calculation attempted for unknown rate.");
    101 }
    102 
    103 Value * ProcessingRate::CreateMaxReferenceItemsCalculation(IDISA::IDISA_Builder * const b, Value * outputItems, Value * doFinal) const {
    104     if (mKind == ProcessingRate::ProcessingRateKind::FixedRatio || mKind == ProcessingRate::ProcessingRateKind::MaxRatio) {
    105         if (mRatioNumerator == mRatioDenominator) {
    106             return outputItems;
    107         }
    108         Type * const T = outputItems->getType();
    109         Constant * const numerator = ConstantInt::get(T, mRatioNumerator);
    110         Constant * const denominator = ConstantInt::get(T, mRatioDenominator);
    111         return b->CreateMul(b->CreateUDiv(outputItems, numerator), denominator);
    112     }
    113     if (mKind == ProcessingRate::ProcessingRateKind::RoundUp) {
    114         Type * const T = outputItems->getType();
    115         Constant * const denominator = ConstantInt::get(T, mRatioDenominator);
    116         return b->CreateMul(b->CreateUDiv(outputItems, denominator), denominator);
    117     }
    118     if (mKind == ProcessingRate::ProcessingRateKind::Add1) {
    119         Type * const T = outputItems->getType();
    120         if (doFinal) {
    121             return b->CreateSub(outputItems, b->CreateZExt(doFinal, T));
    122         }
    123         return b->CreateSub(outputItems, ConstantInt::get(T, 1));
    124     }
    125     report_fatal_error("Inverse processing rate calculation attempted for unknown rate.");
    126 }
     22namespace kernel {
    12723
    12824void KernelInterface::addKernelDeclarations(const std::unique_ptr<kernel::KernelBuilder> & idb) {
     
    14137    std::vector<Type *> initParameters = {selfType};
    14238    for (auto binding : mScalarInputs) {
    143         initParameters.push_back(binding.type);
     39        initParameters.push_back(binding.getType());
    14440    }
    14541    initParameters.insert(initParameters.end(), mStreamSetOutputs.size(), consumerTy);
     
    15147    auto args = initFunc->arg_begin();
    15248    args->setName("self");
    153     for (auto binding : mScalarInputs) {
    154         (++args)->setName(binding.name);
     49    for (const Binding & binding : mScalarInputs) {
     50        (++args)->setName(binding.getName());
    15551    }
    156     for (auto binding : mStreamSetOutputs) {
    157         (++args)->setName(binding.name + "ConsumerLocks");
     52    for (const Binding & binding : mStreamSetOutputs) {
     53        (++args)->setName(binding.getName() + "ConsumerLocks");
    15854    }
    15955
    16056    // Create the doSegment function prototype.
    16157    std::vector<Type *> params = {selfType, idb->getInt1Ty()};
    162     params.insert(params.end(), mStreamSetInputs.size(), sizeTy);
     58
     59    const auto count = mStreamSetInputs.size();
     60    params.insert(params.end(), count, sizeTy);
    16361
    16462    FunctionType * const doSegmentType = FunctionType::get(voidTy, params, false);
     
    17068    args->setName("self");
    17169    (++args)->setName("doFinal");
     70//    if (mHasPrincipleItemCount) {
     71//        (++args)->setName("principleAvailableItemCount");
     72//    }
    17273    for (const Binding & input : mStreamSetInputs) {
    173         (++args)->setName(input.name + "AvailableItems");
     74        //const ProcessingRate & r = input.getRate();
     75        //if (!r.isDerived()) {
     76            (++args)->setName(input.getName() + "AvailableItems");
     77        //}
    17478    }
    17579
     
    18286        Type * outputType[n];
    18387        for (unsigned i = 0; i < n; ++i) {
    184             outputType[i] = mScalarOutputs[i].type;
     88            outputType[i] = mScalarOutputs[i].getType();
    18589        }
    18690        if (n == 1) {
     
    201105}
    202106
     107void  KernelInterface::setInstance(Value * const instance) {
     108    assert ("kernel instance cannot be null!" && instance);
     109    assert ("kernel instance must point to a valid kernel state type!" && (instance->getType()->getPointerElementType() == mKernelStateType));
     110    mKernelInstance = instance;
     111}
     112
    203113Function * KernelInterface::getInitFunction(Module * const module) const {
    204114    const auto name = getName() + INIT_SUFFIX;
     
    210120}
    211121
    212 Function * KernelInterface::getDoSegmentFunction(llvm::Module * const module) const {
     122Function * KernelInterface::getDoSegmentFunction(Module * const module) const {
    213123    const auto name = getName() + DO_SEGMENT_SUFFIX;
    214124    Function * f = module->getFunction(name);
     
    227137    return f;
    228138}
     139
     140CallInst * KernelInterface::makeDoSegmentCall(kernel::KernelBuilder & idb, const std::vector<llvm::Value *> & args) const {
     141    Function * const doSegment = getDoSegmentFunction(idb.getModule());
     142    assert (doSegment->getArgumentList().size() <= args.size());
     143    return idb.CreateCall(doSegment, args);
     144}
     145
     146void Binding::addAttribute(Attribute attribute) {
     147    for (Attribute & attr : attributes) {
     148        if (attr.getKind() == attribute.getKind()) {
     149            return;
     150        }
     151    }
     152    attributes.emplace_back(attribute);
     153}
     154
     155void KernelInterface::normalizeStreamProcessingRates() {
     156
     157}
     158
     159}
  • icGREP/icgrep-devel/icgrep/kernels/interface.h

    r5646 r5706  
    77#define KERNEL_INTERFACE_H
    88
    9 #include <llvm/IR/Constants.h>
     9#include <kernels/processing_rate.h>
     10#include <kernels/attributes.h>
     11#include <memory>
    1012#include <string>
    1113#include <vector>
     
    1416namespace kernel { class Kernel; }
    1517namespace kernel { class KernelBuilder; }
    16 
    17 // Processing rate attributes are required for all stream set bindings for a kernel.
    18 // These attributes describe the number of items that are processed or produced as
    19 // a ratio in comparison to a reference stream set, normally the principal input stream set
    20 // by default (or the principal output stream set if there is no input).
    21 //
    22 // The default ratio is FixedRatio(1) which means that there is one item processed or
    23 // produced for every item of the reference stream.
    24 // FixedRatio(m, n) means that for every group of n items of the reference stream,
    25 // there are m items in the output stream (rounding up).
    26 //
    27 // Kernels which produce a variable number of items use MaxRatio(n), for a maximum
    28 // of n items produced or consumed per principal input or output item.  MaxRatio(m, n)
    29 // means there are at most m items for every n items of the reference stream.
    30 //
    31 // RoundUpToMultiple(n) means that number of items produced is the same as the
    32 // number of reference items, rounded up to an exact multiple of n.
    33 //
    34 
    35 struct ProcessingRate  {
    36     friend class kernel::Kernel;
    37     enum class ProcessingRateKind : uint8_t { FixedRatio, RoundUp, Add1, MaxRatio, Unknown };
    38     ProcessingRateKind getKind() const {return mKind;}
    39     bool isFixedRatio() const {return mKind == ProcessingRateKind::FixedRatio;}
    40     bool isMaxRatio() const {return mKind == ProcessingRateKind::MaxRatio;}
    41     bool isExact() const {return (mKind == ProcessingRateKind::FixedRatio)||(mKind == ProcessingRateKind::RoundUp)||(mKind == ProcessingRateKind::Add1) ;}
    42     bool isUnknownRate() const { return mKind == ProcessingRateKind::Unknown; }
    43     unsigned calculateRatio(unsigned referenceItems, bool doFinal = false) const;
    44     // Calculate the max number of reference items that can be processed without exceeding/exhausting outputItems
    45     unsigned calculateMaxReferenceItems(unsigned outputItems, bool doFinal = false) const;
    46     llvm::Value * CreateRatioCalculation(IDISA::IDISA_Builder * const b, llvm::Value * referenceItems, llvm::Value * doFinal = nullptr) const;
    47     llvm::Value * CreateMaxReferenceItemsCalculation(IDISA::IDISA_Builder * const b, llvm::Value * outputItems, llvm::Value * doFinal = nullptr) const;
    48     friend ProcessingRate FixedRatio(unsigned strmItems, unsigned referenceItems, std::string && referenceStreamSet);
    49     friend ProcessingRate MaxRatio(unsigned strmItems, unsigned referenceItems, std::string && referenceStreamSet);
    50     friend ProcessingRate RoundUpToMultiple(unsigned itemMultiple, std::string && referenceStreamSet);
    51     friend ProcessingRate Add1(std::string && referenceStreamSet);
    52     friend ProcessingRate UnknownRate();
    53     uint16_t getRatioNumerator() const { return mRatioNumerator;}
    54     uint16_t getRatioDenominator() const { return mRatioDenominator;}
    55     const std::string & referenceStreamSet() const { return mReferenceStreamSet;}
    56 protected:
    57     ProcessingRate(ProcessingRateKind k, unsigned numerator, unsigned denominator, std::string && referenceStreamSet)
    58     : mKind(k), mRatioNumerator(numerator), mRatioDenominator(denominator), mReferenceStreamSet(referenceStreamSet) {}
    59     void setReferenceStreamSet(const std::string & s) {mReferenceStreamSet = s;}
     18namespace llvm { class CallInst; }
     19namespace llvm { class Function; }
     20namespace llvm { class Value; }
     21namespace llvm { class Module; }
     22namespace llvm { class StructType; }
     23namespace llvm { class Type; }
     24
     25namespace kernel {
     26
     27struct Binding {
     28
     29    friend class KernelInterface;
     30
     31    Binding(llvm::Type * type, const std::string & name, ProcessingRate r = FixedRate(1))
     32    : type(type), name(name), rate(r), attributes() { }
     33
     34
     35    Binding(llvm::Type * type, const std::string & name, ProcessingRate r, Attribute && attribute)
     36    : type(type), name(name), rate(r), attributes({std::move(attribute)}) { }
     37
     38
     39    Binding(llvm::Type * type, const std::string & name, ProcessingRate r, std::initializer_list<Attribute> attributes)
     40    : type(type), name(name), rate(r), attributes(attributes) { }
     41
     42    llvm::Type * getType() const {
     43        return type;
     44    }
     45
     46    const std::string & getName() const {
     47        return name;
     48    }
     49
     50    const ProcessingRate & getRate() const {
     51        return rate;
     52    }
     53
     54    const Attribute & getAttribute(const unsigned i) const {
     55        return attributes[i];
     56    }
     57
     58    const std::vector<Attribute> & getAttributes() const {
     59        return attributes;
     60    }
     61
     62    void addAttribute(Attribute attribute);
     63
     64    bool hasAttributes() const {
     65        return !attributes.empty();
     66    }
     67
    6068private:
    61     const ProcessingRateKind mKind;
    62     const uint16_t mRatioNumerator;
    63     const uint16_t mRatioDenominator;
    64     std::string mReferenceStreamSet;
    65 };
    66 
    67 ProcessingRate FixedRatio(unsigned strmItems, unsigned referenceItems = 1, std::string && referenceStreamSet = "");
    68 ProcessingRate MaxRatio(unsigned strmItems, unsigned referenceItems = 1, std::string && referenceStreamSet = "");
    69 ProcessingRate RoundUpToMultiple(unsigned itemMultiple, std::string &&referenceStreamSet = "");
    70 ProcessingRate Add1(std::string && referenceStreamSet = "");
    71 ProcessingRate UnknownRate();
    72 
    73 struct Binding {
    74     Binding(llvm::Type * type, const std::string & name, ProcessingRate r = FixedRatio(1))
    75     : type(type), name(name), rate(r) { }
    76     llvm::Type * const        type;
    77     const std::string         name;
    78     ProcessingRate      rate;
     69    llvm::Type * const          type;
     70    const std::string           name;
     71    ProcessingRate              rate;
     72    std::vector<Attribute>      attributes;
    7973};
    8074
     
    157151    }
    158152
    159     void setInstance(llvm::Value * const instance) {
    160         assert ("kernel instance cannot be null!" && instance);
    161         assert ("kernel instance must point to a valid kernel state type!" && (instance->getType()->getPointerElementType() == mKernelStateType));
    162         mKernelInstance = instance;
     153    void setInstance(llvm::Value * const instance);
     154
     155    bool hasPrincipleItemCount() const {
     156        return mHasPrincipleItemCount;
    163157    }
    164158
    165159    unsigned getLookAhead(const unsigned i) const {
    166         assert (i < mStreamSetInputLookahead.size());
    167         return mStreamSetInputLookahead[i];
     160        return 0;
    168161    }
    169162
    170163    void setLookAhead(const unsigned i, const unsigned lookAheadPositions) {
    171         assert (i < mStreamSetInputLookahead.size());
    172         mStreamSetInputLookahead[i] = lookAheadPositions;
     164
    173165    }
    174166
     
    180172
    181173    llvm::Function * getTerminateFunction(llvm::Module * const module) const;
     174
     175    llvm::CallInst * makeDoSegmentCall(KernelBuilder & idb, const std::vector<llvm::Value *> & args) const;
    182176
    183177    KernelInterface(const std::string && kernelName,
     
    190184    , mModule(nullptr)
    191185    , mKernelStateType(nullptr)
     186    , mHasPrincipleItemCount(false)
    192187    , mKernelName(kernelName)
    193188    , mStreamSetInputs(stream_inputs)
    194     , mStreamSetInputLookahead(mStreamSetInputs.size(), 0)
    195189    , mStreamSetOutputs(stream_outputs)
    196190    , mScalarInputs(scalar_inputs)
    197191    , mScalarOutputs(scalar_outputs)
    198192    , mInternalScalars(internal_scalars) {
    199 
     193        normalizeStreamProcessingRates();
    200194    }
    201195   
     196private:
     197
     198    void normalizeStreamProcessingRates();
     199
    202200protected:
    203201
     
    205203    llvm::Module *                          mModule;
    206204    llvm::StructType *                      mKernelStateType;
     205    bool                                    mHasPrincipleItemCount;
    207206    const std::string                       mKernelName;
    208207    std::vector<llvm::Value *>              mInitialArguments;
    209208    std::vector<Binding>                    mStreamSetInputs;
    210     std::vector<unsigned>                   mStreamSetInputLookahead;
    211209    std::vector<Binding>                    mStreamSetOutputs;
    212210    std::vector<Binding>                    mScalarInputs;
    213211    std::vector<Binding>                    mScalarOutputs;
    214212    std::vector<Binding>                    mInternalScalars;
     213
    215214};
    216215
     216}
     217
    217218#endif
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5688 r5706  
    1818#include <sstream>
    1919#include <kernels/kernel_builder.h>
     20#include <boost/math/common_factor_rt.hpp>
    2021#include <llvm/Support/Debug.h>
    2122
    2223using namespace llvm;
    2324using namespace parabix;
     25using namespace boost::math;
    2426
    2527namespace kernel {
     
    3739const std::string Kernel::CYCLECOUNT_SCALAR = "CPUcycles";
    3840
     41/** ------------------------------------------------------------------------------------------------------------- *
     42 * @brief addScalar
     43 ** ------------------------------------------------------------------------------------------------------------- */
    3944unsigned Kernel::addScalar(Type * const type, const std::string & name) {
    4045    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
     
    5055}
    5156
     57
     58/** ------------------------------------------------------------------------------------------------------------- *
     59 * @brief addUnnamedScalar
     60 ** ------------------------------------------------------------------------------------------------------------- */
    5261unsigned Kernel::addUnnamedScalar(Type * const type) {
    5362    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
     
    5968}
    6069
     70
     71/** ------------------------------------------------------------------------------------------------------------- *
     72 * @brief prepareStreamSetNameMap
     73 ** ------------------------------------------------------------------------------------------------------------- */
    6174void Kernel::prepareStreamSetNameMap() {
    6275    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
    63         mStreamMap.emplace(mStreamSetInputs[i].name, std::make_pair(Port::Input, i));
     76        mStreamMap.emplace(mStreamSetInputs[i].getName(), std::make_pair(Port::Input, i));
    6477    }
    6578    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    66         mStreamMap.emplace(mStreamSetOutputs[i].name, std::make_pair(Port::Output, i));
    67     }
    68 }
    69 
     79        mStreamMap.emplace(mStreamSetOutputs[i].getName(), std::make_pair(Port::Output, i));
     80    }
     81}
     82
     83
     84/** ------------------------------------------------------------------------------------------------------------- *
     85 * @brief bindPorts
     86 ** ------------------------------------------------------------------------------------------------------------- */
    7087void Kernel::bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs) {
    7188    assert (mModule == nullptr);
     
    111128}
    112129
     130
     131/** ------------------------------------------------------------------------------------------------------------- *
     132 * @brief getCacheName
     133 ** ------------------------------------------------------------------------------------------------------------- */
    113134std::string Kernel::getCacheName(const std::unique_ptr<KernelBuilder> & idb) const {
    114135    std::stringstream cacheName;
     
    123144}
    124145
     146
     147/** ------------------------------------------------------------------------------------------------------------- *
     148 * @brief setModule
     149 ** ------------------------------------------------------------------------------------------------------------- */
    125150Module * Kernel::setModule(Module * const module) {
    126151    assert (mModule == nullptr || mModule == module);
     
    130155}
    131156
     157
     158/** ------------------------------------------------------------------------------------------------------------- *
     159 * @brief makeModule
     160 ** ------------------------------------------------------------------------------------------------------------- */
    132161Module * Kernel::makeModule(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    133162    return setModule(new Module(getCacheName(idb), idb->getContext()));
    134163}
    135164
     165
     166/** ------------------------------------------------------------------------------------------------------------- *
     167 * @brief prepareKernel
     168 ** ------------------------------------------------------------------------------------------------------------- */
    136169void Kernel::prepareKernel(const std::unique_ptr<KernelBuilder> & idb) {
    137170    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     
    139172        report_fatal_error(getName() + ": cannot prepare kernel after kernel state finalized");
    140173    }
    141     const auto blockSize = idb->getBitBlockWidth();
     174    addBaseKernelProperties(idb);
     175    addInternalKernelProperties(idb);
     176    // NOTE: StructType::create always creates a new type even if an identical one exists.
     177    if (LLVM_UNLIKELY(mModule == nullptr)) {
     178        setModule(new Module(getCacheName(idb), idb->getContext()));
     179    }
     180    mKernelStateType = mModule->getTypeByName(getName());
     181    if (LLVM_LIKELY(mKernelStateType == nullptr)) {
     182        mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
     183        assert (mKernelStateType);
     184    }   
     185}
     186
     187
     188/** ------------------------------------------------------------------------------------------------------------- *
     189 * @brief prepareCachedKernel
     190 ** ------------------------------------------------------------------------------------------------------------- */
     191void Kernel::prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb) {
     192    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     193    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
     194        report_fatal_error(getName() + ": cannot prepare kernel after kernel state finalized");
     195    }
     196    assert (getModule());
     197    addBaseKernelProperties(idb);
     198    mKernelStateType = getModule()->getTypeByName(getName());
     199    if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
     200        report_fatal_error("Kernel definition for " + getName() + " could not be found in the cache object");
     201    }   
     202}
     203
     204/** ------------------------------------------------------------------------------------------------------------- *
     205 * @brief getItemsPerStride
     206 ** ------------------------------------------------------------------------------------------------------------- */
     207std::pair<unsigned, unsigned> Kernel::getStreamRate(const Port p, const unsigned i) const {
     208    const ProcessingRate & rate = (p == Port::Input) ? mStreamSetInputs[i].getRate() : mStreamSetOutputs[i].getRate();
     209    unsigned min = 0, max = 0;
     210    if (rate.isFixed()) {
     211        min = max = rate.getRate();
     212    } else if (rate.isBounded()) {
     213        min = rate.getLowerBound();
     214        max = rate.getUpperBound();
     215    } else if (rate.isUnknown()) {
     216        min = rate.getLowerBound();
     217        max = 0;
     218    } else if (rate.isExactlyRelative()) {
     219        for (unsigned j = 0; j < mStreamSetInputs.size(); ++j) {
     220            if (mStreamSetInputs[j].getName() == rate.getReference()) {
     221                std::tie(min, max) = getStreamRate(Port::Input, j);
     222                min = (min * rate.getNumerator()) / rate.getDenominator();
     223                assert (max == 0 || (max * rate.getNumerator()) % rate.getDenominator() == 0);
     224                max = (max * rate.getNumerator()) / rate.getDenominator();
     225                return std::make_pair(min, max);
     226            }
     227        }
     228        for (unsigned j = 0; j < mStreamSetOutputs.size(); ++j) {
     229            if (mStreamSetOutputs[j].getName() == rate.getReference()) {
     230                assert (p == Port::Output);
     231                std::tie(min, max) = getStreamRate(Port::Output, j);
     232                min = (min * rate.getNumerator()) / rate.getDenominator();
     233                assert (max == 0 || (max * rate.getNumerator()) % rate.getDenominator() == 0);
     234                max = (max * rate.getNumerator()) / rate.getDenominator();
     235                return std::make_pair(min, max);
     236            }
     237        }
     238        llvm_unreachable("Reference rate must be associated with an input or output!");
     239    }
     240    return std::make_pair(min, max);
     241}
     242
     243/** ------------------------------------------------------------------------------------------------------------- *
     244 * @brief addBaseKernelProperties
     245 ** ------------------------------------------------------------------------------------------------------------- */
     246void Kernel::addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb) {
     247   
     248    const unsigned inputSetCount = mStreamSetInputs.size();
     249    const unsigned outputSetCount = mStreamSetOutputs.size();
     250   
     251    assert (inputSetCount == mStreamSetInputBuffers.size());
     252    assert (outputSetCount == mStreamSetOutputBuffers.size());
     253
    142254    if (mStride == 0) {
    143255        // Set the default kernel stride.
    144         mStride = blockSize;
    145     }
     256        mStride = idb->getBitBlockWidth();
     257    }
     258
    146259    IntegerType * const sizeTy = idb->getSizeTy();
    147260
    148     assert (mStreamSetInputs.size() == mStreamSetInputBuffers.size());
    149 //    assert (mStreamSetInputs.size() == mStreamSetInputLookahead.size());
    150 
    151     for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
    152 //        const auto requiredBlocks = codegen::SegmentSize + ((mStreamSetInputLookahead[i] + blockSize - 1) / blockSize);
    153 //        if ((mStreamSetInputBuffers[i]->getBufferBlocks() != 0) && (mStreamSetInputBuffers[i]->getBufferBlocks() < requiredBlocks)) {
    154 //            report_fatal_error(getName() + ": " + mStreamSetInputs[i].name + " requires buffer size " + std::to_string(requiredBlocks));
    155 //        }
    156         mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetInputs[i].name + BUFFER_PTR_SUFFIX);
    157         if ((i == 0) || !mStreamSetInputs[i].rate.isExact()) {
    158             addScalar(sizeTy, mStreamSetInputs[i].name + PROCESSED_ITEM_COUNT_SUFFIX);
    159         }
    160     }
    161 
    162     assert (mStreamSetOutputs.size() == mStreamSetOutputBuffers.size());
    163 
    164     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    165         mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetOutputs[i].name + BUFFER_PTR_SUFFIX);
    166         if ((mStreamSetInputs.empty() && (i == 0)) || !mStreamSetOutputs[i].rate.isExact()) {
    167             addScalar(sizeTy, mStreamSetOutputs[i].name + PRODUCED_ITEM_COUNT_SUFFIX);
    168         }
     261    for (unsigned i = 0; i < inputSetCount; i++) {
     262        const Binding & b = mStreamSetInputs[i];
     263        //const ProcessingRate & rate = b.getRate();
     264        //if (rate.isBounded() || rate.isUnknown()) {
     265            addScalar(sizeTy, b.getName() + PROCESSED_ITEM_COUNT_SUFFIX);
     266        //}
     267    }
     268
     269    for (unsigned i = 0; i < outputSetCount; i++) {
     270        const Binding & b = mStreamSetOutputs[i];
     271        //const ProcessingRate & rate = b.getRate();
     272        //if (rate.isBounded() || rate.isUnknown()) {
     273            addScalar(sizeTy, b.getName() + PRODUCED_ITEM_COUNT_SUFFIX);
     274        //}
     275    }
     276
     277    for (unsigned i = 0; i < inputSetCount; i++) {
     278        mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetInputs[i].getName() + BUFFER_PTR_SUFFIX);
     279    }
     280    for (unsigned i = 0; i < outputSetCount; i++) {
     281        mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetOutputs[i].getName() + BUFFER_PTR_SUFFIX);
    169282    }
    170283    for (const auto & binding : mScalarInputs) {
    171         addScalar(binding.type, binding.name);
     284        addScalar(binding.getType(), binding.getName());
    172285    }
    173286    for (const auto & binding : mScalarOutputs) {
    174         addScalar(binding.type, binding.name);
     287        addScalar(binding.getType(), binding.getName());
    175288    }
    176289    if (mStreamMap.empty()) {
     
    178291    }
    179292    for (const auto & binding : mInternalScalars) {
    180         addScalar(binding.type, binding.name);
    181     }
    182 
     293        addScalar(binding.getType(), binding.getName());
     294    }
    183295    Type * const consumerSetTy = StructType::get(sizeTy, sizeTy->getPointerTo()->getPointerTo(), nullptr)->getPointerTo();
    184296    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    185         addScalar(consumerSetTy, mStreamSetOutputs[i].name + CONSUMER_SUFFIX);
    186     }
    187 
     297        addScalar(consumerSetTy, mStreamSetOutputs[i].getName() + CONSUMER_SUFFIX);
     298    }
    188299    addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
    189300    addScalar(idb->getInt1Ty(), TERMINATION_SIGNAL);
    190 
    191301    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    192         addScalar(sizeTy, mStreamSetOutputs[i].name + CONSUMED_ITEM_COUNT_SUFFIX);
    193     }
    194 
     302        addScalar(sizeTy, mStreamSetOutputs[i].getName() + CONSUMED_ITEM_COUNT_SUFFIX);
     303    }
    195304    // We compile in a 64-bit CPU cycle counter into every kernel.   It will remain unused
    196305    // in normal execution, but when codegen::EnableCycleCounter is specified, pipelines
    197306    // will be able to add instrumentation to cached modules without recompilation.
    198307    addScalar(idb->getInt64Ty(), CYCLECOUNT_SCALAR);
    199     addInternalKernelProperties(idb);
    200     // NOTE: StructType::create always creates a new type even if an identical one exists.
    201     if (LLVM_UNLIKELY(mModule == nullptr)) {
    202         setModule(new Module(getCacheName(idb), idb->getContext()));
    203     }
    204     mKernelStateType = mModule->getTypeByName(getName());
    205     if (LLVM_LIKELY(mKernelStateType == nullptr)) {
    206         mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
    207         assert (mKernelStateType);
    208     }
    209     processingRateAnalysis();
    210 }
    211 
    212 void Kernel::prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb) {
    213 
    214     assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
    215     if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
    216         report_fatal_error(getName() + ": cannot prepare kernel after kernel state finalized");
    217     }
    218     assert (getModule());
    219     const auto blockSize = idb->getBitBlockWidth();
    220     if (mStride == 0) {
    221         // Set the default kernel stride.
    222         mStride = blockSize;
    223     }
    224     IntegerType * const sizeTy = idb->getSizeTy();
    225 
    226     assert (mStreamSetInputs.size() == mStreamSetInputBuffers.size());
    227 //    assert (mStreamSetInputs.size() == mStreamSetInputLookahead.size());
    228 
    229     for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
    230 //        const auto requiredBlocks = codegen::SegmentSize + ((mStreamSetInputLookahead[i] + blockSize - 1) / blockSize);
    231 //        if ((mStreamSetInputBuffers[i]->getBufferBlocks() != 0) && (mStreamSetInputBuffers[i]->getBufferBlocks() < requiredBlocks)) {
    232 //            report_fatal_error(getName() + ": " + mStreamSetInputs[i].name + " requires buffer size " + std::to_string(requiredBlocks));
    233 //        }
    234         mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetInputs[i].name + BUFFER_PTR_SUFFIX);
    235         if ((i == 0) || !mStreamSetInputs[i].rate.isExact()) {
    236             addScalar(sizeTy, mStreamSetInputs[i].name + PROCESSED_ITEM_COUNT_SUFFIX);
    237         }
    238     }
    239 
    240     assert (mStreamSetOutputs.size() == mStreamSetOutputBuffers.size());
    241 
    242     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    243         mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetOutputs[i].name + BUFFER_PTR_SUFFIX);
    244         if ((mStreamSetInputs.empty() && (i == 0)) || !mStreamSetOutputs[i].rate.isExact()) {
    245             addScalar(sizeTy, mStreamSetOutputs[i].name + PRODUCED_ITEM_COUNT_SUFFIX);
    246         }
    247     }
    248     for (const auto & binding : mScalarInputs) {
    249         addScalar(binding.type, binding.name);
    250     }
    251     for (const auto & binding : mScalarOutputs) {
    252         addScalar(binding.type, binding.name);
    253     }
    254     if (mStreamMap.empty()) {
    255         prepareStreamSetNameMap();
    256     }
    257     for (const auto & binding : mInternalScalars) {
    258         addScalar(binding.type, binding.name);
    259     }
    260     Type * const consumerSetTy = StructType::get(sizeTy, sizeTy->getPointerTo()->getPointerTo(), nullptr)->getPointerTo();
    261     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    262         addScalar(consumerSetTy, mStreamSetOutputs[i].name + CONSUMER_SUFFIX);
    263     }
    264     addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
    265     addScalar(idb->getInt1Ty(), TERMINATION_SIGNAL);
    266     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    267         addScalar(sizeTy, mStreamSetOutputs[i].name + CONSUMED_ITEM_COUNT_SUFFIX);
    268     }
    269     // We compile in a 64-bit CPU cycle counter into every kernel.   It will remain unused
    270     // in normal execution, but when codegen::EnableCycleCounter is specified, pipelines
    271     // will be able to add instrumentation to cached modules without recompilation.
    272     addScalar(idb->getInt64Ty(), CYCLECOUNT_SCALAR);
    273 
    274     mKernelStateType = getModule()->getTypeByName(getName());
    275     if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
    276         report_fatal_error("Kernel " + getName() + " definition could not be found in the cache object");
    277     }
    278     processingRateAnalysis();
    279 }
    280    
    281 void Kernel::processingRateAnalysis() {
    282    
    283     const unsigned inputSetCount = mStreamSetInputs.size();
    284     const unsigned outputSetCount = mStreamSetOutputs.size();
    285     const unsigned totalSetCount = inputSetCount + outputSetCount;
    286    
    287     mItemsPerStride.resize(totalSetCount);
    288     mIsDerived.resize(totalSetCount);
    289 
    290     mItemsPerStride[0] = mStride;
    291     mIsDerived[0] = true;
    292    
    293     for (unsigned i = 0; i < inputSetCount; i++) {
    294         // Default reference stream set is the principal input stream set.
    295         auto & rate = mStreamSetInputs[i].rate;
    296         if (rate.referenceStreamSet() == "") {
    297             rate.setReferenceStreamSet(mStreamSetInputs[0].name);
    298         }
    299         Port port; unsigned ssIdx;
    300         std::tie(port, ssIdx) = getStreamPort(rate.referenceStreamSet());
    301         if ((port == Port::Output) || (ssIdx > i) || ((ssIdx == i) && (i > 0))) {
    302             report_fatal_error(getName() + ": input set " + mStreamSetInputs[i].name + ": forward or circular rate dependency");
    303         }
    304         if ((rate.isExact() || rate.isMaxRatio()) && mIsDerived[ssIdx]) {
    305             if ((mItemsPerStride[ssIdx] % rate.getRatioDenominator()) != 0) {
    306                 report_fatal_error(getName() + ": " + mStreamSetInputs[i].name + " processing rate denominator does not exactly divide items per stride.");
    307             }
    308             mItemsPerStride[i] = rate.calculateRatio(mItemsPerStride[ssIdx]);
    309             mIsDerived[i] = rate.isExact();
    310         }
    311         else {
    312             mIsDerived[i] = false;
    313             mItemsPerStride[i] = 0;  // For unknown input rate, no items will be copied to temp buffers.
    314         }
    315     }
    316    
    317     for (unsigned i = inputSetCount; i < totalSetCount; i++) {
    318         auto & rate = mStreamSetOutputs[i-inputSetCount].rate;
    319         // Default reference stream set is the principal input stream set for the principal output stream set.
    320         // Default reference stream set is the principal output stream set for other output stream sets.
    321         if (rate.referenceStreamSet() == "") {
    322             if ((mStreamSetInputs.size() > 0) && (i == inputSetCount)) {
    323                 rate.setReferenceStreamSet(mStreamSetInputs[0].name);
    324             }
    325             else {
    326                 rate.setReferenceStreamSet(mStreamSetOutputs[0].name);
    327             }
    328         }
    329         Port port; unsigned ssIdx;
    330         std::tie(port, ssIdx) = getStreamPort(rate.referenceStreamSet());
    331         if (port == Port::Output) ssIdx += inputSetCount;
    332         if ((ssIdx > i) || ((ssIdx == i) && (i > 0))) {
    333             report_fatal_error(getName() + ": output set " + mStreamSetOutputs[i].name + ": forward or circular rate dependency");
    334         }
    335         if ((rate.isExact() || rate.isMaxRatio()) && mIsDerived[ssIdx]) {
    336             if ((mItemsPerStride[ssIdx] % rate.getRatioDenominator()) != 0) {
    337                 report_fatal_error(getName() + ": " + mStreamSetOutputs[i-inputSetCount].name + " processing rate denominator does not exactly divide items per stride.");
    338             }
    339             mItemsPerStride[i] = rate.calculateRatio(mItemsPerStride[ssIdx]);
    340             mIsDerived[i] = rate.isExact();
    341         }
    342         else {
    343             mIsDerived[i] = false;
    344             mItemsPerStride[i] = 0;  // For unknown output rate, no items will be copied to temp buffers.
    345         }
    346     }
    347 }
    348 
    349    
    350 
    351 // Default kernel signature: generate the IR and emit as byte code.
     308
     309}
     310
     311
     312/** ------------------------------------------------------------------------------------------------------------- *
     313 * @brief makeSignature
     314 *
     315 * Default kernel signature: generate the IR and emit as byte code.
     316 ** ------------------------------------------------------------------------------------------------------------- */
    352317std::string Kernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    353318    assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
     
    363328}
    364329
     330
     331/** ------------------------------------------------------------------------------------------------------------- *
     332 * @brief generateKernel
     333 ** ------------------------------------------------------------------------------------------------------------- */
    365334void Kernel::generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    366335    assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
     
    383352}
    384353
     354
     355/** ------------------------------------------------------------------------------------------------------------- *
     356 * @brief callGenerateInitializeMethod
     357 ** ------------------------------------------------------------------------------------------------------------- */
    385358inline void Kernel::callGenerateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    386359    mCurrentMethod = getInitFunction(idb->getModule());
     
    390363    idb->CreateStore(ConstantAggregateZero::get(mKernelStateType), getInstance());
    391364    for (const auto & binding : mScalarInputs) {
    392         idb->setScalarField(binding.name, &*(args++));
     365        idb->setScalarField(binding.getName(), &*(args++));
    393366    }
    394367    for (const auto & binding : mStreamSetOutputs) {
    395         idb->setConsumerLock(binding.name, &*(args++));
     368        idb->setConsumerLock(binding.getName(), &*(args++));
    396369    }
    397370    generateInitializeMethod(idb);
     
    399372}
    400373
     374/** ------------------------------------------------------------------------------------------------------------- *
     375 * @brief callGenerateDoSegmentMethod
     376 ** ------------------------------------------------------------------------------------------------------------- */
    401377inline void Kernel::callGenerateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    402378    mCurrentMethod = getDoSegmentFunction(idb->getModule());
     
    405381    setInstance(&*(args++));
    406382    mIsFinal = &*(args++);
     383    mAvailablePrincipleItemCount = nullptr;
     384//    if (mHasPrincipleItemCount) {
     385//        mAvailablePrincipleItemCount = &*(args++);
     386//    }
    407387    const auto n = mStreamSetInputs.size();
    408388    mAvailableItemCount.resize(n, nullptr);
    409     for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
     389    for (unsigned i = 0; i < n; i++) {
     390//        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     391//        Value * itemCount = nullptr;
     392//        if (rate.isFixed()) {
     393//            itemCount = mAvailablePrincipleItemCount;
     394//            if (rate.getRate() != 1) {
     395//                itemCount = idb->CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getRate()));
     396//            }
     397//        } else if (rate.isBounded() || rate.isUnknown()) {
     398//            itemCount = &*(args++);
     399//        } else if (rate.isRelative()) {
     400//            for (unsigned j = 0; j < i; ++j) {
     401//                if (mStreamSetInputs[j].getName() == rate.getReference()) {
     402//                    itemCount = mAvailableItemCount[j];
     403//                    break;
     404//                }
     405//            }
     406//            if (LLVM_UNLIKELY(itemCount == nullptr)) {
     407//                report_fatal_error(mStreamSetInputs[i].getName() + " is declared before " + rate.getReference());
     408//            }
     409//            if (rate.getNumerator() != 1) {
     410//                itemCount = idb->CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getNumerator()));
     411//            }
     412//            if (rate.getDenominator() != 1) {
     413//                itemCount = idb->CreateUDiv(itemCount, ConstantInt::get(itemCount->getType(), rate.getDenominator()));
     414//            }
     415//        }
     416//        assert (itemCount);
     417//        mAvailableItemCount[i] = itemCount;
     418
     419        assert (args != mCurrentMethod->arg_end());
    410420        mAvailableItemCount[i] = &*(args++);
    411421    }
    412     generateDoSegmentMethod(idb); // must be overridden by the KernelBuilder subtype
     422    assert (args == mCurrentMethod->arg_end());
     423
     424    generateKernelMethod(idb); // must be overridden by the Kernel subtype
    413425    mIsFinal = nullptr;
    414426    mAvailableItemCount.clear();
     
    416428}
    417429
     430
     431/** ------------------------------------------------------------------------------------------------------------- *
     432 * @brief callGenerateFinalizeMethod
     433 ** ------------------------------------------------------------------------------------------------------------- */
    418434inline void Kernel::callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb) {
    419435    mCurrentMethod = getTerminateFunction(idb->getModule());
     
    421437    auto args = mCurrentMethod->arg_begin();
    422438    setInstance(&*(args++));
    423     generateFinalizeMethod(idb); // may be overridden by the KernelBuilder subtype
     439    generateFinalizeMethod(idb); // may be overridden by the Kernel subtype
    424440    const auto n = mScalarOutputs.size();
    425441    if (n == 0) {
     
    428444        Value * outputs[n];
    429445        for (unsigned i = 0; i < n; ++i) {
    430             outputs[i] = idb->getScalarField(mScalarOutputs[i].name);
     446            outputs[i] = idb->getScalarField(mScalarOutputs[i].getName());
    431447        }
    432448        if (n == 1) {
     
    438454}
    439455
     456
     457/** ------------------------------------------------------------------------------------------------------------- *
     458 * @brief getScalarIndex
     459 ** ------------------------------------------------------------------------------------------------------------- */
    440460unsigned Kernel::getScalarIndex(const std::string & name) const {
    441461    const auto f = mKernelMap.find(name);
    442462    if (LLVM_UNLIKELY(f == mKernelMap.end())) {
     463        assert (false);
    443464        report_fatal_error(getName() + " does not contain scalar: " + name);
    444465    }
     
    446467}
    447468
     469
     470/** ------------------------------------------------------------------------------------------------------------- *
     471 * @brief createInstance
     472 ** ------------------------------------------------------------------------------------------------------------- */
    448473Value * Kernel::createInstance(const std::unique_ptr<KernelBuilder> & idb) {
    449474    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     
    455480}
    456481
     482
     483/** ------------------------------------------------------------------------------------------------------------- *
     484 * @brief initializeInstance
     485 ** ------------------------------------------------------------------------------------------------------------- */
    457486void Kernel::initializeInstance(const std::unique_ptr<KernelBuilder> & idb) {
    458487    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     
    518547}
    519548
     549/** ------------------------------------------------------------------------------------------------------------- *
     550 * @brief finalizeInstance
     551 ** ------------------------------------------------------------------------------------------------------------- */
     552void Kernel::finalizeInstance(const std::unique_ptr<KernelBuilder> & idb) {
     553    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     554    mOutputScalarResult = idb->CreateCall(getTerminateFunction(idb->getModule()), { getInstance() });
     555}
     556
     557/** ------------------------------------------------------------------------------------------------------------- *
     558 * @brief getStreamPort
     559 ** ------------------------------------------------------------------------------------------------------------- */
     560Kernel::StreamPort Kernel::getStreamPort(const std::string & name) const {
     561    const auto f = mStreamMap.find(name);
     562    if (LLVM_UNLIKELY(f == mStreamMap.end())) {
     563        report_fatal_error(getName() + " does not contain stream set " + name);
     564    }
     565    return f->second;
     566}
     567
     568/** ------------------------------------------------------------------------------------------------------------- *
     569 * @brief generateKernelMethod
     570 ** ------------------------------------------------------------------------------------------------------------- */
     571void SegmentOrientedKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
     572
     573    Constant * const log2BlockWidth = b->getSize(std::log2(b->getBitBlockWidth()));
     574
     575    const auto inputSetCount = mStreamSetInputs.size();
     576    mStreamSetInputBufferPtr.resize(inputSetCount);
     577    for (unsigned i = 0; i < inputSetCount; ++i) {
     578        const auto & name = mStreamSetInputs[i].getName();
     579        Value * ic = b->getProcessedItemCount(name);
     580        Value * const blockIndex = b->CreateLShr(ic, log2BlockWidth);
     581        mStreamSetInputBufferPtr[i] = b->getInputStreamPtr(name, blockIndex);
     582    }
     583
     584    const auto outputSetCount = mStreamSetOutputs.size();
     585    mStreamSetOutputBufferPtr.resize(outputSetCount);
     586    for (unsigned i = 0; i < outputSetCount; ++i) {
     587        const auto & name = mStreamSetOutputs[i].getName();
     588        Value * ic = b->getProducedItemCount(name);
     589        Value * const blockIndex = b->CreateLShr(ic, log2BlockWidth);
     590        mStreamSetOutputBufferPtr[i] = b->getOutputStreamPtr(name, blockIndex);
     591    }
     592
     593    generateDoSegmentMethod(b);
     594
     595}
     596
     597/** ------------------------------------------------------------------------------------------------------------- *
     598 * @brief generateKernelMethod
     599 ** ------------------------------------------------------------------------------------------------------------- */
     600void MultiBlockKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & kb) {
     601
     602    const auto inputSetCount = mStreamSetInputs.size();
     603    const auto outputSetCount = mStreamSetOutputs.size();
     604    const auto totalSetCount = inputSetCount + outputSetCount;
     605
     606    // Scan through and see if any of our input streams is marked as the principle
     607
     608    bool hasPrinciple = false;
     609    unsigned principleInput = 0;
     610
     611    for (unsigned i = 0; i < inputSetCount; i++) {
     612        for (const auto attr : mStreamSetInputs[i].getAttributes()) {
     613            if (attr.isPrinciple()) {
     614                hasPrinciple = true;
     615                principleInput = i;
     616                break;
     617            }
     618        }
     619    }
     620
     621    // Now we iteratively process these blocks using the doMultiBlock method.
     622    // In each iteration, we check how many linearly accessible / writable
     623    // items can be processed with our current input / output buffers. If we
     624    // cannot support a full stride, we check whether (a) there is enough
     625    // input data to process but it is not linearly accessible, in which case
     626    // we move the data into temporary buffers or (b) there is not enough data
     627    // to process, in which case we abort unless IsFinal was set.
     628
     629    // Now proceed with creation of the doSegment method.
     630    BasicBlock * const doSegmentLoop = kb->CreateBasicBlock("DoSegmentLoop");
     631    kb->CreateBr(doSegmentLoop);
     632
     633    /// DO SEGMENT LOOP
     634
     635    kb->SetInsertPoint(doSegmentLoop);
     636
     637    // For each input buffer, determine the processedItemCount, the block pointer for the
     638    // buffer block containing the next item, and the number of linearly available items.
     639
     640    Value * processedItemCount[inputSetCount];
     641    Value * baseInputBuffer[inputSetCount];
     642    Value * unprocessed[inputSetCount];
     643    Value * linearlyAvailable[inputSetCount];
     644    Value * readableStrides[inputSetCount];
     645
     646    Constant * const log2BlockWidth = kb->getSize(std::log2(kb->getBitBlockWidth()));
     647
     648    Value * numOfStrides = nullptr;
     649
     650    for (unsigned i = 0; i < inputSetCount; i++) {
     651        const auto name = mStreamSetInputs[i].getName();
     652        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     653
     654        processedItemCount[i] = kb->getProcessedItemCount(name);
     655
     656        assert (processedItemCount[i]->getType() == mAvailableItemCount[i]->getType());
     657
     658        Value * const blockIndex = kb->CreateLShr(processedItemCount[i], log2BlockWidth);
     659        baseInputBuffer[i] = kb->getInputStreamPtr(name, blockIndex);
     660
     661        if (codegen::EnableAsserts) {
     662            kb->CreateAssert(kb->CreateICmpUGE(mAvailableItemCount[i], processedItemCount[i]),
     663                             "Processed item count cannot exceed the available item count");
     664        }
     665
     666        unprocessed[i] = kb->CreateSub(mAvailableItemCount[i], processedItemCount[i]);
     667
     668        //kb->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed[i]);
     669
     670        // INVESTIGATE: If the input rate of this stream is constant and known a priori, we could
     671        // avoid checking whether it is linearly accessible. Should we have an attribute for this?
     672
     673        linearlyAvailable[i] = kb->getLinearlyAccessibleItems(name, processedItemCount[i], unprocessed[i]);
     674
     675        //kb->CallPrintInt(getName() + "_" + name + "_linearlyAvailable", linearlyAvailable[i]);
     676
     677        readableStrides[i] = nullptr;
     678
     679        if (rate.isFixed() || rate.isBounded()) {
     680            Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
     681            readableStrides[i] = kb->CreateUDiv(linearlyAvailable[i], maxStrideSize);
     682            if (numOfStrides) {
     683                numOfStrides = kb->CreateUMin(numOfStrides, readableStrides[i]);
     684            } else {
     685                numOfStrides = readableStrides[i];
     686            }
     687        }
     688    }
     689
     690    //kb->CallPrintInt(getName() + "_numOfStrides", numOfStrides);
     691
     692    // Now determine the linearly writeable blocks, based on available blocks reduced
     693    // by limitations of output buffer space.
     694
     695    Value * producedItemCount[outputSetCount];
     696    Value * baseOutputBuffer[outputSetCount];
     697    Value * writableStrides[outputSetCount];
     698    Value * linearlyWritable[outputSetCount];
     699
     700    for (unsigned i = 0; i < outputSetCount; i++) {
     701        const auto & name = mStreamSetOutputs[i].getName();
     702        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     703        producedItemCount[i] = kb->getProducedItemCount(name);
     704
     705        //kb->CallPrintInt(getName() + "_" + name + "_producedItemCount", producedItemCount[i]);
     706
     707        Value * const blockIndex = kb->CreateLShr(producedItemCount[i], log2BlockWidth);
     708        baseOutputBuffer[i] = kb->getOutputStreamPtr(name, blockIndex);
     709        linearlyWritable[i] = nullptr;
     710        writableStrides[i] = nullptr;
     711        if (rate.isFixed() || rate.isBounded()) {
     712            linearlyWritable[i] = kb->getLinearlyWritableItems(name, producedItemCount[i]);
     713
     714            //kb->CallPrintInt(getName() + "_" + name + "_linearlyWritable", linearlyWritable[i]);
     715
     716            Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
     717            writableStrides[i] = kb->CreateUDiv(linearlyWritable[i], maxStrideSize);
     718            if (numOfStrides) {
     719                numOfStrides = kb->CreateUMin(numOfStrides, writableStrides[i]);
     720            } else {
     721                numOfStrides = writableStrides[i];
     722            }
     723        }
     724    }
     725
     726    //kb->CallPrintInt(getName() + "_numOfStrides'", numOfStrides);
     727
     728    for (unsigned i = 0; i < inputSetCount; i++) {
     729        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     730        if (rate.isFixed()) {
     731            mAvailableItemCount[i] = kb->CreateMul(numOfStrides, kb->getSize(rate.getRate() * mStride));
     732        } else {
     733            mAvailableItemCount[i] = linearlyAvailable[i];
     734        }
     735
     736        //kb->CallPrintInt(getName() + "_" + mStreamSetInputs[i].getName() + "_avail", mAvailableItemCount[i]);
     737    }
     738
     739    // Define and allocate the temporary buffer area.
     740    Type * tempBuffers[totalSetCount];
     741    for (unsigned i = 0; i < inputSetCount; ++i) {
     742        Type * bufType = baseInputBuffer[i]->getType()->getPointerElementType();
     743        assert (baseInputBuffer[i]->getType()->getPointerAddressSpace() == 0);
     744        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     745        unsigned count = 0;
     746        if (rate.isFixed()) {
     747            count = rate.getRate();
     748        } else if (rate.isBounded()) {
     749            count = rate.getUpperBound() + 2;
     750        }
     751        tempBuffers[i] = ArrayType::get(bufType, count);
     752    }
     753    for (unsigned i = 0; i < outputSetCount; i++) {
     754        Type * const bufType = baseOutputBuffer[i]->getType()->getPointerElementType();
     755        assert (baseOutputBuffer[i]->getType()->getPointerAddressSpace() == 0);
     756        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     757        unsigned count = 0;
     758        if (rate.isFixed()) {
     759            count = rate.getRate();
     760        } else if (rate.isBounded()) {
     761            count = rate.getUpperBound() + 2;
     762        }
     763        tempBuffers[i + inputSetCount] = ArrayType::get(bufType, count);
     764    }
     765
     766    Type * const tempParameterStructType = StructType::create(kb->getContext(), ArrayRef<Type *>(tempBuffers, totalSetCount));
     767
     768    Value * const tempBufferArea = kb->CreateCacheAlignedAlloca(tempParameterStructType);
     769
     770    BasicBlock * const temporaryBufferCheck = kb->CreateBasicBlock("temporaryBufferCheck");
     771    BasicBlock * const doMultiBlock = kb->CreateBasicBlock("doMultiBlock");
     772    BasicBlock * const copyToTemporaryBuffers = kb->CreateBasicBlock("copyToTemporaryBuffers");
     773    BasicBlock * const segmentDone = kb->CreateBasicBlock("segmentDone");
     774
     775    Value * const hasFullStride = numOfStrides ? kb->CreateICmpNE(numOfStrides, kb->getSize(0)) : kb->getTrue();
     776    kb->CreateCondBr(hasFullStride, doMultiBlock, temporaryBufferCheck);
     777
     778    // We use temporary buffers in 3 different cases that preclude full stride processing.
     779
     780    //  (a) One or more input buffers does not have a sufficient number of input items linearly available.
     781    //  (b) One or more output buffers does not have sufficient linearly available buffer space.
     782    //  (c) We have processed all the full strides of input and only the final block remains.
     783
     784    kb->SetInsertPoint(temporaryBufferCheck);
     785
     786    // Even if we copy the input data into linear arrays, is there enough data to perform this stride?
     787    // If not, proceed only if this is our final block.
     788    Value * hasFullFragmentedStride = nullptr;
     789    for (unsigned i = 0; i < inputSetCount; i++) {
     790        const ProcessingRate & r = mStreamSetInputs[i].getRate();
     791        if (r.isBounded() || (r.isUnknown() && r.getLowerBound() > 0)) {
     792            const auto l = r.isBounded() ? r.getUpperBound() : r.getLowerBound();
     793            Constant * const strideSize = kb->getSize(l * mStride);
     794            Value * enoughAvail = kb->CreateICmpUGE(unprocessed[i], strideSize);
     795            if (hasFullFragmentedStride) {
     796                hasFullFragmentedStride = kb->CreateAnd(hasFullFragmentedStride, enoughAvail);
     797            } else {
     798                hasFullFragmentedStride = enoughAvail;
     799            }
     800        }
     801    }
     802
     803    Value * hasFragmentedOrFinalStride = nullptr;
     804    if (hasFullFragmentedStride) {
     805        hasFragmentedOrFinalStride = kb->CreateOr(hasFullFragmentedStride, mIsFinal);
     806        // Although this might be the final segment, we may have a full fragmented stride to process prior
     807        // to the actual final stride.
     808        mIsFinal = kb->CreateAnd(mIsFinal, kb->CreateNot(hasFullFragmentedStride));
     809    } else {
     810        hasFragmentedOrFinalStride = mIsFinal;
     811    }
     812    kb->CreateCondBr(hasFragmentedOrFinalStride, copyToTemporaryBuffers, segmentDone);
     813
     814    /// COPY TO TEMPORARY BUFFERS
     815    kb->SetInsertPoint(copyToTemporaryBuffers);
     816
     817    kb->CreateAlignedStore(Constant::getNullValue(tempParameterStructType), tempBufferArea, kb->getCacheAlignment());
     818
     819    // For each input and output buffer, copy over necessary data starting from the last block boundary.
     820
     821    Value * temporaryInputBuffer[inputSetCount];
     822    Value * temporaryAvailable[inputSetCount];
     823
     824    for (unsigned i = 0; i < inputSetCount; i++) {
     825        temporaryInputBuffer[i] = baseInputBuffer[i];
     826        if (readableStrides[i]) {
     827            const auto name = mStreamSetInputs[i].getName();
     828            const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     829            assert (rate.getUpperBound() > 0);
     830            Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
     831            temporaryAvailable[i] = kb->CreateUMin(unprocessed[i], maxStrideSize);
     832
     833            BasicBlock * entry = kb->GetInsertBlock();
     834            BasicBlock * copy = kb->CreateBasicBlock(name + "Copy");
     835            BasicBlock * resume = kb->CreateBasicBlock(name + "ResumeCopy");
     836            Value * const test = kb->CreateOr(kb->CreateICmpNE(readableStrides[i], kb->getSize(0)), mIsFinal);
     837            kb->CreateCondBr(test, resume, copy);
     838
     839            kb->SetInsertPoint(copy);
     840            Value * const tempBufferPtr = kb->CreateGEP(tempBufferArea, {kb->getInt32(0), kb->getInt32(i), kb->getInt32(0)});
     841            assert (tempBufferPtr->getType() == baseInputBuffer[i]->getType());
     842            Value * const neededItems = linearlyAvailable[i];
     843            Value * const bytesCopied = kb->copy(name, tempBufferPtr, baseInputBuffer[i], neededItems);
     844            Value * const nextInputPtr = kb->getRawInputPointer(name, kb->getSize(0));
     845            Value * const remaining = kb->CreateSub(temporaryAvailable[i], neededItems);
     846            Value * nextBufPtr = kb->CreatePointerCast(tempBufferPtr, kb->getInt8PtrTy());
     847            nextBufPtr = kb->CreateGEP(nextBufPtr, bytesCopied);
     848            kb->copy(name, nextBufPtr, nextInputPtr, remaining);
     849
     850            kb->CreateBr(resume);
     851
     852            kb->SetInsertPoint(resume);
     853            PHINode * bufferPtr = kb->CreatePHI(baseInputBuffer[i]->getType(), 2);
     854            bufferPtr->addIncoming(baseInputBuffer[i], entry);
     855            bufferPtr->addIncoming(tempBufferPtr, copy);
     856            temporaryInputBuffer[i] = bufferPtr;
     857        }
     858    }
     859
     860    Value * temporaryOutputBuffer[outputSetCount];
     861    for (unsigned i = 0; i < outputSetCount; i++) {
     862        temporaryOutputBuffer[i] = baseOutputBuffer[i];
     863        if (writableStrides[i]) {
     864            const auto name = mStreamSetOutputs[i].getName();
     865
     866            BasicBlock * const entry = kb->GetInsertBlock();
     867            BasicBlock * const copy = kb->CreateBasicBlock(name + "Copy");
     868            BasicBlock * const resume = kb->CreateBasicBlock(name + "ResumeCopy");
     869
     870            Value * const test = kb->CreateOr(kb->CreateICmpNE(writableStrides[i], kb->getSize(0)), mIsFinal);
     871            kb->CreateCondBr(test, resume, copy);
     872
     873            kb->SetInsertPoint(copy);
     874            Value * const tempBufferPtr = kb->CreateGEP(tempBufferArea,  {kb->getInt32(0), kb->getInt32(inputSetCount + i), kb->getInt32(0)});
     875            assert (tempBufferPtr->getType() == baseOutputBuffer[i]->getType());
     876            Value * const itemsToCopy = kb->CreateAnd(producedItemCount[i], kb->getSize(kb->getBitBlockWidth() - 1));
     877            kb->copy(name, tempBufferPtr, baseOutputBuffer[i], itemsToCopy);
     878            kb->CreateBr(resume);
     879
     880            kb->SetInsertPoint(resume);
     881            PHINode * bufferPtr = kb->CreatePHI(tempBufferPtr->getType(), 2);
     882            bufferPtr->addIncoming(baseOutputBuffer[i], entry);
     883            bufferPtr->addIncoming(tempBufferPtr, copy);
     884            temporaryOutputBuffer[i] = bufferPtr;
     885        }
     886    }
     887
     888    kb->CreateBr(doMultiBlock);
     889    BasicBlock * const usingTemporaryBuffers = kb->GetInsertBlock();
     890    doMultiBlock->moveAfter(usingTemporaryBuffers);
     891
     892    /// DO MULTI BLOCK
     893
     894    //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
     895    //  Now prepare the doMultiBlock call.
     896    kb->SetInsertPoint(doMultiBlock);
     897
     898    PHINode * const isFinal = kb->CreatePHI(mIsFinal->getType(), 2);
     899    isFinal->addIncoming(kb->getFalse(), doSegmentLoop);
     900    isFinal->addIncoming(mIsFinal, usingTemporaryBuffers);
     901    mIsFinal = isFinal;
     902
     903    mStreamSetInputBufferPtr.resize(inputSetCount);
     904    for (unsigned i = 0; i < inputSetCount; ++i) {
     905        assert (baseInputBuffer[i] && temporaryInputBuffer[i]);
     906        if (baseInputBuffer[i] != temporaryInputBuffer[i]) {
     907            PHINode * const avail = kb->CreatePHI(kb->getSizeTy(), 2);
     908            avail->addIncoming(mAvailableItemCount[i], doSegmentLoop);
     909            avail->addIncoming(temporaryAvailable[i], usingTemporaryBuffers);
     910            mAvailableItemCount[i] = avail;
     911            PHINode * const bufferPtr = kb->CreatePHI(baseInputBuffer[i]->getType(), 2);
     912            bufferPtr->addIncoming(baseInputBuffer[i], doSegmentLoop);
     913            assert (baseInputBuffer[i]->getType() == temporaryInputBuffer[i]->getType());
     914            bufferPtr->addIncoming(temporaryInputBuffer[i], usingTemporaryBuffers);
     915            temporaryInputBuffer[i] = bufferPtr;
     916        }
     917        mStreamSetInputBufferPtr[i] = temporaryInputBuffer[i];
     918    }
     919
     920    mStreamSetOutputBufferPtr.resize(outputSetCount);
     921    for (unsigned i = 0; i < outputSetCount; ++i) {
     922        assert (baseOutputBuffer[i] && temporaryOutputBuffer[i]);
     923        if (baseOutputBuffer[i] != temporaryOutputBuffer[i]) {
     924            PHINode * const bufferPtr = kb->CreatePHI(baseOutputBuffer[i]->getType(), 2);
     925            bufferPtr->addIncoming(baseOutputBuffer[i], doSegmentLoop);
     926            assert (baseOutputBuffer[i]->getType() == temporaryOutputBuffer[i]->getType());
     927            bufferPtr->addIncoming(temporaryOutputBuffer[i], usingTemporaryBuffers);
     928            temporaryOutputBuffer[i] = bufferPtr;
     929        }
     930        mStreamSetOutputBufferPtr[i] = temporaryOutputBuffer[i];
     931    }
     932
     933    // Now use the generateMultiBlockLogic method of the MultiBlockKernelBuilder subtype to
     934    // provide the required multi-block kernel logic.
     935    generateMultiBlockLogic(kb, numOfStrides);
     936
     937    // If we have no fixed rate inputs, we won't know when we're done parsing until we test
     938    // whether any input data was processed.
     939    bool mayMakeNoProgress = true;
     940
     941    // Update the processed item count of any Fixed input or output stream. While doing so, also
     942    // calculate the LCM of their rates. The LCM is used to calculate the final item counts.
     943
     944    unsigned rateLCM = 1;
     945
     946    for (unsigned i = 0; i < inputSetCount; ++i) {
     947        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     948        if (rate.isFixed()) {
     949            mayMakeNoProgress = false;
     950            rateLCM = lcm(rateLCM, rate.getRate());
     951            Value * const processed = mAvailableItemCount[i]; // kb->CreateMul(numOfStrides, kb->getSize(mStride * rate.getRate()));
     952            Value * const ic = kb->CreateAdd(processedItemCount[i], processed);
     953            kb->setProcessedItemCount(mStreamSetInputs[i].getName(), ic);
     954        }
     955    }
     956
     957    for (unsigned i = 0; i < outputSetCount; ++i) {
     958        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     959        if (rate.isFixed()) {
     960            rateLCM = lcm(rateLCM, rate.getRate());
     961            Value * const produced = kb->CreateMul(numOfStrides, kb->getSize(mStride * rate.getRate()));
     962            Value * const ic = kb->CreateAdd(producedItemCount[i], produced);
     963            kb->setProducedItemCount(mStreamSetOutputs[i].getName(), ic);
     964        }
     965    }
     966
     967    BasicBlock * const finalStrideCheck = kb->CreateBasicBlock("finalStrideCheck");
     968    BasicBlock * const finalStrideAdjustment = kb->CreateBasicBlock("finalStrideAdjustment");
     969    BasicBlock * const standardCopyBack = kb->CreateBasicBlock("standardCopyBack");
     970    BasicBlock * const temporaryBufferCopyBack = kb->CreateBasicBlock("temporaryBufferCopyBack");
     971
     972    kb->CreateLikelyCondBr(hasFullStride, standardCopyBack, finalStrideCheck);
     973
     974
     975    /// FINAL STRIDE CHECK
     976    kb->SetInsertPoint(finalStrideCheck);
     977    kb->CreateUnlikelyCondBr(mIsFinal, finalStrideAdjustment, temporaryBufferCopyBack);
     978
     979    /// FINAL STRIDE ADJUSTMENT
     980    kb->SetInsertPoint(finalStrideAdjustment);
     981
     982    // If this is our final stride, adjust the Fixed output item counts. The main loop assumes that
     983    // the ITEM COUNT % FIXED RATE = 0 for all Fixed Input and Output streams. We correct that here
     984    // to calculate them based on the actual input item counts.
     985
     986    // NOTE: This appears overly complex to avoid an integer overflow without reducing the maximum
     987    // integer size. For each Fixed output stream, this calculates:
     988
     989    //       CEILING(MIN(Total Available Item Count / Fixed Input Rate) * Fixed Output Rate)
     990
     991    Value * basePreviouslyProcessedItemCount = nullptr;
     992    Value * scaledInverseOfStrideItemCount = nullptr;
     993
     994    for (unsigned i = 0; i < inputSetCount; ++i) {
     995        const ProcessingRate & r = mStreamSetInputs[i].getRate();
     996        if (r.isFixed()) {
     997            assert (rateLCM % r.getRate() == 0);
     998            Value * const a = kb->CreateMul(mAvailableItemCount[i], kb->getSize(rateLCM / r.getRate())); // unprocessed
     999            Value * const p = kb->CreateUDiv(processedItemCount[i], kb->getSize(r.getRate()));
     1000            if (scaledInverseOfStrideItemCount) {
     1001                scaledInverseOfStrideItemCount = kb->CreateUMin(scaledInverseOfStrideItemCount, a);
     1002                basePreviouslyProcessedItemCount = kb->CreateUMin(basePreviouslyProcessedItemCount, p);
     1003            } else {
     1004                scaledInverseOfStrideItemCount = a;
     1005                basePreviouslyProcessedItemCount = p;
     1006            }
     1007        }
     1008//        const auto name = mStreamSetInputs[i].getName();
     1009//        Value * const processed = kb->CreateAdd(processedItemCount[i], unprocessed[i]);
     1010//        kb->setProcessedItemCount(name, processed);
     1011    }
     1012
     1013    for (unsigned i = 0; i < outputSetCount; ++i) {
     1014        const auto name = mStreamSetOutputs[i].getName();
     1015        const ProcessingRate & r = mStreamSetOutputs[i].getRate();
     1016        Value * produced = nullptr;
     1017        if (r.isFixed()) {
     1018            assert (rateLCM % r.getRate() == 0);
     1019            assert (basePreviouslyProcessedItemCount && scaledInverseOfStrideItemCount);
     1020            Value * const p = kb->CreateMul(basePreviouslyProcessedItemCount, kb->getSize(r.getRate()));
     1021            Value * const ic = kb->CreateUDivCeil(scaledInverseOfStrideItemCount, kb->getSize(rateLCM / r.getRate()));
     1022            produced = kb->CreateAdd(p, ic);
     1023        } else { // check if we have an attribute; if so, get the current produced count and adjust it
     1024            bool noAttributes = true;
     1025            for (const Attribute & attr : mStreamSetOutputs[i].getAttributes()) {
     1026                if (attr.isAdd() || attr.isRoundUpTo()) {
     1027                    noAttributes = false;
     1028                    break;
     1029                }
     1030            }
     1031            if (noAttributes) {
     1032                continue;
     1033            }
     1034            produced = kb->getProducedItemCount(name);
     1035        }
     1036        for (const Attribute & attr : mStreamSetOutputs[i].getAttributes()) {
     1037            if (attr.isAdd()) {
     1038                produced = kb->CreateAdd(produced, kb->getSize(attr.getAmount()));
     1039            } else if (attr.isRoundUpTo()) {
     1040                produced = kb->CreateRoundUp(produced, kb->getSize(attr.getAmount()));
     1041            }
     1042        }
     1043        kb->setProducedItemCount(name, produced);
     1044    }
     1045
     1046    kb->CreateBr(temporaryBufferCopyBack);
     1047
     1048    /// TEMPORARY BUFFER COPY BACK
     1049    kb->SetInsertPoint(temporaryBufferCopyBack);
     1050
     1051    // Copy back data to the actual output buffers.
     1052    for (unsigned i = 0; i < outputSetCount; i++) {
     1053
     1054        if (baseOutputBuffer[i] != temporaryOutputBuffer[i]) {
     1055
     1056            const auto name = mStreamSetOutputs[i].getName();
     1057
     1058            BasicBlock * const copy = kb->CreateBasicBlock(name + "CopyBack");
     1059            BasicBlock * const resume = kb->CreateBasicBlock(name + "ResumeCopyBack");
     1060            Value * const usedTemporary = kb->CreateICmpNE(temporaryOutputBuffer[i], baseOutputBuffer[i]);
     1061
     1062            // If we used a temporary buffer ...
     1063            kb->CreateCondBr(usedTemporary, copy, resume);
     1064
     1065            kb->SetInsertPoint(copy);
     1066            Value * bytesCopied = kb->copy(name, baseOutputBuffer[i], temporaryOutputBuffer[i], linearlyWritable[i]);
     1067            Value * nextOutputPtr = kb->getRawOutputPointer(name, kb->getSize(0));
     1068            Value * producedCount = kb->getProducedItemCount(name);
     1069
     1070            Value * remaining = kb->CreateSub(producedCount, linearlyWritable[i]);
     1071            Value * nextBufPtr = kb->CreatePointerCast(temporaryOutputBuffer[i], kb->getInt8PtrTy());
     1072            nextBufPtr = kb->CreateGEP(nextBufPtr, bytesCopied);
     1073
     1074            kb->copy(name, nextOutputPtr, nextBufPtr, remaining);
     1075            kb->CreateBr(resume);
     1076
     1077            kb->SetInsertPoint(resume);
     1078        }
     1079    }
     1080
     1081    //  We've dealt with the partial block processing and copied information back into the
     1082    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
     1083    BasicBlock * setTermination = nullptr;
     1084    if (hasNoTerminateAttribute()) {
     1085        kb->CreateCondBr(mIsFinal, segmentDone, standardCopyBack);
     1086    } else {
     1087        setTermination = kb->CreateBasicBlock("setTermination");
     1088        kb->CreateCondBr(mIsFinal, setTermination, standardCopyBack);
     1089    }
     1090
     1091    /// STANDARD COPY BACK
     1092    kb->SetInsertPoint(standardCopyBack);
     1093
     1094    // Do copybacks if necessary.
     1095    for (unsigned i = 0; i < outputSetCount; i++) {
     1096        if (mStreamSetOutputBuffers[i]->supportsCopyBack()) {
     1097            const auto name = mStreamSetOutputs[i].getName();
     1098            Value * newProduced = kb->getProducedItemCount(name);
     1099            kb->CreateCopyBack(name, producedItemCount[i], newProduced);
     1100        }
     1101    }
     1102
     1103    // If it is possible to make no progress, verify we processed some of the input. If we haven't,
     1104    // we're finished with this segment.
     1105    if (mayMakeNoProgress) {
     1106        Value * madeProgress = nullptr;
     1107        for (unsigned i = 0; i < inputSetCount; ++i) {
     1108            Value * const processed = kb->getProcessedItemCount(mStreamSetInputs[i].getName());
     1109            Value * const progress = kb->CreateICmpNE(processed, processedItemCount[i]);
     1110            if (madeProgress) {
     1111                madeProgress = kb->CreateOr(madeProgress, progress);
     1112            } else {
     1113                madeProgress = progress;
     1114            }
     1115        }
     1116        assert (madeProgress);
     1117        kb->CreateCondBr(madeProgress, doSegmentLoop, segmentDone);
     1118    } else {
     1119        kb->CreateBr(doSegmentLoop);
     1120    }
     1121
     1122    if (hasNoTerminateAttribute()) {
     1123        segmentDone->moveAfter(kb->GetInsertBlock());
     1124    } else {
     1125        /// SET TERMINATION
     1126        setTermination->moveAfter(kb->GetInsertBlock());
     1127        kb->SetInsertPoint(setTermination);
     1128        kb->setTerminationSignal();
     1129        kb->CreateBr(segmentDone);
     1130        segmentDone->moveAfter(setTermination);
     1131    }
     1132
     1133    kb->SetInsertPoint(segmentDone);
     1134
     1135}
     1136
     1137//bool MultiBlockKernel::requiresCopyBack(const ProcessingRate & rate) const {
     1138//    if (rate.isBounded() || rate.isUnknown()) {
     1139//        return true;
     1140//    } else if (rate.isDirectlyRelative()) {
     1141//        Port port; unsigned i;
     1142//        std::tie(port, i) = getStreamPort(rate.getReference());
     1143//        const auto & binding = (port == Port::Input) ? mStreamSetInputs[i] : mStreamSetOutputs[i];
     1144//        return requiresCopyBack(binding.getRate());
     1145//    }
     1146//    return false;
     1147//}
     1148
    5201149//  The default doSegment method dispatches to the doBlock routine for
    5211150//  each block of the given number of blocksToDo, and then updates counts.
    5221151
    523 void BlockOrientedKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & idb) {
     1152void BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * const numOfStrides) {
     1153
    5241154    BasicBlock * const entryBlock = idb->GetInsertBlock();
    5251155    BasicBlock * const strideLoopCond = idb->CreateBasicBlock(getName() + "_strideLoopCond");
     
    5341164    }
    5351165
    536     ConstantInt * stride = idb->getSize(idb->getStride());
    537     Value * availablePos = mAvailableItemCount[0];
    538     Value * processed = idb->getProcessedItemCount(mStreamSetInputs[0].name);
    539     Value * itemsAvail = idb->CreateSub(availablePos, processed);
    540     Value * stridesToDo = idb->CreateUDiv(itemsAvail, stride);
     1166    Constant * const log2BlockSize = idb->getSize(std::log2(idb->getBitBlockWidth()));
     1167
     1168    const auto inputSetCount = mStreamSetInputs.size();
     1169    Value * baseProcessedIndex[inputSetCount];
     1170    for (unsigned i = 0; i < inputSetCount; ++i) {
     1171        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     1172        if (rate.isFixed()) {
     1173            baseProcessedIndex[i] = nullptr;
     1174        } else {
     1175            Value * ic = idb->getProcessedItemCount(mStreamSetInputs[i].getName());
     1176            ic = idb->CreateLShr(ic, log2BlockSize);
     1177            baseProcessedIndex[i] = ic;
     1178        }
     1179    }
     1180
     1181    const auto outputSetCount = mStreamSetOutputs.size();
     1182    Value * baseProducedIndex[outputSetCount];
     1183    for (unsigned i = 0; i < outputSetCount; ++i) {
     1184        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     1185        if (rate.isFixed()) {
     1186            baseProducedIndex[i] = nullptr;
     1187        } else {
     1188            Value * ic = idb->getProducedItemCount(mStreamSetOutputs[i].getName());
     1189            ic = idb->CreateLShr(ic, log2BlockSize);
     1190            baseProducedIndex[i] = ic;
     1191        }
     1192    }
     1193
     1194    Value * const numOfBlocksToProcess = idb->CreateMul(numOfStrides, idb->getSize(mStride / idb->getBitBlockWidth()));
    5411195
    5421196    idb->CreateBr(strideLoopCond);
    5431197
     1198    /// BLOCK COND
     1199
    5441200    idb->SetInsertPoint(strideLoopCond);
    5451201
    5461202    PHINode * branchTarget = nullptr;
    547     if (idb->supportsIndirectBr()) {
     1203    if (baseTarget) {
    5481204        branchTarget = idb->CreatePHI(baseTarget->getType(), 2, "branchTarget");
    5491205        branchTarget->addIncoming(baseTarget, entryBlock);
    5501206    }
    5511207
    552     PHINode * const stridesRemaining = idb->CreatePHI(idb->getSizeTy(), 2, "stridesRemaining");
    553     stridesRemaining->addIncoming(stridesToDo, entryBlock);
    554     // NOTE: stridesRemaining may go to a negative number in the final block if the generateFinalBlockMethod(...)
    555     // calls CreateDoBlockMethodCall(). Do *not* replace the comparator with an unsigned one!
    556     Value * notDone = idb->CreateICmpSGT(stridesRemaining, idb->getSize(0));
     1208    PHINode * const blockIndex = idb->CreatePHI(idb->getSizeTy(), 2, "index");
     1209    blockIndex->addIncoming(idb->getSize(0), entryBlock);
     1210
     1211    for (unsigned i = 0; i < inputSetCount; ++i) {
     1212        Value * offset = blockIndex;
     1213        if (baseProcessedIndex[i]) {
     1214            offset = idb->getProcessedItemCount(mStreamSetInputs[i].getName());
     1215            offset = idb->CreateLShr(offset, log2BlockSize);
     1216            offset = idb->CreateSub(offset, baseProcessedIndex[i]);
     1217        }
     1218        mStreamSetInputBufferPtr[i] = idb->CreateGEP(mStreamSetInputBufferPtr[i], offset);
     1219    }
     1220
     1221    for (unsigned i = 0; i < outputSetCount; ++i) {
     1222        Value * offset = blockIndex;
     1223        if (baseProducedIndex[i]) {
     1224            offset = idb->getProducedItemCount(mStreamSetOutputs[i].getName());
     1225            offset = idb->CreateLShr(offset, log2BlockSize);
     1226            offset = idb->CreateSub(offset, baseProducedIndex[i]);
     1227        }
     1228        mStreamSetOutputBufferPtr[i] = idb->CreateGEP(mStreamSetOutputBufferPtr[i], offset);
     1229    }
     1230
     1231    Value * const notDone = idb->CreateICmpULT(blockIndex, numOfBlocksToProcess);
    5571232    idb->CreateLikelyCondBr(notDone, mStrideLoopBody, stridesDone);
     1233
     1234    /// BLOCK BODY
    5581235
    5591236    idb->SetInsertPoint(mStrideLoopBody);
     
    5681245    writeDoBlockMethod(idb);
    5691246
    570     /// UPDATE PROCESSED COUNTS
    571 
    572     processed = idb->getProcessedItemCount(mStreamSetInputs[0].name);
    573     Value * itemsDone = idb->CreateAdd(processed, stride);
    574     idb->setProcessedItemCount(mStreamSetInputs[0].name, itemsDone);
    575 
    576     stridesRemaining->addIncoming(idb->CreateSub(stridesRemaining, idb->getSize(1)), idb->GetInsertBlock());
    577 
    578     BasicBlock * bodyEnd = idb->GetInsertBlock();
    579     if (idb->supportsIndirectBr()) {
     1247    BasicBlock * const bodyEnd = idb->GetInsertBlock();
     1248    blockIndex->addIncoming(idb->CreateAdd(blockIndex, idb->getSize(1)), bodyEnd);
     1249    if (branchTarget) {
    5801250        branchTarget->addIncoming(mStrideLoopTarget, bodyEnd);
    5811251    }
     
    5841254    stridesDone->moveAfter(bodyEnd);
    5851255
     1256    /// STRIDE DONE
     1257
    5861258    idb->SetInsertPoint(stridesDone);
    5871259
    5881260    // Now conditionally perform the final block processing depending on the doFinal parameter.
    589     if (idb->supportsIndirectBr()) {
     1261    if (branchTarget) {
    5901262        mStrideLoopBranch = idb->CreateIndirectBr(branchTarget, 3);
    5911263        mStrideLoopBranch->addDestination(doFinalBlock);
     
    5991271    idb->SetInsertPoint(doFinalBlock);
    6001272
    601     Value * remainingItems = idb->CreateSub(mAvailableItemCount[0], idb->getProcessedItemCount(mStreamSetInputs[0].name));
     1273    Value * remainingItems = nullptr;
     1274    for (unsigned i = 0; i < inputSetCount; ++i) {
     1275        const ProcessingRate & r = mStreamSetInputs[i].getRate();
     1276        if (r.isFixed()) {
     1277            Value * ic = idb->CreateUDiv(mAvailableItemCount[i], idb->getSize(r.getRate()));
     1278            if (remainingItems) {
     1279                remainingItems = idb->CreateUMax(remainingItems, ic);
     1280            } else {
     1281                remainingItems = ic;
     1282            }
     1283        }
     1284    }
    6021285
    6031286    writeFinalBlockMethod(idb, remainingItems);
    6041287
    605     itemsDone = mAvailableItemCount[0];
    606     idb->setProcessedItemCount(mStreamSetInputs[0].name, itemsDone);
    607     idb->setTerminationSignal();
    6081288    idb->CreateBr(segmentDone);
    6091289
     
    6131293
    6141294    // Update the branch prediction metadata to indicate that the likely target will be segmentDone
    615     if (idb->supportsIndirectBr()) {
     1295    if (branchTarget) {
    6161296        MDBuilder mdb(idb->getContext());
    6171297        const auto destinations = mStrideLoopBranch->getNumDestinations();
     
    6331313    std::vector<Value *> availableItemCount(0);
    6341314
    635     /// Check if the do block method is called and create the function if necessary   
     1315    /// Check if the do block method is called and create the function if necessary
    6361316    if (!idb->supportsIndirectBr()) {
    6371317
     
    6601340    }
    6611341
    662     std::vector<Value *> priorProduced;
    663     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    664         if (mStreamSetOutputBuffers[i]->supportsCopyBack())  {
    665             priorProduced.push_back(idb->getProducedItemCount(mStreamSetOutputs[i].name));
    666         }
    667     }
    668 
    6691342    generateDoBlockMethod(idb); // must be implemented by the BlockOrientedKernelBuilder subtype
    670 
    671     unsigned priorIdx = 0;
    672     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    673         if (mStreamSetOutputBuffers[i]->supportsCopyBack()) {
    674             Value * newProduced = idb->getProducedItemCount(mStreamSetOutputs[i].name);
    675             Value * handle = idb->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
    676             mStreamSetOutputBuffers[i]->genCopyBackLogic(idb.get(), handle, priorProduced[priorIdx], newProduced, mStreamSetOutputs[i].name);
    677             priorIdx++;
    678         }
    679     }
    6801343
    6811344    if (!idb->supportsIndirectBr()) {
     
    7281391
    7291392    generateFinalBlockMethod(idb, remainingItems); // may be implemented by the BlockOrientedKernel subtype
    730 
    731     RecursivelyDeleteTriviallyDeadInstructions(remainingItems); // if remainingItems was not used, this will eliminate it.
    7321393
    7331394    if (!idb->supportsIndirectBr()) {
     
    7741435}
    7751436
    776 void MultiBlockKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) {
    777 
    778     // Stream set and buffer analysis.  When near the end of buffers
    779     // or for final block processing, data for each streamset may need
    780     // to be copied into temporary buffers to ensure linear access.
    781     // Data is always copied as a number of whole blocks, dependent
    782     // on the stream set processing rate.
    783    
    784     const unsigned bitBlockWidth = kb->getBitBlockWidth();
    785     const unsigned inputSetCount = mStreamSetInputs.size();
    786     const unsigned outputSetCount = mStreamSetOutputs.size();
    787     const unsigned totalSetCount = inputSetCount + outputSetCount;
    788    
    789     int maxBlocksToCopy[totalSetCount];
    790     for (unsigned i = 0; i < totalSetCount; i++) {
    791         if (mIsDerived[i]) {
    792             if (mItemsPerStride[i] % bitBlockWidth == 0) {
    793                 maxBlocksToCopy[i] = mItemsPerStride[i] / bitBlockWidth;
    794             }
    795             else {
    796                 // May not be block aligned, can overlap partial blocks at both ends.
    797                 maxBlocksToCopy[i] = mItemsPerStride[i]/bitBlockWidth + 2;
    798             }
    799         }
    800         else {
    801             // For variable input stream sets, we make a single stride of items
    802             // available, if possible, but this stride could be nonaligned.
    803             maxBlocksToCopy[i] = mStride / bitBlockWidth + 2;
    804         }
    805     }
    806     auto ip = kb->saveIP();
    807     Function * const cp = mCurrentMethod;
    808     const auto saveInstance = getInstance();
    809 
    810     // First prepare the multi-block method that will be used.
    811 
    812     std::vector<Type *> multiBlockParmTypes;
    813     multiBlockParmTypes.push_back(mKernelStateType->getPointerTo());
    814     multiBlockParmTypes.push_back(kb->getSizeTy());
    815     for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
    816         if (!mIsDerived[i]) multiBlockParmTypes.push_back(kb->getSizeTy());
    817     }
    818     for (auto buffer : mStreamSetInputBuffers) {
    819         multiBlockParmTypes.push_back(buffer->getStreamSetBlockType()->getPointerTo());
    820     }
    821     for (auto buffer : mStreamSetOutputBuffers) {
    822         multiBlockParmTypes.push_back(buffer->getStreamSetBlockType()->getPointerTo());
    823     }
    824 
    825     FunctionType * const type = FunctionType::get(kb->getVoidTy(), multiBlockParmTypes, false);
    826     Function * multiBlockFunction = Function::Create(type, GlobalValue::InternalLinkage, getName() + MULTI_BLOCK_SUFFIX, kb->getModule());
    827     multiBlockFunction->setCallingConv(CallingConv::C);
    828     multiBlockFunction->setDoesNotThrow();
    829     mCurrentMethod = multiBlockFunction;
    830     kb->SetInsertPoint(BasicBlock::Create(kb->getContext(), "multiBlockEntry", multiBlockFunction, 0));
    831 
    832     auto args = multiBlockFunction->arg_begin();
    833     args->setName("self");
    834     setInstance(&*args);
    835     (++args)->setName("itemsToDo");
    836     for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
    837         if (!mIsDerived[i]) (++args)->setName(mStreamSetInputs[i].name + "_availItems");
    838     }
    839     for (auto binding : mStreamSetInputs) {
    840         (++args)->setName(binding.name + "BufPtr");
    841     }
    842     for (auto binding : mStreamSetOutputs) {
    843         (++args)->setName(binding.name + "BufPtr");
    844     }
    845 
    846     // Now use the generateMultiBlockLogic method of the MultiBlockKernelBuilder subtype to
    847     // provide the required multi-block kernel logic.
    848     generateMultiBlockLogic(kb);
    849 
    850     kb->CreateRetVoid();
    851 
    852     kb->restoreIP(ip);
    853     mCurrentMethod = cp;
    854     setInstance(saveInstance);
    855 
    856     // Now proceed with creation of the doSegment method.
    857 
    858     BasicBlock * const entry = kb->GetInsertBlock();
    859     BasicBlock * const doSegmentOuterLoop = kb->CreateBasicBlock(getName() + "_doSegmentOuterLoop");
    860     BasicBlock * const doMultiBlockCall = kb->CreateBasicBlock(getName() + "_doMultiBlockCall");
    861     BasicBlock * const tempBlockCheck = kb->CreateBasicBlock(getName() + "_tempBlockCheck");
    862     BasicBlock * const doTempBufferBlock = kb->CreateBasicBlock(getName() + "_doTempBufferBlock");
    863     BasicBlock * const segmentDone = kb->CreateBasicBlock(getName() + "_segmentDone");
    864 
    865     Value * blockBaseMask = kb->CreateNot(kb->getSize(kb->getBitBlockWidth() - 1));
    866     ConstantInt * blockSize = kb->getSize(kb->getBitBlockWidth());
    867     ConstantInt * strideSize = kb->getSize(mStride);
    868    
    869     Value * availablePos = mAvailableItemCount[0];
    870     Value * itemsAvail = availablePos;
    871 
    872     //  Make sure that corresponding data is available depending on processing rate
    873     //  for all derived input stream sets.
    874     for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
    875         Value * a = mAvailableItemCount[i];
    876         auto & rate = mStreamSetInputs[i].rate;
    877         if (mIsDerived[i]) {
    878             Value * maxItems = rate.CreateMaxReferenceItemsCalculation(kb.get(), a);
    879             itemsAvail = kb->CreateSelect(kb->CreateICmpULT(itemsAvail, maxItems), itemsAvail, maxItems);
    880         }
    881     }
    882 
    883     Value * processed = kb->getProcessedItemCount(mStreamSetInputs[0].name);
    884     Value * itemsToDo = kb->CreateSub(itemsAvail, processed);
    885     Value * fullStridesToDo = kb->CreateUDiv(itemsToDo, strideSize);
    886 
    887     //  Now we iteratively process these blocks using the doMultiBlock method.
    888     //  In each iteration, we process the maximum number of linearly accessible
    889     //  blocks on the principal input, reduced to ensure that the corresponding
    890     //  data is linearly available at the specified processing rates for the other inputs,
    891     //  and that each of the output buffers has sufficient linearly available space
    892     //  (using overflow areas, if necessary) for the maximum output that can be
    893     //  produced.
    894 
    895     kb->CreateBr(doSegmentOuterLoop);
    896     kb->SetInsertPoint(doSegmentOuterLoop);
    897     PHINode * const stridesRemaining = kb->CreatePHI(kb->getSizeTy(), 2, "stridesRemaining");
    898     stridesRemaining->addIncoming(fullStridesToDo, entry);
    899 
    900     // For each input buffer, determine the processedItemCount, the block pointer for the
    901     // buffer block containing the next item, and the number of linearly available items.
    902 
    903     Value * processedItemCount[inputSetCount];
    904     Value * inputBlockPtr[inputSetCount];
    905     Value * linearlyAvailItems[inputSetCount];
    906 
    907     Value * linearlyAvailStrides = stridesRemaining;
    908     for (unsigned i = 0; i < inputSetCount; i++) {
    909         processedItemCount[i] = kb->getProcessedItemCount(mStreamSetInputs[i].name);
    910         inputBlockPtr[i] = kb->getInputStreamBlockPtr(mStreamSetInputs[i].name, kb->getInt32(0));
    911         Value * avail = kb->CreateSub(mAvailableItemCount[i], processedItemCount[i]);
    912         linearlyAvailItems[i] = kb->getLinearlyAccessibleItems(mStreamSetInputs[i].name, processedItemCount[i], avail);
    913         auto & rate = mStreamSetInputs[i].rate;
    914         if (rate.isUnknownRate()) continue;  // No calculation possible for unknown rates.
    915         Value * maxReferenceItems = rate.CreateMaxReferenceItemsCalculation(kb.get(), linearlyAvailItems[i]);
    916         Value * maxStrides = kb->CreateUDiv(maxReferenceItems, strideSize);
    917         linearlyAvailStrides = kb->CreateSelect(kb->CreateICmpULT(maxStrides, linearlyAvailStrides), maxStrides, linearlyAvailStrides);
    918     }
    919 
    920     Value * producedItemCount[outputSetCount];
    921     Value * outputBlockPtr[outputSetCount];
    922     //  Now determine the linearly writeable blocks, based on available blocks reduced
    923     //  by limitations of output buffer space.
    924     Value * linearlyWritableStrides = linearlyAvailStrides;
    925     for (unsigned i = 0; i < outputSetCount; i++) {
    926         producedItemCount[i] = kb->getProducedItemCount(mStreamSetOutputs[i].name);
    927         outputBlockPtr[i] = kb->getOutputStreamBlockPtr(mStreamSetOutputs[i].name, kb->getInt32(0));
    928        
    929         auto & rate = mStreamSetOutputs[i].rate;
    930         if (rate.isUnknownRate()) continue;  // No calculation possible for unknown rates.
    931         Value * writableItems = kb->getLinearlyWritableItems(mStreamSetOutputs[i].name, producedItemCount[i]);
    932         Value * maxReferenceItems = rate.CreateMaxReferenceItemsCalculation(kb.get(), writableItems);
    933         Value * maxStrides = kb->CreateUDiv(maxReferenceItems, strideSize);
    934         linearlyWritableStrides = kb->CreateSelect(kb->CreateICmpULT(maxStrides, linearlyWritableStrides), maxStrides, linearlyWritableStrides);
    935     }
    936     Value * const haveFullStrides = kb->CreateICmpUGT(linearlyWritableStrides, kb->getSize(0));
    937     kb->CreateCondBr(haveFullStrides, doMultiBlockCall, tempBlockCheck);
    938 
    939     //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
    940     //  Now prepare the doMultiBlock call.
    941     kb->SetInsertPoint(doMultiBlockCall);
    942 
    943     Value * principalItemsToDo = kb->CreateMul(linearlyWritableStrides, strideSize);
    944 
    945     std::vector<Value *> doMultiBlockArgs;
    946     doMultiBlockArgs.push_back(getInstance());
    947     doMultiBlockArgs.push_back(principalItemsToDo);
    948     for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
    949         if (!mIsDerived[i]) {
    950             doMultiBlockArgs.push_back(linearlyAvailItems[i]);
    951         }
    952     }
    953     for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
    954         Value * bufPtr = kb->CreatePointerCast(inputBlockPtr[i], mStreamSetInputBuffers[i]->getStreamSetBlockType()->getPointerTo());
    955         doMultiBlockArgs.push_back(bufPtr);
    956     }
    957     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    958         Value * bufPtr = kb->CreatePointerCast(outputBlockPtr[i], mStreamSetOutputBuffers[i]->getStreamSetBlockType()->getPointerTo());
    959         doMultiBlockArgs.push_back(bufPtr);
    960     }
    961 
    962     kb->CreateCall(multiBlockFunction, doMultiBlockArgs);
    963 
    964     // Do copybacks if necessary.
    965     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    966         if (mStreamSetOutputBuffers[i]->supportsCopyBack()) {
    967             Value * newProduced = kb->getProducedItemCount(mStreamSetOutputs[i].name);
    968             Value * handle = mStreamSetOutputBuffers[i]->getStreamSetHandle();
    969             mStreamSetOutputBuffers[i]->genCopyBackLogic(kb.get(), handle, producedItemCount[i], newProduced, mStreamSetOutputs[i].name);
    970         }
    971     }
    972 
    973     if (mIsDerived[0]) {
    974         Value * reducedStridesToDo = kb->CreateSub(stridesRemaining, linearlyWritableStrides);
    975         stridesRemaining->addIncoming(reducedStridesToDo, kb->GetInsertBlock());
    976         Value * nowProcessed = kb->CreateAdd(processedItemCount[0], principalItemsToDo);
    977         kb->setProcessedItemCount(mStreamSetInputs[0].name, nowProcessed);
    978         kb->CreateBr(doSegmentOuterLoop);
    979     }
    980     else {
    981         // Processed item count updated by the kernel itself.
    982         Value * nowProcessed = kb->getProcessedItemCount(mStreamSetInputs[0].name);
    983         Value * remainingItemsToDo = kb->CreateSub(itemsAvail, nowProcessed);
    984         Value * reducedStridesToDo = kb->CreateUDiv(remainingItemsToDo, nowProcessed);
    985         stridesRemaining->addIncoming(reducedStridesToDo, kb->GetInsertBlock());
    986         // If we didn't make progress, we have gone as far as we can in this segment.
    987         kb->CreateCondBr(kb->CreateICmpUGT(nowProcessed, processedItemCount[0]), doSegmentOuterLoop, segmentDone);
    988     }
    989 
    990     //
    991     // We use temporary buffers in 3 different cases that preclude full block processing.
    992     // (a) One or more input buffers does not have a sufficient number of input items linearly available.
    993     // (b) One or more output buffers does not have sufficient linearly available buffer space.
    994     // (c) We have processed all the full blocks of input and only the excessItems remain.
    995     // In each case we set up temporary buffers for input and output and then
    996     // call the Multiblock routine.
    997     //
    998 
    999     kb->SetInsertPoint(tempBlockCheck);
    1000     Value * const haveStrides = kb->CreateICmpUGT(stridesRemaining, kb->getSize(0));
    1001     kb->CreateCondBr(kb->CreateOr(mIsFinal, haveStrides), doTempBufferBlock, segmentDone);
    1002 
    1003     kb->SetInsertPoint(doTempBufferBlock);
    1004     Value * excessItems = kb->CreateSub(itemsAvail, kb->getProcessedItemCount(mStreamSetInputs[0].name));
    1005     Value * tempBlockItems = kb->CreateSelect(haveStrides, strideSize, excessItems);
    1006     Value * doFinal = kb->CreateNot(haveStrides);
    1007 
    1008     // Begin constructing the doMultiBlock args.
    1009     std::vector<Value *> tempArgs;
    1010     tempArgs.push_back(getInstance());
    1011     tempArgs.push_back(tempBlockItems);
    1012     // For non-derived inputs, add the available items.
    1013     for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
    1014         if (!mIsDerived[i]) {
    1015             Value * avail = kb->CreateSub(mAvailableItemCount[i], processedItemCount[i]);
    1016             tempArgs.push_back(kb->CreateSelect(kb->CreateICmpULT(avail, strideSize), avail, strideSize));
    1017         }
    1018     }
    1019     //
    1020     // Define and allocate the temporary buffer area.
    1021     //
    1022     Type * tempBuffers[totalSetCount];
    1023     for (unsigned i = 0; i < inputSetCount; ++i) {
    1024         Type * bufType = mStreamSetInputBuffers[i]->getStreamSetBlockType();
    1025         tempBuffers[i] = ArrayType::get(bufType, maxBlocksToCopy[i]);
    1026     }
    1027     for (unsigned i = 0; i < outputSetCount; i++) {
    1028         Type * bufType = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
    1029         tempBuffers[i + inputSetCount] = ArrayType::get(bufType, maxBlocksToCopy[i + inputSetCount]);
    1030     }
    1031     Type * tempParameterStructType = StructType::create(kb->getContext(), ArrayRef<Type *>(tempBuffers, totalSetCount), "tempBuf");
    1032     // Prepare the temporary buffer area.
    1033     Value * tempParameterArea = kb->CreateCacheAlignedAlloca(tempParameterStructType);
    1034     kb->CreateMemZero(tempParameterArea, ConstantExpr::getSizeOf(tempParameterStructType));
    1035     // For each input and output buffer, copy over necessary data starting from the last block boundary.
    1036     Value * itemCountNeeded[inputSetCount];
    1037     itemCountNeeded[0] = tempBlockItems;
    1038     Value * finalItemCountNeeded[inputSetCount];
    1039 
    1040     for (unsigned i = 0; i < inputSetCount; i++) {
    1041         Type * bufPtrType = mStreamSetInputBuffers[i]->getStreamSetBlockType()->getPointerTo();
    1042         if (mItemsPerStride[i] != 0) {
    1043             Value * tempBufPtr = kb->CreateGEP(tempParameterArea, {kb->getInt32(0), kb->getInt32(i)});
    1044             tempBufPtr = kb->CreatePointerCast(tempBufPtr, bufPtrType);
    1045             ConstantInt * strideItems = kb->getSize(mItemsPerStride[i]);
    1046             Value * strideBasePos = kb->CreateSub(processedItemCount[i], kb->CreateURem(processedItemCount[i], strideItems));
    1047             Value * blockBasePos = strideBasePos;
    1048             if (mItemsPerStride[i] & (bitBlockWidth - 1)) {
    1049                 blockBasePos = kb->CreateAnd(strideBasePos, blockBaseMask);
    1050             }
    1051 
    1052             // The number of items to copy is determined by the processing rate requirements.
    1053             if (i >= 1) {
    1054                 auto & rate = mStreamSetInputs[i].rate;
    1055                 std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
    1056                 Port port; unsigned ssIdx;
    1057                 std::tie(port, ssIdx) = getStreamPort(refSet);
    1058                 itemCountNeeded[i] = rate.CreateRatioCalculation(kb.get(), itemCountNeeded[ssIdx], doFinal);
    1059             }
    1060             finalItemCountNeeded[i] = kb->CreateAdd(itemCountNeeded[i], processedItemCount[i]);
    1061 
    1062             Value * inputPtr = kb->CreatePointerCast(kb->getRawInputPointer(mStreamSetInputs[i].name, kb->getInt32(0), blockBasePos), bufPtrType);
    1063            
    1064             if (maxBlocksToCopy[i] == 1) {
    1065                 // copy one block
    1066                 mStreamSetInputBuffers[i]->createBlockCopy(kb.get(), tempBufPtr, inputPtr, kb->getSize(1));
    1067             }
    1068             else {
    1069                 Value * neededItems = kb->CreateSub(finalItemCountNeeded[i], blockBasePos);
    1070                 Value * copyItems1 = kb->getLinearlyAccessibleItems(mStreamSetInputs[i].name, blockBasePos, neededItems);
    1071                 Value * allAvail = kb->CreateICmpEQ(neededItems, copyItems1);
    1072                 Value * copyBlocks1 = kb->CreateUDivCeil(copyItems1, blockSize);
    1073                 mStreamSetInputBuffers[i]->createBlockCopy(kb.get(), tempBufPtr, inputPtr, copyBlocks1);
    1074                 BasicBlock * copyRemaining = kb->CreateBasicBlock("copyRemaining");
    1075                 BasicBlock * copyDone = kb->CreateBasicBlock("copyDone");
    1076                 kb->CreateCondBr(allAvail, copyDone, copyRemaining);
    1077                 kb->SetInsertPoint(copyRemaining);
    1078                 Value * copyItems2 = kb->CreateSub(neededItems, copyItems1);
    1079                 Value * copyBlocks2 = kb->CreateUDivCeil(copyItems2, blockSize);
    1080                 //Value * nextBasePos = kb->CreateAdd(blockBasePos, copyItems1);
    1081                 Value * nextBasePos = kb->CreateAdd(blockBasePos, kb->CreateMul(copyBlocks2, blockSize));
    1082                 Value * nextInputPtr = kb->CreatePointerCast(kb->getRawInputPointer(mStreamSetInputs[i].name, kb->getInt32(0), nextBasePos), bufPtrType);
    1083                 Value * nextBufPtr = kb->CreateGEP(tempBufPtr, kb->CreateUDiv(copyItems1, blockSize));
    1084                 //mStreamSetInputBuffers[i]->createBlockAlignedCopy(kb.get(), nextBufPtr, nextInputPtr, copyItems2);
    1085                 mStreamSetInputBuffers[i]->createBlockCopy(kb.get(), nextBufPtr, nextInputPtr, copyBlocks2);
    1086                 kb->CreateBr(copyDone);
    1087                 kb->SetInsertPoint(copyDone);
    1088             }
    1089             tempArgs.push_back(tempBufPtr);
    1090         } else {
    1091             Value * bufPtr = kb->getInputStreamBlockPtr(mStreamSetInputs[i].name, kb->getInt32(0));
    1092             bufPtr = kb->CreatePointerCast(bufPtr, mStreamSetInputBuffers[i]->getStreamSetBlockType()->getPointerTo());
    1093             tempArgs.push_back(bufPtr);           
    1094         }
    1095     }
    1096     Value * outputBasePos[outputSetCount];
    1097     for (unsigned i = 0; i < outputSetCount; i++) {
    1098         Value * tempBufPtr = kb->CreateGEP(tempParameterArea,  {kb->getInt32(0), kb->getInt32(inputSetCount + i)});
    1099         Type * bufPtrType = mStreamSetOutputBuffers[i]->getStreamSetBlockType()->getPointerTo();
    1100         tempBufPtr = kb->CreatePointerCast(tempBufPtr, bufPtrType);
    1101         producedItemCount[i] = kb->getProducedItemCount(mStreamSetOutputs[i].name);
    1102         outputBasePos[i] = kb->CreateAnd(producedItemCount[i], blockBaseMask);
    1103         //mStreamSetOutputBuffers[i]->createBlockAlignedCopy(kb.get(), tempBufPtr, outputBlockPtr[i], kb->CreateSub(producedItemCount[i], outputBasePos[i]));
    1104         Value * copyBlocks = kb->CreateUDivCeil(kb->CreateSub(producedItemCount[i], outputBasePos[i]), blockSize);
    1105         mStreamSetOutputBuffers[i]->createBlockCopy(kb.get(), tempBufPtr, outputBlockPtr[i], copyBlocks);
    1106         tempArgs.push_back(tempBufPtr);
    1107     }
    1108 
    1109     kb->CreateCall(multiBlockFunction, tempArgs);
    1110    
    1111     //  The items have been processed and output generated to the temporary areas.
    1112     //  Update the processed item count (and hence all the counts derived automatically
    1113     //  therefrom).
    1114     if (mIsDerived[0]) {
    1115         kb->setProcessedItemCount(mStreamSetInputs[0].name, finalItemCountNeeded[0]);
    1116     }
    1117    
    1118     // Copy back data to the actual output buffers.
    1119     for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
    1120         Value * tempBufPtr = kb->CreateGEP(tempParameterArea,  {kb->getInt32(0), kb->getInt32(mStreamSetInputs.size() + i)});
    1121         tempBufPtr = kb->CreatePointerCast(tempBufPtr, mStreamSetOutputBuffers[i]->getStreamSetBlockType()->getPointerTo());
    1122         Value * finalOutputItems = kb->getProducedItemCount(mStreamSetOutputs[i].name);
    1123         Value * copyItems = kb->CreateSub(finalOutputItems, outputBasePos[i]);
    1124         // Round up to exact multiple of block size.
    1125         //copyItems = kb->CreateAnd(kb->CreateAdd(copyItems, kb->getSize(bitBlockWidth - 1)), blockBaseMask);
    1126         Value * writableFromBase = kb->getLinearlyWritableItems(mStreamSetOutputs[i].name, outputBasePos[i]); // must be a whole number of blocks.
    1127         Value * allWritable = kb->CreateICmpULE(copyItems, writableFromBase);
    1128         Value * copyItems1 = kb->CreateSelect(allWritable, copyItems, writableFromBase);
    1129         //mStreamSetOutputBuffers[i]->createBlockAlignedCopy(kb.get(), outputBlockPtr[i], tempBufPtr, copyItems1);
    1130         Value * copyBlocks1 = kb->CreateUDivCeil(copyItems1, blockSize);
    1131         mStreamSetOutputBuffers[i]->createBlockCopy(kb.get(), outputBlockPtr[i], tempBufPtr, copyBlocks1);
    1132         BasicBlock * copyBackRemaining = kb->CreateBasicBlock("copyBackRemaining");
    1133         BasicBlock * copyBackDone = kb->CreateBasicBlock("copyBackDone");
    1134         kb->CreateCondBr(allWritable, copyBackDone, copyBackRemaining);
    1135         kb->SetInsertPoint(copyBackRemaining);
    1136         Value * copyItems2 = kb->CreateSub(copyItems, copyItems1);
    1137         Value * nextBasePos = kb->CreateAdd(outputBasePos[i], copyItems1);
    1138         Type * bufPtrType = mStreamSetOutputBuffers[i]->getStreamSetBlockType()->getPointerTo();
    1139         Value * nextOutputPtr = kb->CreatePointerCast(kb->getRawOutputPointer(mStreamSetOutputs[i].name, kb->getInt32(0), nextBasePos), bufPtrType);
    1140         tempBufPtr = kb->CreateGEP(tempBufPtr, kb->CreateUDiv(copyItems1, blockSize));
    1141         //mStreamSetOutputBuffers[i]->createBlockAlignedCopy(kb.get(), nextOutputPtr, tempBufPtr, copyItems2);
    1142         Value * copyBlocks2 = kb->CreateUDivCeil(copyItems2, blockSize);
    1143         mStreamSetOutputBuffers[i]->createBlockCopy(kb.get(), nextOutputPtr, tempBufPtr, copyBlocks2);
    1144         kb->CreateBr(copyBackDone);
    1145         kb->SetInsertPoint(copyBackDone);
    1146     }
    1147 
    1148     //  We've dealt with the partial block processing and copied information back into the
    1149     //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
    1150     //
    1151     BasicBlock * setTermination = kb->CreateBasicBlock("mBsetTermination");
    1152     if (mIsDerived[0]) {
    1153         stridesRemaining->addIncoming(kb->CreateSub(stridesRemaining, kb->CreateZExt(haveStrides, kb->getSizeTy())), kb->GetInsertBlock());
    1154         kb->CreateCondBr(haveStrides, doSegmentOuterLoop, setTermination);
    1155     }
    1156     else {
    1157         Value * nowProcessed = kb->getProcessedItemCount(mStreamSetInputs[0].name);
    1158         Value * remainingItemsToDo = kb->CreateSub(itemsAvail, nowProcessed);
    1159         Value * reducedStridesToDo = kb->CreateUDiv(remainingItemsToDo, nowProcessed);
    1160         stridesRemaining->addIncoming(reducedStridesToDo, kb->GetInsertBlock());
    1161         Value * haveStrides = kb->CreateICmpUGT(reducedStridesToDo, kb->getSize(0));
    1162         kb->CreateCondBr(haveStrides, doSegmentOuterLoop, setTermination);
    1163     }   
    1164     kb->SetInsertPoint(setTermination);
    1165     kb->setTerminationSignal();
    1166     kb->CreateBr(segmentDone);
    1167     kb->SetInsertPoint(segmentDone);
    1168 }
    1169 
    1170 void Kernel::finalizeInstance(const std::unique_ptr<KernelBuilder> & idb) {
    1171     assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
    1172     mOutputScalarResult = idb->CreateCall(getTerminateFunction(idb->getModule()), { getInstance() });
    1173 }
    1174 
    1175 Kernel::StreamPort Kernel::getStreamPort(const std::string & name) const {
    1176     const auto f = mStreamMap.find(name);
    1177     if (LLVM_UNLIKELY(f == mStreamMap.end())) {
    1178         report_fatal_error(getName() + " does not contain stream set " + name);
    1179     }
    1180     return f->second;
    1181 }
    1182 
    11831437static inline std::string annotateKernelNameWithDebugFlags(std::string && name) {
    11841438    if (codegen::EnableAsserts) {
     
    12011455                  , std::move(internal_scalars))
    12021456, mCurrentMethod(nullptr)
     1457, mAvailablePrincipleItemCount(nullptr)
    12031458, mNoTerminateAttribute(false)
    12041459, mIsGenerated(false)
     1460, mStride(0)
    12051461, mIsFinal(nullptr)
    1206 , mOutputScalarResult(nullptr)
    1207 , mStride(0) {
     1462, mOutputScalarResult(nullptr) {
    12081463
    12091464}
     
    12201475                                         std::vector<Binding> && scalar_outputs,
    12211476                                         std::vector<Binding> && internal_scalars)
    1222 : Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
     1477: MultiBlockKernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
    12231478, mDoBlockMethod(nullptr)
    12241479, mStrideLoopBody(nullptr)
     
    12281483}
    12291484
    1230 // CONSTRUCTOR
     1485// MULTI-BLOCK KERNEL CONSTRUCTOR
    12311486MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
    12321487                                   std::vector<Binding> && stream_inputs,
     
    12361491                                   std::vector<Binding> && internal_scalars)
    12371492: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
     1493
    12381494}
    12391495
     
    12461502                                             std::vector<Binding> && internal_scalars)
    12471503: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
    1248    
    1249 }
    1250  
    1251    
    1252 void applyOutputBufferExpansions(const std::unique_ptr<KernelBuilder> & kb,
    1253                                  std::vector<Value *> inputAvailable,
    1254                                  Value * doFinal) {
    1255     auto kernel = kb->getKernel();
    1256     const unsigned inputSetCount = inputAvailable.size();
    1257     if (inputSetCount == 0) return;  //  Cannot calculate buffer items expected from input.
    1258     auto & outputs = kernel->getStreamSetOutputBuffers();
    1259     const unsigned outputSetCount = outputs.size();
    1260 
    1261     Constant * blockSize = kb->getSize(kb->getBitBlockWidth());
    1262     Value * newlyAvailInputItems[inputSetCount];
    1263     Value * requiredOutputBufferSpace[outputSetCount];
    1264     for (unsigned i = 0; i < inputSetCount; i++) {
    1265         Value * processed = kb->getProcessedItemCount(kernel->getStreamInput(i).name);
    1266         newlyAvailInputItems[i] = kb->CreateSub(inputAvailable[i], processed);
    1267     }
    1268     //kb->GetInsertBlock()->dump();
    1269     for (unsigned i = 0; i < outputSetCount; i++) {
    1270         const auto & rate = kernel->getStreamOutput(i).rate;
    1271         if (rate.isUnknownRate()) continue;  // No calculations possible.
    1272         Kernel::Port port; unsigned ssIdx;
    1273         std::tie(port, ssIdx) = kernel->getStreamPort(rate.referenceStreamSet());
    1274         Value * base = nullptr;
    1275         if (port == Kernel::Port::Output) {
    1276             base = requiredOutputBufferSpace[ssIdx]; assert (base);
    1277         } else {
    1278             base = newlyAvailInputItems[ssIdx]; assert (base);
    1279         }
    1280         requiredOutputBufferSpace[i] = rate.CreateRatioCalculation(kb.get(), base, doFinal);
    1281         if (auto db = dyn_cast<DynamicBuffer>(outputs[i])) {
    1282             Value * handle = db->getStreamSetHandle();
    1283             // This buffer can be expanded.
    1284             Value * producedBlock = kb->CreateUDivCeil(kb->getProducedItemCount(kernel->getStreamOutput(i).name), blockSize);
    1285             Value * consumedBlock = kb->CreateUDiv(kb->getConsumedItemCount(kernel->getStreamOutput(i).name), blockSize);
    1286             Value * blocksInUse = kb->CreateSub(producedBlock, consumedBlock);
    1287             Value * blocksRequired = kb->CreateAdd(blocksInUse, kb->CreateUDivCeil(requiredOutputBufferSpace[i], blockSize));
    1288             Value * spaceRequired = kb->CreateMul(blocksRequired, blockSize);
    1289             Value * expansionNeeded = kb->CreateICmpUGT(spaceRequired, db->getBufferedSize(kb.get(), handle));
    1290             BasicBlock * doExpand = kb->CreateBasicBlock("doExpand");
    1291             BasicBlock * bufferReady = kb->CreateBasicBlock("bufferReady");
    1292             kb->CreateCondBr(expansionNeeded, doExpand, bufferReady);
    1293             kb->SetInsertPoint(doExpand);
    1294             db->doubleCapacity(kb.get(), handle);
    1295             // Ensure that capacity is sufficient by successive doubling, if necessary.
    1296             expansionNeeded = kb->CreateICmpUGT(spaceRequired, db->getBufferedSize(kb.get(), handle));
    1297             kb->CreateCondBr(expansionNeeded, doExpand, bufferReady);
    1298             kb->SetInsertPoint(bufferReady);
    1299         }
    1300     }
    1301 
    1302 }
    1303 
    1304 }
     1504
     1505}
     1506
     1507
     1508}
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r5630 r5706  
    99#include "interface.h"
    1010#include <boost/container/flat_map.hpp>
    11 #include <llvm/IR/Constants.h>
    12 
     11
     12namespace llvm { class BasicBlock; }
    1313namespace llvm { class Function; }
    1414namespace llvm { class IntegerType; }
     
    2828public:
    2929    enum class Port { Input, Output };
    30 protected:
     30
     31    using StreamPort = std::pair<Port, unsigned>;
     32
     33protected:
     34
    3135    using KernelMap = boost::container::flat_map<std::string, unsigned>;
    32     using StreamPort = std::pair<Port, unsigned>;
    3336    using StreamMap = boost::container::flat_map<std::string, StreamPort>;
    3437    using StreamSetBuffers = std::vector<parabix::StreamSetBuffer *>;
     
    4548    static const std::string BUFFER_PTR_SUFFIX;
    4649    static const std::string CONSUMER_SUFFIX;
    47 public:
    4850    static const std::string CYCLECOUNT_SCALAR;
    4951
     
    7476    // guaranteeing uniqueness.  In this case, hasSignature() should return false.
    7577    //
     78
     79    //
     80    // Kernel builder subtypes define their logic of kernel construction
     81    // in terms of 3 virtual methods for
     82    // (a) preparing the Kernel state data structure
     83    // (c) defining the logic of the finalBlock function.
     84    //
     85    // Note: the kernel state data structure must only be finalized after
     86    // all scalar fields have been added.   If there are no fields to
     87    // be added, the default method for preparing kernel state may be used.
     88
    7689       
    7790    bool isCachable() const override { return false; }
     
    131144    //
    132145   
    133     unsigned getKernelStride() const { return mStride;}
    134    
    135     void setKernelStride(unsigned stride) {mStride = stride;}
     146    unsigned getKernelStride() const { return mStride; }
    136147   
    137148    virtual ~Kernel() = 0;
     
    145156protected:
    146157
     158    void setKernelStride(unsigned stride) { mStride = stride; }
     159
    147160    virtual void addInternalKernelProperties(const std::unique_ptr<KernelBuilder> & idb) { }
     161
     162    void getDoSegmentFunctionArguments(const std::vector<llvm::Value *> & availItems) const;
    148163
    149164    // Constructor
     
    155170                  std::vector<Binding> && internal_scalars);
    156171
    157     //
    158     // Kernel builder subtypes define their logic of kernel construction
    159     // in terms of 3 virtual methods for
    160     // (a) preparing the Kernel state data structure
    161     // (c) defining the logic of the finalBlock function.
    162     //
    163     // Note: the kernel state data structure must only be finalized after
    164     // all scalar fields have been added.   If there are no fields to
    165     // be added, the default method for preparing kernel state may be used.
    166 
    167172    void setNoTerminateAttribute(const bool noTerminate = true) {
    168173        mNoTerminateAttribute = noTerminate;
    169174    }
    170175
     176    llvm::Value * getPrincipleItemCount() const {
     177        return mAvailablePrincipleItemCount;
     178    }
     179
    171180    unsigned getScalarIndex(const std::string & name) const;
    172181
    173182    void prepareStreamSetNameMap();
    174183   
    175     void processingRateAnalysis();
    176 
    177184    void linkExternalMethods(const std::unique_ptr<kernel::KernelBuilder> &) override { }
    178185
    179186    virtual void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) { }
    180187   
    181     virtual void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & iBuilder) = 0;
     188    virtual void generateKernelMethod(const std::unique_ptr<KernelBuilder> & iBuilder) = 0;
    182189
    183190    virtual void generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & iBuilder) { }
     
    189196    unsigned addUnnamedScalar(llvm::Type * type);
    190197
    191     llvm::Value * getIsFinal() const {
    192         return mIsFinal;
    193     }
    194 
    195198    void callGenerateInitializeMethod(const std::unique_ptr<KernelBuilder> & idb);
    196199
     
    198201
    199202    void callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb);
     203
     204
     205    std::pair<unsigned, unsigned> getStreamRate(const Port p, const unsigned i) const;
    200206
    201207    const parabix::StreamSetBuffer * getInputStreamSetBuffer(const std::string & name) const {
     
    229235    }
    230236
     237    llvm::Value * getStreamSetInputBufferPtr(const unsigned i) const {
     238        return mStreamSetInputBufferPtr[i];
     239    }
     240
     241    llvm::Value * getStreamSetOutputBufferPtr(const unsigned i) const {
     242        return mStreamSetOutputBufferPtr[i];
     243    }
     244
    231245private:
     246
     247    void addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb);
    232248
    233249    llvm::Value * getAvailableItemCount(const unsigned i) const {
     
    238254
    239255    llvm::Function *                    mCurrentMethod;
     256    llvm::Value *                       mAvailablePrincipleItemCount;
    240257    bool                                mNoTerminateAttribute;
    241258    bool                                mIsGenerated;
    242 
     259    unsigned                            mStride;
    243260    llvm::Value *                       mIsFinal;
     261    llvm::Value *                       mOutputScalarResult;
     262
     263
    244264    std::vector<llvm::Value *>          mAvailableItemCount;
    245     llvm::Value *                       mOutputScalarResult;
    246265
    247266    std::vector<llvm::Type *>           mKernelFields;
     
    249268    StreamMap                           mStreamMap;
    250269    StreamSetBuffers                    mStreamSetInputBuffers;
     270    std::vector<llvm::Value *>          mStreamSetInputBufferPtr;
    251271    StreamSetBuffers                    mStreamSetOutputBuffers;
    252     unsigned                            mStride;
    253     std::vector<unsigned>               mItemsPerStride;
    254     std::vector<unsigned>               mIsDerived;
     272    std::vector<llvm::Value *>          mStreamSetOutputBufferPtr;
    255273
    256274};
     
    265283                          std::vector<Binding> && scalar_outputs,
    266284                          std::vector<Binding> && internal_scalars);
    267 
    268 };
    269 
    270 class BlockOrientedKernel : public Kernel {
    271 protected:
    272 
    273     void CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & idb);
    274 
    275     // Each kernel builder subtype must provide its own logic for generating
    276     // doBlock calls.
    277     virtual void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb) = 0;
    278 
    279     // Each kernel builder subtype must also specify the logic for processing the
    280     // final block of stream data, if there is any special processing required
    281     // beyond simply calling the doBlock function.   In the case that the final block
    282     // processing may be trivially implemented by dispatching to the doBlock method
    283     // without additional preparation, the default generateFinalBlockMethod need
    284     // not be overridden.
    285 
    286     virtual void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * remainingItems);
    287 
    288     void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & idb) final;
    289 
    290     BlockOrientedKernel(std::string && kernelName,
    291                         std::vector<Binding> && stream_inputs,
    292                         std::vector<Binding> && stream_outputs,
    293                         std::vector<Binding> && scalar_parameters,
    294                         std::vector<Binding> && scalar_outputs,
    295                         std::vector<Binding> && internal_scalars);
    296 
    297 private:
    298 
    299     void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb);
    300 
    301     void writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * remainingItems);
    302 
    303 private:
    304 
    305     llvm::Function *        mDoBlockMethod;
    306     llvm::BasicBlock *      mStrideLoopBody;
    307     llvm::IndirectBrInst *  mStrideLoopBranch;
    308     llvm::PHINode *         mStrideLoopTarget;
     285protected:
     286
     287    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
     288
     289    virtual void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) = 0;
     290
    309291};
    310292
     
    417399    // exit the RetVoid instruction will be added to complete the method.
    418400    //
    419     virtual void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb) = 0;
    420    
     401    virtual void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * const numOfStrides) = 0;
     402
    421403private:
     404
    422405    // Given a kernel subtype with an appropriate interface, the generateDoSegment
    423406    // method of the multi-block kernel builder makes all the necessary arrangements
    424407    // to translate doSegment calls into a minimal sequence of doMultiBlock calls.
    425     void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) final;
     408    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & kb) final;
     409
     410    bool requiresCopyBack(const ProcessingRate & rate) const;
    426411
    427412};
    428413
    429 void applyOutputBufferExpansions(const std::unique_ptr<KernelBuilder> & kb,
    430                                  std::vector<llvm::Value *> inputAvailable,
    431                                  llvm::Value * doFinal);
    432    
    433    
     414
     415class BlockOrientedKernel : public MultiBlockKernel {
     416protected:
     417
     418    void CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & idb);
     419
     420    // Each kernel builder subtype must provide its own logic for generating
     421    // doBlock calls.
     422    virtual void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb) = 0;
     423
     424    // Each kernel builder subtype must also specify the logic for processing the
     425    // final block of stream data, if there is any special processing required
     426    // beyond simply calling the doBlock function.   In the case that the final block
     427    // processing may be trivially implemented by dispatching to the doBlock method
     428    // without additional preparation, the default generateFinalBlockMethod need
     429    // not be overridden.
     430
     431    virtual void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * remainingItems);
     432
     433    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * const numOfStrides) final;
     434
     435    BlockOrientedKernel(std::string && kernelName,
     436                        std::vector<Binding> && stream_inputs,
     437                        std::vector<Binding> && stream_outputs,
     438                        std::vector<Binding> && scalar_parameters,
     439                        std::vector<Binding> && scalar_outputs,
     440                        std::vector<Binding> && internal_scalars);
     441
     442private:
     443
     444    void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb);
     445
     446    void writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * remainingItems);
     447
     448private:
     449
     450    llvm::Function *        mDoBlockMethod;
     451    llvm::BasicBlock *      mStrideLoopBody;
     452    llvm::IndirectBrInst *  mStrideLoopBranch;
     453    llvm::PHINode *         mStrideLoopTarget;
     454};
    434455
    435456}
     457
    436458#endif
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r5650 r5706  
    33#include <kernels/kernel.h>
    44#include <kernels/streamset.h>
     5#include <llvm/Support/raw_ostream.h>
    56
    67using namespace llvm;
     
    1112namespace kernel {
    1213
     14using Port = Kernel::Port;
     15
    1316Value * KernelBuilder::getScalarFieldPtr(llvm::Value * instance, Value * const index) {
    1417    assert (instance);
    15     CreateAssert(instance, "instance cannot be null!");
     18    CreateAssert(instance, "getScalarFieldPtr: instance cannot be null!");
    1619    return CreateGEP(instance, {getInt32(0), index});
    1720}
     
    3740}
    3841
    39 Value * KernelBuilder::getStreamSetBufferPtr(const std::string & name) {
     42Value * KernelBuilder::getStreamHandle(const std::string & name) {
    4043    Value * const ptr = getScalarField(name + Kernel::BUFFER_PTR_SUFFIX);
    4144    CreateAssert(ptr, name + " cannot be null!");
     
    5154}
    5255
    53 Value * KernelBuilder::getProducedItemCount(const std::string & name, Value * doFinal) {
    54     Kernel::Port port; unsigned index;
    55     std::tie(port, index) = mKernel->getStreamPort(name);
    56     assert (port == Kernel::Port::Output);
    57     const auto & rate = mKernel->getStreamOutput(index).rate;
    58     const auto & refSet = rate.referenceStreamSet();
    59     if ((refSet != name) && rate.isExact()) {
    60         Value * principalCount;
    61         std::tie(port, index) = mKernel->getStreamPort(refSet);
    62         if (port == Kernel::Port::Input) {
    63             principalCount = getProcessedItemCount(refSet);
     56Value * KernelBuilder::getCycleCountPtr() {
     57    return getScalarFieldPtr(Kernel::CYCLECOUNT_SCALAR);
     58}
     59
     60inline const Binding & getBinding(const Kernel * k, const std::string & name) {
     61    Port port; unsigned index;
     62    std::tie(port, index) = k->getStreamPort(name);
     63    if (port == Port::Input) {
     64        return k->getStreamInput(index);
     65    } else {
     66        return k->getStreamOutput(index);
     67    }
     68}
     69
     70Value * KernelBuilder::getInternalItemCount(const std::string & name, const std::string & suffix) {
     71    const ProcessingRate & rate = getBinding(mKernel, name).getRate();
     72    Value * itemCount = nullptr;
     73    if (rate.isExactlyRelative()) {
     74        Port port; unsigned index;
     75        std::tie(port, index) = mKernel->getStreamPort(rate.getReference());
     76        if (port == Port::Input) {
     77            itemCount = getProcessedItemCount(rate.getReference());
    6478        } else {
    65             principalCount = getProducedItemCount(refSet);
    66         }
    67         return rate.CreateRatioCalculation(this, principalCount, doFinal);
    68     }
    69     return getScalarField(name + Kernel::PRODUCED_ITEM_COUNT_SUFFIX);
    70 }
    71 
    72 Value * KernelBuilder::getProcessedItemCount(const std::string & name) {
    73     Kernel::Port port; unsigned index;
    74     std::tie(port, index) = mKernel->getStreamPort(name);
    75     assert (port == Kernel::Port::Input);
    76     const auto & rate = mKernel->getStreamInput(index).rate;
    77     const auto & refSet = rate.referenceStreamSet();
    78     if ((refSet != name) && rate.isExact()) {
    79         Value * const principalCount = getProcessedItemCount(refSet);
    80         return rate.CreateRatioCalculation(this, principalCount);
    81     }
    82     return getScalarField(name + Kernel::PROCESSED_ITEM_COUNT_SUFFIX);
    83 }
     79            itemCount = getProducedItemCount(rate.getReference());
     80        }
     81        if (rate.getNumerator() != 1) {
     82            itemCount = CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getNumerator()));
     83        }
     84        if (rate.getDenominator() != 1) {
     85            itemCount = CreateExactUDiv(itemCount, ConstantInt::get(itemCount->getType(), rate.getDenominator()));
     86        }
     87    } else {
     88        itemCount = getScalarField(name + suffix);
     89    }
     90    return itemCount;
     91}
     92
     93void KernelBuilder::setInternalItemCount(const std::string & name, const std::string & suffix, llvm::Value * const value) {
     94    const ProcessingRate & rate = getBinding(mKernel, name).getRate();
     95    if (LLVM_UNLIKELY(rate.isDerived())) {
     96        report_fatal_error("Cannot set item count: " + name + " is a Derived rate");
     97    }
     98    if (codegen::DebugOptionIsSet(codegen::TraceCounts)) {
     99        CallPrintIntToStderr(mKernel->getName() + ": " + name + suffix, value);
     100    }
     101    setScalarField(name + suffix, value);
     102}
     103
    84104
    85105Value * KernelBuilder::getAvailableItemCount(const std::string & name) {
    86106    const auto & inputs = mKernel->getStreamInputs();
    87107    for (unsigned i = 0; i < inputs.size(); ++i) {
    88         if (inputs[i].name == name) {
     108        if (inputs[i].getName() == name) {
    89109            return mKernel->getAvailableItemCount(i);
    90110        }
     
    93113}
    94114
    95 Value * KernelBuilder::getConsumedItemCount(const std::string & name) {
    96     return getScalarField(name + Kernel::CONSUMED_ITEM_COUNT_SUFFIX);
    97 }
    98 
    99 void KernelBuilder::setProducedItemCount(const std::string & name, Value * value) {
    100     setScalarField(name + Kernel::PRODUCED_ITEM_COUNT_SUFFIX, value);
     115Value * KernelBuilder::getTerminationSignal() {
     116    if (mKernel->hasNoTerminateAttribute()) {
     117        return getFalse();
     118    }
     119    return getScalarField(Kernel::TERMINATION_SIGNAL);
     120}
     121
     122void KernelBuilder::setTerminationSignal(llvm::Value * const value) {
     123    assert (!mKernel->hasNoTerminateAttribute());
     124    assert (value->getType() == getInt1Ty());
    101125    if (codegen::DebugOptionIsSet(codegen::TraceCounts)) {
    102         CallPrintIntToStderr(mKernel->getName() + ": " + name + "_producedItemCount", value);
    103     }
    104 }
    105 
    106 void KernelBuilder::setProcessedItemCount(const std::string & name, Value * value) {
    107     setScalarField(name + Kernel::PROCESSED_ITEM_COUNT_SUFFIX, value);
    108     if (codegen::DebugOptionIsSet(codegen::TraceCounts)) {
    109         CallPrintIntToStderr(mKernel->getName() + ": " + name + "_processedItemCount", value);
    110     }
    111 }
    112 
    113 void KernelBuilder::setConsumedItemCount(const std::string & name, Value * value) {
    114     setScalarField(name + Kernel::CONSUMED_ITEM_COUNT_SUFFIX, value);
    115     if (codegen::DebugOptionIsSet(codegen::TraceCounts)) {
    116         CallPrintIntToStderr(mKernel->getName() + ": " + name + "_consumedItemCount", value);
    117     }
    118 }
    119 
    120 Value * KernelBuilder::getTerminationSignal() {
    121     return getScalarField(Kernel::TERMINATION_SIGNAL);
    122 }
    123 
    124 void KernelBuilder::setTerminationSignal() {
    125     setScalarField(Kernel::TERMINATION_SIGNAL, getTrue());
    126     if (codegen::DebugOptionIsSet(codegen::TraceCounts)) {
    127         CallPrintIntToStderr(mKernel->getName() + ": setTerminationSignal", getTrue());
    128     }
     126        CallPrintIntToStderr(mKernel->getName() + ": setTerminationSignal", value);
     127    }
     128    setScalarField(Kernel::TERMINATION_SIGNAL, value);
    129129}
    130130
    131131Value * KernelBuilder::getLinearlyAccessibleItems(const std::string & name, Value * fromPosition, Value * avail, bool reverse) {
    132     Kernel::Port port; unsigned index;
    133     std::tie(port, index) = mKernel->getStreamPort(name);
    134     const StreamSetBuffer * buf = nullptr;
    135     if (port == Kernel::Port::Input) {
    136         const auto lookAhead = mKernel->getLookAhead(index);
    137         if (LLVM_UNLIKELY(lookAhead != 0)) {
    138             fromPosition = CreateAdd(ConstantInt::get(fromPosition->getType(), lookAhead), fromPosition);
    139         }
    140         buf = mKernel->getInputStreamSetBuffer(name);
    141     } else {
    142         buf = mKernel->getOutputStreamSetBuffer(name);
    143     }
    144     assert (buf);
    145     return buf->getLinearlyAccessibleItems(this, getStreamSetBufferPtr(name), fromPosition, avail, reverse);
     132    const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     133    return buf->getLinearlyAccessibleItems(this, getStreamHandle(name), fromPosition, avail, reverse);
    146134}
    147135
    148136Value * KernelBuilder::getLinearlyWritableItems(const std::string & name, Value * fromPosition, bool reverse) {
    149137    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    150     return buf->getLinearlyWritableItems(this, getStreamSetBufferPtr(name), fromPosition, reverse);
     138    return buf->getLinearlyWritableItems(this, getStreamHandle(name), fromPosition, reverse);
     139}
     140
     141Value * KernelBuilder::copy(const std::string & name, Value * target, Value * source, Value * itemsToCopy, const unsigned alignment) {
     142    const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
     143    return buf->copy(this, getStreamHandle(name), target, source, itemsToCopy, alignment);
     144}
     145
     146void KernelBuilder::CreateCopyBack(const std::string & name, llvm::Value * from, llvm::Value * to) {
     147    const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
     148    return buf->genCopyBackLogic(this, getStreamHandle(name), from, to, name);
    151149}
    152150
     
    168166}
    169167
     168Value * KernelBuilder::getInputStreamPtr(const std::string & name, Value * const blockIndex) {
     169//    Value * const blockIndex = computeBlockIndex(getProcessedItemCount(name));
     170    const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     171    return buf->getBlockAddress(this, getStreamHandle(name), blockIndex);
     172}
     173
    170174Value * KernelBuilder::getInputStreamBlockPtr(const std::string & name, Value * streamIndex) {
    171     Value * const blockIndex = computeBlockIndex(getProcessedItemCount(name));
    172     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    173     return buf->getStreamBlockPtr(this, getStreamSetBufferPtr(name), streamIndex, blockIndex, true);
     175    const Kernel::StreamPort p = mKernel->getStreamPort(name);
     176    if (LLVM_UNLIKELY(p.first == Port::Output)) {
     177        report_fatal_error(name + " is not an input stream set");
     178    }
     179    Value * const addr = mKernel->getStreamSetInputBufferPtr(p.second);
     180    const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     181    return buf->getStreamBlockPtr(this, getStreamHandle(name), addr, streamIndex, true);
    174182}
    175183
     
    179187
    180188Value * KernelBuilder::getInputStreamPackPtr(const std::string & name, Value * streamIndex, Value * packIndex) {
    181     Value * const blockIndex = computeBlockIndex(getProcessedItemCount(name));
    182     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    183     return buf->getStreamPackPtr(this, getStreamSetBufferPtr(name), streamIndex, blockIndex, packIndex, true);
     189    const Kernel::StreamPort p = mKernel->getStreamPort(name);
     190    if (LLVM_UNLIKELY(p.first == Port::Output)) {
     191        report_fatal_error(name + " is not an input stream set");
     192    }
     193    Value * const addr = mKernel->getStreamSetInputBufferPtr(p.second);
     194    const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     195    return buf->getStreamPackPtr(this, getStreamHandle(name), addr, streamIndex, packIndex, true);
    184196}
    185197
     
    190202Value * KernelBuilder::getInputStreamSetCount(const std::string & name) {
    191203    const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    192     return buf->getStreamSetCount(this, getStreamSetBufferPtr(name));
     204    return buf->getStreamSetCount(this, getStreamHandle(name));
    193205}
    194206
    195207Value * KernelBuilder::getAdjustedInputStreamBlockPtr(Value * blockAdjustment, const std::string & name, Value * streamIndex) {
    196     Value * const blockIndex = CreateAdd(computeBlockIndex(getProcessedItemCount(name)), blockAdjustment);
    197     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    198     return buf->getStreamBlockPtr(this, getStreamSetBufferPtr(name), streamIndex, blockIndex, true);
     208    const Kernel::StreamPort p = mKernel->getStreamPort(name);
     209    if (LLVM_UNLIKELY(p.first == Port::Output)) {
     210        report_fatal_error(name + " is not an input stream set");
     211    }
     212    Value * const addr = mKernel->getStreamSetInputBufferPtr(p.second);
     213    const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     214    return buf->getStreamBlockPtr(this, getStreamHandle(name), CreateGEP(addr, blockAdjustment), streamIndex, true);
     215}
     216
     217Value * KernelBuilder::getOutputStreamPtr(const std::string & name, Value * const blockIndex) {
     218//    Value * const blockIndex = computeBlockIndex(getProducedItemCount(name));
     219    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     220    return buf->getBlockAddress(this, getStreamHandle(name), blockIndex);
    199221}
    200222
    201223Value * KernelBuilder::getOutputStreamBlockPtr(const std::string & name, Value * streamIndex) {
    202     Value * const blockIndex = computeBlockIndex(getProducedItemCount(name));
    203     const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    204     return buf->getStreamBlockPtr(this, getStreamSetBufferPtr(name), streamIndex, blockIndex, false);
     224    const Kernel::StreamPort p = mKernel->getStreamPort(name);
     225    if (LLVM_UNLIKELY(p.first == Port::Input)) {
     226        report_fatal_error(name + " is not an output stream set");
     227    }
     228    Value * addr = mKernel->getStreamSetOutputBufferPtr(p.second);
     229    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     230    return buf->getStreamBlockPtr(this, getStreamHandle(name), addr, streamIndex, true);
    205231}
    206232
     
    210236
    211237Value * KernelBuilder::getOutputStreamPackPtr(const std::string & name, Value * streamIndex, Value * packIndex) {
    212     Value * const blockIndex = computeBlockIndex(getProducedItemCount(name));
    213     const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    214     return buf->getStreamPackPtr(this, getStreamSetBufferPtr(name), streamIndex, blockIndex, packIndex, false);
     238    const Kernel::StreamPort p = mKernel->getStreamPort(name);
     239    if (LLVM_UNLIKELY(p.first == Port::Input)) {
     240        report_fatal_error(name + " is not an output stream set");
     241    }
     242    Value * addr = mKernel->getStreamSetOutputBufferPtr(p.second);
     243    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     244    return buf->getStreamPackPtr(this, getStreamHandle(name), addr, streamIndex, packIndex, false);
    215245}
    216246
     
    221251Value * KernelBuilder::getOutputStreamSetCount(const std::string & name) {
    222252    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    223     return buf->getStreamSetCount(this, getStreamSetBufferPtr(name));
    224 }
    225 
    226 Value * KernelBuilder::getRawInputPointer(const std::string & name, Value * streamIndex, Value * absolutePosition) {
    227     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    228     return buf->getRawItemPointer(this, getStreamSetBufferPtr(name), streamIndex, absolutePosition);
    229 }
    230 
    231 Value * KernelBuilder::getRawOutputPointer(const std::string & name, Value * streamIndex, Value * absolutePosition) {
    232     const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    233     return buf->getRawItemPointer(this, getStreamSetBufferPtr(name), streamIndex, absolutePosition);
     253    return buf->getStreamSetCount(this, getStreamHandle(name));
     254}
     255
     256Value * KernelBuilder::getRawInputPointer(const std::string & name, Value * absolutePosition) {
     257    const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     258    return buf->getRawItemPointer(this, getStreamHandle(name), absolutePosition);
     259}
     260
     261Value * KernelBuilder::getRawOutputPointer(const std::string & name, Value * absolutePosition) {
     262    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     263    return buf->getRawItemPointer(this, getStreamHandle(name), absolutePosition);
    234264}
    235265
    236266Value * KernelBuilder::getBaseAddress(const std::string & name) {
    237     return mKernel->getAnyStreamSetBuffer(name)->getBaseAddress(this, getStreamSetBufferPtr(name));
     267    return mKernel->getAnyStreamSetBuffer(name)->getBaseAddress(this, getStreamHandle(name));
    238268}
    239269
    240270void KernelBuilder::setBaseAddress(const std::string & name, Value * const addr) {
    241     return mKernel->getAnyStreamSetBuffer(name)->setBaseAddress(this, getStreamSetBufferPtr(name), addr);
     271    return mKernel->getAnyStreamSetBuffer(name)->setBaseAddress(this, getStreamHandle(name), addr);
    242272}
    243273
    244274Value * KernelBuilder::getBufferedSize(const std::string & name) {
    245     return mKernel->getAnyStreamSetBuffer(name)->getBufferedSize(this, getStreamSetBufferPtr(name));
     275    return mKernel->getAnyStreamSetBuffer(name)->getBufferedSize(this, getStreamHandle(name));
    246276}
    247277
    248278void KernelBuilder::setBufferedSize(const std::string & name, Value * size) {
    249     mKernel->getAnyStreamSetBuffer(name)->setBufferedSize(this, getStreamSetBufferPtr(name), size);
     279    mKernel->getAnyStreamSetBuffer(name)->setBufferedSize(this, getStreamHandle(name), size);
    250280}
    251281
    252282
    253283Value * KernelBuilder::getCapacity(const std::string & name) {
    254     return mKernel->getAnyStreamSetBuffer(name)->getCapacity(this, getStreamSetBufferPtr(name));
     284    return mKernel->getAnyStreamSetBuffer(name)->getCapacity(this, getStreamHandle(name));
    255285}
    256286
    257287void KernelBuilder::setCapacity(const std::string & name, Value * c) {
    258     mKernel->getAnyStreamSetBuffer(name)->setCapacity(this, getStreamSetBufferPtr(name), c);
     288    mKernel->getAnyStreamSetBuffer(name)->setCapacity(this, getStreamHandle(name), c);
    259289}
    260290
    261291   
    262292CallInst * KernelBuilder::createDoSegmentCall(const std::vector<Value *> & args) {
    263     Function * const doSegment = mKernel->getDoSegmentFunction(getModule());
    264     assert (doSegment->getArgumentList().size() == args.size());
    265     return CreateCall(doSegment, args);
     293//    Function * const doSegment = mKernel->getDoSegmentFunction(getModule());
     294//    assert (doSegment->getArgumentList().size() == args.size());
     295//    return CreateCall(doSegment, args);
     296    return mKernel->makeDoSegmentCall(*this, args);
    266297}
    267298
     
    278309        for (unsigned i = 0; i < n; ++i) {
    279310            const Binding & b = outputs[i];
    280             if (b.name == accumName) {
     311            if (b.getName() == accumName) {
    281312                if (n == 1) {
    282313                    return results;
     
    307338        BasicBlock * wait[n];
    308339        for (unsigned i = 0; i < n; ++i) {
    309             load[i] = BasicBlock::Create(getContext(), consumers[i].name + "Load", parent);
    310             wait[i] = BasicBlock::Create(getContext(), consumers[i].name + "Wait", parent);
     340            load[i] = BasicBlock::Create(getContext(), consumers[i].getName() + "Load", parent);
     341            wait[i] = BasicBlock::Create(getContext(), consumers[i].getName() + "Wait", parent);
    311342        }
    312343        load[n] = BasicBlock::Create(getContext(), "Resume", parent);
     
    315346
    316347            SetInsertPoint(load[i]);
    317             Value * const outputConsumers = getConsumerLock(consumers[i].name);
     348            Value * const outputConsumers = getConsumerLock(consumers[i].getName());
    318349
    319350            Value * const consumerCount = CreateLoad(CreateGEP(outputConsumers, {zero, zero}));
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.h

    r5650 r5706  
    22#define KERNEL_BUILDER_H
    33
    4 #include <kernels/interface.h>
    54#include <IR_Gen/idisa_builder.h>
     5#include <kernels/kernel.h>
    66
    77namespace kernel {
     
    1010
    1111class KernelBuilder : public virtual IDISA::IDISA_Builder {
     12    friend class Kernel;
    1213public:
    1314
     
    3536    void releaseLogicalSegmentNo(llvm::Value * nextSegNo);
    3637
    37     llvm::Value * getProducedItemCount(const std::string & name, llvm::Value * doFinal = nullptr);
     38    llvm::Value * getProducedItemCount(const std::string & name) {
     39        return getInternalItemCount(name, Kernel::PRODUCED_ITEM_COUNT_SUFFIX);
     40    }
    3841
    39     void setProducedItemCount(const std::string & name, llvm::Value * value);
     42    void setProducedItemCount(const std::string & name, llvm::Value * value) {
     43        setInternalItemCount(name, Kernel::PRODUCED_ITEM_COUNT_SUFFIX, value);
     44    }
    4045
    41     llvm::Value * getProcessedItemCount(const std::string & name);
     46    llvm::Value * getProcessedItemCount(const std::string & name) {
     47        return getInternalItemCount(name, Kernel::PROCESSED_ITEM_COUNT_SUFFIX);
     48    }
    4249
    43     void setProcessedItemCount(const std::string & name, llvm::Value * value);
     50    void setProcessedItemCount(const std::string & name, llvm::Value * value) {
     51        setInternalItemCount(name, Kernel::PROCESSED_ITEM_COUNT_SUFFIX, value);
     52    }
    4453
    45     llvm::Value * getConsumedItemCount(const std::string & name);
     54    llvm::Value * getConsumedItemCount(const std::string & name) {
     55        return getInternalItemCount(name, Kernel::CONSUMED_ITEM_COUNT_SUFFIX);
     56    }
    4657
    47     void setConsumedItemCount(const std::string & name, llvm::Value * value);
     58    void setConsumedItemCount(const std::string & name, llvm::Value * value) {
     59        setInternalItemCount(name, Kernel::CONSUMED_ITEM_COUNT_SUFFIX, value);
     60    }
    4861
    4962    llvm::Value * getTerminationSignal();
    5063
    51     void setTerminationSignal();
     64    void setTerminationSignal() { setTerminationSignal(getTrue()); }
     65
     66    void setTerminationSignal(llvm::Value * const value);
     67
     68    llvm::Value * getCycleCountPtr();
    5269
    5370    // Run-time access of Kernel State and parameters of methods for
    5471    // use in implementing kernels.
     72
     73    llvm::Value * getInputStreamPtr(const std::string & name, llvm::Value * const blockIndex);
    5574
    5675    llvm::Value * getInputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex);
     
    6382
    6483    llvm::Value * getInputStreamSetCount(const std::string & name);
     84
     85    llvm::Value * getOutputStreamPtr(const std::string & name, llvm::Value * const blockIndex);
    6586
    6687    llvm::Value * getOutputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex);
     
    7697    llvm::Value * getAdjustedInputStreamBlockPtr(llvm::Value * blockAdjustment, const std::string & name, llvm::Value * streamIndex);
    7798
    78     llvm::Value * getRawInputPointer(const std::string & name, llvm::Value * streamIndex, llvm::Value * absolutePosition);
     99    llvm::Value * getRawInputPointer(const std::string & name, llvm::Value * absolutePosition);
    79100
    80     llvm::Value * getRawOutputPointer(const std::string & name, llvm::Value * streamIndex, llvm::Value * absolutePosition);
     101    llvm::Value * getRawOutputPointer(const std::string & name, llvm::Value * absolutePosition);
    81102
    82103    llvm::Value * getBaseAddress(const std::string & name);
     104
     105    void CreateCopyBack(const std::string & name, llvm::Value * from, llvm::Value * to);
    83106
    84107    void setBaseAddress(const std::string & name, llvm::Value * addr);
     
    98121    llvm::Value * getLinearlyWritableItems(const std::string & name, llvm::Value * fromPos, bool reverse = false);
    99122   
     123    llvm::Value * copy(const std::string & name, llvm::Value * target, llvm::Value * source, llvm::Value * itemsToCopy, const unsigned alignment = 0);
     124
    100125    llvm::BasicBlock * CreateConsumerWait();
    101126
    102     llvm::Value * getStreamSetBufferPtr(const std::string & name);
     127    llvm::Value * getStreamHandle(const std::string & name);
    103128
    104129    llvm::CallInst * createDoSegmentCall(const std::vector<llvm::Value *> & args);
     
    130155    llvm::Value * getScalarFieldPtr(llvm::Value * instance, const std::string & fieldName);
    131156
     157    llvm::Value * getInternalItemCount(const std::string & name, const std::string & suffix);
     158
     159    void setInternalItemCount(const std::string & name, const std::string & suffix, llvm::Value * const value);
     160
    132161private:
    133162
  • icGREP/icgrep-devel/icgrep/kernels/linebreak_kernel.cpp

    r5436 r5706  
    2121
    2222LineBreakKernelBuilder::LineBreakKernelBuilder(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned basisBitsCount)
    23 : PabloKernel(b, "lb", {Binding{b->getStreamSetTy(basisBitsCount), "basis"}}, {Binding{b->getStreamSetTy(1), "linebreak", Add1()}}) {
     23: PabloKernel(b, "lb", {Binding{b->getStreamSetTy(basisBitsCount), "basis"}}, {Binding{b->getStreamSetTy(1), "linebreak", FixedRate(), Add1()}}) {
    2424
    2525}
  • icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.cpp

    r5645 r5706  
    3232            iBuilder->getSize(iBuilder->getBitBlockWidth()),
    3333            iBuilder->CreateSub(iBuilder->getAvailableItemCount("literalIndexes"), iBuilder->getProcessedItemCount("literalIndexes")));
    34     Value * inputBufferBasePtr = iBuilder->getRawInputPointer("inputStream", iBuilder->getSize(0), iBuilder->getSize(0));
    35     Value * outputBufferBasePtr = iBuilder->getRawOutputPointer("outputStream", iBuilder->getSize(0), iBuilder->getSize(0));
     34    Value * inputBufferBasePtr = iBuilder->getRawInputPointer("inputStream", iBuilder->getSize(0));
     35    Value * outputBufferBasePtr = iBuilder->getRawOutputPointer("outputStream", iBuilder->getSize(0));
    3636    iBuilder->CreateBr(loopBody);
    3737
     
    5555    Value * matchLength = iBuilder->CreateZExt(iBuilder->CreateLoad(matchLengthPtr), iBuilder->getSizeTy());
    5656
    57 #if 0
    58     Value * processedItem = iBuilder->CreateAdd(iBuilder->getProcessedItemCount("literalIndexes"), phiInputIndex);
    59     iBuilder->CallPrintInt("ProccessedItem", processedItem);
    60     iBuilder->CallPrintInt("LiteralStart", literalStart);
    61     iBuilder->CallPrintInt("LiteralLength", literalLength);
    62     iBuilder->CallPrintInt("MatchOffset", matchOffset);
    63     iBuilder->CallPrintInt("MatchLength", matchLength);
    64 #endif
     57//    iBuilder->CallPrintInt(" ----- literalStart", literalStart);
     58//    iBuilder->CallPrintInt(" ----- literalLength", literalLength);
     59//    iBuilder->CallPrintInt(" ----- matchOffset", matchOffset);
     60//    iBuilder->CallPrintInt(" ----- matchLength", matchLength);
     61
     62//#if 0
     63//    Value * processedItem = iBuilder->CreateAdd(iBuilder->getProcessedItemCount("literalIndexes"), phiInputIndex);
     64//    iBuilder->CallPrintInt("ProccessedItem", processedItem);
     65//    iBuilder->CallPrintInt("LiteralStart", literalStart);
     66//    iBuilder->CallPrintInt("LiteralLength", literalLength);
     67//    iBuilder->CallPrintInt("MatchOffset", matchOffset);
     68//    iBuilder->CallPrintInt("MatchLength", matchLength);
     69//#endif
    6570
    6671    // =================================================
     
    114119
    115120    iBuilder->SetInsertPoint(cpyLoopBody);
    116 #ifndef NDEBUG
    117     iBuilder->CallPrintIntToStderr("srcOffset", phiSrcOffset);
    118     iBuilder->CallPrintIntToStderr("dstOffset", phiDstOffset);
    119 #endif
     121//#ifndef NDEBUG
     122//    iBuilder->CallPrintIntToStderr("srcOffset", phiSrcOffset);
     123//    iBuilder->CallPrintIntToStderr("dstOffset", phiDstOffset);
     124//#endif
    120125    BasicBlock * reachingBufferEnd_then = iBuilder->CreateBasicBlock("matchcopy_reaching_buf_end_then");
    121126    BasicBlock * reachingBufferEnd_else = iBuilder->CreateBasicBlock("matchcopy_reaching_buf_end_else");
     
    182187
    183188    iBuilder->SetInsertPoint(loopExit);
    184 #ifndef NDEBUG
    185     iBuilder->CallPrintInt("Decompressed bytes", iBuilder->getProducedItemCount("outputStream"));
    186 #endif
     189//#ifndef NDEBUG
     190//    iBuilder->CallPrintInt("Decompressed bytes", iBuilder->getProducedItemCount("outputStream"));
     191//#endif
    187192}
    188193
     
    193198    {Binding{iBuilder->getStreamSetTy(2, 32), "literalIndexes"},
    194199     Binding{iBuilder->getStreamSetTy(2, 32), "matchIndexes"},
    195      Binding{iBuilder->getStreamSetTy(1, 8), "inputStream", UnknownRate()}},
     200     Binding{iBuilder->getStreamSetTy(1, 8), "inputStream", UnknownRate(), LookBehind(65536)}},
    196201    // Outputs
    197202    {Binding{iBuilder->getStreamSetTy(1, 8), "outputStream", UnknownRate()}},
  • icGREP/icgrep-devel/icgrep/kernels/lz4_index_decoder.cpp

    r5493 r5706  
    1212
    1313#ifndef NDEBUG
    14 #define DEBUG_RT_PRINT 1
     14#define DEBUG_RT_PRINT 0
    1515#else
    1616#define DEBUG_RT_PRINT 0
     
    1818
    1919#define printRTDebugMsg(MSG) \
    20     do { if (DEBUG_RT_PRINT) iBuilder->CallPrintMsgToStderr(MSG); } while (0)
     20    if (DEBUG_RT_PRINT) iBuilder->CallPrintMsgToStderr(MSG)
    2121
    2222#define printRTDebugInt(NAME, X) \
    23     do { if (DEBUG_RT_PRINT) iBuilder->CallPrintIntToStderr(NAME, X); } while (0)
     23    if (DEBUG_RT_PRINT) iBuilder->CallPrintIntToStderr(NAME, X)
    2424
    2525#define printGlobalPos() \
     
    144144    Value * producedItem = iBuilder->getProducedItemCount("literalIndexes");
    145145
    146 #ifndef NDEBUG
    147     iBuilder->CallPrintInt("ProducedItem", producedItem);
    148     // LiteralStart is adjusted to be relative to the block start, so that
    149     // the output can be compared against that of the reference implementation.
    150     Value * literalStart = iBuilder->CreateSub(iBuilder->getScalarField("LiteralStart"), iBuilder->getScalarField("LZ4BlockStart"));
    151     iBuilder->CallPrintInt("LiteralStart", literalStart);
    152     iBuilder->CallPrintInt("LiteralLength", iBuilder->getScalarField("LiteralLength"));
    153     iBuilder->CallPrintInt("MatchOffset", iBuilder->getScalarField("MatchOffset"));
    154     iBuilder->CallPrintInt("MatchLength", iBuilder->getScalarField("MatchLength"));
    155 #endif
     146//#ifndef NDEBUG
     147//    iBuilder->CallPrintInt("ProducedItem", producedItem);
     148//    // LiteralStart is adjusted to be relative to the block start, so that
     149//    // the output can be compared against that of the reference implementation.
     150//    Value * literalStart = iBuilder->CreateSub(iBuilder->getScalarField("LiteralStart"), iBuilder->getScalarField("LZ4BlockStart"));
     151//    iBuilder->CallPrintInt("LiteralStart", literalStart);
     152//    iBuilder->CallPrintInt("LiteralLength", iBuilder->getScalarField("LiteralLength"));
     153//    iBuilder->CallPrintInt("MatchOffset", iBuilder->getScalarField("MatchOffset"));
     154//    iBuilder->CallPrintInt("MatchLength", iBuilder->getScalarField("MatchLength"));
     155//#endif
    156156    printRTDebugMsg("--------------");
    157157
     
    695695    // Outputs: literal start, literal length, match offset, match length
    696696    {Binding{iBuilder->getStreamSetTy(2, 32), "literalIndexes", UnknownRate()},
    697      Binding{iBuilder->getStreamSetTy(2, 32), "matchIndexes", FixedRatio(1, 1, "literalIndexes")}},
     697     Binding{iBuilder->getStreamSetTy(2, 32), "matchIndexes", RateEqualTo("literalIndexes")}},
    698698    // Arguments
    699699    {Binding{iBuilder->getInt1Ty(), "hasBlockChecksum"}},
  • icGREP/icgrep-devel/icgrep/kernels/p2s_kernel.cpp

    r5440 r5706  
    11#include "p2s_kernel.h"
    2 //#include "llvm/IR/Constant.h"      // for Constant
    3 //#include "llvm/IR/Constants.h"     // for ConstantInt
    4 //#include "llvm/IR/DerivedTypes.h"  // for PointerType, VectorType
    5 //#include "llvm/IR/Function.h"      // for Function, Function::arg_iterator
    6 //#include <llvm/IR/Module.h>
    72#include <kernels/streamset.h>
    83#include <kernels/kernel_builder.h>
     
    107102    PointerType * int16PtrTy = iBuilder->getInt16Ty()->getPointerTo();
    108103    PointerType * bitBlockPtrTy = iBuilder->getBitBlockType()->getPointerTo();
    109     ConstantInt * stride = iBuilder->getSize(iBuilder->getStride());
     104    ConstantInt * blockMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
    110105
    111106    Value * hi_input[8];
     
    123118    p2s(iBuilder, lo_input, lo_bytes);
    124119
    125     Value * delCountBlock_ptr = iBuilder->getInputStreamBlockPtr("deletionCounts", iBuilder->getInt32(0));
    126     Value * unit_counts = iBuilder->fwCast(iBuilder->getBitBlockWidth() / 16, iBuilder->CreateBlockAlignedLoad(delCountBlock_ptr));
    127 
    128 
    129     Value * u16_output_ptr = iBuilder->getOutputStreamBlockPtr("i16Stream", iBuilder->getInt32(0));
    130     u16_output_ptr = iBuilder->CreatePointerCast(u16_output_ptr, int16PtrTy);
     120    Value * delCount = iBuilder->loadInputStreamBlock("deletionCounts", iBuilder->getInt32(0));
     121    Value * unitCounts = iBuilder->fwCast(iBuilder->getBitBlockWidth() / 16, delCount);
     122    Value * outputPtr = iBuilder->getOutputStreamBlockPtr("i16Stream", iBuilder->getInt32(0));
     123    outputPtr = iBuilder->CreatePointerCast(outputPtr, int16PtrTy);
    131124    Value * i16UnitsGenerated = iBuilder->getProducedItemCount("i16Stream"); // units generated to buffer
    132     u16_output_ptr = iBuilder->CreateGEP(u16_output_ptr, iBuilder->CreateURem(i16UnitsGenerated, stride));
     125    outputPtr = iBuilder->CreateGEP(outputPtr, iBuilder->CreateAnd(i16UnitsGenerated, blockMask));
    133126
    134127    Value * offset = ConstantInt::get(i32Ty, 0);
     
    136129    for (unsigned j = 0; j < 8; ++j) {
    137130        Value * merge0 = iBuilder->bitCast(iBuilder->esimd_mergel(8, hi_bytes[j], lo_bytes[j]));
    138         iBuilder->CreateAlignedStore(merge0, iBuilder->CreateBitCast(iBuilder->CreateGEP(u16_output_ptr, offset), bitBlockPtrTy), 1);
    139         offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unit_counts, iBuilder->getInt32(2 * j)), i32Ty);
     131        iBuilder->CreateAlignedStore(merge0, iBuilder->CreateBitCast(iBuilder->CreateGEP(outputPtr, offset), bitBlockPtrTy), 1);
     132        offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unitCounts, iBuilder->getInt32(2 * j)), i32Ty);
    140133
    141134        Value * merge1 = iBuilder->bitCast(iBuilder->esimd_mergeh(8, hi_bytes[j], lo_bytes[j]));
    142         iBuilder->CreateAlignedStore(merge1, iBuilder->CreateBitCast(iBuilder->CreateGEP(u16_output_ptr, offset), bitBlockPtrTy), 1);
    143         offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unit_counts, iBuilder->getInt32(2 * j + 1)), i32Ty);
    144     }   
     135        iBuilder->CreateAlignedStore(merge1, iBuilder->CreateBitCast(iBuilder->CreateGEP(outputPtr, offset), bitBlockPtrTy), 1);
     136        offset = iBuilder->CreateZExt(iBuilder->CreateExtractElement(unitCounts, iBuilder->getInt32(2 * j + 1)), i32Ty);
     137    }
     138
    145139    Value * i16UnitsFinal = iBuilder->CreateAdd(i16UnitsGenerated, iBuilder->CreateZExt(offset, iBuilder->getSizeTy()));
    146140    iBuilder->setProducedItemCount("i16Stream", i16UnitsFinal);
     
    157151: BlockOrientedKernel("p2s_compress",
    158152              {Binding{iBuilder->getStreamSetTy(8, 1), "basisBits"}, Binding{iBuilder->getStreamSetTy(1, 1), "deletionCounts"}},
    159                       {Binding{iBuilder->getStreamSetTy(1, 8), "byteStream", MaxRatio(1)}},
     153              {Binding{iBuilder->getStreamSetTy(1, 8), "byteStream", BoundedRate(0, 1)}},
    160154              {}, {}, {}) {
    161155}
     
    172166: BlockOrientedKernel("p2s_16_compress",
    173167              {Binding{b->getStreamSetTy(16, 1), "basisBits"}, Binding{b->getStreamSetTy(1, 1), "deletionCounts"}},
    174               {Binding{b->getStreamSetTy(1, 16), "i16Stream", MaxRatio(1)}},
     168              {Binding{b->getStreamSetTy(1, 16), "i16Stream", BoundedRate(0, 1)}},
    175169              {},
    176170              {},
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r5639 r5706  
    1414PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width)
    1515: MultiBlockKernel("PDEPdel",
    16                   {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", MaxRatio(1)}, Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", MaxRatio(1)}},
     16                  {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", BoundedRate(0, 1)},
     17                   Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", BoundedRate(0, 1)}},
    1718                  {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
    1819                  {}, {}, {})
     
    2425}
    2526
    26 void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb) {   
     27void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
    2728    BasicBlock * entry = kb->GetInsertBlock();
    2829    BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
     
    3132    BasicBlock * terminate = kb->CreateBasicBlock("terminate");
    3233
    33     Function::arg_iterator args = mCurrentMethod->arg_begin();
    34     args++; //self
    35     Value * itemsToDo = &*(args++); // Since PDEP marker stream is a bit stream, this is the number of PDEP marker bits to process
    36     // Get pointer to start of the StreamSetBlock containing unprocessed input items.
    37     Value * sourceItemsAvail =  &*(args++);
    38     Value * PDEPStrmPtr = &*(args++);
    39     Value * inputSwizzlesPtr = &*(args++);
     34    Value * itemsToDo = mAvailableItemCount[0];
     35    Value * sourceItemsAvail = mAvailableItemCount[1];
     36
     37    Value * PDEPStrmPtr = iBuilder->getInputStreamBlockPtr("PDEPmarkerStream", iBuilder->getInt32(0)); // mStreamBufferPtr[0];
     38    Value * inputSwizzlesPtr = iBuilder->getInputStreamBlockPtr("sourceStreamSet", iBuilder->getInt32(0)); // mStreamBufferPtr[1];
    4039    // Get pointer to start of the output StreamSetBlock we're currently writing to
    41     Value * outputStreamPtr = &*(args);
     40    Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputStreamSet", iBuilder->getInt32(0)); // mStreamBufferPtr[2];
    4241
    4342    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.h

    r5627 r5706  
    7474    const unsigned mSwizzleFactor;
    7575    const unsigned mPDEPWidth;
    76     void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb) override;
     76    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfStrides) override;
    7777    std::vector<llvm::Value *> get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * PDEP_ms_blk,
    7878                                              const unsigned mask_width);
  • icGREP/icgrep-devel/icgrep/kernels/radix64.cpp

    r5599 r5706  
    3939// of bytes to the actual output stream.
    4040
    41 void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder) {
     41void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, Value * const numOfStrides) {
    4242
    4343    BasicBlock * expand2_3entry = iBuilder->GetInsertBlock();
     
    7171    const unsigned packAlign = iBuilder->getBitBlockWidth()/8;
    7272
    73     Function::arg_iterator args = mCurrentMethod->arg_begin();
    74    
    75     /* self = */ args++;
    76     Value * itemsToDo = &*(args++);
    77     Value * sourceStream = &*(args++);
    78     Value * expandedStream = &*(args);
     73    Value * itemsToDo = mAvailableItemCount[0];
     74
     75    Value * sourceStream = iBuilder->getInputStreamBlockPtr("sourceStream", iBuilder->getInt32(0));
     76    Value * expandedStream = iBuilder->getOutputStreamBlockPtr("expand34Stream", iBuilder->getInt32(0));
    7977
    8078    // The main loop processes 3 packs of data at a time.
     
    132130   
    133131    iBuilder->SetInsertPoint(expand3_4_exit);
    134     }
     132}
    135133
    136134
     
    294292expand3_4Kernel::expand3_4Kernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder)
    295293: MultiBlockKernel("expand3_4",
    296             {Binding{iBuilder->getStreamSetTy(1, 8), "sourceStream"}},
    297             {Binding{iBuilder->getStreamSetTy(1, 8), "expand34Stream", FixedRatio(4,3)}},
     294            {Binding{iBuilder->getStreamSetTy(1, 8), "sourceStream", FixedRate(3)}},
     295            {Binding{iBuilder->getStreamSetTy(1, 8), "expand34Stream", FixedRate(4)}},
    298296            {}, {}, {}) {
    299     setKernelStride(3 * iBuilder->getBitBlockWidth()/8);
     297
    300298}
    301299
     
    310308: BlockOrientedKernel("base64",
    311309            {Binding{iBuilder->getStreamSetTy(1, 8), "radix64stream"}},
    312             {Binding{iBuilder->getStreamSetTy(1, 8), "base64stream", RoundUpToMultiple(4)}},
     310            {Binding{iBuilder->getStreamSetTy(1, 8), "base64stream", FixedRate(1), RoundUpTo(4)}},
    313311            {}, {}, {}) {
    314312}
  • icGREP/icgrep-devel/icgrep/kernels/radix64.h

    r5507 r5706  
    2525    bool hasSignature() const override { return false; }
    2626private:
    27     void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder) override;
     27    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides) override;
    2828};
    2929
  • icGREP/icgrep-devel/icgrep/kernels/scanmatchgen.cpp

    r5695 r5706  
    2121namespace kernel {
    2222
    23 void ScanMatchKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder) {
     23void ScanMatchKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
    2424
    2525    Module * const m = iBuilder->getModule();
     
    4242    const unsigned fieldCount = iBuilder->getBitBlockWidth() / sizeTy->getBitWidth();
    4343    VectorType * const scanwordVectorType =  VectorType::get(sizeTy, fieldCount);
    44     Constant * blockSize = iBuilder->getSize(iBuilder->getBitBlockWidth());
    45 
    46     Function::arg_iterator args = mCurrentMethod->arg_begin();
    47     /* self = */ args++;
    48     Value * itemsToDo = &*(args++);
    49     /* inputStreamAvail = */ args++;
    50     Value * match_result = &*(args++);
    51     Value * line_break = &*(args++);
    52     /* input_stream = */ args++;
    53 
    54     Value * blocksToDo = iBuilder->CreateUDivCeil(itemsToDo, blockSize);
     44
     45    Value * match_result = iBuilder->getInputStreamBlockPtr("matchResult", iBuilder->getInt32(0));
     46    Value * line_break = iBuilder->getInputStreamBlockPtr("lineBreak", iBuilder->getInt32(0));
     47
     48    Value * blocksToDo = iBuilder->CreateAdd(numOfStrides, iBuilder->CreateZExt(mIsFinal, numOfStrides->getType()));
     49    blocksToDo = iBuilder->CreateMul(blocksToDo, iBuilder->getSize(mStride / iBuilder->getBitBlockWidth()));
    5550   
    5651    Value * match_result_ptr = iBuilder->CreateBitCast(match_result, scanwordVectorType->getPointerTo());
     
    140135            phiRecordStart->addIncoming(matchRecordStart, loop_final_block);
    141136            Value * matchRecordEnd = iBuilder->CreateAdd(phiScanwordPos, iBuilder->CreateCountForwardZeroes(phiMatchWord));
    142             Value * const inputStream = iBuilder->getRawInputPointer("InputStream", iBuilder->getInt32(0), iBuilder->getInt32(0));
     137            Value * const inputStream = iBuilder->getRawInputPointer("InputStream", iBuilder->getInt32(0));
    143138            Function * dispatcher = m->getFunction("accumulate_match_wrapper"); assert (dispatcher);
    144139            Value * start_ptr = iBuilder->CreateGEP(inputStream, matchRecordStart);
     
    195190
    196191    iBuilder->SetInsertPoint(blocksExit);
    197     iBuilder->CreateCondBr(iBuilder->CreateICmpULT(itemsToDo, blockSize), callFinalizeScan, scanReturn);
     192    iBuilder->CreateCondBr(mIsFinal, callFinalizeScan, scanReturn);
     193
    198194    iBuilder->SetInsertPoint(callFinalizeScan);
    199195    Value * bufSize = iBuilder->getBufferedSize("InputStream");
    200196    Function * finalizer = m->getFunction("finalize_match_wrapper"); assert (finalizer);
    201     Value * const buffer_base = iBuilder->getRawInputPointer("InputStream", iBuilder->getInt32(0), iBuilder->getInt32(0));
     197    Value * const buffer_base = iBuilder->getRawInputPointer("InputStream", iBuilder->getInt32(0));
    202198    Value * buffer_end_address = iBuilder->CreateGEP(buffer_base, bufSize);
    203199    iBuilder->CreateCall(finalizer, {accumulator, buffer_end_address});
    204200    iBuilder->CreateBr(scanReturn);
     201
    205202    iBuilder->SetInsertPoint(scanReturn);
    206203   
     
    209206ScanMatchKernel::ScanMatchKernel(const std::unique_ptr<kernel::KernelBuilder> & b)
    210207: MultiBlockKernel("scanMatch",
    211     {Binding{b->getStreamSetTy(1, 1), "matchResult"}, Binding{b->getStreamSetTy(1, 1), "lineBreak"}, Binding{b->getStreamSetTy(1, 8), "InputStream", UnknownRate()}},
     208    {Binding{b->getStreamSetTy(1, 1), "matchResult", FixedRate(), Principle()}, Binding{b->getStreamSetTy(1, 1), "lineBreak"}, Binding{b->getStreamSetTy(1, 8), "InputStream", UnknownRate()}},
    212209    {},
    213210    {Binding{b->getIntAddrTy(), "accumulator_address"}},
  • icGREP/icgrep-devel/icgrep/kernels/scanmatchgen.h

    r5697 r5706  
    2020    bool hasSignature() const override { return false; }
    2121private:
    22         void generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
     22    void generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
    2323};
    2424
  • icGREP/icgrep-devel/icgrep/kernels/source_kernel.cpp

    r5694 r5706  
    2828/// MMAP SOURCE KERNEL
    2929
    30 void MMapSourceKernel::linkExternalMethods(const std::unique_ptr<kernel::KernelBuilder> & kb) {
    31     mFileSizeFunction = kb->LinkFunction("file_size", &file_size);
    32 }
    33 
    34 void MMapSourceKernel::generateInitializeMethod(const std::unique_ptr<KernelBuilder> & kb) {
     30Function * MMapSourceKernel::linkFileSizeMethod(const std::unique_ptr<kernel::KernelBuilder> & kb) {
     31    return kb->LinkFunction("file_size", &file_size);
     32}
     33
     34void MMapSourceKernel::generateInitializeMethod(Function * const fileSizeMethod, const unsigned codeUnitWidth, const std::unique_ptr<KernelBuilder> & kb) {
    3535    BasicBlock * const emptyFile = kb->CreateBasicBlock("EmptyFile");
    3636    BasicBlock * const nonEmptyFile = kb->CreateBasicBlock("NonEmptyFile");
    3737    BasicBlock * const exit = kb->CreateBasicBlock("Exit");
    3838    IntegerType * const sizeTy = kb->getSizeTy();
    39     assert (kb->getKernel() == this);
    4039    Value * const fd = kb->getScalarField("fileDescriptor");
    41     assert (mFileSizeFunction);
    42     Value * fileSize = kb->CreateCall(mFileSizeFunction, fd);
     40    assert (fileSizeMethod);
     41    Value * fileSize = kb->CreateCall(fileSizeMethod, fd);
    4342    fileSize = kb->CreateZExtOrTrunc(fileSize, sizeTy);
    44     if (mCodeUnitWidth > 8) {
    45         fileSize = kb->CreateUDiv(fileSize, kb->getSize(mCodeUnitWidth / 8));
     43    if (codeUnitWidth > 8) {
     44        fileSize = kb->CreateUDiv(fileSize, kb->getSize(codeUnitWidth / 8));
    4645    }
    4746    Value * const isEmpty = kb->CreateICmpEQ(fileSize, ConstantInt::getNullValue(fileSize->getType()));
     
    7473}
    7574
    76 void MMapSourceKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) {
     75void MMapSourceKernel::generateDoSegmentMethod(const unsigned codeUnitWidth, const unsigned segmentBlocks, const std::unique_ptr<KernelBuilder> & kb) {
    7776
    7877    BasicBlock * dropPages = kb->CreateBasicBlock("dropPages");
     
    9089
    9190    // multiply the consumed count by the code unit size then mask off any partial pages
    92     if (mCodeUnitWidth > 8) {
    93         consumed = kb->CreateMul(consumed, ConstantInt::get(consumedTy, mCodeUnitWidth / 8));
     91    if (codeUnitWidth > 8) {
     92        consumed = kb->CreateMul(consumed, ConstantInt::get(consumedTy, codeUnitWidth / 8));
    9493    }
    9594    const auto pageSize = getpagesize();
     
    124123    // determine whether or not we've exhausted the file buffer
    125124    kb->SetInsertPoint(processSegment);
    126     ConstantInt * segmentItems = kb->getSize(mSegmentBlocks * kb->getBitBlockWidth());
     125    ConstantInt * segmentItems = kb->getSize(segmentBlocks * kb->getBitBlockWidth());
    127126    Value * const fileSize = kb->getScalarField("fileSize");
    128127    Value * const produced = kb->CreateAdd(kb->getProducedItemCount("sourceBuffer"), segmentItems);
    129128    Value * const lessThanFullSegment = kb->CreateICmpULT(fileSize, produced);
    130129    kb->CreateUnlikelyCondBr(lessThanFullSegment, setTermination, mmapSourceExit);
     130
    131131    kb->SetInsertPoint(setTermination);
    132 
    133132    kb->setTerminationSignal();
    134133    kb->CreateBr(mmapSourceExit);
     
    140139    itemsRead->addIncoming(fileSize, setTermination);
    141140    kb->setProducedItemCount("sourceBuffer", itemsRead);
    142 }
    143 
    144 void MMapSourceKernel::generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & kb) {
     141
     142}
     143
     144void MMapSourceKernel::unmapSourceBuffer(const std::unique_ptr<KernelBuilder> & kb) {
    145145    kb->CreateMUnmap(kb->getBaseAddress("sourceBuffer"), kb->getBufferedSize("sourceBuffer"));
    146146}
     
    160160/// READ SOURCE KERNEL
    161161
    162 void ReadSourceKernel::generateInitializeMethod(const std::unique_ptr<KernelBuilder> & kb) {
    163     const size_t initialBufferSize = 8 * getpagesize() * mCodeUnitWidth;
    164     ConstantInt * const bufferBytes = kb->getSize(initialBufferSize * mCodeUnitWidth/8);
    165     PointerType * const codeUnitPtrTy = IntegerType::get(kb->getContext(), mCodeUnitWidth)->getPointerTo();
     162void ReadSourceKernel::generateInitializeMethod(const unsigned codeUnitWidth, const std::unique_ptr<KernelBuilder> & kb) {
     163    const size_t initialBufferSize = 8 * getpagesize() * codeUnitWidth;
     164    ConstantInt * const bufferBytes = kb->getSize(initialBufferSize * codeUnitWidth / 8);
     165    PointerType * const codeUnitPtrTy = IntegerType::get(kb->getContext(), codeUnitWidth)->getPointerTo();
    166166    Value * const buffer = kb->CreatePointerCast(kb->CreateCacheAlignedMalloc(bufferBytes), codeUnitPtrTy);
    167167    kb->setScalarField("buffer", buffer);
     
    172172}
    173173
    174 void ReadSourceKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) {
    175 
    176     ConstantInt * const readSize = kb->getSize(getpagesize() * 8/mCodeUnitWidth);
    177     PointerType * const codeUnitPtrTy = IntegerType::get(kb->getContext(), mCodeUnitWidth)->getPointerTo();
     174void ReadSourceKernel::generateDoSegmentMethod(const unsigned codeUnitWidth, const unsigned segmentBlocks, const std::unique_ptr<KernelBuilder> & kb) {
     175
     176    ConstantInt * const readSize = kb->getSize(getpagesize() * 8/codeUnitWidth);
     177    PointerType * const codeUnitPtrTy = IntegerType::get(kb->getContext(), codeUnitWidth)->getPointerTo();
    178178    PointerType * const i8PtrTy = IntegerType::get(kb->getContext(), 8)->getPointerTo();
    179     ConstantInt * const codeUnitBytes = kb->getSize(mCodeUnitWidth/8);
     179    ConstantInt * const codeUnitBytes = kb->getSize(codeUnitWidth/8);
    180180    BasicBlock * const entryBlock = kb->GetInsertBlock();
    181181    BasicBlock * const exhaustedBuffer = kb->CreateBasicBlock("ExhaustedBuffer");
     
    184184    BasicBlock * const stdInExit = kb->CreateBasicBlock("StdInExit");
    185185
    186     assert(kb->getKernel() == this);
    187 
    188186    // Check whether we need to read another page of data
    189     ConstantInt * const segmentSize = kb->getSize(mSegmentBlocks * kb->getBitBlockWidth());
     187    ConstantInt * const segmentSize = kb->getSize(segmentBlocks * kb->getBitBlockWidth());
    190188    Value * bufferedSize = kb->getBufferedSize("sourceBuffer");
    191189    Value * const produced = kb->getProducedItemCount("sourceBuffer");
    192190    Value * unreadSize = kb->CreateSub(bufferedSize, produced);
     191
    193192    kb->CreateUnlikelyCondBr(kb->CreateICmpULT(unreadSize, segmentSize), exhaustedBuffer, stdInExit);
    194193
     
    207206    // If so, we can append to our existing buffer without impacting any subsequent kernel.
    208207
    209     Value * inputStream = kb->getRawOutputPointer("sourceBuffer", kb->getInt32(0), kb->getInt32(0));
     208    Value * inputStream = kb->getRawOutputPointer("sourceBuffer", kb->getInt32(0));
    210209    Value * const originalPtr = kb->CreateGEP(inputStream, produced);
    211210
     
    216215    Value * B = kb->CreateGEP(buffer, capacity);
    217216    Value * const canAppend = kb->CreateICmpULT(L, B);
     217
    218218    kb->CreateLikelyCondBr(canAppend, readData, waitOnConsumers);
    219219
     
    225225    // that our "unproduced" data must be block aligned.
    226226    const size_t blockAlignment = kb->getBitBlockWidth() / 8;
    227     Constant * const alignmentMask = kb->getSize(-(blockAlignment * 8 / mCodeUnitWidth));
     227    Constant * const alignmentMask = kb->getSize(-(blockAlignment * 8 / codeUnitWidth));
    228228    Value * const consumed = kb->CreateAnd(kb->getConsumedItemCount("sourceBuffer"), alignmentMask);
    229229    Value * const remaining = kb->CreateSub(bufferedSize, consumed);
     
    233233    Value * source = unconsumedPtr;
    234234    Value * toCopy = remaining;
    235     if (mCodeUnitWidth != 8) {
     235    if (codeUnitWidth != 8) {
    236236        source = kb->CreatePointerCast(unconsumedPtr, i8PtrTy);
    237237        toCopy = kb->CreateMul(remaining, codeUnitBytes);
     
    247247    kb->SetInsertPoint(copyBack);
    248248    // If so, just copy the data ...
    249     if (mCodeUnitWidth != 8) {
     249    if (codeUnitWidth != 8) {
    250250        target = kb->CreatePointerCast(buffer, i8PtrTy);
    251251    }
     
    256256    kb->SetInsertPoint(expandAndCopyBack);
    257257    Value * const expandedCapacity = kb->CreateShl(capacity, 1);
    258     Value * const expandedBytes = mCodeUnitWidth == 8 ? expandedCapacity : kb->CreateMul(expandedCapacity, codeUnitBytes);
     258    Value * const expandedBytes = codeUnitWidth == 8 ? expandedCapacity : kb->CreateMul(expandedCapacity, codeUnitBytes);
    259259    Value * const expandedBuffer = kb->CreatePointerCast(kb->CreateCacheAlignedMalloc(expandedBytes), codeUnitPtrTy);
    260     target = mCodeUnitWidth == 8 ? expandedBuffer : kb->CreatePointerCast(expandedBuffer, i8PtrTy);
     260    target = codeUnitWidth == 8 ? expandedBuffer : kb->CreatePointerCast(expandedBuffer, i8PtrTy);
    261261    kb->CreateMemCpy(target, source, toCopy, 1);
    262262    kb->CreateFree(buffer);
     
    284284    addr->addIncoming(originalPtr, exhaustedBuffer);
    285285    addr->addIncoming(modifiedPtr, calculateLogicalAddress);
    286     assert(kb->getKernel() == this);
    287286    Value * const fd = kb->getScalarField("fileDescriptor");
    288     Value * toRead = readSize;
    289     if (mCodeUnitWidth != 8) {
    290         toRead = kb->CreateMul(toRead, codeUnitBytes);
    291     }
    292     Value * bytesRead = kb->CreateReadCall(fd, addr, toRead);
    293     Value * itemsRead = bytesRead;
    294     if (mCodeUnitWidth != 8) {
    295         itemsRead = kb->CreateUDiv(bytesRead, codeUnitBytes);
     287
     288    Value * itemsRead = kb->CreateReadCall(fd, addr, readSize);
     289    if (codeUnitWidth != 8) {
     290        itemsRead = kb->CreateUDiv(itemsRead, codeUnitBytes);
    296291    }
    297292    unreadSize = kb->CreateAdd(unreadSize, itemsRead);
     
    306301    Value * bytesToZero = kb->CreateSub(segmentSize, unreadSize);
    307302    Value * unreadPtr = kb->CreateGEP(addr, unreadSize);
    308     bytesToZero = mCodeUnitWidth == 8 ? bytesToZero : kb->CreateMul(bytesToZero, codeUnitBytes);
    309     if (mCodeUnitWidth != 8) {
     303    bytesToZero = codeUnitWidth == 8 ? bytesToZero : kb->CreateMul(bytesToZero, codeUnitBytes);
     304    if (codeUnitWidth != 8) {
    310305        bytesToZero = kb->CreateMul(bytesToZero, codeUnitBytes);
    311306        unreadPtr = kb->CreatePointerCast(unreadPtr, i8PtrTy);
    312307    }
    313308    kb->CreateMemZero(unreadPtr, bytesToZero);
    314     kb->setTerminationSignal();
     309    kb->setCapacity("sourceBuffer", bufferedSize);
     310    kb->setTerminationSignal(kb->CreateICmpEQ(unreadSize, Constant::getNullValue(itemsRead->getType())));
    315311    kb->CreateBr(stdInExit);
    316312
     
    326322}
    327323
    328 void ReadSourceKernel::generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & kb) {
     324void ReadSourceKernel::freeBuffer(const std::unique_ptr<KernelBuilder> & kb) {
    329325    kb->CreateFree(kb->getScalarField("buffer"));
    330326}
     
    342338}
    343339
    344 // Hybrid MMap/Read source kernel
    345    
     340/// Hybrid MMap/Read source kernel
     341
     342void FDSourceKernel::linkExternalMethods(const std::unique_ptr<kernel::KernelBuilder> & kb) {
     343    mFileSizeFunction = MMapSourceKernel::linkFileSizeMethod(kb);
     344}
     345
     346void FDSourceKernel::generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & kb) {
     347    BasicBlock * finalizeRead = kb->CreateBasicBlock("finalizeRead");
     348    BasicBlock * finalizeMMap = kb->CreateBasicBlock("finalizeMMap");
     349    BasicBlock * finalizeDone = kb->CreateBasicBlock("finalizeDone");
     350    // if the fileDescriptor is 0, the file is stdin, use readSource kernel logic, otherwise use mmap logic.
     351    kb->CreateCondBr(kb->CreateICmpEQ(kb->getScalarField("fileDescriptor"), kb->getInt32(STDIN_FILENO)), finalizeRead, finalizeMMap);
     352    kb->SetInsertPoint(finalizeRead);
     353    ReadSourceKernel::freeBuffer(kb);
     354    kb->CreateBr(finalizeDone);
     355    kb->SetInsertPoint(finalizeMMap);
     356    MMapSourceKernel::unmapSourceBuffer(kb);
     357    kb->CreateBr(finalizeDone);
     358    kb->SetInsertPoint(finalizeDone);
     359}
     360
     361void FDSourceKernel::generateInitializeMethod(const std::unique_ptr<KernelBuilder> & kb) {
     362    BasicBlock * initializeRead = kb->CreateBasicBlock("initializeRead");
     363    BasicBlock * initializeMMap = kb->CreateBasicBlock("initializeMMap");
     364    BasicBlock * initializeDone = kb->CreateBasicBlock("initializeDone");
     365    // if the fileDescriptor is 0, the file is stdin, use readSource kernel logic, otherwise use MMap logic.
     366    kb->CreateCondBr(kb->CreateICmpEQ(kb->getScalarField("fileDescriptor"), kb->getInt32(STDIN_FILENO)), initializeRead, initializeMMap);
     367    kb->SetInsertPoint(initializeRead);
     368    ReadSourceKernel::generateInitializeMethod(mCodeUnitWidth, kb);
     369    kb->CreateBr(initializeDone);
     370    kb->SetInsertPoint(initializeMMap);
     371    MMapSourceKernel::generateInitializeMethod(mFileSizeFunction, mCodeUnitWidth, kb);
     372    kb->CreateBr(initializeDone);
     373    kb->SetInsertPoint(initializeDone);
     374}
     375
     376void FDSourceKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) {
     377    BasicBlock * DoSegmentRead = kb->CreateBasicBlock("DoSegmentRead");
     378    BasicBlock * DoSegmentMMap = kb->CreateBasicBlock("DoSegmentMMap");
     379    BasicBlock * DoSegmentDone = kb->CreateBasicBlock("DoSegmentDone");
     380    // if the fileDescriptor is 0, the file is stdin, use readSource kernel logic, otherwise use MMap logic.
     381    kb->CreateCondBr(kb->CreateICmpEQ(kb->getScalarField("fileDescriptor"), kb->getInt32(STDIN_FILENO)), DoSegmentRead, DoSegmentMMap);
     382    kb->SetInsertPoint(DoSegmentRead);
     383    ReadSourceKernel::generateDoSegmentMethod(mCodeUnitWidth, mSegmentBlocks, kb);
     384    kb->CreateBr(DoSegmentDone);
     385    kb->SetInsertPoint(DoSegmentMMap);
     386    MMapSourceKernel::generateDoSegmentMethod(mCodeUnitWidth, mSegmentBlocks, kb);
     387    kb->CreateBr(DoSegmentDone);
     388    kb->SetInsertPoint(DoSegmentDone);
     389}
     390
    346391FDSourceKernel::FDSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned blocksPerSegment, unsigned codeUnitWidth)
    347392: SegmentOrientedKernel("FD_source" + std::to_string(blocksPerSegment) + "@" + std::to_string(codeUnitWidth)
     
    355400, mCodeUnitWidth(codeUnitWidth)
    356401, mFileSizeFunction(nullptr) {
    357    
    358 }
    359 
    360 void FDSourceKernel::generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & kb) {
    361     BasicBlock * finalizeRead = kb->CreateBasicBlock("finalizeRead");
    362     BasicBlock * finalizeMMap = kb->CreateBasicBlock("finalizeMMap");
    363     BasicBlock * finalizeDone = kb->CreateBasicBlock("finalizeDone");
    364     // if the fileDescriptor is 0, the file is stdin, use readSource kernel logic, otherwise use mmap logic.
    365     kb->CreateCondBr(kb->CreateICmpEQ(kb->getScalarField("fileDescriptor"), kb->getInt32(STDIN_FILENO)), finalizeRead, finalizeMMap);
    366     kb->SetInsertPoint(finalizeRead);
    367     reinterpret_cast<ReadSourceKernel *>(this)->ReadSourceKernel::generateFinalizeMethod(kb);
    368     kb->CreateBr(finalizeDone);
    369     kb->SetInsertPoint(finalizeMMap);
    370     reinterpret_cast<MMapSourceKernel *>(this)->MMapSourceKernel::generateFinalizeMethod(kb);
    371     kb->CreateBr(finalizeDone);
    372     kb->SetInsertPoint(finalizeDone);
    373 }
    374 
    375 void FDSourceKernel::generateInitializeMethod(const std::unique_ptr<KernelBuilder> & kb) {
    376     BasicBlock * initializeRead = kb->CreateBasicBlock("initializeRead");
    377     BasicBlock * initializeMMap = kb->CreateBasicBlock("initializeMMap");
    378     BasicBlock * initializeDone = kb->CreateBasicBlock("initializeDone");
    379     // if the fileDescriptor is 0, the file is stdin, use readSource kernel logic, otherwise use MMap logic.
    380     kb->CreateCondBr(kb->CreateICmpEQ(kb->getScalarField("fileDescriptor"), kb->getInt32(STDIN_FILENO)), initializeRead, initializeMMap);
    381     kb->SetInsertPoint(initializeRead);
    382     reinterpret_cast<ReadSourceKernel *>(this)->ReadSourceKernel::generateInitializeMethod(kb);
    383     kb->CreateBr(initializeDone);
    384     kb->SetInsertPoint(initializeMMap);
    385     reinterpret_cast<MMapSourceKernel *>(this)->MMapSourceKernel::generateInitializeMethod(kb);
    386     kb->CreateBr(initializeDone);
    387     kb->SetInsertPoint(initializeDone);
    388 }
    389 
    390 void FDSourceKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) {
    391     BasicBlock * DoSegmentRead = kb->CreateBasicBlock("DoSegmentRead");
    392     BasicBlock * DoSegmentMMap = kb->CreateBasicBlock("DoSegmentMMap");
    393     BasicBlock * DoSegmentDone = kb->CreateBasicBlock("DoSegmentDone");
    394     // if the fileDescriptor is 0, the file is stdin, use readSource kernel logic, otherwise use MMap logic.
    395     kb->CreateCondBr(kb->CreateICmpEQ(kb->getScalarField("fileDescriptor"), kb->getInt32(STDIN_FILENO)), DoSegmentRead, DoSegmentMMap);
    396     kb->SetInsertPoint(DoSegmentRead);
    397     reinterpret_cast<ReadSourceKernel *>(this)->ReadSourceKernel::generateDoSegmentMethod(kb);
    398     kb->CreateBr(DoSegmentDone);
    399     kb->SetInsertPoint(DoSegmentMMap);
    400     reinterpret_cast<MMapSourceKernel *>(this)->MMapSourceKernel::generateDoSegmentMethod(kb);
    401     kb->CreateBr(DoSegmentDone);
    402     kb->SetInsertPoint(DoSegmentDone);
    403 }
    404 
    405 
    406 void FDSourceKernel::linkExternalMethods(const std::unique_ptr<kernel::KernelBuilder> & kb) {
    407     mFileSizeFunction = kb->LinkFunction("file_size", &file_size);
    408 }
    409    
    410    
     402
     403}
     404
    411405/// MEMORY SOURCE KERNEL
    412406
  • icGREP/icgrep-devel/icgrep/kernels/source_kernel.h

    r5693 r5706  
    1212
    1313class MMapSourceKernel final : public SegmentOrientedKernel {
     14    friend class FDSourceKernel;
    1415public:
    1516    MMapSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned blocksPerSegment = 1, unsigned codeUnitWidth = 8);
    1617    bool isCachable() const override { return true; }
    1718    bool hasSignature() const override { return false; }
    18     void linkExternalMethods(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
    19     void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
    20     void generateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
    21     void generateFinalizeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
     19    void linkExternalMethods(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
     20        mFileSizeFunction = linkFileSizeMethod(iBuilder);
     21    }
     22    void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
     23        generateInitializeMethod(mFileSizeFunction, mCodeUnitWidth, iBuilder);
     24    }
     25    void generateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
     26        generateDoSegmentMethod(mCodeUnitWidth, mSegmentBlocks, iBuilder);
     27    }
     28    void generateFinalizeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
     29        unmapSourceBuffer(iBuilder);
     30    }
     31protected:
     32    static llvm::Function * linkFileSizeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     33    static void generateInitializeMethod(llvm::Function * fileSize, const unsigned codeUnitWidth, const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     34    static void generateDoSegmentMethod(const unsigned codeUnitWidth, const unsigned segmentBlocks, const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     35    static void unmapSourceBuffer(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    2236protected:
    2337    const unsigned          mSegmentBlocks;
     
    2741
    2842class ReadSourceKernel final : public SegmentOrientedKernel {
     43    friend class FDSourceKernel;
    2944public:
    3045    ReadSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned blocksPerSegment = 1, unsigned codeUnitWidth = 8);
    3146    bool isCachable() const override { return true; }
    3247    bool hasSignature() const override { return false; }
    33     void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
    34     void generateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
    35     void generateFinalizeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
     48    void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
     49        generateInitializeMethod(mCodeUnitWidth, iBuilder);
     50    }
     51    void generateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
     52        generateDoSegmentMethod(mCodeUnitWidth, mSegmentBlocks, iBuilder);
     53    }
     54    void generateFinalizeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
     55        freeBuffer(iBuilder);
     56    }
     57protected:
     58    static void generateInitializeMethod(const unsigned codeUnitWidth, const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     59    static void generateDoSegmentMethod(const unsigned codeUnitWidth, const unsigned segmentBlocks, const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     60    static void freeBuffer(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    3661private:
    3762    unsigned mSegmentBlocks;
  • icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.cpp

    r5606 r5706  
    1515namespace kernel {
    1616
    17 // Rather than using doBlock logic to write one block at a time, this custom
    18 // doSegment method attempts to write the entire segment with a single write call.
    19 // However, if the segment spans two memory areas (e.g., because of wraparound),
    20 // then two write calls are made.
    21 void StdOutKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder) {
    22     PointerType * i8PtrTy = iBuilder->getInt8PtrTy();
    23     Constant * itemBytes = iBuilder->getSize(mCodeUnitWidth / 8);
    24    
    25     Function::arg_iterator args = mCurrentMethod->arg_begin();
    26     /* self = */ args++;
    27     Value * itemsToDo = &*(args++);
    28     Value * codeUnitBuffer = &*(args++);
    29 
    30     Value * bytesToDo = mCodeUnitWidth == 8 ? itemsToDo : iBuilder->CreateMul(itemsToDo, itemBytes);
    31     Value * bytePtr = iBuilder->CreatePointerCast(codeUnitBuffer, i8PtrTy);
    32     iBuilder->CreateWriteCall(iBuilder->getInt32(1), bytePtr, bytesToDo);
     17void StdOutKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const /* numOfStrides */) {
     18    Value * codeUnitBuffer = iBuilder->getInputStreamBlockPtr("codeUnitBuffer", iBuilder->getInt32(0));
     19    codeUnitBuffer = iBuilder->CreatePointerCast(codeUnitBuffer, iBuilder->getInt8PtrTy());
     20    Value * bytesToDo = mAvailableItemCount[0];
     21    if (LLVM_UNLIKELY(mCodeUnitWidth > 8)) {
     22        bytesToDo = iBuilder->CreateMul(bytesToDo, iBuilder->getSize(mCodeUnitWidth / 8));
     23    } else if (LLVM_UNLIKELY(mCodeUnitWidth < 8)) {
     24        bytesToDo = iBuilder->CreateUDiv(bytesToDo, iBuilder->getSize(8 / mCodeUnitWidth));
     25    }
     26    iBuilder->CreateWriteCall(iBuilder->getInt32(1), codeUnitBuffer, bytesToDo);
    3327}
    3428
     
    3731, mCodeUnitWidth(codeUnitWidth) {
    3832    setNoTerminateAttribute(true);
     33    // setKernelStride(getpagesize());
    3934}
    4035
     
    6762}
    6863
    69 void FileSink::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder) {
    70     BasicBlock * closeFile = iBuilder->CreateBasicBlock("closeFile");
    71     BasicBlock * fileOutExit = iBuilder->CreateBasicBlock("fileOutExit");
    72    
    73     PointerType * i8PtrTy = iBuilder->getInt8PtrTy();
    74     Constant * itemBytes = iBuilder->getSize(mCodeUnitWidth / 8);
    75     Value * fileDes = iBuilder->getScalarField("fileDes");
     64void FileSink::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, Value * const /* numOfStrides */) {
     65    BasicBlock * const closeFile = iBuilder->CreateBasicBlock("closeFile");
     66    BasicBlock * const fileOutExit = iBuilder->CreateBasicBlock("fileOutExit");
    7667
    77     Function::arg_iterator args = mCurrentMethod->arg_begin();
    78     /* self = */ args++;
    79     Value * itemsToDo = &*(args++);
    80     Value * codeUnitBuffer = &*(args++);
    81    
    82     Value * bytesToDo = mCodeUnitWidth == 8 ? itemsToDo : iBuilder->CreateMul(itemsToDo, itemBytes);
    83     Value * bytePtr = iBuilder->CreatePointerCast(codeUnitBuffer, i8PtrTy);
    84    
    85     iBuilder->CreateWriteCall(fileDes, bytePtr, bytesToDo);
    86     iBuilder->CreateCondBr(iBuilder->CreateICmpULT(itemsToDo, iBuilder->getSize(getKernelStride())), closeFile, fileOutExit);
    87    
    88     iBuilder->SetInsertPoint(closeFile);
     68    Value * const fileDes = iBuilder->getScalarField("fileDes");
     69    Value * const codeUnitBuffer = iBuilder->CreatePointerCast(getStreamSetInputBufferPtr(0), iBuilder->getInt8PtrTy());
     70    Value * bytesToDo = mAvailableItemCount[0];
     71    if (LLVM_UNLIKELY(mCodeUnitWidth > 8)) {
     72        bytesToDo = iBuilder->CreateMul(bytesToDo, iBuilder->getSize(mCodeUnitWidth / 8));
     73    } else if (LLVM_UNLIKELY(mCodeUnitWidth < 8)) {
     74        bytesToDo = iBuilder->CreateUDiv(bytesToDo, iBuilder->getSize(8 / mCodeUnitWidth));
     75    }   
     76    iBuilder->CreateWriteCall(fileDes, codeUnitBuffer, bytesToDo);
     77    iBuilder->CreateUnlikelyCondBr(mIsFinal, closeFile, fileOutExit);
     78
     79    iBuilder->SetInsertPoint(closeFile);   
    8980    iBuilder->CreateCloseCall(fileDes);
    9081    Value * newFileNamePtr = iBuilder->getScalarField("fileName");
    9182    Value * tmpFileNamePtr = iBuilder->getScalarField("tmpFileName");
    9283    iBuilder->CreateRenameCall(tmpFileNamePtr, newFileNamePtr);
    93     iBuilder->CreateFree(tmpFileNamePtr);
    94    
     84    iBuilder->CreateFree(tmpFileNamePtr);   
    9585    iBuilder->CreateBr(fileOutExit);
    9686   
     
    9989
    10090FileSink::FileSink(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned codeUnitWidth)
    101 : MultiBlockKernel("filesink", {Binding{iBuilder->getStreamSetTy(1, codeUnitWidth), "codeUnitBuffer"}}, {},
    102                 {Binding{iBuilder->getInt8PtrTy(), "fileName"}}, {}, {Binding{iBuilder->getInt8PtrTy(), "tmpFileName"}, Binding{iBuilder->getInt32Ty(), "fileDes"}})
     91: MultiBlockKernel("filesink" + std::to_string(codeUnitWidth),
     92{Binding{iBuilder->getStreamSetTy(1, codeUnitWidth), "codeUnitBuffer"}},
     93{},
     94{Binding{iBuilder->getInt8PtrTy(), "fileName"}}, {}, {Binding{iBuilder->getInt8PtrTy(), "tmpFileName"}, Binding{iBuilder->getInt32Ty(), "fileDes"}})
    10395, mCodeUnitWidth(codeUnitWidth) {
     96    // setKernelStride(getpagesize());
    10497}
    10598
  • icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.h

    r5449 r5706  
    1616    StdOutKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned codeUnitWidth);
    1717private:
    18     void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder) override;
     18    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
    1919private:
    2020    const unsigned mCodeUnitWidth;
     
    2727protected:
    2828    void generateInitializeMethod(const std::unique_ptr<KernelBuilder> & iBuilder) override;
    29     void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder) override;
     29    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
    3030private:
    3131    const unsigned mCodeUnitWidth;
  • icGREP/icgrep-devel/icgrep/kernels/streamset.cpp

    r5650 r5706  
    2828
    2929void StreamSetBuffer::allocateBuffer(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
     30    assert (mBufferBlocks > 0);
    3031    if (LLVM_LIKELY(mStreamSetBufferPtr == nullptr)) {
    3132        Type * const ty = getType();
     
    4950}
    5051
    51 Value * StreamSetBuffer::getStreamBlockPtr(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * streamIndex, Value * blockIndex, const bool /* readOnly */) const {
     52Value * StreamSetBuffer::getStreamBlockPtr(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * addr, Value * streamIndex, const bool /* readOnly */) const {
    5253    if (codegen::EnableAsserts) {
    5354        Value * const count = getStreamSetCount(iBuilder, self);
     
    5657        iBuilder->CreateAssert(cond, "StreamSetBuffer: out-of-bounds stream access");
    5758    }
    58     return iBuilder->CreateGEP(getStreamSetBlockPtr(iBuilder, self, blockIndex), {iBuilder->getInt32(0), streamIndex});
    59 }
    60 
    61 Value * StreamSetBuffer::getStreamPackPtr(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * streamIndex, Value * blockIndex, Value * packIndex, const bool /* readOnly */) const {
     59    return iBuilder->CreateGEP(addr, {iBuilder->getInt32(0), streamIndex});
     60}
     61
     62Value * StreamSetBuffer::getStreamPackPtr(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * addr, Value * streamIndex, Value * packIndex, const bool /* readOnly */) const {
    6263    if (codegen::EnableAsserts) {
    6364        Value * const count = getStreamSetCount(iBuilder, self);
     
    6667        iBuilder->CreateAssert(cond, "StreamSetBuffer: out-of-bounds stream access");
    6768    }
    68     return iBuilder->CreateGEP(getStreamSetBlockPtr(iBuilder, self, blockIndex), {iBuilder->getInt32(0), streamIndex, packIndex});
    69 }
    70 
    71 void StreamSetBuffer::setBaseAddress(IDISA::IDISA_Builder * const iBuilder, Value * /* self */, Value * /* addr */) const {
     69    return iBuilder->CreateGEP(addr, {iBuilder->getInt32(0), streamIndex, packIndex});
     70}
     71
     72void StreamSetBuffer::setBaseAddress(IDISA::IDISA_Builder * const /* iBuilder */, Value * /* self */, Value * /* addr */) const {
    7273    report_fatal_error("setBaseAddress is not supported by this buffer type");
    7374}
    7475
    75 Value * StreamSetBuffer::getBufferedSize(IDISA::IDISA_Builder * const iBuilder, Value * /* self */) const {
     76Value * StreamSetBuffer::getBufferedSize(IDISA::IDISA_Builder * const /* iBuilder */, Value * /* self */) const {
    7677    report_fatal_error("getBufferedSize is not supported by this buffer type");
    7778}
    7879
    79 void StreamSetBuffer::setBufferedSize(IDISA::IDISA_Builder * const iBuilder, Value * /* self */, llvm::Value * /* size */) const {
     80void StreamSetBuffer::setBufferedSize(IDISA::IDISA_Builder * const /* iBuilder */, Value * /* self */, llvm::Value * /* size */) const {
    8081    report_fatal_error("setBufferedSize is not supported by this buffer type");
    8182}
    8283
    83 Value * StreamSetBuffer::getCapacity(IDISA::IDISA_Builder * const iBuilder, Value * /* self */) const {
    84     report_fatal_error("getCapacity is not supported by this buffer type");
    85 }
    86 
    87 void StreamSetBuffer::setCapacity(IDISA::IDISA_Builder * const iBuilder, Value * /* self */, llvm::Value * /* c */) const {
     84Value * StreamSetBuffer::getCapacity(IDISA::IDISA_Builder * const iBuilder, Value * self) const {
     85    return getBufferedSize(iBuilder, self);
     86}
     87
     88void StreamSetBuffer::setCapacity(IDISA::IDISA_Builder * const /* iBuilder */, Value * /* self */, llvm::Value * /* c */) const {
    8889    report_fatal_error("setCapacity is not supported by this buffer type");
    8990}
     
    126127 * The type of the pointer is i8* for fields of 8 bits or less, otherwise iN* for N-bit fields.
    127128 */
    128 Value * StreamSetBuffer::getRawItemPointer(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * streamIndex, Value * absolutePosition) const {
    129     Value * ptr = iBuilder->CreateGEP(getBaseAddress(iBuilder, self), {iBuilder->getInt32(0), streamIndex});
     129Value * StreamSetBuffer::getRawItemPointer(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * absolutePosition) const {
     130    Value * ptr = getBaseAddress(iBuilder, self);
    130131    Value * relativePosition = absolutePosition;
    131132    const auto bw = mBaseType->getArrayElementType()->getScalarSizeInBits();
     
    141142
    142143Value * StreamSetBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const b, Value * self, Value * fromPosition, Value * availItems, bool reverse) const {
    143     Constant * bufSize = b->getSize(mBufferBlocks * b->getStride());
     144    Constant * bufSize = ConstantInt::get(fromPosition->getType(), mBufferBlocks * b->getStride());
    144145    Value * itemsFromBase = b->CreateURem(fromPosition, bufSize);
    145146    if (reverse) {
    146147        Value * bufAvail = b->CreateSelect(b->CreateICmpEQ(itemsFromBase, b->getSize(0)), bufSize, itemsFromBase);
    147148        return b->CreateSelect(b->CreateICmpULT(bufAvail, availItems), bufAvail, availItems);
    148     }
    149     else {
     149    } else {
    150150        Value * linearSpace = b->CreateSub(bufSize, itemsFromBase, "linearSpace");
    151151        return b->CreateSelect(b->CreateICmpULT(availItems, linearSpace), availItems, linearSpace);
     
    154154
    155155Value * StreamSetBuffer::getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * fromPosition, bool reverse) const {
    156     Constant * bufSize = iBuilder->getSize(mBufferBlocks * iBuilder->getStride());
     156    Constant * bufSize = ConstantInt::get(fromPosition->getType(), mBufferBlocks * iBuilder->getStride());
    157157    Value * bufRem = iBuilder->CreateURem(fromPosition, bufSize);
    158158    if (reverse) {
    159159        return iBuilder->CreateSelect(iBuilder->CreateICmpEQ(bufRem, iBuilder->getSize(0)), bufSize, bufRem);
    160160    }
    161     else return iBuilder->CreateSub(bufSize, bufRem, "linearSpace");
     161    return iBuilder->CreateSub(bufSize, bufRem, "linearSpace");
    162162}
    163163
     
    179179}
    180180
     181Value * StreamSetBuffer::copy(IDISA::IDISA_Builder * const b, Value * self, Value * const target, Value * const source, Value * itemsToCopy, const unsigned alignment) const {
     182    Type * ty = getBaseType();
     183    if (LLVM_LIKELY(isa<ArrayType>(ty))) {
     184        ty = ty->getArrayElementType();
     185    }
     186    if (LLVM_LIKELY(isa<VectorType>(ty))) {
     187        ty = ty->getVectorElementType();
     188    }
     189    const auto itemWidth = ty->getScalarSizeInBits();
     190    assert (itemWidth > 0);
     191    Value * const m = b->CreateMul(getStreamSetCount(b, self), b->getSize(itemWidth / 8));
     192    Value * const bytesToCopy = b->CreateMul(itemsToCopy, m);
     193
     194    // TODO: lz4d s2p reads misaligned data into the source stream. The stream binding should indicate alignment.
     195    // alignment ? alignment : b->getBitBlockWidth() / 8
     196    b->CreateMemCpy(target, source, bytesToCopy, 1);
     197    return bytesToCopy;
     198}
     199
    181200void StreamSetBuffer::createBlockAlignedCopy(IDISA::IDISA_Builder * const iBuilder, Value * targetBlockPtr, Value * sourceBlockPtr, Value * itemsToCopy) const {
    182     Type * const int8PtrTy = iBuilder->getInt8PtrTy();
    183201    const unsigned alignment = iBuilder->getBitBlockWidth() / 8;
    184     Constant * const blockSize = iBuilder->getSize(iBuilder->getBitBlockWidth());
     202    Constant * const blockSize = ConstantInt::get(itemsToCopy->getType(), iBuilder->getBitBlockWidth());
    185203    size_t numStreams = 1;
    186204    if (isa<ArrayType>(mBaseType)) {
     
    191209        Value * copyBits = iBuilder->CreateMul(itemsToCopy, iBuilder->getSize(fieldWidth));
    192210        Value * copyBytes = iBuilder->CreateLShr(iBuilder->CreateAdd(copyBits, iBuilder->getSize(7)), iBuilder->getSize(3));
    193         iBuilder->CreateMemMove(iBuilder->CreateBitCast(targetBlockPtr, int8PtrTy), iBuilder->CreateBitCast(sourceBlockPtr, int8PtrTy), copyBytes, alignment);
     211        iBuilder->CreateMemMove(targetBlockPtr, sourceBlockPtr, copyBytes, alignment);
    194212    } else {
    195213        Value * blocksToCopy = iBuilder->CreateUDiv(itemsToCopy, blockSize);
     
    198216        Value * partialBlockSourcePtr = iBuilder->CreateGEP(sourceBlockPtr, blocksToCopy);
    199217        Value * blockCopyBytes = iBuilder->CreateMul(blocksToCopy, iBuilder->getSize(iBuilder->getBitBlockWidth() * numStreams * fieldWidth/8));
    200         iBuilder->CreateMemMove(iBuilder->CreateBitCast(targetBlockPtr, int8PtrTy), iBuilder->CreateBitCast(sourceBlockPtr, int8PtrTy), blockCopyBytes, alignment);
     218        iBuilder->CreateMemMove(targetBlockPtr, sourceBlockPtr, blockCopyBytes, alignment);
    201219        Value * partialCopyBitsPerStream = iBuilder->CreateMul(partialItems, iBuilder->getSize(fieldWidth));
    202220        Value * partialCopyBytesPerStream = iBuilder->CreateLShr(iBuilder->CreateAdd(partialCopyBitsPerStream, iBuilder->getSize(7)), iBuilder->getSize(3));
    203         for (unsigned strm = 0; strm < numStreams; strm++) {
    204             Value * strmTargetPtr = iBuilder->CreateGEP(partialBlockTargetPtr, {iBuilder->getInt32(0), iBuilder->getInt32(strm)});
    205             Value * strmSourcePtr = iBuilder->CreateGEP(partialBlockSourcePtr, {iBuilder->getInt32(0), iBuilder->getInt32(strm)});
    206             strmTargetPtr = iBuilder->CreateBitCast(strmTargetPtr, int8PtrTy);
    207             strmSourcePtr = iBuilder->CreateBitCast(strmSourcePtr, int8PtrTy);
     221        for (unsigned i = 0; i < numStreams; i++) {
     222            Value * strmTargetPtr = iBuilder->CreateGEP(partialBlockTargetPtr, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
     223            Value * strmSourcePtr = iBuilder->CreateGEP(partialBlockSourcePtr, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
    208224            iBuilder->CreateMemMove(strmTargetPtr, strmSourcePtr, partialCopyBytesPerStream, alignment);
    209225        }
     
    211227}
    212228
    213 void StreamSetBuffer::genCopyBackLogic(IDISA::IDISA_Builder * const b, Value * handle, Value * priorProduced, Value * newProduced, const std::string Name) {
     229void StreamSetBuffer::genCopyBackLogic(IDISA::IDISA_Builder * const b, Value * handle, Value * priorProduced, Value * newProduced, const std::string Name) const {
    214230    report_fatal_error("Copy back not supported for this buffer type:" + Name);
    215231}
     
    244260void SourceBuffer::setBaseAddress(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * addr) const {
    245261    Value * const ptr = iBuilder->CreateGEP(self, {iBuilder->getInt32(0), iBuilder->getInt32(int(SourceBuffer::Field::BaseAddress))});
    246 
    247262    iBuilder->CreateStore(iBuilder->CreatePointerCast(addr, ptr->getType()->getPointerElementType()), ptr);
    248263}
     
    256271}
    257272
    258 Value * SourceBuffer::getStreamSetBlockPtr(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * blockIndex) const {
    259     return iBuilder->CreateGEP(getBaseAddress(iBuilder, self), blockIndex);
     273Value * SourceBuffer::getBlockAddress(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * blockIndex) const {
     274    return iBuilder->CreateGEP(getBaseAddress(iBuilder, self), blockIndex );
    260275}
    261276
    262277Value * SourceBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * fromPosition, Value * availItems, bool reverse) const {
    263278    if (reverse) report_fatal_error("SourceBuffer cannot be accessed in reverse");
    264     Value * maxAvail = iBuilder->CreateSub(getCapacity(iBuilder, self), fromPosition);
     279    Value * maxAvail = iBuilder->CreateSub(getBufferedSize(iBuilder, self), fromPosition);
    265280    return iBuilder->CreateSelect(iBuilder->CreateICmpULT(availItems, maxAvail), availItems, maxAvail);
    266281}
     
    293308}
    294309
    295 Value * ExternalBuffer::getStreamSetBlockPtr(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * blockIndex) const {
     310Value * ExternalBuffer::getBlockAddress(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * blockIndex) const {
    296311    return iBuilder->CreateGEP(getBaseAddress(iBuilder, self), blockIndex);
    297312}
    298313
    299 // All available items can be accessed.
    300 Value * ExternalBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const, Value *, Value *, Value * availItems, bool) const {
    301     return availItems;
     314Value * ExternalBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const, Value *, Value *, Value * availItems, const bool reverse) const {
     315    // All available items can be accessed.
     316    return reverse ? ConstantInt::getAllOnesValue(availItems->getType()) : availItems;
     317}
     318
     319Value * ExternalBuffer::getLinearlyWritableItems(IDISA::IDISA_Builder * const, Value *, Value * fromPosition, const bool reverse) const {
     320    // Trust that the buffer is large enough to write any amount
     321    return reverse ? fromPosition : ConstantInt::getAllOnesValue(fromPosition->getType());
    302322}
    303323
    304324// Circular Buffer
    305 Value * CircularBuffer::getStreamSetBlockPtr(IDISA::IDISA_Builder * const iBuilder, Value * const self, Value * const blockIndex) const {
     325Value * CircularBuffer::getBlockAddress(IDISA::IDISA_Builder * const iBuilder, Value * const self, Value * const blockIndex) const {
    306326    return iBuilder->CreateGEP(getBaseAddress(iBuilder, self), modByBufferBlocks(iBuilder, blockIndex));
    307327}
    308328
    309 Value * CircularBuffer::getRawItemPointer(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * streamIndex, Value * absolutePosition) const {
    310     Value * ptr = iBuilder->CreateGEP(getBaseAddress(iBuilder, self), {iBuilder->getInt32(0), streamIndex});
     329Value * CircularBuffer::getRawItemPointer(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * absolutePosition) const {
     330    Value * ptr = getBaseAddress(iBuilder, self);
    311331    Value * relativePosition = iBuilder->CreateURem(absolutePosition, ConstantInt::get(absolutePosition->getType(), mBufferBlocks * iBuilder->getBitBlockWidth()));
    312332    const auto bw = mBaseType->getArrayElementType()->getScalarSizeInBits();
     
    335355}
    336356
    337 void CircularCopybackBuffer::genCopyBackLogic(IDISA::IDISA_Builder * const b, Value * handle, Value * priorProduced, Value * newProduced, const std::string Name) {
    338     Constant * bufSize = b->getSize(mBufferBlocks * b->getBitBlockWidth());
     357void CircularCopybackBuffer::genCopyBackLogic(IDISA::IDISA_Builder * const b, Value * handle, Value * priorProduced, Value * newProduced, const std::string Name) const {
     358    assert (priorProduced->getType() == newProduced->getType());
     359    Constant * bufSize = ConstantInt::get(priorProduced->getType(), mBufferBlocks * b->getBitBlockWidth());
    339360    Value * priorBufPos = b->CreateURem(priorProduced, bufSize);
    340361    Value * newBufPos = b->CreateURem(newProduced, bufSize);
    341     BasicBlock * copyBack = b->CreateBasicBlock(Name + "_copyBack");
    342     BasicBlock * done = b->CreateBasicBlock(Name + "_copyBackDone");
     362    BasicBlock * copyBack = b->CreateBasicBlock(Name + "_circularCopyBack");
     363    BasicBlock * done = b->CreateBasicBlock(Name + "_circularCopyBackDone");
    343364    Value * wraparound = b->CreateICmpUGT(priorBufPos, newBufPos);
    344365    b->CreateCondBr(wraparound, copyBack, done);
     366
    345367    b->SetInsertPoint(copyBack);
    346     Value * overFlowAreaPtr = b->CreateGEP(handle, b->getSize(mBufferBlocks));
     368    Value * overFlowAreaPtr = b->CreateGEP(handle, b->getInt32(mBufferBlocks));
    347369    createBlockAlignedCopy(b, handle, overFlowAreaPtr, newBufPos);
    348370    b->CreateBr(done);
     371
    349372    b->SetInsertPoint(done);
    350373}
     
    365388    IntegerType * const intAddrTy = iBuilder->getIntPtrTy(DL);
    366389
    367     Constant * blockSize = iBuilder->getSize(iBuilder->getBitBlockWidth());
     390    Constant * blockSize = ConstantInt::get(itemsToCopy->getType(), iBuilder->getBitBlockWidth());
    368391    Function * f = iBuilder->GetInsertBlock()->getParent();
    369392    BasicBlock * wholeBlockCopy = BasicBlock::Create(iBuilder->getContext(), "wholeBlockCopy", f, 0);
     
    398421}
    399422
    400 Value * SwizzledCopybackBuffer::getStreamSetBlockPtr(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * blockIndex) const {
     423Value * SwizzledCopybackBuffer::getBlockAddress(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * blockIndex) const {
    401424    return iBuilder->CreateGEP(getBaseAddress(iBuilder, self), modByBufferBlocks(iBuilder, blockIndex));
    402425}
     
    408431}
    409432
    410 void SwizzledCopybackBuffer::genCopyBackLogic(IDISA::IDISA_Builder * const b, Value * handle, Value * priorProduced, Value * newProduced, const std::string Name) {
    411     Constant * bufSize = b->getSize(mBufferBlocks * b->getBitBlockWidth());
     433void SwizzledCopybackBuffer::genCopyBackLogic(IDISA::IDISA_Builder * const b, Value * handle, Value * priorProduced, Value * newProduced, const std::string Name) const {
     434    assert (priorProduced->getType() == newProduced->getType());
     435    Constant * bufSize = ConstantInt::get(priorProduced->getType(), mBufferBlocks * b->getBitBlockWidth());
    412436    Value * priorBufPos = b->CreateURem(priorProduced, bufSize);
    413437    Value * newBufPos = b->CreateURem(newProduced, bufSize);
    414     BasicBlock * copyBack = b->CreateBasicBlock(Name + "_copyBack");
    415     BasicBlock * done = b->CreateBasicBlock(Name + "_copyBackDone");
     438    BasicBlock * copyBack = b->CreateBasicBlock(Name + "_swizzledCopyBack");
     439    BasicBlock * done = b->CreateBasicBlock(Name + "_swizzledCopyBackDone");
    416440    Value * wraparound = b->CreateICmpUGT(priorBufPos, newBufPos);
    417441    b->CreateCondBr(wraparound, copyBack, done);
     
    547571
    548572Value * ExpandableBuffer::getStreamBlockPtr(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * streamIndex, Value * blockIndex, const bool readOnly) const {
    549     Value * ptr, * offset;
    550     std::tie(ptr, offset) = getInternalStreamBuffer(iBuilder, self, streamIndex, blockIndex, readOnly);
    551     return iBuilder->CreateGEP(ptr, offset);
     573    report_fatal_error("temporarily not supported");
     574//    Value * ptr, * offset;
     575//    std::tie(ptr, offset) = getInternalStreamBuffer(iBuilder, self, streamIndex, blockIndex, readOnly);
     576//    return iBuilder->CreateGEP(ptr, offset);
    552577}
    553578
    554579Value * ExpandableBuffer::getStreamPackPtr(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * streamIndex, Value * blockIndex, Value * packIndex, const bool readOnly) const {
    555     Value * ptr, * offset;
    556     std::tie(ptr, offset) = getInternalStreamBuffer(iBuilder, self, streamIndex, blockIndex, readOnly);
    557     return iBuilder->CreateGEP(ptr, {offset, packIndex});
     580    report_fatal_error("temporarily not supported");
     581//    Value * ptr, * offset;
     582//    std::tie(ptr, offset) = getInternalStreamBuffer(iBuilder, self, streamIndex, blockIndex, readOnly);
     583//    return iBuilder->CreateGEP(ptr, {offset, packIndex});
    558584}
    559585
     
    573599}
    574600
    575 Value * ExpandableBuffer::getStreamSetBlockPtr(IDISA::IDISA_Builder * const iBuilder, Value *, Value *) const {
    576     report_fatal_error("Expandable buffers: getStreamSetBlockPtr is not supported.");
     601Value * ExpandableBuffer::getBlockAddress(IDISA::IDISA_Builder * const iBuilder, Value *, Value *) const {
     602    report_fatal_error("Expandable buffers: getBlockAddress is not supported.");
    577603}
    578604
     
    581607}
    582608
    583 SourceBuffer::SourceBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, unsigned MemoryAddressSpace, unsigned StructAddressSpace)
    584 : StreamSetBuffer(BufferKind::SourceBuffer, type, StructType::get(resolveStreamSetType(b, type)->getPointerTo(MemoryAddressSpace), b->getSizeTy(), b->getSizeTy(), nullptr), 0, StructAddressSpace) {
    585     mUniqueID = "B";
    586     if (MemoryAddressSpace != 0 || StructAddressSpace != 0) {
    587         mUniqueID += "@" + std::to_string(MemoryAddressSpace) + ":" + std::to_string(StructAddressSpace);
    588     }
    589 }
    590 
    591 ExternalBuffer::ExternalBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, llvm::Value * addr, unsigned AddressSpace)
    592 : StreamSetBuffer(BufferKind::ExternalBuffer, type, resolveStreamSetType(b, type), 0, AddressSpace) {
    593     mUniqueID = "E";
    594     if (AddressSpace > 0) mUniqueID += "@" + std::to_string(AddressSpace);
    595     mStreamSetBufferPtr = b->CreatePointerBitCastOrAddrSpaceCast(addr, getPointerType());
    596 }
    597 
    598 CircularBuffer::CircularBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, size_t bufferBlocks, unsigned AddressSpace)
    599 : StreamSetBuffer(BufferKind::CircularBuffer, type, resolveStreamSetType(b, type), bufferBlocks, AddressSpace) {
    600     mUniqueID = "C" + std::to_string(bufferBlocks);
    601     if (AddressSpace > 0) mUniqueID += "@" + std::to_string(AddressSpace);
    602 }
    603 
    604 CircularBuffer::CircularBuffer(const BufferKind k, const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, size_t bufferBlocks, unsigned AddressSpace)
    605 : StreamSetBuffer(k, type, resolveStreamSetType(b, type), bufferBlocks, AddressSpace) {
    606 
    607 }
    608 
    609 CircularCopybackBuffer::CircularCopybackBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, size_t bufferBlocks, size_t overflowBlocks, unsigned AddressSpace)
    610 : CircularBuffer(BufferKind::CircularCopybackBuffer, b, type, bufferBlocks, AddressSpace)
    611 , mOverflowBlocks(overflowBlocks) {
    612     if (bufferBlocks < 2 * overflowBlocks) {
    613         report_fatal_error("CircularCopybackBuffer: bufferBlocks < 2 * overflowBlocks");
    614     }
    615     mUniqueID = "CC" + std::to_string(bufferBlocks);
    616     if (mOverflowBlocks != 1) mUniqueID += "_" + std::to_string(mOverflowBlocks);
    617     if (AddressSpace > 0) mUniqueID += "@" + std::to_string(AddressSpace);
    618 }
    619 
    620 ExpandableBuffer::ExpandableBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, size_t bufferBlocks, unsigned AddressSpace)
    621 : StreamSetBuffer(BufferKind::ExpandableBuffer, type, resolveExpandableStreamSetType(b, type), bufferBlocks, AddressSpace)
    622 , mInitialCapacity(type->getArrayNumElements()) {
    623     mUniqueID = "XP" + std::to_string(bufferBlocks);
    624     if (AddressSpace > 0) mUniqueID += "@" + std::to_string(AddressSpace);
    625 }
    626 
    627 SwizzledCopybackBuffer::SwizzledCopybackBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, size_t bufferBlocks, size_t overflowBlocks, unsigned fieldwidth, unsigned AddressSpace)
    628 : StreamSetBuffer(BufferKind::SwizzledCopybackBuffer, type, resolveStreamSetType(b, type), bufferBlocks, AddressSpace), mOverflowBlocks(overflowBlocks), mFieldWidth(fieldwidth) {
    629     mUniqueID = "SW" + std::to_string(fieldwidth) + ":" + std::to_string(bufferBlocks);
    630     if (bufferBlocks < 2 * overflowBlocks) {
    631         report_fatal_error("SwizzledCopybackBuffer: bufferBlocks < 2 * overflowBlocks");
    632     }
    633     if (mOverflowBlocks != 1) {
    634         mUniqueID += "_" + std::to_string(mOverflowBlocks);
    635     }
    636     if (AddressSpace > 0) {
    637         mUniqueID += "@" + std::to_string(AddressSpace);
    638     }
    639 }
    640609
    641610Value * DynamicBuffer::getBaseAddress(IDISA::IDISA_Builder * const b, Value * const handle) const {
     
    647616}
    648617
    649 Value * DynamicBuffer::getStreamSetBlockPtr(IDISA::IDISA_Builder * const b, Value * handle, Value * blockIndex) const {
     618Value * DynamicBuffer::getBlockAddress(IDISA::IDISA_Builder * const b, Value * handle, Value * blockIndex) const {
    650619    Value * const wkgBlocks = b->CreateLoad(b->CreateGEP(handle, {b->getInt32(0), b->getInt32(int(DynamicBuffer::Field::WorkingBlocks))}));
     620    assert (blockIndex->getType() == wkgBlocks->getType());
    651621    return b->CreateGEP(getBaseAddress(b, handle), b->CreateURem(blockIndex, wkgBlocks));
    652622}
    653623
    654 Value * DynamicBuffer::getRawItemPointer(IDISA::IDISA_Builder * const b, Value * handle, Value * streamIndex, Value * absolutePosition) const {
    655     Value * absBlock = b->CreateUDiv(absolutePosition, b->getSize(b->getBitBlockWidth()));
    656     Value * blockPos = b->CreateURem(absolutePosition, b->getSize(b->getBitBlockWidth()));
    657     Value * blockPtr = b->CreateGEP(getStreamSetBlockPtr(b, handle, absBlock), {b->getInt32(0), streamIndex});
     624Value * DynamicBuffer::getRawItemPointer(IDISA::IDISA_Builder * const b, Value * handle, Value * absolutePosition) const {
     625    Constant * blockSize = ConstantInt::get(absolutePosition->getType(), b->getBitBlockWidth());
     626    Value * absBlock = b->CreateUDiv(absolutePosition, blockSize);
     627    Value * blockPos = b->CreateURem(absolutePosition, blockSize);
     628    Value * blockPtr = getBlockAddress(b, handle, absBlock);
    658629    const auto bw = mBaseType->getArrayElementType()->getScalarSizeInBits();
    659630    if (bw < 8) {
     
    669640
    670641Value * DynamicBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const b, Value * handle, Value * fromPosition, Value * availItems, bool reverse) const {
    671     Constant * blockSize = b->getSize(b->getBitBlockWidth());
    672642    Value * const bufBlocks = b->CreateLoad(b->CreateGEP(handle, {b->getInt32(0), b->getInt32(int(Field::WorkingBlocks))}));
     643    Constant * blockSize = ConstantInt::get(bufBlocks->getType(), b->getBitBlockWidth());
    673644    Value * bufSize = b->CreateMul(bufBlocks, blockSize);
     645    assert (bufSize->getType() == fromPosition->getType());
    674646    Value * itemsFromBase = b->CreateURem(fromPosition, bufSize);
    675647    if (reverse) {
    676648        Value * bufAvail = b->CreateSelect(b->CreateICmpEQ(itemsFromBase, b->getSize(0)), bufSize, itemsFromBase);
    677649        return b->CreateSelect(b->CreateICmpULT(bufAvail, availItems), bufAvail, availItems);
    678     }
    679     else {
     650    } else {
    680651        Value * linearSpace = b->CreateSub(bufSize, itemsFromBase, "linearSpace");
    681652        return b->CreateSelect(b->CreateICmpULT(availItems, linearSpace), availItems, linearSpace);
     
    683654}
    684655
    685 Value * DynamicBuffer::getLinearlyWritableItems(IDISA::IDISA_Builder * const b, Value * handle, Value * fromPosition, bool reverse) const {
    686     Constant * blockSize = b->getSize(b->getBitBlockWidth());
     656Value * DynamicBuffer::getLinearlyWritableItems(IDISA::IDISA_Builder * const b, Value * handle, Value * fromPosition, bool reverse) const {   
    687657    Value * bufBlocks = b->CreateLoad(b->CreateGEP(handle, {b->getInt32(0), b->getInt32(int(Field::WorkingBlocks))}));
     658    Constant * blockSize = ConstantInt::get(bufBlocks->getType(), b->getBitBlockWidth());
    688659    Value * bufSize = b->CreateMul(bufBlocks, blockSize);
     660    assert (bufSize->getType() == fromPosition->getType());
    689661    Value * bufRem = b->CreateURem(fromPosition, bufSize);
    690662    if (reverse) {
    691663        return b->CreateSelect(b->CreateICmpEQ(bufRem, b->getSize(0)), bufSize, bufRem);
    692664    }
    693     bufSize = b->CreateMul(b->CreateAdd(bufBlocks, b->getSize(mOverflowBlocks)), blockSize);
     665    Constant * overflow = ConstantInt::get(bufBlocks->getType(), mOverflowBlocks);
     666    bufSize = b->CreateMul(b->CreateAdd(bufBlocks, overflow), blockSize);
    694667    return b->CreateSub(bufSize, bufRem, "linearWritable");
    695668}
     
    700673}
    701674
    702 void DynamicBuffer::genCopyBackLogic(IDISA::IDISA_Builder * const b, Value * handle, Value * priorProducedCount, Value * newProducedCount, const std::string Name) {
     675void DynamicBuffer::genCopyBackLogic(IDISA::IDISA_Builder * const b, Value * handle, Value * priorProducedCount, Value * newProducedCount, const std::string Name) const {
     676    assert (priorProducedCount->getType() == newProducedCount->getType());   
    703677    Value * workingBlocks = b->CreateLoad(b->CreateGEP(handle, {b->getInt32(0), b->getInt32(int(DynamicBuffer::Field::WorkingBlocks))}));
    704     Value * bufSize = b->CreateMul(workingBlocks, b->getSize(b->getBitBlockWidth()));
     678    assert (workingBlocks->getType() == newProducedCount->getType());
     679    Value * bufSize = b->CreateMul(workingBlocks, ConstantInt::get(workingBlocks->getType(), b->getBitBlockWidth()));
    705680    Value * priorBufPos = b->CreateURem(priorProducedCount, bufSize);
    706681    Value * newBufPos = b->CreateURem(newProducedCount, bufSize);
    707     BasicBlock * copyBack = b->CreateBasicBlock(Name + "_copyBack");
    708     BasicBlock * done = b->CreateBasicBlock(Name + "_copyBackDone");
     682    BasicBlock * copyBack = b->CreateBasicBlock(Name + "_dynamicCopyBack");
     683    BasicBlock * done = b->CreateBasicBlock(Name + "_dynamicCopyBackDone");
     684
    709685    Value * wraparound = b->CreateICmpUGT(priorBufPos, newBufPos);
    710686    b->CreateCondBr(wraparound, copyBack, done);
     687
    711688    b->SetInsertPoint(copyBack);
    712689    Value * bufBasePtrField = b->CreateGEP(handle, {b->getInt32(0), b->getInt32(int(DynamicBuffer::Field::BaseAddress))});
     
    715692    createBlockAlignedCopy(b, bufBasePtr, overFlowAreaPtr, newBufPos);
    716693    b->CreateBr(done);
     694
    717695    b->SetInsertPoint(done);
    718696}
     
    782760    Value * workingBlocksField = b->CreateGEP(handle, {b->getInt32(0), b->getInt32(int(DynamicBuffer::Field::WorkingBlocks))});
    783761    Value * capacityField = b->CreateGEP(handle, {b->getInt32(0), b->getInt32(int(DynamicBuffer::Field::AllocatedCapacity))});
    784    
     762
    785763    Value * oldBufPtr = b->CreateLoad(bufBasePtrField);
    786764    Value * currentWorkingBlocks = b->CreateLoad(workingBlocksField);
     
    833811}
    834812
     813SourceBuffer::SourceBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, unsigned MemoryAddressSpace, unsigned StructAddressSpace)
     814: StreamSetBuffer(BufferKind::SourceBuffer, type, StructType::get(resolveStreamSetType(b, type)->getPointerTo(MemoryAddressSpace), b->getSizeTy(), b->getSizeTy(), nullptr), 0, StructAddressSpace) {
     815    mUniqueID = "B";
     816    if (MemoryAddressSpace != 0 || StructAddressSpace != 0) {
     817        mUniqueID += "@" + std::to_string(MemoryAddressSpace) + ":" + std::to_string(StructAddressSpace);
     818    }
     819}
     820
     821ExternalBuffer::ExternalBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, llvm::Value * addr, unsigned AddressSpace)
     822: StreamSetBuffer(BufferKind::ExternalBuffer, type, resolveStreamSetType(b, type), 0, AddressSpace) {
     823    mUniqueID = "E";
     824    if (AddressSpace > 0) mUniqueID += "@" + std::to_string(AddressSpace);
     825    mStreamSetBufferPtr = b->CreatePointerBitCastOrAddrSpaceCast(addr, getPointerType());
     826}
     827
     828CircularBuffer::CircularBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, size_t bufferBlocks, unsigned AddressSpace)
     829: StreamSetBuffer(BufferKind::CircularBuffer, type, resolveStreamSetType(b, type), bufferBlocks, AddressSpace) {
     830    mUniqueID = "C" + std::to_string(bufferBlocks);
     831    if (AddressSpace > 0) mUniqueID += "@" + std::to_string(AddressSpace);
     832}
     833
     834CircularBuffer::CircularBuffer(const BufferKind k, const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, size_t bufferBlocks, unsigned AddressSpace)
     835: StreamSetBuffer(k, type, resolveStreamSetType(b, type), bufferBlocks, AddressSpace) {
     836
     837}
     838
     839CircularCopybackBuffer::CircularCopybackBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, size_t bufferBlocks, size_t overflowBlocks, unsigned AddressSpace)
     840: CircularBuffer(BufferKind::CircularCopybackBuffer, b, type, bufferBlocks, AddressSpace)
     841, mOverflowBlocks(overflowBlocks) {
     842    if (bufferBlocks < 2 * overflowBlocks) {
     843        report_fatal_error("CircularCopybackBuffer: bufferBlocks < 2 * overflowBlocks");
     844    }
     845    mUniqueID = "CC" + std::to_string(bufferBlocks);
     846    if (mOverflowBlocks != 1) mUniqueID += "_" + std::to_string(mOverflowBlocks);
     847    if (AddressSpace > 0) mUniqueID += "@" + std::to_string(AddressSpace);
     848}
     849
     850ExpandableBuffer::ExpandableBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, size_t bufferBlocks, unsigned AddressSpace)
     851: StreamSetBuffer(BufferKind::ExpandableBuffer, type, resolveExpandableStreamSetType(b, type), bufferBlocks, AddressSpace)
     852, mInitialCapacity(type->getArrayNumElements()) {
     853    mUniqueID = "XP" + std::to_string(bufferBlocks);
     854    if (AddressSpace > 0) mUniqueID += "@" + std::to_string(AddressSpace);
     855}
     856
     // Constructs a swizzled copy-back buffer of `bufferBlocks` blocks, recording
     // an overflow size of `overflowBlocks` blocks and the given `fieldwidth`.
     //
     // The StreamSetBuffer base receives both the original `type` and the type
     // produced by resolveStreamSetType(b, type). A fatal error is reported when
     // bufferBlocks < 2 * overflowBlocks.
     //
     // mUniqueID is built as "SW<fieldwidth>:<bufferBlocks>", followed by
     // "_<overflowBlocks>" when the overflow size is not 1 and by "@<AddressSpace>"
     // when the address space is non-zero.
     857SwizzledCopybackBuffer::SwizzledCopybackBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, Type * type, size_t bufferBlocks, size_t overflowBlocks, unsigned fieldwidth, unsigned AddressSpace)
     858: StreamSetBuffer(BufferKind::SwizzledCopybackBuffer, type, resolveStreamSetType(b, type), bufferBlocks, AddressSpace), mOverflowBlocks(overflowBlocks), mFieldWidth(fieldwidth) {
     859    mUniqueID = "SW" + std::to_string(fieldwidth) + ":" + std::to_string(bufferBlocks);
            // Reject configurations whose overflow area exceeds half of the buffer.
     860    if (bufferBlocks < 2 * overflowBlocks) {
     861        report_fatal_error("SwizzledCopybackBuffer: bufferBlocks < 2 * overflowBlocks");
     862    }
            // Default values (overflow of 1, address space 0) add no suffix to the ID.
     863    if (mOverflowBlocks != 1) {
     864        mUniqueID += "_" + std::to_string(mOverflowBlocks);
     865    }
     866    if (AddressSpace > 0) {
     867        mUniqueID += "@" + std::to_string(AddressSpace);
     868    }
     869}
     870
    835871inline StructType * getDynamicBufferStructType(const std::unique_ptr<kernel::KernelBuilder> & b, Type * baseType, const unsigned addrSpace) {
    836872    IntegerType * sizeTy = b->getSizeTy();
     
    869905, mBaseType(baseType)
    870906, mProducer(nullptr) {
    871 
     907    assert(k == BufferKind::SourceBuffer || k == BufferKind::ExternalBuffer || BufferBlocks);
    872908}
    873909
  • icGREP/icgrep-devel/icgrep/kernels/streamset.h

    r5650 r5706  
    6464    virtual void releaseBuffer(const std::unique_ptr<kernel::KernelBuilder> & kb) const;
    6565
    66     virtual llvm::Value * getStreamBlockPtr(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * streamIndex, llvm::Value * blockIndex, const bool readOnly) const;
    67 
    68     virtual llvm::Value * getStreamPackPtr(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * streamIndex, llvm::Value * blockIndex, llvm::Value * packIndex, const bool readOnly) const;
     66    virtual llvm::Value * getStreamBlockPtr(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * addr, llvm::Value * streamIndex, const bool readOnly) const;
     67
     68    virtual llvm::Value * getStreamPackPtr(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * addr, llvm::Value * streamIndex, llvm::Value * packIndex, const bool readOnly) const;
    6969   
    7070    virtual llvm::Value * getStreamSetCount(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self) const;
    7171
    72     virtual llvm::Value * getRawItemPointer(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * streamIndex, llvm::Value * absolutePosition) const;
     72    virtual llvm::Value * getRawItemPointer(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * absolutePosition) const;
    7373
    7474    virtual void setBaseAddress(IDISA::IDISA_Builder * const iBuilder, llvm::Value * addr, llvm::Value *) const;
     
    8585    virtual llvm::Value * getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromPos, llvm::Value * avail, bool reverse = false) const;
    8686   
     87    virtual llvm::Value * copy(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * target, llvm::Value * source, llvm::Value * itemsToCopy, const unsigned alignment = 0) const;
     88
    8789    void createBlockCopy(IDISA::IDISA_Builder * const iBuilder, llvm::Value * targetBlockPtr, llvm::Value * sourceBlockPtr, llvm::Value * blocksToCopy) const;
    8890
     
    9496        return false;  // Overridden to return true by buffer types that support copyback.
    9597    }
    96     virtual void genCopyBackLogic(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * priorProduced, llvm::Value * newProduced, const std::string);
     98    virtual void genCopyBackLogic(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * priorProduced, llvm::Value * newProduced, const std::string) const;
    9799   
    98100    virtual ~StreamSetBuffer() = 0;
     
    111113
    112114    // Get the buffer pointer for a given block of the stream set.
    113     virtual llvm::Value * getStreamSetBlockPtr(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * blockNo) const = 0;
     115    virtual llvm::Value * getBlockAddress(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * blockNo) const = 0;
    114116
    115117    bool isCapacityGuaranteed(const llvm::Value * const index, const size_t capacity) const;
     
    159161    llvm::Value * getCapacity(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self) const override;
    160162   
     163    llvm::Value * getBlockAddress(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * blockNo) const override;
     164
    161165    llvm::Value * getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromPosition, llvm::Value * avail, bool reverse = false) const override;
    162166
     
    174178
    175179    llvm::Value * getBaseAddress(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self) const override;
    176 
    177     llvm::Value * getStreamSetBlockPtr(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * blockNo) const override;
    178180
    179181};
     
    189191    llvm::Value * getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromPosition, llvm::Value * avail, bool reverse = false) const override;
    190192   
     193    llvm::Value * getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromPosition, bool reverse = false) const override;
     194
    191195    void allocateBuffer(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
    192196
     
    194198
    195199protected:
    196     llvm::Value * getStreamSetBlockPtr(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * blockNo) const override;
     200    llvm::Value * getBlockAddress(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * blockNo) const override;
    197201};
    198202
     
    205209    CircularBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, llvm::Type * type, size_t bufferBlocks, unsigned AddressSpace = 0);
    206210
    207     llvm::Value * getRawItemPointer(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * streamIndex, llvm::Value * absolutePosition) const final;
     211    llvm::Value * getRawItemPointer(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * absolutePosition) const final;
    208212
    209213protected:
     
    211215    CircularBuffer(const BufferKind kind, const std::unique_ptr<kernel::KernelBuilder> & b, llvm::Type * type, size_t bufferBlocks, unsigned AddressSpace = 0);
    212216
    213     llvm::Value * getStreamSetBlockPtr(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * blockIndex) const final;
     217    llvm::Value * getBlockAddress(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * blockIndex) const final;
    214218};
    215219   
     
    236240    }
    237241   
    238     void genCopyBackLogic(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * priorProduced, llvm::Value * newProduced, const std::string) override;
     242    void genCopyBackLogic(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * priorProduced, llvm::Value * newProduced, const std::string) const override;
    239243   
    240244   
     
    260264    }
    261265   
    262     void genCopyBackLogic(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * priorProduced, llvm::Value * newProduced, const std::string) override;
    263 
    264 protected:
    265     llvm::Value * getStreamSetBlockPtr(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * blockIndex) const override;
     266    void genCopyBackLogic(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * priorProduced, llvm::Value * newProduced, const std::string) const override;
     267
     268protected:
     269    llvm::Value * getBlockAddress(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * blockIndex) const override;
    266270private:
    267271    size_t mOverflowBlocks;
     
    295299    llvm::Value * getBaseAddress(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self) const override;
    296300
    297     llvm::Value * getStreamSetBlockPtr(IDISA::IDISA_Builder * const iBuilder, llvm::Value * blockIndex, llvm::Value *) const override;
     301    llvm::Value * getBlockAddress(IDISA::IDISA_Builder * const iBuilder, llvm::Value * blockIndex, llvm::Value *) const override;
    298302
    299303private:
     
    322326    void releaseBuffer(const std::unique_ptr<kernel::KernelBuilder> & kb) const override;
    323327
    324     llvm::Value * getRawItemPointer(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * streamIndex, llvm::Value * absolutePosition) const override;
     328    llvm::Value * getRawItemPointer(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * absolutePosition) const override;
    325329   
    326330    llvm::Value * getBufferedSize(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self) const override;
     
    332336    }
    333337       
    334     void genCopyBackLogic(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * priorProduced, llvm::Value * newProduced, const std::string) override;
     338    void genCopyBackLogic(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * priorProduced, llvm::Value * newProduced, const std::string) const override;
    335339
    336340protected:
    337341    llvm::Value * getBaseAddress(IDISA::IDISA_Builder * const b, llvm::Value * handle) const override;
    338342   
    339     llvm::Value * getStreamSetBlockPtr(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * blockIndex) const override;
     343    llvm::Value * getBlockAddress(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * blockIndex) const override;
    340344
    341345   
  • icGREP/icgrep-devel/icgrep/kernels/until_n.cpp

    r5699 r5706  
    1818const unsigned packSize = 64;
    1919   
    20 void UntilNkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb) {
     20void UntilNkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
    2121/* 
    2222   Strategy:  first form an index consisting of one bit per packsize input positions,
     
    6060    Type * iPackPtrTy = iPackTy->getPointerTo();
    6161   
    62     Function::arg_iterator args = mCurrentMethod->arg_begin();
    63     /* self = */ args++;
    64     Value * itemsToDo = &*(args++);
    65     Value * sourceBitstream = &*(args++);
    66     Value * uptoN_bitstream = &*(args);
    67    
     62//    Function::arg_iterator args = mCurrentMethod->arg_begin();
     63//    /* self = */ args++;
     64//    Value * itemsToDo = &*(args++);
     65//    Value * sourceBitstream = &*(args++);
     66//    Value * uptoN_bitstream = &*(args);
     67   
     68    Value * itemsToDo = mAvailableItemCount[0];
     69    Value * sourceBitstream = kb->getInputStreamBlockPtr("bits", kb->getInt32(0)); // mStreamBufferPtr[0];
     70    Value * uptoN_bitstream = kb->getInputStreamBlockPtr("uptoN", kb->getInt32(0)); // mStreamBufferPtr[1];
     71
    6872    // Compute the ceiling of the number of blocks to do.  If we have a final
    6973    // partial block, it is treated as a full block initially.   
     
    189193UntilNkernel::UntilNkernel(const std::unique_ptr<kernel::KernelBuilder> & kb)
    190194: MultiBlockKernel("UntilN", {Binding{kb->getStreamSetTy(1, 1), "bits"}},
    191                              {Binding{kb->getStreamSetTy(1, 1), "uptoN", MaxRatio(1)}},
     195                             {Binding{kb->getStreamSetTy(1, 1), "uptoN", BoundedRate(0, 1)}},
    192196                             {Binding{kb->getSizeTy(), "N"}}, {},
    193197                             {Binding{kb->getSizeTy(), "seenSoFar"}}) {
  • icGREP/icgrep-devel/icgrep/kernels/until_n.h

    r5450 r5706  
    1515    UntilNkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    1616private:
    17     void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder) override;
     17    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
    1818
    1919};
Note: See TracChangeset for help on using the changeset viewer.