Changeset 6249


Ignore:
Timestamp:
Dec 21, 2018, 1:06:11 PM (5 weeks ago)
Author:
nmedfort
Message:

Migrated processed/produced item counts out of kernels and into pipeline + corrected misbehaving kernels.

Location:
icGREP/icgrep-devel/icgrep
Files:
19 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/CBuilder.cpp

    r6244 r6249  
    694694    inst->setOrdering(AtomicOrdering::Acquire);
    695695    return inst;
    696 
    697696}
    698697
  • icGREP/icgrep-devel/icgrep/editd/editd.cpp

    r6228 r6249  
    161161}
    162162
    163 typedef void (*preprocessFunctionType)(char * output_data, size_t output_size, const uint32_t fd);
     163#warning make a "CBuffer" class to abstract away the complexity of making these function typedefs.
     164
     165typedef void (*preprocessFunctionType)(char * output_data, size_t output_produced, size_t output_size, const uint32_t fd);
    164166
    165167static char * chStream;
     
    240242    const auto n = round_up_to(size, 8 * ALIGNMENT);
    241243    chStream = alloc.allocate((4 * n) / 8);
    242     preprocess(chStream, n, fd);
     244    preprocess(chStream, 0, n, fd);
    243245    close(fd);
    244246    return chStream;
  • icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp

    r6241 r6249  
    432432    auto P = mGrepDriver.makePipeline(
    433433                // inputs
    434                 {Binding{idb->getInt8Ty(), "useMMap"},
     434                {Binding{idb->getSizeTy(), "useMMap"},
    435435                Binding{idb->getInt32Ty(), "fileDescriptor"},
    436436                Binding{idb->getIntAddrTy(), "callbackObject"},
     
    494494    auto E = mGrepDriver.makePipeline(
    495495                // inputs
    496                 {Binding{idb->getInt8Ty(), "useMMap"},
     496                {Binding{idb->getSizeTy(), "useMMap"},
    497497                Binding{idb->getInt32Ty(), "fileDescriptor"},
    498498                Binding{idb->getIntAddrTy(), "callbackObject"},
  • icGREP/icgrep-devel/icgrep/kernels/block_kernel.cpp

    r6187 r6249  
    157157                offset = getPopCountRateItemCount(b, rate, mStrideBlockIndex);
    158158            }
    159             Value * const initial = b->getNonDeferredProcessedItemCount(input);
     159            Value * const initial = b->getProcessedItemCount(input.getName());
    160160            Value * const processed = b->CreateAdd(initial, offset);
    161             b->setNonDeferredProcessedItemCount(input, processed);
     161            b->setProcessedItemCount(input.getName(), processed);
    162162        }
    163163    }
     
    174174                offset = getPopCountRateItemCount(b, rate, mStrideBlockIndex);
    175175            }
    176             Value * const initial = b->getNonDeferredProducedItemCount(output);
     176            Value * const initial = b->getProducedItemCount(output.getName());
    177177            Value * const produced = b->CreateAdd(initial, offset);
    178             b->setNonDeferredProducedItemCount(output, produced);
     178            b->setProducedItemCount(output.getName(), produced);
    179179        }
    180180    }
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r6244 r6249  
    7070
    7171/** ------------------------------------------------------------------------------------------------------------- *
     72 * @brief reset
     73 ** ------------------------------------------------------------------------------------------------------------- */
     74template <typename Vec>
     75inline void reset(Vec & vec, const unsigned n) {
     76    vec.resize(n);
     77    std::fill_n(vec.begin(), n, nullptr);
     78}
     79
     80/** ------------------------------------------------------------------------------------------------------------- *
    7281 * @brief addBaseKernelProperties
    7382 *
     
    104113    IntegerType * const sizeTy = b->getSizeTy();
    105114
    106 //    addInternalScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
    107 
    108     // TODO: if we had a way of easily calculating the number of processed/produced items of the
    109     // final stride of a non-deferred fixed rate stream, we could avoid storing the item counts.
    110     for (unsigned i = 0; i < numOfInputStreams; ++i) {
    111         const Binding & input = mInputStreamSets[i];
    112         addInternalScalar(sizeTy, input.getName() + PROCESSED_ITEM_COUNT_SUFFIX);
    113         if (LLVM_UNLIKELY(input.isDeferred())) {
    114             addInternalScalar(sizeTy, input.getName() + NON_DEFERRED_ITEM_COUNT_SUFFIX);
    115         }
    116     }
    117 
    118115    // If an output is a managed buffer, we need to store both the buffer and a set of consumers.
    119116    for (unsigned i = 0; i < numOfOutputStreams; ++i) {
    120117        const Binding & output = mOutputStreamSets[i];
    121         addInternalScalar(sizeTy, output.getName() + PRODUCED_ITEM_COUNT_SUFFIX);
    122         if (LLVM_UNLIKELY(output.isDeferred())) {
    123             addInternalScalar(sizeTy, output.getName() + NON_DEFERRED_ITEM_COUNT_SUFFIX);
    124         }
    125118        if (LLVM_UNLIKELY(isLocalBuffer(output))) {
    126119            Type * const handleTy = mStreamSetOutputBuffers[i]->getHandleType(b);
     
    243236
    244237/** ------------------------------------------------------------------------------------------------------------- *
     238 * @brief isParamAddressable
     239 ** ------------------------------------------------------------------------------------------------------------- */
     240inline bool isParamAddressable(const Binding & binding) {
     241    if (binding.isDeferred()) {
     242        return true;
     243    }
     244    const ProcessingRate & rate = binding.getRate();
     245    return (rate.isBounded() || rate.isUnknown());
     246}
     247
     248/** ------------------------------------------------------------------------------------------------------------- *
     249 * @brief isParamConstant
     250 ** ------------------------------------------------------------------------------------------------------------- */
     251inline bool isParamConstant(const Binding & binding) {
     252    assert (!binding.isDeferred());
     253    const ProcessingRate & rate = binding.getRate();
     254    return rate.isFixed() || rate.isPopCount() || rate.isNegatedPopCount();
     255}
     256
     257/** ------------------------------------------------------------------------------------------------------------- *
     258 * @brief hasParam
     259 ** ------------------------------------------------------------------------------------------------------------- */
     260inline bool hasParam(const Binding & binding) {
     261    return !binding.getRate().isRelative();
     262}
     263
     264/** ------------------------------------------------------------------------------------------------------------- *
    245265 * @brief addDoSegmentDeclaration
    246266 ** ------------------------------------------------------------------------------------------------------------- */
     
    256276    for (unsigned i = 0; i < mInputStreamSets.size(); ++i) {
    257277        Type * const bufferType = mStreamSetInputBuffers[i]->getType();
    258         params.push_back(bufferType->getPointerTo()); // logical "base" input address
    259         params.push_back(sizeTy);  // accessible input items (after non-deferred processed item count)
     278        // logical base input address
     279        params.push_back(bufferType->getPointerTo());
     280        // processed input items
    260281        const Binding & input = mInputStreamSets[i];
    261         unsigned numOfPopCountArrays = 0;
     282        if (isParamAddressable(input)) {
     283            params.push_back(sizePtrTy); // updatable
     284        }  else if (isParamConstant(input)) {
     285            params.push_back(sizeTy);  // constant
     286        }
     287        // accessible input items (after non-deferred processed item count)
     288        params.push_back(sizeTy);
    262289        if (LLVM_UNLIKELY(input.hasAttribute(AttrId::RequiresPopCountArray))) {
    263             ++numOfPopCountArrays;
     290            params.push_back(sizePtrTy);
    264291        }
    265292        if (LLVM_UNLIKELY(input.hasAttribute(AttrId::RequiresNegatedPopCountArray))) {
    266             ++numOfPopCountArrays;
    267         }
    268         if (numOfPopCountArrays) {
    269             params.insert(params.end(), numOfPopCountArrays, sizePtrTy); // popCountRef array (length is numOfStrides)
    270         }
    271     }
     293            params.push_back(sizePtrTy);
     294        }
     295    }
     296
     297    const auto canTerminate = canSetTerminateSignal();
     298
    272299    for (unsigned i = 0; i < mOutputStreamSets.size(); ++i) {
    273300        const Binding & output = mOutputStreamSets[i];
     301        // logical base output address
    274302        if (LLVM_LIKELY(!isLocalBuffer(output))) {
    275303            Type * const bufferType = mStreamSetOutputBuffers[i]->getType();
    276             params.push_back(bufferType->getPointerTo()); // logical "base" output address
    277             params.push_back(sizeTy); // writable output items (after non-deferred produced item count)
    278         }
    279     }
    280 
    281     Type * const retTy = canSetTerminateSignal() ? b->getInt1Ty() : b->getVoidTy();
     304            params.push_back(bufferType->getPointerTo());
     305        }
     306        // produced output items
     307        if (canTerminate || isParamAddressable(output)) {
     308            params.push_back(sizePtrTy); // updatable
     309        } else if (isParamConstant(output)) {
     310            params.push_back(sizeTy); // constant
     311        }
     312        // writable output items (after non-deferred produced item count)
     313        if (LLVM_LIKELY(!isLocalBuffer(output))) {
     314            params.push_back(sizeTy);
     315        }
     316    }
     317
     318
     319    Type * const retTy = canTerminate ? b->getInt1Ty() : b->getVoidTy();
    282320    FunctionType * const doSegmentType = FunctionType::get(retTy, params, false);
    283321    Function * const doSegment = Function::Create(doSegmentType, GlobalValue::ExternalLinkage, getName() + DO_SEGMENT_SUFFIX, b->getModule());
     
    290328        const Binding & input = mInputStreamSets[i];
    291329        (++args)->setName(input.getName());
     330        if (LLVM_LIKELY(hasParam(input))) {
     331            (++args)->setName(input.getName() + "_processed");
     332        }
    292333        (++args)->setName(input.getName() + "_accessible");
    293334        if (LLVM_UNLIKELY(input.hasAttribute(AttrId::RequiresPopCountArray))) {
     
    302343        if (LLVM_LIKELY(!isLocalBuffer(output))) {
    303344            (++args)->setName(output.getName());
     345        }
     346        if (LLVM_LIKELY(hasParam(output))) {
     347            (++args)->setName(output.getName() + "_produced");
     348        }
     349        if (LLVM_LIKELY(!isLocalBuffer(output))) {
    304350            (++args)->setName(output.getName() + "_writable");
    305351        }
     
    307353    assert (std::next(args) == doSegment->arg_end());
    308354}
     355
     356
    309357
    310358/** ------------------------------------------------------------------------------------------------------------- *
     
    332380    // and call system here but that is not an ideal way of handling this.
    333381
    334     // TODO: use a graph to depict relations between binding? It would be better to first move to a model
    335     // where inputs and outputs are contained in a single parameter vector.
    336 
    337382    const auto numOfInputs = getNumOfStreamInputs();
    338     mAccessibleInputItems.resize(numOfInputs, nullptr);
    339     mAvailableInputItems.resize(numOfInputs, nullptr);
    340     mPopCountRateArray.resize(numOfInputs, nullptr);
    341     mNegatedPopCountRateArray.resize(numOfInputs, nullptr);
     383    reset(mProcessedInputItems, numOfInputs);
     384    reset(mAccessibleInputItems, numOfInputs);
     385    reset(mAvailableInputItems, numOfInputs);
     386    reset(mPopCountRateArray, numOfInputs);
     387    reset(mNegatedPopCountRateArray, numOfInputs);
     388    std::vector<Value *> updatableProcessedInputItems;
     389    reset(updatableProcessedInputItems, numOfInputs);
     390
     391    IntegerType * const sizeTy = b->getSizeTy();
     392
    342393    for (unsigned i = 0; i < numOfInputs; i++) {
     394        /// ----------------------------------------------------
     395        /// logical buffer base address
     396        /// ----------------------------------------------------
    343397        const Binding & input = mInputStreamSets[i];
    344398        assert (args != mCurrentMethod->arg_end());
     
    348402        buffer->setHandle(b, localHandle);
    349403        buffer->setBaseAddress(b.get(), addr);
     404        /// ----------------------------------------------------
     405        /// processed item count
     406        /// ----------------------------------------------------
     407
     408        // NOTE: we create a redundant alloca to store the input param so that
     409        // Mem2Reg can convert it into a PHINode if the item count is updated in
     410        // a loop; otherwise, it will be discarded in favor of the param itself.
     411
     412        Value * processed = nullptr;
     413        if (isParamAddressable(input)) {
     414            assert (args != mCurrentMethod->arg_end());
     415            updatableProcessedInputItems[i] = &*(args++);
     416            processed = b->CreateLoad(updatableProcessedInputItems[i]);
     417        } else if (LLVM_LIKELY(isParamConstant(input))) {
     418            assert (args != mCurrentMethod->arg_end());
     419            processed = &*(args++);
     420        } else { // isRelative
     421            const ProcessingRate & rate = input.getRate();
     422            Port port; unsigned index;
     423            std::tie(port, index) = getStreamPort(rate.getReference());
     424            assert (port == Port::Input && index < i);
     425            assert (mProcessedInputItems[index]);
     426            Value * const ref = b->CreateLoad(mProcessedInputItems[index]);
     427            processed = b->CreateMul2(ref, rate.getRate());
     428        }
     429        AllocaInst * const processedItems = b->CreateAlloca(sizeTy);
     430        b->CreateStore(processed, processedItems);
     431        mProcessedInputItems[i] = processedItems;
     432        /// ----------------------------------------------------
     433        /// accessible item count
     434        /// ----------------------------------------------------
    350435        assert (args != mCurrentMethod->arg_end());
    351436        Value * const accessible = &*(args++);
    352437        mAccessibleInputItems[i] = accessible;
    353         Value * const processed = b->getNonDeferredProcessedItemCount(input);
    354438        Value * capacity = b->CreateAdd(processed, accessible);
    355439        mAvailableInputItems[i] = capacity;
     
    358442        }
    359443        buffer->setCapacity(b.get(), capacity);
     444
    360445        if (LLVM_UNLIKELY(input.hasAttribute(AttrId::RequiresPopCountArray))) {
    361446            assert (args != mCurrentMethod->arg_end());
    362447            mPopCountRateArray[i] = &*(args++);
    363448        }
     449
    364450        if (LLVM_UNLIKELY(input.hasAttribute(AttrId::RequiresNegatedPopCountArray))) {
    365451            assert (args != mCurrentMethod->arg_end());
     
    370456    // set all of the output buffers
    371457    const auto numOfOutputs = getNumOfStreamOutputs();
    372     mWritableOutputItems.resize(numOfOutputs, nullptr);
     458    reset(mProducedOutputItems, numOfOutputs);
     459    reset(mWritableOutputItems, numOfOutputs);
     460    std::vector<Value *> updatableProducedOutputItems;
     461    reset(updatableProducedOutputItems, numOfOutputs);
     462
     463    const auto canTerminate = canSetTerminateSignal();
     464
    373465    for (unsigned i = 0; i < numOfOutputs; i++) {
    374         // If an output is a managed buffer, the address is stored within the state instead
    375         // of being passed in through the function call.
     466        /// ----------------------------------------------------
     467        /// logical buffer base address
     468        /// ----------------------------------------------------
     469
    376470        auto & buffer = mStreamSetOutputBuffers[i];
    377471        const Binding & output = mOutputStreamSets[i];
    378472        if (LLVM_UNLIKELY(isLocalBuffer(output))) {
     473            // If an output is a managed buffer, the address is stored within the state instead
     474            // of being passed in through the function call.
    379475            Value * const handle = b->getScalarFieldPtr(output.getName() + BUFFER_HANDLE_SUFFIX);
    380476            buffer->setHandle(b, handle);
     
    385481            buffer->setHandle(b, localHandle);
    386482            buffer->setBaseAddress(b.get(), logicalBaseAddress);
     483        }
     484        /// ----------------------------------------------------
     485        /// produced item count
     486        /// ----------------------------------------------------
     487        Value * produced = nullptr;
     488        if (LLVM_LIKELY(canTerminate || isParamAddressable(output))) {
    387489            assert (args != mCurrentMethod->arg_end());
     490            updatableProducedOutputItems[i] = &*(args++);
     491            produced = b->CreateLoad(updatableProducedOutputItems[i]);
     492        } else if (LLVM_LIKELY(isParamConstant(output))) {
     493            assert (args != mCurrentMethod->arg_end());
     494            produced = &*(args++);
     495        } else { // isRelative
     496
     497            // For now, if something is produced at a relative rate to another stream in a kernel that
     498            // may terminate, its final item count is inherited from its reference stream and cannot
     499            // be set independently. Should they be independent at early termination?
     500
     501            const ProcessingRate & rate = output.getRate();
     502            Port port; unsigned index;
     503            std::tie(port, index) = getStreamPort(rate.getReference());
     504            assert (port == Port::Input || (port == Port::Output && index < i));
     505            const auto & items = (port == Port::Input) ? mProcessedInputItems : mProducedOutputItems;
     506            Value * const ref = b->CreateLoad(items[index]);
     507            produced = b->CreateMul2(ref, rate.getRate());
     508        }
     509        AllocaInst * const producedItems = b->CreateAlloca(sizeTy);
     510        b->CreateStore(produced, producedItems);
     511        mProducedOutputItems[i] = producedItems;
     512        /// ----------------------------------------------------
     513        /// writable item count
     514        /// ----------------------------------------------------
     515        if (LLVM_UNLIKELY(!isLocalBuffer(output))) {
    388516            Value * const writable = &*(args++);
    389517            mWritableOutputItems[i] = writable;
    390             Value * const produced = b->getNonDeferredProducedItemCount(output);
    391518            Value * const capacity = b->CreateAdd(produced, writable);
    392519            buffer->setCapacity(b.get(), capacity);
    393520        }
     521
    394522    }
    395523    assert (args == mCurrentMethod->arg_end());
    396524
    397     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    398         Value * const terminated = b->getTerminationSignal();
    399         b->CreateAssert(b->CreateNot(terminated), getName() + " was called after termination");
    400     }
    401 
    402525    // initialize the termination signal if this kernel can set it
    403     if (canSetTerminateSignal()) {
     526    if (canTerminate) {
    404527        mTerminationSignalPtr = b->CreateAlloca(b->getInt1Ty(), nullptr, "terminationSignal");
    405528        b->CreateStore(b->getFalse(), mTerminationSignalPtr);
    406529    }
    407530
    408     // Calculate and/or load the accessible and writable item counts. If they are unneeded,
    409     // LLVM ought to recognize them as dead code and remove them.
    410     generateKernelMethod(b); // must be overridden by the Kernel subtype
     531    generateKernelMethod(b);
     532
    411533    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableMProtect))) {
    412534        b->CreateMProtect(mHandle, CBuilder::Protect::READ);
    413535    }
    414536
     537    for (unsigned i = 0; i < numOfInputs; i++) {
     538        if (updatableProcessedInputItems[i]) {
     539            Value * const items = b->CreateLoad(mProcessedInputItems[i]);
     540            b->CreateStore(items, updatableProcessedInputItems[i]);
     541        }
     542    }
     543
     544    for (unsigned i = 0; i < numOfOutputs; i++) {
     545        if (updatableProducedOutputItems[i]) {
     546            Value * const items = b->CreateLoad(mProducedOutputItems[i]);
     547            b->CreateStore(items, updatableProducedOutputItems[i]);
     548        }
     549    }
     550
    415551    // return the termination signal (if one exists)
    416     if (canSetTerminateSignal()) {
     552    if (canTerminate) {
    417553        b->CreateRet(b->CreateLoad(mTerminationSignalPtr));
    418554        mTerminationSignalPtr = nullptr;
     
    427563    mIsFinal = nullptr;
    428564    mNumOfStrides = nullptr;
    429     mAccessibleInputItems.clear();
    430     mPopCountRateArray.clear();
    431     mNegatedPopCountRateArray.clear();
    432565}
    433566
     
    598731    if (LLVM_LIKELY(mKernelStateType == nullptr)) {
    599732        std::vector<llvm::Type *> fields;
    600         fields.reserve(mInputScalars.size() + mOutputScalars.size() + mInternalScalars.size());
     733        fields.reserve(mInputScalars.size() + mOutputScalars.size() + mInternalScalars.size() + 1);
    601734        for (const Binding & scalar : mInputScalars) {
    602735            assert (scalar.getType());
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r6233 r6249  
    3434class BaseDriver;
    3535
    36 const static std::string LOGICAL_SEGMENT_NO_SCALAR = "segmentNo";
    37 const static std::string PROCESSED_ITEM_COUNT_SUFFIX = "_processedItemCount";
    38 const static std::string PRODUCED_ITEM_COUNT_SUFFIX = "_producedItemCount";
    3936const static std::string CONSUMED_ITEM_COUNT_SUFFIX = "_consumedItemCount";
    40 const static std::string NON_DEFERRED_ITEM_COUNT_SUFFIX = "_nonDeferredItemCount";
    4137const static std::string BUFFER_HANDLE_SUFFIX = "_buffer";
    4238const static std::string CYCLECOUNT_SCALAR = "CPUcycles";
     
    406402    void setStride(unsigned stride) { mStride = stride; }
    407403
    408     llvm::Value * getAccessibleInputItems(const llvm::StringRef name) const {
     404    LLVM_READNONE llvm::Value * getAccessibleInputItems(const llvm::StringRef name) const {
    409405        Port port; unsigned index;
    410406        std::tie(port, index) = getStreamPort(name);
     
    413409    }
    414410
    415     llvm::Value * getAccessibleInputItems(const unsigned index) const {
     411    LLVM_READNONE llvm::Value * getAccessibleInputItems(const unsigned index) const {
    416412        assert (index < mAccessibleInputItems.size());
    417413        return mAccessibleInputItems[index];
    418414    }
    419415
    420     llvm::Value * getAvailableInputItems(const llvm::StringRef name) const {
     416    LLVM_READNONE llvm::Value * getAvailableInputItems(const llvm::StringRef name) const {
    421417        Port port; unsigned index;
    422418        std::tie(port, index) = getStreamPort(name);
     
    425421    }
    426422
    427     llvm::Value * getAvailableInputItems(const unsigned index) const {
     423    LLVM_READNONE llvm::Value * getAvailableInputItems(const unsigned index) const {
    428424        assert (index < mAvailableInputItems.size());
    429425        return mAvailableInputItems[index];
     
    436432    }
    437433
    438     llvm::Value * getTerminationSignalPtr() const {
     434    LLVM_READNONE llvm::Value * getTerminationSignalPtr() const {
    439435        return mTerminationSignalPtr;
    440436    }
    441437
    442     llvm::Value * isFinal() const {
     438    LLVM_READNONE llvm::Value * getProcessedInputItemsPtr(const llvm::StringRef name) const {
     439        Port port; unsigned index;
     440        std::tie(port, index) = getStreamPort(name);
     441        assert (port == Port::Input);
     442        return mProcessedInputItems[index];
     443    }
     444
     445    LLVM_READNONE llvm::Value * getProducedOutputItemsPtr(const llvm::StringRef name) const {
     446        Port port; unsigned index;
     447        std::tie(port, index) = getStreamPort(name);
     448        assert (port == Port::Output);
     449        return mProducedOutputItems[index];
     450    }
     451
     452    LLVM_READNONE llvm::Value * isFinal() const {
    443453        return mIsFinal;
    444454    }
     
    466476    void addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & b);
    467477
    468     void deriveItemCounts(const std::unique_ptr<KernelBuilder> & b);
    469 
    470     llvm::Value * deriveItemCount(const std::unique_ptr<KernelBuilder> & b, const Binding & binding, llvm::Value * const strideIndex);
    471 
    472478    llvm::Function * getInitFunction(llvm::Module * const module) const;
    473479
     
    497503    llvm::Value *                   mNumOfStrides;
    498504
     505    std::vector<llvm::Value *>      mProcessedInputItems;
    499506    std::vector<llvm::Value *>      mAccessibleInputItems;
    500507    std::vector<llvm::Value *>      mAvailableInputItems;
    501508    std::vector<llvm::Value *>      mPopCountRateArray;
    502509    std::vector<llvm::Value *>      mNegatedPopCountRateArray;
     510    std::vector<llvm::Value *>      mProducedOutputItems;
    503511    std::vector<llvm::Value *>      mWritableOutputItems;
    504512
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r6237 r6249  
    3030}
    3131
    32 #warning TODO: make get scalar field able to get I/O scalars
    33 
    3432inline Value * KernelBuilder::getScalarFieldPtr(Value * const handle, const std::string & fieldName) {
    3533    ConstantInt * const index = getInt32(mKernel->getScalarIndex(fieldName));
     
    4139}
    4240
    43 Value * KernelBuilder::getScalarFieldPtr(const std::string & fieldName) {
     41Value * KernelBuilder::getScalarFieldPtr(const StringRef fieldName) {
    4442    return getScalarFieldPtr(mKernel->getHandle(), fieldName);
    4543}
    4644
    47 Value * KernelBuilder::getScalarField(const std::string & fieldName) {
    48     Value * const ptr = getScalarFieldPtr(fieldName);
    49     return CreateLoad(ptr, fieldName);
    50 }
    51 
    52 void KernelBuilder::setScalarField(const std::string & fieldName, Value * const value) {
    53     Value * const ptr = getScalarFieldPtr(fieldName);
    54     CreateStore(value, ptr);
     45Value * KernelBuilder::getScalarField(const StringRef fieldName) {
     46    return CreateLoad(getScalarFieldPtr(fieldName));
     47}
     48
     49void KernelBuilder::setScalarField(const StringRef fieldName, Value * const value) {
     50    CreateStore(value, getScalarFieldPtr(fieldName));
    5551}
    5652
    5753Value * KernelBuilder::getCycleCountPtr() {
    5854    return getScalarFieldPtr(CYCLECOUNT_SCALAR);
     55}
     56
     57/** ------------------------------------------------------------------------------------------------------------- *
     58 * @brief getProcessedItemCount
     59 ** ------------------------------------------------------------------------------------------------------------- */
     60Value * KernelBuilder::getProcessedItemCount(const StringRef name) {
     61    return CreateLoad(mKernel->getProcessedInputItemsPtr(name));
     62}
     63
     64/** ------------------------------------------------------------------------------------------------------------- *
     65 * @brief setProcessedItemCount
     66 ** ------------------------------------------------------------------------------------------------------------- */
     67void KernelBuilder::setProcessedItemCount(const StringRef name, Value * value) {
     68    CreateStore(value, mKernel->getProcessedInputItemsPtr(name));
     69}
     70
     71/** ------------------------------------------------------------------------------------------------------------- *
     72 * @brief getProducedItemCount
     73 ** ------------------------------------------------------------------------------------------------------------- */
     74Value * KernelBuilder::getProducedItemCount(const StringRef name) {
     75    return CreateLoad(mKernel->getProducedOutputItemsPtr(name));
     76}
     77
     78/** ------------------------------------------------------------------------------------------------------------- *
     79 * @brief setProducedItemCount
     80 ** ------------------------------------------------------------------------------------------------------------- */
     81void KernelBuilder::setProducedItemCount(const StringRef name, Value * value) {
     82    CreateStore(value, mKernel->getProducedOutputItemsPtr(name));
    5983}
    6084
     
    99123
    100124/** ------------------------------------------------------------------------------------------------------------- *
    101  * @brief getAvailableItemCount
    102  ** ------------------------------------------------------------------------------------------------------------- */
    103 Value * KernelBuilder::getAvailableItemCount(const std::string & name) {
    104     return mKernel->getAvailableInputItems(name);
    105 }
    106 
    107 /** ------------------------------------------------------------------------------------------------------------- *
    108  * @brief getAccessibleItemCount
    109  ** ------------------------------------------------------------------------------------------------------------- */
    110 Value * KernelBuilder::getAccessibleItemCount(const std::string & name) {
    111     return mKernel->getAccessibleInputItems(name);
    112 }
    113 
    114 /** ------------------------------------------------------------------------------------------------------------- *
    115125 * @brief getTerminationSignal
    116126 ** ------------------------------------------------------------------------------------------------------------- */
     
    135145    Value * const ptr = mKernel->getTerminationSignalPtr();
    136146    if (LLVM_UNLIKELY(ptr == nullptr)) {
    137         llvm::report_fatal_error(mKernel->getName() + " does not have CanTerminateEarly or MustExplicitlyTerminate set.");
     147        report_fatal_error(mKernel->getName() + " does not have CanTerminateEarly or MustExplicitlyTerminate set.");
    138148    }
    139149    CreateStore(value, ptr);
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.h

    r6237 r6249  
    1818    llvm::Value * getScalarFieldPtr(llvm::Value * index);
    1919
    20     llvm::Value * getScalarFieldPtr(const std::string & fieldName);
     20    llvm::Value * getScalarFieldPtr(const llvm::StringRef fieldName);
    2121
    22     llvm::Value * getScalarField(const std::string & fieldName);
     22    llvm::Value * getScalarField(const llvm::StringRef fieldName);
    2323
    2424    // Set the value of a scalar field for the current instance.
    25     void setScalarField(const std::string & fieldName, llvm::Value * value);
     25    void setScalarField(const llvm::StringRef fieldName, llvm::Value * value);
    2626
    27     llvm::Value * getAvailableItemCount(const std::string & name);
    28 
    29     llvm::Value * getAccessibleItemCount(const std::string & name);
    30 
    31     llvm::Value * getProcessedItemCount(const std::string & name) {
    32         return getNamedItemCount(name, PROCESSED_ITEM_COUNT_SUFFIX);
     27    llvm::Value * getAvailableItemCount(const llvm::StringRef name) {
     28        return mKernel->getAvailableInputItems(name);
    3329    }
    3430
    35     void setProcessedItemCount(const std::string & name, llvm::Value * value) {
    36         setNamedItemCount(name, PROCESSED_ITEM_COUNT_SUFFIX, value);
     31    llvm::Value * getAccessibleItemCount(const llvm::StringRef name) {
     32        return mKernel->getAccessibleInputItems(name);
    3733    }
    3834
    39     llvm::Value * getProducedItemCount(const std::string & name) {
    40         return getNamedItemCount(name, PRODUCED_ITEM_COUNT_SUFFIX);
    41     }
     35    llvm::Value * getProcessedItemCount(const llvm::StringRef name);
    4236
    43     void setProducedItemCount(const std::string & name, llvm::Value * value) {
    44         setNamedItemCount(name, PRODUCED_ITEM_COUNT_SUFFIX, value);
    45     }
     37    void setProcessedItemCount(const llvm::StringRef name, llvm::Value * value);
     38
     39    llvm::Value * getProducedItemCount(const llvm::StringRef name);
     40
     41    void setProducedItemCount(const llvm::StringRef name, llvm::Value * value);
    4642
    4743    llvm::Value * getConsumedItemCount(const std::string & name) {
     
    5147    void setConsumedItemCount(const std::string & name, llvm::Value * value) {
    5248        setNamedItemCount(name, CONSUMED_ITEM_COUNT_SUFFIX, value);
    53     }
    54 
    55     llvm::Value * getNonDeferredProcessedItemCount(const Binding & input) {
    56         return getNamedItemCount(input.getName(), input.isDeferred() ? NON_DEFERRED_ITEM_COUNT_SUFFIX : PROCESSED_ITEM_COUNT_SUFFIX);
    57     }
    58 
    59     void setNonDeferredProcessedItemCount(const Binding & input, llvm::Value * value) {
    60         setNamedItemCount(input.getName(), input.isDeferred() ? NON_DEFERRED_ITEM_COUNT_SUFFIX : PROCESSED_ITEM_COUNT_SUFFIX, value);
    61     }
    62 
    63     llvm::Value * getNonDeferredProducedItemCount(const Binding & output) {
    64         return getNamedItemCount(output.getName(), output.isDeferred() ? NON_DEFERRED_ITEM_COUNT_SUFFIX : PRODUCED_ITEM_COUNT_SUFFIX);
    65     }
    66 
    67     void setNonDeferredProducedItemCount(const Binding & output, llvm::Value * value) {
    68         setNamedItemCount(output.getName(), output.isDeferred() ? NON_DEFERRED_ITEM_COUNT_SUFFIX : PRODUCED_ITEM_COUNT_SUFFIX, value);
    6949    }
    7050
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/buffer_management_logic.hpp

    r6241 r6249  
    274274    }
    275275
    276   //  printBufferGraph(G, errs());
     276//    printBufferGraph(G, errs());
    277277
    278278    return G;
     
    443443
    444444/** ------------------------------------------------------------------------------------------------------------- *
    445  * @brief readInitialProducedItemCounts
    446  ** ------------------------------------------------------------------------------------------------------------- */
    447 inline void PipelineCompiler::readInitialProducedItemCounts(BuilderRef b) {
     445 * @brief readInitialItemCounts
     446 ** ------------------------------------------------------------------------------------------------------------- */
     447inline void PipelineCompiler::readInitialItemCounts(BuilderRef b) {
     448    b->setKernel(mPipelineKernel);
     449    const auto numOfInputs = mKernel->getNumOfStreamInputs();
     450    for (unsigned i = 0; i < numOfInputs; ++i) {
     451        const Binding & input = mKernel->getInputStreamSetBinding(i);
     452        const auto prefix = makeBufferName(mKernelIndex, input);
     453        mInitiallyProcessedItemCount[i] = b->getScalarField(prefix + ITEM_COUNT_SUFFIX);
     454        #ifdef PRINT_DEBUG_MESSAGES
     455        b->CallPrintInt(prefix + "_initialProcessed", mInitiallyProcessedItemCount[i]);
     456        #endif
     457        if (input.isDeferred()) {
     458            mInitiallyProcessedDeferredItemCount[i] = b->getScalarField(prefix + DEFERRED_ITEM_COUNT_SUFFIX);
     459            #ifdef PRINT_DEBUG_MESSAGES
     460            b->CallPrintInt(prefix + "_initialProducedDeferred", mInitiallyProcessedDeferredItemCount[i]);
     461            #endif
     462        }
     463    }
    448464    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    449465    for (unsigned i = 0; i < numOfOutputs; ++i) {
    450466        const Binding & output = mKernel->getOutputStreamSetBinding(i);
    451         Value * const produced = b->getNonDeferredProducedItemCount(output);
    452         mInitiallyProducedItemCount[i] = produced;
    453     }
     467        const auto prefix = makeBufferName(mKernelIndex, output);
     468        mInitiallyProducedItemCount[i] = b->getScalarField(prefix + ITEM_COUNT_SUFFIX);
     469        #ifdef PRINT_DEBUG_MESSAGES
     470        b->CallPrintInt(prefix + "_initialProduced", mInitiallyProducedItemCount[i]);
     471        #endif
     472    }
     473    b->setKernel(mKernel);
     474}
     475
     476/** ------------------------------------------------------------------------------------------------------------- *
     477 * @brief writeUpdatedItemCounts
     478 ** ------------------------------------------------------------------------------------------------------------- */
     479inline void PipelineCompiler::writeUpdatedItemCounts(BuilderRef b) {
     480    b->setKernel(mPipelineKernel);
     481    const auto numOfInputs = mKernel->getNumOfStreamInputs();
     482    for (unsigned i = 0; i < numOfInputs; ++i) {
     483        const Binding & input = mKernel->getInputStreamSetBinding(i);
     484        const auto prefix = makeBufferName(mKernelIndex, input);
     485        b->setScalarField(prefix + ITEM_COUNT_SUFFIX, mUpdatedProcessedPhi[i]);
     486        if (input.isDeferred()) {
     487            b->setScalarField(prefix + DEFERRED_ITEM_COUNT_SUFFIX, mUpdatedProcessedDeferredPhi[i]);
     488        }
     489    }
     490    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     491    for (unsigned i = 0; i < numOfOutputs; ++i) {
     492        const Binding & output = mKernel->getOutputStreamSetBinding(i);
     493        const auto prefix = makeBufferName(mKernelIndex, output);
     494        b->setScalarField(prefix + ITEM_COUNT_SUFFIX, mUpdatedProducedPhi[i]);
     495    }
     496    b->setKernel(mKernel);
    454497}
    455498
     
    459502inline void PipelineCompiler::readFinalProducedItemCounts(BuilderRef b) {
    460503    for (const auto e : make_iterator_range(out_edges(mKernelIndex, mBufferGraph))) {
    461         const Binding & output = mKernel->getOutputStreamSetBinding(mBufferGraph[e].Port);
    462         Value * const produced = b->getNonDeferredProducedItemCount(output);
    463         // TODO: we only need to consider the blocksize attribute if it's possible this
    464         // stream could be read before being fully written. This might occur if one of
    465         // its consumers is a PopCount rate that does not have a matching BlockSize
    466         // attribute.
    467         Value * fullyProduced = truncateBlockSize(b, output, produced, mTerminatedFlag);
    468504        const auto bufferVertex = target(e, mBufferGraph);
     505        const auto outputPort = mBufferGraph[e].Port;
     506        Value * fullyProduced = mFullyProducedItemCount[outputPort];
    469507        BufferNode & bn = mBufferGraph[bufferVertex];
    470508        assert (bn.TotalItems == nullptr);
     
    473511        initializePopCountReferenceItemCount(b, bufferVertex, fullyProduced);
    474512        #ifdef PRINT_DEBUG_MESSAGES
     513        const auto & output = mKernel->getOutputStreamSetBinding(outputPort);
    475514        const auto prefix = makeBufferName(mKernelIndex, output);
    476         b->CallPrintInt(prefix + "_produced'", produced);
     515        b->CallPrintInt(prefix + "_fullyProduced", fullyProduced);
    477516        #endif
    478517    }
     
    529568            const StreamSetBuffer * const buffer = getOutputBuffer(i);
    530569            Value * const capacity = buffer->getCapacity(b.get());
    531             Value * const priorOffset = b->CreateURem(mAlreadyProducedItemCount[i], capacity);
    532             Value * const produced = b->getNonDeferredProducedItemCount(output);
     570            Value * const priorOffset = b->CreateURem(mAlreadyProducedPhi[i], capacity);
     571            Value * const produced = mProducedItemCount[i];
    533572            Value * const producedOffset = b->CreateURem(produced, capacity);
    534573            Value * const nonCapacityAlignedWrite = b->CreateIsNotNull(producedOffset);
     
    538577
    539578            b->SetInsertPoint(copyBack);
     579            #ifdef PRINT_DEBUG_MESSAGES
     580            b->CallPrintInt(prefix + "_CopyBack", producedOffset);
     581            #endif
    540582            writeOverflowCopy(b, buffer, OverflowCopy::Backwards, producedOffset);
    541583            b->CreateBr(copyExit);
     
    564606            Value * const capacity = buffer->getCapacity(b.get());
    565607            Value * const initial = mInitiallyProducedItemCount[i];
    566             Value * const produced = b->getNonDeferredProducedItemCount(output);
     608            Value * const produced = mUpdatedProducedPhi[i];
    567609
    568610            // If we wrote anything and it was not our first write to the buffer ...
     
    580622            Value * const wroteToFirstBlock = b->CreateAnd(overwroteData, startedWithinFirstBlock);
    581623
    582             // Or we started writing at the end of the buffer but wrapped over to the start of it,
     624            // Or we started writing at the end of the buffer but wrapped over to the start of it,
    583625            Value * const producedOffset = b->CreateURem(produced, capacity);
    584626            Value * const wroteFromEndToStart = b->CreateICmpULT(producedOffset, initialOffset);
     
    586628            // Then mirror the data in the overflow region.
    587629            Value * const needsCopyForward = b->CreateOr(wroteToFirstBlock, wroteFromEndToStart);
     630
     631
    588632            b->CreateUnlikelyCondBr(needsCopyForward, copyForward, copyExit);
    589633
     
    595639
    596640            b->SetInsertPoint(copyForward);
     641            #ifdef PRINT_DEBUG_MESSAGES
     642            b->CallPrintInt(prefix + "_CopyForward.initialOffset", initialOffset);
     643            b->CallPrintInt(prefix + "_CopyForward.producedOffset", producedOffset);
     644            #endif
    597645            writeOverflowCopy(b, buffer, OverflowCopy::Forwards, overflowSize);
    598646            b->CreateBr(copyExit);
     
    632680    const Binding & input = mKernel->getInputStreamSetBinding(inputPort);
    633681    const StreamSetBuffer * const buffer = getInputBuffer(inputPort);
    634     Value * const processed = getAlreadyProcessedItemCount(b, inputPort);
     682    Value * const processed = mAlreadyProcessedPhi[inputPort];
    635683    return calculateLogicalBaseAddress(b, input, buffer, processed);
    636684}
     
    642690    const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
    643691    const StreamSetBuffer * const buffer = getOutputBuffer(outputPort);
    644     Value * const produced = getAlreadyProducedItemCount(b, outputPort);
     692    Value * const produced = mAlreadyProducedPhi[outputPort];
    645693    return calculateLogicalBaseAddress(b, output, buffer, produced);
    646694}
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/consumer_logic.hpp

    r6228 r6249  
    8585        b->setKernel(mKernel);
    8686    } else if (bn.Type == BufferType::Managed) {
     87        b->setKernel(mKernel);
    8788        consumed = b->getScalarField(output.getName() + CONSUMED_ITEM_COUNT_SUFFIX);
    8889    } else if (bn.Type == BufferType::External) {
     
    9293    return consumed;
    9394}
    94 
    9595
    9696/** ------------------------------------------------------------------------------------------------------------- *
     
    103103    assert (producer->getHandle());
    104104    const Binding & output = producer->getOutputStreamSetBinding(mConsumerGraph[pe]);
     105    #ifdef PRINT_DEBUG_MESSAGES
     106    const auto prefix = makeBufferName(producerVertex, output);
     107    b->CallPrintInt(prefix + CONSUMED_ITEM_COUNT_SUFFIX, consumed);
     108    #endif
    105109    if (LLVM_UNLIKELY(storedInNestedKernel(output))) {
    106110        b->setKernel(producer);
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/core_logic.hpp

    r6244 r6249  
    2929 ** ------------------------------------------------------------------------------------------------------------- */
    3030inline void PipelineCompiler::addInternalKernelProperties(BuilderRef b, const unsigned kernelIndex) {
    31 //    Kernel * const kernel = mPipeline[kernelIndex];
     31
    3232    IntegerType * const sizeTy = b->getSizeTy();
    3333
     
    3535    // TODO: prove two termination signals can be fused into a single counter?
    3636    mPipelineKernel->addInternalScalar(sizeTy, name + TERMINATION_SIGNAL);
     37    mPipelineKernel->addInternalScalar(sizeTy, name + LOGICAL_SEGMENT_SUFFIX);
     38
    3739    // TODO: non deferred item count for fixed rates could be calculated from seg no.
    38     mPipelineKernel->addInternalScalar(sizeTy, name + LOGICAL_SEGMENT_NO_SCALAR);
    39 
    40 //    const auto numOfInputs = kernel->getNumOfStreamInputs();
    41 //    for (unsigned i = 0; i < numOfInputs; i++) {
    42 //        const Binding & input = kernel->getInputStreamSetBinding(i);
    43 //        const auto prefix = makeBufferName(kernelIndex, input);
    44 //        mPipelineKernel->addInternalScalar(sizeTy, prefix + PROCESSED_ITEM_COUNT_SUFFIX);
    45 //        if (input.isDeferred()) {
    46 //            mPipelineKernel->addInternalScalar(sizeTy, prefix + NON_DEFERRED_ITEM_COUNT_SUFFIX);
    47 //        }
    48 //    }
    49 
    50 //    const auto numOfOutputs = kernel->getNumOfStreamOutputs();
    51 //    for (unsigned i = 0; i < numOfOutputs; i++) {
    52 //        const Binding & output = kernel->getOutputStreamSetBinding(i);
    53 //        const auto prefix = makeBufferName(kernelIndex, output);
    54 //        mPipelineKernel->addInternalScalar(sizeTy, prefix + PRODUCED_ITEM_COUNT_SUFFIX);
    55 //        if (output.isDeferred()) {
    56 //            mPipelineKernel->addInternalScalar(sizeTy, prefix + NON_DEFERRED_ITEM_COUNT_SUFFIX);
    57 //        }
    58 //    }
     40    // Should I separate non-deferred from normal item counts to improve cache locality?
     41    const Kernel * const kernel = mPipeline[kernelIndex];
     42    const auto numOfInputs = kernel->getNumOfStreamInputs();
     43    for (unsigned i = 0; i < numOfInputs; i++) {
     44        const Binding & input = kernel->getInputStreamSetBinding(i);
     45        const auto prefix = makeBufferName(kernelIndex, input);
     46        if (input.isDeferred()) {
     47            mPipelineKernel->addInternalScalar(sizeTy, prefix + DEFERRED_ITEM_COUNT_SUFFIX);
     48        }
     49        mPipelineKernel->addInternalScalar(sizeTy, prefix + ITEM_COUNT_SUFFIX);
     50    }
     51
     52    const auto numOfOutputs = kernel->getNumOfStreamOutputs();
     53    for (unsigned i = 0; i < numOfOutputs; i++) {
     54        const Binding & output = kernel->getOutputStreamSetBinding(i);
     55        const auto prefix = makeBufferName(kernelIndex, output);
     56        if (output.isDeferred()) {
     57            mPipelineKernel->addInternalScalar(sizeTy, prefix + DEFERRED_ITEM_COUNT_SUFFIX);
     58        }
     59        mPipelineKernel->addInternalScalar(sizeTy, prefix + ITEM_COUNT_SUFFIX);
     60    }
    5961
    6062}
     
    136138    resetMemoizedFields();
    137139    mPortOrdering = lexicalOrderingOfStreamIO();
    138     loadBufferHandles(b);
    139 
    140     mKernelEntry = b->GetInsertBlock();
    141 
    142     const auto kernelName = makeKernelName(mKernelIndex);
    143     BasicBlock * const checkProducers = b->CreateBasicBlock(kernelName + "_checkProducers", mPipelineEnd);
    144     mKernelLoopEntry = b->CreateBasicBlock(kernelName + "_loopEntry", mPipelineEnd);
    145     mKernelLoopCall = b->CreateBasicBlock(kernelName + "_executeKernel", mPipelineEnd);
    146     mKernelLoopExit = b->CreateBasicBlock(kernelName + "_loopExit", mPipelineEnd);
    147     mKernelExit = b->CreateBasicBlock(kernelName + "_kernelExit", mPipelineEnd);
     140
     141    const auto prefix = makeKernelName(mKernelIndex);
     142    mKernelLoopEntry = b->CreateBasicBlock(prefix + "_loopEntry", mPipelineEnd);
     143    mKernelLoopCall = b->CreateBasicBlock(prefix + "_executeKernel", mPipelineEnd);
     144    mKernelTerminationCheck = b->CreateBasicBlock(prefix + "_normalTerminationCheck", mPipelineEnd);
     145    mKernelTerminated = b->CreateBasicBlock(prefix + "_terminated", mPipelineEnd);
     146    mKernelLoopExit = b->CreateBasicBlock(prefix + "_loopExit", mPipelineEnd);
     147    mKernelExit = b->CreateBasicBlock(prefix + "_kernelExit", mPipelineEnd);
    148148    // The phi catch simplifies compilation logic by "forward declaring" the loop exit point.
    149149    // Subsequent optimization phases will collapse it into the correct exit block.
    150     mKernelLoopExitPhiCatch = b->CreateBasicBlock(kernelName + "_kernelExitPhiCatch", mPipelineEnd);
     150    mKernelLoopExitPhiCatch = b->CreateBasicBlock(prefix + "_kernelExitPhiCatch", mPipelineEnd);
    151151
    152152    /// -------------------------------------------------------------------------------------
    153153    /// KERNEL ENTRY
    154154    /// -------------------------------------------------------------------------------------
    155     Value * const initiallyTerminated = getInitialTerminationSignal(b);
    156     #ifdef PRINT_DEBUG_MESSAGES
    157     if (1) {
    158     Constant * const MAX_INT = ConstantInt::getAllOnesValue(mSegNo->getType());
    159     Value * const round = b->CreateSelect(initiallyTerminated, MAX_INT, mSegNo);
    160     b->CallPrintInt("--- " + kernelName + "_start ---", round);
    161     }
    162     #endif
    163     b->CreateUnlikelyCondBr(initiallyTerminated, mKernelExit, checkProducers);
    164 
    165     /// -------------------------------------------------------------------------------------
    166     /// KERNEL CHECK PRODUCERS
    167     /// -------------------------------------------------------------------------------------
    168 
    169     b->SetInsertPoint(checkProducers);
    170     readInitialProducedItemCounts(b);
    171     b->CreateBr(mKernelLoopEntry);
    172 
    173     // Set up some PHI nodes "early" to simplify accumulating their incoming values.
     155
     156    loadBufferHandles(b);
     157    readInitialItemCounts(b);
     158    mKernelEntry = b->GetInsertBlock();
     159    b->CreateUnlikelyCondBr(initiallyTerminated(b), mKernelExit, mKernelLoopEntry);
     160
     161    // Set up some PHI nodes early to simplify accumulating their incoming values.
     162    initializeKernelLoopEntryPhis(b);
     163    initializeKernelCallPhis(b);
     164    initializeKernelTerminatedPhis(b);
     165    initializeKernelLoopExitPhis(b);
     166    initializeKernelExitPhis(b);
    174167
    175168    /// -------------------------------------------------------------------------------------
    176169    /// KERNEL LOOP ENTRY
    177     /// -------------------------------------------------------------------------------------
    178 
    179     b->SetInsertPoint(mKernelLoopEntry);
    180     // Since we may loop and call the kernel again, we want to mark that we've progressed
    181     // if we execute any kernel even if we could not complete a full segment.
    182     if (mPipelineProgress) {
    183         mAlreadyProgressedPhi = b->CreatePHI(b->getInt1Ty(), 2, kernelName + "_madeProgress");
    184         mAlreadyProgressedPhi->addIncoming(mPipelineProgress, checkProducers);
    185     }
    186 
    187     /// -------------------------------------------------------------------------------------
    188     /// KERNEL CALL
    189     /// -------------------------------------------------------------------------------------
    190 
    191     b->SetInsertPoint(mKernelLoopCall);
    192     initializeKernelCallPhis(b);
    193 
    194     /// -------------------------------------------------------------------------------------
    195     /// KERNEL LOOP EXIT
    196     /// -------------------------------------------------------------------------------------
    197 
    198     b->SetInsertPoint(mKernelLoopExit);
    199     mTerminatedPhi = b->CreatePHI(b->getInt1Ty(), 2, kernelName + "_terminated");
    200     if (mPipelineProgress) {
    201         mHasProgressedPhi = b->CreatePHI(b->getInt1Ty(), 2, kernelName + "_anyProgress");
    202     }
    203 
    204     /// -------------------------------------------------------------------------------------
    205     /// KERNEL EXIT
    206     /// -------------------------------------------------------------------------------------
    207 
    208     b->SetInsertPoint(mKernelExit);
    209     initializeKernelExitPhis(b);
    210 
    211     /// -------------------------------------------------------------------------------------
    212     /// KERNEL LOOP ENTRY (CONTINUED)
    213170    /// -------------------------------------------------------------------------------------
    214171
     
    217174    determineNumOfLinearStrides(b);
    218175
     176    // TODO: it would be better to try and statically prove whether a kernel will only ever
     177    // need a single "run" per segment rather than allowing only source kernels to have this
     178    // optimization.
     179
    219180    Value * isFinal = nullptr;
    220181
    221     ConstantInt * const ZERO = b->getSize(0);
    222 
    223182    if (mNumOfLinearStrides) {
    224183
    225         BasicBlock * const enteringNonFinalSegment = b->CreateBasicBlock(kernelName + "_nonFinalSegment", mKernelLoopCall);
    226         BasicBlock * const enteringFinalStride = b->CreateBasicBlock(kernelName + "_finalStride", mKernelLoopCall);
    227 
    228         isFinal = b->CreateICmpEQ(mNumOfLinearStrides, ZERO);
     184        BasicBlock * const enteringNonFinalSegment = b->CreateBasicBlock(prefix + "_nonFinalSegment", mKernelLoopCall);
     185        BasicBlock * const enteringFinalStride = b->CreateBasicBlock(prefix + "_finalStride", mKernelLoopCall);
     186
     187        isFinal = b->CreateICmpEQ(mNumOfLinearStrides, b->getSize(0));
    229188
    230189        b->CreateUnlikelyCondBr(isFinal, enteringFinalStride, enteringNonFinalSegment);
     
    247206
    248207    } else {
    249         mNumOfLinearStrides = ZERO;
     208        mNumOfLinearStrides = b->getSize(1);
     209        calculateNonFinalItemCounts(b);
    250210        b->CreateBr(mKernelLoopCall);
    251211    }
    252212
    253213    /// -------------------------------------------------------------------------------------
    254     /// KERNEL CALL (CONTINUED)
     214    /// KERNEL CALL
    255215    /// -------------------------------------------------------------------------------------
    256216
     
    259219    writeKernelCall(b);
    260220
    261     BasicBlock * const incrementItemCounts = b->CreateBasicBlock(kernelName + "_incrementItemCounts", mKernelLoopExit);
    262     BasicBlock * const terminationCheck = b->CreateBasicBlock(kernelName + "_normalTerminationCheck", mKernelLoopExit);
    263     BasicBlock * const terminated = b->CreateBasicBlock(kernelName + "_terminated", mKernelLoopExit);
    264 
    265     // If the kernel itself terminates, it must set the final processed/produced item counts.
     221    BasicBlock * const copyBack =
     222            b->CreateBasicBlock(prefix + "_copyBack", mKernelTerminationCheck);
     223    BasicBlock * const abnormalTermination =
     224            b->CreateBasicBlock(prefix + "_abnormalTermination", mKernelTerminationCheck);
     225
     226    // If the kernel explicitly terminates, it must set its processed/produced item counts.
    266227    // Otherwise, the pipeline will update any countable rates, even upon termination.
    267     b->CreateUnlikelyCondBr(mTerminationExplicitly, terminated, incrementItemCounts);
    268 
    269     /// -------------------------------------------------------------------------------------
    270     /// KERNEL INCREMENT ITEM COUNTS
    271     /// -------------------------------------------------------------------------------------
    272 
    273     b->SetInsertPoint(incrementItemCounts);
    274     // TODO: phi out the item counts and set them once at the end.
    275     incrementItemCountsOfCountableRateStreams(b);
     228    b->CreateUnlikelyCondBr(mTerminationExplicitly, abnormalTermination, copyBack);
     229
     230    /// -------------------------------------------------------------------------------------
     231    /// KERNEL COPY BACK
     232    /// -------------------------------------------------------------------------------------
     233
     234    b->SetInsertPoint(copyBack);
    276235    writeCopyBackLogic(b);
    277     b->CreateBr(terminationCheck);
     236    b->CreateBr(mKernelTerminationCheck);
    278237
    279238    /// -------------------------------------------------------------------------------------
     
    281240    /// -------------------------------------------------------------------------------------
    282241
    283     b->SetInsertPoint(terminationCheck);
    284     if (isFinal) {
    285         if (mAlreadyProgressedPhi) {
    286             mAlreadyProgressedPhi->addIncoming(b->getTrue(), terminationCheck);
    287         }
    288         b->CreateUnlikelyCondBr(isFinal, terminated, mKernelLoopEntry);
    289     } else { // just exit the loop
    290         if (mHasProgressedPhi) {
    291             mHasProgressedPhi->addIncoming(b->getTrue(), terminationCheck);
    292         }
    293         mTerminatedPhi->addIncoming(b->getFalse(), terminationCheck);
    294         b->CreateBr(mKernelLoopExit);
    295     }
     242    b->SetInsertPoint(mKernelTerminationCheck);
     243    normalTerminationCheck(b, isFinal);
     244
     245    /// -------------------------------------------------------------------------------------
     246    /// KERNEL ABNORMAL TERMINATION
     247    /// -------------------------------------------------------------------------------------
     248
     249    b->SetInsertPoint(abnormalTermination);
     250    loadItemCountsOfCountableRateStreams(b);
     251    b->CreateBr(mKernelTerminated);
    296252
    297253    /// -------------------------------------------------------------------------------------
     
    299255    /// -------------------------------------------------------------------------------------
    300256
    301     b->SetInsertPoint(terminated);
     257    b->SetInsertPoint(mKernelTerminated);
    302258    zeroFillPartiallyWrittenOutputStreams(b);
    303259    setTerminated(b, b->getTrue());
    304     BasicBlock * const kernelTerminatedEnd = b->GetInsertBlock();
    305     mTerminatedPhi->addIncoming(b->getTrue(), kernelTerminatedEnd);
    306     if (mHasProgressedPhi) {
    307         mHasProgressedPhi->addIncoming(b->getTrue(), kernelTerminatedEnd);
    308     }
     260    updatePhisAfterTermination(b);
    309261    b->CreateBr(mKernelLoopExit);
    310262
    311263    /// -------------------------------------------------------------------------------------
    312     /// KERNEL LOOP EXIT (CONTINUED)
     264    /// KERNEL LOOP EXIT
    313265    /// -------------------------------------------------------------------------------------
    314266
    315267    b->SetInsertPoint(mKernelLoopExit);
     268    writeUpdatedItemCounts(b);
    316269    computeFullyProcessedItemCounts(b);
    317270    computeMinimumConsumedItemCounts(b);
    318271    computeMinimumPopCountReferenceCounts(b);
     272    computeFullyProducedItemCounts(b);
    319273    writeCopyForwardLogic(b);
    320274    writePopCountComputationLogic(b);
     
    330284    writeFinalConsumedItemCounts(b);
    331285    updatePopCountReferenceCounts(b);
    332 
    333     // TODO: logically we should only need to read produced item counts in the loop exit; however, that
    334     // would mean that we'd first need to load the initial produced item counts prior to the loop entry
    335     // to have access to them here and then PHI them out within the kernel loop
    336 
    337286    readFinalProducedItemCounts(b);
    338287    updateOptionalCycleCounter(b);
    339288
    340289    assert (mKernel == mPipeline[mKernelIndex] && b->getKernel() == mKernel);
    341 }
    342 
    343 // Synchronization actions for executing a kernel for a particular logical segment.
    344 
    345 // Before the segment is processed, CreateAtomicLoadAcquire must be used to load
    346 // the segment number of the kernel state to ensure that the previous segment is
    347 // complete (by checking that the acquired segment number is equal to the desired segment
    348 // number).
    349 
    350 // After all segment processing actions for the kernel are complete, and any necessary
    351 // data has been extracted from the kernel for further pipeline processing, the
    352 // segment number must be incremented and stored using CreateAtomicStoreRelease.
    353 
    354 /** ------------------------------------------------------------------------------------------------------------- *
    355  * @brief synchronize
    356  ** ------------------------------------------------------------------------------------------------------------- */
    357 void PipelineCompiler::synchronize(BuilderRef b) {
    358 
    359     b->setKernel(mPipelineKernel);
    360     const auto prefix = makeKernelName(mKernelIndex);
    361     const auto serialize = codegen::DebugOptionIsSet(codegen::SerializeThreads);
    362     const unsigned waitingOnIdx = serialize ? (mPipeline.size() - 1) : mKernelIndex;
    363     const auto waitingOn = makeKernelName(waitingOnIdx);
    364     Value * const waitingOnPtr = b->getScalarFieldPtr(waitingOn + LOGICAL_SEGMENT_NO_SCALAR);
    365     BasicBlock * const kernelWait = b->CreateBasicBlock(prefix + "Wait", mPipelineEnd);
    366     b->CreateBr(kernelWait);
    367 
    368     b->SetInsertPoint(kernelWait);
    369     Value * const processedSegmentCount = b->CreateAtomicLoadAcquire(waitingOnPtr);
    370     assert (processedSegmentCount->getType() == mSegNo->getType());
    371     Value * const ready = b->CreateICmpEQ(mSegNo, processedSegmentCount);
    372     BasicBlock * const kernelStart = b->CreateBasicBlock(prefix + "Start", mPipelineEnd);
    373     b->CreateCondBr(ready, kernelStart, kernelWait);
    374 
    375     b->SetInsertPoint(kernelStart);
    376     b->setKernel(mKernel);
    377 }
    378 
    379 /** ------------------------------------------------------------------------------------------------------------- *
    380  * @brief releaseCurrentSegment
    381  ** ------------------------------------------------------------------------------------------------------------- */
    382 inline void PipelineCompiler::releaseCurrentSegment(BuilderRef b) {
    383     b->setKernel(mPipelineKernel);
    384     Value * const nextSegNo = b->CreateAdd(mSegNo, b->getSize(1));
    385     const auto prefix = makeKernelName(mKernelIndex);
    386     Value * const waitingOnPtr = b->getScalarFieldPtr(prefix + LOGICAL_SEGMENT_NO_SCALAR);
    387     b->CreateAtomicStoreRelease(nextSegNo, waitingOnPtr);
    388290}
    389291
     
    414316    Value * notEnoughSpace = b->getFalse();
    415317    for (const auto e : make_iterator_range(in_edges(pipelineOutputVertex, mBufferGraph))) {
    416         // TODO: not a very elegant way here; revise
     318
    417319        const auto bufferVertex = source(e, mBufferGraph);
    418         setActiveKernel(b, parent(bufferVertex, mBufferGraph));
    419         resetMemoizedFields();
    420         const auto outputPort = mBufferGraph[e].Port;
    421         Value * const writable = getWritableOutputItems(b, outputPort);
     320        const BufferNode & bn = mBufferGraph[bufferVertex];
     321        const StreamSetBuffer * const buffer = bn.Buffer;
     322
     323        Value * const produced = bn.TotalItems; assert (produced);
     324        Value * const consumed = b->getSize(0);  assert (consumed);
     325        Value * const writable = buffer->getLinearlyWritableItems(b, produced, consumed, getCopyBack(bufferVertex));
     326
     327//        const auto kernelVertex = parent(bufferVertex, mBufferGraph);
     328        const BufferRateData & rd = mBufferGraph[e];
     329        const auto outputPort = rd.Port;
     330
    422331        // NOTE: this method doesn't check a popcount's ref stream to determine how many
    423332        // items we actually require. Instead it just calculates them as bounded rates.
     
    425334        // that writes to this output actually consumes. Since this effectively adds a
    426335        // delay equivalent to a LookAhead of a full stride, this doesn't seem useful.
    427         Value * const strideLength = getMaximumStrideLength(b, Port::Output, outputPort);
     336        const Binding & output = mPipelineKernel->getOutputStreamSetBinding(outputPort);
     337        Value * const strideLength = getMaximumStrideLength(b, mPipelineKernel, output);
    428338        notEnoughSpace = b->CreateOr(b->CreateICmpULT(writable, strideLength), notEnoughSpace);
    429339    }
     
    517427
    518428/** ------------------------------------------------------------------------------------------------------------- *
     429 * @brief initializeKernelLoopEntryPhis
     430 ** ------------------------------------------------------------------------------------------------------------- */
     431inline void PipelineCompiler::initializeKernelLoopEntryPhis(BuilderRef b) {
     432    b->SetInsertPoint(mKernelLoopEntry);
     433    const auto numOfInputs = mKernel->getNumOfStreamInputs();
     434    Type * const sizeTy = b->getSizeTy();
     435    for (unsigned i = 0; i < numOfInputs; ++i) {
     436        const Binding & input = mKernel->getInputStreamSetBinding(i);
     437        const auto prefix = makeBufferName(mKernelIndex, input);
     438        mAlreadyProcessedPhi[i] = b->CreatePHI(sizeTy, 2, prefix + "_alreadyProcessed");
     439        mAlreadyProcessedPhi[i]->addIncoming(mInitiallyProcessedItemCount[i], mKernelEntry);
     440        if (mInitiallyProcessedDeferredItemCount[i]) {
     441            mAlreadyProcessedDeferredPhi[i] = b->CreatePHI(sizeTy, 2, prefix + "_alreadyProcessedDeferred");
     442            mAlreadyProcessedDeferredPhi[i]->addIncoming(mInitiallyProcessedDeferredItemCount[i], mKernelEntry);
     443        }
     444    }
     445    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     446    for (unsigned i = 0; i < numOfOutputs; ++i) {
     447        const Binding & output = mKernel->getOutputStreamSetBinding(i);
     448        const auto prefix = makeBufferName(mKernelIndex, output);
     449        mAlreadyProducedPhi[i] = b->CreatePHI(sizeTy, 2, prefix + "_alreadyProduced");
     450        mAlreadyProducedPhi[i]->addIncoming(mInitiallyProducedItemCount[i], mKernelEntry);
     451    }
     452    // Since we may loop and call the kernel again, we want to mark that we've progressed
     453    // if we execute any kernel even if we could not complete a full segment.
     454    if (mPipelineProgress) {
     455        const auto prefix = makeKernelName(mKernelIndex);
     456        mAlreadyProgressedPhi = b->CreatePHI(b->getInt1Ty(), 2, prefix + "_madeProgress");
     457        mAlreadyProgressedPhi->addIncoming(mPipelineProgress, mKernelEntry);
     458    }
     459}
     460
     461/** ------------------------------------------------------------------------------------------------------------- *
    519462 * @brief initializeKernelCallPhis
    520463 ** ------------------------------------------------------------------------------------------------------------- */
    521464inline void PipelineCompiler::initializeKernelCallPhis(BuilderRef b) {
     465    b->SetInsertPoint(mKernelLoopCall);
    522466    const auto numOfInputs = mKernel->getNumOfStreamInputs();
    523467    Type * const sizeTy = b->getSizeTy();
     
    529473    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    530474    for (unsigned i = 0; i < numOfOutputs; ++i) {
    531         if (LLVM_LIKELY(getOutputBufferType(i) != BufferType::Managed)) {
    532             const Binding & output = mKernel->getOutputStreamSetBinding(i);
    533             const auto prefix = makeBufferName(mKernelIndex, output);
    534             mLinearOutputItemsPhi[i] = b->CreatePHI(sizeTy, 2, prefix + "_linearlyWritable");
    535         }
     475        const Binding & output = mKernel->getOutputStreamSetBinding(i);
     476        const auto prefix = makeBufferName(mKernelIndex, output);
     477        mLinearOutputItemsPhi[i] = b->CreatePHI(sizeTy, 2, prefix + "_linearlyWritable");
     478    }
     479}
     480
     481/** ------------------------------------------------------------------------------------------------------------- *
     482 * @brief initializeKernelTerminatedPhis
     483 ** ------------------------------------------------------------------------------------------------------------- */
     484inline void PipelineCompiler::initializeKernelTerminatedPhis(BuilderRef b) {
     485    b->SetInsertPoint(mKernelTerminated);
     486    const auto numOfInputs = mKernel->getNumOfStreamInputs();
     487    Type * const sizeTy = b->getSizeTy();
     488    for (unsigned i = 0; i < numOfInputs; ++i) {
     489        const Binding & input = mKernel->getInputStreamSetBinding(i);
     490        const auto prefix = makeBufferName(mKernelIndex, input);
     491        mFinalProcessedPhi[i] = b->CreatePHI(sizeTy, 2, prefix + "_finalProcessed");
     492    }
     493    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     494    for (unsigned i = 0; i < numOfOutputs; ++i) {
     495        const Binding & output = mKernel->getOutputStreamSetBinding(i);
     496        const auto prefix = makeBufferName(mKernelIndex, output);
     497        mFinalProducedPhi[i] = b->CreatePHI(sizeTy, 2, prefix + "_finalProduced");
     498    }
     499}
     500
     501/** ------------------------------------------------------------------------------------------------------------- *
     502 * @brief initializeKernelLoopExitPhis
     503 ** ------------------------------------------------------------------------------------------------------------- */
     504inline void PipelineCompiler::initializeKernelLoopExitPhis(BuilderRef b) {
     505    b->SetInsertPoint(mKernelLoopExit);
     506    const auto prefix = makeKernelName(mKernelIndex);
     507    mTerminatedPhi = b->CreatePHI(b->getInt1Ty(), 2, prefix + "_terminated");
     508    if (mPipelineProgress) {
     509        mHasProgressedPhi = b->CreatePHI(b->getInt1Ty(), 2, prefix + "_anyProgress");
     510    }
     511    Type * const sizeTy = b->getSizeTy();
     512    const auto numOfInputs = mKernel->getNumOfStreamInputs();
     513    for (unsigned i = 0; i < numOfInputs; ++i) {
     514        const Binding & input = mKernel->getInputStreamSetBinding(i);
     515        const auto prefix = makeBufferName(mKernelIndex, input);
     516        mUpdatedProcessedPhi[i] = b->CreatePHI(sizeTy, 2, prefix + "_updatedProcessed");
     517        if (mAlreadyProcessedDeferredPhi[i]) {
     518            mUpdatedProcessedDeferredPhi[i] = b->CreatePHI(sizeTy, 2, prefix + "_updatedProcessedDeferred");
     519        }
     520    }
     521    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     522    for (unsigned i = 0; i < numOfOutputs; ++i) {
     523        const Binding & output = mKernel->getOutputStreamSetBinding(i);
     524        const auto prefix = makeBufferName(mKernelIndex, output);
     525        mUpdatedProducedPhi[i] = b->CreatePHI(sizeTy, 2, prefix + "_updatedProduced");
    536526    }
    537527}
     
    541531 ** ------------------------------------------------------------------------------------------------------------- */
    542532inline void PipelineCompiler::initializeKernelExitPhis(BuilderRef b) {
    543     const auto kernelName = makeKernelName(mKernelIndex);
    544     mTerminatedFlag = b->CreatePHI(b->getInt1Ty(), 2, kernelName + "_terminated");
    545     mTerminatedFlag->addIncoming(b->getTrue(), mKernelEntry);
    546     mTerminatedFlag->addIncoming(mTerminatedPhi, mKernelLoopExitPhiCatch);
    547     mTerminationGraph[mKernelIndex] = mTerminatedFlag;
     533    b->SetInsertPoint(mKernelExit);
     534    const auto prefix = makeKernelName(mKernelIndex);
     535    PHINode * const terminated = b->CreatePHI(b->getInt1Ty(), 2, prefix + "_terminated");
     536    terminated->addIncoming(b->getTrue(), mKernelEntry);
     537    terminated->addIncoming(mTerminatedPhi, mKernelLoopExitPhiCatch);
     538    mTerminationGraph[mKernelIndex] = terminated;
    548539    if (mPipelineProgress) {
    549         PHINode * const pipelineProgress = b->CreatePHI(b->getInt1Ty(), 2, "pipelineProgress");
     540        PHINode * const pipelineProgress = b->CreatePHI(b->getInt1Ty(), 2, prefix + "_pipelineProgress");
    550541        pipelineProgress->addIncoming(mPipelineProgress, mKernelEntry);
    551542        pipelineProgress->addIncoming(mHasProgressedPhi, mKernelLoopExitPhiCatch);
     
    553544    }
    554545    createConsumedPhiNodes(b);
     546    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     547    Type * const sizeTy = b->getSizeTy();
     548    for (unsigned i = 0; i < numOfOutputs; ++i) {
     549        const Binding & output = mKernel->getOutputStreamSetBinding(i);
     550        const auto prefix = makeBufferName(mKernelIndex, output);
     551        PHINode * const fullyProduced = b->CreatePHI(sizeTy, 2, prefix + "_fullyProduced");
     552        fullyProduced->addIncoming(mInitiallyProducedItemCount[i], mKernelEntry);
     553        mFullyProducedItemCount[i] = fullyProduced;
     554    }
    555555    createPopCountReferenceCounts(b);
    556556}
    557557
    558558/** ------------------------------------------------------------------------------------------------------------- *
     559 * @brief normalTerminationCheck
     560 ** ------------------------------------------------------------------------------------------------------------- */
     561inline void PipelineCompiler::normalTerminationCheck(BuilderRef b, Value * const isFinal) {
     562    BasicBlock * const entryBlock = b->GetInsertBlock();
     563    if (isFinal) {
     564        const auto numOfInputs = mKernel->getNumOfStreamInputs();
     565        for (unsigned i = 0; i < numOfInputs; ++i) {
     566            mAlreadyProcessedPhi[i]->addIncoming(mProcessedItemCount[i], entryBlock);
     567            if (mAlreadyProcessedDeferredPhi[i]) {
     568                mAlreadyProcessedDeferredPhi[i]->addIncoming(mProcessedDeferredItemCount[i], entryBlock);
     569            }
     570            mFinalProcessedPhi[i]->addIncoming(mProcessedItemCount[i], entryBlock);
     571        }
     572        const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     573        for (unsigned i = 0; i < numOfOutputs; ++i) {
     574            mAlreadyProducedPhi[i]->addIncoming(mProducedItemCount[i], entryBlock);
     575            mFinalProducedPhi[i]->addIncoming(mProducedItemCount[i], entryBlock);
     576        }
     577        if (mAlreadyProgressedPhi) {
     578            mAlreadyProgressedPhi->addIncoming(b->getTrue(), entryBlock);
     579        }
     580        b->CreateUnlikelyCondBr(isFinal, mKernelTerminated, mKernelLoopEntry);
     581    } else { // just exit the loop
     582        const auto numOfInputs = mKernel->getNumOfStreamInputs();
     583        for (unsigned i = 0; i < numOfInputs; ++i) {
     584            mUpdatedProcessedPhi[i]->addIncoming(mProcessedItemCount[i], entryBlock);
     585            if (mUpdatedProcessedDeferredPhi[i]) {
     586                mUpdatedProcessedDeferredPhi[i]->addIncoming(mProcessedDeferredItemCount[i], entryBlock);
     587            }
     588        }
     589        const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     590        for (unsigned i = 0; i < numOfOutputs; ++i) {
     591            mUpdatedProducedPhi[i]->addIncoming(mProducedItemCount[i], entryBlock);
     592        }
     593        if (mHasProgressedPhi) {
     594            mHasProgressedPhi->addIncoming(b->getTrue(), entryBlock);
     595        }
     596        mTerminatedPhi->addIncoming(b->getFalse(), entryBlock);
     597        b->CreateBr(mKernelLoopExit);
     598    }
     599}
     600
     601/** ------------------------------------------------------------------------------------------------------------- *
    559602 * @brief getInitialTerminationSignal
    560603 ** ------------------------------------------------------------------------------------------------------------- */
    561 inline Value * PipelineCompiler::getInitialTerminationSignal(BuilderRef b) const {
     604inline Value * PipelineCompiler::initiallyTerminated(BuilderRef b) const {
    562605    b->setKernel(mPipelineKernel);
    563606    const auto prefix = makeKernelName(mKernelIndex);
     
    580623}
    581624
    582 }
     625/** ------------------------------------------------------------------------------------------------------------- *
     626 * @brief updatePhiCountAfterTermination
     627 ** ------------------------------------------------------------------------------------------------------------- */
     628inline void PipelineCompiler::updatePhisAfterTermination(BuilderRef b) {
     629    BasicBlock * const exitBlock = b->GetInsertBlock();
     630    assert (mTerminatedPhi);
     631    mTerminatedPhi->addIncoming(b->getTrue(), exitBlock);
     632    if (mHasProgressedPhi) {
     633        mHasProgressedPhi->addIncoming(b->getTrue(), exitBlock);
     634    }
     635    const auto numOfInputs = mKernel->getNumOfStreamInputs();
     636    for (unsigned i = 0; i < numOfInputs; ++i) {
     637        // TODO: set these to the total produced item count for that input?
     638        mUpdatedProcessedPhi[i]->addIncoming(mFinalProcessedPhi[i], exitBlock);
     639        if (mUpdatedProcessedDeferredPhi[i]) {
     640            mUpdatedProcessedDeferredPhi[i]->addIncoming(mFinalProcessedPhi[i], exitBlock);
     641        }
     642    }
     643    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     644    for (unsigned i = 0; i < numOfOutputs; ++i) {
     645        mUpdatedProducedPhi[i]->addIncoming(mFinalProducedPhi[i], exitBlock);
     646    }
     647}
     648
     649}
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/kernel_logic.hpp

    r6233 r6249  
    22
    33namespace kernel {
     4
     5/** ------------------------------------------------------------------------------------------------------------- *
     6 * @brief reset
     7 ** ------------------------------------------------------------------------------------------------------------- */
     8template <typename Vec>
     9inline void reset(Vec & vec, const unsigned n) {
     10    vec.resize(n);
     11    std::fill_n(vec.begin(), n, nullptr);
     12}
    413
    514/** ------------------------------------------------------------------------------------------------------------- *
     
    5766    mAccessibleInputItems[inputPort] = accessible;
    5867    BasicBlock * const target = b->CreateBasicBlock(prefix + "_hasInputData", mKernelLoopCall);
    59 
    60     b->CreateLikelyCondBr(sufficientInput, target, mKernelLoopExit);
    61     BasicBlock * const exitBlock = b->GetInsertBlock();
    62     mTerminatedPhi->addIncoming(b->getFalse(), exitBlock);
    63     if (mHasProgressedPhi) {
    64         mHasProgressedPhi->addIncoming(mAlreadyProgressedPhi, exitBlock);
    65     }
    66     b->SetInsertPoint(target);
    67 }
    68 
    69 /** ------------------------------------------------------------------------------------------------------------- *
    70  * @brief getAlreadyProcessedItemCount
    71  ** ------------------------------------------------------------------------------------------------------------- */
    72 Value * PipelineCompiler::getAlreadyProcessedItemCount(BuilderRef b, const unsigned inputPort) {
    73     if (mAlreadyProcessedItemCount[inputPort]) {
    74         return mAlreadyProcessedItemCount[inputPort];
    75     }
    76     const Binding & input = mKernel->getInputStreamSetBinding(inputPort);
    77     Value * const processed = b->getNonDeferredProcessedItemCount(input);
    78     mAlreadyProcessedItemCount[inputPort] = processed;
    79     return processed;
     68    branchToTargetOrLoopExit(b, sufficientInput, target);
    8069}
    8170
     
    9786    const StreamSetBuffer * const buffer = getInputBuffer(inputPort);
    9887    Value * const totalItems = getTotalItemCount(b, inputPort);
    99     Value * const processed = getAlreadyProcessedItemCount(b, inputPort);
     88    Value * const processed = mAlreadyProcessedPhi[inputPort];
    10089    #ifdef PRINT_DEBUG_MESSAGES
    10190    const auto prefix = makeBufferName(mKernelIndex, input);
     
    123112inline void PipelineCompiler::checkForSufficientOutputSpaceOrExpand(BuilderRef b, const unsigned outputPort) {
    124113    // If the buffer is managed by the kernel, ignore it
    125     if (mLinearOutputItemsPhi[outputPort]) {
     114    if (LLVM_LIKELY(getOutputBufferType(outputPort) != BufferType::Managed)) {
    126115        const StreamSetBuffer * const buffer = getOutputBuffer(outputPort);
    127116        Value * const writable = getWritableOutputItems(b, outputPort);
     
    139128            expandOutputBuffer(b, outputPort, check, target);
    140129        } else {
    141             b->CreateLikelyCondBr(check, target, mKernelLoopExit);
    142             BasicBlock * const exitBlock = b->GetInsertBlock();
    143             mTerminatedPhi->addIncoming(b->getFalse(), exitBlock);
    144             if (mHasProgressedPhi) {
    145                 mHasProgressedPhi->addIncoming(mAlreadyProgressedPhi, exitBlock);
    146             }
    147             b->SetInsertPoint(target);
     130            branchToTargetOrLoopExit(b, check, target);
    148131        }
    149132    }
     
    152135/** ------------------------------------------------------------------------------------------------------------- *
    153136 * @brief willNotOverwriteOverflow
     137 *
     138 * check whether the potential overflow copy will overwrite the buffer
    154139 ** ------------------------------------------------------------------------------------------------------------- */
    155140inline Value * PipelineCompiler::willNotOverwriteOverflow(BuilderRef b, const unsigned outputPort) {
    156     if (LLVM_UNLIKELY(requiresCopyBack(getOutputBufferVertex(outputPort)))) { // check whether the potential overflow copy will overwrite the buffer
    157         Value * const produced = getAlreadyProducedItemCount(b, outputPort);
     141    if (LLVM_UNLIKELY(requiresCopyBack(getOutputBufferVertex(outputPort)))) {
     142        Value * const produced = mAlreadyProducedPhi[outputPort];
    158143        Value * const consumed = getConsumedItemCount(b, outputPort);
    159144        Value * const unconsumed = b->CreateSub(produced, consumed);
     
    173158 * @brief branchToTargetOrLoopExit
    174159 ** ------------------------------------------------------------------------------------------------------------- */
    175 inline void PipelineCompiler::branchToTargetOrLoopExit(BuilderRef b, Value * const cond, BasicBlock * const target) {
     160void PipelineCompiler::branchToTargetOrLoopExit(BuilderRef b, Value * const cond, BasicBlock * const target) {
    176161    b->CreateLikelyCondBr(cond, target, mKernelLoopExit);
    177162    BasicBlock * const exitBlock = b->GetInsertBlock();
     
    180165        mHasProgressedPhi->addIncoming(mAlreadyProgressedPhi, exitBlock);
    181166    }
     167    const auto numOfInputs = mKernel->getNumOfStreamInputs();
     168    for (unsigned i = 0; i < numOfInputs; ++i) {
     169        mUpdatedProcessedPhi[i]->addIncoming(mAlreadyProcessedPhi[i], exitBlock);
     170        if (mUpdatedProcessedDeferredPhi[i]) {
     171            mUpdatedProcessedDeferredPhi[i]->addIncoming(mAlreadyProcessedDeferredPhi[i], exitBlock);
     172        }
     173    }
     174    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     175    for (unsigned i = 0; i < numOfOutputs; ++i) {
     176        mUpdatedProducedPhi[i]->addIncoming(mAlreadyProducedPhi[i], exitBlock);
     177    }
    182178    b->SetInsertPoint(target);
    183 }
    184 
    185 /** ------------------------------------------------------------------------------------------------------------- *
    186  * @brief getAlreadyProducedItemCount
    187  ** ------------------------------------------------------------------------------------------------------------- */
    188 Value * PipelineCompiler::getAlreadyProducedItemCount(BuilderRef b, const unsigned outputPort) {
    189     if (mAlreadyProducedItemCount[outputPort]) {
    190         return mAlreadyProducedItemCount[outputPort];
    191     }
    192     const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
    193     Value * const produced = b->getNonDeferredProducedItemCount(output);
    194     mAlreadyProducedItemCount[outputPort] = produced;
    195     return produced;
    196179}
    197180
     
    203186    const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
    204187    const StreamSetBuffer * const buffer = getOutputBuffer(outputPort);
    205     Value * const produced = getAlreadyProducedItemCount(b, outputPort);
    206     Value * const consumed = getConsumedItemCount(b, outputPort);
     188    Value * const produced = mAlreadyProducedPhi[outputPort]; assert (produced);
     189    Value * const consumed = getConsumedItemCount(b, outputPort);  assert (consumed);
    207190    #ifdef PRINT_DEBUG_MESSAGES
    208191    const auto prefix = makeBufferName(mKernelIndex, output);
     
    276259inline Value * PipelineCompiler::getNumOfWritableStrides(BuilderRef b, const unsigned outputPort) {
    277260    Value * numOfStrides = nullptr;
    278     if (mLinearOutputItemsPhi[outputPort]) {
     261    if (LLVM_LIKELY(getOutputBufferType(outputPort) != BufferType::Managed)) {
    279262        const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
    280263        const ProcessingRate & rate = output.getRate();
     
    304287    Value * linearOutputItems[numOfOutputs];
    305288    for (unsigned i = 0; i < numOfInputs; ++i) {
    306         linearInputItems[i] = calculateNumOfLinearItems(b, Port::Input, i);
     289        linearInputItems[i] = calculateNumOfLinearItems(b, mKernel->getInputStreamSetBinding(i));
    307290    }
    308291    for (unsigned i = 0; i < numOfOutputs; ++i) {
    309         linearOutputItems[i] = calculateNumOfLinearItems(b, Port::Output, i);
     292        linearOutputItems[i] = calculateNumOfLinearItems(b, mKernel->getOutputStreamSetBinding(i));
    310293    }
    311294    BasicBlock * const exitBlock = b->GetInsertBlock();
     
    314297    }
    315298    for (unsigned i = 0; i < numOfOutputs; ++i) {
    316         if (mLinearOutputItemsPhi[i]) {
    317             mLinearOutputItemsPhi[i]->addIncoming(linearOutputItems[i], exitBlock);
    318         }
     299        mLinearOutputItemsPhi[i]->addIncoming(linearOutputItems[i], exitBlock);
    319300    }
    320301}
     
    382363
    383364    for (unsigned i = 0; i < numOfOutputs; ++i) {
    384         if (mLinearOutputItemsPhi[i]) {
    385             const Binding & output = mKernel->getOutputStreamSetBinding(i);
    386             const ProcessingRate & rate = output.getRate();
    387             Value * writable = nullptr;
    388             if (rate.isFixed() && minScaledInverseOfAccessibleInput) {
    389                 writable = b->CreateCeilUDiv2(minScaledInverseOfAccessibleInput, rateLCM / rate.getRate());
    390             } else if (rate.isPopCount() || rate.isNegatedPopCount()) {
    391                 writable = getMinimumNumOfLinearPopCountItems(b, output);
    392             } else {
    393                 writable = mWritableOutputItems[i];
     365        const Binding & output = mKernel->getOutputStreamSetBinding(i);
     366        const ProcessingRate & rate = output.getRate();
     367        Value * writable = nullptr;
     368        if (rate.isFixed() && minScaledInverseOfAccessibleInput) {
     369            writable = b->CreateCeilUDiv2(minScaledInverseOfAccessibleInput, rateLCM / rate.getRate());
     370        } else if (rate.isPopCount() || rate.isNegatedPopCount()) {
     371            writable = getMinimumNumOfLinearPopCountItems(b, output);
     372        } else {
     373            writable = mWritableOutputItems[i];
     374        }
     375        // update the final item counts with any Add/RoundUp attributes
     376        for (const Attribute & attr : output.getAttributes()) {
     377            if (attr.isAdd()) {
     378                writable = b->CreateAdd(writable, b->getSize(attr.amount()));
     379            } else if (attr.isRoundUpTo()) {
     380                writable = b->CreateRoundUp(writable, b->getSize(attr.amount()));
    394381            }
    395             // update the final item counts with any Add/RoundUp attributes
    396             for (const Attribute & attr : output.getAttributes()) {
    397                 if (attr.isAdd()) {
    398                     writable = b->CreateAdd(writable, b->getSize(attr.amount()));
    399                 } else if (attr.isRoundUpTo()) {
    400                     writable = b->CreateRoundUp(writable, b->getSize(attr.amount()));
    401                 }
    402             }
    403             pendingItems[i] = writable;
    404         }
     382        }
     383        pendingItems[i] = writable;
    405384    }
    406385
     
    412391    }
    413392    for (unsigned i = 0; i < numOfOutputs; ++i) {
    414         if (mLinearOutputItemsPhi[i]) {
    415             mLinearOutputItemsPhi[i]->addIncoming(pendingItems[i], exitBlock);
    416         }
     393        mLinearOutputItemsPhi[i]->addIncoming(pendingItems[i], exitBlock);
    417394    }
    418395}
     
    423400 ** ------------------------------------------------------------------------------------------------------------- */
    424401inline Value * PipelineCompiler::calculateBufferExpansionSize(BuilderRef b, const unsigned outputPort) {
    425     Value * const produced = getAlreadyProducedItemCount(b, outputPort);
     402    Value * const produced = mAlreadyProducedPhi[outputPort];
    426403    Value * const consumed = getConsumedItemCount(b, outputPort);
    427404    Value * const unconsumed = b->CreateSub(produced, consumed);
     
    467444
    468445/** ------------------------------------------------------------------------------------------------------------- *
     446 * @brief addKernelCallArgument
     447 ** ------------------------------------------------------------------------------------------------------------- */
     448Value * PipelineCompiler::addItemCountArg(BuilderRef b, const Binding & binding,
     449                                          const bool addressable,
     450                                          PHINode * const itemCount,
     451                                          std::vector<Value *> & args) const {
     452    const ProcessingRate & rate = binding.getRate();
     453    if (LLVM_UNLIKELY(rate.isRelative())) {
     454        return nullptr;
     455    }
     456    Value * ptr = nullptr;
     457    if (addressable || rate.isBounded() || rate.isUnknown()) {
     458        ptr = b->CreateAlloca(itemCount->getType());
     459        b->CreateStore(itemCount, ptr);
     460        args.push_back(ptr);
     461    } else {
     462        args.push_back(itemCount);
     463    }
     464    return ptr;
     465}
     466
     467/** ------------------------------------------------------------------------------------------------------------- *
    469468 * @brief writeKernelCall
    470469 ** ------------------------------------------------------------------------------------------------------------- */
     
    477476#warning TODO: send in the # of output items we want in the external buffers
    478477
    479     std::vector<Value *> arguments;
    480     arguments.reserve((numOfInputs + numOfOutputs + 1) * 2);
    481     arguments.push_back(mKernel->getHandle());
    482     arguments.push_back(mNumOfLinearStrides);
     478    b->setKernel(mPipelineKernel);
     479
     480    std::vector<Value *> args;
     481    args.reserve((numOfInputs + numOfOutputs) * 4 + 2);
     482    args.push_back(mKernel->getHandle());
     483    args.push_back(mNumOfLinearStrides);
    483484    for (unsigned i = 0; i < numOfInputs; ++i) {
     485        args.push_back(getLogicalInputBaseAddress(b, i));
    484486        const Binding & input = mKernel->getInputStreamSetBinding(i);
    485         arguments.push_back(getLogicalInputBaseAddress(b, i));
    486         arguments.push_back(mLinearInputItemsPhi[i]);
     487        // calculate the deferred processed item count
     488        PHINode * itemCount = nullptr;
     489        bool deferred = false;
     490        if (mAlreadyProcessedDeferredPhi[i]) {
     491            itemCount = mAlreadyProcessedDeferredPhi[i];
     492            deferred = true;
     493        } else {
     494            itemCount = mAlreadyProcessedPhi[i];
     495        }
     496        mReturnedProcessedItemCountPtr[i] = addItemCountArg(b, input, deferred, itemCount, args);
     497        // calculate how many linear items are from the *deferred* position
     498        Value * linearItemCount = mLinearInputItemsPhi[i];
     499        if (mAlreadyProcessedDeferredPhi[i]) {
     500            Value * diff = b->CreateSub(mAlreadyProcessedPhi[i], mAlreadyProcessedDeferredPhi[i]);
     501            linearItemCount = b->CreateAdd(linearItemCount, diff);
     502        }
     503        args.push_back(linearItemCount);
    487504        if (LLVM_UNLIKELY(input.hasAttribute(AttrId::RequiresPopCountArray))) {
    488             arguments.push_back(getPopCountArray(b, i));
     505            args.push_back(getPopCountArray(b, i));
    489506        }
    490507        if (LLVM_UNLIKELY(input.hasAttribute(AttrId::RequiresNegatedPopCountArray))) {
    491             arguments.push_back(getNegatedPopCountArray(b, i));
    492         }
    493     }
     508            args.push_back(getNegatedPopCountArray(b, i));
     509        }
     510    }
     511
     512    const auto canTerminate = mKernel->canSetTerminateSignal();
    494513
    495514    for (unsigned i = 0; i < numOfOutputs; ++i) {
    496         if (mLinearOutputItemsPhi[i]) {
    497             arguments.push_back(getLogicalOutputBaseAddress(b, i));
    498             arguments.push_back(mLinearOutputItemsPhi[i]);
     515        const auto nonManaged = getOutputBufferType(i) != BufferType::Managed;
     516        if (LLVM_LIKELY(nonManaged)) {
     517            args.push_back(getLogicalOutputBaseAddress(b, i));
     518        }
     519        const Binding & output = mKernel->getOutputStreamSetBinding(i);
     520        PHINode * produced = mAlreadyProducedPhi[i];
     521        mReturnedProducedItemCountPtr[i] = addItemCountArg(b, output, canTerminate, produced, args);
     522        if (LLVM_LIKELY(nonManaged)) {
     523            args.push_back(mLinearOutputItemsPhi[i]);
    499524        }
    500525    }
     
    509534    #endif
    510535
    511 
    512     mTerminationExplicitly = b->CreateCall(getDoSegmentFunction(b), arguments);
    513     if (LLVM_LIKELY(mTerminationExplicitly->getType()->isVoidTy())) {
     536    mTerminationExplicitly = b->CreateCall(getDoSegmentFunction(b), args);
     537    if (LLVM_LIKELY(!canTerminate)) {
    514538        mTerminationExplicitly = b->getFalse();
    515539    }
     
    519543    }
    520544
    521 }
    522 
    523 /** ------------------------------------------------------------------------------------------------------------- *
    524  * @brief computeFullyProcessedItemCounts
    525  ** ------------------------------------------------------------------------------------------------------------- */
    526 void PipelineCompiler::computeFullyProcessedItemCounts(BuilderRef b) {
    527     const auto numOfInputs = mKernel->getNumOfStreamInputs();
    528     mFullyProcessedItemCount.resize(numOfInputs);
     545    // calculate or read the item counts (assuming this kernel did not terminate)
    529546    for (unsigned i = 0; i < numOfInputs; ++i) {
    530547        const Binding & input = mKernel->getInputStreamSetBinding(i);
    531         Value * processed = b->getProcessedItemCount(input.getName());
    532         processed = truncateBlockSize(b, input, processed, mTerminatedPhi);
    533         mFullyProcessedItemCount[i] = processed;
    534         #ifdef PRINT_DEBUG_MESSAGES
    535         const auto prefix = makeBufferName(mKernelIndex, input);
    536         b->CallPrintInt(prefix + "_processed'", processed);
    537         #endif
     548        const ProcessingRate & rate = input.getRate();
     549        if (rate.isFixed() || rate.isPopCount() || rate.isNegatedPopCount()) {
     550            mProcessedItemCount[i] = b->CreateAdd(mAlreadyProcessedPhi[i], mLinearInputItemsPhi[i]);
     551            if (mAlreadyProcessedDeferredPhi[i]) {
     552                assert (mReturnedProcessedItemCountPtr[i]);
     553                mProcessedDeferredItemCount[i] = b->CreateLoad(mReturnedProcessedItemCountPtr[i]);
     554                #ifdef PRINT_DEBUG_MESSAGES
     555                b->CallPrintInt("> " + prefix + "_deferredItemCount", mProcessedDeferredItemCount[i]);
     556                #endif
     557                if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     558                    const auto prefix = makeBufferName(mKernelIndex, input);
     559                    Value * const isDeferred = b->CreateICmpULE(mProcessedDeferredItemCount[i], mProcessedItemCount[i]);
     560                    b->CreateAssert(isDeferred, prefix + ": deferred processed item count exceeds non-deferred");
     561                }
     562            }
     563        } else if (rate.isBounded() || rate.isUnknown()) {
     564            mProcessedItemCount[i] = b->CreateLoad(mReturnedProcessedItemCountPtr[i]);
     565        }
     566    }
     567    for (unsigned i = 0; i < numOfOutputs; ++i) {
     568        const Binding & output = mKernel->getOutputStreamSetBinding(i);
     569        const ProcessingRate & rate = output.getRate();
     570        if (rate.isFixed() || rate.isPopCount() || rate.isNegatedPopCount()) {
     571            mProducedItemCount[i] = b->CreateAdd(mAlreadyProducedPhi[i], mLinearOutputItemsPhi[i]);
     572        } else if (rate.isBounded() || rate.isUnknown()) {
     573            mProducedItemCount[i] = b->CreateLoad(mReturnedProducedItemCountPtr[i]);
     574        }
     575    }
     576    b->setKernel(mKernel);
     577
     578}
     579
     580/** ------------------------------------------------------------------------------------------------------------- *
     581 * @brief loadItemCountsOfCountableRateStreams
     582 ** ------------------------------------------------------------------------------------------------------------- */
     583inline void PipelineCompiler::loadItemCountsOfCountableRateStreams(BuilderRef b) {
     584    const auto numOfInputs = mKernel->getNumOfStreamInputs();
     585    for (unsigned i = 0; i < numOfInputs; i++) {
     586        const Binding & input = mKernel->getInputStreamSetBinding(i);
     587        const ProcessingRate & rate = input.getRate();
     588        if (mReturnedProcessedItemCountPtr[i] && (rate.isFixed() || rate.isPopCount() || rate.isNegatedPopCount())) {
     589            mProcessedItemCount[i] = b->CreateLoad(mReturnedProcessedItemCountPtr[i]);
     590        }
     591    }
     592    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     593    for (unsigned i = 0; i < numOfOutputs; i++) {
     594        const Binding & output = mKernel->getOutputStreamSetBinding(i);
     595        const ProcessingRate & rate = output.getRate();
     596        if (mReturnedProducedItemCountPtr[i] && (rate.isFixed() || rate.isPopCount() || rate.isNegatedPopCount())) {
     597            mProducedItemCount[i] = b->CreateLoad(mReturnedProducedItemCountPtr[i]);
     598        }
     599    }
     600    BasicBlock * const exitBlock = b->GetInsertBlock();
     601    for (unsigned i = 0; i < numOfInputs; i++) {
     602        mFinalProcessedPhi[i]->addIncoming(mProcessedItemCount[i], exitBlock);
     603    }
     604    for (unsigned i = 0; i < numOfOutputs; i++) {
     605        mFinalProducedPhi[i]->addIncoming(mProducedItemCount[i], exitBlock);
    538606    }
    539607}
     
    542610 * @brief zeroFillPartiallyWrittenOutputStreams
    543611 ** ------------------------------------------------------------------------------------------------------------- */
    544 void PipelineCompiler::zeroFillPartiallyWrittenOutputStreams(BuilderRef b) {
     612inline void PipelineCompiler::zeroFillPartiallyWrittenOutputStreams(BuilderRef /* b */) {
    545613
    546614    // TODO: this ought to check what streams have a lookahead dependency on this and make sure that it
     
    600668
    601669/** ------------------------------------------------------------------------------------------------------------- *
    602  * @brief incrementItemCountsOfCountableRateStreams
    603  ** ------------------------------------------------------------------------------------------------------------- */
    604 inline void PipelineCompiler::incrementItemCountsOfCountableRateStreams(BuilderRef b) {
     670 * @brief computeFullyProcessedItemCounts
     671 ** ------------------------------------------------------------------------------------------------------------- */
     672inline void PipelineCompiler::computeFullyProcessedItemCounts(BuilderRef b) {
    605673    const auto numOfInputs = mKernel->getNumOfStreamInputs();
    606     for (unsigned i = 0; i < numOfInputs; i++) {
     674    for (unsigned i = 0; i < numOfInputs; ++i) {
    607675        const Binding & input = mKernel->getInputStreamSetBinding(i);
    608         const ProcessingRate & rate = input.getRate();
    609         if (rate.isFixed() || rate.isPopCount() || rate.isNegatedPopCount()) {
    610             Value * const processed = getAlreadyProcessedItemCount(b, i);
    611             Value * const items = b->CreateAdd(processed, mLinearInputItemsPhi[i]);
    612             b->setNonDeferredProcessedItemCount(input, items);
    613         }
    614     }
     676        Value * processed = nullptr;
     677        if (mUpdatedProcessedDeferredPhi[i]) {
     678            processed = mUpdatedProcessedDeferredPhi[i];
     679        } else {
     680            processed = mUpdatedProcessedPhi[i];
     681        }
     682        processed = truncateBlockSize(b, input, processed, mTerminatedPhi);
     683        mFullyProcessedItemCount[i] = processed;
     684    }
     685}
     686
     687/** ------------------------------------------------------------------------------------------------------------- *
     688 * @brief computeFullyProducedItemCounts
     689 ** ------------------------------------------------------------------------------------------------------------- */
     690inline void PipelineCompiler::computeFullyProducedItemCounts(BuilderRef b) {
     691
     692    // TODO: we only need to consider the blocksize attribute if it's possible this
     693    // stream could be read before being fully written. This might occur if one of
     694    // its consumers has a non-Fixed rate that does not have a matching BlockSize
     695    // attribute.
     696
    615697    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    616     for (unsigned i = 0; i < numOfOutputs; i++) {
    617         if (mLinearOutputItemsPhi[i]) {
    618             const Binding & output = mKernel->getOutputStreamSetBinding(i);
    619             const ProcessingRate & rate = output.getRate();
    620             if (rate.isFixed() || rate.isPopCount() || rate.isNegatedPopCount()) {
    621                 Value * const produced = getAlreadyProducedItemCount(b, i);
    622                 Value * const items = b->CreateAdd(produced, mLinearOutputItemsPhi[i]);
    623                 b->setNonDeferredProducedItemCount(output, items);
    624             }
    625         }
    626     }
    627 
    628     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    629         for (unsigned i = 0; i < numOfInputs; ++i) {
    630             const Binding & input = mKernel->getInputStreamSetBinding(i);
    631             const ProcessingRate & rate = input.getRate();
    632             if (rate.isBounded() || rate.isUnknown()) {
    633                 Value * const processed = getAlreadyProcessedItemCount(b, i);
    634                 Value * const expected = b->CreateAdd(processed, mLinearInputItemsPhi[i]);
    635                 itemCountSanityCheck(b, input, "processed", processed, expected);
    636             }
    637         }
    638         for (unsigned i = 0; i < numOfOutputs; ++i) {
    639             if (mLinearOutputItemsPhi[i]) {
    640                 const Binding & output = mKernel->getOutputStreamSetBinding(i);
    641                 const ProcessingRate & rate = output.getRate();
    642                 if (rate.isBounded() || rate.isUnknown()) {
    643                     Value * const produced = getAlreadyProducedItemCount(b, i);
    644                     Value * const expected = b->CreateAdd(produced, mLinearOutputItemsPhi[i]);
    645                     itemCountSanityCheck(b, output, "produced", produced, expected);
    646                 }
    647             }
    648         }
    649     }
    650 
    651 }
    652 
    653 /** ------------------------------------------------------------------------------------------------------------- *
    654  * @brief itemCountSanityCheck
    655  ** ------------------------------------------------------------------------------------------------------------- */
    656 void PipelineCompiler::itemCountSanityCheck(BuilderRef b, const Binding & binding,
    657                                             const std::string & label,
    658                                             Value * const itemCount, Value * const expected) {
    659 
    660     const auto prefix = makeBufferName(mKernelIndex, binding);
    661     const auto lb = mKernel->getLowerBound(binding);
    662     if (lb > 0 && !binding.hasAttribute(AttrId::Deferred)) {
    663         Constant * const strideSize = b->getSize(ceiling(lb * mKernel->getStride()));
    664         Value * hasEnough = b->CreateICmpULE(itemCount, strideSize);
    665         hasEnough = b->CreateOr(hasEnough, mTerminationExplicitly);
    666         b->CreateAssert(hasEnough, prefix + " " + label + " fewer items than expected");
    667     }
    668     Value * const withinBounds = b->CreateICmpULE(itemCount, expected);
    669     b->CreateAssert(withinBounds, prefix + " " + label + " more items than expected");
    670 
    671 }
    672 
     698    for (unsigned i = 0; i < numOfOutputs; ++i) {
     699        const Binding & output = mKernel->getOutputStreamSetBinding(i);
     700        Value * produced = truncateBlockSize(b, output, mUpdatedProducedPhi[i], mTerminatedPhi);
     701        mFullyProducedItemCount[i]->addIncoming(produced, mKernelLoopExitPhiCatch);
     702    }
     703}
    673704
    674705/** ------------------------------------------------------------------------------------------------------------- *
     
    722753 * @brief getMaximumStrideLength
    723754 ** ------------------------------------------------------------------------------------------------------------- */
    724 inline Value * PipelineCompiler::getMaximumStrideLength(BuilderRef b, const Port port, const unsigned portNum) {
    725     const Binding & binding = getBinding(mKernel, port, portNum);
     755inline Value * PipelineCompiler::getMaximumStrideLength(BuilderRef b, const Kernel * kernel, const Binding & binding) {
    726756    const ProcessingRate & rate = binding.getRate();
    727757    if (LLVM_LIKELY(rate.isFixed() || rate.isBounded() || rate.isPopCount() || rate.isNegatedPopCount())) {
    728         return b->getSize(ceiling(mKernel->getUpperBound(binding) * mKernel->getStride()));
     758        return b->getSize(ceiling(kernel->getUpperBound(binding) * kernel->getStride()));
    729759    } else if (LLVM_LIKELY(rate.isUnknown())) {
    730760        return b->getSize(0);
    731761    } else if (rate.isRelative()) {
    732         Port refPort; unsigned refPortNum;
    733         std::tie(refPort, refPortNum) = mKernel->getStreamPort(rate.getReference());
    734         Value * const baseRate = getMaximumStrideLength(b, refPort, refPortNum);
     762        const Binding & ref = kernel->getStreamBinding(rate.getReference());
     763        Value * const baseRate = getMaximumStrideLength(b, kernel, ref);
    735764        return b->CreateMul2(baseRate, rate.getRate());
    736765    }
     
    741770 * @brief calculateNumOfLinearItems
    742771 ** ------------------------------------------------------------------------------------------------------------- */
    743 inline Value * PipelineCompiler::calculateNumOfLinearItems(BuilderRef b, const Port portType,  const unsigned portNum) {
    744     const Binding & binding = getBinding(mKernel, portType, portNum);
     772inline Value * PipelineCompiler::calculateNumOfLinearItems(BuilderRef b, const Binding & binding) {
    745773    const ProcessingRate & rate = binding.getRate();
    746774    if (rate.isFixed() || rate.isBounded()) {
     
    749777        return getNumOfLinearPopCountItems(b, binding);
    750778    } else if (rate.isRelative()) {
    751         Port refPort; unsigned refPortNum;
    752         std::tie(refPort, refPortNum) = mKernel->getStreamPort(rate.getReference());
    753         Value * const baseCount = calculateNumOfLinearItems(b, refPort, refPortNum);
     779        const Binding & ref = mKernel->getStreamBinding(rate.getReference());
     780        Value * const baseCount = calculateNumOfLinearItems(b, ref);
    754781        return b->CreateMul2(baseCount, rate.getRate());
    755782    }
     
    808835}
    809836
    810 
    811837/** ------------------------------------------------------------------------------------------------------------- *
    812838 * @brief getInitializationFunction
     
    856882}
    857883
    858 template <typename Vec>
    859 inline void reset(Vec & vec, const unsigned n) {
    860     vec.resize(n);
    861     std::fill_n(vec.begin(), n, nullptr);
    862 }
     884#if 0
     885
     886/** ------------------------------------------------------------------------------------------------------------- *
     887 * @brief verifyInputItemCount
     888 ** ------------------------------------------------------------------------------------------------------------- */
     889inline void PipelineCompiler::verifyInputItemCount(BuilderRef b, Value * processed, const unsigned inputPort) const {
     890    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     891        const Binding & input = mKernel->getInputStreamSetBinding(inputPort);
     892        Value * const expected = b->CreateAdd(mAlreadyProcessedPhi[inputPort], mLinearInputItemsPhi[inputPort]);
     893        itemCountSanityCheck(b, input, "processed", processed, expected);
     894    }
     895}
     896
     897/** ------------------------------------------------------------------------------------------------------------- *
     898 * @brief verifyOutputItemCount
     899 ** ------------------------------------------------------------------------------------------------------------- */
     900inline void PipelineCompiler::verifyOutputItemCount(BuilderRef b, Value * produced, const unsigned outputPort) const {
     901    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     902        const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
     903        Value * const expected = b->CreateAdd(mAlreadyProducedPhi[outputPort], mLinearOutputItemsPhi[outputPort]);
     904        itemCountSanityCheck(b, output, "produced", produced, expected);
     905    }
     906}
     907
     908
     909/** ------------------------------------------------------------------------------------------------------------- *
     910 * @brief itemCountSanityCheck
     911 ** ------------------------------------------------------------------------------------------------------------- */
     912void PipelineCompiler::itemCountSanityCheck(BuilderRef b, const Binding & binding,
     913                                            const std::string & label,
     914                                            Value * const itemCount, Value * const expected) const {
     915
     916    const auto prefix = makeBufferName(mKernelIndex, binding);
     917    const auto lb = mKernel->getLowerBound(binding);
     918    if (lb > 0 && !binding.hasAttribute(AttrId::Deferred)) {
     919        Constant * const strideSize = b->getSize(ceiling(lb * mKernel->getStride()));
     920        Value * hasEnough = b->CreateICmpULE(itemCount, strideSize);
     921        hasEnough = b->CreateOr(hasEnough, mTerminationExplicitly);
     922        b->CreateAssert(hasEnough, prefix + " " + label + " fewer items than expected");
     923    }
     924    Value * const withinBounds = b->CreateICmpULE(itemCount, expected);
     925    b->CreateAssert(withinBounds, prefix + " " + label + " more items than expected");
     926
     927}
     928
     929#endif
    863930
    864931/** ------------------------------------------------------------------------------------------------------------- *
     
    867934void PipelineCompiler::resetMemoizedFields() {
    868935    const auto numOfInputs = mKernel->getNumOfStreamInputs();
    869     reset(mAlreadyProcessedItemCount, numOfInputs);
     936    reset(mInitiallyProcessedItemCount, numOfInputs);
     937    reset(mInitiallyProcessedDeferredItemCount, numOfInputs);
     938    reset(mAlreadyProcessedPhi, numOfInputs);
     939    reset(mAlreadyProcessedDeferredPhi, numOfInputs);
    870940    reset(mInputStrideLength, numOfInputs);
    871941    reset(mAccessibleInputItems, numOfInputs);
    872942    reset(mLinearInputItemsPhi, numOfInputs);
     943    reset(mReturnedProcessedItemCountPtr, numOfInputs);
     944    reset(mProcessedItemCount, numOfInputs);
     945    reset(mProcessedDeferredItemCount, numOfInputs);
     946    reset(mFinalProcessedPhi, numOfInputs);
     947    reset(mUpdatedProcessedPhi, numOfInputs);
     948    reset(mUpdatedProcessedDeferredPhi, numOfInputs);
    873949    reset(mFullyProcessedItemCount, numOfInputs);
    874950    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    875951    reset(mInitiallyProducedItemCount, numOfOutputs);
    876     reset(mAlreadyProducedItemCount, numOfOutputs);
     952    reset(mAlreadyProducedPhi, numOfOutputs);
    877953    reset(mOutputStrideLength, numOfOutputs);
    878954    reset(mWritableOutputItems, numOfOutputs);
    879955    reset(mLinearOutputItemsPhi, numOfOutputs);
    880 }
    881 
    882 
    883 }
     956    reset(mReturnedProducedItemCountPtr, numOfOutputs);
     957    reset(mProducedItemCount, numOfOutputs);
     958    reset(mFinalProducedPhi, numOfOutputs);
     959    reset(mUpdatedProducedPhi, numOfOutputs);
     960    reset(mFullyProducedItemCount, numOfOutputs);
     961}
     962
     963
     964}
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/pipeline_compiler.hpp

    r6241 r6249  
    161161using PopCountGraph = adjacency_list<vecS, vecS, bidirectionalS, no_property, PopCountEdge>;
    162162
     163const static std::string LOGICAL_SEGMENT_SUFFIX = ".LSN";
     164const static std::string ITEM_COUNT_SUFFIX = ".IC";
     165const static std::string DEFERRED_ITEM_COUNT_SUFFIX = ".ICD";
     166
    163167class PipelineCompiler {
    164168public:
     
    183187    void start(BuilderRef b, Value * const initialSegNo);
    184188    void setActiveKernel(BuilderRef b, const unsigned index);
    185     void synchronize(BuilderRef b);
     189    void acquireCurrentSegment(BuilderRef b);
    186190    void executeKernel(BuilderRef b);
    187191    void end(BuilderRef b, const unsigned step);
     
    201205// inter-kernel functions
    202206
     207    void readInitialItemCounts(BuilderRef b);
     208
     209    void initializeKernelLoopEntryPhis(BuilderRef b);
     210    void initializeKernelCallPhis(BuilderRef b);
     211    void initializeKernelTerminatedPhis(BuilderRef b);
     212    void initializeKernelLoopExitPhis(BuilderRef b);
     213    void initializeKernelExitPhis(BuilderRef b);
     214
    203215    void checkForSufficientInputDataAndOutputSpace(BuilderRef b);
     216    void branchToTargetOrLoopExit(BuilderRef b, Value * const cond, BasicBlock * target);
    204217    void determineNumOfLinearStrides(BuilderRef b);
    205218    void calculateNonFinalItemCounts(BuilderRef b);
    206219    void calculateFinalItemCounts(BuilderRef b);
     220    Value * addItemCountArg(BuilderRef b, const Binding & binding, const bool addressable, PHINode * const itemCount, std::vector<Value *> & args) const;
     221
    207222    void writeKernelCall(BuilderRef b);
    208     void computeFullyProcessedItemCounts(BuilderRef b);
     223
     224    void normalTerminationCheck(BuilderRef b, Value * const isFinal);
     225
    209226    void writeCopyBackLogic(BuilderRef b);
    210227    void writeCopyForwardLogic(BuilderRef b);
     228    enum class OverflowCopy { Forwards, Backwards };
     229    Value * writeOverflowCopy(BuilderRef b, const StreamSetBuffer * const buffer, const OverflowCopy direction, Value * const itemsToCopy) const;
     230
     231
     232    void computeFullyProcessedItemCounts(BuilderRef b);
     233    void computeFullyProducedItemCounts(BuilderRef b);
     234
     235    void updatePhisAfterTermination(BuilderRef b);
    211236
    212237    void zeroFillPartiallyWrittenOutputStreams(BuilderRef b);
    213     void initializeKernelCallPhis(BuilderRef b);
    214     void initializeKernelExitPhis(BuilderRef b);
    215 
    216     void readInitialProducedItemCounts(BuilderRef b);
     238
    217239    void computeMinimumConsumedItemCounts(BuilderRef b);
    218240    void writeFinalConsumedItemCounts(BuilderRef b);
     
    222244    void checkForSufficientInputData(BuilderRef b, const unsigned inputPort);
    223245    void checkForSufficientOutputSpaceOrExpand(BuilderRef b, const unsigned outputPort);
    224     void incrementItemCountsOfCountableRateStreams(BuilderRef b);
    225     enum class OverflowCopy { Forwards, Backwards };
    226     Value * writeOverflowCopy(BuilderRef b, const StreamSetBuffer * const buffer, const OverflowCopy direction, Value * const itemsToCopy) const;
    227 
     246
     247    void loadItemCountsOfCountableRateStreams(BuilderRef b);
     248
     249    void writeUpdatedItemCounts(BuilderRef b);
    228250
    229251// intra-kernel functions
    230252
    231     void branchToTargetOrLoopExit(BuilderRef b, Value * const cond, BasicBlock * const target);
    232253    void expandOutputBuffers(BuilderRef b);
    233254    void expandOutputBuffer(BuilderRef b, const unsigned outputPort, Value * const hasEnough, BasicBlock * const target);
    234 
    235     Value * getAlreadyProcessedItemCount(BuilderRef b, const unsigned inputPort);
    236     Value * getAlreadyProducedItemCount(BuilderRef b, const unsigned outputPort);
    237255
    238256    Value * getInputStrideLength(BuilderRef b, const unsigned inputPort);
    239257    Value * getOutputStrideLength(BuilderRef b, const unsigned outputPort);
    240258    Value * getInitialStrideLength(BuilderRef b, const Port port, const unsigned portNum);
    241     Value * getMaximumStrideLength(BuilderRef b, const Port port, const unsigned portNum);
    242     Value * calculateNumOfLinearItems(BuilderRef b, const Port portType,  const unsigned portNum);
     259    static Value * getMaximumStrideLength(BuilderRef b, const Kernel * kernel, const Binding & binding);
     260    Value * calculateNumOfLinearItems(BuilderRef b, const Binding & binding);
    243261    Value * getAccessibleInputItems(BuilderRef b, const unsigned inputPort);
    244262    Value * getNumOfAccessibleStrides(BuilderRef b, const unsigned inputPort);
     
    253271    Value * getTotalItemCount(BuilderRef b, const unsigned inputPort) const;
    254272    Value * hasProducerTerminated(BuilderRef b, const unsigned inputPort) const;
    255     Value * getInitialTerminationSignal(BuilderRef b) const;
     273    Value * initiallyTerminated(BuilderRef b) const;
    256274    void setTerminated(BuilderRef b, Value * const terminated);
    257275    void resetMemoizedFields();
     
    381399    void writeOutputScalars(BuilderRef b, const unsigned u, std::vector<Value *> & args);
    382400
     401    void verifyInputItemCount(BuilderRef b, Value * processed, const unsigned inputPort) const;
     402
     403    void verifyOutputItemCount(BuilderRef b, Value * produced, const unsigned outputPort) const;
     404
    383405    void itemCountSanityCheck(BuilderRef b, const Binding & binding, const std::string & pastLabel,
    384                               Value * const itemCount, Value * const expected);
     406                              Value * const itemCount, Value * const expected) const;
     407
     408
    385409
    386410protected:
     
    395419    // pipeline state
    396420    PHINode *                                   mTerminatedPhi = nullptr;
    397     PHINode *                                   mTerminatedFlag = nullptr;
    398421    PHINode *                                   mSegNo = nullptr;
    399422    BasicBlock *                                mPipelineLoop = nullptr;
     
    401424    BasicBlock *                                mKernelLoopEntry = nullptr;
    402425    BasicBlock *                                mKernelLoopCall = nullptr;
     426    BasicBlock *                                mKernelTerminationCheck = nullptr;
     427    BasicBlock *                                mKernelTerminated = nullptr;
    403428    BasicBlock *                                mKernelLoopExit = nullptr;
    404429    BasicBlock *                                mKernelLoopExitPhiCatch = nullptr;
     
    413438    std::vector<unsigned>                       mPortOrdering;
    414439
    415     std::vector<Value *>                        mAlreadyProcessedItemCount; // entering the stride
     440    std::vector<Value *>                        mInitiallyProcessedItemCount; // *before* entering the kernel
     441    std::vector<Value *>                        mInitiallyProcessedDeferredItemCount;
     442    std::vector<PHINode *>                      mAlreadyProcessedPhi; // entering the segment loop
     443    std::vector<PHINode *>                      mAlreadyProcessedDeferredPhi;
    416444    std::vector<Value *>                        mInputStrideLength;
    417445    std::vector<Value *>                        mAccessibleInputItems;
    418446    std::vector<PHINode *>                      mLinearInputItemsPhi;
    419     std::vector<Value *>                        mFullyProcessedItemCount; // exiting the kernel
    420 
    421     std::vector<Value *>                        mInitiallyProducedItemCount; // entering the *kernel*
    422     std::vector<Value *>                        mAlreadyProducedItemCount; // entering the stride
     447    std::vector<Value *>                        mReturnedProcessedItemCountPtr; // written by the kernel
     448    std::vector<Value *>                        mProcessedItemCount; // exiting the segment loop
     449    std::vector<Value *>                        mProcessedDeferredItemCount;
     450    std::vector<PHINode *>                      mFinalProcessedPhi; // exiting after termination
     451    std::vector<PHINode *>                      mUpdatedProcessedPhi; // exiting the kernel
     452    std::vector<PHINode *>                      mUpdatedProcessedDeferredPhi;
     453    std::vector<Value *>                        mFullyProcessedItemCount; // *after* exiting the kernel
     454
     455    std::vector<Value *>                        mInitiallyProducedItemCount; // *before* entering the kernel
     456    std::vector<PHINode *>                      mAlreadyProducedPhi; // entering the segment loop
    423457    std::vector<Value *>                        mOutputStrideLength;
    424458    std::vector<Value *>                        mWritableOutputItems;
    425459    std::vector<PHINode *>                      mLinearOutputItemsPhi;
     460    std::vector<Value *>                        mReturnedProducedItemCountPtr; // written by the kernel
     461    std::vector<Value *>                        mProducedItemCount; // exiting the segment loop
     462    std::vector<PHINode *>                      mFinalProducedPhi; // exiting after termination
     463    std::vector<PHINode *>                      mUpdatedProducedPhi; // exiting the kernel
     464    std::vector<PHINode *>                      mFullyProducedItemCount; // *after* exiting the kernel
     465
    426466
    427467    // debug + misc state
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/pipeline_kernel.cpp

    r6237 r6249  
    5858        }
    5959
    60         Value * const init = getInitFunction(m);
    61         assert (cast<FunctionType>(init->getType()->getPointerElementType())->getNumParams() == args.size());
    62         b->CreateCall(init, args);
     60        b->CreateCall(getInitFunction(m), args);
    6361    }
    6462}
     
    179177    b->setKernel(this);
    180178
    181     // maintain consistency with the Kernel interface
    182     const auto numOfDoSegArgs = (getNumOfStreamInputs() + getNumOfStreamOutputs()) * 2;
    183     std::vector<Type *> paramTypes;
    184     paramTypes.reserve(numOfDoSegArgs + getNumOfScalarInputs());
    185     for (const auto & input : getInputStreamSetBuffers()) {
    186         paramTypes.push_back(input->getType()->getPointerTo());
    187         paramTypes.push_back(b->getSizeTy());
    188     }
    189     for (const auto & output : getOutputStreamSetBuffers()) {
    190         paramTypes.push_back(output->getType()->getPointerTo());
    191         paramTypes.push_back(b->getSizeTy());
    192     }
     179    Module * const m = b->getModule();
     180    Function * const doSegment = getDoSegmentFunction(m);
     181    assert (doSegment->arg_size() >= 2);
     182    const auto numOfDoSegArgs = doSegment->arg_size() - 2;
     183    Function * const terminate = getTerminateFunction(m);
     184
     185    // maintain consistency with the Kernel interface by passing first the stream sets
     186    // and then the scalars.
     187    std::vector<Type *> params;
     188    params.reserve(numOfDoSegArgs + getNumOfScalarInputs());
     189
     190    // the first two params of doSegment are its handle and numOfStrides; the remaining
     191    // are the stream set params
     192    auto doSegParam = doSegment->arg_begin(); ++doSegParam;
     193    const auto doSegEnd = doSegment->arg_end();
     194    while (++doSegParam != doSegEnd) {
     195        params.push_back(doSegParam->getType());
     196    }
     197
    193198    for (const auto & input : getInputScalarBindings()) {
    194         if (LLVM_LIKELY(!input.hasAttribute(AttrId::Family))) {
    195             paramTypes.push_back(input.getType());
    196         }
    197     }
    198     const auto numOfInitArgs = (paramTypes.size() - numOfDoSegArgs);
     199        if (!input.hasAttribute(AttrId::Family)) {
     200            params.push_back(input.getType());
     201        }
     202    }
     203
     204    const auto numOfInitArgs = params.size() - numOfDoSegArgs;
    199205
    200206    // get the finalize method output type and set its return type as this function's return type
    201     Module * const m = b->getModule();
    202     Function * const tf = getTerminateFunction(m);
    203     FunctionType * const mainFunctionType = FunctionType::get(tf->getReturnType(), paramTypes, false);
    204 
    205     auto linkageType = (method == AddInternal) ? Function::InternalLinkage : Function::ExternalLinkage;
     207    FunctionType * const mainFunctionType = FunctionType::get(terminate->getReturnType(), params, false);
     208
     209    const auto linkageType = (method == AddInternal) ? Function::InternalLinkage : Function::ExternalLinkage;
    206210
    207211    Function * const main = Function::Create(mainFunctionType, linkageType, getName() + "_main", m);
     
    217221        setHandle(b, handle);
    218222
    219         std::vector<Value *> segmentArgs;
    220         segmentArgs.reserve(numOfDoSegArgs + 2);
    221         segmentArgs.push_back(handle);
    222         segmentArgs.push_back(b->getSize(0));
     223        std::vector<Value *> segmentArgs(doSegment->arg_size());
     224        segmentArgs[0] = handle;
     225        segmentArgs[1] = b->getSize(0);
    223226        for (unsigned i = 0; i < numOfDoSegArgs; ++i) {
    224             segmentArgs.push_back(&*arg++);
    225         }
    226 
    227         std::vector<Value *> initArgs;
    228         initArgs.reserve(numOfInitArgs + 1);
    229         initArgs.push_back(handle);
     227            assert (arg != main->arg_end());
     228            segmentArgs[i + 2] = &*arg++;
     229        }
     230
     231        std::vector<Value *> initArgs(numOfInitArgs + 1);
     232        initArgs[0] = handle;
    230233        for (unsigned i = 0; i < numOfInitArgs; ++i) {
    231             initArgs.push_back(&*arg++);
     234            assert (arg != main->arg_end());
     235            initArgs[i + 1] = &*arg++;
    232236        }
    233237        assert (arg == main->arg_end());
    234 
    235238        // initialize the kernel
    236239        initializeInstance(b, initArgs);
    237 
    238240        // call the pipeline kernel
    239         Function * const doSegment = getDoSegmentFunction(m);
    240         assert (doSegment->getFunctionType()->getNumParams() == segmentArgs.size());
    241241        b->CreateCall(doSegment, segmentArgs);
    242 
    243242        // call and return the final output value(s)
    244243        b->CreateRet(finalizeInstance(b));
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/pipeline_logic.hpp

    r6244 r6249  
    9494    for (unsigned i = 0; i < mPipeline.size(); ++i) {
    9595        setActiveKernel(b, i);
    96         synchronize(b);
     96        acquireCurrentSegment(b);
    9797        executeKernel(b);
    9898        releaseCurrentSegment(b);
     
    117117}
    118118
     119
     120/** ------------------------------------------------------------------------------------------------------------- *
     121 * @brief acquireCurrentSegment
     122 *
     123 * Before the segment is processed, this loads the segment number of the kernel state and ensures the previous
     124 * segment is complete (by checking that the acquired segment number is equal to the desired segment number).
     125 ** ------------------------------------------------------------------------------------------------------------- */
     126void PipelineCompiler::acquireCurrentSegment(BuilderRef b) {
     127
     128    b->setKernel(mPipelineKernel);
     129    const auto prefix = makeKernelName(mKernelIndex);
     130    const auto serialize = codegen::DebugOptionIsSet(codegen::SerializeThreads);
     131    const unsigned waitingOnIdx = serialize ? (mPipeline.size() - 1) : mKernelIndex;
     132    const auto waitingOn = makeKernelName(waitingOnIdx);
     133    Value * const waitingOnPtr = b->getScalarFieldPtr(waitingOn + LOGICAL_SEGMENT_SUFFIX);
     134    BasicBlock * const kernelWait = b->CreateBasicBlock(prefix + "Wait", mPipelineEnd);
     135    b->CreateBr(kernelWait);
     136
     137    b->SetInsertPoint(kernelWait);
     138    Value * const processedSegmentCount = b->CreateAtomicLoadAcquire(waitingOnPtr);
     139    assert (processedSegmentCount->getType() == mSegNo->getType());
     140    Value * const ready = b->CreateICmpEQ(mSegNo, processedSegmentCount);
     141    BasicBlock * const kernelStart = b->CreateBasicBlock(prefix + "Start", mPipelineEnd);
     142    b->CreateCondBr(ready, kernelStart, kernelWait);
     143
     144    b->SetInsertPoint(kernelStart);
     145    b->setKernel(mKernel);
     146}
     147
     148/** ------------------------------------------------------------------------------------------------------------- *
     149 * @brief releaseCurrentSegment
     150 *
     151 * After executing the kernel, the segment number must be incremented to release the kernel for the next thread.
     152 ** ------------------------------------------------------------------------------------------------------------- */
     153inline void PipelineCompiler::releaseCurrentSegment(BuilderRef b) {
     154    b->setKernel(mPipelineKernel);
     155    Value * const nextSegNo = b->CreateAdd(mSegNo, b->getSize(1));
     156    const auto prefix = makeKernelName(mKernelIndex);
     157    Value * const waitingOnPtr = b->getScalarFieldPtr(prefix + LOGICAL_SEGMENT_SUFFIX);
     158    b->CreateAtomicStoreRelease(nextSegNo, waitingOnPtr);
     159}
     160
    119161enum : int {
    120162    HANDLE_INDEX = 0
     
    296338
    297339#endif // PIPELINE_LOGIC_HPP
    298 
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/popcount_logic.hpp

    r6228 r6249  
    5959        Value * const consumed = getPopCountReferenceConsumedCount(b, bufferVertex);
    6060        Value * const startIndex = b->CreateLShr(consumed, LOG2_BLOCK_WIDTH);
    61         Value * const produced = b->getNonDeferredProducedItemCount(output);
     61        Value * const produced = mUpdatedProducedPhi[bufferPort];
    6262        // If this is the producer's final stride, round the index position up
    6363        // to account for a partial stride.
     
    474474inline Value * PipelineCompiler::getReferenceStreamOffset(BuilderRef b, const Binding & binding) {
    475475    const auto refPortNum = getPopCountReferencePort(mKernel, binding.getRate());
    476     Value * const itemCount = getAlreadyProcessedItemCount(b, refPortNum);
     476    Value * const itemCount = mAlreadyProcessedPhi[refPortNum];
    477477    Value * const strideLength = getInputStrideLength(b, refPortNum);
    478478    return b->CreateUDiv(itemCount, strideLength);
     
    574574    indices[2] = b->getInt32(BASE_OFFSET_INDEX);
    575575    Value * const baseOffset = b->CreateLoad(b->CreateGEP(mPopCountState, indices));
    576     Value * const processed = getAlreadyProcessedItemCount(b, inputPort);
     576    Value * const processed = mAlreadyProcessedPhi[inputPort];
    577577    Constant * const LOG2_COUNT_WIDTH = getLog2BlockWidth(b);
    578578    Value * const processedOffset = b->CreateLShr(processed, LOG2_COUNT_WIDTH);
  • icGREP/icgrep-devel/icgrep/kernels/scanmatchgen.cpp

    r6184 r6249  
    2424
    2525    Module * const m = b->getModule();
    26    
     26
    2727    BasicBlock * const entryBlock = b->GetInsertBlock();
    2828    BasicBlock * const initialBlock = b->CreateBasicBlock("initialBlock");
     
    200200
    201201    b->SetInsertPoint(callFinalizeScan);
    202     b->setProcessedItemCount("InputStream", b->CreateAdd(avail, scanwordPos));
    203202    Function * finalizer = m->getFunction("finalize_match_wrapper"); assert (finalizer);
    204     Value * const buffer_base = b->getRawInputPointer("InputStream", ZERO);
    205     Value * buffer_end_address = b->CreateGEP(buffer_base, avail);
    206     b->CreateCall(finalizer, {accumulator, buffer_end_address});
     203    Value * const bufferEnd = b->getRawInputPointer("InputStream", avail);
     204    b->CreateCall(finalizer, {accumulator, bufferEnd});
    207205    b->CreateBr(scanReturn);
    208206
  • icGREP/icgrep-devel/icgrep/kernels/source_kernel.cpp

    r6241 r6249  
    4141    IntegerType * const sizeTy = b->getSizeTy();
    4242    ConstantInt * const STRIDE_SIZE = b->getSize(stride);
    43     Constant * const PAGE_ITEMS = b->getSize(stride /(codeUnitWidth/8));
     43    Constant * const PAGE_ITEMS = b->getSize((8 * stride) / codeUnitWidth);
    4444    Value * const fd = b->getScalarField("fileDescriptor");
    45 
    4645    PointerType * const codeUnitPtrTy = b->getIntNTy(codeUnitWidth)->getPointerTo();
    4746    b->setScalarField("ancillaryBuffer", ConstantPointerNull::get(codeUnitPtrTy));
     
    8281    BasicBlock * const exit = b->CreateBasicBlock("mmapSourceExit");
    8382
    84     Constant * const MMAP_PAGE_SIZE = b->getSize(getPageSize());
    85 
    86     Constant * const STRIDE_SIZE = b->getSize(stride);
    87 
    88     Constant * const PAGE_ITEMS = b->getSize((8 * stride) / codeUnitWidth);
    89     Constant * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
    90     Constant * const CODE_UNIT_BYTES = b->getSize(codeUnitWidth / 8);
    91     Constant * const PADDING_SIZE = b->getSize(b->getBitBlockWidth() * codeUnitWidth / 8);
     83    ConstantInt * const MMAP_PAGE_SIZE = b->getSize(getPageSize());
     84    ConstantInt * const STRIDE_ITEMS = b->getSize(stride);
     85    ConstantInt * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
     86    ConstantInt * const CODE_UNIT_BYTES = b->getSize(codeUnitWidth / 8);
     87
     88    ConstantInt * const STRIDE_BYTES = b->getSize((codeUnitWidth * stride) / 8);
     89    ConstantInt * const PADDING_SIZE = b->getSize(b->getBitBlockWidth() * codeUnitWidth / 8);
    9290
    9391    Value * const consumedItems = b->getConsumedItemCount("sourceBuffer");
     
    104102    // instruct the OS that it can safely drop any fully consumed pages
    105103    b->CreateMAdvise(readableBuffer, unnecessaryBytes, CBuilder::ADVICE_DONTNEED);
    106     //b->setScalarField("buffer", b->CreateGEP(readableBuffer, unnecessaryBytes));
    107104    b->CreateBr(checkRemaining);
    108105
    109     // determine whether or not we've exhausted the file buffer
     106    // determine whether or not we've exhausted the "safe" region of the file buffer
    110107    b->SetInsertPoint(checkRemaining);
    111108    Value * const producedItems = b->getProducedItemCount("sourceBuffer");
    112     Value * const nextProducedItems = b->CreateAdd(producedItems, PAGE_ITEMS);
     109    Value * const nextProducedItems = b->CreateAdd(producedItems, STRIDE_ITEMS);
    113110    Value * const fileItems = b->getScalarField("fileItems");
    114111    Value * const lastPage = b->CreateICmpULE(fileItems, nextProducedItems);
     
    128125    Value * unconsumedBytes = b->CreateSub(readEndInt, readStartInt);
    129126    unconsumedBytes = b->CreateTrunc(unconsumedBytes, b->getSizeTy());
    130     Value * const bufferSize = b->CreateRoundUp(b->CreateAdd(unconsumedBytes, PADDING_SIZE), STRIDE_SIZE);
     127    Value * const bufferSize = b->CreateRoundUp(b->CreateAdd(unconsumedBytes, PADDING_SIZE), STRIDE_BYTES);
    131128    Value * const buffer = b->CreateAlignedMalloc(bufferSize, b->getCacheAlignment());
    132129    b->CreateMemCpy(buffer, readStart, unconsumedBytes, 1);
     
    141138    b->setBaseAddress("sourceBuffer", b->CreatePointerCast(offsettedBuffer, codeUnitPtrTy));
    142139    b->setTerminationSignal();
    143 
    144 
    145     BasicBlock * const terminationExit = b->GetInsertBlock();
     140    b->setProducedItemCount("sourceBuffer", fileItems);
    146141    b->CreateBr(exit);
    147142
    148     // finally, set the "produced" count to reflect current position in the file
    149143    b->SetInsertPoint(exit);
    150     PHINode * const newProducedItems = b->CreatePHI(b->getSizeTy(), 2);
    151     newProducedItems->addIncoming(nextProducedItems, checkRemaining);
    152     newProducedItems->addIncoming(fileItems, terminationExit);
    153     b->setProducedItemCount("sourceBuffer", newProducedItems);
    154144}
    155145void MMapSourceKernel::freeBuffer(const std::unique_ptr<KernelBuilder> & b, const unsigned codeUnitWidth) {
     
    261251    b->setScalarField("fileItems", itemsBuffered);
    262252    b->setTerminationSignal();
     253    b->setProducedItemCount("sourceBuffer", itemsBuffered);
    263254    b->CreateBr(readExit);
    264255
    265256    b->SetInsertPoint(readExit);
    266     PHINode * const itemsProduced = b->CreatePHI(itemsPending->getType(), 2);
    267     itemsProduced->addIncoming(itemsPending, readData);
    268     itemsProduced->addIncoming(itemsBuffered, setTermination);
    269     b->setProducedItemCount("sourceBuffer", itemsProduced);
    270257}
    271258
     
    285272    BasicBlock * finalizeMMap = b->CreateBasicBlock("finalizeMMap");
    286273    BasicBlock * finalizeDone = b->CreateBasicBlock("finalizeDone");
    287     b->CreateCondBr(b->CreateTrunc(b->getScalarField("useMMap"), b->getInt1Ty()), finalizeMMap, finalizeRead);
     274    b->CreateCondBr(b->CreateIsNotNull(b->getScalarField("useMMap")), finalizeMMap, finalizeRead);
    288275    b->SetInsertPoint(finalizeMMap);
    289276    MMapSourceKernel::freeBuffer(b, mCodeUnitWidth);
     
    303290    // The source will use MMapSource or readSource kernel logic depending on the useMMap
    304291    // parameter, possibly overridden.
    305     Value * useMMap = b->CreateTrunc(b->getScalarField("useMMap"), b->getInt1Ty());
     292
     293    Value * useMMap = b->getScalarField("useMMap");
     294    Constant * const ZERO = ConstantInt::getNullValue(useMMap->getType());
     295    useMMap = b->CreateICmpNE(useMMap, ZERO);
    306296    // if the fileDescriptor is 0, the file is stdin, use readSource kernel logic.
    307297    Value * fd = b->getScalarField("fileDescriptor");
    308     useMMap = b->CreateAnd(useMMap, b->CreateICmpNE(fd, b->getInt32(STDIN_FILENO)));
     298    Value * notStdIn = b->CreateICmpNE(fd, b->getInt32(STDIN_FILENO));
     299    useMMap = b->CreateAnd(useMMap, notStdIn);
    309300    b->CreateCondBr(useMMap, tryMMap, initializeRead);
    310301
     
    321312    b->SetInsertPoint(initializeRead);
    322313    // Ensure that readSource logic is used throughout.
    323     b->setScalarField("useMMap", b->getInt8(0));
     314    b->setScalarField("useMMap", ZERO);
    324315    ReadSourceKernel::generateInitializeMethod(mCodeUnitWidth, mStride,b);
    325316    b->CreateBr(initializeDone);
     
    359350    Constant * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
    360351
    361     BasicBlock * const entry = b->GetInsertBlock();
    362352    BasicBlock * const createTemporary = b->CreateBasicBlock("createTemporary");
    363353    BasicBlock * const exit = b->CreateBasicBlock("exit");
     
    410400    b->setBaseAddress("sourceBuffer", b->CreatePointerCast(offsettedBuffer, codeUnitPtrTy));
    411401    b->setTerminationSignal();
    412     BasicBlock * const terminationExit = b->GetInsertBlock();
     402    b->setProducedItemCount("sourceBuffer", fileItems);
    413403    b->CreateBr(exit);
    414404
    415405    b->SetInsertPoint(exit);
    416     PHINode * const newProducedItems = b->CreatePHI(b->getSizeTy(), 2);
    417     newProducedItems->addIncoming(nextProducedItems, entry);
    418     newProducedItems->addIncoming(fileItems, terminationExit);
    419     b->setProducedItemCount("sourceBuffer", newProducedItems);
    420406}
    421407
  • icGREP/icgrep-devel/icgrep/re/re_toolchain.cpp

    r6239 r6249  
    5555}
    5656
    57 static cl::bits<RE_PrintFlags> 
     57static cl::bits<RE_PrintFlags>
    5858    PrintOptions(cl::values(clEnumVal(ShowREs, "Show parsed regular expressions and transformations that change them"),
    5959                            clEnumVal(ShowAllREs, "Print all regular expression passes")
     
    6262static cl::bits<RE_AlgorithmFlags>
    6363    AlgorithmOptions(cl::values(clEnumVal(DisableLog2BoundedRepetition, "disable log2 optimizations for bounded repetition of bytes"),
    64                               clEnumVal(DisableIfHierarchy, "disable nested if hierarchy for generated Unicode classes (not recommended)"), 
    65                               clEnumVal(DisableMatchStar, "disable MatchStar optimization"), 
     64                              clEnumVal(DisableIfHierarchy, "disable nested if hierarchy for generated Unicode classes (not recommended)"),
     65                              clEnumVal(DisableMatchStar, "disable MatchStar optimization"),
    6666                              clEnumVal(DisableUnicodeMatchStar, "disable Unicode MatchStar optimization"),
    6767                              clEnumVal(DisableUnicodeLineBreak, "disable Unicode line breaks - use LF only")
    6868                              CL_ENUM_VAL_SENTINEL), cl::cat(RegexOptions));
    6969
    70    
     70
    7171static cl::opt<bool> UnicodeLevel2("U2", cl::desc("Enable Unicode Level matching under canonical and compatible (?K) equivalence."), cl::cat(RegexOptions));
    7272
     
    8080
    8181int IfInsertionGap;
    82 static cl::opt<int, true> 
     82static cl::opt<int, true>
    8383    IfInsertionGapOption("if-insertion-gap",  cl::location(IfInsertionGap), cl::init(3),
    84                          cl::desc("minimum number of nonempty elements between inserted if short-circuit tests"), 
     84                         cl::desc("minimum number of nonempty elements between inserted if short-circuit tests"),
    8585                         cl::cat(RegexOptions));
    8686
     
    161161    return false;
    162162}
    163    
     163
    164164inline bool lessThan(const Name * const lh, const Name * const rh) {
    165165    if (lh->getType() != rh->getType()) {
Note: See TracChangeset for help on using the changeset viewer.