Changeset 5865


Ignore:
Timestamp:
Feb 6, 2018, 4:57:35 PM (9 months ago)
Author:
nmedfort
Message:

More work on the pipeline I/O rate handling

Location:
icGREP/icgrep-devel/icgrep
Files:
18 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/CBuilder.cpp

    r5841 r5865  
    13231323}
    13241324
    1325 llvm::CallInst * CBuilder::CreateMemSet(llvm::Value * Ptr, llvm::Value * Val, llvm::Value * Size, unsigned Align,
    1326                        bool isVolatile, llvm::MDNode * TBAATag, llvm::MDNode * ScopeTag, llvm::MDNode * NoAliasTag) {
     1325CallInst * CBuilder::CreateMemSet(Value * Ptr, Value * Val, Value * Size, unsigned Align,
     1326                       bool isVolatile, MDNode * TBAATag, MDNode * ScopeTag, MDNode * NoAliasTag) {
    13271327    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    13281328        CHECK_ADDRESS(Ptr, Size, "CreateMemSet");
  • icGREP/icgrep-devel/icgrep/IR_Gen/CBuilder.h

    r5828 r5865  
    258258    }
    259259
    260     llvm::BasicBlock * CreateBasicBlock(const llvm::StringRef name, llvm::BasicBlock * insertBefore = nullptr);
     260    llvm::BasicBlock * CreateBasicBlock(const llvm::StringRef name = "", llvm::BasicBlock * insertBefore = nullptr);
    261261
    262262    virtual bool supportsIndirectBr() const;
     
    285285    template <typename ExternalFunctionType>
    286286    llvm::Function * LinkFunction(llvm::StringRef name, ExternalFunctionType * functionPtr) const;
    287 
    288287
    289288    virtual llvm::LoadInst * CreateLoad(llvm::Value * Ptr, const char * Name);
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_builder.cpp

    r5843 r5865  
    166166    Value * bVec = fwCast(fw, b);
    167167    return CreateSelect(CreateICmpULT(aVec, bVec), aVec, bVec);
     168}
     169
     170Value * IDISA_Builder::mvmd_sll(unsigned fw, Value * value, Value * shift) {
     171    VectorType * const vecTy = cast<VectorType>(value->getType());
     172    IntegerType * const intTy = getIntNTy(vecTy->getBitWidth());
     173    value = CreateBitCast(value, intTy);
     174    shift = CreateZExtOrTrunc(CreateMul(shift, ConstantInt::get(shift->getType(), fw)), intTy);
     175    return CreateBitCast(CreateShl(value, shift), vecTy);
     176}
     177
     178Value * IDISA_Builder::mvmd_srl(unsigned fw, Value * value, Value * shift) {
     179    VectorType * const vecTy = cast<VectorType>(value->getType());
     180    IntegerType * const intTy = getIntNTy(vecTy->getBitWidth());
     181    value = CreateBitCast(value, intTy);
     182    shift = CreateZExtOrTrunc(CreateMul(shift, ConstantInt::get(shift->getType(), fw)), intTy);
     183    return CreateBitCast(CreateLShr(value, shift), vecTy);
    168184}
    169185
     
    579595}
    580596
    581 IDISA_Builder::IDISA_Builder(llvm::LLVMContext & C, unsigned vectorWidth, unsigned stride)
     597IDISA_Builder::IDISA_Builder(LLVMContext & C, unsigned vectorWidth, unsigned stride)
    582598: CBuilder(C)
    583599, mBitBlockWidth(vectorWidth)
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_builder.h

    r5861 r5865  
    149149    virtual llvm::Value * mvmd_extract(unsigned fw, llvm::Value * a, unsigned fieldIndex);
    150150    virtual llvm::Value * mvmd_insert(unsigned fw, llvm::Value * blk, llvm::Value * elt, unsigned fieldIndex);
     151
     152    virtual llvm::Value * mvmd_sll(unsigned fw, llvm::Value * value, llvm::Value * shift);
     153    virtual llvm::Value * mvmd_srl(unsigned fw, llvm::Value * value, llvm::Value * shift);
    151154    virtual llvm::Value * mvmd_slli(unsigned fw, llvm::Value * a, unsigned shift);
    152155    virtual llvm::Value * mvmd_srli(unsigned fw, llvm::Value * a, unsigned shift);
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_sse_builder.h

    r5489 r5865  
    2323    llvm::Value * hsimd_signmask(unsigned fw, llvm::Value * a) override;
    2424    ~IDISA_SSE_Builder() {}
    25 
    2625};
    2726
     
    4039    llvm::Value * hsimd_packl(unsigned fw, llvm::Value * a, llvm::Value * b) override;
    4140    std::pair<llvm::Value *, llvm::Value *> bitblock_advance(llvm::Value * a, llvm::Value * shiftin, unsigned shift) final;
    42 
    4341    ~IDISA_SSE2_Builder() {}
    44 
    4542};
    4643
  • icGREP/icgrep-devel/icgrep/UCD/PropertyObjects.cpp

    r5751 r5865  
    99#include "PropertyObjectTable.h"
    1010#include <llvm/Support/Casting.h>
    11 #include <algorithm>
    12 #include <assert.h>
    1311#include <sstream>
    14 #include <llvm/Support/raw_ostream.h>
    1512#include <llvm/Support/ErrorHandling.h>
    1613#include <toolchain/grep_pipeline.h>
  • icGREP/icgrep-devel/icgrep/character_deposit.cpp

    r5857 r5865  
    8383    }
    8484
    85     const int inputBufferBlocks = codegen::BufferSegments * codegen::ThreadNum * 8;
    86     const int outputBufferBlocks = inputBufferBlocks * 2;
     85    const int inputBufferBlocks = codegen::BufferSegments * codegen::ThreadNum * codegen::SegmentSize;
     86    const int outputBufferBlocks = inputBufferBlocks; // * 2;
    8787
    8888    ParabixDriver pxDriver("character_deletion");
     
    110110    StreamSetBuffer * BasisBits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), inputBufferBlocks);
    111111
    112     kernel::Kernel * sourceK = pxDriver.addKernelInstance<MemorySourceKernel>(iBuilder, iBuilder->getInt8PtrTy(), codegen::SegmentSize);
     112    kernel::Kernel * sourceK = pxDriver.addKernelInstance<MemorySourceKernel>(iBuilder, iBuilder->getInt8PtrTy());
    113113    sourceK->setInitialArguments({inputStream, fileSize});
    114114    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
     
    117117
    118118
    119     StreamSetBuffer * const CharacterMarkerBuffer = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), inputBufferBlocks);
     119    StreamSetBuffer * const CharacterMarkerBuffer = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), inputBufferBlocks);
    120120    Kernel * ccK = pxDriver.addKernelInstance<ParabixCharacterClassKernelBuilder>(iBuilder, "extenders", std::vector<re::CC *>{re::makeCC(characterToBeDeposit)}, 8);
    121121    pxDriver.makeKernelCall(ccK, {BasisBits}, {CharacterMarkerBuffer});
    122122
    123123
    124     StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1);
    125     StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1);
     124//    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1);
     125//    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1);
     126
     127    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1, 2);
     128    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1, 2);
    126129    Kernel * delK = pxDriver.addKernelInstance<SwizzledDeleteByPEXTkernel>(iBuilder, 64, 8);
    127130    pxDriver.makeKernelCall(delK, {CharacterMarkerBuffer, BasisBits}, {u16Swizzle0, u16Swizzle1});
    128 
    129131
    130132    StreamSetBuffer * depositedSwizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), outputBufferBlocks, 1);
     
    160162    pxDriver.generatePipelineIR();
    161163
     164    pxDriver.deallocateBuffers();
     165
    162166    iBuilder->CreateRetVoid();
    163167
  • icGREP/icgrep-devel/icgrep/icgrep-workspace.files

    r5856 r5865  
    4545IR_Gen/tracegen.cpp
    4646IR_Gen/tracegen.h
     47kernels/lz4/lz4_bitstream_not_kernel.cpp
     48kernels/lz4/lz4_bitstream_not_kernel.h
     49kernels/lz4/lz4_block_decoder.cpp
     50kernels/lz4/lz4_block_decoder.h
     51kernels/lz4/lz4_bytestream_decoder.cpp
     52kernels/lz4/lz4_bytestream_decoder.h
     53kernels/lz4/lz4_deposit_uncompressed.cpp
     54kernels/lz4/lz4_deposit_uncompressed.h
     55kernels/lz4/lz4_extract_e_m0.cpp
     56kernels/lz4/lz4_extract_e_m0.h
     57kernels/lz4/lz4_generate_deposit_stream.cpp
     58kernels/lz4/lz4_generate_deposit_stream.h
     59kernels/lz4/lz4_index_decoder.cpp
     60kernels/lz4/lz4_index_decoder.h
     61kernels/lz4/lz4_match_copy_kernel.cpp
     62kernels/lz4/lz4_match_copy_kernel.h
     63kernels/lz4/lz4_numbers_to_bitstream_kernel.cpp
     64kernels/lz4/lz4_numbers_to_bitstream_kernel.h
     65kernels/lz4/LZ4MarkerToMaskKernel.cpp
     66kernels/lz4/LZ4MarkerToMaskKernel.h
    4767kernels/alignedprint.cpp
    4868kernels/alignedprint.h
     
    7191kernels/linebreak_kernel.cpp
    7292kernels/linebreak_kernel.h
    73 kernels/lz4_bytestream_decoder.cpp
    74 kernels/lz4_bytestream_decoder.h
    75 kernels/lz4_index_decoder.cpp
    76 kernels/lz4_index_decoder.h
    7793kernels/p2s_kernel.cpp
    7894kernels/p2s_kernel.h
     
    87103kernels/scanmatchgen.cpp
    88104kernels/scanmatchgen.h
     105kernels/sequential_kernel.cpp
     106kernels/sequential_kernel.h
    89107kernels/source_kernel.cpp
    90108kernels/source_kernel.h
     
    101119kernels/until_n.cpp
    102120kernels/until_n.h
     121lz4/LZ4Generator.cpp
     122lz4/LZ4Generator.h
    103123pablo/analysis/pabloverifier.cpp
    104124pablo/analysis/pabloverifier.hpp
     
    297317util/slab_allocator.h
    298318base64.cpp
     319character_deletion.cpp
     320character_deposit.cpp
    299321grep_engine.cpp
    300322grep_engine.h
     
    304326icgrep.cpp
    305327lz4d.cpp
     328lz4d_ext_dep.cpp
    306329lz4FrameDecoder.cpp
    307330lz4FrameDecoder.h
     
    313336utf8_encoder.h
    314337wc.cpp
    315 CMakeLists.txt
    316 character_deletion.cpp
  • icGREP/icgrep-devel/icgrep/icgrep-workspace.includes

    r5856 r5865  
    11.
    22/home/nigel/icgrep-devel/boost/include/
     3pablo/passes
     4UCD
     5IR_Gen
     6toolchain
     7pablo
     8pablo/analysis
     9combine
     10kernels
     11editd
     12cc
     13util
     14combine/icgrep-test
     15pablo/optimizers
     16re
     17kernels/lz4
     18lz4
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5856 r5865  
    214214void Kernel::addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb) {
    215215
    216     if (mStreamMap.empty()) {
    217         prepareStreamSetNameMap();
    218     }
     216    assert (mStreamMap.empty());
     217
     218    prepareStreamSetNameMap();
    219219
    220220    normalizeStreamProcessingRates();
     
    290290    if (LLVM_UNLIKELY(hasSignature())) {
    291291        generateKernel(idb);
    292         std::string signature;
    293         raw_string_ostream OS(signature);
    294         WriteBitcodeToFile(getModule(), OS);
    295         return signature;
     292        std::string tmp;
     293        raw_string_ostream signature(tmp);
     294        WriteBitcodeToFile(getModule(), signature);
     295        return signature.str();
    296296    } else {
    297297        return getModule()->getModuleIdentifier();
     
    304304 ** ------------------------------------------------------------------------------------------------------------- */
    305305void Kernel::generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    306     assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
    307     // If the module id cannot uniquely identify this kernel, "generateKernelSignature()" will have already
    308     // generated the unoptimized IR.
    309     if (!mIsGenerated) {
    310         const auto m = idb->getModule();
    311         const auto ip = idb->saveIP();
    312         // const auto saveInstance = getInstance();
    313         idb->setModule(mModule);
    314         addKernelDeclarations(idb);
    315         callGenerateInitializeMethod(idb);
    316         callGenerateDoSegmentMethod(idb);
    317         callGenerateFinalizeMethod(idb);
    318         // setInstance(saveInstance);
    319         idb->setModule(m);
    320         idb->restoreIP(ip);
    321         mIsGenerated = true;
    322     }
     306    assert ("Kernel does not have a valid IDISA Builder" && idb.get());
     307    if (LLVM_UNLIKELY(mIsGenerated)) return;
     308    idb->setModule(mModule);
     309    addKernelDeclarations(idb);
     310    callGenerateInitializeMethod(idb);
     311    callGenerateDoSegmentMethod(idb);
     312    callGenerateFinalizeMethod(idb);
     313    mIsGenerated = true;
    323314}
    324315
     
    685676}
    686677
     678// #define DEBUG_LOG
     679
    687680/** ------------------------------------------------------------------------------------------------------------- *
    688681 * @brief generateKernelMethod
     
    710703            Type * const ty = mStreamSetInputBuffers[i]->getStreamSetBlockType();
    711704            auto ub = getUpperBound(rate);
     705            assert (ub != 0);
    712706            if (LLVM_UNLIKELY(input.hasLookahead())) {
    713707                ub += RateValue(input.getLookahead(), mStride);
     
    727721        if (requiresTemporaryOutputBuffer(output, rate)) {
    728722            auto ub = getUpperBound(rate);
    729             if (LLVM_UNLIKELY(mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate))) {
    730                 ub += mStreamSetOutputBuffers[i]->overflowSize();
    731             }
    732             Type * const ty = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
    733             Constant * const arraySize = b->getInt64(ceiling(ub));
    734             AllocaInst * const ptr = b->CreateAlignedAlloca(ty, blockAlignment, arraySize);
    735             assert (ptr->isStaticAlloca());
    736             temporaryOutputBuffer[i] = ptr;
     723            if (ub > 0) {
     724                if (LLVM_UNLIKELY(mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate))) {
     725                    ub += mStreamSetOutputBuffers[i]->overflowSize();
     726                }
     727                Type * const ty = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
     728                Constant * const arraySize = b->getInt64(ceiling(ub));
     729                AllocaInst * const ptr = b->CreateAlignedAlloca(ty, blockAlignment, arraySize);
     730                assert (ptr->isStaticAlloca());
     731                temporaryOutputBuffer[i] = ptr;
     732            }
    737733        }
    738734    }
     
    762758
    763759    Value * const initiallyFinal = mIsFinal;
    764 
    765 //    b->CallPrintInt(getName() + "_initiallyFinal", initiallyFinal);
    766 
     760    #ifdef DEBUG_LOG
     761    b->CallPrintInt(getName() + "_initiallyFinal", initiallyFinal);
     762    #endif
    767763    // Now proceed with creation of the doSegment method.
    768764    BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
     
    791787        const auto & name = input.getName();
    792788        Value * const processed = b->getProcessedItemCount(name);
    793 
    794 //        b->CallPrintInt(getName() + "_" + name + "_avail", mAvailableItemCount[i]);
    795 //        b->CallPrintInt(getName() + "_" + name + "_processed", processed);
    796 
     789        #ifdef DEBUG_LOG
     790        b->CallPrintInt(getName() + "_" + name + "_avail", mAvailableItemCount[i]);
     791        b->CallPrintInt(getName() + "_" + name + "_processed", processed);
     792        #endif
    797793        mInitialProcessedItemCount[i] = processed;
    798794        mStreamSetInputBaseAddress[i] = b->getBlockAddress(name, b->CreateLShr(processed, LOG_2_BLOCK_WIDTH));
     
    802798        }
    803799
    804         Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], processed);       
    805 //        b->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed);
    806 
     800        Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], processed);
     801        #ifdef DEBUG_LOG
     802        b->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed);
     803        #endif
    807804        Value * const accessible = b->getLinearlyAccessibleItems(name, processed, unprocessed);
    808 //        b->CallPrintInt(getName() + "_" + name + "_accessible", accessible);
    809 
     805        #ifdef DEBUG_LOG
     806        b->CallPrintInt(getName() + "_" + name + "_accessible", accessible);
     807        #endif
    810808        mAvailableItemCount[i] = unprocessed;
    811 
    812809        linearlyAccessible[i] = accessible;
    813810        inputStrideSize[i] = getStrideSize(b, input.getRate());
     
    926923        const auto & name = output.getName();
    927924        Value * const produced = b->getProducedItemCount(name);
    928 //        b->CallPrintInt(getName() + "_" + name + "_produced", produced);
    929 
     925        #ifdef DEBUG_LOG
     926        b->CallPrintInt(getName() + "_" + name + "_produced", produced);
     927        #endif
    930928        Value * baseBuffer = b->getBlockAddress(name, b->CreateLShr(produced, LOG_2_BLOCK_WIDTH));
    931929        mInitialProducedItemCount[i] = produced;
    932930        mStreamSetOutputBaseAddress[i] = baseBuffer;
    933 
     931        linearlyWritable[i] = nullptr;
    934932        // Is the number of linearly writable items sufficient for a stride?
    935933        outputStrideSize[i] = getStrideSize(b, output.getRate());
     
    986984        const ProcessingRate & rate = input.getRate();
    987985        if (rate.isFixed() && input.nonDeferred()) {
    988 //            b->CallPrintInt(getName() + "_" + input.getName() + "_processed (+)", mAvailableItemCount[i]);
    989986            Value * const ic = b->CreateAdd(mInitialProcessedItemCount[i], mAvailableItemCount[i]);
    990987            b->setProcessedItemCount(input.getName(), ic);
     
    998995            Value * const produced = b->CreateMul(numOfStrides, outputStrideSize[i]);
    999996            Value * const ic = b->CreateAdd(mInitialProducedItemCount[i], produced);
    1000 //            b->CallPrintInt(getName() + "_" + output.getName() + "_produced (+)", produced);
    1001997            b->setProducedItemCount(output.getName(), ic);
    1002998        }
     
    10261022    // Copy back data to the actual output buffers.
    10271023    for (unsigned i = 0; i < outputSetCount; i++) {
    1028 
    10291024        AllocaInst * const tempBuffer = temporaryOutputBuffer[i];
    10301025        if (LLVM_UNLIKELY(tempBuffer == nullptr)) {
    10311026            continue;
    10321027        }
    1033 
    10341028        const auto & name = mStreamSetOutputs[i].getName();
    10351029        Value * const produced = b->getProducedItemCount(name);
     
    10481042        //Value * const newProducedItemCount = b->getProducedItemCount(name);
    10491043        Value * const newlyProduced = b->CreateSub(produced, mInitialProducedItemCount[i]);
     1044
     1045
    10501046        Value * const toWrite = b->CreateUMin(newlyProduced, linearlyWritable[i]);
    10511047        const auto alignment = getItemAlignment(mStreamSetOutputs[i]);
     
    11041100        const auto & name = mStreamSetOutputs[i].getName();
    11051101        Value * const produced = b->getProducedItemCount(name);
     1102
    11061103        // If this output has a Fixed/Bounded rate, determine whether we have room for another stride.
    11071104        if (LLVM_LIKELY(outputStrideSize[i] != nullptr)) {
     
    11121109            }
    11131110            Value * const unconsumed = b->CreateSub(produced, consumed);
     1111
     1112//            b->CallPrintInt(getName() + "_" + name + "_unconsumed", unconsumed);
     1113
    11141114            Value * const capacity = b->getBufferedSize(name);
     1115
     1116//            b->CallPrintInt(getName() + "_" + name + "_capacity", capacity);
     1117
    11151118            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    11161119                b->CreateAssert(b->CreateICmpULE(unconsumed, capacity),
    1117                                 getName() + ": " + name + " unconsumed data exceeds capacity");
    1118             }
     1120                                getName() + ": " + name + " more data was written than its capacity allows");
     1121            }
     1122
     1123
     1124
    11191125            Value * const remaining = b->CreateSub(capacity, unconsumed);
    11201126            Value * const hasRemainingStrides = b->CreateICmpUGE(remaining, outputStrideSize[i]);
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r5856 r5865  
    164164}
    165165
    166 /** ------------------------------------------------------------------------------------------------------------- *
    167  * @brief getFieldWidth
    168  ** ------------------------------------------------------------------------------------------------------------- */
    169 inline unsigned getFieldWidth(const unsigned bitWidth, const unsigned blockWidth) {
    170     for (unsigned k = 16; k <= blockWidth; k *= 2) {
    171         if ((bitWidth & (k - 1)) != 0) {
    172             return k / 2;
    173         }
    174     }
    175     return blockWidth;
     166inline static unsigned ceil_log2(const unsigned v) {
     167    assert ("log2(0) is undefined!" && v != 0);
     168    return (sizeof(unsigned) * CHAR_BIT) - __builtin_clz(v - 1U);
    176169}
    177170
     
    186179    assert (target->getType()->isPointerTy());
    187180    assert (isConstantZero(targetOffset) || isConstantZero(sourceOffset));
    188 
    189     const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
    190 
    191     const auto itemWidth = getItemWidth(buf->getBaseType());
     181    const StreamSetBuffer * const buffer = mKernel->getAnyStreamSetBuffer(name);
     182    const auto itemWidth = getItemWidth(buffer->getBaseType());
    192183    assert ("invalid item width" && is_power_2(itemWidth));
    193184    const auto blockWidth = getBitBlockWidth();
    194185    // Although our item width may be n bits, if we know we're always processing m items per block, our field width
    195186    // (w.r.t the stream copy) would be n*m. By taking this into account we can optimize and simplify the copy code.
    196     const auto fieldWidth = getFieldWidth(itemWidth * itemAlignment, blockWidth);
    197     const auto alignment = (fieldWidth + 7) / 8;
     187    const auto fieldWidth = std::min(1U << ceil_log2(itemWidth * itemAlignment), blockWidth);
     188    assert ((blockWidth % fieldWidth) == 0);
    198189
    199190    if (LLVM_LIKELY(itemWidth < fieldWidth)) {
     
    201192        Constant * const FACTOR = getSize(factor);
    202193        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    203             ConstantInt * const ALIGNMENT = getSize(alignment);
    204194            const auto kernelName = mKernel->getName()+ ": " + name;
    205             CreateAssertZero(CreateURem(CreatePtrToInt(target, getSizeTy()), ALIGNMENT), kernelName + " target is misaligned (" + std::to_string(alignment) + ")");
     195            if (fieldWidth > 8) {
     196                const auto alignment = (fieldWidth + 7) / 8;
     197                ConstantInt * const ALIGNMENT = getSize(alignment);
     198                CreateAssertZero(CreateURem(CreatePtrToInt(target, getSizeTy()), ALIGNMENT), kernelName + " target is misaligned (" + std::to_string(alignment) + ")");
     199                CreateAssertZero(CreateURem(CreatePtrToInt(source, getSizeTy()), ALIGNMENT), kernelName + " source is misaligned (" + std::to_string(alignment) + ")");
     200            }
    206201            CreateAssertZero(CreateURem(targetOffset, FACTOR), kernelName + " target offset is misaligned (" + std::to_string(factor) + ")");
    207             CreateAssertZero(CreateURem(CreatePtrToInt(source, getSizeTy()), ALIGNMENT), kernelName + " source is misaligned (" + std::to_string(alignment) + ")");
    208202            CreateAssertZero(CreateURem(sourceOffset, FACTOR), kernelName + " source offset is misaligned (" + std::to_string(factor) + ")");
    209203        }
     
    236230    */
    237231
    238     Type * const fieldWidthTy = getIntNTy(fieldWidth);
    239 
    240     Value * n = buf->getStreamSetCount(this, getStreamHandle(name));
    241 
    242     if (isConstantOne(n) || fieldWidth == blockWidth || (isConstantZero(targetOffset) && isConstantZero(sourceOffset))) {
     232    Value * const n = buffer->getStreamSetCount(this, getStreamHandle(name));
     233    if (((isConstantOne(n) && fieldWidth >= 8) || fieldWidth == blockWidth || (isConstantZero(targetOffset) && isConstantZero(sourceOffset)))) {
    243234        if (LLVM_LIKELY(itemWidth < 8)) {
    244235            itemsToCopy = CreateUDivCeil(itemsToCopy, getSize(8 / itemWidth));
     
    249240            itemsToCopy = CreateMul(itemsToCopy, n);
    250241        }
    251         PointerType * const ptrTy = fieldWidthTy->getPointerTo();
     242        PointerType * const ptrTy = getIntNTy(fieldWidth)->getPointerTo();
    252243        target = CreateGEP(CreatePointerCast(target, ptrTy), targetOffset);
    253244        source = CreateGEP(CreatePointerCast(source, ptrTy), sourceOffset);
     245        const auto alignment = (fieldWidth + 7) / 8;
    254246        CreateMemCpy(target, source, itemsToCopy, alignment);
    255247
    256248    } else { // either the target offset or source offset is non-zero but not both
    257 
    258249        VectorType * const blockTy = getBitBlockType();
    259250        PointerType * const blockPtrTy = blockTy->getPointerTo();
    260 
    261         target = CreatePointerCast(target, blockPtrTy, "target");
    262         source = CreatePointerCast(source, blockPtrTy, "source");
    263 
    264         assert ((blockWidth % fieldWidth) == 0);
    265 
    266         VectorType * const shiftTy = VectorType::get(fieldWidthTy, blockWidth / fieldWidth);
    267         Constant * const width = getSize(blockWidth / itemWidth);
     251        Constant * const BLOCK_WIDTH = getSize(blockWidth);
     252        target = CreatePointerCast(target, blockPtrTy);
     253        target = CreateGEP(target, CreateUDiv(targetOffset, BLOCK_WIDTH));
     254        source = CreatePointerCast(source, blockPtrTy);
     255        source = CreateGEP(source, CreateUDiv(sourceOffset, BLOCK_WIDTH));
     256        const auto alignment = blockWidth / 8;
    268257        Constant * const ZERO = getSize(0);
    269258        Constant * const ONE = getSize(1);
     259
    270260        BasicBlock * const entry = GetInsertBlock();
     261
     262        // TODO: this code isn't correct. I was hoping to shift by fieldwidth units to give LLVM
     263        // the ability to better select
    271264
    272265        if (isConstantZero(targetOffset)) {
     
    285278                                    2  |FFeee|GGfff|HHggg|    h|
    286279                                    3  |JJiii|KKjjj|LLkkk|    l|
    287              */
    288 
    289             Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, width), n);
    290             Value * const offset = CreateURem(sourceOffset, width);
    291             Value * const offsetVector = simd_fill(fieldWidth, CreateTrunc(offset, fieldWidthTy));
    292             Value * const remaining = CreateSub(width, offset);
    293             Value * const remainingVector = simd_fill(fieldWidth, CreateTrunc(remaining, fieldWidthTy));
    294 
    295             BasicBlock * const streamCopy = CreateBasicBlock(name + "PullCopy");
    296             BasicBlock * const streamCopyRemaining = CreateBasicBlock(name + "PullCopyRemaining");
    297             BasicBlock * const streamCopyEnd = CreateBasicBlock(name + "PullCopyEnd");
    298 
    299             CreateCondBr(CreateICmpNE(blocksToCopy, ZERO), streamCopy, streamCopyRemaining);
     280            */
     281
     282            sourceOffset = CreateURem(sourceOffset, BLOCK_WIDTH);
     283
     284            Value * const borrowOffset = CreateSub(BLOCK_WIDTH, sourceOffset);
     285            BasicBlock * const streamCopy = CreateBasicBlock();
     286            BasicBlock * const streamCopyRemainingCond = CreateBasicBlock();
     287            BasicBlock * const streamCopyRemaining = CreateBasicBlock();
     288            BasicBlock * const streamCopyEnd = CreateBasicBlock();
     289
     290            Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, BLOCK_WIDTH), n);
     291            CreateCondBr(CreateICmpNE(blocksToCopy, ZERO), streamCopy, streamCopyRemainingCond);
    300292
    301293            SetInsertPoint(streamCopy);
    302294            PHINode * const i = CreatePHI(getSizeTy(), 2);
    303295            i->addIncoming(n, entry);
    304             Value * prior = CreateAlignedLoad(CreateGEP(source, CreateSub(i, n)), alignment);
    305             prior = CreateBitCast(CreateLShr(CreateBitCast(prior, shiftTy), offsetVector), blockTy);
    306             Value * value = CreateAlignedLoad(CreateGEP(source, i), alignment);
    307             value = CreateBitCast(CreateShl(CreateBitCast(value, shiftTy), remainingVector), blockTy);
    308             CreateAlignedStore(CreateOr(value, prior), CreateGEP(target, i), alignment);
     296            Value * Ai = CreateAlignedLoad(CreateGEP(source, CreateSub(i, n)), alignment);
     297            Ai = mvmd_srl(fieldWidth, Ai, borrowOffset);
     298            Value * Bi = CreateAlignedLoad(CreateGEP(source, i), alignment);
     299            Bi = mvmd_sll(fieldWidth, Bi, sourceOffset);
     300            CreateAlignedStore(CreateOr(Bi, Ai), CreateGEP(target, i), alignment);
    309301            Value * const next_i = CreateAdd(i, ONE);
    310302            i->addIncoming(next_i, streamCopy);
    311             CreateCondBr(CreateICmpNE(next_i, blocksToCopy), streamCopy, streamCopyRemaining);
     303            CreateCondBr(CreateICmpNE(next_i, blocksToCopy), streamCopy, streamCopyRemainingCond);
     304
     305            SetInsertPoint(streamCopyRemainingCond);
     306            Value * const partialBlocksToCopy = CreateAdd(blocksToCopy, n);
     307            Value * const remainingItemsToCopy = CreateURem(itemsToCopy, BLOCK_WIDTH);
     308            CreateLikelyCondBr(CreateIsNotNull(remainingItemsToCopy), streamCopyRemaining, streamCopyEnd);
    312309
    313310            SetInsertPoint(streamCopyRemaining);
    314311            PHINode * const j = CreatePHI(getSizeTy(), 2);
    315             j->addIncoming(blocksToCopy, entry);
    316             j->addIncoming(blocksToCopy, streamCopy);
    317             Value * final = CreateAlignedLoad(CreateGEP(source, j), alignment);
    318             final = CreateBitCast(CreateLShr(CreateBitCast(final, shiftTy), offsetVector), blockTy);
    319             CreateAlignedStore(final, CreateGEP(target, j), alignment);
     312            j->addIncoming(blocksToCopy, streamCopyRemainingCond);
     313            Value * Aj = CreateAlignedLoad(CreateGEP(source, j), alignment);
     314            Aj = mvmd_srl(fieldWidth, Aj, borrowOffset);
     315            CreateAlignedStore(Aj, CreateGEP(target, j), alignment);
    320316            Value * const next_j = CreateAdd(j, ONE);
    321317            j->addIncoming(next_j, streamCopyRemaining);
    322             CreateCondBr(CreateICmpNE(next_j, CreateAdd(blocksToCopy, n)), streamCopyRemaining, streamCopyEnd);
     318            CreateCondBr(CreateICmpNE(next_j, partialBlocksToCopy), streamCopyRemaining, streamCopyEnd);
    323319
    324320            SetInsertPoint(streamCopyEnd);
     
    336332
    337333                                          A     B     C     D
    338                TARGET STREAM        1  |aa---|bbAAA|ccBBB| dCCC|
    339                                     2  |ee---|ffEEE|ggFFF| hGGG|
    340                                     3  |ii---|jjIII|kkJJJ| lKKK|
     334               TARGET STREAM        1  |--XXX|-----|-----|-----|
     335                                    2  |--YYY|-----|-----|-----|
     336                                    3  |--ZZZ|-----|-----|-----|
     337
     338                                          A     B     C     D
     339               OUTPUT STREAM        1  |aaXXX|bbAAA|ccBBB| dCCC|
     340                                    2  |eeYYY|ffEEE|ggFFF| hGGG|
     341                                    3  |iiZZZ|jjIII|kkJJJ| lKKK|
    341342
    342343            */
    343344
    344             BasicBlock * const streamCopy = CreateBasicBlock(name + "PushCopy");
    345             BasicBlock * const streamCopyRemainingCond = CreateBasicBlock(name + "PushCopyRemainingCond");
    346             BasicBlock * const streamCopyRemaining = CreateBasicBlock(name + "PushCopyRemaining");
    347             BasicBlock * const streamCopyEnd = CreateBasicBlock(name + "PushCopyEnd");
    348 
    349             Value * const pos = CreateURem(targetOffset, width);
    350             Value * const copied = CreateSub(width, pos);
    351             Value * const copiedVector = simd_fill(fieldWidth, CreateTrunc(copied, fieldWidthTy));
    352             Value * const mask = CreateLShr(Constant::getAllOnesValue(shiftTy), copiedVector);
    353             Value * const offsetVector = simd_fill(fieldWidth, CreateTrunc(pos, fieldWidthTy));
    354 
     345            BasicBlock * const streamCopy = CreateBasicBlock();
     346            BasicBlock * const streamCopyRemainingCond = CreateBasicBlock();
     347            BasicBlock * const streamCopyRemaining = CreateBasicBlock();
     348            BasicBlock * const streamCopyEnd = CreateBasicBlock();
     349
     350            targetOffset = CreateURem(targetOffset, BLOCK_WIDTH);
     351
     352            Value * const carryOffset = CreateSub(BLOCK_WIDTH, targetOffset);
     353            Value * const mask = mvmd_srl(fieldWidth, Constant::getAllOnesValue(blockTy), carryOffset);
    355354            CreateBr(streamCopy);
    356355
     
    358357            PHINode * const i = CreatePHI(getSizeTy(), 2);
    359358            i->addIncoming(ZERO, entry);
    360             Value * priorTargetValue = CreateAlignedLoad(CreateGEP(target, i), alignment);
    361             priorTargetValue = CreateBitCast(CreateAnd(CreateBitCast(priorTargetValue, shiftTy), mask), blockTy);
    362             Value * sourceValue = CreateAlignedLoad(CreateGEP(source, i), alignment);
    363             sourceValue = CreateBitCast(CreateShl(CreateBitCast(sourceValue, shiftTy), offsetVector), blockTy);
    364             CreateAlignedStore(CreateOr(sourceValue, priorTargetValue), CreateGEP(target, i), alignment);
     359            Value * A0 = CreateAlignedLoad(CreateGEP(target, i), alignment);
     360            A0 = CreateAnd(A0, mask);
     361            Value * Ai = CreateAlignedLoad(CreateGEP(source, i), alignment);
     362            Ai = mvmd_sll(fieldWidth, Ai, targetOffset);
     363            CreateAlignedStore(CreateOr(Ai, A0), CreateGEP(target, i), alignment);
    365364            Value * const next_i = CreateAdd(i, ONE);
    366365            i->addIncoming(next_i, streamCopy);
     
    368367
    369368            SetInsertPoint(streamCopyRemainingCond);
    370             Value * const blocksToCopy = CreateMul(CreateUDiv(CreateSub(itemsToCopy, copied), width), n);
    371             CreateCondBr(CreateICmpULT(copied, itemsToCopy), streamCopyRemaining, streamCopyEnd);
     369            Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, BLOCK_WIDTH), n);
     370            CreateCondBr(CreateICmpUGT(blocksToCopy, n), streamCopyRemaining, streamCopyEnd);
    372371
    373372            SetInsertPoint(streamCopyRemaining);
    374373            PHINode * const j = CreatePHI(getSizeTy(), 2);
    375374            j->addIncoming(n, streamCopyRemainingCond);
    376             Value * prior = CreateAlignedLoad(CreateGEP(source, CreateSub(j, n)), alignment);
    377             prior = CreateBitCast(CreateShl(CreateBitCast(prior, shiftTy), offsetVector), blockTy);
    378             Value * value = CreateAlignedLoad(CreateGEP(source, j), alignment);
    379             value = CreateBitCast(CreateLShr(CreateBitCast(value, shiftTy), copiedVector), blockTy);
    380             CreateAlignedStore(CreateOr(value, prior), CreateGEP(target, j), alignment);
     375            Value * Aj = CreateAlignedLoad(CreateGEP(source, CreateSub(j, n)), alignment);
     376            Aj = mvmd_srl(fieldWidth, Aj, carryOffset);
     377            Value * Bj = CreateAlignedLoad(CreateGEP(source, j), alignment);
     378            Bj = mvmd_sll(fieldWidth, Bj, targetOffset);
     379            CreateAlignedStore(CreateOr(Bj, Aj), CreateGEP(target, j), alignment);
    381380            Value * const next_j = CreateAdd(j, ONE);
    382381            j->addIncoming(next_j, streamCopyRemaining);
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.h

    r5856 r5865  
    123123    llvm::Value * getLinearlyWritableItems(const std::string & name, llvm::Value * fromPos, bool reverse = false);
    124124   
    125     void CreateStreamCpy(const std::string & name, llvm::Value * const target, llvm::Value * const targetOffset, llvm::Value * const source, llvm::Value * const sourceOffset, llvm::Value * const itemsToCopy, const unsigned itemAlignment);   
     125    void CreateStreamCpy(const std::string & name, llvm::Value * const target, llvm::Value * const targetOffset, llvm::Value * const source, llvm::Value * const sourceOffset, llvm::Value * const itemsToCopyFromOffset, const unsigned itemAlignment);
    126126
    127127    llvm::BasicBlock * CreateConsumerWait();
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_block_decoder.cpp

    r5864 r5865  
    4343
    4444void LZ4BlockDecoderKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
    45     BasicBlock * entry_block = iBuilder->GetInsertBlock();
     45//    BasicBlock * entry_block = iBuilder->GetInsertBlock();
    4646//    iBuilder->CallPrintInt("block_available", iBuilder->getAvailableItemCount("byteStream"));
    4747    BasicBlock * exit_block = iBuilder->CreateBasicBlock("exit");
    4848
    49     BasicBlock * assert_fail_block = iBuilder->CreateBasicBlock("assert_fail_block");
    50     BasicBlock * real_entry_block = iBuilder->CreateBasicBlock("real_entry_block");
     49//    BasicBlock * assert_fail_block = iBuilder->CreateBasicBlock("assert_fail_block");
     50//    BasicBlock * real_entry_block = iBuilder->CreateBasicBlock("real_entry_block");
    5151
    5252    Value* hasSkipHeader = iBuilder->getScalarField("hasSkipHeader");
     
    152152    }
    153153
    154     Value* LZ4BlockDecoderKernel::appendOutput(const std::unique_ptr<KernelBuilder> & iBuilder, Value* isCompressed, Value* blockStart, Value* blockEnd) {
     154    void LZ4BlockDecoderKernel::appendOutput(const std::unique_ptr<KernelBuilder> & iBuilder, Value* isCompressed, Value* blockStart, Value* blockEnd) {
    155155        // TODO adjust output storing
    156156        this->generateStoreCircularOutput(iBuilder, "isCompressed", iBuilder->getInt1Ty()->getPointerTo(), isCompressed);
     
    176176
    177177    size_t LZ4BlockDecoderKernel::getOutputBufferSize(const unique_ptr<KernelBuilder> &iBuilder, const string& bufferName) {
    178         size_t s = this->getOutputStreamSetBuffer(bufferName)->getBufferBlocks();
     178//        size_t s = this->getOutputStreamSetBuffer(bufferName)->getBufferBlocks();
    179179        return this->getOutputStreamSetBuffer(bufferName)->getBufferBlocks() * iBuilder->getStride();
    180180    }
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_block_decoder.h

    r5864 r5865  
    3434    llvm::Value *generateLoadInput(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value *offset);
    3535
    36     llvm::Value *appendOutput(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value *isCompressed, llvm::Value *blockStart, llvm::Value *blockEnd);
     36    void appendOutput(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value *isCompressed, llvm::Value *blockStart, llvm::Value *blockEnd);
    3737
    3838    void generateStoreCircularOutput(const std::unique_ptr<KernelBuilder> &iBuilder, const std::string& outputBufferName,
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r5857 r5865  
    1414PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width, std::string name)
    1515: MultiBlockKernel(name + "",
    16                   {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", BoundedRate(0, 1)},
     16                  {Binding{kb->getStreamSetTy(), "PDEPmarkerStream"},
    1717                   Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", BoundedRate(0, 1)}},
    18                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet", RateEqualTo("PDEPmarkerStream")}},
     18                  {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
    1919                  {}, {}, {})
    2020, mSwizzleFactor(swizzleFactor)
     
    154154
    155155    kb->SetInsertPoint(terminate);
    156     Value * itemsDone = kb->CreateMul(blockOffsetPhi, blockWidth);
    157     itemsDone = kb->CreateSelect(kb->CreateICmpULT(itemsToDo, itemsDone), itemsToDo, itemsDone);
    158     kb->setProcessedItemCount("PDEPmarkerStream", kb->CreateAdd(itemsDone, kb->getProcessedItemCount("PDEPmarkerStream")));
     156//    Value * itemsDone = kb->CreateMul(blockOffsetPhi, blockWidth);
     157//    itemsDone = kb->CreateSelect(kb->CreateICmpULT(itemsToDo, itemsDone), itemsToDo, itemsDone);
     158//    kb->setProcessedItemCount("PDEPmarkerStream", kb->CreateAdd(itemsDone, kb->getProcessedItemCount("PDEPmarkerStream")));
    159159    kb->setProcessedItemCount("sourceStreamSet", updatedProcessedSourceBitsPhi);
    160160
  • icGREP/icgrep-devel/icgrep/lz4/LZ4Generator.cpp

    r5864 r5865  
    9090
    9191    StreamSetBuffer * const DecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
    92     StreamSetBuffer * const FinalDecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
     92  //  StreamSetBuffer * const FinalDecompressedByteStream = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), this->getDecompressedBufferBlocks());
    9393
    9494
  • icGREP/icgrep-devel/icgrep/toolchain/grep_pipeline.cpp

    r5862 r5865  
    3838
    3939void grepBuffer(re::RE * pattern, const char * search_buffer, size_t bufferLength, MatchAccumulator * accum) {
    40     const unsigned segmentSize = 8;
     40    const unsigned segmentSize = codegen::BufferSegments * codegen::SegmentSize * codegen::ThreadNum;
    4141
    4242    pattern = resolveCaseInsensitiveMode(pattern, false);
     
    101101    pxDriver.LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
    102102   
    103     bool saveSegmentParallel = codegen::SegmentPipelineParallel;
    104     codegen::SegmentPipelineParallel = false;
    105103    pxDriver.generatePipelineIR();
    106104    pxDriver.deallocateBuffers();
     
    111109    auto f = reinterpret_cast<GrepFunctionType>(pxDriver.getMain());
    112110    f(search_buffer, bufferLength);
    113     codegen::SegmentPipelineParallel = saveSegmentParallel;
    114111}
    115112}
  • icGREP/icgrep-devel/icgrep/toolchain/pipeline.cpp

    r5856 r5865  
    3636    using StreamSetBufferMap = flat_map<const StreamSetBuffer *, Value>;
    3737
     38    using CheckMap = flat_map<const Kernel *, std::vector<const StreamSetBuffer *>>;
     39
    3840    using RateValue = ProcessingRate::RateValue;
    3941
     
    5557    Value * executeKernel(const std::unique_ptr<KernelBuilder> & b, const Kernel * const kernel, PHINode * const segNo, Value * const finished);
    5658
     59protected:
     60
     61    Graph makeInputGraph(const std::vector<Kernel *> & kernels);
     62
     63    Graph makeOutputGraph(const std::vector<Kernel *> & kernels);
     64
     65    Graph printGraph(const bool input, Graph && G);
     66
     67    Graph pruneGraph(Graph && G);
     68
     69    void addChecks(Graph && G, CheckMap & M);
     70
    5771    void applyOutputBufferExpansions(const std::unique_ptr<KernelBuilder> & b, const Kernel * kernel);
    5872
    59     void updateProducedAndConsumedCounts(const std::unique_ptr<KernelBuilder> & b, const Kernel * kernel);
    60 
    6173private:
    6274
    63     Graph   G;
    64     Map     M;
     75    CheckMap                            inputAvailabilityChecks;
     76    CheckMap                            outputSpaceChecks;
    6577
    6678    StreamSetBufferMap<Value *>         producedItemCount;
     
    343355    // input from A. Moreover if C is dependent on B, C could be skipped entirely.
    344356
    345     // Note: we cannot simply test the output of A for both B and C. In a our data-parallel
    346     // pipeline A's state may change by the time we process C.
    347 
    348     for (const Kernel * const consumer : kernels) {
    349         const auto v = add_vertex(consumer, G);
     357    // Note: we cannot simply test the output of A for both B and C. In the data-parallel
     358    // pipeline, A's state may change by the time we process C.
     359
     360//    addChecks(printGraph(true, pruneGraph(printGraph(true, makeInputGraph(kernels)))), inputAvailabilityChecks);
     361//    addChecks(printGraph(false, pruneGraph(printGraph(false, makeOutputGraph(kernels)))), outputSpaceChecks);
     362
     363    addChecks(pruneGraph(makeInputGraph(kernels)), inputAvailabilityChecks);
     364    addChecks(pruneGraph(makeOutputGraph(kernels)), outputSpaceChecks);
     365
     366
     367    // iterate through each kernel in order and determine which kernel last used a particular buffer
     368    for (Kernel * const kernel : kernels) {
     369        const auto & inputs = kernel->getStreamInputs();
     370        for (unsigned i = 0; i < inputs.size(); ++i) {
     371            lastConsumer[kernel->getStreamSetInputBuffer(i)] = kernel;
     372        }
     373    }
     374
     375}
     376
     377/** ------------------------------------------------------------------------------------------------------------- *
     378 * @brief makeInputGraph
     379 *
     380 * The input graph models whether a kernel could *consume* more data than may be produced by a preceding kernel.
     381 ** ------------------------------------------------------------------------------------------------------------- */
     382PipelineGenerator::Graph PipelineGenerator::makeInputGraph(const std::vector<Kernel *> & kernels) {
     383
     384    const auto n = kernels.size();
     385    Graph   G(n);
     386    Map     M;
     387
     388    for (Graph::vertex_descriptor v = 0; v < n; ++v) {
     389
     390        const Kernel * const consumer = kernels[v];
    350391        M.emplace(consumer, v);
     392        G[v] = consumer;
     393
    351394        const auto & inputs = consumer->getStreamInputs();
    352395        for (unsigned i = 0; i < inputs.size(); ++i) {
     396
     397            const Binding & input = inputs[i];
     398            auto ub_in = consumer->getUpperBound(input.getRate()) * consumer->getStride(); assert (ub_in > 0);
     399            if (input.hasLookahead()) {
     400                ub_in += input.getLookahead();
     401            }
    353402
    354403            const auto buffer = consumer->getStreamSetInputBuffer(i);
    355404            const Kernel * const producer = buffer->getProducer();
    356405            const Binding & output = producer->getStreamOutput(buffer);
    357             if (output.getRate().isRelative()) continue;
    358 
    359             const Binding & input = inputs[i];
    360             auto ub_in = consumer->getUpperBound(input.getRate()) * consumer->getStride();
    361             if (input.hasLookahead()) {
    362                 ub_in += input.getLookahead();
    363             }
    364 
    365406            const auto lb_out = producer->getLowerBound(output.getRate()) * producer->getStride();
    366407
     
    388429        }
    389430    }
     431    return G;
     432}
     433
     434/** ------------------------------------------------------------------------------------------------------------- *
     435 * @brief makeOutputGraph
     436 *
     437 * The output graph models whether a kernel could *produce* more data than may be consumed by a subsequent kernel.
     438 ** ------------------------------------------------------------------------------------------------------------- */
     439PipelineGenerator::Graph PipelineGenerator::makeOutputGraph(const std::vector<Kernel *> & kernels) {
     440
     441    const auto n = kernels.size();
     442    Graph   G(n);
     443    Map     M;
     444
     445    for (Graph::vertex_descriptor i = 0; i < n; ++i) {
     446        const Kernel * const consumer = kernels[i];
     447        const auto v = n - i - 1;
     448        M.emplace(consumer, v);
     449        G[v] = consumer;
     450
     451        const auto & inputs = consumer->getStreamInputs();
     452        for (unsigned i = 0; i < inputs.size(); ++i) {
     453            const auto buffer = consumer->getStreamSetInputBuffer(i);
     454            if (isa<SourceBuffer>(buffer)) continue;
     455            const Kernel * const producer = buffer->getProducer();
     456            const Binding & output = producer->getStreamOutput(buffer);
     457            auto ub_out = producer->getUpperBound(output.getRate()) * producer->getStride();
     458            if (LLVM_UNLIKELY(ub_out > 0)) { // unknown output rates are handled by reallocating their buffers
     459                const Binding & input = inputs[i];
     460                if (input.hasLookahead()) {
     461                    ub_out -= input.getLookahead();
     462                }
     463                const auto lb_in = consumer->getLowerBound(input.getRate()) * consumer->getStride();
     464                const auto inverseRate = lb_in / ub_out;
     465                const auto f = M.find(producer); assert (f != M.end());
     466                const auto u = f->second;
     467                // If we have multiple inputs from the same kernel, we only need to consider the "fastest" one
     468                bool fastest = true;
     469                if (ub_out > 0) {
     470                    for (const auto e : make_iterator_range(in_edges(v, G))) {
     471                        if (source(e, G) == u) {
     472                            Channel & p = G[e];
     473                            fastest = false;
     474                            if (inverseRate < p.rate) {
     475                                p.rate = inverseRate;
     476                                p.buffer = buffer;
     477                            }
     478                            break;
     479                        }
     480                    }
     481                }
     482                if (fastest) {
     483                    add_edge(v, u, Channel{inverseRate, buffer}, G);
     484                }
     485            }
     486        }
     487    }
     488    return G;
     489}
     490
     491/** ------------------------------------------------------------------------------------------------------------- *
     492 * @brief printGraph
     493 ** ------------------------------------------------------------------------------------------------------------- */
     494PipelineGenerator::Graph PipelineGenerator::printGraph(const bool input, Graph && G) {
     495
     496    auto & out = errs();
     497
     498    out << "digraph " << (input ? "I" : "O") << " {\n";
     499    for (auto u : make_iterator_range(vertices(G))) {
     500        out << "v" << u << " [label=\"" << u << ": "
     501            << G[u]->getName() << "\"];\n";
     502    }
     503
     504    for (auto e : make_iterator_range(edges(G))) {
     505        const Channel & c = G[e];
     506        const auto s = source(e, G);
     507        const auto t = target(e, G);
     508        const Kernel * const S = G[input ? s : t];
     509        const Kernel * const T = G[input ? t : s];
     510
     511        out << "v" << s << " -> v" << t
     512            << " [label=\"";
     513
     514        if (c.buffer) {
     515            out << S->getStreamOutput(c.buffer).getName()
     516                << " -> "
     517                << T->getStreamInput(c.buffer).getName()
     518                << "   ";
     519        }
     520
     521        out << c.rate.numerator() << " / " << c.rate.denominator()
     522            << "\"];\n";
     523    }
     524
     525    out << "}\n\n";
     526    out.flush();
     527
     528    return G;
     529}
     530
     531/** ------------------------------------------------------------------------------------------------------------- *
     532 * @brief pruneGraph
     533 ** ------------------------------------------------------------------------------------------------------------- */
     534PipelineGenerator::Graph PipelineGenerator::pruneGraph(Graph && G) {
    390535
    391536    // Take a transitive closure of G but whenever we attempt to insert an edge into the closure
    392537    // that already exists, check instead whether the rate of our proposed edge is <= the existing
    393     // edge's rate. If so, the data availability is transitively guaranteed.
     538    // edge's rate. If so, the data availability/consumption is transitively guaranteed.
    394539    for (const auto u : make_iterator_range(vertices(G))) {
    395540        for (auto ei : make_iterator_range(in_edges(u, G))) {
    396541            const auto v = source(ei, G);
    397             const Channel & pu = G[ei];           
    398             for (auto ej : make_iterator_range(out_edges(u, G))) {               
     542            const Channel & pu = G[ei];
     543            for (auto ej : make_iterator_range(out_edges(u, G))) {
    399544                const auto w = target(ej, G);
    400545                const auto ratio = RateValue(G[u]->getStride(), G[w]->getStride());
     
    404549                    if (source(ek, G) == v) {
    405550                        Channel & pw = G[ek];
    406                         if (rate <= pw.rate && pw.rate > 0) {
     551                        if (rate <= pw.rate) {
    407552                            pw.buffer = nullptr;
    408553                        }
     
    419564
    420565    // remove any closure edges from G
    421     remove_edge_if([&](const Graph::edge_descriptor e) { return G[e].buffer == nullptr; }, G);
    422 
    423     // If a kernel has no 'necessary to check' inputs then we can remove every output with a rate >= 1 from G
     566    remove_edge_if([&G](const Graph::edge_descriptor e) { return G[e].buffer == nullptr; }, G);
     567
     568    // For any kernel, if we do not need to check any of its inputs, we can avoid checking any of its
     569    // outputs that have a rate >= 1 (i.e., its production rates >= consumption rates.)
    424570    for (const auto u : make_iterator_range(vertices(G))) {
    425571        if (in_degree(u, G) == 0) {
    426             remove_out_edge_if(u, [&](const Graph::edge_descriptor e) { return G[e].rate >= RateValue{1, 1}; }, G);
    427         }
    428     }
    429 
    430     // iterate through each kernel in order and determine which kernel last used a particular buffer
    431     for (Kernel * const kernel : kernels) {
    432         const auto & inputs = kernel->getStreamInputs();
    433         for (unsigned i = 0; i < inputs.size(); ++i) {
    434             lastConsumer[kernel->getStreamSetInputBuffer(i)] = kernel;
    435         }
    436     }
    437 
     572            remove_out_edge_if(u, [&G](const Graph::edge_descriptor e) { return G[e].rate >= RateValue{1, 1}; }, G);
     573        }
     574    }
     575
     576    return G;
     577}
     578
     579/** ------------------------------------------------------------------------------------------------------------- *
     580 * @brief addChecks
     581 ** ------------------------------------------------------------------------------------------------------------- */
     582void PipelineGenerator::addChecks(Graph && G, CheckMap & M) {
     583    for (const auto u : make_iterator_range(vertices(G))) {
     584        if (LLVM_LIKELY(in_degree(u, G) == 0)) continue;
     585        flat_set<const StreamSetBuffer *> B;
     586        for (auto e : make_iterator_range(in_edges(u, G))) {
     587            B.insert(G[e].buffer);
     588        }
     589        M.emplace(G[u], std::vector<const StreamSetBuffer *>{B.begin(), B.end()});
     590    }
    438591}
    439592
     
    441594 * @brief executeKernel
    442595 ** ------------------------------------------------------------------------------------------------------------- */
    443 Value *PipelineGenerator::executeKernel(const std::unique_ptr<KernelBuilder> & b, const Kernel * const kernel, PHINode * const segNo, Value * const finished) {
     596Value * PipelineGenerator::executeKernel(const std::unique_ptr<KernelBuilder> & b, const Kernel * const kernel, PHINode * const segNo, Value * const finished) {
    444597
    445598    const auto & inputs = kernel->getStreamInputs();
    446599
    447600    std::vector<Value *> args(2 + inputs.size());
    448 
    449     const auto f = M.find(kernel); assert (f != M.end());
    450     const auto u = f->second;
    451601
    452602    BasicBlock * const kernelEntry = b->GetInsertBlock();
    453603    BasicBlock * const kernelCode = b->CreateBasicBlock(kernel->getName());
    454     BasicBlock * const kernelExit = b->CreateBasicBlock(kernel->getName() + "_exit");
     604    BasicBlock * const kernelFinished = b->CreateBasicBlock(kernel->getName() + "Finished");
     605    BasicBlock * const kernelExit = b->CreateBasicBlock(kernel->getName() + "Exit");
    455606
    456607    b->CreateUnlikelyCondBr(b->getTerminationSignal(), kernelExit, kernelCode);
     608
     609    b->SetInsertPoint(kernelFinished);
     610    PHINode * const final = b->CreatePHI(b->getInt1Ty(), 2);
    457611
    458612    b->SetInsertPoint(kernelExit);
    459613    PHINode * const terminated = b->CreatePHI(b->getInt1Ty(), 2);
    460     // Since our initial "isFinal" state is equal to what the first kernel's termination signal state
     614    // The initial "isFinal" state is equal to the first kernel's termination signal state
    461615    terminated->addIncoming(finished ? finished : b->getTrue(), kernelEntry);
    462616    Value * isFinal = finished ? finished : b->getFalse();
    463617
     618    // Since it is possible that a sole consumer of some stream could terminate early, set the
     619    // initial consumed amount to the amount produced in this iteration.
     620    std::vector<PHINode *> consumedItemCountPhi(inputs.size());
     621    std::vector<Value *> priorConsumedItemCount(inputs.size());
     622    for (unsigned i = 0; i < inputs.size(); ++i) {
     623        const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
     624        auto c = consumedItemCount.find(buffer);
     625        PHINode * const consumedPhi = b->CreatePHI(b->getSizeTy(), 2);
     626        Value * consumed = nullptr;
     627        if (c == consumedItemCount.end()) {
     628            const auto p = producedItemCount.find(buffer);
     629            assert (p != producedItemCount.end());
     630            consumed = p->second;
     631            consumedItemCount.emplace(buffer, consumedPhi);
     632        } else {
     633            consumed = c->second;
     634            c->second = consumedPhi;
     635        }
     636        consumedPhi->addIncoming(consumed, kernelEntry);
     637        consumedItemCountPhi[i] = consumedPhi;
     638        priorConsumedItemCount[i] = consumed;
     639    }
     640
    464641    b->SetInsertPoint(kernelCode);
     642
     643    // Check for sufficient output space
     644    const auto O = outputSpaceChecks.find(kernel);
     645    if (O != outputSpaceChecks.end()) {
     646        for (const StreamSetBuffer * buffer : O->second) {
     647
     648
     649            const Binding & output = kernel->getStreamOutput(buffer);
     650            const auto name = output.getName();
     651            BasicBlock * const sufficient = b->CreateBasicBlock(name + "HasOutputSpace");
     652            const auto ub = kernel->getUpperBound(output.getRate()); assert (ub > 0);
     653            Constant * const strideLength = b->getSize(ceiling(ub * kernel->getStride()));
     654            Value * const produced = b->getProducedItemCount(name);
     655            Value * const consumed = b->getConsumedItemCount(name);
     656            Value * const unused = b->CreateSub(produced, consumed);
     657            Value * const potentialData = b->CreateAdd(unused, strideLength);
     658            Value * const capacity = b->getBufferedSize(name);
     659
     660          //  b->CallPrintInt("< " + kernel->getName() + "_" + name + "_potential", potentialData);
     661          //  b->CallPrintInt("< " + kernel->getName() + "_" + name + "_capacity", capacity);
     662
     663            Value * const hasSufficientSpace = b->CreateICmpULE(potentialData, capacity);
     664
     665          //  b->CallPrintInt("* < " + kernel->getName() + "_" + name + "_sufficientSpace", hasSufficientSpace);
     666
     667            final->addIncoming(b->getFalse(), b->GetInsertBlock());
     668            b->CreateLikelyCondBr(hasSufficientSpace, sufficient, kernelFinished);
     669            b->SetInsertPoint(sufficient);
     670        }
     671    }
     672
    465673    for (unsigned i = 0; i < inputs.size(); ++i) {
    466 
    467674        const Binding & input = inputs[i];
    468 
    469675        const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
    470 
    471676        const auto name = input.getName();
    472677
     
    476681        }
    477682        Value * const produced = p->second;
     683
     684      //  b->CallPrintInt("< " + kernel->getName() + "_" + name + "_produced", produced);
     685
    478686        const auto ub = kernel->getUpperBound(input.getRate()); assert (ub > 0);
    479687        const auto strideLength = ceiling(ub * kernel->getStride()) ;
    480688        Constant * const segmentLength = b->getSize(strideLength * codegen::SegmentSize);
    481689
    482         if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     690        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts) && !isa<SourceBuffer>(buffer))) {
    483691            b->CreateAssert(b->CreateICmpULE(segmentLength, b->getCapacity(name)),
    484692                            kernel->getName() + ": " + name + " upper bound of segment length exceeds buffer capacity");
    485693        }
    486694
    487         Value * limit = nullptr;
    488         if (input.getRate().isFixed()) {
    489             // if the input is deferred, simply adding length to the processed item count may result in setting a limit
    490             // that is too low for. instead just calculate the limit of all fixed rates from the segment no.
    491             limit = b->CreateMul(b->CreateAdd(segNo, b->getSize(1)), segmentLength);
    492         } else {
    493             Value * const processed = b->getProcessedItemCount(name);
    494             limit = b->CreateAdd(processed, segmentLength);
    495         }
     695//        Value * limit = nullptr;
     696//        if (input.getRate().isFixed()) {
     697//            // if the input is deferred, simply adding length to the processed item count may result in setting a limit
     698//            // that is too low for. instead just calculate the limit of all fixed rates from the segment no.
     699//            limit = b->CreateMul(b->CreateAdd(segNo, b->getSize(1)), segmentLength);
     700//        } else {
     701//            Value * const processed = b->getProcessedItemCount(name);
     702//            limit = b->CreateAdd(processed, segmentLength);
     703//        }
     704
     705        // If the input is deferred, simply adding the segment length to the processed item count may result in
     706        // setting a limit that is too low. Instead, just calculate the limit for every rate (not only fixed rates) from the segment no.
     707        Value * const limit = b->CreateMul(b->CreateAdd(segNo, b->getSize(1)), segmentLength);
     708
     709     //  b->CallPrintInt("< " + kernel->getName() + "_" + name + "_limit", limit);
    496710
    497711        // TODO: currently, if we produce the exact amount as our limit states, we will have to process one additional segment
     
    502716        args[i + 2] = b->CreateSelect(consumingAll, produced, limit);
    503717        isFinal = b->CreateAnd(isFinal, consumingAll);
    504 
    505         // Check for available input (if it's both computable and not guaranteed to be sufficient by the processing rates)
    506         for (auto e : make_iterator_range(in_edges(u, G))) {
    507             const auto p = G[e];
    508             if (p.buffer == buffer) {
    509                 BasicBlock * const sufficient = b->CreateBasicBlock(name + "_hasSufficientData");
    510 
    511                 Constant * const sl = b->getSize(strideLength);
    512 
    513                 Value * remaining = nullptr;
    514                 if (input.getRate().isFixed()) {
    515                     remaining = b->CreateMul(segNo, sl);
    516                 } else {
    517                     remaining = b->getProcessedItemCount(name);
    518                 }
    519                 remaining = b->CreateSub(produced, remaining);
    520 
    521                 Value * const hasSufficientData = b->CreateOr(b->CreateICmpUGE(remaining, sl), isFinal);
    522                 terminated->addIncoming(b->getFalse(), b->GetInsertBlock());
    523                 b->CreateLikelyCondBr(hasSufficientData, sufficient, kernelExit);
    524                 b->SetInsertPoint(sufficient);
    525             }
     718    }
     719
     720    // Check for available input
     721    const auto I = inputAvailabilityChecks.find(kernel);
     722    if (I != inputAvailabilityChecks.end()) {
     723        for (const StreamSetBuffer * buffer : I->second) {
     724            const Binding & input = kernel->getStreamInput(buffer);
     725            const auto name = input.getName();
     726            BasicBlock * const sufficient = b->CreateBasicBlock(name + "HasInputData");
     727            const auto ub = kernel->getUpperBound(input.getRate()); assert (ub > 0);
     728            Constant * const strideLength = b->getSize(ceiling(ub * kernel->getStride()));
     729            Value * const processed = b->getProcessedItemCount(name);
     730//            if (input.getRate().isFixed()) {
     731//                processed = b->CreateMul(segNo, strideLength);
     732//            } else {
     733//                processed = b->getProcessedItemCount(name);
     734//            }
     735            const auto p = producedItemCount.find(buffer);
     736            assert (p != producedItemCount.end());
     737            Value * const produced = p->second;
     738            Value * const unprocessed = b->CreateSub(produced, processed);
     739
     740          //  b->CallPrintInt("< " + kernel->getName() + "_" + name + "_unprocessed", unprocessed);
     741
     742            Value * const hasSufficientData = b->CreateOr(b->CreateICmpUGE(unprocessed, strideLength), isFinal);
     743
     744          //  b->CallPrintInt("* < " + kernel->getName() + "_" + name + "_sufficientData", hasSufficientData);
     745
     746            final->addIncoming(b->getFalse(), b->GetInsertBlock());
     747            b->CreateLikelyCondBr(hasSufficientData, sufficient, kernelFinished);
     748            b->SetInsertPoint(sufficient);
    526749        }
    527750    }
     
    538761    }
    539762    b->setTerminationSignal(isFinal);
    540 //    b->CallPrintInt(kernel->getName() + "_finished", isFinal);
    541     BasicBlock * const kernelFinished = b->GetInsertBlock();
     763  //  b->CallPrintInt(kernel->getName() + "_finished", isFinal);
     764    final->addIncoming(isFinal, b->GetInsertBlock());
     765    b->CreateBr(kernelFinished);
     766
     767    b->SetInsertPoint(kernelFinished);
     768
     769    // update the consumed item counts
     770    for (unsigned i = 0; i < inputs.size(); ++i) {
     771        Value * const processed = b->getProcessedItemCount(inputs[i].getName());
     772      //  b->CallPrintInt("> " + kernel->getName() + "_" + inputs[i].getName() + "_processed", processed);
     773        Value * const consumed = b->CreateUMin(priorConsumedItemCount[i], processed);
     774        consumedItemCountPhi[i]->addIncoming(consumed, kernelFinished);
     775    }
     776    b->CreateBr(kernelExit);
     777
    542778    kernelExit->moveAfter(kernelFinished);
    543     b->CreateBr(kernelExit);
    544779
    545780    b->SetInsertPoint(kernelExit);
    546     terminated->addIncoming(isFinal, kernelFinished);
    547 
    548     updateProducedAndConsumedCounts(b, kernel);
     781    terminated->addIncoming(final, kernelFinished);
     782
     783
     784    // If this kernel is the last consumer of an input buffer, update the consumed count for that buffer.
     785    // NOTE: unless we can prove that this kernel cannot terminate before any prior consumer, we cannot
     786    // put this code into the kernelFinished block.
     787    for (unsigned i = 0; i < inputs.size(); ++i) {
     788        const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
     789        const auto c = lastConsumer.find(buffer);
     790        assert (c != lastConsumer.end());
     791        if (c->second == kernel) {
     792            Kernel * const producer = buffer->getProducer();
     793            const auto & output = producer->getStreamOutput(buffer);
     794            if (output.getRate().isRelative()) continue;
     795
     796           // b->CallPrintInt("* " + producer->getName() + "_" + output.getName() + "_consumed", consumedItemCountPhi[i]);
     797
     798            b->setKernel(producer);
     799            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     800                Value * const alreadyConsumed = b->getConsumedItemCount(output.getName());
     801                b->CreateAssert(b->CreateICmpULE(alreadyConsumed, consumedItemCountPhi[i]),
     802                                producer->getName() + ": " + output.getName() + " consumed item count is not monotonically non-decreasing!");
     803            }
     804            b->setConsumedItemCount(output.getName(), consumedItemCountPhi[i]);
     805            b->setKernel(kernel);
     806        }
     807    }
     808
     809    const auto & outputs = kernel->getStreamOutputs();
     810    for (unsigned i = 0; i < outputs.size(); ++i) {
     811        Value * const produced = b->getProducedItemCount(outputs[i].getName());
     812
     813       // b->CallPrintInt("> " + kernel->getName() + "_" + outputs[i].getName() + "_produced", produced);
     814
     815        const StreamSetBuffer * const buffer = kernel->getStreamSetOutputBuffer(i);
     816        assert (producedItemCount.count(buffer) == 0);
     817        producedItemCount.emplace(buffer, produced);
     818    }
    549819
    550820    return terminated;
    551821}
     822
    552823
    553824/** ------------------------------------------------------------------------------------------------------------- *
     
    588859}
    589860
    590 /** ------------------------------------------------------------------------------------------------------------- *
    591  * @brief updateProducedAndConsumedCounts
    592  ** ------------------------------------------------------------------------------------------------------------- */
    593 void PipelineGenerator::updateProducedAndConsumedCounts(const std::unique_ptr<KernelBuilder> & b, const Kernel * kernel) {
    594 
    595     const auto & inputs = kernel->getStreamInputs();
    596     for (unsigned i = 0; i < inputs.size(); ++i) {
    597         Value * const processed = b->getProcessedItemCount(inputs[i].getName());
    598 
    599         const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
    600         auto f = consumedItemCount.find(buffer);
    601         Value * consumed = processed;
    602         if (f == consumedItemCount.end()) {
    603             consumedItemCount.emplace(buffer, consumed);
    604         } else {
    605             consumed = b->CreateUMin(consumed, f->second);
    606             f->second = consumed;
    607         }
    608 
    609         // If this kernel is the last consumer of a input buffer, update the consumed count for that buffer.
    610         const auto c = lastConsumer.find(buffer);
    611         assert (c != lastConsumer.end());
    612         if (c->second == kernel) {
    613             Kernel * const producer = buffer->getProducer();
    614             const auto & output = producer->getStreamOutput(buffer);
    615             if (output.getRate().isRelative()) continue;
    616             b->setKernel(producer);
    617 
    618             b->setConsumedItemCount(output.getName(), consumed);
    619             b->setKernel(kernel);
    620         }
    621     }
    622 
    623     const auto & outputs = kernel->getStreamOutputs();
    624     for (unsigned i = 0; i < outputs.size(); ++i) {
    625         Value * const produced = b->getProducedItemCount(outputs[i].getName());
    626         const StreamSetBuffer * const buf = kernel->getStreamSetOutputBuffer(i);
    627         assert (producedItemCount.count(buf) == 0);
    628         producedItemCount.emplace(buf, produced);
    629     }
    630 
    631 }
    632 
    633 
Note: See TracChangeset for help on using the changeset viewer.