Changeset 6228 for icGREP


Ignore:
Timestamp:
Dec 14, 2018, 2:28:41 PM (5 months ago)
Author:
nmedfort
Message:

redesign of PopCount? calculation + mem leak fix

Location:
icGREP/icgrep-devel/icgrep
Files:
1 added
37 edited
1 copied

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/CBuilder.cpp

    r6193 r6228  
    255255    Module * const m = getModule();
    256256    Function * mkstempFn = m->getFunction("mkstemp");
    257     if (mkstempFn == nullptr) {       
     257    if (mkstempFn == nullptr) {
    258258        FunctionType * const fty = FunctionType::get(getInt32Ty(), {getInt8PtrTy()}, false);
    259259        mkstempFn = Function::Create(fty, Function::ExternalLinkage, "mkstemp", m);
     
    319319        name->setName("name");
    320320        Value * value = &*arg;
    321         value->setName("value");       
     321        value->setName("value");
    322322        std::vector<Value *> args(4);
    323323        args[0] = fdInt;
     
    341341Value * CBuilder::CreateMalloc(Value * size) {
    342342    Module * const m = getModule();
    343     IntegerType * const sizeTy = getSizeTy();   
     343    IntegerType * const sizeTy = getSizeTy();
    344344    Function * f = m->getFunction("malloc");
    345345    if (f == nullptr) {
     
    419419}
    420420
    421 Value * CBuilder::CreateRealloc(Value * const ptr, Value * const size) {
     421Value * CBuilder::CreateRealloc(llvm::Type * const type, llvm::Value * const base, llvm::Value * const ArraySize) {
     422    Value * size = ConstantExpr::getSizeOf(type);
     423    if (ArraySize) {
     424        size = CreateMul(size, CreateZExtOrTrunc(ArraySize, size->getType()));
     425    }
     426    return CreatePointerCast(CreateRealloc(base, size), type->getPointerTo());
     427}
     428
     429Value * CBuilder::CreateRealloc(Value * const base, Value * const size) {
     430    assert ("Ptr is not a pointer type" && base->getType()->isPointerTy());
     431    assert ("Size is not an integer" && size->getType()->isIntegerTy());
    422432    Module * const m = getModule();
    423433    IntegerType * const sizeTy = getSizeTy();
     
    429439        f->setCallingConv(CallingConv::C);
    430440        f->setReturnDoesNotAlias();
    431 #if LLVM_VERSION_INTEGER < LLVM_VERSION_CODE(5, 0, 0)
     441        #if LLVM_VERSION_INTEGER < LLVM_VERSION_CODE(5, 0, 0)
    432442        f->setDoesNotAlias(1);
    433 #endif
    434     }
    435     CallInst * const ci = CreateCall(f, {CreatePointerCast(ptr, voidPtrTy), CreateZExtOrTrunc(size, sizeTy)});
    436     return CreatePointerCast(ci, ptr->getType());
     443        #endif
     444    }
     445    CallInst * const ci = CreateCall(f, {CreatePointerCast(base, voidPtrTy), CreateZExtOrTrunc(size, sizeTy)});
     446    return CreatePointerCast(ci, base->getType());
    437447}
    438448
     
    470480    ConstantInt * const prot =  ConstantInt::get(intTy, PROT_READ);
    471481    ConstantInt * const flags =  ConstantInt::get(intTy, MAP_PRIVATE);
    472     Constant * const offset = ConstantInt::get(sizeTy, 0);       
     482    Constant * const offset = ConstantInt::get(sizeTy, 0);
    473483    return CreateMMap(ConstantPointerNull::getNullValue(voidPtrTy), size, prot, flags, fd, offset);
    474484}
     
    665675    inst->setOrdering(AtomicOrdering::Acquire);
    666676    return inst;
    667    
     677
    668678}
    669679
     
    811821}
    812822
    813 void __report_failure(const char * msg, const uintptr_t * trace, const uint32_t n) {
     823void __report_failure(const char * name, const char * msg, const uintptr_t * trace, const uint32_t n) {
    814824    // TODO: look into boost stacktrace, available from version 1.65
    815825    raw_fd_ostream out(STDERR_FILENO, false);
     
    843853        out << trace_string.str();
    844854    }
     855    if (name) {
     856        out.changeColor(raw_fd_ostream::RED, true);
     857        out << name << ": ";
     858    }
    845859    out.changeColor(raw_fd_ostream::WHITE, true);
    846     out << msg << "\n";   
     860    out << msg << "\n";
    847861    if (trace == nullptr) {
    848862        out.changeColor(raw_fd_ostream::WHITE, true);
     
    907921    uintptr_t stacktop = (uintptr_t)(pthread_get_stackaddr_np(self));
    908922    uintptr_t stackbot = stacktop - (uintptr_t)(pthread_get_stacksize_np(self));
    909    
     923
    910924    *nb = 0;
    911    
     925
    912926    /* make sure return address is never out of bounds */
    913927    stacktop -= (FP_LINK_OFFSET + 1) * sizeof(void *);
    914    
     928
    915929    /*
    916930     * The original implementation called the first_frame_address() function,
     
    967981            auto ip = saveIP();
    968982            IntegerType * const int1Ty = getInt1Ty();
    969             FunctionType * fty = FunctionType::get(getVoidTy(), { int1Ty, int8PtrTy, stackPtrTy, getInt32Ty() }, false);
     983            FunctionType * fty = FunctionType::get(getVoidTy(), { int1Ty, int8PtrTy, int8PtrTy, stackPtrTy, getInt32Ty() }, false);
    970984            function = Function::Create(fty, Function::PrivateLinkage, "assert", m);
    971985            function->setDoesNotThrow();
     
    979993            arg->setName("assertion");
    980994            Value * assertion = &*arg++;
     995            arg->setName("name");
     996            Value * name = &*arg++;
    981997            arg->setName("msg");
    982998            Value * msg = &*arg++;
     
    9881004            IRBuilder<>::CreateCondBr(assertion, success, failure);
    9891005            IRBuilder<>::SetInsertPoint(failure);
    990             IRBuilder<>::CreateCall(LinkFunction("__report_failure", __report_failure), { msg, trace, depth });
     1006            IRBuilder<>::CreateCall(LinkFunction("__report_failure", __report_failure), { name, msg, trace, depth });
    9911007            CreateExit(-1);
    9921008            IRBuilder<>::CreateBr(success); // necessary to satisfy the LLVM verifier. this is never executed.
     
    10741090        Value * depth = getInt32(0);
    10751091        #endif
     1092        Value * const name = GetString(getKernelName());
    10761093        SmallVector<char, 1024> tmp;
    1077         IRBuilder<>::CreateCall(function, {assertion, GetString(failureMessage.toStringRef(tmp)), trace, depth});
     1094        Value * const msg = GetString(failureMessage.toStringRef(tmp));
     1095        IRBuilder<>::CreateCall(function, {assertion, name, msg, trace, depth});
    10781096    } else { // if assertions are not enabled, make it a compiler assumption.
    10791097
     
    11241142        CreateAssert(value, "CreateCountForwardZeroes: value cannot be zero!");
    11251143    }
    1126     Value * cttzFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::cttz, value->getType());   
     1144    Value * cttzFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::cttz, value->getType());
    11271145    return CreateCall(cttzFunc, {value, getInt1(guaranteedNonZero)});
    11281146}
     
    12081226    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    12091227        CHECK_ADDRESS(Ptr, ConstantExpr::getSizeOf(Ptr->getType()->getPointerElementType()), "CreateLoad");
    1210     }   
     1228    }
    12111229    return IRBuilder<>::CreateLoad(Ptr, isVolatile, Name);
    12121230}
    12131231
    12141232StoreInst * CBuilder::CreateStore(Value * Val, Value * Ptr, bool isVolatile) {
    1215     assert (Val->getType() == Ptr->getType()->getPointerElementType());
     1233    assert ("Ptr is not a pointer type for Val" &&
     1234            Ptr->getType()->isPointerTy() && Val->getType() == Ptr->getType()->getPointerElementType());
    12161235    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    12171236        CHECK_ADDRESS(Ptr, ConstantExpr::getSizeOf(Val->getType()), "CreateStore");
  • icGREP/icgrep-devel/icgrep/IR_Gen/CBuilder.h

    r6189 r6228  
    2929
    3030    CBuilder(llvm::LLVMContext & C);
    31    
     31
    3232    virtual ~CBuilder() {}
    3333
     
    4848        ClearInsertionPoint();
    4949    }
    50    
     50
    5151    // UDiv and URem with optimization for division by power-of-2 constants
    5252    llvm::Value * CreateUDiv(llvm::Value * number, llvm::Value * divisor, const llvm::Twine &Name = "");
     
    6060        return CreateCeilUDiv(number, divisor, Name);
    6161    }
    62    
     62
    6363    // Round up to a multiple of divisor.
    6464    llvm::Value * CreateRoundUp(llvm::Value * number, llvm::Value * divisor, const llvm::Twine &Name = "");
    65            
     65
    6666    // Get minimum of two unsigned numbers
    6767    llvm::Value * CreateUMin(llvm::Value * const a, llvm::Value * const b, const llvm::Twine &Name = "") {
     
    108108    void CreateFree(llvm::Value * const ptr);
    109109
    110     llvm::Value * CreateRealloc(llvm::Value * const ptr, llvm::Value * const size);
     110    llvm::Value * CreateRealloc(llvm::Type * const type, llvm::Value * const base, llvm::Value * const ArraySize);
     111
     112    llvm::Value * CreateRealloc(llvm::Value * const base, llvm::Value * const size);
    111113
    112114    llvm::CallInst * CreateMemZero(llvm::Value * const ptr, llvm::Value * const size, const unsigned alignment = 1) {
     
    117119
    118120    llvm::CallInst * CreateMemCmp(llvm::Value * ptr1, llvm::Value * ptr2, llvm::Value * num);
    119    
     121
    120122    llvm::AllocaInst * CreateAlignedAlloca(llvm::Type * const Ty, const unsigned alignment, llvm::Value * const ArraySize = nullptr) {
    121123        llvm::AllocaInst * const alloc = CreateAlloca(Ty, ArraySize);
     
    140142    //  Create a call to:  int remove(const char *path);
    141143    llvm::Value * CreateRemoveCall(llvm::Value * path);
    142    
     144
    143145    //  Create a call to:  int rename(const char *old, const char *new);
    144146    llvm::Value * CreateRenameCall(llvm::Value * oldName, llvm::Value * newName);
    145    
     147
    146148    llvm::Function * GetPrintf();
    147149
     
    167169    //  Create a call to:  int mkstemp (char *template);
    168170    llvm::Value * CreateMkstempCall(llvm::Value * ftemplate);
    169    
     171
    170172    //  Create a call to:  size_t strlen(const char *str);
    171173    llvm::Value * CreateStrlenCall(llvm::Value * str);
    172    
     174
    173175    llvm::Value * CreateAnonymousMMap(llvm::Value * size);
    174176
     
    212214    //  Create a call to:  int pthread_yield(void);
    213215    llvm::Value * CreatePThreadYield();
    214    
     216
    215217    //  Create a call to:  void pthread_exit(void *value_ptr);
    216218    llvm::Value * CreatePThreadExitCall(llvm::Value * value_ptr);
    217    
     219
    218220    //  Create a call to:  int pthread_join(pthread_t thread, void **value_ptr);
    219221    llvm::Value * CreatePThreadJoinCall(llvm::Value * thread, llvm::Value * value_ptr);
     
    228230
    229231    void CallPrintInt(llvm::StringRef name, llvm::Value * const value, const STD_FD fd = STD_FD::STD_ERR);
    230        
     232
    231233    llvm::Value * GetString(llvm::StringRef Str);
    232234
     
    235237        return mSizeType;
    236238    }
    237    
     239
    238240    inline llvm::ConstantInt * LLVM_READNONE getSize(const size_t value) {
    239241        return llvm::ConstantInt::get(getSizeTy(), value);
    240242    }
    241    
     243
    242244    llvm::IntegerType * LLVM_READNONE getIntAddrTy() const;
    243    
     245
    244246    llvm::PointerType * LLVM_READNONE getVoidPtrTy(const unsigned AddressSpace = 0) const;
    245    
     247
    246248    llvm::PointerType * LLVM_READNONE getFILEptrTy();
    247    
     249
    248250    inline unsigned getCacheAlignment() const {
    249251        return mCacheLineAlignment;
     
    251253
    252254    static LLVM_READNONE unsigned getPageSize();
    253    
     255
    254256    virtual llvm::LoadInst* CreateAtomicLoadAcquire(llvm::Value * ptr);
    255257
     
    281283    llvm::Value * CreateCountForwardZeroes(llvm::Value * value, const bool guaranteedNonZero = false);
    282284    llvm::Value * CreateCountReverseZeroes(llvm::Value * value, const bool guaranteedNonZero = false);
    283    
    284     // Useful bit manipulation operations 
    285     llvm::Value * CreateResetLowestBit(llvm::Value * bits);   
    286    
     285
     286    // Useful bit manipulation operations
     287    llvm::Value * CreateResetLowestBit(llvm::Value * bits);
     288
    287289    llvm::Value * CreateIsolateLowestBit(llvm::Value * bits);
    288    
     290
    289291    llvm::Value * CreateMaskToLowestBitInclusive(llvm::Value * bits);
    290    
     292
    291293    llvm::Value * CreateMaskToLowestBitExclusive(llvm::Value * bits);
    292    
     294
    293295    llvm::Value * CreateExtractBitField(llvm::Value * bits, llvm::Value * start, llvm::Value * length);
    294    
     296
    295297    llvm::Value * CreateCeilLog2(llvm::Value * value);
    296    
     298
    297299    llvm::Value * CreateReadCycleCounter();
    298300
     
    357359                           llvm::MDNode *ScopeTag = nullptr,
    358360                           llvm::MDNode *NoAliasTag = nullptr);
    359    
     361
    360362    llvm::Value * CreateExtractElement(llvm::Value *Vec, llvm::Value *Idx, const llvm::Twine &Name = "");
    361363
     
    384386
    385387    bool hasAddressSanitizer() const;
     388
     389    virtual std::string getKernelName() const = 0;
    386390
    387391    void __CreateAssert(llvm::Value * assertion, const llvm::Twine & failureMessage);
     
    393397    llvm::IntegerType * const       mSizeType;
    394398    llvm::StructType *              mFILEtype;
    395     BaseDriver *                        mDriver;
     399    BaseDriver *                    mDriver;
    396400    llvm::LLVMContext               mContext;
    397401    const std::string               mTriple;
  • icGREP/icgrep-devel/icgrep/UCD/PropertyObjects.cpp

    r6184 r6228  
    276276        return mY;
    277277    }
    278     if (mNoUninitialized) {
    279         mN = ~mY;
    280         mNoUninitialized = false;
    281     }
    282     return mN;
     278    if (mN.get() == nullptr) {
     279        mN = llvm::make_unique<UnicodeSet>(std::move(~mY));
     280    }
     281    return *mN;
    283282}
    284283
  • icGREP/icgrep-devel/icgrep/UCD/PropertyObjects.h

    r6186 r6228  
    4747
    4848    virtual const std::string & GetPropertyValueGrepString();
    49     property_t the_property;
    50     ClassTypeId the_kind;
     49    const property_t the_property;
     50    const ClassTypeId the_kind;
    5151    virtual ~PropertyObject() {}
    5252};
     
    6060        return false;
    6161    }
    62    
     62
    6363    BinaryPropertyObject(UCD::property_t p, const UnicodeSet && set)
    6464    : PropertyObject(p, ClassTypeId::BinaryProperty)
    65     , mNoUninitialized(true)
    66     , mY(std::move(set)) {
    67        
     65    , mY(std::move(set))
     66    , mN() {
     67
    6868    }
    6969    const UnicodeSet GetCodepointSet(const std::string & prop_value_string) override;
     
    7272    const std::string & GetPropertyValueGrepString() override;
    7373private:
    74     bool mNoUninitialized;
    7574    const UnicodeSet mY;
    76     UnicodeSet mN;
     75    std::unique_ptr<UnicodeSet> mN;
    7776    std::string mPropertyValueGrepString;
    7877};
     
    174173        return false;
    175174    }
    176    
     175
    177176    NumericPropertyObject(UCD::property_t p, const UnicodeSet && NaN_Set, const char * string_buffer, unsigned bufsize, const std::vector<UCD::codepoint_t> && cps)
    178177    : PropertyObject(p, ClassTypeId::NumericProperty)
     
    182181    , mExplicitCps(std::move(cps))
    183182    {
    184        
     183
    185184    }
    186185    const UnicodeSet GetCodepointSet(const std::string & numeric_spec) override;
     
    189188private:
    190189    const UnicodeSet mNaNCodepointSet;  // codepoints for which the property value is NaN (not a number).
    191     const char * mStringBuffer;  // buffer holding all string values for other codepoints, in sorted order. 
     190    const char * mStringBuffer;  // buffer holding all string values for other codepoints, in sorted order.
    192191    unsigned mBufSize;
    193192    const std::vector<UCD::codepoint_t> mExplicitCps;
     
    217216    const UnicodeSet GetReflexiveSet() const override;
    218217    const std::string GetStringValue(UCD::codepoint_t cp) const override;
    219    
     218
    220219private:
    221220    const UnicodeSet mNullCodepointSet;  // codepoints for which the property value is the null string.
     
    227226    const std::vector<UCD::codepoint_t> mExplicitCps;  // the codepoints having explicit strings
    228227};
    229    
     228
    230229class StringOverridePropertyObject final : public PropertyObject {
    231230public:
     
    245244    , mExplicitCps(cps)
    246245    {
    247        
     246
    248247    }
    249248    const UnicodeSet GetCodepointSet(const std::string & value_spec) override;
     
    257256    PropertyObject & mBaseObject;  // the base object that provides default values for this property unless overridden.
    258257    const UnicodeSet mOverriddenSet;   // codepoints for which the baseObject value is overridden.
    259     const char * mStringBuffer;  // buffer holding all string values for overridden codepoints, in sorted order. 
     258    const char * mStringBuffer;  // buffer holding all string values for overridden codepoints, in sorted order.
    260259    const std::vector<unsigned> mStringOffsets;        // the offsets of each string within the buffer.
    261260    //unsigned mBufSize;                               // mStringOffsets has one extra element for buffer size.
    262261    const std::vector<codepoint_t> mExplicitCps;
    263262};
    264    
     263
    265264class ObsoletePropertyObject final : public PropertyObject {
    266265public:
     
    271270        return false;
    272271    }
    273    
     272
    274273    ObsoletePropertyObject(property_t p)
    275274    : PropertyObject(p, ClassTypeId::ObsoleteProperty) {}
    276    
     275
    277276    const std::string & GetPropertyValueGrepString() override;
    278277    const UnicodeSet GetCodepointSet(const std::string & value_spec) override;
     
    288287        return false;
    289288    }
    290    
     289
    291290    UnsupportedPropertyObject(property_t p, ClassTypeId)
    292291    : PropertyObject(p, ClassTypeId::UnsupportedProperty) {}
  • icGREP/icgrep-devel/icgrep/editd/editd.cpp

    r6221 r6228  
    166166static size_t size;
    167167
    168 //class PreprocessPipeline : public PipelineKernel {
    169 //public:
    170 //    PreprocessPipeline(EngineInstance & driver, StreamSet * CCResults)
    171 //     : PipelineKernel(driver,
    172 //    {},
    173 //    {Binding{"CCResults", CCResults}},
    174 //    {Binding{driver.getBuilder()->getInt32Ty(), "fileDescriptor"}},
    175 //    {}) {
    176 
    177 //    }
    178 //};
    179 
    180168class PreprocessKernel final: public pablo::PabloKernel {
    181169public:
  • icGREP/icgrep-devel/icgrep/icgrep.files

    r6209 r6228  
    1 icgrep.pro
    2 kernels/directorysearch.cpp
    3 kernels/directorysearch.h
    4 toolchain/object_cache.cpp
    5 toolchain/object_cache.h
    6 toolchain/object_cache_daemon.cpp
    7 toolchain/object_cache_util.hpp
    81wc.cpp
    9 CMakeLists.txt
    102base64.cpp
    11 character_deletion.cpp
    12 character_deposit.cpp
    133grep_interface.cpp
    144grep_interface.h
     
    5141UCD/DerivedNumericType.h
    5242UCD/EastAsianWidth.h
     43UCD/Equivalence.cpp
     44UCD/Equivalence.h
    5345UCD/GraphemeBreakProperty.h
    5446UCD/HangulSyllableType.h
     
    5850UCD/LineBreak.h
    5951UCD/NameAliases.h
     52UCD/PrecomposedMappings.h
    6053UCD/PropList.h
    6154UCD/PropertyAliases.h
     
    7366UCD/VerticalOrientation.h
    7467UCD/WordBreakProperty.h
     68UCD/emoji-data.h
    7569UCD/resolve_properties.cpp
    7670UCD/resolve_properties.h
     
    8175toolchain/toolchain.h
    8276toolchain/toolchain.cpp
     77toolchain/object_cache_util.hpp
     78toolchain/object_cache_daemon.cpp
    8379toolchain/object_cache.h
    8480toolchain/object_cache.cpp
     
    8985toolchain/cpudriver.h
    9086toolchain/cpudriver.cpp
     87re/validation.h
     88re/validation.cpp
    9189re/to_utf8.h
    9290re/to_utf8.cpp
     
    128126re/re_group.h
    129127re/re_end.h
     128re/re_empty_set.h
     129re/re_empty_set.cpp
    130130re/re_diff.h
    131131re/re_diff.cpp
     132re/re_contextual_simplification.h
     133re/re_contextual_simplification.cpp
    132134re/re_compiler.h
    133135re/re_compiler.cpp
     
    144146re/Unicode/decomposition.cpp
    145147re/Unicode/decomposition.h
     148re/Unicode/equivalence.cpp
     149re/Unicode/equivalence.h
    146150re/casing.cpp
    147151re/casing.h
     
    169173pablo/symbol_generator.h
    170174pablo/symbol_generator.cpp
     175pablo/ps_terminate.h
    171176pablo/ps_assign.h
    172177pablo/printer_pablos.h
     
    281286kernels/bitstream_pdep_kernel.h
    282287kernels/block_kernel.cpp
     288kernels/callback.cpp
     289kernels/callback.h
    283290kernels/cc_kernel.cpp
    284 kernels/cc_kernel.h
    285291kernels/cc_scan_kernel.cpp
    286292kernels/cc_scan_kernel.h
     
    291297kernels/delmask_kernel.cpp
    292298kernels/delmask_kernel.h
     299kernels/directorysearch.cpp
     300kernels/directorysearch.h
    293301kernels/evenodd.cpp
    294302kernels/evenodd.h
     
    318326kernels/pipeline/pipeline_builder.cpp
    319327kernels/pipeline/pipeline_analysis.hpp
     328kernels/pipeline/lexographic_ordering.hpp
    320329kernels/pipeline/kernel_logic.hpp
    321330kernels/pipeline/cycle_counter_logic.hpp
     
    391400cc/multiplex_CCs.h
    392401cc/multiplex_CCs.cpp
     402cc/cc_kernel.h
     403cc/cc_kernel.cpp
    393404cc/cc_compiler.h
    394405cc/cc_compiler.cpp
  • icGREP/icgrep-devel/icgrep/kernels/cc_kernel.cpp

    r6220 r6228  
    1515using namespace re;
    1616using namespace llvm;
    17 
    18 // ccNameStr, std::vector<re::CC *>{cc}, ByteStream, ccStream
    1917
    2018DirectCharacterClassKernelBuilder::DirectCharacterClassKernelBuilder(
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r6200 r6228  
    180180    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PEXT width for PEXTFieldCompressKernel");
    181181}
    182    
     182
    183183StreamCompressKernel::StreamCompressKernel(const std::unique_ptr<kernel::KernelBuilder> & b
    184184                                           , StreamSet * source
     
    189189{Binding{"sourceStreamSet", source},
    190190Binding{"extractionMask", extractionMask}},
    191 {Binding{"compressedOutput", compressedOutput, PopcountOf("extractionMask")}},
     191{Binding{"compressedOutput", compressedOutput, PopcountOf("extractionMask"), BlockSize(1)}},
    192192{}, {}, {})
    193193, mCompressedFieldWidth(FieldWidth)
    194194, mStreamCount(source->getNumElements()) {
    195     Type * const fwTy = b->getIntNTy(mCompressedFieldWidth);
    196     addInternalScalar(fwTy, "pendingItemCount");
    197195    for (unsigned i = 0; i < mStreamCount; i++) {
    198196        addInternalScalar(b->getBitBlockType(), "pendingOutputBlock_" + std::to_string(i));
    199197    }
    200 
    201 }
    202    
     198}
     199
    203200void StreamCompressKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
    204201    IntegerType * const fwTy = b->getIntNTy(mCompressedFieldWidth);
     
    218215    Constant * const ZERO = ConstantInt::get(sizeTy, 0);
    219216    Constant * const ONE = ConstantInt::get(sizeTy, 1);
    220 
    221 
    222     Value * pendingItemCount = b->getScalarField("pendingItemCount");
     217    Constant * const BlockWidth = b->getSize(b->getBitBlockWidth());
     218
     219    Value * produced = b->getProducedItemCount("compressedOutput");
     220    Value * const pendingItemCount = b->CreateURem(produced, BlockWidth);
     221
    223222    std::vector<Value *> pendingData(mStreamCount);
    224223    for (unsigned i = 0; i < mStreamCount; i++) {
    225224        pendingData[i] = b->getScalarField("pendingOutputBlock_" + std::to_string(i));
    226225    }
    227    
     226
    228227    b->CreateBr(segmentLoop);
    229228    // Main Loop
     
    361360    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
    362361    b->CreateCondBr(moreToDo, segmentLoop, segmentDone);
    363    
     362
    364363    b->SetInsertPoint(segmentDone);
    365364    // Save kernel state.
    366     b->setScalarField("pendingItemCount", newPending);
    367365    for (unsigned i = 0; i < mStreamCount; i++) {
    368366        b->setScalarField("pendingOutputBlock_" + std::to_string(i), b->bitCast(pendingOutput[i]));
     
    378376
    379377    b->SetInsertPoint(updateProducedCount);
    380     //Value * produced = b->getProducedItemCount("compressedOutput");
    381     //Value * const blockOffset = b->CreateMul(nextOutputBlk, b->getSize(b->getBitBlockWidth()));
    382     //produced = b->CreateAdd(produced, blockOffset);
    383     //newPending = b->CreateZExtOrTrunc(newPending, sizeTy);
    384     //produced = b->CreateSelect(mIsFinal, b->CreateAdd(produced, newPending), produced);
    385     //b->setProducedItemCount("compressedOutput", produced);
     378
    386379}
    387380
     
    685678}
    686679
    687    
     680
    688681//
    689682// This kernel performs final stream compression for a set of N bitstreams, given
     
    721714    addInternalScalar(kb->getSizeTy(), "pendingOffset");
    722715}
    723    
     716
    724717void SwizzledBitstreamCompressByCount::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
    725        
     718
    726719    Value * countsPerStridePtr = kb->getInputStreamBlockPtr("countsPerStride", kb->getInt32(0));
    727720    Value * countStreamPtr = kb->CreatePointerCast(countsPerStridePtr, kb->getIntNTy(mFieldWidth)->getPointerTo());
    728    
     721
    729722    // Output is written and committed to the output buffer one swizzle at a time.
    730723    //
    731724    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    732725    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
    733    
     726
    734727    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    735728    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
     
    745738        outputStreamPtr.push_back(kb->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), kb->getInt32(0)));
    746739    }
    747    
     740
    748741    // Generate code for each of the mSwizzleFactor fields making up a block.
    749742    // We load the count for the field and process all swizzle groups accordingly.
     
    752745        Value * pendingSpace = kb->CreateSub(kb->getSize(mFieldWidth), pendingOffset);
    753746        Value * pendingSpaceFilled = kb->CreateICmpUGE(newItemCount, pendingSpace);
    754        
     747
    755748        Value * const fieldWidths = kb->simd_fill(mFieldWidth, pendingOffset);
    756749
     
    761754            // Combine as many of the new items as possible into the pending group.
    762755            Value * combinedGroup = kb->CreateOr(pendingData[j], kb->CreateShl(newItems, fieldWidths));
    763             // To avoid an unpredictable branch, always store the combined group, whether full or not.               
     756            // To avoid an unpredictable branch, always store the combined group, whether full or not.
    764757            kb->CreateBlockAlignedStore(combinedGroup, kb->CreateGEP(outputStreamPtr[j], outputIndex));
    765758            // Any items in excess of the space available in the current pending group overflow for the next group.
     
    784777    Constant * blockOffsetMask = kb->getSize(kb->getBitBlockWidth() - 1);
    785778    Constant * outputIndexShift = kb->getSize(std::log2(mFieldWidth));
    786    
     779
    787780    Value * outputProduced = kb->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    788781    Value * producedOffset = kb->CreateAnd(outputProduced, blockOffsetMask);
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r6209 r6228  
    758758    const auto f = mScalarMap.find(name);
    759759    if (LLVM_UNLIKELY(f == mScalarMap.end())) {
     760        assert (false && "could not find scalar!");
    760761        report_fatal_error(getName() + " does not contain scalar: " + name);
    761762    }
     
    837838    if (rate.isFixed() || rate.isPopCount() || rate.isNegatedPopCount()) {
    838839        return true;
    839 //    } else if (rate.isRelative()) {
    840 //        return isCountable(getStreamBinding(rate.getReference()));
     840    } else if (rate.isRelative()) {
     841        return isCountable(getStreamBinding(rate.getReference()));
    841842    } else {
    842843        return false;
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r6187 r6228  
    3131namespace llvm { class Type; }
    3232namespace llvm { class Value; }
     33
    3334class BaseDriver;
    3435
     
    4445
    4546namespace kernel {
    46    
     47
    4748class KernelBuilder;
    4849class StreamSetBuffer;
     
    5051
    5152class Kernel : public AttributeSet {
    52     friend class KernelBuilder;       
     53    friend class KernelBuilder;
    5354    friend class PipelineBuilder;
    5455    friend class PipelineCompiler;
    5556    friend class PipelineKernel;
    56 
     57    friend class BaseDriver;
    5758public:
    5859
     
    243244    LLVM_READNONE Binding & getInputScalarBinding(const llvm::StringRef name);
    244245
     246    LLVM_READNONE const Binding & getInputScalarBinding(const llvm::StringRef name) const {
     247        return const_cast<Kernel *>(this)->getInputScalarBinding(name);
     248    }
     249
    245250    LLVM_READNONE Scalar * getInputScalar(const unsigned i) {
    246251        return llvm::cast<Scalar>(getInputScalarBinding(i).getRelationship());
     
    271276
    272277    LLVM_READNONE Binding & getOutputScalarBinding(const llvm::StringRef name);
     278
     279    LLVM_READNONE const Binding & getOutputScalarBinding(const llvm::StringRef name) const {
     280        return const_cast<Kernel *>(this)->getOutputScalarBinding(name);
     281    }
    273282
    274283    LLVM_READNONE Scalar * getOutputScalar(const llvm::StringRef name) {
     
    330339
    331340    void generateKernel(const std::unique_ptr<KernelBuilder> & b);
    332    
     341
    333342    void prepareKernel(const std::unique_ptr<KernelBuilder> & b);
    334343
     
    352361
    353362    LLVM_READNONE bool isUnknownRate(const Binding & binding) const;
     363
     364    /* Fill in any generated names / attributes for the kernel if their initialization is dependent on
     365     * settings / bindings added after construction. */
     366    virtual void finalizeKernel() { }
    354367
    355368    void initializeBindings(BaseDriver & driver);
     
    483496
    484497    const std::string               mKernelName;
    485    
     498
    486499
    487500    OwnedStreamSetBuffers           mStreamSetInputBuffers;
     
    526539    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
    527540
    528     void recordInitialItemCountsOfCountableStreams(const std::unique_ptr<KernelBuilder> & b);
    529 
    530     void incrementItemCountsOfCountableRateStreams(const std::unique_ptr<KernelBuilder> & b);
    531 
    532541};
    533542
     
    581590}
    582591
    583 #endif 
     592#endif
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r6189 r6228  
    337337Type * KernelBuilder::resolveStreamSetType(Type * const streamSetType) {
    338338    // TODO: Should this function be here? in StreamSetBuffer? or in Binding?
    339     unsigned numElements = 1;   
     339    unsigned numElements = 1;
    340340    Type * type = streamSetType;
    341341    if (LLVM_LIKELY(type->isArrayTy())) {
     
    361361}
    362362
    363 
    364 }
     363/** ------------------------------------------------------------------------------------------------------------- *
     364 * @brief getKernelName
     365 ** ------------------------------------------------------------------------------------------------------------- */
     366std::string KernelBuilder::getKernelName() const {
     367    return mKernel->getName();
     368}
     369
     370}
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.h

    r6184 r6228  
    156156
    157157    void setBaseAddress(const std::string & name, llvm::Value * addr);
    158    
     158
    159159    llvm::Value * getCapacity(const std::string & name);
    160160
     
    198198
    199199    }
    200    
     200
    201201    const unsigned mStride;
    202202
     
    208208
    209209    void setNamedItemCount(const std::string & name, const std::string & suffix, llvm::Value * const value);
     210
     211    std::string getKernelName() const final;
    210212
    211213protected:
    212214    const Kernel * mKernel;
    213    
     215
    214216};
    215217
  • icGREP/icgrep-devel/icgrep/kernels/multiblock_kernel.cpp

    r6184 r6228  
    3636 ** ------------------------------------------------------------------------------------------------------------- */
    3737void MultiBlockKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
    38 
    3938    Value * const numOfStrides = b->CreateSelect(mIsFinal, b->getSize(1), mNumOfStrides);
    40 
    4139    generateMultiBlockLogic(b, numOfStrides);
    42 
    43     BasicBlock * const exit = b->CreateBasicBlock();
    44     if (LLVM_UNLIKELY(hasAttribute(AttrId::CanTerminateEarly) || hasAttribute(AttrId::MustExplicitlyTerminate))) {
    45         Value * const terminated = b->getTerminationSignal();
    46         BasicBlock * const regularExit = b->CreateBasicBlock("regularExit", exit);
    47         b->CreateUnlikelyCondBr(terminated, exit, regularExit);
    48 
    49         b->SetInsertPoint(regularExit);
    50     }
    51     incrementItemCountsOfCountableRateStreams(b);
    52     b->CreateBr(exit);
    53 
    54     b->SetInsertPoint(exit);
    55 }
    56 
    57 /** ------------------------------------------------------------------------------------------------------------- *
    58  * @brief incrementItemCountsOfCountableRateStreams
    59  ** ------------------------------------------------------------------------------------------------------------- */
    60 inline void MultiBlockKernel::incrementItemCountsOfCountableRateStreams(const std::unique_ptr<KernelBuilder> & b) {
    61 
    62     const auto numOfInputs = getNumOfStreamInputs();
    63     for (unsigned i = 0; i < numOfInputs; i++) {
    64         const Binding & input = getInputStreamSetBinding(i);
    65         if (isCountable(input)) {
    66             Value * avail = getAvailableInputItems(i);
    67             b->setNonDeferredProcessedItemCount(input, avail);
    68         }
    69     }
    70 
    71     const auto numOfOutputs = getNumOfStreamOutputs();
    72     for (unsigned i = 0; i < numOfOutputs; i++) {
    73         const Binding & output = getOutputStreamSetBinding(i);
    74         if (isCountable(output)) {
    75             Value * avail = b->getCapacity(output.getName());
    76             b->setNonDeferredProducedItemCount(output, avail);
    77         }
    78     }
    7940}
    8041
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r6184 r6228  
    141141    b->SetInsertPoint(finishedStrides);
    142142}
    143    
    144 StreamExpandKernel::StreamExpandKernel(const std::unique_ptr<kernel::KernelBuilder> &
     143
     144StreamExpandKernel::StreamExpandKernel(const std::unique_ptr<kernel::KernelBuilder> & b
    145145                                       , StreamSet * source, const unsigned base, StreamSet * mask
    146146                                       , StreamSet * expanded
     
    151151
    152152{Binding{"marker", mask, FixedRate(), Principal()},
    153 Binding{"source", source, PopcountOf("marker")}},
     153Binding{"source", source, PopcountOf("marker")}}, // BlockSize(b->getBitBlockWidth())
    154154{Binding{"output", expanded, FixedRate()}},
    155155{}, {}, {})
     
    164164    Type * sizeTy = b->getSizeTy();
    165165    const unsigned numFields = b->getBitBlockWidth() / mFieldWidth;
    166    
     166
    167167    Constant * const ZERO = b->getSize(0);
    168168    Constant * bwConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
     
    170170    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fieldWidthTy, mFieldWidth));
    171171    Constant * fw_sub1Splat = ConstantVector::getSplat(numFields, ConstantInt::get(fieldWidthTy, mFieldWidth - 1));
    172    
     172
    173173    BasicBlock * entry = b->GetInsertBlock();
    174174    BasicBlock * expandLoop = b->CreateBasicBlock("expandLoop");
     
    176176    Value * processedSourceItems = b->getProcessedItemCount("source");
    177177    Value * initialSourceOffset = b->CreateURem(processedSourceItems, bwConst);
     178
    178179    Value * pendingData[mSelectedStreamCount];
    179180    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
    180181        pendingData[i] = b->loadInputStreamBlock("source", b->getInt32(mSelectedStreamBase + i), ZERO);
    181182    }
    182    
     183
    183184    b->CreateBr(expandLoop);
    184185    // Main Loop
     
    216217    // The bits for each output field will typically come from (at most) two
    217218    // source fields, with offsets.  Calculate the field numbers and offsets.
    218    
     219
    219220    Value * fieldPopCounts = b->simd_popcount(mFieldWidth, deposit_mask);
    220221    // For each field determine the (partial) sum popcount of all fields prior to
     
    237238    Value * const newPendingOffset = b->CreateAdd(pendingOffsetPhi, blockPopCount);
    238239    Value * const srcBlockNo = b->CreateUDiv(newPendingOffset, bwConst);
     240
    239241    // Now load and process source streams.
    240242    Value * sourceData[mSelectedStreamCount];
     
    262264    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
    263265    b->CreateCondBr(moreToDo, expandLoop, expansionDone);
    264    
     266
    265267    b->SetInsertPoint(expansionDone);
    266268}
     
    278280
    279281}
    280    
     282
    281283void FieldDepositKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
    282284    BasicBlock * entry = kb->GetInsertBlock();
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/buffer_management_logic.hpp

    r6187 r6228  
    2828
    2929inline RateValue getOverflowSize(const Kernel * kernel, const Binding & binding, const BufferRateData & rate) {
    30     return std::min(rate.maximum - rate.minimum, upperBound(kernel, binding));
     30    if (binding.hasAttribute(AttrId::BlockSize)) {
     31        return 0;
     32    }
     33    return std::min(rate.Maximum - rate.Minimum, upperBound(kernel, binding));
    3134}
    3235
     
    4649
    4750inline const Binding & PipelineCompiler::getOutputBinding(const Kernel * const producer, const unsigned index) const {
    48     return producer == mPipelineKernel ? mPipelineKernel->getInputStreamSetBinding(index) : producer->getOutputStreamSetBinding(index);
     51    if (producer == mPipelineKernel) {
     52        return mPipelineKernel->getInputStreamSetBinding(index);
     53    } else {
     54        return producer->getOutputStreamSetBinding(index);
     55    }
    4956}
    5057
    5158inline const Binding & PipelineCompiler::getInputBinding(const Kernel * const consumer, const unsigned index) const {
    52     return consumer == mPipelineKernel ? mPipelineKernel->getOutputStreamSetBinding(index) : consumer->getInputStreamSetBinding(index);
     59    if (consumer == mPipelineKernel) {
     60        return mPipelineKernel->getOutputStreamSetBinding(index);
     61    } else {
     62        return consumer->getInputStreamSetBinding(index);
     63    }
    5364}
    5465
     
    93104 *
    94105 * Return a cyclic bi-partite graph indicating the I/O relationships between the kernels and their buffers.
     106 *
     107 * Ordering: producer -> buffer -> consumer
    95108 ** ------------------------------------------------------------------------------------------------------------- */
    96109BufferGraph PipelineCompiler::makeBufferGraph(BuilderRef b) {
     
    103116    // make an edge from the pipeline input to a buffer vertex
    104117    enumerateBufferProducerBindings(pipelineVertex, mPipelineKernel->getInputStreamSetBindings(), G, M);
    105     G[pipelineVertex].kernel = mPipelineKernel;
     118    G[pipelineVertex].Kernel = mPipelineKernel;
    106119    // make an edge from each producing kernel to a buffer vertex
    107120    for (unsigned i = 0; i < numOfKernels; ++i) {
    108121        const auto & producer = mPipeline[i];
    109122        enumerateBufferProducerBindings(i, producer->getOutputStreamSetBindings(), G, M);
    110         G[i].kernel = producer;
     123        G[i].Kernel = producer;
    111124    }
    112125    // make an edge from each buffer to its consuming kernel(s)
     
    133146
    134147            BufferNode & kn = G[i];
    135             assert(kn.kernel);
     148            assert(kn.Kernel);
    136149
    137150            for (const auto ce : make_iterator_range(in_edges(i, G))) {
     
    143156                const BufferRateData & pd = G[pe];
    144157
    145                 const auto min = div_by_non_zero(pd.minimum, cd.maximum);
     158                const auto min = div_by_non_zero(pd.Minimum, cd.Maximum);
    146159                lower = std::min(lower, min);
    147160
    148                 const auto max = div_by_non_zero(pd.maximum, cd.minimum);
     161                const auto max = div_by_non_zero(pd.Maximum, cd.Minimum);
    149162                upper = std::min(upper, max);
    150163            }
    151164
    152             kn.lower = lower;
    153             kn.upper = upper;
     165            kn.Lower = lower;
     166            kn.Upper = upper;
    154167
    155168            for (const auto e : make_iterator_range(out_edges(i, G))) {
    156169                BufferRateData & rd = G[e];
    157                 rd.minimum = lower * rd.minimum;
    158                 rd.maximum = upper * rd.maximum;
     170                rd.Minimum = lower * rd.Minimum;
     171                rd.Maximum = upper * rd.Maximum;
    159172            }
    160173
    161174        }
    162175    }
    163 
    164 //    printBufferGraph(G, errs());
    165176
    166177    // fill in any known pipeline I/O buffers
    167178    for (const auto e : make_iterator_range(out_edges(pipelineVertex, G))) {
    168179        const auto bufferVertex = target(e, G);
    169         assert (G[bufferVertex].buffer == nullptr);
    170         StreamSetBuffer * const buffer = mPipelineKernel->getInputStreamSetBuffer(G[e].port);
    171         mIsPipelineIO.insert(buffer);
    172         G[bufferVertex].buffer = buffer;
     180        BufferNode & bn = G[bufferVertex];
     181        assert (bn.Buffer == nullptr);
     182        const auto inputPort = G[e].Port;
     183        bn.Buffer = mPipelineKernel->getInputStreamSetBuffer(inputPort);
     184        bn.Type = BufferType::External;
    173185    }
    174186
    175187    for (const auto e : make_iterator_range(in_edges(pipelineVertex, G))) {
    176188        const auto bufferVertex = source(e, G);
    177         assert (G[bufferVertex].buffer == nullptr);
    178         StreamSetBuffer * const buffer = mPipelineKernel->getOutputStreamSetBuffer(G[e].port);
    179         mIsPipelineIO.insert(buffer);
    180         G[bufferVertex].buffer = buffer;
     189        BufferNode & bn = G[bufferVertex];
     190        assert (bn.Buffer == nullptr);
     191        const auto outputPort = G[e].Port;
     192        bn.Buffer = mPipelineKernel->getOutputStreamSetBuffer(outputPort);
     193        bn.Type = BufferType::External;
    181194    }
    182195
     
    185198
    186199        // Is this a pipeline I/O buffer?
    187         if (LLVM_UNLIKELY(G[i].buffer != nullptr)) {
     200        BufferNode & bn = G[i];
     201
     202        if (LLVM_UNLIKELY(bn.Buffer != nullptr)) {
    188203            continue;
    189204        }
     
    194209
    195210        const BufferNode & producerNode = G[producerVertex];
    196         const auto & producer = producerNode.kernel;
     211        const auto & producer = producerNode.Kernel;
    197212        const BufferRateData & producerRate = G[pe];
    198         const Binding & output = getOutputBinding(producer, producerRate.port);
     213        const Binding & output = getOutputBinding(producer, producerRate.Port);
    199214
    200215        StreamSetBuffer * buffer = nullptr;
    201216
    202         const auto isUnknown = producerRate.maximum.numerator() == 0;
     217        const auto isUnknown = producerRate.Maximum.numerator() == 0;
    203218        const auto isManaged = output.hasAttribute(AttrId::ManagedBuffer);
     219
     220        BufferType bufferType = BufferType::Internal;
     221
    204222        if (LLVM_UNLIKELY(isUnknown || isManaged)) {
    205223            buffer = new ExternalBuffer(b, output.getType());
     224            bufferType = BufferType::Managed;
    206225        } else {
    207226
    208             RateValue requiredSpace{producerRate.maximum};
     227            RateValue requiredSpace{producerRate.Maximum};
    209228            RateValue overflowSpace{getInputOverflowSize(producer, output, producerRate)};
    210229            RateValue facsimileSpace{0};
    211             bool dynamic = (producerRate.minimum < producerRate.maximum);
     230
     231            bool dynamic = false;
    212232
    213233            for (const auto ce : make_iterator_range(out_edges(i, G))) {
    214234                const BufferRateData & consumerRate = G[ce];
    215                 requiredSpace = lcm(requiredSpace, consumerRate.maximum);
     235                requiredSpace = lcm(requiredSpace, consumerRate.Maximum);
    216236                const auto c = target(ce, G);
    217237                const BufferNode & consumerNode = G[c];
    218                 const Kernel * const consumer = consumerNode.kernel; assert (consumer);
    219                 const Binding & input = getInputBinding(consumer, consumerRate.port);
     238                const Kernel * const consumer = consumerNode.Kernel; assert (consumer);
     239                const Binding & input = getInputBinding(consumer, consumerRate.Port);
    220240                facsimileSpace = std::max(facsimileSpace, getOutputOverflowSize(consumer, input, consumerRate));
    221241                // Could the consumption rate be less than the production rate?
    222                 if ((consumerNode.lower * consumerRate.minimum) < producerRate.maximum) {
     242                if ((consumerNode.Lower * consumerRate.Minimum) < producerRate.Maximum) {
    223243                    dynamic = true;
    224244                }
    225245            }
    226246
    227             const auto bufferSize = lcm(requiredSpace, b->getBitBlockWidth()) * mPipelineKernel->getNumOfThreads();
    228             assert (bufferSize.denominator() == 1);
    229             const auto overflowSize = lcm(std::max(overflowSpace, facsimileSpace), b->getBitBlockWidth());
    230             assert (overflowSize.denominator() == 1);
     247            // calculate overflow (copyback) and fascimile (copyforward) space
     248            overflowSpace = lcm(overflowSpace, b->getBitBlockWidth());
     249            assert (overflowSpace.denominator() == 1);
     250            facsimileSpace = lcm(facsimileSpace, b->getBitBlockWidth());
     251            assert (facsimileSpace.denominator() == 1);
     252            bn.Overflow = overflowSpace.numerator();
     253            bn.Fasimile = facsimileSpace.numerator();
     254            const auto overflowSize = std::max(bn.Overflow, bn.Fasimile);
     255
     256            // compute the buffer size
     257            const auto bufferSpace = lcm(requiredSpace, b->getBitBlockWidth());
     258            assert (bufferSpace.denominator() == 1);
     259            const auto bufferSize = bufferSpace.numerator() * mPipelineKernel->getNumOfThreads();
     260
    231261
    232262            // A DynamicBuffer is necessary when we cannot bound the amount of unconsumed data a priori.
    233263            if (dynamic) {
    234                 buffer = new DynamicBuffer(b, output.getType(), bufferSize.numerator(), overflowSize.numerator());
     264                buffer = new DynamicBuffer(b, output.getType(), bufferSize, overflowSize);
    235265            } else {
    236                 buffer = new StaticBuffer(b, output.getType(), bufferSize.numerator(), overflowSize.numerator());
     266                buffer = new StaticBuffer(b, output.getType(), bufferSize, overflowSize);
    237267            }
    238             if (LLVM_UNLIKELY(overflowSize > 0)) {
    239                 mOverflowRequirements.emplace(buffer, OverflowRequirement{ceiling(overflowSpace), ceiling(facsimileSpace)});
    240             }
    241         }
    242 
    243         G[i].buffer = buffer;
     268        }
     269
     270        bn.Buffer = buffer;
     271        bn.Type = bufferType;
     272
    244273        mOwnedBuffers.emplace_back(buffer);
    245274    }
    246275
    247     // printBufferGraph(G, errs());
     276//    printBufferGraph(G, errs());
    248277
    249278    return G;
     
    271300    for (unsigned i = firstBuffer; i != lastBuffer; ++i) {
    272301        out << "v" << i << " [label=\"";
    273         if (G[i].buffer == nullptr) {
     302        const BufferNode & bn = G[i];
     303        if (bn.Buffer == nullptr) {
    274304            out << '?';
    275         } else if (isa<ExternalBuffer>(G[i].buffer)) {
     305        } else if (isa<ExternalBuffer>(bn.Buffer)) {
    276306            out << 'E';
    277         } else if (isa<DynamicBuffer>(G[i].buffer)) {
     307        } else if (isa<DynamicBuffer>(bn.Buffer)) {
    278308            out << 'D';
    279         } else if (isa<StaticBuffer>(G[i].buffer)) {
     309        } else if (isa<StaticBuffer>(bn.Buffer)) {
    280310            out << 'S';
     311        }
     312        if (bn.Overflow || bn.Fasimile) {
     313            out << " (O:" << bn.Overflow << ",F:" << bn.Fasimile << ')';
    281314        }
    282315        out << "\"];\n";
     
    293326
    294327        out << " [label=\"";
    295         if (pd.minimum.denominator() > 1 || pd.maximum.denominator() > 1) {
    296             out << pd.minimum.numerator() << "/" << pd.minimum.denominator()
     328        if (pd.Minimum.denominator() > 1 || pd.Maximum.denominator() > 1) {
     329            out << pd.Minimum.numerator() << "/" << pd.Minimum.denominator()
    297330                << " - "
    298                 << pd.maximum.numerator() << "/" << pd.maximum.denominator();
     331                << pd.Maximum.numerator() << "/" << pd.Maximum.denominator();
    299332        } else {
    300             out << pd.minimum.numerator() << " - " << pd.maximum.numerator();
     333            out << pd.Minimum.numerator() << " - " << pd.Maximum.numerator();
    301334        }
    302335        out << '\n';
     
    304337        if (s <= numOfKernels) {
    305338            // producer edge
    306             const Kernel * const k = G[s].kernel;
    307             out << k->getName() << "." << getOutputBinding(k, pd.port).getName();
     339            const Kernel * const k = G[s].Kernel;
     340            out << k->getName() << "." << getOutputBinding(k, pd.Port).getName();
    308341        } else {
    309342            assert (t <= numOfKernels);
    310             const Kernel * const k = G[t].kernel;
    311             out << k->getName() << "." << getInputBinding(k, pd.port).getName();
     343            const Kernel * const k = G[t].Kernel;
     344            out << k->getName() << "." << getInputBinding(k, pd.Port).getName();
    312345        }
    313346
     
    322355 * @brief addHandlesToPipelineKernel
    323356 ** ------------------------------------------------------------------------------------------------------------- */
    324 void PipelineCompiler::addHandlesToPipelineKernel(BuilderRef b) {
    325     const auto numOfKernels = mPipeline.size();
    326     b->setKernel(mPipelineKernel);
     357inline void PipelineCompiler::addBufferHandlesToPipelineKernel(BuilderRef b, const unsigned index) {
    327358    IntegerType * const sizeTy = b->getSizeTy();
    328     for (unsigned i = 0; i < numOfKernels; ++i) {
    329         const auto & kernel = mPipeline[i];
    330         if (!kernel->hasFamilyName()) {
    331             PointerType * kernelPtrTy = kernel->getKernelType()->getPointerTo(0);
    332             mPipelineKernel->addInternalScalar(kernelPtrTy, makeKernelName(i));
    333         }
    334         for (const auto e : make_iterator_range(out_edges(i, mBufferGraph))) {
    335             const auto bufferVertex = target(e, mBufferGraph);
    336             const StreamSetBuffer * const buffer = mBufferGraph[bufferVertex].buffer;
    337             if (LLVM_UNLIKELY(isa<ExternalBuffer>(buffer))) {
    338                 continue;
    339             }
    340             const Binding & output = kernel->getOutputStreamSetBinding(mBufferGraph[e].port);
    341             const auto bufferName = makeBufferName(i, output);
    342             mPipelineKernel->addInternalScalar(buffer->getHandleType(b), bufferName);
    343             mPipelineKernel->addInternalScalar(sizeTy, bufferName + CONSUMED_ITEM_COUNT_SUFFIX);
    344         }
     359    const Kernel * const kernel = mPipeline[index];
     360    if (!kernel->hasFamilyName()) {
     361        PointerType * kernelPtrTy = kernel->getKernelType()->getPointerTo(0);
     362        mPipelineKernel->addInternalScalar(kernelPtrTy, makeKernelName(index));
     363    }
     364    for (const auto e : make_iterator_range(out_edges(index, mBufferGraph))) {
     365        const auto bufferVertex = target(e, mBufferGraph);
     366        const BufferNode & bn = mBufferGraph[bufferVertex];
     367        if (LLVM_UNLIKELY(bn.Type == BufferType::Managed)) {
     368            continue;
     369        }
     370        const Binding & output = kernel->getOutputStreamSetBinding(mBufferGraph[e].Port);
     371        const auto bufferName = makeBufferName(index, output);
     372        mPipelineKernel->addInternalScalar(bn.Buffer->getHandleType(b), bufferName);
     373        mPipelineKernel->addInternalScalar(sizeTy, bufferName + CONSUMED_ITEM_COUNT_SUFFIX);
    345374    }
    346375}
     
    359388
    360389    for (unsigned i = firstBuffer; i < lastBuffer; ++i) {
    361         StreamSetBuffer * const buffer = mBufferGraph[i].buffer;
    362         if (LLVM_UNLIKELY(isa<ExternalBuffer>(buffer))) {
    363             continue;
    364         }
    365         const auto pe = in_edge(i, mBufferGraph);
    366         const auto p = source(pe, mBufferGraph);
    367         const auto & producer = mPipeline[p];
    368         const Binding & output = producer->getOutputStreamSetBinding(mBufferGraph[pe].port);
    369         const auto name = makeBufferName(p, output);
    370         Value * const handle = b->getScalarFieldPtr(name);
    371         buffer->setHandle(b, handle);
    372         buffer->allocateBuffer(b);
     390        const BufferNode & bn = mBufferGraph[i];
     391        if (LLVM_LIKELY(bn.Type == BufferType::Internal)) {
     392            const auto pe = in_edge(i, mBufferGraph);
     393            const auto p = source(pe, mBufferGraph);
     394            const auto & producer = mPipeline[p];
     395            const Binding & output = producer->getOutputStreamSetBinding(mBufferGraph[pe].Port);
     396            const auto name = makeBufferName(p, output);
     397            Value * const handle = b->getScalarFieldPtr(name);
     398            StreamSetBuffer * const buffer = bn.Buffer;
     399            buffer->setHandle(b, handle);
     400            buffer->allocateBuffer(b);
     401        }
    373402    }
    374403}
     
    381410inline void PipelineCompiler::loadBufferHandles(BuilderRef b) {
    382411    assert (mPipeline[mKernelIndex] == mKernel);
    383     assert (b->getKernel() == mKernel);
    384412    for (const auto pe : make_iterator_range(out_edges(mKernelIndex, mBufferGraph))) {
    385413        const auto bufferVertex = target(pe, mBufferGraph);
    386         StreamSetBuffer * const buffer = mBufferGraph[bufferVertex].buffer;
    387         const Binding & output = mKernel->getOutputStreamSetBinding(mBufferGraph[pe].port);
    388         if (LLVM_UNLIKELY(isa<ExternalBuffer>(buffer))) {
    389             // if the handle was already set, it must be a pipeline I/O stream
    390             if (LLVM_LIKELY(buffer->getHandle() == nullptr)) {
    391                 buffer->setHandle(b, b->getScalarFieldPtr(output.getName() + BUFFER_HANDLE_SUFFIX));
    392             }
    393         } else {
     414        const auto outputPort = mBufferGraph[pe].Port;
     415        const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
     416        const BufferNode & bn = mBufferGraph[bufferVertex];
     417        StreamSetBuffer * const buffer = bn.Buffer;
     418        if (LLVM_LIKELY(bn.Type == BufferType::Internal)) {
    394419            b->setKernel(mPipelineKernel);
    395420            buffer->setHandle(b, b->getScalarFieldPtr(makeBufferName(mKernelIndex, output)));
     421        } else if (bn.Type == BufferType::Managed) {
    396422            b->setKernel(mKernel);
    397         }
    398     }
     423            buffer->setHandle(b, b->getScalarFieldPtr(output.getName() + BUFFER_HANDLE_SUFFIX));
     424        }
     425        assert (buffer->getHandle());
     426    }
     427    b->setKernel(mKernel);
    399428}
    400429
     
    403432 ** ------------------------------------------------------------------------------------------------------------- */
    404433inline void PipelineCompiler::releaseBuffers(BuilderRef b) {
    405     for (const auto & buffer : mOwnedBuffers) {
    406         buffer->releaseBuffer(b);
    407     }
    408 }
    409 
    410 /** ------------------------------------------------------------------------------------------------------------- *
    411  * @brief readCurrentProducedItemCounts
    412  ** ------------------------------------------------------------------------------------------------------------- */
    413 inline void PipelineCompiler::readCurrentProducedItemCounts(BuilderRef b) {
     434    const auto firstBuffer = mPipeline.size() + 1;
     435    const auto lastBuffer = num_vertices(mBufferGraph);
     436    for (auto bufferVertex = firstBuffer; bufferVertex != lastBuffer; ++bufferVertex) {
     437        const BufferNode & bn = mBufferGraph[bufferVertex];
     438        if (LLVM_LIKELY(bn.Type == BufferType::Internal)) {
     439            bn.Buffer->releaseBuffer(b);
     440        }
     441    }
     442}
     443
     444/** ------------------------------------------------------------------------------------------------------------- *
     445 * @brief readInitialProducedItemCounts
     446 ** ------------------------------------------------------------------------------------------------------------- */
     447inline void PipelineCompiler::readInitialProducedItemCounts(BuilderRef b) {
     448    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     449    for (unsigned i = 0; i < numOfOutputs; ++i) {
     450        const Binding & output = mKernel->getOutputStreamSetBinding(i);
     451        Value * const produced = b->getNonDeferredProducedItemCount(output);
     452        mInitiallyProducedItemCount[i] = produced;
     453    }
     454}
     455
     456/** ------------------------------------------------------------------------------------------------------------- *
     457 * @brief readFinalProducedItemCounts
     458 ** ------------------------------------------------------------------------------------------------------------- */
     459inline void PipelineCompiler::readFinalProducedItemCounts(BuilderRef b) {
    414460    for (const auto e : make_iterator_range(out_edges(mKernelIndex, mBufferGraph))) {
    415         const Binding & output = mKernel->getOutputStreamSetBinding(mBufferGraph[e].port);
    416         Value * const produced = b->getProducedItemCount(output.getName());
    417         const auto bufferIndex = target(e, mBufferGraph);
    418         const StreamSetBuffer * const buffer = mBufferGraph[bufferIndex].buffer;
    419         assert (mTotalItemCount.count(buffer) == 0);
    420         mTotalItemCount.emplace(buffer, produced);
    421         initializeConsumedItemCount(b, bufferIndex, produced);
     461        const Binding & output = mKernel->getOutputStreamSetBinding(mBufferGraph[e].Port);
     462        Value * const produced = b->getNonDeferredProducedItemCount(output);
     463        // TODO: we only need to consider the blocksize attribute if it's possible this
     464        // stream could be read before being fully written. This might occur if one of
     465        // it's consumers is a PopCount rate that does not have a matching BlockSize
     466        // attribute.
     467        Value * fullyProduced = truncateBlockSize(b, output, produced, mTerminatedFlag);
     468        const auto bufferVertex = target(e, mBufferGraph);
     469        BufferNode & bn = mBufferGraph[bufferVertex];
     470        assert (bn.TotalItems == nullptr);
     471        bn.TotalItems = fullyProduced;
     472        initializeConsumedItemCount(b, bufferVertex, fullyProduced);
     473        initializePopCountReferenceItemCount(b, bufferVertex, fullyProduced);
     474        #ifdef PRINT_DEBUG_MESSAGES
     475        const auto prefix = makeBufferName(mKernelIndex, output);
     476        b->CallPrintInt(prefix + "_produced'", produced);
     477        #endif
    422478    }
    423479}
     
    426482 * @brief requiresCopyBack
    427483 ** ------------------------------------------------------------------------------------------------------------- */
    428 inline bool PipelineCompiler::requiresCopyBack(const StreamSetBuffer * const buffer) const {
    429     const auto f = mOverflowRequirements.find(buffer);
    430     if (f == mOverflowRequirements.end()) return false;
    431     const OverflowRequirement & r = f->second;
    432     return r.copyBack != 0;
     484inline bool PipelineCompiler::requiresCopyBack(const unsigned bufferVertex) const {
     485    return getCopyBack(bufferVertex) != 0;
    433486}
    434487
     
    436489 * @brief getCopyBack
    437490 ** ------------------------------------------------------------------------------------------------------------- */
    438 inline unsigned PipelineCompiler::getCopyBack(const StreamSetBuffer * const buffer) const {
    439     const auto f = mOverflowRequirements.find(buffer);
    440     if (LLVM_LIKELY(f == mOverflowRequirements.end())) {
    441         return 0;
    442     }
    443     const OverflowRequirement & r = f->second;
    444     return r.copyBack;
     491inline unsigned PipelineCompiler::getCopyBack(const unsigned bufferVertex) const {
     492    return mBufferGraph[bufferVertex].Overflow;
    445493}
    446494
     
    448496 * @brief requiresFacsimile
    449497 ** ------------------------------------------------------------------------------------------------------------- */
    450 inline bool PipelineCompiler::requiresFacsimile(const StreamSetBuffer * const buffer) const {
    451     const auto f = mOverflowRequirements.find(buffer);
    452     if (f == mOverflowRequirements.end()) return false;
    453     const OverflowRequirement & r = f->second;
    454     return r.facsimile != 0;
     498inline bool PipelineCompiler::requiresFacsimile(const unsigned bufferVertex) const {
     499    return getFacsimile(bufferVertex) != 0;
    455500}
    456501
     
    458503 * @brief getFacsimile
    459504 ** ------------------------------------------------------------------------------------------------------------- */
    460 inline unsigned PipelineCompiler::getFacsimile(const StreamSetBuffer * const buffer) const {
    461     const auto f = mOverflowRequirements.find(buffer);
    462     if (LLVM_LIKELY(f == mOverflowRequirements.end())) {
    463         return 0;
    464     }
    465     const OverflowRequirement & r = f->second;
    466     return r.facsimile;
    467 
    468 }
    469 
    470 /** ------------------------------------------------------------------------------------------------------------- *
    471  * @brief isPipelineIO
    472  ** ------------------------------------------------------------------------------------------------------------- */
    473 inline bool PipelineCompiler::isPipelineIO(const StreamSetBuffer * const buffer) const {
    474     return mIsPipelineIO.count(buffer) != 0;
    475 }
    476 
    477 /** ------------------------------------------------------------------------------------------------------------- *
    478  * @brief storeCopyBackProducedItemCounts
    479  ** ------------------------------------------------------------------------------------------------------------- */
    480 inline void PipelineCompiler::storeCopyBackProducedItemCounts(BuilderRef b) {
    481     const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    482     mCopyBackProducedOutputItems.resize(numOfOutputs);
    483     std::fill(mCopyBackProducedOutputItems.begin(), mCopyBackProducedOutputItems.end(), nullptr);
    484     for (unsigned i = 0; i < numOfOutputs; ++i) {
    485         const StreamSetBuffer * const buffer = getOutputBuffer(i);
    486         if (requiresCopyBack(buffer)) {
    487             const Binding & output = mKernel->getOutputStreamSetBinding(i);
    488             mCopyBackProducedOutputItems[i] = b->getNonDeferredProducedItemCount(output);
    489         }
    490     }
     505inline unsigned PipelineCompiler::getFacsimile(const unsigned bufferVertex) const {
     506    return mBufferGraph[bufferVertex].Fasimile;
     507
     508}
     509
     510/** ------------------------------------------------------------------------------------------------------------- *
     511 * @brief getOutputBufferType
     512 ** ------------------------------------------------------------------------------------------------------------- */
     513BufferType PipelineCompiler::getOutputBufferType(const unsigned outputPort) const {
     514    const auto bufferVertex = getOutputBufferVertex(outputPort);
     515    return mBufferGraph[bufferVertex].Type;
    491516}
    492517
     
    497522    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    498523    for (unsigned i = 0; i < numOfOutputs; ++i) {
    499         if (mCopyBackProducedOutputItems[i]) {
     524        if (requiresCopyBack(getOutputBufferVertex(i))) {
    500525            const Binding & output = mKernel->getOutputStreamSetBinding(i);
    501             const auto prefix = mKernel->getName() + "_" + output.getName();
     526            const auto prefix = makeBufferName(mKernelIndex, output);
    502527            BasicBlock * const copyBack = b->CreateBasicBlock(prefix + "_copyBack", mKernelExit);
    503528            BasicBlock * const copyExit = b->CreateBasicBlock(prefix + "_copyBackExit", mKernelExit);
    504529            const StreamSetBuffer * const buffer = getOutputBuffer(i);
    505530            Value * const capacity = buffer->getCapacity(b.get());
    506 
    507             Value * const priorOffset = b->CreateURem(mCopyBackProducedOutputItems[i], capacity);
     531            Value * const priorOffset = b->CreateURem(mAlreadyProducedItemCount[i], capacity);
    508532            Value * const produced = b->getNonDeferredProducedItemCount(output);
    509533            Value * const producedOffset = b->CreateURem(produced, capacity);
     
    514538
    515539            b->SetInsertPoint(copyBack);
    516             Value * blocksToCopy = producedOffset;
    517             const auto itemWidth = getItemWidth(buffer->getBaseType());
    518             const auto blockWidth = b->getBitBlockWidth();
    519             if (LLVM_LIKELY(itemWidth < blockWidth)) {
    520                 blocksToCopy = b->CreateCeilUDiv(blocksToCopy, b->getSize(blockWidth / itemWidth));
    521             } else if (LLVM_UNLIKELY(blockWidth > itemWidth)) {
    522                 blocksToCopy = b->CreateMul(blocksToCopy, b->getSize(itemWidth / blockWidth));
    523             }
    524             blocksToCopy = b->CreateMul(blocksToCopy, buffer->getStreamSetCount(b.get()));
    525             const auto bytesPerBlock = blockWidth / 8;
    526             Value * const bytesToCopy = b->CreateMul(blocksToCopy, b->getSize(bytesPerBlock));
    527             Value * const source = buffer->getOverflowAddress(b.get());
    528             Value * const target = buffer->getBaseAddress(b.get());
    529             b->CreateMemCpy(target, source, bytesToCopy, bytesPerBlock);
     540            writeOverflowCopy(b, buffer, OverflowCopy::Backwards, producedOffset);
    530541            b->CreateBr(copyExit);
    531542
    532543            b->SetInsertPoint(copyExit);
    533         }
    534     }
    535 }
    536 
    537 /** ------------------------------------------------------------------------------------------------------------- *
    538  * @brief storeCopyForwardProducedItemCounts
    539  ** ------------------------------------------------------------------------------------------------------------- */
    540 inline void PipelineCompiler::storeCopyForwardProducedItemCounts(BuilderRef b) {
    541     const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    542     mCopyForwardProducedOutputItems.resize(numOfOutputs);
    543     std::fill(mCopyForwardProducedOutputItems.begin(), mCopyForwardProducedOutputItems.end(), nullptr);
    544     for (unsigned i = 0; i < numOfOutputs; ++i) {
    545         const StreamSetBuffer * const buffer = getOutputBuffer(i);
    546         if (requiresFacsimile(buffer)) {
    547             const Binding & output = mKernel->getOutputStreamSetBinding(i);
    548             mCopyForwardProducedOutputItems[i] = b->getNonDeferredProducedItemCount(output);
    549544        }
    550545    }
     
    559554    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    560555    for (unsigned i = 0; i < numOfOutputs; ++i) {
    561         if (mCopyForwardProducedOutputItems[i]) {
     556        if (requiresFacsimile(getOutputBufferVertex(i))) {
    562557
    563558            const Binding & output = mKernel->getOutputStreamSetBinding(i);
    564             const auto prefix = mKernel->getName() + "_" + output.getName();
     559            const auto prefix = makeBufferName(mKernelIndex, output);
    565560            BasicBlock * const copyForward = b->CreateBasicBlock(prefix + "_copyForward", mKernelExit);
    566561            BasicBlock * const copyExit = b->CreateBasicBlock(prefix + "_copyForwardExit", mKernelExit);
     
    568563
    569564            Value * const capacity = buffer->getCapacity(b.get());
    570             Value * const initial = mCopyForwardProducedOutputItems[i];
     565            Value * const initial = mInitiallyProducedItemCount[i];
    571566            Value * const produced = b->getNonDeferredProducedItemCount(output);
    572567
     
    578573            }
    579574            // And we started writing within the first block ...
    580             const auto requiredOverflowSize = getFacsimile(buffer);
     575            const auto requiredOverflowSize = getFacsimile(getOutputBufferVertex(i));
    581576            assert (requiredOverflowSize <= buffer->getOverflowCapacity(b));
    582577            Constant * const overflowSize = b->getSize(requiredOverflowSize);
     
    600595
    601596            b->SetInsertPoint(copyForward);
    602             Value * blocksToCopy = overflowSize;
    603             const auto itemWidth = getItemWidth(buffer->getBaseType());
    604             const auto blockWidth = b->getBitBlockWidth();
    605             if (LLVM_LIKELY(itemWidth < blockWidth)) {
    606                 blocksToCopy = b->CreateCeilUDiv(blocksToCopy, b->getSize(blockWidth / itemWidth));
    607             } else if (LLVM_UNLIKELY(blockWidth > itemWidth)) {
    608                 blocksToCopy = b->CreateMul(blocksToCopy, b->getSize(itemWidth / blockWidth));
    609             }
    610             blocksToCopy = b->CreateMul(blocksToCopy, buffer->getStreamSetCount(b.get()));
    611 
    612             const auto bytesPerBlock = blockWidth / 8;
    613             Value * const bytesToCopy = b->CreateMul(blocksToCopy, b->getSize(bytesPerBlock));
    614             Value * const source = buffer->getBaseAddress(b.get());
    615             Value * const target = buffer->getOverflowAddress(b.get());
    616             b->CreateMemCpy(target, source, bytesToCopy, bytesPerBlock);
     597            writeOverflowCopy(b, buffer, OverflowCopy::Forwards, overflowSize);
    617598            b->CreateBr(copyExit);
    618599
     
    623604
    624605/** ------------------------------------------------------------------------------------------------------------- *
     606 * @brief writeOverflowCopy
     607 ** ------------------------------------------------------------------------------------------------------------- */
     608Value * PipelineCompiler::writeOverflowCopy(BuilderRef b, const StreamSetBuffer * const buffer, const OverflowCopy direction, Value * const itemsToCopy) const {
     609    Value * const count = buffer->getStreamSetCount(b.get());
     610    Value * blocksToCopy = b->CreateMul(itemsToCopy, count);
     611    const auto itemWidth = getItemWidth(buffer->getBaseType());
     612    const auto blockWidth = b->getBitBlockWidth();
     613    if (LLVM_LIKELY(itemWidth < blockWidth)) {
     614        blocksToCopy = b->CreateCeilUDiv(blocksToCopy, b->getSize(blockWidth / itemWidth));
     615    } else if (LLVM_UNLIKELY(blockWidth > itemWidth)) {
     616        blocksToCopy = b->CreateMul(blocksToCopy, b->getSize(itemWidth / blockWidth));
     617    }
     618    const auto bytesPerBlock = blockWidth / 8;
     619    Value * const bytesToCopy = b->CreateMul(blocksToCopy, b->getSize(bytesPerBlock));
     620    Value * const base = buffer->getBaseAddress(b.get());
     621    Value * const overflow = buffer->getOverflowAddress(b.get());
     622    Value * const source = (direction == OverflowCopy::Forwards) ? base : overflow;
     623    Value * const target = (direction == OverflowCopy::Forwards) ? overflow : base;
     624    b->CreateMemCpy(target, source, bytesToCopy, bytesPerBlock);
     625    return bytesToCopy;
     626}
     627
     628/** ------------------------------------------------------------------------------------------------------------- *
    625629 * @brief getLogicalInputBaseAddress
    626630 ** ------------------------------------------------------------------------------------------------------------- */
    627 inline Value * PipelineCompiler::getLogicalInputBaseAddress(BuilderRef b, const unsigned index) const {
    628     const Binding & input = mKernel->getInputStreamSetBinding(index);
    629     const StreamSetBuffer * const buffer = getInputBuffer(index);
    630     Value * const processed = b->getNonDeferredProcessedItemCount(input);
    631     assert (mAccessibleInputItems[index]);
     631inline Value * PipelineCompiler::getLogicalInputBaseAddress(BuilderRef b, const unsigned inputPort) {
     632    const Binding & input = mKernel->getInputStreamSetBinding(inputPort);
     633    const StreamSetBuffer * const buffer = getInputBuffer(inputPort);
     634    Value * const processed = getAlreadyProcessedItemCount(b, inputPort);
    632635    return calculateLogicalBaseAddress(b, input, buffer, processed);
    633636}
     
    636639 * @brief getLogicalOutputBaseAddress
    637640 ** ------------------------------------------------------------------------------------------------------------- */
    638 inline Value * PipelineCompiler::getLogicalOutputBaseAddress(BuilderRef b, const unsigned index) const {
    639     const Binding & output = mKernel->getOutputStreamSetBinding(index);
    640     const StreamSetBuffer * const buffer = getOutputBuffer(index);
    641     Value * const produced = b->getNonDeferredProducedItemCount(output);
     641inline Value * PipelineCompiler::getLogicalOutputBaseAddress(BuilderRef b, const unsigned outputPort) {
     642    const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
     643    const StreamSetBuffer * const buffer = getOutputBuffer(outputPort);
     644    Value * const produced = getAlreadyProducedItemCount(b, outputPort);
    642645    return calculateLogicalBaseAddress(b, output, buffer, produced);
    643646}
     
    648651 * Returns the address of the "zeroth" item of the (logically-unbounded) stream set.
    649652 ** ------------------------------------------------------------------------------------------------------------- */
    650 inline Value * PipelineCompiler::calculateLogicalBaseAddress(BuilderRef b, const Binding & binding, const StreamSetBuffer * const buffer, Value * const itemCount) const {
     653inline Value * PipelineCompiler::calculateLogicalBaseAddress(BuilderRef b, const Binding & binding, const StreamSetBuffer * const buffer, Value * const itemCount) {
    651654    Constant * const LOG_2_BLOCK_WIDTH = b->getSize(floor_log2(b->getBitBlockWidth()));
    652655    Constant * const ZERO = b->getSize(0);
     
    662665        Value * const A0 = buffer->getStreamBlockPtr(b.get(), ZERO, blockIndex);
    663666        Value * const B0 = tmp.getStreamBlockPtr(b.get(), ZERO, blockIndex);
    664         b->CreateAssert(b->CreateICmpEQ(A0, B0), prefix + ": logical base address is incorrect");
     667        Value * const B1 = b->CreatePointerCast(B0, A0->getType());
     668        b->CreateAssert(b->CreateICmpEQ(A0, B1), prefix + ": logical base address is incorrect");
    665669    }
    666670    return address;
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/consumer_logic.hpp

    r6184 r6228  
    55
    66namespace kernel {
    7 
    8 namespace {
    9 
    10 const auto FAKE = std::numeric_limits<unsigned>::max();
    11 
    12 /** ------------------------------------------------------------------------------------------------------------- *
    13  * @brief minimumConsumed
    14  ** ------------------------------------------------------------------------------------------------------------- */
    15 inline LLVM_READNONE RateValue minimumConsumed(const Kernel * const kernel, const Binding & binding) {
    16     if (LLVM_UNLIKELY(binding.hasAttribute(AttrId::Deferred))) {
    17         return RateValue{0};
    18     }
    19     return lowerBound(kernel, binding);
    20 }
    21 
    22 }
    23 
    24 /** ------------------------------------------------------------------------------------------------------------- *
    25  * @brief makeConsumerGraph
    26  *
    27  * Copy the buffer graph but amalgamate any multi-edges into a single one
    28  ** ------------------------------------------------------------------------------------------------------------- */
    29 ConsumerGraph PipelineCompiler::makeConsumerGraph()  const {
    30 
    31     const auto lastBuffer = num_vertices(mBufferGraph);
    32     ConsumerGraph G(lastBuffer);
    33     const auto numOfKernels = mPipeline.size();
    34     const auto firstBuffer = numOfKernels + 1;
    35 
    36     #warning TODO: ConsumerGraph assumes the dataflow is transitively bounded by the same initial source
    37 
    38     #warning REVISIT: ConsumerGraph is not optimal for handling relative rate inputs
    39 
    40     std::vector<std::pair<unsigned, unsigned>> consumers; // kernel, portIndex
    41 
    42     for (auto bufferVertex = firstBuffer; bufferVertex < lastBuffer; ++bufferVertex) {
    43 
    44         if (LLVM_UNLIKELY(isPipelineIO(mBufferGraph[bufferVertex].buffer))) {
    45             continue;
    46         }
    47 
    48         // copy the producing edge
    49         const auto pe = in_edge(bufferVertex, mBufferGraph);
    50         add_edge(source(pe, mBufferGraph), bufferVertex, mBufferGraph[pe].port, G);
    51 
    52         // collect the consumers of the i-th buffer
    53         consumers.clear();
    54         for (const auto e : make_iterator_range(out_edges(bufferVertex, mBufferGraph))) {
    55             consumers.emplace_back(target(e, mBufferGraph), mBufferGraph[e].port);
    56         }
    57 
    58         // If the minimum input rate of the j-th consumer is greater than or equal to the maximum input
    59         // rate of the k-th consumer, we do not need to test the j-th consumer. This also ensures that
    60         // for each *FIXED* rate stream, keep only the minimum such rate. However, we may need to insert
    61         // a "fake" edge to mark the last consumer otherwise we'll set it too soon.
    62 
    63         if (LLVM_LIKELY(consumers.size() > 1)) {
    64             std::sort(consumers.begin(), consumers.end());
    65 
    66             const auto finalConsumer = consumers.back().first;
    67 
    68             for (auto j = consumers.begin() + 1; j != consumers.end(); ) {
    69 
    70                 const Kernel * const kernel_j = mPipeline[j->first];
    71                 const Binding & input_j = kernel_j->getInputStreamSetBinding(j->second);
    72                 const auto lb_j = minimumConsumed(kernel_j, input_j);
    73 
    74                 for (auto k = consumers.begin(); k != j; ++k) {
    75                     const Kernel * const kernel_k = mPipeline[k->first];
    76                     const Binding & input_k = kernel_k->getInputStreamSetBinding(k->second);
    77                     const auto ub_k = upperBound(kernel_k, input_k);
    78                     if (LLVM_UNLIKELY(lb_j >= ub_k)) {
    79                         j = consumers.erase(j);
    80                         goto next;
    81                     }
    82                 }
    83 
    84                 for (auto k = j + 1; k != consumers.end(); ++k) {
    85                     const Kernel * const kernel_k = mPipeline[k->first];
    86                     const Binding & input_k = kernel_k->getInputStreamSetBinding(k->second);
    87                     const auto ub_k = upperBound(kernel_k, input_k);
    88                     if (LLVM_UNLIKELY(lb_j >= ub_k)) {
    89                         j = consumers.erase(j);
    90                         goto next;
    91                     }
    92                 }
    93 
    94                 ++j;
    95 next:           continue;
    96             }
    97             if (LLVM_UNLIKELY(consumers.back().first != finalConsumer)) {
    98                 consumers.emplace_back(finalConsumer, FAKE);
    99             }
    100         }
    101 
    102         for (const auto & consumer : consumers) {
    103             add_edge(bufferVertex, consumer.first, consumer.second, G);
    104         }
    105     }
    106 
    107     return G;
    108 }
    1097
    1108/** ------------------------------------------------------------------------------------------------------------- *
     
    11917        setConsumedItemCount(b, bufferVertex, produced);
    12018    } else { // otherwise set the initial consumed amount to the produced amount
    121         ConsumerNode & cn = mConsumerGraph[bufferVertex]; assert (cn.consumed == nullptr);
    122         cn.consumed = produced;
     19        ConsumerNode & cn = mConsumerGraph[bufferVertex];
     20        assert (cn.Consumed == nullptr);
     21        cn.Consumed = produced;
    12322    }
    12423}
     
    12928inline void PipelineCompiler::createConsumedPhiNodes(BuilderRef b) {
    13029    for (const auto e : make_iterator_range(in_edges(mKernelIndex, mConsumerGraph))) {
    131         if (LLVM_UNLIKELY(mConsumerGraph[e] == FAKE)) continue;
     30        if (LLVM_UNLIKELY(mConsumerGraph[e] == FAKE_CONSUMER)) continue;
    13231        const auto bufferVertex = source(e, mConsumerGraph);
    133         ConsumerNode & cn = mConsumerGraph[bufferVertex]; assert (cn.consumed);
     32        ConsumerNode & cn = mConsumerGraph[bufferVertex]; assert (cn.Consumed);
    13433        PHINode * const consumedPhi = b->CreatePHI(b->getSizeTy(), 2);
    135         consumedPhi->addIncoming(cn.consumed, mKernelEntry);
    136         cn.phiNode = consumedPhi;
     34        consumedPhi->addIncoming(cn.Consumed, mKernelEntry);
     35        cn.PhiNode = consumedPhi;
    13736    }
    13837}
     
    14342inline void PipelineCompiler::computeMinimumConsumedItemCounts(BuilderRef b) {
    14443    for (const auto e : make_iterator_range(in_edges(mKernelIndex, mConsumerGraph))) {
    145         if (LLVM_UNLIKELY(mConsumerGraph[e] == FAKE)) continue;
    146         const Binding & input = mKernel->getInputStreamSetBinding(mConsumerGraph[e]);
    147         Value * const processed = getFullyProcessedItemCount(b, input);
     44        if (LLVM_UNLIKELY(mConsumerGraph[e] == FAKE_CONSUMER)) continue;
     45        Value * const processed = mFullyProcessedItemCount[mConsumerGraph[e]];
    14846        const auto bufferVertex = source(e, mConsumerGraph);
    149         ConsumerNode & cn = mConsumerGraph[bufferVertex]; assert (cn.consumed);
    150         cn.consumed = b->CreateUMin(cn.consumed, processed);
     47        ConsumerNode & cn = mConsumerGraph[bufferVertex]; assert (cn.Consumed);
     48        cn.Consumed = b->CreateUMin(cn.Consumed, processed);
    15149    }
    15250}
     
    15856    for (const auto e : make_iterator_range(in_edges(mKernelIndex, mConsumerGraph))) {
    15957        const auto bufferVertex = source(e, mConsumerGraph);
    160         ConsumerNode & cn = mConsumerGraph[bufferVertex]; assert (cn.consumed);
    161         if (LLVM_LIKELY(mConsumerGraph[e] != FAKE)) {
    162             cn.phiNode->addIncoming(cn.consumed, mKernelLoopExitPhiCatch);
    163             cn.consumed = cn.phiNode;
     58        ConsumerNode & cn = mConsumerGraph[bufferVertex]; assert (cn.Consumed);
     59        if (LLVM_LIKELY(mConsumerGraph[e] != FAKE_CONSUMER)) {
     60            cn.PhiNode->addIncoming(cn.Consumed, mKernelLoopExitPhiCatch);
     61            cn.Consumed = cn.PhiNode;
    16462        }
    16563        // Is this kernel the last consumer? If so, store the consumed count
    16664        if (out_degree(bufferVertex, mConsumerGraph) == 1) {
    167             setConsumedItemCount(b, bufferVertex, cn.consumed);
     65            setConsumedItemCount(b, bufferVertex, cn.Consumed);
    16866        }
    16967    }
     
    17472 * @brief getConsumedItemCount
    17573 ** ------------------------------------------------------------------------------------------------------------- */
    176 Value * PipelineCompiler::getConsumedItemCount(BuilderRef b, const unsigned index) const {
     74Value * PipelineCompiler::getConsumedItemCount(BuilderRef b, const unsigned outputPort) const {
     75    // TODO: if we can prove that this stream is always fully consumed, its consumed count
     76    // is its produced count.
    17777    assert (mPipeline[mKernelIndex] == mKernel);
    17878    assert (b->getKernel() == mKernel);
    179     const Binding & output = mKernel->getOutputStreamSetBinding(index);
    180     StreamSetBuffer * const buffer = getOutputBuffer(index);
     79    const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
    18180    Value * consumed = nullptr;
    182     if (LLVM_UNLIKELY(isa<ExternalBuffer>(buffer))) {
    183         if (LLVM_UNLIKELY(isPipelineIO(buffer))) {
    184             consumed = b->getSize(0);
    185         } else {
    186             consumed = b->getScalarField(output.getName() + CONSUMED_ITEM_COUNT_SUFFIX);
    187         }
    188     } else {
     81    const BufferNode & bn = mBufferGraph[getOutputBufferVertex(mKernelIndex, outputPort)];
     82    if (LLVM_LIKELY(bn.Type == BufferType::Internal)) {
    18983        b->setKernel(mPipelineKernel);
    19084        consumed = b->getScalarField(makeBufferName(mKernelIndex, output) + CONSUMED_ITEM_COUNT_SUFFIX);
    19185        b->setKernel(mKernel);
     86    } else if (bn.Type == BufferType::Managed) {
     87        consumed = b->getScalarField(output.getName() + CONSUMED_ITEM_COUNT_SUFFIX);
     88    } else if (bn.Type == BufferType::External) {
     89        consumed = b->getSize(0);
    19290    }
     91    assert (consumed);
    19392    return consumed;
    19493}
     
    201100    const auto pe = in_edge(bufferVertex, mConsumerGraph);
    202101    const auto producerVertex = source(pe, mConsumerGraph);
    203     const Kernel * const producer = mBufferGraph[producerVertex].kernel;
     102    const Kernel * const producer = mBufferGraph[producerVertex].Kernel;
    204103    assert (producer->getHandle());
    205104    const Binding & output = producer->getOutputStreamSetBinding(mConsumerGraph[pe]);
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/core_logic.hpp

    r6184 r6228  
    1 #include "pipeline_compiler.hpp"
     1#include "pipeline_compiler.hpp"
    22
    33namespace kernel {
     4
     5/** ------------------------------------------------------------------------------------------------------------- *
     6 * @brief addInternalKernelProperties
     7 ** ------------------------------------------------------------------------------------------------------------- */
     8inline void PipelineCompiler::addInternalKernelProperties(BuilderRef b) {
     9    initializePopCounts();
     10    const auto numOfKernels = mPipeline.size();
     11    b->setKernel(mPipelineKernel);
     12    for (unsigned i = 0; i < numOfKernels; ++i) {
     13        addBufferHandlesToPipelineKernel(b, i);
     14        addPopCountScalarsToPipelineKernel(b, i);
     15    }
     16}
    417
    518/** ------------------------------------------------------------------------------------------------------------- *
     
    4154}
    4255
    43 
    44 
    4556/** ------------------------------------------------------------------------------------------------------------- *
    4657 * @brief start
     
    5263    mPipelineLoop = b->CreateBasicBlock("pipelineLoop");
    5364    mPipelineEnd = b->CreateBasicBlock("pipelineEnd");
    54 
    55     const auto numOfKernels = mPipeline.size();
    56     for (mKernelIndex = 0; mKernelIndex < numOfKernels; ++mKernelIndex) {
    57         mKernel = mPipeline[mKernelIndex];
    58         const auto numOfInputs = mKernel->getNumOfStreamInputs();
    59         for (unsigned j = 0; j < numOfInputs; ++j) {
    60             allocateThreadLocalState(b, Port::Input, j);
    61         }
    62         const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    63         for (unsigned j = 0; j < numOfOutputs; ++j) {
    64             allocateThreadLocalState(b, Port::Output, j);
    65         }
    66     }
    6765
    6866    mKernel = nullptr;
     
    7876        mPipelineProgress = b->getFalse();
    7977    }
     78    #ifdef PRINT_DEBUG_MESSAGES
     79    b->CallPrintInt("+++ pipeline start +++", mSegNo);
     80    #endif
    8081    startOptionalCycleCounter(b);
    8182}
     
    8687void PipelineCompiler::executeKernel(BuilderRef b) {
    8788
    88     assert (mKernel == mPipeline[mKernelIndex] && b->getKernel() == mKernel);
    89 
     89    resetMemoizedFields();
     90    mPortOrdering = lexicalOrderingOfStreamIO();
    9091    loadBufferHandles(b);
    91     mNoMore = nullptr;
    9292
    9393    mKernelEntry = b->GetInsertBlock();
    9494
    9595    const auto kernelName = makeKernelName(mKernelIndex);
    96 
    9796    BasicBlock * const checkProducers = b->CreateBasicBlock(kernelName + "_checkProducers", mPipelineEnd);
    9897    mKernelLoopEntry = b->CreateBasicBlock(kernelName + "_loopEntry", mPipelineEnd);
     
    107106    /// KERNEL ENTRY
    108107    /// -------------------------------------------------------------------------------------
    109 
    110     b->CreateUnlikelyCondBr(b->getTerminationSignal(), mKernelExit, checkProducers);
     108    Value * const term = b->getTerminationSignal();
     109    #ifdef PRINT_DEBUG_MESSAGES
     110    if (1) {
     111    Constant * const MAX_INT = ConstantInt::getAllOnesValue(mSegNo->getType());
     112    Value * const round = b->CreateSelect(term, MAX_INT, mSegNo);
     113    b->CallPrintInt("--- " + kernelName + "_start ---", round);
     114    }
     115    #endif
     116    b->CreateUnlikelyCondBr(term, mKernelExit, checkProducers);
    111117
    112118    /// -------------------------------------------------------------------------------------
     
    115121
    116122    b->SetInsertPoint(checkProducers);
    117     checkIfAllProducingKernelsHaveTerminated(b);
    118     storeCopyForwardProducedItemCounts(b);
     123    readInitialProducedItemCounts(b);
    119124    b->CreateBr(mKernelLoopEntry);
    120125
     
    133138    }
    134139
    135     assert (mKernel == mPipeline[mKernelIndex] && b->getKernel() == mKernel);
    136 
    137140    /// -------------------------------------------------------------------------------------
    138141    /// KERNEL CALL
     
    141144    b->SetInsertPoint(mKernelLoopCall);
    142145    initializeKernelCallPhis(b);
    143 
    144     assert (mKernel == mPipeline[mKernelIndex] && b->getKernel() == mKernel);
    145146
    146147    /// -------------------------------------------------------------------------------------
     
    161162    initializeKernelExitPhis(b);
    162163
    163     assert (mKernel == mPipeline[mKernelIndex] && b->getKernel() == mKernel);
    164 
    165164    /// -------------------------------------------------------------------------------------
    166165    /// KERNEL LOOP ENTRY (CONTINUED)
     
    168167
    169168    b->SetInsertPoint(mKernelLoopEntry);
     169    checkForSufficientInputDataAndOutputSpace(b);
     170    determineNumOfLinearStrides(b);
     171
     172    Value * isFinal = nullptr;
     173
    170174    ConstantInt * const ZERO = b->getSize(0);
    171     ConstantInt * const TRUE = b->getTrue();
    172     ConstantInt * const FALSE = b->getFalse();
    173     Value * const nonFinal = checkForSufficientInputDataAndOutputSpace(b);
    174     if (nonFinal) {
    175 
    176         if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    177             Value * const valid = b->CreateOr(nonFinal, mNoMore);
    178             b->CreateAssert(valid, kernelName + ": isFinal was set before all producers have terminated");
    179         }
     175
     176    if (mNumOfLinearStrides) {
    180177
    181178        BasicBlock * const enteringNonFinalSegment = b->CreateBasicBlock(kernelName + "_nonFinalSegment", mKernelLoopCall);
    182179        BasicBlock * const enteringFinalStride = b->CreateBasicBlock(kernelName + "_finalStride", mKernelLoopCall);
    183         b->CreateLikelyCondBr(nonFinal, enteringNonFinalSegment, enteringFinalStride);
     180
     181        isFinal = b->CreateICmpEQ(mNumOfLinearStrides, ZERO);
     182
     183        b->CreateUnlikelyCondBr(isFinal, enteringFinalStride, enteringNonFinalSegment);
     184
     185        /// -------------------------------------------------------------------------------------
     186        /// KERNEL ENTERING FINAL STRIDE
     187        /// -------------------------------------------------------------------------------------
     188
     189        b->SetInsertPoint(enteringFinalStride);
     190        calculateFinalItemCounts(b);
     191        b->CreateBr(mKernelLoopCall);
    184192
    185193        /// -------------------------------------------------------------------------------------
     
    188196
    189197        b->SetInsertPoint(enteringNonFinalSegment);
    190         Value * const numOfLinearStrides = determineNumOfLinearStrides(b);
    191         calculateNonFinalItemCounts(b, numOfLinearStrides);
    192         BasicBlock * const endOfNonFinalStride = b->GetInsertBlock();
    193         mNumOfLinearStridesPhi->addIncoming(numOfLinearStrides, endOfNonFinalStride);
    194         mIsFinalPhi->addIncoming(FALSE, endOfNonFinalStride);
     198        calculateNonFinalItemCounts(b);
    195199        b->CreateBr(mKernelLoopCall);
    196200
    197         /// -------------------------------------------------------------------------------------
    198         /// KERNEL ENTERING FINAL STRIDE
    199         /// -------------------------------------------------------------------------------------
    200 
    201         b->SetInsertPoint(enteringFinalStride);
    202         calculateFinalItemCounts(b);
    203         BasicBlock * const endOfFinalStride = b->GetInsertBlock();
    204         mNumOfLinearStridesPhi->addIncoming(ZERO, endOfFinalStride);
    205         mIsFinalPhi->addIncoming(TRUE, endOfFinalStride);
     201    } else {
     202        mNumOfLinearStrides = ZERO;
    206203        b->CreateBr(mKernelLoopCall);
    207 
    208     } else {
    209 
    210         BasicBlock * const endOfLoopEntry = b->GetInsertBlock();
    211         mNumOfLinearStridesPhi->addIncoming(ZERO, endOfLoopEntry);
    212         mIsFinalPhi->addIncoming(mNoMore, endOfLoopEntry);
    213         b->CreateBr(mKernelLoopCall);
    214204    }
    215205
     
    219209
    220210    b->SetInsertPoint(mKernelLoopCall);
    221     assert (mKernel == mPipeline[mKernelIndex] && b->getKernel() == mKernel);
    222     storeCopyBackProducedItemCounts(b);
     211    expandOutputBuffers(b);
    223212    writeKernelCall(b);
    224     assert (mKernel == mPipeline[mKernelIndex] && b->getKernel() == mKernel);
     213
     214    BasicBlock * const incrementItemCounts = b->CreateBasicBlock(kernelName + "_incrementItemCounts", mKernelLoopExit);
     215    BasicBlock * const terminationCheck = b->CreateBasicBlock(kernelName + "_normalTerminationCheck", mKernelLoopExit);
     216    BasicBlock * const terminated = b->CreateBasicBlock(kernelName + "_terminated", mKernelLoopExit);
     217
     218    // If the kernel itself terminates, it must set the final processed/produced item counts.
     219    // Otherwise, the pipeline will update any countable rates, even upon termination.
     220    b->CreateUnlikelyCondBr(terminatedExplicitly(b), terminated, incrementItemCounts);
     221
     222    /// -------------------------------------------------------------------------------------
     223    /// KERNEL INCREMENT ITEM COUNTS
     224    /// -------------------------------------------------------------------------------------
     225
     226    b->SetInsertPoint(incrementItemCounts);
     227    // TODO: phi out the item counts and set them once at the end.
     228    incrementItemCountsOfCountableRateStreams(b);
    225229    writeCopyBackLogic(b);
    226     Value * const terminated = isTerminated(b);
    227     BasicBlock * const kernelTerminated = b->CreateBasicBlock(kernelName + "_markAsTerminated", mKernelLoopExit);
    228     BasicBlock * const callKernelEnd = b->GetInsertBlock();
    229     if (LLVM_LIKELY(mKernel->getNumOfStreamInputs() > 0)) { // hasAnyInputChecks()
     230    b->CreateBr(terminationCheck);
     231
     232    /// -------------------------------------------------------------------------------------
     233    /// KERNEL NORMAL TERMINATION CHECK
     234    /// -------------------------------------------------------------------------------------
     235
     236    b->SetInsertPoint(terminationCheck);
     237    if (isFinal) {
    230238        if (mAlreadyProgressedPhi) {
    231             mAlreadyProgressedPhi->addIncoming(TRUE, callKernelEnd);
    232         }
    233         b->CreateUnlikelyCondBr(terminated, kernelTerminated, mKernelLoopEntry);
     239            mAlreadyProgressedPhi->addIncoming(b->getTrue(), terminationCheck);
     240        }
     241        b->CreateUnlikelyCondBr(isFinal, terminated, mKernelLoopEntry);
    234242    } else { // just exit the loop
    235243        if (mHasProgressedPhi) {
    236             mHasProgressedPhi->addIncoming(TRUE, callKernelEnd);
    237         }
    238         mTerminatedPhi->addIncoming(FALSE, callKernelEnd);
    239         b->CreateUnlikelyCondBr(terminated, kernelTerminated, mKernelLoopExit);
     244            mHasProgressedPhi->addIncoming(b->getTrue(), terminationCheck);
     245        }
     246        mTerminatedPhi->addIncoming(b->getFalse(), terminationCheck);
     247        b->CreateBr(mKernelLoopExit);
    240248    }
    241249
     
    244252    /// -------------------------------------------------------------------------------------
    245253
    246     // mark this kernel as terminated
    247     b->SetInsertPoint(kernelTerminated);
     254    b->SetInsertPoint(terminated);
    248255    zeroFillPartiallyWrittenOutputStreams(b);
    249256    setTerminated(b);
    250     BasicBlock * const kernelTerminatedExit = b->GetInsertBlock();
    251     mTerminatedPhi->addIncoming(TRUE, kernelTerminatedExit);
     257    BasicBlock * const kernelTerminatedEnd = b->GetInsertBlock();
     258    mTerminatedPhi->addIncoming(b->getTrue(), kernelTerminatedEnd);
    252259    if (mHasProgressedPhi) {
    253         mHasProgressedPhi->addIncoming(TRUE, kernelTerminatedExit);
     260        mHasProgressedPhi->addIncoming(b->getTrue(), kernelTerminatedEnd);
    254261    }
    255262    b->CreateBr(mKernelLoopExit);
     
    260267
    261268    b->SetInsertPoint(mKernelLoopExit);
     269    computeFullyProcessedItemCounts(b);
    262270    computeMinimumConsumedItemCounts(b);
     271    computeMinimumPopCountReferenceCounts(b);
    263272    writeCopyForwardLogic(b);
     273    writePopCountComputationLogic(b);
    264274    b->CreateBr(mKernelLoopExitPhiCatch);
    265275    b->SetInsertPoint(mKernelLoopExitPhiCatch);
     
    272282    b->SetInsertPoint(mKernelExit);
    273283    writeFinalConsumedItemCounts(b);
     284    updatePopCountReferenceCounts(b);
    274285
    275286    // TODO: logically we should only need to read produced item counts in the loop exit; however, that
     
    277288    // to have access to them here and then PHI them out within the kernel loop
    278289
    279     readCurrentProducedItemCounts(b);
     290    readFinalProducedItemCounts(b);
    280291    releaseCurrentSegment(b);
    281292    updateOptionalCycleCounter(b);
     
    323334        Value * const plusOne = b->CreateAdd(mDeadLockCounter, ONE);
    324335        Value * const newCount = b->CreateSelect(mPipelineProgress, ZERO, plusOne);
    325         b->CreateAssert(b->CreateICmpNE(newCount, TWO), "Dead lock detected: pipeline could not progress after two iterations");
     336        b->CreateAssert(b->CreateICmpNE(newCount, TWO),
     337                        "Dead lock detected: pipeline could not progress after two iterations");
    326338        mDeadLockCounter->addIncoming(newCount, b->GetInsertBlock());
    327339    }
    328     // return whether each sink has terminated
    329     Value * done = b->getTrue();
    330     for (const auto e : make_iterator_range(in_edges(mPipeline.size(), mTerminationGraph))) {
     340    // check whether every sink has terminated
     341    Value * allTerminated = b->getTrue();
     342    const auto pipelineOutputVertex = mPipeline.size();
     343    for (const auto e : make_iterator_range(in_edges(pipelineOutputVertex, mTerminationGraph))) {
    331344        const auto u = source(e, mTerminationGraph);
    332345        assert (mTerminationGraph[u]);
    333         done = b->CreateAnd(done, mTerminationGraph[u]);
    334     }
     346        allTerminated = b->CreateAnd(allTerminated, mTerminationGraph[u]);
     347    }
     348    // or if any output stream of this pipeline cannot support a full stride
     349    Value * notEnoughSpace = b->getFalse();
     350    for (const auto e : make_iterator_range(in_edges(pipelineOutputVertex, mBufferGraph))) {
     351        // TODO: not a very elegant way here; revise
     352        const auto bufferVertex = source(e, mBufferGraph);
     353        mKernelIndex = parent(bufferVertex, mBufferGraph);
     354        mKernel = mPipeline[mKernelIndex];
     355        resetMemoizedFields();
     356        const auto outputPort = mBufferGraph[e].Port;
     357        Value * const writable = getWritableOutputItems(b, outputPort);
     358        // NOTE: this method doesn't check a popcount's ref stream to determine how many
     359        // items we actually require. Instead it just calculates them as bounded rates.
     360        // To support a precise bound, we'd need to produce more ref items than the kernel
     361        // that writes to this output actually consumes. Since this effectively adds a
     362        // delay equivalent to a LookAhead of a full stride, this doesn't seem useful.
     363        Value * const strideLength = getMaximumStrideLength(b, Port::Output, outputPort);
     364        notEnoughSpace = b->CreateOr(b->CreateICmpULT(writable, strideLength), notEnoughSpace);
     365    }
     366    Value * const done = b->CreateOr(allTerminated, notEnoughSpace);
     367    #ifdef PRINT_DEBUG_MESSAGES
     368    Constant * const ONES = Constant::getAllOnesValue(mSegNo->getType());
     369    b->CallPrintInt("+++ pipeline end +++", b->CreateSelect(done, ONES, mSegNo));
     370    #endif
     371
    335372    Value * const nextSegNo = b->CreateAdd(mSegNo, b->getSize(step));
    336373    mSegNo->addIncoming(nextSegNo, b->GetInsertBlock());
     
    338375
    339376    b->SetInsertPoint(mPipelineEnd);
    340     for (unsigned i = 0; i < mPipeline.size(); ++i) {
    341         setActiveKernel(b, i);
    342         const auto numOfInputs = mKernel->getNumOfStreamInputs();
    343         for (unsigned j = 0; j < numOfInputs; ++j) {
    344             deallocateThreadLocalState(b, Port::Input, j);
    345         }
    346         const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    347         for (unsigned j = 0; j < numOfOutputs; ++j) {
    348             deallocateThreadLocalState(b, Port::Output, j);
    349         }
    350     }
    351377    mSegNo = nullptr;
    352378}
     
    429455 ** ------------------------------------------------------------------------------------------------------------- */
    430456inline void PipelineCompiler::initializeKernelCallPhis(BuilderRef b) {
    431 
    432     const auto kernelName = makeKernelName(mKernelIndex);
    433 
    434457    const auto numOfInputs = mKernel->getNumOfStreamInputs();
    435     mAccessibleInputItemsPhi.clear();
    436     mAccessibleInputItemsPhi.resize(numOfInputs, nullptr);
    437458    Type * const sizeTy = b->getSizeTy();
    438459    for (unsigned i = 0; i < numOfInputs; ++i) {
    439460        const Binding & input = mKernel->getInputStreamSetBinding(i);
    440         mAccessibleInputItemsPhi[i] = b->CreatePHI(sizeTy, 2, kernelName + "_" + input.getName() + "_accessible");
     461        const auto prefix = makeBufferName(mKernelIndex, input);
     462        mLinearInputItemsPhi[i] = b->CreatePHI(sizeTy, 2, prefix + "_linearlyAccessible");
    441463    }
    442464    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    443     mWritableOutputItemsPhi.clear();
    444     mWritableOutputItemsPhi.resize(numOfOutputs, nullptr);
    445465    for (unsigned i = 0; i < numOfOutputs; ++i) {
    446         const Binding & output = mKernel->getOutputStreamSetBinding(i);
    447         if (!output.hasAttribute(AttrId::ManagedBuffer)) {
    448             mWritableOutputItemsPhi[i] = b->CreatePHI(sizeTy, 2, kernelName + "_" + output.getName() + "_writable");
    449         }
    450     }
    451     mNumOfLinearStridesPhi = b->CreatePHI(b->getSizeTy(), 2, kernelName + "_numOfLinearStrides");
    452     mIsFinalPhi = b->CreatePHI(b->getInt1Ty(), 2, kernelName + "_isFinal");
     466        if (LLVM_LIKELY(getOutputBufferType(i) != BufferType::Managed)) {
     467            const Binding & output = mKernel->getOutputStreamSetBinding(i);
     468            const auto prefix = makeBufferName(mKernelIndex, output);
     469            mLinearOutputItemsPhi[i] = b->CreatePHI(sizeTy, 2, prefix + "_linearlyWritable");
     470        }
     471    }
    453472}
    454473
     
    458477inline void PipelineCompiler::initializeKernelExitPhis(BuilderRef b) {
    459478    const auto kernelName = makeKernelName(mKernelIndex);
    460 
    461     PHINode * const terminatedPhi = b->CreatePHI(b->getInt1Ty(), 2, kernelName + "_terminated");
    462     terminatedPhi->addIncoming(b->getTrue(), mKernelEntry);
    463     terminatedPhi->addIncoming(mTerminatedPhi, mKernelLoopExitPhiCatch);
    464     mTerminationGraph[mKernelIndex] = terminatedPhi;
     479    mTerminatedFlag = b->CreatePHI(b->getInt1Ty(), 2, kernelName + "_terminated");
     480    mTerminatedFlag->addIncoming(b->getTrue(), mKernelEntry);
     481    mTerminatedFlag->addIncoming(mTerminatedPhi, mKernelLoopExitPhiCatch);
     482    mTerminationGraph[mKernelIndex] = mTerminatedFlag;
    465483    if (mPipelineProgress) {
    466484        PHINode * const pipelineProgress = b->CreatePHI(b->getInt1Ty(), 2, "pipelineProgress");
     
    470488    }
    471489    createConsumedPhiNodes(b);
    472 }
    473 
    474 /** ------------------------------------------------------------------------------------------------------------- *
    475  * @brief checkIfAllInputKernelsHaveFinished
    476  ** ------------------------------------------------------------------------------------------------------------- */
    477 inline void PipelineCompiler::checkIfAllProducingKernelsHaveTerminated(BuilderRef b) {
    478     const auto n = in_degree(mKernelIndex, mTerminationGraph);
    479     Value * noMore = nullptr;
    480     if (LLVM_UNLIKELY(n == 0)) {
    481         noMore = b->getFalse();
    482     } else {
    483         for (auto e : make_iterator_range(in_edges(mKernelIndex, mTerminationGraph))) {
    484             const auto u = source(e, mTerminationGraph);
    485             Value * const finished = mTerminationGraph[u];
    486             assert (finished);
    487             if (noMore) {
    488                 noMore = b->CreateAnd(noMore, finished);
    489             } else {
    490                 noMore = finished;
    491             }
    492         }
    493     }
    494     #ifdef PRINT_DEBUG_MESSAGES
    495     b->CallPrintInt(mKernel->getName() + "_noMore", noMore);
    496     #endif
    497     mNoMore = noMore;
     490    createPopCountReferenceCounts(b);
    498491}
    499492
     
    501494
    502495/** ------------------------------------------------------------------------------------------------------------- *
    503  * @brief isTerminated
    504  ** ------------------------------------------------------------------------------------------------------------- */
    505 Value * PipelineCompiler::isTerminated(BuilderRef b) const {
    506     if (LLVM_UNLIKELY(mKernel->hasAttribute(AttrId::MustExplicitlyTerminate))) {
     496 * @brief terminatedExplicitly
     497 ** ------------------------------------------------------------------------------------------------------------- */
     498Value * PipelineCompiler::terminatedExplicitly(BuilderRef b) const {
     499    if (LLVM_UNLIKELY(mKernel->hasAttribute(AttrId::MustExplicitlyTerminate) || mKernel->hasAttribute(AttrId::CanTerminateEarly))) {
    507500        return b->getTerminationSignal();
    508501    } else {
    509         if (mKernel->hasAttribute(AttrId::CanTerminateEarly)) {
    510             Value * const terminated = b->getTerminationSignal();
    511             return b->CreateOr(mIsFinalPhi, terminated);
    512         } else {
    513             return mIsFinalPhi;
    514         }
     502        return b->getFalse();
    515503    }
    516504}
     
    523511        return;
    524512    }
     513    const auto prefix = makeKernelName(mKernelIndex);
     514    BasicBlock * const terminationIsSet = b->CreateBasicBlock(prefix + "_terminationIsSet", mKernelLoopExit);
     515    if (mKernel->hasAttribute(AttrId::CanTerminateEarly)) {
     516        BasicBlock * const setTermination = b->CreateBasicBlock(prefix + "_setTermination", terminationIsSet);
     517        Value * const terminated = b->getTerminationSignal();
     518        b->CreateCondBr(terminated, terminationIsSet, setTermination);
     519
     520        b->SetInsertPoint(setTermination);
     521    }
    525522    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableMProtect))) {
    526523        b->CreateMProtect(mKernel->getHandle(), CBuilder::Protect::WRITE);
     
    530527        b->CreateMProtect(mKernel->getHandle(), CBuilder::Protect::READ);
    531528    }
     529    #ifdef PRINT_DEBUG_MESSAGES
     530    b->CallPrintInt("*** " + prefix + "_terminated ***", b->getTrue());
     531    #endif
     532    b->CreateBr(terminationIsSet);
     533
     534    b->SetInsertPoint(terminationIsSet);
    532535}
    533536
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/kernel_logic.hpp

    r6184 r6228  
    2323 * @brief determineNumOfLogicalStrides
    2424 ** ------------------------------------------------------------------------------------------------------------- */
    25 Value * PipelineCompiler::checkForSufficientInputDataAndOutputSpace(BuilderRef b) {
    26 
     25void PipelineCompiler::checkForSufficientInputDataAndOutputSpace(BuilderRef b) {
    2726    assert (b->getKernel() == mKernel);
    28 
    2927    const auto numOfInputs = mKernel->getNumOfStreamInputs();
    30     mInputStrideLength.clear();
    31     mInputStrideLength.resize(numOfInputs, nullptr);
    32     mAccessibleInputItems.clear();
    33     mAccessibleInputItems.resize(numOfInputs, nullptr);
    34 
    35     const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    36     mOutputStrideLength.clear();
    37     mOutputStrideLength.resize(numOfOutputs, nullptr);
    38     mWritableOutputItems.clear();
    39     mWritableOutputItems.resize(numOfOutputs, nullptr);
    40 
    41     // TODO: current analysis didn't take linear accessibility into account. For now, test everything but
    42     // return to this once this pipeline is responsible for determining buffer sizes / types.
    43 
    44     // TODO: if popcounts stated consume k items per marker, the popcount dependency graph needs to reflect it.
    45 
    46     initializePopCounts(b);
    47     mNonFinal = nullptr;
    48     for (unsigned i = 0; i < numOfInputs; ++i) {
    49         checkForSufficientInputData(b, i);
    50     }
    51     for (unsigned i = 0; i < numOfOutputs; ++i) {
    52         checkForSufficientOutputSpaceOrExpand(b, i);
    53     }
    54     assert (mNonFinal || numOfInputs == 0);
    55     return mNonFinal;
     28    for (const auto i : mPortOrdering) {
     29        if (i < numOfInputs) {
     30            checkForSufficientInputData(b, i);
     31        } else {
     32            checkForSufficientOutputSpaceOrExpand(b, i - numOfInputs);
     33        }
     34    }
    5635}
    5736
     
    5938 * @brief checkForSufficientInputData
    6039 ** ------------------------------------------------------------------------------------------------------------- */
    61 inline void PipelineCompiler::checkForSufficientInputData(BuilderRef b, const unsigned index) {
    62     const Binding & input = mKernel->getInputStreamSetBinding(index);
    63     Value * const accessible = getAccessibleInputItems(b, index);
    64     Value * const strideLength = getInputStrideLength(b, index);
    65     Value * const requiredInput = addLookahead(b, index, strideLength);
    66     Value * const hasEnough = b->CreateICmpUGE(accessible, requiredInput);
    67     Value * const sufficientInput = b->CreateOr(hasEnough, mNoMore);
    68     const auto prefix = mKernel->getName() + "_" + input.getName();
     40inline void PipelineCompiler::checkForSufficientInputData(BuilderRef b, const unsigned inputPort) {
     41    // TODO: we could eliminate some checks if we can prove a particular input
     42    // must have enough data based on its already tested inputs and ignore
     43    // checking whether an input kernel is terminated if a stronger test has
     44    // already been done. Work out the logic for these tests globally.
     45
     46    Value * const accessible = getAccessibleInputItems(b, inputPort);
     47    Value * const strideLength = getInputStrideLength(b, inputPort);
     48    Value * const requiredInput = addLookahead(b, inputPort, strideLength);
     49    const Binding & input = mKernel->getInputStreamSetBinding(inputPort);
     50    const auto prefix = makeBufferName(mKernelIndex, input);
    6951    #ifdef PRINT_DEBUG_MESSAGES
    7052    b->CallPrintInt(prefix + "_requiredInput", requiredInput);
    7153    #endif
    72     BasicBlock * const target = b->CreateBasicBlock(prefix + "_hasSufficientInput", mKernelLoopCall);
    73     branchToTargetOrLoopExit(b, sufficientInput, target);
    74     mNonFinal = mNonFinal ? b->CreateAnd(mNonFinal, hasEnough) : hasEnough;
     54    Value * const hasEnough = b->CreateICmpUGE(accessible, requiredInput);
     55    Value * const hasTerminated = hasProducerTerminated(b, inputPort);
     56    Value * const sufficientInput = b->CreateOr(hasEnough, hasTerminated);
     57    mAccessibleInputItems[inputPort] = accessible;
     58    BasicBlock * const target = b->CreateBasicBlock(prefix + "_hasInputData", mKernelLoopCall);
     59
     60    b->CreateLikelyCondBr(sufficientInput, target, mKernelLoopExit);
     61    BasicBlock * const exitBlock = b->GetInsertBlock();
     62    mTerminatedPhi->addIncoming(b->getFalse(), exitBlock);
     63    if (mHasProgressedPhi) {
     64        mHasProgressedPhi->addIncoming(mAlreadyProgressedPhi, exitBlock);
     65    }
     66    b->SetInsertPoint(target);
     67}
     68
     69/** ------------------------------------------------------------------------------------------------------------- *
     70 * @brief getAlreadyProcessedItemCount
     71 ** ------------------------------------------------------------------------------------------------------------- */
     72Value * PipelineCompiler::getAlreadyProcessedItemCount(BuilderRef b, const unsigned inputPort) {
     73    if (mAlreadyProcessedItemCount[inputPort]) {
     74        return mAlreadyProcessedItemCount[inputPort];
     75    }
     76    const Binding & input = mKernel->getInputStreamSetBinding(inputPort);
     77    Value * const processed = b->getNonDeferredProcessedItemCount(input);
     78    mAlreadyProcessedItemCount[inputPort] = processed;
     79    return processed;
     80}
     81
     82/** ------------------------------------------------------------------------------------------------------------- *
     83 * @brief hasProducerTerminated
     84 ** ------------------------------------------------------------------------------------------------------------- */
     85inline Value * PipelineCompiler::hasProducerTerminated(BuilderRef /* b */, const unsigned inputPort) const {
     86    const auto bufferVertex = getInputBufferVertex(inputPort);
     87    const auto producerVertex = parent(bufferVertex, mBufferGraph);
     88    return mTerminationGraph[producerVertex];
    7589}
    7690
     
    7892 * @brief getAccessibleInputItems
    7993 ** ------------------------------------------------------------------------------------------------------------- */
    80 inline Value * PipelineCompiler::getAccessibleInputItems(BuilderRef b, const unsigned index) {
    81     assert (index < mAccessibleInputItems.size());
    82     if (mAccessibleInputItems[index]) {
    83         return mAccessibleInputItems[index];
     94inline Value * PipelineCompiler::getAccessibleInputItems(BuilderRef b, const unsigned inputPort) {
     95    assert (inputPort < mAccessibleInputItems.size());
     96    const Binding & input = mKernel->getInputStreamSetBinding(inputPort);
     97    const StreamSetBuffer * const buffer = getInputBuffer(inputPort);
     98    Value * const totalItems = getTotalItemCount(b, inputPort);
     99    Value * const processed = getAlreadyProcessedItemCount(b, inputPort);
     100    #ifdef PRINT_DEBUG_MESSAGES
     101    const auto prefix = makeBufferName(mKernelIndex, input);
     102    b->CallPrintInt(prefix + "_capacity", buffer->getCapacity(b.get()));
     103    b->CallPrintInt(prefix + "_totalItems", totalItems);
     104    b->CallPrintInt(prefix + "_processed", processed);
     105    #endif
     106    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     107        Value * const sanityCheck = b->CreateICmpULE(processed, totalItems);
     108        b->CreateAssert(sanityCheck,
     109                        mKernel->getName() + "_" + input.getName() +
     110                        ": processed count exceeds total count");
     111    }
     112    const auto overflow = getFacsimile(getInputBufferVertex(inputPort));
     113    Value * const accessible = buffer->getLinearlyAccessibleItems(b, processed, totalItems, overflow);
     114    #ifdef PRINT_DEBUG_MESSAGES
     115    b->CallPrintInt(prefix + "_accessible", accessible);
     116    #endif
     117    return accessible;
     118}
     119
     120/** ------------------------------------------------------------------------------------------------------------- *
     121 * @brief checkForSufficientOutputSpaceOrExpand
     122 ** ------------------------------------------------------------------------------------------------------------- */
     123inline void PipelineCompiler::checkForSufficientOutputSpaceOrExpand(BuilderRef b, const unsigned outputPort) {
     124    // If the buffer is managed by the kernel, ignore it
     125    if (mLinearOutputItemsPhi[outputPort]) {
     126        const StreamSetBuffer * const buffer = getOutputBuffer(outputPort);
     127        Value * const writable = getWritableOutputItems(b, outputPort);
     128        Value * const strideLength = getOutputStrideLength(b, outputPort);
     129        const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
     130        const auto prefix = makeBufferName(mKernelIndex, output);
     131        #ifdef PRINT_DEBUG_MESSAGES
     132        b->CallPrintInt(prefix + "_requiredOutput", strideLength);
     133        #endif
     134        Value * const hasEnough = b->CreateICmpULE(strideLength, writable, prefix + "_hasEnough");
     135        Value * const check = b->CreateAnd(hasEnough, willNotOverwriteOverflow(b, outputPort));
     136        BasicBlock * const target = b->CreateBasicBlock(prefix + "_hasOutputSpace", mKernelLoopCall);
     137        mWritableOutputItems[outputPort] = writable;
     138        if (LLVM_UNLIKELY(isa<DynamicBuffer>(buffer))) {
     139            expandOutputBuffer(b, outputPort, check, target);
     140        } else {
     141            b->CreateLikelyCondBr(check, target, mKernelLoopExit);
     142            BasicBlock * const exitBlock = b->GetInsertBlock();
     143            mTerminatedPhi->addIncoming(b->getFalse(), exitBlock);
     144            if (mHasProgressedPhi) {
     145                mHasProgressedPhi->addIncoming(mAlreadyProgressedPhi, exitBlock);
     146            }
     147            b->SetInsertPoint(target);
     148        }
     149    }
     150}
     151
     152/** ------------------------------------------------------------------------------------------------------------- *
     153 * @brief willNotOverwriteOverflow
     154 ** ------------------------------------------------------------------------------------------------------------- */
     155inline Value * PipelineCompiler::willNotOverwriteOverflow(BuilderRef b, const unsigned outputPort) {
     156    if (LLVM_UNLIKELY(requiresCopyBack(getOutputBufferVertex(outputPort)))) { // check whether the potential overflow copy will overwrite the buffer
     157        Value * const produced = getAlreadyProducedItemCount(b, outputPort);
     158        Value * const consumed = getConsumedItemCount(b, outputPort);
     159        Value * const unconsumed = b->CreateSub(produced, consumed);
     160        Value * const strideLength = getOutputStrideLength(b, outputPort);
     161        Value * const required = b->CreateAdd(unconsumed, strideLength);
     162        const StreamSetBuffer * const buffer = getOutputBuffer(outputPort);
     163        Value * const capacity = buffer->getCapacity(b.get());
     164        const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
     165        const auto prefix = makeBufferName(mKernelIndex, output);
     166        return b->CreateICmpULT(required, capacity, prefix + "_noOverflowOverwrite");
    84167    } else {
    85         const Binding & input = mKernel->getInputStreamSetBinding(index);
    86         const StreamSetBuffer * const buffer = getInputBuffer(index);
    87         Value * const totalItems = getTotalItemCount(b, buffer);
    88         Value * const processed = b->getNonDeferredProcessedItemCount(input);
    89         Value * const accessible = buffer->getLinearlyAccessibleItems(b, processed, totalItems, getFacsimile(buffer));
    90         #ifdef PRINT_DEBUG_MESSAGES
    91         const auto prefix = mKernel->getName() + "." + input.getName();
    92         b->CallPrintInt(prefix + "_totalItems", totalItems);
    93         b->CallPrintInt(prefix + "_processed", processed);
    94         b->CallPrintInt(prefix + "_accessible", accessible);
    95         #endif
    96         storePopCountSourceItemCount(b, Port::Input, index, processed, accessible);
    97         mAccessibleInputItems[index] = accessible;
    98         return accessible;
    99     }
    100 }
    101 
    102 /** ------------------------------------------------------------------------------------------------------------- *
    103  * @brief checkForSufficientOutputSpaceOrExpand
    104  ** ------------------------------------------------------------------------------------------------------------- */
    105 inline void PipelineCompiler::checkForSufficientOutputSpaceOrExpand(BuilderRef b, const unsigned index) {
    106     // If the buffer is managed by the kernel, ignore it
    107     const StreamSetBuffer * const buffer = getOutputBuffer(index);
    108     if (LLVM_UNLIKELY(isa<ExternalBuffer>(buffer))) {
    109         return;
    110     }   
    111     const Binding & output = mKernel->getOutputStreamSetBinding(index);
    112     Value * const strideLength = getOutputStrideLength(b, index);
    113     Value * const writable = getWritableOutputItems(b, index);
    114     const auto prefix = mKernel->getName() + "_" + output.getName();
    115     #ifdef PRINT_DEBUG_MESSAGES
    116     b->CallPrintInt(prefix + "_requiredSpace", strideLength);
    117     #endif
    118     Value * hasEnough = b->CreateICmpULE(strideLength, writable, prefix + "_hasEnough");
    119     if (requiresCopyBack(buffer)) { // check whether the potential overflow copy will overwrite the buffer
    120         Value * const produced = b->getNonDeferredProducedItemCount(output);
    121         Value * const consumed = getConsumedItemCount(b, index);
    122         Value * const unconsumed = b->CreateSub(produced, consumed);
    123         Value * const capacity = b->CreateSub(buffer->getCapacity(b.get()), strideLength);
    124         Value * const noOverwrites = b->CreateICmpULT(unconsumed, capacity, prefix + "_noOverflowOverwrite");
    125         hasEnough = b->CreateAnd(hasEnough, noOverwrites);
    126     }
    127 
    128     BasicBlock * const target = b->CreateBasicBlock(prefix + "_hasOutputSpace", mKernelLoopCall);
    129     if (LLVM_UNLIKELY(isa<DynamicBuffer>(buffer))) {
    130         expandOutputBuffer(b, hasEnough, index, target);
    131     } else {
    132         branchToTargetOrLoopExit(b, hasEnough, target);
     168        return b->getTrue();
    133169    }
    134170}
     
    148184
    149185/** ------------------------------------------------------------------------------------------------------------- *
    150  * @brief expandOutputBuffer
    151  ** ------------------------------------------------------------------------------------------------------------- */
    152 inline void PipelineCompiler::expandOutputBuffer(BuilderRef b, Value * const hasEnough, const unsigned index, BasicBlock * const target) {
    153 
    154     Value * const writable = mWritableOutputItems[index]; assert (writable);
    155     const Binding & output = mKernel->getOutputStreamSetBinding(index);
    156     const auto prefix = mKernel->getName() + "_" + output.getName();
    157     BasicBlock * const expand = b->CreateBasicBlock(prefix + "_expandOutputBuffer", target);
    158     b->CreateLikelyCondBr(hasEnough, target, expand);
    159 
    160     BasicBlock * const entryBlock = b->GetInsertBlock();
    161     b->SetInsertPoint(target);
    162     PHINode * const writablePhi = b->CreatePHI(writable->getType(), 2);
    163     writablePhi->addIncoming(writable, entryBlock);
    164 
    165     b->SetInsertPoint(expand);
    166     const StreamSetBuffer * const buffer = cast<DynamicBuffer>(getOutputBuffer(index));
    167     buffer->setCapacity(b.get(), calculateBufferExpansionSize(b, index));
    168     // reset the # of writable items to reflect the expanded buffer
    169     mWritableOutputItems[index] = nullptr;
    170     Value * const newWritableItems = getWritableOutputItems(b, index);
     186 * @brief getAlreadyProducedItemCount
     187 ** ------------------------------------------------------------------------------------------------------------- */
     188Value * PipelineCompiler::getAlreadyProducedItemCount(BuilderRef b, const unsigned outputPort) {
     189    if (mAlreadyProducedItemCount[outputPort]) {
     190        return mAlreadyProducedItemCount[outputPort];
     191    }
     192    const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
     193    Value * const produced = b->getNonDeferredProducedItemCount(output);
     194    mAlreadyProducedItemCount[outputPort] = produced;
     195    return produced;
     196}
     197
     198/** ------------------------------------------------------------------------------------------------------------- *
     199 * @brief getWritableOutputItems
     200 ** ------------------------------------------------------------------------------------------------------------- */
     201inline Value * PipelineCompiler::getWritableOutputItems(BuilderRef b, const unsigned outputPort) {
     202    assert (outputPort < mWritableOutputItems.size());
     203    const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
     204    const StreamSetBuffer * const buffer = getOutputBuffer(outputPort);
     205    Value * const produced = getAlreadyProducedItemCount(b, outputPort);
     206    Value * const consumed = getConsumedItemCount(b, outputPort);
    171207    #ifdef PRINT_DEBUG_MESSAGES
    172     b->CallPrintInt(prefix + "_writable'", newWritableItems);
     208    const auto prefix = makeBufferName(mKernelIndex, output);
     209    b->CallPrintInt(prefix + "_capacity", buffer->getCapacity(b.get()));
     210    b->CallPrintInt(prefix + "_produced", produced);
     211    b->CallPrintInt(prefix + "_consumed", consumed);
    173212    #endif
    174     BasicBlock * const expansionExitBlock = b->GetInsertBlock();
    175     writablePhi->addIncoming(newWritableItems, expansionExitBlock);
    176     b->CreateBr(target);
    177 
    178     b->SetInsertPoint(target);
    179     // update the cached entry to be the phi node
    180     mWritableOutputItems[index] = writablePhi;
    181 }
    182 
    183 /** ------------------------------------------------------------------------------------------------------------- *
    184  * @brief getWritableOutputItems
    185  ** ------------------------------------------------------------------------------------------------------------- */
    186 inline Value * PipelineCompiler::getWritableOutputItems(BuilderRef b, const unsigned index) {
    187     assert (index < mWritableOutputItems.size());
    188     if (mWritableOutputItems[index]) {
    189         return mWritableOutputItems[index];
    190     } else {
    191         const Binding & output = mKernel->getOutputStreamSetBinding(index);
    192         const StreamSetBuffer * const buffer = getOutputBuffer(index);
    193         Value * const produced = b->getNonDeferredProducedItemCount(output);
    194         Value * const consumed = getConsumedItemCount(b, index);
    195         if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    196             Value * const sanityCheck = b->CreateICmpULE(consumed, produced);
    197             b->CreateAssert(sanityCheck,
    198                             mKernel->getName() + "." + output.getName() +
    199                             ": consumed count exceeds produced count");
    200         }
    201         Value * const writable = buffer->getLinearlyWritableItems(b, produced, consumed, getCopyBack(buffer));
    202         #ifdef PRINT_DEBUG_MESSAGES
    203         const auto prefix = mKernel->getName() + "." + output.getName();
    204         b->CallPrintInt(prefix + "_produced", produced);
    205         b->CallPrintInt(prefix + "_consumed", consumed);
    206         b->CallPrintInt(prefix + "_writable", writable);
    207         #endif
    208         storePopCountSourceItemCount(b, Port::Output, index, produced, writable);
    209         mWritableOutputItems[index] = writable;
    210         return writable;
    211     }
    212 }
    213 
    214 /** ------------------------------------------------------------------------------------------------------------- *
    215  * @brief calculateBufferExpansionSize
    216  ** ------------------------------------------------------------------------------------------------------------- */
    217 inline Value * PipelineCompiler::calculateBufferExpansionSize(BuilderRef b, const unsigned index) {
    218     const Binding & output = mKernel->getOutputStreamSetBinding(index);
    219     Value * const produced = b->getNonDeferredProducedItemCount(output);
    220     Value * const consumed = getConsumedItemCount(b, index);
    221     Value * const unconsumed = b->CreateSub(produced, consumed);
    222     Value * const strideLength = getOutputStrideLength(b, index);
    223     return b->CreateAdd(unconsumed, strideLength);
     213    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     214        Value * const sanityCheck = b->CreateICmpULE(consumed, produced);
     215        b->CreateAssert(sanityCheck,
     216                        mKernel->getName() + "_" + output.getName() +
     217                        ": consumed count exceeds produced count");
     218    }
     219    const auto overflow = getCopyBack(getOutputBufferVertex(outputPort));
     220    Value * const writable = buffer->getLinearlyWritableItems(b, produced, consumed, overflow);
     221    #ifdef PRINT_DEBUG_MESSAGES
     222    b->CallPrintInt(prefix + "_writable", writable);
     223    #endif
     224    return writable;
    224225}
    225226
     
    227228 * @brief determineNumOfLinearStrides
    228229 ** ------------------------------------------------------------------------------------------------------------- */
    229 inline Value * PipelineCompiler::determineNumOfLinearStrides(BuilderRef b) {
     230inline void PipelineCompiler::determineNumOfLinearStrides(BuilderRef b) {
    230231    const auto numOfInputs = mKernel->getNumOfStreamInputs();
    231     const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    232232    mNumOfLinearStrides = nullptr;
    233     for (unsigned i = 0; i < numOfInputs; ++i) {
    234         mNumOfLinearStrides = b->CreateUMin(mNumOfLinearStrides, getNumOfAccessibleStrides(b, i));
    235     }
    236     for (unsigned i = 0; i < numOfOutputs; ++i) {
    237         mNumOfLinearStrides = b->CreateUMin(mNumOfLinearStrides, getNumOfWritableStrides(b, i));
    238     }
    239     assert (mNumOfLinearStrides);
    240     return mNumOfLinearStrides;
     233    for (const auto i : mPortOrdering) {
     234        Value * strides = nullptr;
     235        if (i < numOfInputs) {
     236            strides = getNumOfAccessibleStrides(b, i);
     237        } else {
     238            strides = getNumOfWritableStrides(b, i - numOfInputs);
     239        }
     240        mNumOfLinearStrides = b->CreateUMin(mNumOfLinearStrides, strides);
     241    }
    241242}
    242243
     
    244245 * @brief getNumOfAccessibleStrides
    245246 ** ------------------------------------------------------------------------------------------------------------- */
    246 inline Value * PipelineCompiler::getNumOfAccessibleStrides(BuilderRef b, const unsigned index) {
    247     const Binding & input = mKernel->getInputStreamSetBinding(index);
     247inline Value * PipelineCompiler::getNumOfAccessibleStrides(BuilderRef b, const unsigned inputPort) {
     248    const Binding & input = mKernel->getInputStreamSetBinding(inputPort);
    248249    const ProcessingRate & rate = input.getRate();
    249250    Value * numOfStrides = nullptr;
     251    Value * const accessible = mAccessibleInputItems[inputPort];
    250252    if (LLVM_UNLIKELY(rate.isPopCount() || rate.isNegatedPopCount())) {
    251         numOfStrides = getMaximumNumOfPopCountStrides(b, rate);
     253        numOfStrides = getMaximumNumOfPopCountStrides(b, input, accessible, getLookahead(b, inputPort));
    252254    } else {
    253         Value * const accessible = subtractLookahead(b, index, getAccessibleInputItems(b, index));
    254         Value * const strideLength = getInputStrideLength(b, index);
    255         numOfStrides = b->CreateUDiv(accessible, strideLength);
     255        Value * const strideLength = getInputStrideLength(b, inputPort);
     256        numOfStrides = b->CreateUDiv(subtractLookahead(b, inputPort, accessible), strideLength);
    256257    }
    257258    #ifdef PRINT_DEBUG_MESSAGES
    258     const auto prefix = mKernel->getName() + "." + input.getName();
    259     b->CallPrintInt(prefix + "_numOfStrides", numOfStrides);
     259    const auto prefix = makeBufferName(mKernelIndex, input);
     260    b->CallPrintInt("> " + prefix + "_numOfStrides", numOfStrides);
    260261    #endif
     262    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     263        Value * const term = hasProducerTerminated(b, inputPort);
     264        Value * const work = b->CreateIsNotNull(numOfStrides);
     265        Value * const progress = b->CreateOr(work, term);
     266        b->CreateAssert(progress,
     267                        mKernel->getName() + "." + input.getName() +
     268                        ": unexpected end of input stream");
     269    }
    261270    return numOfStrides;
    262271}
     
    265274 * @brief getNumOfWritableStrides
    266275 ** ------------------------------------------------------------------------------------------------------------- */
    267 inline Value * PipelineCompiler::getNumOfWritableStrides(BuilderRef b, const unsigned index) {
    268     const StreamSetBuffer * const buffer = getOutputBuffer(index);
    269     if (LLVM_UNLIKELY(isa<ExternalBuffer>(buffer))) {
    270         return nullptr;
    271     } else {
    272         const Binding & output = mKernel->getOutputStreamSetBinding(index);
     276inline Value * PipelineCompiler::getNumOfWritableStrides(BuilderRef b, const unsigned outputPort) {
     277    Value * numOfStrides = nullptr;
     278    if (mLinearOutputItemsPhi[outputPort]) {
     279        const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
    273280        const ProcessingRate & rate = output.getRate();
    274         Value * numOfStrides = nullptr;
     281        Value * const writable = mWritableOutputItems[outputPort];
    275282        if (LLVM_UNLIKELY(rate.isPopCount() || rate.isNegatedPopCount())) {
    276             numOfStrides = getMaximumNumOfPopCountStrides(b, rate);
     283            numOfStrides = getMaximumNumOfPopCountStrides(b, output, writable);
    277284        } else {
    278             Value * const writable = getWritableOutputItems(b, index);
    279             Value * const strideLength = getOutputStrideLength(b, index);
     285            Value * const strideLength = getOutputStrideLength(b, outputPort);
    280286            numOfStrides = b->CreateUDiv(writable, strideLength);
    281287        }
    282288        #ifdef PRINT_DEBUG_MESSAGES
    283         const auto prefix = mKernel->getName() + "." + output.getName();
    284         b->CallPrintInt(prefix + "_numOfStrides", numOfStrides);
     289        const auto prefix = makeBufferName(mKernelIndex, output);
     290        b->CallPrintInt("< " + prefix + "_numOfStrides", numOfStrides);
    285291        #endif
    286         return numOfStrides;
    287     }
    288 }
    289 
    290 /** ------------------------------------------------------------------------------------------------------------- *
    291  * @brief provideAllInputAndOutputSpace
    292  ** ------------------------------------------------------------------------------------------------------------- */
    293 inline void PipelineCompiler::provideAllInputAndOutputSpace(BuilderRef b) {
     292    }
     293    return numOfStrides;
     294}
     295
     296/** ------------------------------------------------------------------------------------------------------------- *
     297 * @brief calculateNonFinalItemCounts
     298 ** ------------------------------------------------------------------------------------------------------------- */
     299inline void PipelineCompiler::calculateNonFinalItemCounts(BuilderRef b) {
     300    assert (mNumOfLinearStrides);
    294301    const auto numOfInputs = mKernel->getNumOfStreamInputs();
    295     Value * releasedItems[numOfInputs];
     302    Value * linearInputItems[numOfInputs];
    296303    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    297     Value * pendingItems[numOfOutputs];
     304    Value * linearOutputItems[numOfOutputs];
    298305    for (unsigned i = 0; i < numOfInputs; ++i) {
    299         releasedItems[i] = getAccessibleInputItems(b, i);
     306        linearInputItems[i] = calculateNumOfLinearItems(b, Port::Input, i);
    300307    }
    301308    for (unsigned i = 0; i < numOfOutputs; ++i) {
    302         if (mWritableOutputItemsPhi[i]) {
    303             pendingItems[i] = getWritableOutputItems(b, i);
    304         }
     309        linearOutputItems[i] = calculateNumOfLinearItems(b, Port::Output, i);
    305310    }
    306311    BasicBlock * const exitBlock = b->GetInsertBlock();
    307312    for (unsigned i = 0; i < numOfInputs; ++i) {
    308         mAccessibleInputItemsPhi[i]->addIncoming(releasedItems[i], exitBlock);
     313        mLinearInputItemsPhi[i]->addIncoming(linearInputItems[i], exitBlock);
    309314    }
    310315    for (unsigned i = 0; i < numOfOutputs; ++i) {
    311         if (mWritableOutputItemsPhi[i]) {
    312             mWritableOutputItemsPhi[i]->addIncoming(pendingItems[i], exitBlock);
    313         }
    314     }
    315 }
    316 
    317 /** ------------------------------------------------------------------------------------------------------------- *
    318  * @brief calculateNonFinalItemCounts
    319  ** ------------------------------------------------------------------------------------------------------------- */
    320 inline void PipelineCompiler::calculateNonFinalItemCounts(BuilderRef b, Value * const numOfStrides) {
    321     assert (numOfStrides);
    322     const auto numOfInputs = mKernel->getNumOfStreamInputs();
    323     Value * releasedItems[numOfInputs];
    324     const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
    325     Value * pendingItems[numOfOutputs];
    326     for (unsigned i = 0; i < numOfInputs; ++i) {
    327         const Binding & input = mKernel->getInputStreamSetBinding(i);
    328         releasedItems[i] = calculateNumOfLinearItems(b, input, numOfStrides);
    329     }
    330     for (unsigned i = 0; i < numOfOutputs; ++i) {
    331         if (mWritableOutputItemsPhi[i]) {
    332             const Binding & output = mKernel->getOutputStreamSetBinding(i);
    333             pendingItems[i] = calculateNumOfLinearItems(b, output, numOfStrides);
    334         }
    335     }
    336     BasicBlock * const exitBlock = b->GetInsertBlock();
    337     for (unsigned i = 0; i < numOfInputs; ++i) {
    338         mAccessibleInputItemsPhi[i]->addIncoming(releasedItems[i], exitBlock);
    339     }
    340     for (unsigned i = 0; i < numOfOutputs; ++i) {
    341         if (mWritableOutputItemsPhi[i]) {
    342             mWritableOutputItemsPhi[i]->addIncoming(pendingItems[i], exitBlock);
     316        if (mLinearOutputItemsPhi[i]) {
     317            mLinearOutputItemsPhi[i]->addIncoming(linearOutputItems[i], exitBlock);
    343318        }
    344319    }
     
    355330
    356331    for (unsigned i = 0; i < numOfInputs; ++i) {
    357         accessibleItems[i] = addLookahead(b, i, getAccessibleInputItems(b, i));
     332        accessibleItems[i] = addLookahead(b, i, mAccessibleInputItems[i]);
    358333    }
    359334
     
    407382
    408383    for (unsigned i = 0; i < numOfOutputs; ++i) {
    409         if (mWritableOutputItemsPhi[i]) {
     384        if (mLinearOutputItemsPhi[i]) {
    410385            const Binding & output = mKernel->getOutputStreamSetBinding(i);
    411386            const ProcessingRate & rate = output.getRate();
     
    414389                writable = b->CreateCeilUDiv2(minScaledInverseOfAccessibleInput, rateLCM / rate.getRate());
    415390            } else if (rate.isPopCount() || rate.isNegatedPopCount()) {
    416                 writable = getInitialNumOfLinearPopCountItems(b, rate);
     391                writable = getMinimumNumOfLinearPopCountItems(b, output);
    417392            } else {
    418                 writable = getWritableOutputItems(b, i);
     393                writable = mWritableOutputItems[i];
    419394            }
    420395            // update the final item counts with any Add/RoundUp attributes
     
    434409    BasicBlock * const exitBlock = b->GetInsertBlock();
    435410    for (unsigned i = 0; i < numOfInputs; ++i) {
    436         mAccessibleInputItemsPhi[i]->addIncoming(accessibleItems[i], exitBlock);
     411        mLinearInputItemsPhi[i]->addIncoming(accessibleItems[i], exitBlock);
    437412    }
    438413    for (unsigned i = 0; i < numOfOutputs; ++i) {
    439         if (mWritableOutputItemsPhi[i]) {
    440             mWritableOutputItemsPhi[i]->addIncoming(pendingItems[i], exitBlock);
    441         }
    442     }
    443 }
    444 
    445 /** ------------------------------------------------------------------------------------------------------------- *
    446  * @brief writeKernelCall
    447  ** ------------------------------------------------------------------------------------------------------------- */
    448 inline void PipelineCompiler::itemCountSanityCheck(BuilderRef b,
    449                                                    const Binding & binding, const std::string & presentLabel, const std::string & pastLabel,
    450                                                    Value * const itemCount, Value * const expected, Value * const terminated) {
    451 
    452     const auto prefix = mKernel->getName() + "." + binding.getName();
    453     if (mKernel->isCountable(binding)) {
    454         Value * const exact = b->CreateICmpEQ(itemCount, expected);
    455         Value * const tooFew = b->CreateICmpULT(itemCount, expected);
    456         Value * const valid = b->CreateOr(exact, b->CreateAnd(tooFew, terminated));
    457         b->CreateAssert(valid, prefix + " did not " + presentLabel + " the expected number of items");
    458     } else {
    459         const auto lb = mKernel->getLowerBound(binding);
    460         if (lb > 0) {
    461             Value * const hasEnough = b->CreateICmpULE(itemCount, b->getSize(ceiling(lb * mKernel->getStride())));
    462             b->CreateAssert(b->CreateOr(hasEnough, terminated), prefix + " " + pastLabel + " fewer items than expected");
    463         }
    464         Value * const withinBounds = b->CreateICmpULE(itemCount, expected);
    465         b->CreateAssert(withinBounds, prefix + " " + pastLabel + " more items than expected");
    466     }
    467 
     414        if (mLinearOutputItemsPhi[i]) {
     415            mLinearOutputItemsPhi[i]->addIncoming(pendingItems[i], exitBlock);
     416        }
     417    }
     418}
     419
     420
     421/** ------------------------------------------------------------------------------------------------------------- *
     422 * @brief calculateBufferExpansionSize
     423 ** ------------------------------------------------------------------------------------------------------------- */
     424inline Value * PipelineCompiler::calculateBufferExpansionSize(BuilderRef b, const unsigned outputPort) {
     425    Value * const produced = getAlreadyProducedItemCount(b, outputPort);
     426    Value * const consumed = getConsumedItemCount(b, outputPort);
     427    Value * const unconsumed = b->CreateSub(produced, consumed);
     428    Value * const strideLength = getOutputStrideLength(b, outputPort);
     429    return b->CreateAdd(unconsumed, strideLength);
     430}
     431
     432/** ------------------------------------------------------------------------------------------------------------- *
     433 * @brief expandOutputBuffers
     434 ** ------------------------------------------------------------------------------------------------------------- */
     435void PipelineCompiler::expandOutputBuffers(BuilderRef b) {
     436
     437}
     438
     439/** ------------------------------------------------------------------------------------------------------------- *
     440 * @brief expandOutputBuffer
     441 ** ------------------------------------------------------------------------------------------------------------- */
     442inline void PipelineCompiler::expandOutputBuffer(BuilderRef b, const unsigned outputPort, Value * const hasEnough, BasicBlock * const target) {
     443    const Binding & output = mKernel->getOutputStreamSetBinding(outputPort);
     444    const auto prefix = makeBufferName(mKernelIndex, output);
     445    BasicBlock * const expand = b->CreateBasicBlock(prefix + "_expandOutputBuffer", target);
     446    BasicBlock * const entryBlock = b->GetInsertBlock();
     447    Value * const currentWritableItems = mWritableOutputItems[outputPort];
     448    b->CreateLikelyCondBr(hasEnough, target, expand);
     449
     450    b->SetInsertPoint(expand);
     451    Value * const size = calculateBufferExpansionSize(b, outputPort);
     452    #ifdef PRINT_DEBUG_MESSAGES
     453    b->CallPrintInt(prefix + "_expandingToSize", size);
     454    #endif
     455    const StreamSetBuffer * const buffer = cast<DynamicBuffer>(getOutputBuffer(outputPort));
     456    buffer->setCapacity(b.get(), size);
     457    Value * const newWritableItems = getWritableOutputItems(b, outputPort);
     458    BasicBlock * const expandEnd = b->GetInsertBlock();
     459    b->CreateBr(target);
     460
     461    b->SetInsertPoint(target);
     462    PHINode * const writablePhi = b->CreatePHI(b->getSizeTy(), 2);
     463    writablePhi->addIncoming(currentWritableItems, entryBlock);
     464    writablePhi->addIncoming(newWritableItems, expandEnd);
     465    mWritableOutputItems[outputPort] = writablePhi;
    468466}
    469467
     
    477475#warning TODO: add MProtect to buffers and their handles.
    478476
    479     #ifdef PRINT_DEBUG_MESSAGES
    480     b->CallPrintInt(mKernel->getName() + "_finalNumOfStrides", mNumOfLinearStridesPhi);
    481     #endif
    482 
    483     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts) && numOfInputs > 0)) {
    484         Value * const hasStrides = b->CreateIsNotNull(mNumOfLinearStridesPhi);
    485         Value * const valid = b->CreateOr(hasStrides, mNoMore);
    486         b->CreateAssert(valid, mKernel->getName() + " must process at least one stride");
    487     }
     477#warning TODO: send in the # of output items we want in the external buffers
    488478
    489479    std::vector<Value *> arguments;
    490480    arguments.reserve((numOfInputs + numOfOutputs + 1) * 2);
    491481    arguments.push_back(mKernel->getHandle());
    492     arguments.push_back(mNumOfLinearStridesPhi);
     482    arguments.push_back(mNumOfLinearStrides);
    493483    for (unsigned i = 0; i < numOfInputs; ++i) {
    494484        const Binding & input = mKernel->getInputStreamSetBinding(i);
    495485        arguments.push_back(getLogicalInputBaseAddress(b, i));
    496         arguments.push_back(mAccessibleInputItemsPhi[i]);
     486        arguments.push_back(mLinearInputItemsPhi[i]);
    497487        if (LLVM_UNLIKELY(input.hasAttribute(AttrId::RequiresPopCountArray))) {
    498488            arguments.push_back(getPopCountArray(b, i));
     
    504494
    505495    for (unsigned i = 0; i < numOfOutputs; ++i) {
    506         if (mWritableOutputItemsPhi[i]) {
     496        if (mLinearOutputItemsPhi[i]) {
    507497            arguments.push_back(getLogicalOutputBaseAddress(b, i));
    508             arguments.push_back(mWritableOutputItemsPhi[i]);
    509         }       
    510     }
    511 
    512     std::vector<Value *> previouslyProcessedItemCount(0);
    513     std::vector<Value *> previouslyProducedItemCount(0);
    514     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    515         previouslyProcessedItemCount.resize(numOfInputs);
    516         for (unsigned i = 0; i < numOfInputs; ++i) {
    517             const Binding & input = mKernel->getInputStreamSetBinding(i);
    518             previouslyProcessedItemCount[i] = b->getNonDeferredProcessedItemCount(input);
    519         }
    520         previouslyProducedItemCount.resize(numOfOutputs);
    521         for (unsigned i = 0; i < numOfOutputs; ++i) {
    522             if (mWritableOutputItemsPhi[i]) {
    523                 const Binding & output = mKernel->getOutputStreamSetBinding(i);
    524                 previouslyProducedItemCount[i] = b->getNonDeferredProducedItemCount(output);
    525             }
    526         }
    527     }
    528 
    529     #ifdef PRINT_DEBUG_MESSAGES
    530     b->CallPrintInt(" *** " + mKernel->getName() + " ***", mNumOfLinearStridesPhi);
    531     #endif
     498            arguments.push_back(mLinearOutputItemsPhi[i]);
     499        }
     500    }
    532501
    533502    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableMProtect))) {
     
    535504    }
    536505
     506    #ifdef PRINT_DEBUG_MESSAGES
     507    const auto prefix = makeKernelName(mKernelIndex);
     508    b->CallPrintInt("* " + prefix + "_executing", mNumOfLinearStrides);
     509    #endif
     510
    537511    b->CreateCall(getDoSegmentFunction(b), arguments);
    538512
     
    541515    }
    542516
    543     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    544         Value * const terminated = isTerminated(b);
    545         for (unsigned i = 0; i < numOfInputs; ++i) {
    546             const Binding & input = mKernel->getInputStreamSetBinding(i);
    547             Value * const processed = b->getNonDeferredProcessedItemCount(input);
    548             Value * const expected = b->CreateAdd(previouslyProcessedItemCount[i], mAccessibleInputItemsPhi[i]);
    549             itemCountSanityCheck(b, input, "process", "processed", processed, expected, terminated);
    550         }
    551         for (unsigned i = 0; i < numOfOutputs; ++i) {
    552             if (previouslyProducedItemCount[i] == nullptr) continue;
    553             const Binding & output = mKernel->getOutputStreamSetBinding(i);
    554             Value * const produced = b->getNonDeferredProducedItemCount(output);
    555             Value * const expected = b->CreateAdd(previouslyProducedItemCount[i], mWritableOutputItemsPhi[i]);
    556             itemCountSanityCheck(b, output, "produce", "produced", produced, expected, terminated);
    557         }
     517}
     518
     519/** ------------------------------------------------------------------------------------------------------------- *
     520 * @brief computeFullyProcessedItemCounts
     521 ** ------------------------------------------------------------------------------------------------------------- */
     522void PipelineCompiler::computeFullyProcessedItemCounts(BuilderRef b) {
     523    const auto numOfInputs = mKernel->getNumOfStreamInputs();
     524    mFullyProcessedItemCount.resize(numOfInputs);
     525    for (unsigned i = 0; i < numOfInputs; ++i) {
     526        const Binding & input = mKernel->getInputStreamSetBinding(i);
     527        Value * processed = b->getProcessedItemCount(input.getName());
     528        processed = truncateBlockSize(b, input, processed, mTerminatedPhi);
     529        mFullyProcessedItemCount[i] = processed;
     530        #ifdef PRINT_DEBUG_MESSAGES
     531        const auto prefix = makeBufferName(mKernelIndex, input);
     532        b->CallPrintInt(prefix + "_processed'", processed);
     533        #endif
    558534    }
    559535}
     
    593569//        Value * const mask = b->bitblock_mask_to(maskOffset, true);
    594570//        BasicBlock * const entry = b->GetInsertBlock();
    595 //        const auto prefix = mKernel->getName() + "_" + output.getName();
     571//        const auto prefix = makeBufferName(mKernelIndex, output);
    596572//        BasicBlock * const loop = b->CreateBasicBlock(prefix + "_clearLoop", mKernelExit);
    597573//        BasicBlock * const exit = b->CreateBasicBlock(prefix + "_clearExit", mKernelExit);
     
    620596
    621597/** ------------------------------------------------------------------------------------------------------------- *
     598 * @brief incrementItemCountsOfCountableRateStreams
     599 ** ------------------------------------------------------------------------------------------------------------- */
     600inline void PipelineCompiler::incrementItemCountsOfCountableRateStreams(BuilderRef b) {
     601    const auto numOfInputs = mKernel->getNumOfStreamInputs();
     602    for (unsigned i = 0; i < numOfInputs; i++) {
     603        const Binding & input = mKernel->getInputStreamSetBinding(i);
     604        const ProcessingRate & rate = input.getRate();
     605        if (rate.isFixed() || rate.isPopCount() || rate.isNegatedPopCount()) {
     606            Value * const processed = getAlreadyProcessedItemCount(b, i);
     607            Value * const items = b->CreateAdd(processed, mLinearInputItemsPhi[i]);
     608            b->setNonDeferredProcessedItemCount(input, items);
     609        }
     610    }
     611    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     612    for (unsigned i = 0; i < numOfOutputs; i++) {
     613        if (mLinearOutputItemsPhi[i]) {
     614            const Binding & output = mKernel->getOutputStreamSetBinding(i);
     615            const ProcessingRate & rate = output.getRate();
     616            if (rate.isFixed() || rate.isPopCount() || rate.isNegatedPopCount()) {
     617                Value * const produced = getAlreadyProducedItemCount(b, i);
     618                Value * const items = b->CreateAdd(produced, mLinearOutputItemsPhi[i]);
     619                b->setNonDeferredProducedItemCount(output, items);
     620            }
     621        }
     622    }
     623
     624    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     625        for (unsigned i = 0; i < numOfInputs; ++i) {
     626            const Binding & input = mKernel->getInputStreamSetBinding(i);
     627            const ProcessingRate & rate = input.getRate();
     628            if (rate.isBounded() || rate.isUnknown()) {
     629                Value * const processed = getAlreadyProcessedItemCount(b, i);
     630                Value * const expected = b->CreateAdd(processed, mLinearInputItemsPhi[i]);
     631                itemCountSanityCheck(b, input, "processed", processed, expected);
     632            }
     633        }
     634        for (unsigned i = 0; i < numOfOutputs; ++i) {
     635            if (mLinearOutputItemsPhi[i]) {
     636                const Binding & output = mKernel->getOutputStreamSetBinding(i);
     637                const ProcessingRate & rate = output.getRate();
     638                if (rate.isBounded() || rate.isUnknown()) {
     639                    Value * const produced = getAlreadyProducedItemCount(b, i);
     640                    Value * const expected = b->CreateAdd(produced, mLinearOutputItemsPhi[i]);
     641                    itemCountSanityCheck(b, output, "produced", produced, expected);
     642                }
     643            }
     644        }
     645    }
     646
     647}
     648
     649/** ------------------------------------------------------------------------------------------------------------- *
     650 * @brief itemCountSanityCheck
     651 ** ------------------------------------------------------------------------------------------------------------- */
     652void PipelineCompiler::itemCountSanityCheck(BuilderRef b, const Binding & binding,
     653                                            const std::string & label,
     654                                            Value * const itemCount, Value * const expected) {
     655
     656    const auto prefix = makeBufferName(mKernelIndex, binding);
     657    const auto lb = mKernel->getLowerBound(binding);
     658    if (lb > 0 && !binding.hasAttribute(AttrId::Deferred)) {
     659        Constant * const strideSize = b->getSize(ceiling(lb * mKernel->getStride()));
     660        Value * hasEnough = b->CreateICmpULE(itemCount, strideSize);
     661        hasEnough = b->CreateOr(hasEnough, terminatedExplicitly(b));
     662        b->CreateAssert(hasEnough, prefix + " " + label + " fewer items than expected");
     663    }
     664    Value * const withinBounds = b->CreateICmpULE(itemCount, expected);
     665    b->CreateAssert(withinBounds, prefix + " " + label + " more items than expected");
     666
     667}
     668
     669
     670/** ------------------------------------------------------------------------------------------------------------- *
    622671 * @brief getInputStrideLength
    623672 ** ------------------------------------------------------------------------------------------------------------- */
    624 inline Value * PipelineCompiler::getInputStrideLength(BuilderRef b, const unsigned index) {
    625     assert (index < mInputStrideLength.size());
    626     if (mInputStrideLength[index]) {
    627         return mInputStrideLength[index];
     673Value * PipelineCompiler::getInputStrideLength(BuilderRef b, const unsigned inputPort) {
     674    assert (inputPort < mInputStrideLength.size());
     675    if (mInputStrideLength[inputPort]) {
     676        return mInputStrideLength[inputPort];
    628677    } else {
    629         const Binding & input = mKernel->getInputStreamSetBinding(index);
    630         Value * const strideLength = getInitialStrideLength(b, input);
    631         mInputStrideLength[index] = strideLength;
     678        Value * const strideLength = getInitialStrideLength(b, Port::Input, inputPort);
     679        mInputStrideLength[inputPort] = strideLength;
    632680        return strideLength;
    633681    }
     
    637685 * @brief getOutputStrideLength
    638686 ** ------------------------------------------------------------------------------------------------------------- */
    639 inline Value * PipelineCompiler::getOutputStrideLength(BuilderRef b, const unsigned index) {
    640     assert (index < mOutputStrideLength.size());
    641     if (mOutputStrideLength[index]) {
    642         return mOutputStrideLength[index];
     687Value * PipelineCompiler::getOutputStrideLength(BuilderRef b, const unsigned outputPort) {
     688    assert (outputPort < mOutputStrideLength.size());
     689    if (mOutputStrideLength[outputPort]) {
     690        return mOutputStrideLength[outputPort];
    643691    } else {
    644         const Binding & output = mKernel->getOutputStreamSetBinding(index);
    645         Value * const strideLength = getInitialStrideLength(b, output);
    646         mOutputStrideLength[index] = strideLength;
     692        Value * const strideLength = getInitialStrideLength(b, Port::Output, outputPort);
     693        mOutputStrideLength[outputPort] = strideLength;
    647694        return strideLength;
    648695    }
     
    652699 * @brief getInitialStrideLength
    653700 ** ------------------------------------------------------------------------------------------------------------- */
    654 inline Value * PipelineCompiler::getInitialStrideLength(BuilderRef b, const Binding & binding) {
     701Value * PipelineCompiler::getInitialStrideLength(BuilderRef b, const Port port, const unsigned portNum) {
     702    const Binding & binding = getBinding(mKernel, port, portNum);
    655703    const ProcessingRate & rate = binding.getRate();
    656704    if (LLVM_LIKELY(rate.isFixed() || rate.isBounded())) {
    657705        return b->getSize(ceiling(mKernel->getUpperBound(binding) * mKernel->getStride()));
    658706    } else if (LLVM_UNLIKELY(rate.isPopCount() || rate.isNegatedPopCount())) {
    659         return getInitialNumOfLinearPopCountItems(b, rate);
     707        return getMinimumNumOfLinearPopCountItems(b, binding);
    660708    } else if (rate.isRelative()) {
    661         const Binding & ref = mKernel->getStreamBinding(rate.getReference());
    662         Value * const baseRate = getInitialStrideLength(b, ref);
     709        Port refPort; unsigned refPortNum;
     710        std::tie(refPort, refPortNum) = mKernel->getStreamPort(rate.getReference());
     711        Value * const baseRate = getInitialStrideLength(b, refPort, refPortNum);
    663712        return b->CreateMul2(baseRate, rate.getRate());
    664713    }
     
    667716
    668717/** ------------------------------------------------------------------------------------------------------------- *
     718 * @brief getMaximumStrideLength
     719 ** ------------------------------------------------------------------------------------------------------------- */
     720inline Value * PipelineCompiler::getMaximumStrideLength(BuilderRef b, const Port port, const unsigned portNum) {
     721    const Binding & binding = getBinding(mKernel, port, portNum);
     722    const ProcessingRate & rate = binding.getRate();
     723    if (LLVM_LIKELY(rate.isFixed() || rate.isBounded() || rate.isPopCount() || rate.isNegatedPopCount())) {
     724        return b->getSize(ceiling(mKernel->getUpperBound(binding) * mKernel->getStride()));
     725    } else if (LLVM_LIKELY(rate.isUnknown())) {
     726        return b->getSize(0);
     727    } else if (rate.isRelative()) {
     728        Port refPort; unsigned refPortNum;
     729        std::tie(refPort, refPortNum) = mKernel->getStreamPort(rate.getReference());
     730        Value * const baseRate = getMaximumStrideLength(b, refPort, refPortNum);
     731        return b->CreateMul2(baseRate, rate.getRate());
     732    }
     733    llvm_unreachable("unexpected rate type");
     734}
     735
     736/** ------------------------------------------------------------------------------------------------------------- *
    669737 * @brief calculateNumOfLinearItems
    670738 ** ------------------------------------------------------------------------------------------------------------- */
    671 inline Value * PipelineCompiler::calculateNumOfLinearItems(BuilderRef b, const Binding & binding, Value * const numOfStrides) {
     739inline Value * PipelineCompiler::calculateNumOfLinearItems(BuilderRef b, const Port portType,  const unsigned portNum) {
     740    const Binding & binding = getBinding(mKernel, portType, portNum);
    672741    const ProcessingRate & rate = binding.getRate();
    673742    if (rate.isFixed() || rate.isBounded()) {
    674         return b->CreateMul2(numOfStrides, mKernel->getUpperBound(binding) * mKernel->getStride());
     743        return b->CreateMul2(mNumOfLinearStrides, mKernel->getUpperBound(binding) * mKernel->getStride());
    675744    } else if (rate.isPopCount() || rate.isNegatedPopCount()) {
    676         return getNumOfLinearPopCountItems(b, rate, numOfStrides);
     745        return getNumOfLinearPopCountItems(b, binding);
    677746    } else if (rate.isRelative()) {
    678         Value * const baseCount = calculateNumOfLinearItems(b, mKernel->getStreamBinding(rate.getReference()), numOfStrides);
     747        Port refPort; unsigned refPortNum;
     748        std::tie(refPort, refPortNum) = mKernel->getStreamPort(rate.getReference());
     749        Value * const baseCount = calculateNumOfLinearItems(b, refPort, refPortNum);
    679750        return b->CreateMul2(baseCount, rate.getRate());
    680751    }
     
    683754
    684755/** ------------------------------------------------------------------------------------------------------------- *
    685  * @brief getFullyProcessedItemCount
    686  ** ------------------------------------------------------------------------------------------------------------- */
    687 inline Value * PipelineCompiler::getFullyProcessedItemCount(BuilderRef b, const Binding & input) const {
    688     Value * const processed = b->getProcessedItemCount(input.getName());
    689     if (LLVM_UNLIKELY(input.hasAttribute(AttrId::BlockSize))) {
     756 * @brief getTotalItemCount
     757 ** ------------------------------------------------------------------------------------------------------------- */
     758inline Value * PipelineCompiler::getTotalItemCount(BuilderRef /* b */, const unsigned inputPort) const {
     759    return mBufferGraph[getInputBufferVertex(inputPort)].TotalItems;
     760}
     761
     762/** ------------------------------------------------------------------------------------------------------------- *
     763 * @brief addLookahead
     764 ** ------------------------------------------------------------------------------------------------------------- */
     765inline Value * PipelineCompiler::addLookahead(BuilderRef b, const unsigned inputPort, Value * itemCount) const {
     766    Constant * const lookAhead = getLookahead(b, inputPort);
     767    return (lookAhead) ? b->CreateAdd(itemCount, lookAhead) : itemCount;
     768}
     769
     770/** ------------------------------------------------------------------------------------------------------------- *
     771 * @brief subtractLookahead
     772 ** ------------------------------------------------------------------------------------------------------------- */
     773inline Value * PipelineCompiler::subtractLookahead(BuilderRef b, const unsigned inputPort, Value * itemCount) const {
     774    Constant * const lookAhead = getLookahead(b, inputPort);
     775    return (lookAhead) ? b->CreateSub(itemCount, lookAhead) : itemCount;
     776}
     777
     778/** ------------------------------------------------------------------------------------------------------------- *
     779 * @brief getLookahead
     780 ** ------------------------------------------------------------------------------------------------------------- */
     781Constant * PipelineCompiler::getLookahead(BuilderRef b, const unsigned inputPort) const {
     782    const Binding & input = mKernel->getInputStreamSetBinding(inputPort);
     783    if (LLVM_UNLIKELY(input.hasLookahead())) {
     784        return b->getSize(input.getLookahead());
     785    }
     786    return nullptr;
     787}
     788
     789/** ------------------------------------------------------------------------------------------------------------- *
     790 * @brief maskBlockSize
     791 ** ------------------------------------------------------------------------------------------------------------- */
     792inline Value * PipelineCompiler::truncateBlockSize(BuilderRef b, const Binding & binding, Value * itemCount, Value * all) const {
     793    // TODO: if we determine all of the inputs of a stream have a blocksize attribute, or the output has one,
     794    // we can skip masking it on input
     795    if (LLVM_UNLIKELY(binding.hasAttribute(AttrId::BlockSize))) {
    690796        // If the input rate has a block size attribute then --- for the purpose of determining how many
    691797        // items have been consumed --- we consider a stream set to be fully processed when an entire
    692798        // stride has been processed.
    693799        Constant * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
    694         Value * const partial = b->CreateAnd(processed, ConstantExpr::getNeg(BLOCK_WIDTH));
    695         return b->CreateSelect(mTerminatedPhi, processed, partial);
    696     }
    697     return processed;
    698 }
    699 
    700 /** ------------------------------------------------------------------------------------------------------------- *
    701  * @brief getTotalItemCount
    702  ** ------------------------------------------------------------------------------------------------------------- */
    703 inline Value * PipelineCompiler::getTotalItemCount(BuilderRef b, const StreamSetBuffer * buffer) const {
    704     const auto p = mTotalItemCount.find(buffer);
    705     assert (p != mTotalItemCount.end());
    706     return p->second;
    707 }
    708 
    709 /** ------------------------------------------------------------------------------------------------------------- *
    710  * @brief addLookahead
    711  ** ------------------------------------------------------------------------------------------------------------- */
    712 inline Value * PipelineCompiler::addLookahead(BuilderRef b, const unsigned index, Value * itemCount) const {
    713     const Binding & input = mKernel->getInputStreamSetBinding(index);
    714     if (LLVM_UNLIKELY(input.hasLookahead())) {
    715         Constant * const lookahead = b->getSize(input.getLookahead());
    716         itemCount = b->CreateAdd(itemCount, lookahead);
     800        Value * const maskedItemCount = b->CreateAnd(itemCount, ConstantExpr::getNeg(BLOCK_WIDTH));
     801        itemCount = b->CreateSelect(all, itemCount, maskedItemCount);
    717802    }
    718803    return itemCount;
    719804}
    720805
    721 /** ------------------------------------------------------------------------------------------------------------- *
    722  * @brief subtractLookahead
    723  ** ------------------------------------------------------------------------------------------------------------- */
    724 inline Value * PipelineCompiler::subtractLookahead(BuilderRef b, const unsigned index, Value * itemCount) const {
    725     const Binding & input = mKernel->getInputStreamSetBinding(index);
    726     if (LLVM_UNLIKELY(input.hasLookahead())) {
    727         Constant * const lookahead = b->getSize(input.getLookahead());
    728         itemCount = b->CreateSub(itemCount, lookahead);
    729     }
    730     return itemCount;
    731 }
    732 
    733 /** ------------------------------------------------------------------------------------------------------------- *
    734  * @brief allocateThreadLocalState
    735  ** ------------------------------------------------------------------------------------------------------------- */
    736 inline void PipelineCompiler::allocateThreadLocalState(BuilderRef b, const Port port, const unsigned i) {
    737     const auto & rate = getBinding(port, i).getRate();
    738     if (rate.isPopCount() || rate.isNegatedPopCount()) {
    739         allocateLocalPopCountArray(b, rate);
    740     }
    741 }
    742 
    743 /** ------------------------------------------------------------------------------------------------------------- *
    744  * @brief deallocateThreadLocalState
    745  ** ------------------------------------------------------------------------------------------------------------- */
    746 inline void PipelineCompiler::deallocateThreadLocalState(BuilderRef b, const Port port, const unsigned i) {
    747     const auto & rate = getBinding(port, i).getRate();
    748     if (rate.isPopCount() || rate.isNegatedPopCount()) {
    749         deallocateLocalPopCountArray(b, rate);
    750     }
    751 }
    752806
    753807/** ------------------------------------------------------------------------------------------------------------- *
    754808 * @brief getInitializationFunction
    755809 ** ------------------------------------------------------------------------------------------------------------- */
    756 Value *PipelineCompiler::getFunctionFromKernelState(BuilderRef b, Type * const type, const std::string & suffix) const {
     810Value * PipelineCompiler::getFunctionFromKernelState(BuilderRef b, Type * const type, const std::string & suffix) const {
    757811    const auto kn = makeKernelName(mKernelIndex);
    758812    b->setKernel(mPipelineKernel);
     
    779833 * @brief getDoSegmentFunction
    780834 ** ------------------------------------------------------------------------------------------------------------- */
    781 inline Value *PipelineCompiler::getDoSegmentFunction(BuilderRef b) const {
     835inline Value * PipelineCompiler::getDoSegmentFunction(BuilderRef b) const {
    782836    Function * const doSegment = mKernel->getDoSegmentFunction(b->getModule());
    783837    if (mKernel->hasFamilyName()) {
     
    798852}
    799853
    800 }
     854template <typename Vec>
     855inline void reset(Vec & vec, const unsigned n) {
     856    vec.resize(n);
     857    std::fill_n(vec.begin(), n, nullptr);
     858}
     859
     860/** ------------------------------------------------------------------------------------------------------------- *
     861 * @brief resetMemoizedFields
     862 ** ------------------------------------------------------------------------------------------------------------- */
     863void PipelineCompiler::resetMemoizedFields() {
     864    const auto numOfInputs = mKernel->getNumOfStreamInputs();
     865    reset(mAlreadyProcessedItemCount, numOfInputs);
     866    reset(mInputStrideLength, numOfInputs);
     867    reset(mAccessibleInputItems, numOfInputs);
     868    reset(mLinearInputItemsPhi, numOfInputs);
     869    reset(mFullyProcessedItemCount, numOfInputs);
     870    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     871    reset(mInitiallyProducedItemCount, numOfOutputs);
     872    reset(mAlreadyProducedItemCount, numOfOutputs);
     873    reset(mOutputStrideLength, numOfOutputs);
     874    reset(mWritableOutputItems, numOfOutputs);
     875    reset(mLinearOutputItemsPhi, numOfOutputs);
     876}
     877
     878
     879}
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/pipeline_analysis.hpp

    r6186 r6228  
    11#include "pipeline_compiler.hpp"
     2#include "lexographic_ordering.hpp"
    23#include <boost/graph/topological_sort.hpp>
    34#include <util/extended_boost_graph_containers.h>
     
    67
    78#warning TODO: support call bindings that produce output that are inputs of other call bindings or become scalar outputs of the pipeline
     9
     10#if 0
     11
     12/** ------------------------------------------------------------------------------------------------------------- *
     13 * @brief printBufferGraph
     14 ** ------------------------------------------------------------------------------------------------------------- */
     15template <typename Graph>
     16void printGraph(const Graph & G, raw_ostream & out) {
     17
     18    out << "digraph G {\n";
     19    for (auto v : make_iterator_range(vertices(G))) {
     20        out << "v" << v << " [label=\"" << v << "\"];\n";
     21    }
     22    for (auto e : make_iterator_range(edges(G))) {
     23        const auto s = source(e, G);
     24        const auto t = target(e, G);
     25        out << "v" << s << " -> v" << t << ";\n";
     26    }
     27
     28    out << "}\n\n";
     29    out.flush();
     30}
     31
     32#endif
    833
    934namespace {
     
    4671    }
    4772
    48 }
     73} // end of anonymous namespace
    4974
    5075/** ------------------------------------------------------------------------------------------------------------- *
     
    80105}
    81106
    82 
     107namespace {
     108
     109    template <typename Graph, typename Vertex = typename graph_traits<Graph>::vertex_descriptor>
     110    bool add_edge_if_no_induced_cycle(const Vertex s, const Vertex t, Graph & G) {
     111        // If G is a DAG and there is a t-s path, adding s-t will induce a cycle.
     112        const auto d = in_degree(s, G);
     113        if (d != 0) {
     114            flat_set<Vertex> V;
     115            V.reserve(num_vertices(G) - 1);
     116            std::queue<Vertex> Q;
     117            Q.push(t);
     118            // do a BFS to find one a path to s
     119            for (;;) {
     120                const auto u = Q.front();
     121                Q.pop();
     122                for (auto e : make_iterator_range(out_edges(u, G))) {
     123                    const auto v = target(e, G);
     124                    if (V.contains(v)) continue;
     125                    if (LLVM_UNLIKELY(v == s)) return false;
     126                    Q.push(v);
     127                    V.insert(v);
     128                }
     129                if (Q.empty()) {
     130                    break;
     131                }
     132            }
     133        }
     134        add_edge(s, t, G);
     135        return true;
     136    }
     137}
    83138
    84139/** ------------------------------------------------------------------------------------------------------------- *
    85140 * @brief makePortDependencyGraph
    86  ** ------------------------------------------------------------------------------------------------------------- */
    87 PortDependencyGraph PipelineCompiler::makePortDependencyGraph() const {
    88 
    89     const auto n = mKernel->getNumOfStreamInputs();
    90     const auto m = mKernel->getNumOfStreamOutputs();
    91     const auto l = n + m;
    92 
    93     PortDependencyGraph G(l);
     141 *
     142 * Returns a lexographically sorted list of ports s.t. the inputs will be ordered as close as possible (baring
     143 * any constraints) to the kernel's original I/O ordering.
     144 ** ------------------------------------------------------------------------------------------------------------- */
     145std::vector<unsigned> PipelineCompiler::lexicalOrderingOfStreamIO() const {
     146
     147    const auto numOfInputs = mKernel->getNumOfStreamInputs();
     148    const auto numOfOutputs = mKernel->getNumOfStreamOutputs();
     149    const auto numOfPorts = numOfInputs + numOfOutputs;
     150
     151    using Graph = adjacency_list<vecS, vecS, bidirectionalS>;
     152
     153    Graph G(numOfPorts);
    94154
    95155    // enumerate the input relations
    96     for (unsigned i = 0; i < n; ++i) {
     156    for (unsigned i = 0; i < numOfInputs; ++i) {
    97157        const Binding & input = mKernel->getInputStreamSetBinding(i);
    98158        const ProcessingRate & rate = input.getRate();
     
    101161            std::tie(port, j) = mKernel->getStreamPort(rate.getReference());
    102162            assert ("input stream cannot refer to an output stream" && port == Port::Input);
    103             add_edge(i, j, rate.getKind(), G);
     163            add_edge(j, i, G);
    104164        }
    105165    }
    106166    // and then enumerate the output relations
    107     for (unsigned i = 0; i < m; ++i) {
     167    for (unsigned i = 0; i < numOfOutputs; ++i) {
    108168        const Binding & output = mKernel->getOutputStreamSetBinding(i);
    109169        const ProcessingRate & rate = output.getRate();
     
    111171            Port port; unsigned j;
    112172            std::tie(port, j) = mKernel->getStreamPort(rate.getReference());
    113             add_edge((i + n), j + ((port == Port::Output) ? n : 0), rate.getKind(), G);
    114         }
    115     }
    116     return G;
     173            add_edge(j + ((port == Port::Output) ? numOfInputs : 0), (i + numOfInputs), G);
     174        }
     175    }
     176
     177    // check any dynamic buffer last
     178
     179
     180
     181    // TODO: add additional constraints on input ports to indicate the ones
     182    // likely to have fewest number of strides?
     183
     184
     185    return lexicalOrdering(std::move(G), mKernel->getName() + " has cyclic port dependencies.");
    117186}
    118187
     
    146215    }
    147216
    148     void printTerminationGraph(const TerminationGraph & G, const std::vector<Kernel *> & pipeline, const unsigned index) {
    149 
    150         auto & out = errs();
    151 
    152         const auto numOfKernels = pipeline.size();
    153 
    154         out << "digraph G" << index << " {\n";
    155         for (auto u : make_iterator_range(vertices(G))) {
    156             out << "v" << u << " [label=\"" << u << ": ";
    157             if (u < numOfKernels) {
    158                 out << pipeline[u]->getName();
    159             } else if (u == numOfKernels) {
    160                 out << "Pipeline";
    161             } else {
    162                 out << "B" << (u - numOfKernels);
     217} // end of anonymous namespace
     218
     219namespace {
     220
     221/** ------------------------------------------------------------------------------------------------------------- *
     222 * @brief minimumConsumed
     223 ** ------------------------------------------------------------------------------------------------------------- */
     224inline LLVM_READNONE RateValue minimumConsumed(const Kernel * const kernel, const Binding & binding) {
     225    if (LLVM_UNLIKELY(binding.hasAttribute(AttrId::Deferred))) {
     226        return RateValue{0};
     227    }
     228    return lowerBound(kernel, binding);
     229}
     230
     231}
     232/** ------------------------------------------------------------------------------------------------------------- *
     233 * @brief makeConsumerGraph
     234 *
     235 * Copy the buffer graph but amalgamate any multi-edges into a single one
     236 ** ------------------------------------------------------------------------------------------------------------- */
     237ConsumerGraph PipelineCompiler::makeConsumerGraph()  const {
     238
     239    const auto lastBuffer = num_vertices(mBufferGraph);
     240    ConsumerGraph G(lastBuffer);
     241    const auto numOfKernels = mPipeline.size();
     242    const auto firstBuffer = numOfKernels + 1;
     243
     244    #warning TODO: ConsumerGraph assumes the dataflow is transitively bounded by the same initial source
     245
     246    #warning REVISIT: ConsumerGraph is not optimal for handling relative rate inputs
     247
     248    std::vector<std::pair<unsigned, unsigned>> consumers; // kernel, portIndex
     249
     250    for (auto bufferVertex = firstBuffer; bufferVertex < lastBuffer; ++bufferVertex) {
     251
     252        const BufferNode & bn = mBufferGraph[bufferVertex];
     253
     254        if (LLVM_UNLIKELY(bn.Type == BufferType::External)) {
     255            continue;
     256        }
     257
     258        // copy the producing edge
     259        const auto pe = in_edge(bufferVertex, mBufferGraph);
     260        add_edge(source(pe, mBufferGraph), bufferVertex, mBufferGraph[pe].Port, G);
     261
     262        // collect the consumers of the i-th buffer
     263        consumers.clear();
     264        for (const auto e : make_iterator_range(out_edges(bufferVertex, mBufferGraph))) {
     265            consumers.emplace_back(target(e, mBufferGraph), mBufferGraph[e].Port);
     266        }
     267
     268        // If the minimum input rate of the j-th consumer is greater than or equal to the maximum input
     269        // rate of the k-th consumer, we do not need to test the j-th consumer. This also ensures that
     270        // for each *FIXED* rate stream, keep only the minimum such rate. However, we may need to insert
     271        // a "fake" edge to mark the last consumer otherwise we'll set it too soon.
     272
     273        if (LLVM_LIKELY(consumers.size() > 1)) {
     274            std::sort(consumers.begin(), consumers.end());
     275
     276            const auto finalConsumer = consumers.back().first;
     277
     278            for (auto j = consumers.begin() + 1; j != consumers.end(); ) {
     279
     280                const Kernel * const kernel_j = mPipeline[j->first];
     281                const Binding & input_j = kernel_j->getInputStreamSetBinding(j->second);
     282                const auto lb_j = minimumConsumed(kernel_j, input_j);
     283
     284                for (auto k = consumers.begin(); k != j; ++k) {
     285                    const Kernel * const kernel_k = mPipeline[k->first];
     286                    const Binding & input_k = kernel_k->getInputStreamSetBinding(k->second);
     287                    const auto ub_k = upperBound(kernel_k, input_k);
     288                    if (LLVM_UNLIKELY(lb_j >= ub_k)) {
     289                        j = consumers.erase(j);
     290                        goto next;
     291                    }
     292                }
     293
     294                for (auto k = j + 1; k != consumers.end(); ++k) {
     295                    const Kernel * const kernel_k = mPipeline[k->first];
     296                    const Binding & input_k = kernel_k->getInputStreamSetBinding(k->second);
     297                    const auto ub_k = upperBound(kernel_k, input_k);
     298                    if (LLVM_UNLIKELY(lb_j >= ub_k)) {
     299                        j = consumers.erase(j);
     300                        goto next;
     301                    }
     302                }
     303
     304                ++j;
     305next:           continue;
    163306            }
    164             out << "\"];\n";
    165         }
    166 
    167         for (auto e : make_iterator_range(edges(G))) {
    168             const auto s = source(e, G);
    169             const auto t = target(e, G);
    170             out << "v" << s << " -> v" << t;
    171            // out << " label=\"" << G[e] << "\"";
    172             out << ";\n";
    173         }
    174 
    175         out << "}\n\n";
    176         out.flush();
    177 
    178     }
    179 
    180 
    181 }
     307            if (LLVM_UNLIKELY(consumers.back().first != finalConsumer)) {
     308                consumers.emplace_back(finalConsumer, FAKE_CONSUMER);
     309            }
     310        }
     311
     312        for (const auto & consumer : consumers) {
     313            add_edge(bufferVertex, consumer.first, consumer.second, G);
     314        }
     315    }
     316
     317    return G;
     318}
     319
    182320
    183321/** ------------------------------------------------------------------------------------------------------------- *
     
    265403}
    266404
     405/** ------------------------------------------------------------------------------------------------------------- *
     406 * @brief makePopCountGraph
     407 *
     408 * Kernel -> Port -> Buffer ordering. Edge between a Kernel and a port indicates the port # of the source
     409 * items stream. Edges between a port and buffer state the ref port #. The kernel and buffer vertices are
     410 * aligned with the BufferGraph. Any buffer vertex with an in-degree > 0 is the reference of a pop count
     411 * rate stream.
     412 ** ------------------------------------------------------------------------------------------------------------- */
     413PopCountGraph PipelineCompiler::makePopCountGraph() const {
     414    const auto numOfKernels = mPipeline.size();
     415
     416    using PopCountVertex = PopCountGraph::vertex_descriptor;
     417    using PortMapping = flat_map<unsigned, PopCountVertex>;
     418
     419    PopCountGraph G(num_vertices(mBufferGraph));
     420    PortMapping M;
     421
     422    auto addPopCountDependency = [&](
     423        const Kernel * const kernel,
     424        const PopCountVertex kernelVertex,
     425        const unsigned portIndex,
     426        const Binding & binding) {
     427
     428        const ProcessingRate & rate = binding.getRate();
     429        if (LLVM_UNLIKELY(rate.isPopCount() || rate.isNegatedPopCount())) {
     430            // determine which port this I/O port refers to
     431            Port portType; unsigned refPort;
     432            std::tie(portType, refPort) = kernel->getStreamPort(rate.getReference());
     433
     434            // verify the reference stream is an input port
     435            if (LLVM_UNLIKELY(portType != Port::Input)) {
     436                std::string tmp;
     437                raw_string_ostream msg(tmp);
     438                msg << kernel->getName();
     439                msg << ": pop count rate for ";
     440                msg << binding.getName();
     441                msg << " cannot refer to an output stream";
     442                report_fatal_error(msg.str());
     443            }
     444
     445            const auto type = rate.isPopCount() ? CountingType::Positive : CountingType::Negative;
     446
     447            // check if we've already created a vertex for the ref port ...
     448            const auto f = M.find(refPort);
     449            if (LLVM_UNLIKELY(f != M.end())) {
     450                const auto refPortVertex = f->second;
     451                add_edge(kernelVertex, refPortVertex, PopCountEdge{type, portIndex}, G);
     452                // update the ref -> buffer edge with the counting type
     453                for (const auto e : make_iterator_range(out_edges(refPortVertex, G))) {
     454                    G[e].Type |= type;
     455                }
     456            } else { // ... otherwise map a new vertex to it.
     457
     458                // verify the reference stream is a Fixed rate stream
     459                const Binding & refBinding = kernel->getInputStreamSetBinding(refPort);
     460                if (LLVM_UNLIKELY(refBinding.isDeferred() || !refBinding.getRate().isFixed())) {
     461                    std::string tmp;
     462                    raw_string_ostream msg(tmp);
     463                    msg << kernel->getName();
     464                    msg << ": pop count reference ";
     465                    msg << refBinding.getName();
     466                    msg << " must be a non-deferred Fixed rate stream";
     467                    report_fatal_error(msg.str());
     468                }
     469
     470                const auto refPortVertex = add_vertex(G);
     471                M.emplace(refPort, refPortVertex);
     472                add_edge(kernelVertex, refPortVertex, PopCountEdge{type, portIndex}, G);
     473
     474                // determine which buffer this port refers to by inspecting the buffer graph
     475                const auto bufferVertex = getInputBufferVertex(kernelVertex, refPort);
     476                add_edge(refPortVertex, bufferVertex, PopCountEdge{type, refPort}, G);
     477            }
     478        }
     479    };
     480
     481    for (unsigned i = 0; i < numOfKernels; ++i) {
     482        const Kernel * const kernel = mPipeline[i];
     483        const auto numOfInputs = kernel->getNumOfStreamInputs();
     484        const auto numOfOutputs = kernel->getNumOfStreamOutputs();
     485        for (unsigned j = 0; j < numOfInputs; ++j) {
     486            const auto & input = kernel->getInputStreamSetBinding(j);
     487            addPopCountDependency(kernel, i, j, input);
     488        }
     489        for (unsigned j = 0; j < numOfOutputs; ++j) {
     490            const auto & output = kernel->getOutputStreamSetBinding(j);
     491            addPopCountDependency(kernel, i, j + numOfInputs, output);
     492        }
     493        M.clear();
     494    }
     495
     496    return G;
     497}
     498
     499
    267500} // end of namespace
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/pipeline_builder.cpp

    r6184 r6228  
    1010#include <queue>
    1111
     12
    1213#warning if two kernels have an identical signature and take the same inputs, they ought to produce the same outputs unless they are nondeterministic.
    1314
     
    4546}
    4647
     48using Kernels = PipelineBuilder::Kernels;
    4749
    4850enum class VertexType { Kernel, StreamSet, Scalar };
     
    7678}
    7779
    78 void enumerateProducerBindings(const VertexType type, const Vertex producerVertex, const Bindings & bindings, Graph & G, Map & M) {
     80void enumerateProducerBindings(const VertexType type, const Vertex producerVertex, const Bindings & bindings, Graph & G, Map & M, const Kernels & K) {
    7981    const auto n = bindings.size();
    8082    for (unsigned i = 0; i < n; ++i) {
    8183        const Relationship * const rel = getRelationship(bindings[i]);
    8284        if (LLVM_UNLIKELY(isa<ScalarConstant>(rel))) continue;
    83         assert (M.count(rel) == 0);
     85        const auto f = M.find(rel);
     86        if (LLVM_UNLIKELY(f != M.end())) {
     87            std::string tmp;
     88            raw_string_ostream out(tmp);
     89            const auto existingProducer = target(out_edge(f->second, G), G);
     90            out << "Both " << K[existingProducer]->getName() <<
     91                   " and " << K[producerVertex]->getName() <<
     92                   " produce the same stream.";
     93            throw std::runtime_error(out.str());
     94        }
    8495        const auto bufferVertex = add_vertex(G);
     96        M.emplace(rel, bufferVertex);
    8597        G[bufferVertex].type = type;
    8698        add_edge(bufferVertex, producerVertex, i, G); // buffer -> producer ordering
    87         M.emplace(rel, bufferVertex);
    8899    }
    89100}
     
    183194    Map M;
    184195
    185     enumerateProducerBindings(VertexType::Scalar, pipelineVertex, mInputScalars, G, M);
    186     enumerateProducerBindings(VertexType::StreamSet, pipelineVertex, mInputStreamSets, G, M);
     196    enumerateProducerBindings(VertexType::Scalar, pipelineVertex, mInputScalars, G, M, mKernels);
     197    enumerateProducerBindings(VertexType::StreamSet, pipelineVertex, mInputStreamSets, G, M, mKernels);
    187198    for (unsigned i = 0; i < numOfKernels; ++i) {
    188         enumerateProducerBindings(VertexType::Scalar, i, mKernels[i]->getOutputScalarBindings(), G, M);
    189         enumerateProducerBindings(VertexType::StreamSet, i, mKernels[i]->getOutputStreamSetBindings(), G, M);
     199        enumerateProducerBindings(VertexType::Scalar, i, mKernels[i]->getOutputScalarBindings(), G, M, mKernels);
     200        enumerateProducerBindings(VertexType::StreamSet, i, mKernels[i]->getOutputStreamSetBindings(), G, M, mKernels);
    190201    }
    191202    for (unsigned i = 0; i < numOfKernels; ++i) {
     
    261272
    262273    Kernels pipeline;
    263     pipeline.reserve(ordering.size());   
     274    pipeline.reserve(ordering.size());
    264275
    265276    const std::unique_ptr<kernel::KernelBuilder> & b = mDriver.getBuilder();
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/pipeline_compiler.hpp

    r6186 r6228  
    99#include <boost/container/flat_map.hpp>
    1010#include <boost/graph/adjacency_list.hpp>
    11 //#include <boost/graph/topological_sort.hpp>
     11#include <boost/graph/topological_sort.hpp>
     12#include <boost/range/adaptor/reversed.hpp>
     13//#include <boost/serialization/strong_typedef.hpp>
    1214#include <boost/math/common_factor_rt.hpp>
    1315#include <llvm/IR/Module.h>
     
    1618#include <queue>
    1719
    18 // #define PRINT_DEBUG_MESSAGES
     20//#define PRINT_DEBUG_MESSAGES
    1921
    2022using namespace boost;
    2123using namespace boost::math;
     24using namespace boost::adaptors;
    2225using boost::container::flat_set;
    2326using boost::container::flat_map;
     
    3033
    3134namespace kernel {
     35
     36#include <util/enum_flags.hpp>
    3237
    3338using Port = Kernel::Port;
     
    3944using Kernels = PipelineKernel::Kernels;
    4045using CallBinding = PipelineKernel::CallBinding;
     46using BuilderRef = const std::unique_ptr<kernel::KernelBuilder> &;
     47
     48// TODO: replace ints used for port #s with the following
     49// BOOST_STRONG_TYPEDEF(unsigned, PortNumber)
    4150
    4251#warning TODO: these graphs are similar; look into streamlining their generation.
    4352
    44 struct BufferNode { // use boost::variant instead of union? std::variant is c17+
    45     Kernel * kernel = nullptr;
    46     StreamSetBuffer * buffer = nullptr;
    47     RateValue lower;
    48     RateValue upper;
    49 };
    50 
    51 struct BufferRateData {   
    52 
    53     RateValue minimum;
    54     RateValue maximum;
    55     unsigned port;
    56 
    57     BufferRateData(const unsigned port = 0) : port(port) { }
     53// TODO: split pipeline vertex into input and output vertices to keep all graphs DAGs
     54// without having to delete edges.
     55
     56enum class BufferType : unsigned {
     57    Internal = 0
     58    , External = 1
     59    , Managed = 2
     60};
     61
     62struct BufferNode {
     63    Value *             TotalItems;
     64
     65
     66    kernel::Kernel *    Kernel;
     67    StreamSetBuffer *   Buffer;
     68
     69    RateValue           Lower;
     70    RateValue           Upper;
     71
     72    unsigned            Overflow;
     73    unsigned            Fasimile;
     74
     75    BufferType          Type;
     76
     77    BufferNode() : TotalItems(nullptr), Kernel(nullptr), Buffer(nullptr), Lower(), Upper(), Overflow(0), Fasimile(0), Type(BufferType::Internal) {}
     78};
     79
     80struct BufferRateData {
     81
     82    RateValue Minimum;
     83    RateValue Maximum;
     84    unsigned  Port;
     85
     86    BufferRateData() = default;
    5887
    5988    BufferRateData(const unsigned port, RateValue min, RateValue max)
    60     : minimum(std::move(min)), maximum(std::move(max)), port(port) { }
    61 };
    62 
    63 using BufferGraph = adjacency_list<vecS, vecS, bidirectionalS, BufferNode, BufferRateData>; // unsigned>;
     89    : Minimum(std::move(min)), Maximum(std::move(max)), Port(port) { }
     90};
     91
     92using BufferGraph = adjacency_list<vecS, vecS, bidirectionalS, BufferNode, BufferRateData>;
    6493
    6594template <typename vertex_descriptor>
     
    6998
    7099struct ConsumerNode {
    71     Value * consumed = nullptr;
    72     PHINode * phiNode = nullptr;
    73 };
     100    Value * Consumed = nullptr;
     101    PHINode * PhiNode = nullptr;
     102};
     103
     104enum : unsigned { FAKE_CONSUMER = (std::numeric_limits<unsigned>::max()) };
    74105
    75106using ConsumerGraph = adjacency_list<vecS, vecS, bidirectionalS, ConsumerNode, unsigned>;
     
    81112using KernelMap = flat_map<const Kernel *, Value>;
    82113
    83 struct Channel {
    84     Channel() = default;
    85     Channel(const RateValue & ratio, const StreamSetBuffer * const buffer = nullptr, const unsigned operand = 0)
    86     : ratio(ratio), buffer(buffer), portIndex(operand) {
    87 
    88     }
    89     RateValue               ratio;
    90     const StreamSetBuffer * buffer;
    91     unsigned                portIndex;
    92 };
    93 
    94 using ChannelGraph = adjacency_list<vecS, vecS, bidirectionalS, const Kernel *, Channel>;
    95 
    96114using TerminationGraph = adjacency_list<hash_setS, vecS, bidirectionalS, Value *>;
    97115
    98116using ScalarDependencyGraph = adjacency_list<vecS, vecS, bidirectionalS, Value *, unsigned>;
    99 
    100 using PortDependencyGraph = adjacency_list<vecS, vecS, bidirectionalS, no_property, RateId>;
    101117
    102118struct OverflowRequirement {
     
    110126using OverflowRequirements = StreamSetBufferMap<OverflowRequirement>;
    111127
    112 using PopCountStreamDependencyGraph = adjacency_list<vecS, vecS, directedS, Value *>;
    113 
    114 using PopCountStreamDependencyVertex = PopCountStreamDependencyGraph::vertex_descriptor;
    115 
    116128struct PopCountData {
    117     unsigned hasConstructedArray = -1;
    118     PopCountStreamDependencyVertex vertex = 0;
    119     Value * initial = nullptr;
    120     Value * baseIndex = nullptr;
    121     AllocaInst * individualCountArray = nullptr;
    122     AllocaInst * partialSumArray = nullptr;
    123     Value * strideCapacity = nullptr;
    124     Value * finalPartialSum = nullptr;
    125     Value * maximumNumOfStrides = nullptr;
    126 };
    127 
    128 using PopCountDataMap = flat_map<std::pair<const StreamSetBuffer *, bool>, PopCountData>;
     129    // compilation state
     130    PHINode *    PhiNode;
     131    Value *      Processed;
     132    unsigned     Encountered;
     133    Value *      InitialOffset;
     134
     135    // analysis state
     136    RateValue    FieldWidth;
     137    bool         HasArray;
     138    bool         HasNegatedArray;
     139    bool         UsesConsumedCount;
     140    bool         AlwaysNegated;
     141
     142    PopCountData() = default;
     143};
     144
     145enum CountingType : unsigned {
     146    Unknown = 0
     147    , Positive = 1
     148    , Negative = 2
     149    , Both = Positive | Negative
     150};
     151
     152ENABLE_ENUM_FLAGS(CountingType)
     153
     154struct PopCountEdge {
     155    CountingType Type;
     156    unsigned     Port;
     157    PopCountEdge() : Type(Unknown), Port(0) { }
     158    PopCountEdge(const CountingType type, const unsigned port) : Type(type), Port(port) { }
     159};
     160
     161using PopCountGraph = adjacency_list<vecS, vecS, bidirectionalS, no_property, PopCountEdge>;
    129162
    130163class PipelineCompiler {
    131164public:
    132165
    133     using BuilderRef = const std::unique_ptr<kernel::KernelBuilder> &;
    134 
    135166    PipelineCompiler(BuilderRef b, PipelineKernel * const pipelineKernel);
    136167
    137     void addHandlesToPipelineKernel(BuilderRef b);
     168    void addInternalKernelProperties(BuilderRef b);
    138169    void generateInitializeMethod(BuilderRef b);
    139170    void generateSingleThreadKernelMethod(BuilderRef b);
     
    152183    void end(BuilderRef b, const unsigned step);
    153184
     185    Value * allocateThreadLocalSpace(BuilderRef b);
     186    void setThreadLocalSpace(BuilderRef b, Value * const localState);
     187    void deallocateThreadLocalSpace(BuilderRef b, Value * const localState);
     188
    154189// inter-kernel functions
    155190
    156     Value * checkForSufficientInputDataAndOutputSpace(BuilderRef b);
    157     Value * determineNumOfLinearStrides(BuilderRef b);
    158     void calculateNonFinalItemCounts(BuilderRef b, Value * const numOfStrides);
     191    void checkForSufficientInputDataAndOutputSpace(BuilderRef b);
     192    void determineNumOfLinearStrides(BuilderRef b);
     193    void calculateNonFinalItemCounts(BuilderRef b);
    159194    void calculateFinalItemCounts(BuilderRef b);
    160     void provideAllInputAndOutputSpace(BuilderRef b);
    161195    void writeKernelCall(BuilderRef b);
     196    void computeFullyProcessedItemCounts(BuilderRef b);
    162197    void writeCopyBackLogic(BuilderRef b);
    163198    void writeCopyForwardLogic(BuilderRef b);
    164     void allocateThreadLocalState(BuilderRef b, const Port port, const unsigned i);
    165     void deallocateThreadLocalState(BuilderRef b, const Port port, const unsigned i);
    166 
    167     void checkIfAllProducingKernelsHaveTerminated(BuilderRef b);
     199
    168200    void zeroFillPartiallyWrittenOutputStreams(BuilderRef b);
    169201    void initializeKernelCallPhis(BuilderRef b);
    170202    void initializeKernelExitPhis(BuilderRef b);
    171     void storeCopyForwardProducedItemCounts(BuilderRef b);
    172     void storeCopyBackProducedItemCounts(BuilderRef b);
     203
     204    void readInitialProducedItemCounts(BuilderRef b);
    173205    void computeMinimumConsumedItemCounts(BuilderRef b);
    174206    void writeFinalConsumedItemCounts(BuilderRef b);
    175     void readCurrentProducedItemCounts(BuilderRef b);
     207    void readFinalProducedItemCounts(BuilderRef b);
    176208    void releaseCurrentSegment(BuilderRef b);
    177209    void writeCopyToOverflowLogic(BuilderRef b);
    178     void checkForSufficientInputData(BuilderRef b, const unsigned index);
    179     void checkForSufficientOutputSpaceOrExpand(BuilderRef b, const unsigned index);
     210    void checkForSufficientInputData(BuilderRef b, const unsigned inputPort);
     211    void checkForSufficientOutputSpaceOrExpand(BuilderRef b, const unsigned outputPort);
     212    void incrementItemCountsOfCountableRateStreams(BuilderRef b);
     213    enum class OverflowCopy { Forwards, Backwards };
     214    Value * writeOverflowCopy(BuilderRef b, const StreamSetBuffer * const buffer, const OverflowCopy direction, Value * const itemsToCopy) const;
     215
    180216
    181217// intra-kernel functions
    182218
    183219    void branchToTargetOrLoopExit(BuilderRef b, Value * const cond, BasicBlock * const target);
    184     void expandOutputBuffer(BuilderRef b, Value * const hasEnough, const unsigned index, BasicBlock * const target);
    185     Value * getInputStrideLength(BuilderRef b, const unsigned index);
    186     Value * getOutputStrideLength(BuilderRef b, const unsigned index);
    187     Value * getInitialStrideLength(BuilderRef b, const Binding & binding);
    188     Value * calculateNumOfLinearItems(BuilderRef b, const Binding & binding, Value * const numOfStrides);
    189     Value * getAccessibleInputItems(BuilderRef b, const unsigned index);
    190     Value * getNumOfAccessibleStrides(BuilderRef b, const unsigned index);
    191     Value * getNumOfWritableStrides(BuilderRef b, const unsigned index);
    192     Value * getWritableOutputItems(BuilderRef b, const unsigned index);
    193     Value * calculateBufferExpansionSize(BuilderRef b, const unsigned index);
    194     Value * addLookahead(BuilderRef b, const unsigned index, Value * itemCount) const;
    195     Value * subtractLookahead(BuilderRef b, const unsigned index, Value * itemCount) const;
    196     Value * getFullyProcessedItemCount(BuilderRef b, const Binding & input) const;
    197     Value * getTotalItemCount(BuilderRef b, const StreamSetBuffer * buffer) const;
    198     Value * isTerminated(BuilderRef b) const;
     220    void expandOutputBuffers(BuilderRef b);
     221    void expandOutputBuffer(BuilderRef b, const unsigned outputPort, Value * const hasEnough, BasicBlock * const target);
     222
     223    Value * getAlreadyProcessedItemCount(BuilderRef b, const unsigned inputPort);
     224    Value * getAlreadyProducedItemCount(BuilderRef b, const unsigned outputPort);
     225
     226    Value * getInputStrideLength(BuilderRef b, const unsigned inputPort);
     227    Value * getOutputStrideLength(BuilderRef b, const unsigned outputPort);
     228    Value * getInitialStrideLength(BuilderRef b, const Port port, const unsigned portNum);
     229    Value * getMaximumStrideLength(BuilderRef b, const Port port, const unsigned portNum);
     230    Value * calculateNumOfLinearItems(BuilderRef b, const Port portType,  const unsigned portNum);
     231    Value * getAccessibleInputItems(BuilderRef b, const unsigned inputPort);
     232    Value * getNumOfAccessibleStrides(BuilderRef b, const unsigned inputPort);
     233    Value * getNumOfWritableStrides(BuilderRef b, const unsigned outputPort);
     234    Value * getWritableOutputItems(BuilderRef b, const unsigned outputPort);
     235    Value * calculateBufferExpansionSize(BuilderRef b, const unsigned outputPort);
     236    Value * willNotOverwriteOverflow(BuilderRef b, const unsigned outputPort);
     237    Value * addLookahead(BuilderRef b, const unsigned inputPort, Value * itemCount) const;
     238    Value * subtractLookahead(BuilderRef b, const unsigned inputPort, Value * itemCount) const;
     239    Constant * getLookahead(BuilderRef b, const unsigned inputPort) const;
     240    Value * truncateBlockSize(BuilderRef b, const Binding & binding, Value * itemCount, Value * all) const;
     241    Value * getTotalItemCount(BuilderRef b, const unsigned inputPort) const;
     242    Value * terminatedExplicitly(BuilderRef b) const;
     243    Value * hasProducerTerminated(BuilderRef b, const unsigned inputPort) const;
    199244    void setTerminated(BuilderRef b);
     245    void resetMemoizedFields();
    200246
    201247// pop-count functions
    202248
    203     void initializePopCounts(BuilderRef b);
    204     PopCountStreamDependencyGraph makePopCountStreamDependencyGraph(BuilderRef b);
    205     void addPopCountStreamDependency(BuilderRef b, const unsigned index, const Binding & binding, PopCountStreamDependencyGraph & G);
    206 
    207     Value * getInitialNumOfLinearPopCountItems(BuilderRef b, const ProcessingRate & rate);
    208     Value * getMaximumNumOfPopCountStrides(BuilderRef b, const ProcessingRate & rate);
    209     Value * getNumOfLinearPopCountItems(BuilderRef b, const ProcessingRate & rate, Value * const numOfStrides);
    210     Value * getPopCountArray(BuilderRef b, const unsigned index);
    211     Value * getNegatedPopCountArray(BuilderRef b, const unsigned index);
    212     void allocateLocalPopCountArray(BuilderRef b, const ProcessingRate & rate);
    213     void deallocateLocalPopCountArray(BuilderRef b, const ProcessingRate & rate);
    214     void storePopCountSourceItemCount(BuilderRef b, const Port port, const unsigned index, Value * const offset, Value * const processable);
    215 
    216     PopCountData & findOrAddPopCountData(BuilderRef b, const ProcessingRate & rate);
    217     PopCountData & findOrAddPopCountData(BuilderRef b, const unsigned index, const bool negated);
    218     Value * getInitialNumOfLinearPopCountItems(BuilderRef b, PopCountData & pc, const unsigned index, const bool negated);
    219     PopCountData & makePopCountArray(BuilderRef b, const ProcessingRate & rate);
    220     PopCountData & makePopCountArray(BuilderRef b, const unsigned index, const bool negated);
    221     Value * getMinimumNumOfSourceItems(BuilderRef b, const PopCountData & pc);
    222     Value * getSourceMarkers(BuilderRef b, PopCountData & pc, const unsigned index, Value * const offset) const;
     249    void addPopCountScalarsToPipelineKernel(BuilderRef b, const unsigned index);
     250
     251    void initializePopCounts();
     252    void writePopCountComputationLogic(BuilderRef b);
     253
     254    void initializePopCountReferenceItemCount(BuilderRef b, const unsigned bufferVertex, not_null<Value *> produced);
     255    void createPopCountReferenceCounts(BuilderRef b);
     256    void computeMinimumPopCountReferenceCounts(BuilderRef b);
     257    void updatePopCountReferenceCounts(BuilderRef b);
     258    LLVM_READNONE Value * getPopCountReferenceConsumedCount(BuilderRef b, const unsigned bufferVertex);
     259    LLVM_READNONE Value * getPopCountInitialOffset(BuilderRef b, const Binding & binding, const unsigned bufferVertex, PopCountData & pc);
     260
     261    Value * getMinimumNumOfLinearPopCountItems(BuilderRef b, const Binding & binding);
     262    Value * getMaximumNumOfPopCountStrides(BuilderRef b, const Binding & binding, not_null<Value *> sourceItemCount, Constant * const lookAhead = nullptr);
     263    Value * getNumOfLinearPopCountItems(BuilderRef b, const Binding & binding);
     264
     265    Value * getReferenceStreamOffset(BuilderRef b, const Binding & binding);
     266
     267    Value * getPopCountArray(BuilderRef b, const unsigned inputPort);
     268    Value * getNegatedPopCountArray(BuilderRef b, const unsigned inputPort);
     269    Value * getIndividualPopCountArray(BuilderRef b, const unsigned inputPort, const unsigned index);
     270
     271    LLVM_READNONE unsigned getPopCountReferencePort(const Kernel * kernel, const ProcessingRate & rate) const;
     272    LLVM_READNONE unsigned getPopCountReferenceBuffer(const Kernel * kernel, const ProcessingRate & rate) const;
     273
     274    StructType * getPopCountThreadLocalStateType(BuilderRef b);
     275    void allocatePopCountArrays(BuilderRef b, Value * base);
     276    void deallocatePopCountArrays(BuilderRef b, Value * base);
     277
     278// pop-count analysis
     279
     280    PopCountGraph makePopCountGraph() const;
     281
     282    LLVM_READNONE PopCountData & getPopCountReference(const unsigned bufferVertex) ;
     283    LLVM_READNONE PopCountData analyzePopCountReference(const unsigned bufferVertex) const;
     284    LLVM_READNONE RateValue popCountReferenceFieldWidth(const unsigned bufferVertex) const;
     285    LLVM_READNONE bool popCountReferenceCanUseConsumedItemCount(const unsigned bufferVertex) const;
     286    LLVM_READNONE bool popCountReferenceRequiresBaseOffset(const unsigned bufferVertex) const;
     287    LLVM_READNONE bool popCountBufferIsUsedBySingleKernel(const unsigned bufferVertex) const;
     288    LLVM_READNONE std::pair<bool, bool> popCountReferenceRequiresPopCountArray(const unsigned bufferVertex) const;
     289    LLVM_READNONE bool popCountReferenceIsAlwaysNegated(const unsigned bufferVertex) const;
     290
     291    template <typename LambdaFunction>
     292    void forEachOutputBufferThatIsAPopCountReference(const unsigned kernelIndex, LambdaFunction func);
     293
     294    template <typename LambdaFunction>
     295    void forEachPopCountReferenceInputPort(const unsigned kernelIndex, LambdaFunction func);
    223296
    224297// consumer recording
     
    228301    void initializeConsumedItemCount(BuilderRef b, const unsigned bufferVertex, Value * const produced);
    229302    void setConsumedItemCount(BuilderRef b, const unsigned bufferVertex, Value * const consumed) const;
    230     Value * getConsumedItemCount(BuilderRef b, const unsigned index) const;
     303    Value * getConsumedItemCount(BuilderRef b, const unsigned outputPort) const;
    231304
    232305// buffer analysis/management functions
    233306
    234307    BufferGraph makeBufferGraph(BuilderRef b);
     308    void addBufferHandlesToPipelineKernel(BuilderRef b, const unsigned index);
    235309    void enumerateBufferProducerBindings(const unsigned producer, const Bindings & bindings, BufferGraph & G, BufferMap & M);
    236310    void enumerateBufferConsumerBindings(const unsigned consumer, const Bindings & bindings, BufferGraph & G, BufferMap & M);
     
    240314    void loadBufferHandles(BuilderRef b);
    241315    void releaseBuffers(BuilderRef b);
    242     LLVM_READNONE bool requiresCopyBack(const StreamSetBuffer * const buffer) const;
    243     LLVM_READNONE bool requiresFacsimile(const StreamSetBuffer * const buffer) const;
    244     LLVM_READNONE unsigned getCopyBack(const StreamSetBuffer * const buffer) const;
    245     LLVM_READNONE unsigned getFacsimile(const StreamSetBuffer * const buffer) const;
    246     LLVM_READNONE bool isPipelineIO(const StreamSetBuffer * const buffer) const;
    247 
    248     Value * getLogicalInputBaseAddress(BuilderRef b, const unsigned index) const;
    249     Value * getLogicalOutputBaseAddress(BuilderRef b, const unsigned index) const;
    250     Value * calculateLogicalBaseAddress(BuilderRef b, const Binding & binding, const StreamSetBuffer * const buffer, Value * const itemCount) const;
     316    LLVM_READNONE bool requiresCopyBack(const unsigned bufferVertex) const;
     317    LLVM_READNONE bool requiresFacsimile(const unsigned bufferVertex) const;
     318    LLVM_READNONE unsigned getCopyBack(const unsigned bufferVertex) const;
     319    LLVM_READNONE unsigned getFacsimile(const unsigned bufferVertex) const;
     320    BufferType getOutputBufferType(const unsigned outputPort) const;
     321
     322    Value * getLogicalInputBaseAddress(BuilderRef b, const unsigned inputPort);
     323    Value * getLogicalOutputBaseAddress(BuilderRef b, const unsigned outputPort);
     324    Value * calculateLogicalBaseAddress(BuilderRef b, const Binding & binding, const StreamSetBuffer * const buffer, Value * const itemCount);
    251325
    252326// cycle counter functions
     
    258332// analysis functions
    259333
    260     PortDependencyGraph makePortDependencyGraph() const;
     334    std::vector<unsigned> lexicalOrderingOfStreamIO() const;
    261335    TerminationGraph makeTerminationGraph() const;
    262336    ScalarDependencyGraph makeScalarDependencyGraph() const;
     
    265339
    266340    Value * getFunctionFromKernelState(BuilderRef b, Type * const type, const std::string & suffix) const;
    267 
    268341    Value * getInitializationFunction(BuilderRef b) const;
    269 
    270342    Value * getDoSegmentFunction(BuilderRef b) const;
    271 
    272343    Value * getFinalizeFunction(BuilderRef b) const;
    273344
    274345    std::string makeKernelName(const unsigned kernelIndex) const;
    275 
    276346    std::string makeBufferName(const unsigned kernelIndex, const Binding & binding) const;
    277 
    278     StreamSetBuffer * getInputBuffer(const unsigned index) const;
    279 
    280     StreamSetBuffer * getOutputBuffer(const unsigned index) const;
    281 
    282     const Binding & getBinding(const Port port, const unsigned i) const {
     347    unsigned getInputBufferVertex(const unsigned inputPort) const;
     348    unsigned getInputBufferVertex(const unsigned kernelVertex, const unsigned inputPort) const;
     349    StreamSetBuffer * getInputBuffer(const unsigned inputPort) const;
     350    unsigned getOutputBufferVertex(const unsigned outputPort) const;
     351    unsigned getOutputBufferVertex(const unsigned kernelVertex, const unsigned outputPort) const;
     352    StreamSetBuffer * getOutputBuffer(const unsigned outputPort) const;
     353
     354    static LLVM_READNONE const Binding & getBinding(const Kernel * kernel, const Port port, const unsigned i) {
    283355        if (port == Port::Input) {
    284             return mKernel->getInputStreamSetBinding(i);
    285         } else {
    286             return mKernel->getOutputStreamSetBinding(i);
     356            return kernel->getInputStreamSetBinding(i);
     357        } else if (port == Port::Output) {
     358            return kernel->getOutputStreamSetBinding(i);
    287359        }
     360        llvm_unreachable("unknown port binding type!");
    288361    }
    289362
     
    296369    void writeOutputScalars(BuilderRef b, const unsigned u, std::vector<Value *> & args);
    297370
    298     void itemCountSanityCheck(BuilderRef b, const Binding & binding, const std::string & presentLabel, const std::string & pastLabel,
    299                               Value * const itemCount, Value * const expected, Value * const terminated);
     371    void itemCountSanityCheck(BuilderRef b, const Binding & binding, const std::string & pastLabel,
     372                              Value * const itemCount, Value * const expected);
    300373
    301374protected:
     
    306379    OwnedStreamSetBuffers                       mOwnedBuffers;
    307380    unsigned                                    mKernelIndex = 0;
    308     const Kernel *                              mKernel = nullptr;
     381    Kernel *                                    mKernel = nullptr;
    309382
    310383    // pipeline state
    311384    PHINode *                                   mTerminatedPhi = nullptr;
     385    PHINode *                                   mTerminatedFlag = nullptr;
    312386    PHINode *                                   mSegNo = nullptr;
    313387    BasicBlock *                                mPipelineLoop = nullptr;
     
    319393    BasicBlock *                                mKernelExit = nullptr;
    320394    BasicBlock *                                mPipelineEnd = nullptr;
    321 
    322     // pipeline state
    323     StreamSetBufferMap<Value *>                 mInputConsumedItemCountPhi;
    324     StreamSetBufferMap<Value *>                 mTotalItemCount;
    325     StreamSetBufferMap<Value *>                 mConsumedItemCount;
    326395    std::vector<Value *>                        mOutputScalars;
    327396
    328397    // kernel state
    329     Value *                                     mNoMore = nullptr;
    330398    Value *                                     mNumOfLinearStrides = nullptr;
    331     PHINode *                                   mNumOfLinearStridesPhi = nullptr;
    332     Value *                                     mNonFinal = nullptr;
    333     PHINode *                                   mIsFinalPhi = nullptr;
    334 
     399
     400    std::vector<unsigned>                       mPortOrdering;
     401
     402    std::vector<Value *>                        mAlreadyProcessedItemCount; // entering the stride
    335403    std::vector<Value *>                        mInputStrideLength;
    336404    std::vector<Value *>                        mAccessibleInputItems;
    337     std::vector<PHINode *>                      mAccessibleInputItemsPhi;
    338     std::vector<Value *>                        mInputStreamHandle;
    339 
     405    std::vector<PHINode *>                      mLinearInputItemsPhi;
     406    std::vector<Value *>                        mFullyProcessedItemCount;
     407
     408    std::vector<Value *>                        mInitiallyProducedItemCount; // entering the kernel
     409    std::vector<Value *>                        mAlreadyProducedItemCount; // entering the stride
    340410    std::vector<Value *>                        mOutputStrideLength;
    341411    std::vector<Value *>                        mWritableOutputItems;
    342     std::vector<Value *>                        mCopyForwardProducedOutputItems;
    343     std::vector<Value *>                        mAnteriorProcessedItemCount;
    344     std::vector<Value *>                        mCopyBackProducedOutputItems;
    345     std::vector<PHINode *>                      mWritableOutputItemsPhi;
    346     std::vector<Value *>                        mOutputStreamHandle;
     412    std::vector<PHINode *>                      mLinearOutputItemsPhi;
    347413
    348414    // debug + misc state
     
    354420
    355421    // popcount state
    356     PopCountStreamDependencyGraph               mPopCountDependencyGraph;
    357     PopCountDataMap                             mPopCountDataMap;
     422    Value *                                     mPopCountState;
     423    flat_map<unsigned, PopCountData>            mPopCountData;
    358424
    359425
    360426    // analysis state
    361     flat_set<const StreamSetBuffer *>           mIsPipelineIO;
    362     OverflowRequirements                        mOverflowRequirements;
    363427    BufferGraph                                 mBufferGraph;
    364428    ConsumerGraph                               mConsumerGraph;
     429    ScalarDependencyGraph                       mScalarDependencyGraph;
    365430    TerminationGraph                            mTerminationGraph;
    366     ScalarDependencyGraph                       mScalarDependencyGraph;
    367 
    368 };
    369 
     431    PopCountGraph                               mPopCountGraph;
     432
     433};
     434
     435/** ------------------------------------------------------------------------------------------------------------- *
     436 * @brief constructor
     437 ** ------------------------------------------------------------------------------------------------------------- */
    370438inline PipelineCompiler::PipelineCompiler(BuilderRef b, PipelineKernel * const pipelineKernel)
    371439: mPipelineKernel(pipelineKernel)
     
    373441, mBufferGraph(makeBufferGraph(b))
    374442, mConsumerGraph(makeConsumerGraph())
     443, mScalarDependencyGraph(makeScalarDependencyGraph())
    375444, mTerminationGraph(makeTerminationGraph())
    376 , mScalarDependencyGraph(makeScalarDependencyGraph()) {
    377 
    378 
     445, mPopCountGraph(makePopCountGraph()) {
    379446
    380447
     
    384451 * @brief getInputBuffer
    385452 ** ------------------------------------------------------------------------------------------------------------- */
    386 inline StreamSetBuffer * PipelineCompiler::getInputBuffer(const unsigned index) const {
    387     for (const auto e : make_iterator_range(in_edges(mKernelIndex, mBufferGraph))) {
    388         if (mBufferGraph[e].port == index) {
    389             return mBufferGraph[source(e, mBufferGraph)].buffer;
     453inline unsigned PipelineCompiler::getInputBufferVertex(const unsigned inputPort) const {
     454    return getInputBufferVertex(mKernelIndex, inputPort);
     455}
     456
     457/** ------------------------------------------------------------------------------------------------------------- *
     458 * @brief getInputBuffer
     459 ** ------------------------------------------------------------------------------------------------------------- */
     460inline unsigned PipelineCompiler::getInputBufferVertex(const unsigned kernelVertex, const unsigned inputPort) const {
     461    for (const auto e : make_iterator_range(in_edges(kernelVertex, mBufferGraph))) {
     462        if (mBufferGraph[e].Port == inputPort) {
     463            return source(e, mBufferGraph);
    390464        }
    391465    }
    392466    llvm_unreachable("input buffer not found");
    393     return nullptr;
    394 }
    395 
    396 /** ------------------------------------------------------------------------------------------------------------- *
    397  * @brief getOutputBuffer
    398  ** ------------------------------------------------------------------------------------------------------------- */
    399 inline StreamSetBuffer * PipelineCompiler::getOutputBuffer(const unsigned index) const {
    400     for (const auto e : make_iterator_range(out_edges(mKernelIndex, mBufferGraph))) {
    401         if (mBufferGraph[e].port == index) {
    402             return mBufferGraph[target(e, mBufferGraph)].buffer;
     467}
     468
     469/** ------------------------------------------------------------------------------------------------------------- *
     470 * @brief getInputBuffer
     471 ** ------------------------------------------------------------------------------------------------------------- */
     472inline StreamSetBuffer * PipelineCompiler::getInputBuffer(const unsigned inputPort) const {
     473    return mBufferGraph[getInputBufferVertex(inputPort)].Buffer;
     474}
     475
     476/** ------------------------------------------------------------------------------------------------------------- *
     477 * @brief getOutputBufferVertex
     478 ** ------------------------------------------------------------------------------------------------------------- */
     479inline unsigned PipelineCompiler::getOutputBufferVertex(const unsigned outputPort) const {
     480    return getOutputBufferVertex(mKernelIndex, outputPort);
     481}
     482
     483/** ------------------------------------------------------------------------------------------------------------- *
     484 * @brief getOutputBufferVertex
     485 ** ------------------------------------------------------------------------------------------------------------- */
     486inline unsigned PipelineCompiler::getOutputBufferVertex(const unsigned kernelVertex, const unsigned outputPort) const {
     487    for (const auto e : make_iterator_range(out_edges(kernelVertex, mBufferGraph))) {
     488        if (mBufferGraph[e].Port == outputPort) {
     489            return target(e, mBufferGraph);
    403490        }
    404491    }
    405492    llvm_unreachable("output buffer not found");
    406     return nullptr;
     493}
     494
     495
     496/** ------------------------------------------------------------------------------------------------------------- *
     497 * @brief getOutputBuffer
     498 ** ------------------------------------------------------------------------------------------------------------- */
     499inline StreamSetBuffer * PipelineCompiler::getOutputBuffer(const unsigned outputPort) const {
     500    return mBufferGraph[getOutputBufferVertex(outputPort)].Buffer;
    407501}
    408502
     
    461555}
    462556
    463 inline unsigned LLVM_READNONE getItemWidth(const Type * ty ) {
     557/** ------------------------------------------------------------------------------------------------------------- *
     558 * @brief getLog2SizeWidth
     559 ** ------------------------------------------------------------------------------------------------------------- */
     560LLVM_READNONE inline Constant * getLog2SizeWidth(BuilderRef b) {
     561    return b->getSize(std::log2(b->getSizeTy()->getBitWidth()));
     562}
     563
     564/** ------------------------------------------------------------------------------------------------------------- *
     565 * @brief getLog2BlockWidth
     566 ** ------------------------------------------------------------------------------------------------------------- */
     567LLVM_READNONE inline Constant * getLog2BlockWidth(BuilderRef b) {
     568    return b->getSize(std::log2(b->getBitBlockWidth()));
     569}
     570
     571/** ------------------------------------------------------------------------------------------------------------- *
     572 * @brief getItemWidth
     573 ** ------------------------------------------------------------------------------------------------------------- */
     574LLVM_READNONE inline unsigned getItemWidth(const Type * ty ) {
    464575    if (LLVM_LIKELY(isa<ArrayType>(ty))) {
    465576        ty = ty->getArrayElementType();
     
    469580
    470581template <typename Graph>
     582inline typename graph_traits<Graph>::edge_descriptor first_in_edge(const typename graph_traits<Graph>::vertex_descriptor u, const Graph & G) {
     583    return *in_edges(u, G).first;
     584}
     585
     586template <typename Graph>
    471587inline typename graph_traits<Graph>::edge_descriptor in_edge(const typename graph_traits<Graph>::vertex_descriptor u, const Graph & G) {
    472588    assert (in_degree(u, G) == 1);
    473     return *in_edges(u, G).first;
     589    return first_in_edge(u, G);
     590}
     591
     592template <typename Graph>
     593inline typename graph_traits<Graph>::vertex_descriptor parent(const typename graph_traits<Graph>::vertex_descriptor u, const Graph & G) {
     594    return source(in_edge(u, G), G);
     595}
     596
     597template <typename Graph>
     598inline typename graph_traits<Graph>::edge_descriptor first_out_edge(const typename graph_traits<Graph>::vertex_descriptor u, const Graph & G) {
     599    return *out_edges(u, G).first;
     600}
     601
     602template <typename Graph>
     603inline typename graph_traits<Graph>::edge_descriptor out_edge(const typename graph_traits<Graph>::vertex_descriptor u, const Graph & G) {
     604    assert (out_degree(u, G) == 1);
     605    return first_out_edge(u, G);
     606}
     607
     608template <typename Graph>
     609inline typename graph_traits<Graph>::vertex_descriptor child(const typename graph_traits<Graph>::vertex_descriptor u, const Graph & G) {
     610    return target(out_edge(u, G), G);
    474611}
    475612
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/pipeline_kernel.cpp

    r6184 r6228  
    2828    } else { // add handles for each of unique streams
    2929        mCompiler = llvm::make_unique<PipelineCompiler>(b, this);
    30         mCompiler->addHandlesToPipelineKernel(b);
     30        mCompiler->addInternalKernelProperties(b);
    3131    }
    3232}
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/pipeline_logic.hpp

    r6184 r6228  
    1010 ** ------------------------------------------------------------------------------------------------------------- */
    1111void PipelineCompiler::generateSingleThreadKernelMethod(BuilderRef b) {
     12    Value * const localState = allocateThreadLocalSpace(b);
     13    setThreadLocalSpace(b, localState);
    1214    start(b, b->getSize(0));
    1315    for (unsigned i = 0; i < mPipeline.size(); ++i) {
     
    1618    }
    1719    end(b, 1);
     20    deallocateThreadLocalSpace(b, localState);
    1821}
    1922
     
    3942    Value * const instance = mPipelineKernel->getHandle(); assert (instance);
    4043
    41     StructType * const threadStructType = StructType::get(m->getContext(), {instance->getType(), sizeTy});
     44    StructType * const threadStructType = StructType::get(m->getContext(), {instance->getType(), sizeTy, voidPtrTy});
    4245
    4346    FunctionType * const threadFuncType = FunctionType::get(b->getVoidTy(), {voidPtrTy}, false);
     
    5861        threadIdPtr[i] = b->CreateGEP(pthreads, {ZERO, b->getInt32(i)});
    5962    }
    60     // use the process thread to handle the initial segment function after spawning (n - 1) threads to handle the subsequent offsets
     63    // use the process thread to handle the initial segment function after spawning
     64    // (n - 1) threads to handle the subsequent offsets
    6165    ConstantInt * const ONE = b->getInt32(1);
     66    ConstantInt * const TWO = b->getInt32(2);
     67    Value * localState[threads];
    6268    for (unsigned i = 0; i < threads; ++i) {
    6369        AllocaInst * const threadState = b->CreateAlloca(threadStructType);
    6470        b->CreateStore(instance, b->CreateGEP(threadState, {ZERO, ZERO}));
    6571        b->CreateStore(b->getSize(i + 1), b->CreateGEP(threadState, {ZERO, ONE}));
     72        localState[i] = allocateThreadLocalSpace(b);
     73        b->CreateStore(localState[i], b->CreateGEP(threadState, {ZERO, TWO}));
    6674        b->CreatePThreadCreateCall(threadIdPtr[i], nullVoidPtrVal, threadFunc, threadState);
    6775    }
     76
    6877    AllocaInst * const threadState = b->CreateAlloca(threadStructType);
    6978    b->CreateStore(instance, b->CreateGEP(threadState, {ZERO, ZERO}));
    7079    b->CreateStore(b->getSize(0), b->CreateGEP(threadState, {ZERO, ONE}));
    7180    b->CreateCall(threadFunc, b->CreatePointerCast(threadState, voidPtrTy));
     81
    7282    AllocaInst * const status = b->CreateAlloca(voidPtrTy);
    7383    for (unsigned i = 0; i < threads; ++i) {
    7484        Value * threadId = b->CreateLoad(threadIdPtr[i]);
    7585        b->CreatePThreadJoinCall(threadId, status);
     86        deallocateThreadLocalSpace(b, localState[i]);
    7687    }
    7788    b->CreateRetVoid();
     
    8495    mPipelineKernel->setHandle(b, b->CreateLoad(b->CreateGEP(threadStruct, {ZERO, ZERO})));
    8596    Value * const segmentOffset = b->CreateLoad(b->CreateGEP(threadStruct, {ZERO, ONE}));
     97    setThreadLocalSpace(b, b->CreateLoad(b->CreateGEP(threadStruct, {ZERO, TWO})));
    8698    // generate the pipeline logic for this thread
    8799    start(b, segmentOffset);
     
    105117}
    106118
     119enum : int {
     120    POP_COUNT_STRUCT_INDEX = 0
     121};
     122
     123
     124/** ------------------------------------------------------------------------------------------------------------- *
     125 * @brief allocateThreadLocalSpace
     126 ** ------------------------------------------------------------------------------------------------------------- */
     127inline Value * PipelineCompiler::allocateThreadLocalSpace(BuilderRef b) {
     128    // malloc the local state object
     129    StructType * const popCountTy = getPopCountThreadLocalStateType(b);
     130    StructType * const localStateTy = StructType::get(popCountTy, nullptr);
     131    Value * const localState = b->CreateCacheAlignedAlloca(localStateTy);
     132    // and any pop count refs
     133    Constant * const ZERO = b->getInt32(0);
     134    Constant * const POP_COUNT_STRUCT = b->getInt32(POP_COUNT_STRUCT_INDEX);
     135    allocatePopCountArrays(b, b->CreateGEP(localState, {ZERO, POP_COUNT_STRUCT}));
     136    return localState;
     137}
     138
     139/** ------------------------------------------------------------------------------------------------------------- *
     140 * @brief setThreadLocalSpace
     141 ** ------------------------------------------------------------------------------------------------------------- */
     142inline void PipelineCompiler::setThreadLocalSpace(BuilderRef b, Value * const localState) {
     143    Constant * const ZERO = b->getInt32(0);
     144    Constant * const POP_COUNT_STRUCT = b->getInt32(POP_COUNT_STRUCT_INDEX);
     145    mPopCountState = b->CreateGEP(localState, {ZERO, POP_COUNT_STRUCT});
     146}
     147
     148/** ------------------------------------------------------------------------------------------------------------- *
     149 * @brief deallocateThreadLocalSpace
     150 ** ------------------------------------------------------------------------------------------------------------- */
     151inline void PipelineCompiler::deallocateThreadLocalSpace(BuilderRef b, Value * const localState) {
     152    Constant * const ZERO = b->getInt32(0);
     153    Constant * const POP_COUNT_STRUCT = b->getInt32(POP_COUNT_STRUCT_INDEX);
     154    deallocatePopCountArrays(b, b->CreateGEP(localState, {ZERO, POP_COUNT_STRUCT}));
     155}
     156
    107157}
    108158
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/popcount_logic.hpp

    r6189 r6228  
    33namespace kernel {
    44
    5 /** ------------------------------------------------------------------------------------------------------------- *
    6  * @brief initializePopCounts
    7  ** ------------------------------------------------------------------------------------------------------------- */
    8 void PipelineCompiler::initializePopCounts(BuilderRef b) {
    9     mPopCountDependencyGraph = makePopCountStreamDependencyGraph(b);
     5namespace {
     6
     7enum : int {
     8    BASE_OFFSET_INDEX       // the block # of the zeroth
     9    , CAPACITY_INDEX        // capacity of the popcount array
     10    , PARTIAL_SUM_INDEX     // pointer to the popcount partial sum array
     11    , POSITIVE_ARRAY_INDEX
     12    , NEGATED_ARRAY_INDEX
     13// -------------------------
     14    , POP_COUNT_MAX_FIELDS
     15};
     16
     17const auto REFERENCE_PROCESSED_COUNT = "PopRefProc";
     18
     19}
     20
     21/** ------------------------------------------------------------------------------------------------------------- *
     22 * @brief writePopCountComputationLogic
     23 *
     24 * Compute the partial sum of the popcount for every output stream that will potentially be used as a pop count
     25 * reference stream.
     26 ** ------------------------------------------------------------------------------------------------------------- */
     27inline void PipelineCompiler::writePopCountComputationLogic(BuilderRef b) {
     28
     29    forEachOutputBufferThatIsAPopCountReference(mKernelIndex, [&](const unsigned bufferVertex) {
     30
     31        // TODO: if we store the partial sum, we can save computation costs when
     32        // a particular reference is shared between multiple kernels. However,
     33        // unless we prove that every kernel that shares this buffer progresses
     34        // at the same rate, using the partial sum becomes more complicated as
     35        // we need to
     36
     37        const auto bufferPort = mBufferGraph[in_edge(bufferVertex, mBufferGraph)].Port;
     38        const Binding & output = mKernel->getOutputStreamSetBinding(bufferPort);
     39
     40        PopCountData & pc = getPopCountReference(bufferVertex);
     41
     42        const auto prefix = makeBufferName(mKernelIndex, output) + "_genPopCount";
     43        BasicBlock * const popCountBuild =
     44            b->CreateBasicBlock(prefix + "Build", mKernelLoopCall);
     45        BasicBlock * const popCountExpand =
     46            b->CreateBasicBlock(prefix + "Expand", mKernelLoopCall);
     47        BasicBlock * const popCountLoop =
     48            b->CreateBasicBlock(prefix + "Loop", mKernelLoopCall);
     49        BasicBlock * const popCountExit =
     50            b->CreateBasicBlock(prefix + "Exit", mKernelLoopCall);
     51
     52        IntegerType * const sizeTy = b->getSizeTy();
     53
     54        Constant * const ZERO = b->getSize(0);
     55        Constant * const ONE = b->getSize(1);
     56        Constant * const BLOCK_SIZE_MINUS_1 = b->getSize(b->getBitBlockWidth() - 1);
     57
     58        Constant * const LOG2_BLOCK_WIDTH = getLog2BlockWidth(b);
     59        Value * const consumed = getPopCountReferenceConsumedCount(b, bufferVertex);
     60        Value * const startIndex = b->CreateLShr(consumed, LOG2_BLOCK_WIDTH);
     61        Value * const produced = b->getNonDeferredProducedItemCount(output);
     62        // If this is the producer's final stride, round the index position up
     63        // to account for a partial stride.
     64        Value * const rounding = b->CreateSelect(mTerminatedPhi, BLOCK_SIZE_MINUS_1, ZERO);
     65        Value * const endIndex = b->CreateLShr(b->CreateAdd(produced, rounding), LOG2_BLOCK_WIDTH);
     66
     67
     68
     69
     70
     71        // TODO: if the source items of the consumes of this pop count ref
     72        // were already produced, we could limit how many items are considered
     73        // here. This would likely require that the lexographical ordering
     74        // took this into account and tried to insert an edge into the
     75        // graph provided it does not induce a cycle.
     76
     77        std::vector<Value *> indices(3);
     78        indices[0] = b->getInt32(0);
     79        indices[1] = b->getInt32(bufferVertex);
     80        indices[2] = b->getInt32(PARTIAL_SUM_INDEX);
     81        Value * const partialSumPtr = b->CreateGEP(mPopCountState, indices);
     82        Value * const basePartialSumArray = b->CreateLoad(partialSumPtr);
     83        assert (basePartialSumArray->getType()->isPointerTy());
     84
     85        indices[2] = b->getInt32(BASE_OFFSET_INDEX);
     86        Value * const offsetPtr = b->CreateGEP(mPopCountState, indices);
     87        b->CreateStore(startIndex, offsetPtr);
     88
     89        Value * const hasAnyStrides = b->CreateICmpNE(startIndex, endIndex);
     90        b->CreateLikelyCondBr(hasAnyStrides, popCountBuild, popCountExit);
     91
     92        b->SetInsertPoint(popCountBuild);
     93        indices[2] = b->getInt32(CAPACITY_INDEX);
     94        Value * const capacityPtr = b->CreateGEP(mPopCountState, indices);
     95        Value * const capacity = b->CreateLoad(capacityPtr);
     96
     97        Value * positiveArrayPtr = nullptr;
     98        Value * basePositiveArray = nullptr;
     99        if (pc.HasArray) {
     100            indices[2] = b->getInt32(POSITIVE_ARRAY_INDEX);
     101            positiveArrayPtr = b->CreateGEP(mPopCountState, indices);
     102            basePositiveArray = b->CreateLoad(positiveArrayPtr);
     103        }
     104
     105        Value * negativeArrayPtr = nullptr;
     106        Value * baseNegativeArray = nullptr;
     107        if (pc.HasNegatedArray) {
     108            indices[2] = b->getInt32(NEGATED_ARRAY_INDEX);
     109            negativeArrayPtr = b->CreateGEP(mPopCountState, indices);
     110            baseNegativeArray = b->CreateLoad(negativeArrayPtr);
     111        }
     112
     113        Value * const required = b->CreateAdd(b->CreateSub(endIndex, startIndex), ONE);
     114        Value * const arrayIsLargeEnough = b->CreateICmpULE(required, capacity);
     115        BasicBlock * const popCountCheckEnd = b->GetInsertBlock();
     116        b->CreateLikelyCondBr(arrayIsLargeEnough, popCountLoop, popCountExpand);
     117
     118        // Expand the popcount array buffer if it is not large enough
     119        b->SetInsertPoint(popCountExpand);
     120        Value * const newCapacity = b->CreateRoundUp(required, capacity);
     121        b->CreateStore(newCapacity, capacityPtr);
     122        Value * const newPartialSum = b->CreateRealloc(sizeTy, basePartialSumArray, newCapacity);
     123        b->CreateStore(newPartialSum, partialSumPtr);
     124        b->CreateStore(ZERO, newPartialSum);
     125        Value * newPositiveArray = nullptr;
     126        if (basePositiveArray) {
     127            newPositiveArray = b->CreateRealloc(sizeTy, basePositiveArray, newCapacity);
     128            b->CreateStore(newPositiveArray, positiveArrayPtr);
     129        }
     130        Value * newNegativeArray = nullptr;
     131        if (baseNegativeArray) {
     132            newNegativeArray = b->CreateRealloc(sizeTy, baseNegativeArray, newCapacity);
     133            b->CreateStore(newNegativeArray, negativeArrayPtr);
     134        }
     135        BasicBlock * const popCountExpandExit = b->GetInsertBlock();
     136        b->CreateBr(popCountLoop);
     137
     138        // count up the actual entries
     139        b->SetInsertPoint(popCountLoop);
     140        PHINode * const partialSumArray = b->CreatePHI(basePartialSumArray->getType(), 3);
     141        partialSumArray->addIncoming(basePartialSumArray, popCountCheckEnd);
     142        partialSumArray->addIncoming(newPartialSum, popCountExpandExit);
     143        PHINode * positiveArray = nullptr;
     144        if (basePositiveArray) {
     145            positiveArray = b->CreatePHI(basePositiveArray->getType(), 3);
     146            positiveArray->addIncoming(basePositiveArray, popCountCheckEnd);
     147            positiveArray->addIncoming(newPositiveArray, popCountExpandExit);
     148        }
     149        PHINode * negativeArray = nullptr;
     150        if (baseNegativeArray) {
     151            negativeArray = b->CreatePHI(baseNegativeArray->getType(), 3);
     152            negativeArray->addIncoming(baseNegativeArray, popCountCheckEnd);
     153            negativeArray->addIncoming(newNegativeArray, popCountExpandExit);
     154        }
     155        PHINode * const sourceIndex = b->CreatePHI(b->getSizeTy(), 3);
     156        sourceIndex->addIncoming(startIndex, popCountCheckEnd);
     157        sourceIndex->addIncoming(startIndex, popCountExpandExit);
     158        PHINode * const arrayIndex = b->CreatePHI(b->getSizeTy(), 3);
     159        arrayIndex->addIncoming(ZERO, popCountCheckEnd);
     160        arrayIndex->addIncoming(ZERO, popCountExpandExit);
     161        PHINode * const totalSum = b->CreatePHI(sizeTy, 3);
     162        totalSum->addIncoming(ZERO, popCountCheckEnd);
     163        totalSum->addIncoming(ZERO, popCountExpandExit);
     164
     165        const StreamSetBuffer * const buffer = getOutputBuffer(bufferPort);
     166        Value * const dataPtr = buffer->getStreamBlockPtr(b.get(), ZERO, sourceIndex);
     167        Value * markers = b->CreateBlockAlignedLoad(dataPtr);
     168
     169        // If this popcount field is only used for negated rates, just invert the markers.
     170        if (LLVM_UNLIKELY(pc.AlwaysNegated)) {
     171            markers = b->CreateNot(markers);
     172        }
     173
     174        // TODO: parallelize the partial sum when we're reasonably sure that we'll
     175        // have enough data to be worth it on average. Ideally, we'd also want
     176        // to know whether we'd ever need to finish counting sequentially.
     177
     178        Value * const count = b->CreateZExtOrTrunc(b->bitblock_popcount(markers), sizeTy);
     179
     180        if (positiveArray) { assert (!pc.AlwaysNegated);
     181            b->CreateStore(count, b->CreateGEP(positiveArray, arrayIndex));
     182        }
     183
     184        if (negativeArray) {
     185            Value * negCount = count;
     186            if (!pc.AlwaysNegated) {
     187                Constant * blockWidth = b->getSize(b->getBitBlockWidth());
     188                negCount = b->CreateSub(blockWidth, count);
     189            }
     190            b->CreateStore(negCount, b->CreateGEP(negativeArray, arrayIndex));
     191        }
     192
     193        Value * const partialSum = b->CreateAdd(totalSum, count);
     194        Value * const nextArrayIndex = b->CreateAdd(arrayIndex, ONE);
     195        b->CreateStore(partialSum, b->CreateGEP(partialSumArray, nextArrayIndex));
     196
     197        BasicBlock * const popCountLoopExit = b->GetInsertBlock();
     198        partialSumArray->addIncoming(partialSumArray, popCountLoopExit);
     199        Value * const nextSourceIndex = b->CreateAdd(sourceIndex, ONE);
     200        sourceIndex->addIncoming(nextSourceIndex, popCountLoopExit);
     201        arrayIndex->addIncoming(nextArrayIndex, popCountLoopExit);
     202        totalSum->addIncoming(partialSum, popCountLoopExit);
     203        if (positiveArray) {
     204            positiveArray->addIncoming(positiveArray, popCountLoopExit);
     205        }
     206        if (negativeArray) {
     207            negativeArray->addIncoming(negativeArray, popCountLoopExit);
     208        }
     209        Value * const hasMoreStrides = b->CreateICmpNE(nextSourceIndex, endIndex);
     210        b->CreateLikelyCondBr(hasMoreStrides, popCountLoop, popCountExit);
     211
     212        b->SetInsertPoint(popCountExit);
     213    });
    10214}
    11215
     
    13217 * @brief getInitialPopCount
    14218 ** ------------------------------------------------------------------------------------------------------------- */
    15 Value * PipelineCompiler::getInitialNumOfLinearPopCountItems(BuilderRef b, const ProcessingRate & rate) {
    16     assert (rate.isPopCount() || rate.isNegatedPopCount());
    17     Port refPort; unsigned refIndex;
    18     std::tie(refPort, refIndex) = mKernel->getStreamPort(rate.getReference());
    19     assert (refPort == Port::Input);
    20     PopCountData & popCount = findOrAddPopCountData(b, refIndex, rate.isNegatedPopCount());
    21     return getInitialNumOfLinearPopCountItems(b, popCount, refIndex, rate.isNegatedPopCount());
    22 }
    23 
    24 /** ------------------------------------------------------------------------------------------------------------- *
    25  * @brief getInitialPopCount
    26  ** ------------------------------------------------------------------------------------------------------------- */
    27 Value * PipelineCompiler::getInitialNumOfLinearPopCountItems(BuilderRef b, PopCountData & pc, const unsigned index, const bool negated) {
    28     // If we've already counted this stream, return the correct pop count
    29     if (pc.initial) {
    30         return pc.initial;
    31     }
    32     // Otherwise calculate it and store it for reuse.
    33     Value * markers = getSourceMarkers(b, pc, index, nullptr);
    34     if (negated) {
    35         markers = b->CreateNot(markers);
    36     }
    37     Value * const count = b->bitblock_popcount(markers);
    38     pc.initial = count;
    39     return count;
     219Value * PipelineCompiler::getMinimumNumOfLinearPopCountItems(BuilderRef b, const Binding & binding) {
     220
     221    const ProcessingRate & rate = binding.getRate();
     222    const auto bufferVertex = getPopCountReferenceBuffer(mKernel, rate);
     223    PopCountData & pc = getPopCountReference(bufferVertex);
     224
     225    std::vector<Value *> indices(3);
     226    indices[0] = b->getInt32(0);
     227    indices[1] = b->getInt32(bufferVertex);
     228
     229    Value * const offset = getPopCountInitialOffset(b, binding, bufferVertex, pc);
     230
     231    Value * minCount = nullptr;
     232    bool invertCount = false;
     233
     234    // If we have the independent count, we can read from it to get a count
     235    // w/o subtracting the i-1 th entry of the partial sum.
     236    if (pc.HasArray || pc.HasNegatedArray) {
     237        const bool isNegPopCount = rate.isNegatedPopCount() && pc.HasNegatedArray;
     238        const bool useNegArray = (isNegPopCount || !pc.HasArray);
     239        indices[2] = b->getInt32(useNegArray ? NEGATED_ARRAY_INDEX : POSITIVE_ARRAY_INDEX);
     240        Value * const array = b->CreateLoad(b->CreateGEP(mPopCountState, indices));
     241        minCount = b->CreateLoad(b->CreateGEP(array, offset));
     242        invertCount = useNegArray ^ rate.isNegatedPopCount();
     243    } else { // use the partial sum to calculate the min item count
     244        indices[2] = b->getInt32(PARTIAL_SUM_INDEX);
     245        Value * const partialSum = b->CreateLoad(b->CreateGEP(mPopCountState, indices));
     246        Value * const prior = b->CreateLoad(b->CreateGEP(partialSum, offset));
     247        Constant * const ONE = b->getSize(1);
     248        Value * const current = b->CreateAdd(offset, ONE);
     249        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     250            const auto refPortNum = getPopCountReferencePort(mKernel, rate);
     251            Value * const total = getTotalItemCount(b, refPortNum);
     252            Value * const strideLength = getInputStrideLength(b, refPortNum);
     253            Value * const term = hasProducerTerminated(b, refPortNum);
     254            Value * const strideLengthMinus1 = b->CreateSub(strideLength, ONE);
     255            Value * const padding = b->CreateSelect(term, strideLengthMinus1, b->getSize(0));
     256            Value * const paddedTotal =  b->CreateAdd(total, padding);
     257            Value * const countLength = b->CreateUDiv(paddedTotal, strideLength);
     258            Value * const sanityCheck = b->CreateICmpULE(current, countLength);
     259            b->CreateAssert(sanityCheck,
     260                            mKernel->getName() + "_" + binding.getName() + ":"
     261                            " pop count minimum linear item count lookup"
     262                            " exceeds reference stream length");
     263        }
     264        Value * const itemCount = b->CreateLoad(b->CreateGEP(partialSum, current));
     265        minCount = b->CreateSub(itemCount, prior);
     266        invertCount = rate.isNegatedPopCount() ^ pc.AlwaysNegated;
     267    }
     268
     269    if (invertCount) {
     270        Constant * const BlockWidth = b->getSize(b->getBitBlockWidth());
     271        minCount = b->CreateSub(BlockWidth, minCount);
     272    }
     273    #ifdef PRINT_DEBUG_MESSAGES
     274    const auto prefix = makeBufferName(mKernelIndex, binding);
     275    b->CallPrintInt(prefix + "_minCount", minCount);
     276    #endif
     277    return minCount;
    40278}
    41279
     
    43281 * @brief getMaximumNumOfPopCountStrides
    44282 ** ------------------------------------------------------------------------------------------------------------- */
    45 inline Value * PipelineCompiler::getMaximumNumOfPopCountStrides(BuilderRef b, const ProcessingRate & rate) {
    46     return makePopCountArray(b, rate).maximumNumOfStrides;
    47 }
    48 
    49 /** ------------------------------------------------------------------------------------------------------------- *
    50  * @brief getPopCountArray
    51  ** ------------------------------------------------------------------------------------------------------------- */
    52 inline Value * PipelineCompiler::getPopCountArray(BuilderRef b, const unsigned index) {
    53     return b->CreateLoad(makePopCountArray(b, index, false).individualCountArray);
    54 }
    55 
    56 /** ------------------------------------------------------------------------------------------------------------- *
    57  * @brief getNegatedPopCountArray
    58  ** ------------------------------------------------------------------------------------------------------------- */
    59 inline Value * PipelineCompiler::getNegatedPopCountArray(BuilderRef b, const unsigned index) {
    60     return b->CreateLoad(makePopCountArray(b, index, true).individualCountArray);
    61 }
    62 
    63 /** ------------------------------------------------------------------------------------------------------------- *
    64  * @brief getNumOfLinearPopCountItems
    65  ** ------------------------------------------------------------------------------------------------------------- */
    66 Value * PipelineCompiler::getNumOfLinearPopCountItems(BuilderRef b, const ProcessingRate & rate, Value * const numOfStrides) {
    67     PopCountData & popCount = makePopCountArray(b, rate);
    68     ConstantInt * const ONE = b->getSize(1);
    69     Value *  const strideIndex = b->CreateSub(numOfStrides, ONE);
    70     return b->CreateLoad(b->CreateGEP(b->CreateLoad(popCount.partialSumArray), strideIndex));
    71 }
    72 
    73 /** ------------------------------------------------------------------------------------------------------------- *
    74  * @brief allocateLocalPopCountArray
    75  ** ------------------------------------------------------------------------------------------------------------- */
    76 inline void PipelineCompiler::allocateLocalPopCountArray(BuilderRef b, const ProcessingRate & rate) {
    77     assert (rate.isPopCount() || rate.isNegatedPopCount());
    78     Port port; unsigned index;
    79     std::tie(port, index) = mKernel->getStreamPort(rate.getReference());
    80     assert (port == Port::Input);
    81     const auto negated = rate.isNegatedPopCount();
    82     PopCountData & pc = findOrAddPopCountData(b, index, negated);
    83     if (pc.partialSumArray == nullptr) {
    84         IntegerType * const sizeTy = b->getSizeTy();
    85         Constant * maxNumOfStrides = b->getSize(ceiling(mBufferGraph[mKernelIndex].upper));
    86         pc.strideCapacity = b->CreateAlloca(sizeTy);
    87         b->CreateStore(maxNumOfStrides, pc.strideCapacity);
    88 
    89         Constant * const sizeOfSizeTy = ConstantExpr::getTrunc(ConstantExpr::getSizeOf(sizeTy), sizeTy, true);
    90         Constant * const arraySize = ConstantExpr::getMul(maxNumOfStrides, sizeOfSizeTy);
    91         PointerType * const sizePtrTy = sizeTy->getPointerTo();
    92         Value * const ptr = b->CreatePointerCast(b->CreateCacheAlignedMalloc(arraySize), sizePtrTy);
    93 
    94         pc.partialSumArray = b->CreateAlloca(sizePtrTy);
    95         b->CreateStore(ptr, pc.partialSumArray);
    96 
    97         const Binding & input = mKernel->getInputStreamSetBinding(index);
    98         if (input.hasAttribute(negated ? AttrId::RequiresNegatedPopCountArray : AttrId::RequiresPopCountArray)) {
    99             pc.individualCountArray = b->CreateAlloca(sizePtrTy);
    100             Value * const ptr = b->CreatePointerCast(b->CreateCacheAlignedMalloc(arraySize), sizePtrTy);
    101             b->CreateStore(ptr, pc.individualCountArray);
    102         }
    103     }
    104 }
    105 
    106 /** ------------------------------------------------------------------------------------------------------------- *
    107  * @brief deallocateLocalPopCountArray
    108  ** ------------------------------------------------------------------------------------------------------------- */
    109 inline void PipelineCompiler::deallocateLocalPopCountArray(BuilderRef b, const ProcessingRate & rate) {
    110     PopCountData & pc = findOrAddPopCountData(b, rate);
    111     if (pc.partialSumArray) {
    112         b->CreateFree(b->CreateLoad(pc.partialSumArray));
    113         pc.partialSumArray = nullptr;
    114         if (pc.individualCountArray) {
    115             b->CreateFree(b->CreateLoad(pc.individualCountArray));
    116             pc.individualCountArray = nullptr;
    117         }
    118     }
    119 }
    120 
    121 /** ------------------------------------------------------------------------------------------------------------- *
    122  * @brief makePopCountArray
    123  ** ------------------------------------------------------------------------------------------------------------- */
    124 PopCountData & PipelineCompiler::makePopCountArray(BuilderRef b, const ProcessingRate & rate) {
    125     assert (rate.isPopCount() || rate.isNegatedPopCount());
    126     Port refPort; unsigned refIndex;
    127     std::tie(refPort, refIndex) = mKernel->getStreamPort(rate.getReference());
    128     assert (refPort == Port::Input);
    129     return makePopCountArray(b, refIndex, rate.isNegatedPopCount());
    130 }
    131 
    132 /** ------------------------------------------------------------------------------------------------------------- *
    133  * @brief makePopCountArray
    134  ** ------------------------------------------------------------------------------------------------------------- */
    135 PopCountData & PipelineCompiler::makePopCountArray(BuilderRef b, const unsigned index, const bool negated) {
    136 
    137     assert (mNumOfLinearStrides);
    138     PopCountData & pc = findOrAddPopCountData(b, index, negated);
    139     assert (pc.partialSumArray);
    140 
    141     // TODO: if the same popcounts were shared amongst multiple kernels but the state is preallocated,
    142     // we could attempt to "append" to the prior array(s) and reuse the calculations.
    143     if (LLVM_UNLIKELY(pc.hasConstructedArray == mKernelIndex)) {
    144         return pc;
    145     }
    146 
    147     // TODO: if we have both a popcount and negated popcount that refer to the same reference stream,
    148     // we could optimize the calculation and compute both at the same time.
    149 
    150     // NOTE: this won't correctly handle the case where we have two input ports that are popcount
    151     // reference streams that accept the same stream set buffer but are processed at different rates.
    152     // To fix this, the mapping must take the potential item position(s) imposed by reference rate
    153     // into account.
    154 
    155     const Binding & input = mKernel->getInputStreamSetBinding(index);
    156 
    157     BasicBlock * const entryBlock = b->GetInsertBlock();
    158 
    159     const auto prefix = mKernel->getName() + "_" + input.getName() + "_popCountRate";
    160     BasicBlock * const popCountExpand =
    161         b->CreateBasicBlock(prefix + "Expand", mKernelLoopCall);
    162     BasicBlock * const popCountStart =
    163         b->CreateBasicBlock(prefix + "Entry", mKernelLoopCall);
     283Value * PipelineCompiler::getMaximumNumOfPopCountStrides(BuilderRef b, const Binding & binding, not_null<Value *> sourceItemCount, Constant * const lookAhead) {
     284
     285    const ProcessingRate & rate = binding.getRate();
     286    const auto bufferVertex = getPopCountReferenceBuffer(mKernel, rate);
     287    PopCountData & pc = getPopCountReference(bufferVertex);
     288
     289    IntegerType * const sizeTy = b->getSizeTy();
     290
     291    std::vector<Value *> indices(3);
     292    indices[0] = b->getInt32(0);
     293    indices[1] = b->getInt32(bufferVertex);
     294    indices[2] = b->getInt32(PARTIAL_SUM_INDEX);
     295
     296    Value * const array = b->CreateLoad(b->CreateGEP(mPopCountState, indices));
     297    const auto prefix = makeBufferName(mKernelIndex, binding) + "_readPopCount";
     298
     299    BasicBlock * const popCountEntry =
     300        b->GetInsertBlock();
    164301    BasicBlock * const popCountLoop =
    165302        b->CreateBasicBlock(prefix + "Loop", mKernelLoopCall);
    166     BasicBlock * const checkedNumOfItems =
    167         b->CreateBasicBlock(prefix + "Next", mKernelLoopCall);
    168303    BasicBlock * const popCountExit =
    169304        b->CreateBasicBlock(prefix + "Exit", mKernelLoopCall);
    170305
    171     Value * const strideCapacity = b->CreateLoad(pc.strideCapacity);
    172 
    173     Value * const initialPartialSumArray = b->CreateLoad(pc.partialSumArray);
    174     Value * initialIndividualCountArray = nullptr;
    175     if (pc.individualCountArray) {
    176         initialIndividualCountArray = b->CreateLoad(pc.individualCountArray);
    177     }
    178 
    179     Value * const arrayIsLargeEnough = b->CreateICmpULE(mNumOfLinearStrides, strideCapacity);
    180     b->CreateLikelyCondBr(arrayIsLargeEnough, popCountStart, popCountExpand);
    181