Changeset 5755


Ignore:
Timestamp:
Dec 3, 2017, 12:40:40 PM (15 months ago)
Author:
nmedfort
Message:

Bug fixes and simplified MultiBlockKernel logic

Location:
icGREP/icgrep-devel/icgrep
Files:
4 added
57 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r5753 r5755  
    6868endif()
    6969
    70 SET(KERNEL_SRC kernels/interface.cpp kernels/kernel.cpp kernels/streamset.cpp kernels/kernel_builder.cpp)
     70SET(KERNEL_SRC kernels/attributes.cpp kernels/processing_rate.cpp kernels/interface.cpp kernels/kernel.cpp kernels/streamset.cpp kernels/kernel_builder.cpp)
    7171SET(KERNEL_SRC ${KERNEL_SRC} kernels/source_kernel.cpp kernels/s2p_kernel.cpp kernels/deletion.cpp kernels/swizzle.cpp kernels/p2s_kernel.cpp kernels/stdout_kernel.cpp)
    7272
     
    246246  COMMAND ./run_all "${CMAKE_BINARY_DIR}/u8u16 -segment-size=16 -enable-segment-pipeline-parallel")
    247247
    248 #add_test(
    249 #  NAME lz4d_test
    250 #  WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/../QA/lz4d
    251 #  COMMAND ./run_all ${CMAKE_BINARY_DIR}/lz4d)
     248add_test(
     249  NAME lz4d_test
     250  WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/../QA/lz4d
     251  COMMAND ./run_all ${CMAKE_BINARY_DIR}/lz4d)
    252252
    253253add_test(
  • icGREP/icgrep-devel/icgrep/IR_Gen/CBuilder.cpp

    r5746 r5755  
    5252}
    5353
     54#ifdef HAS_ADDRESS_SANITIZER
     55Value * checkHeapAddress(CBuilder * const b, Value * const Ptr, Value * const Size) {
     56    Module * const m = b->getModule();
     57    PointerType * const voidPtrTy = b->getVoidPtrTy();
     58    IntegerType * const sizeTy = b->getSizeTy();
     59    Function * isPoisoned = m->getFunction("__asan_region_is_poisoned");
     60    if (LLVM_UNLIKELY(isPoisoned == nullptr)) {
     61        isPoisoned = Function::Create(FunctionType::get(voidPtrTy, {voidPtrTy, sizeTy}, false), Function::ExternalLinkage, "__asan_region_is_poisoned", m);
     62        isPoisoned->setCallingConv(CallingConv::C);
     63        isPoisoned->setReturnDoesNotAlias();
     64        #if LLVM_VERSION_INTEGER < LLVM_5_0_0
     65        isPoisoned->setDoesNotAlias(1);
     66        #endif
     67    }
     68    Value * const addr = b->CreatePointerCast(Ptr, voidPtrTy);
     69    Value * check = b->CreateCall(isPoisoned, { addr, b->CreateTrunc(Size, sizeTy) });
     70    return b->CreateICmpEQ(check, ConstantPointerNull::get(cast<PointerType>(isPoisoned->getReturnType())));
     71}
     72#define CHECK_HEAP_ADDRESS(Ptr, Size, Name) \
     73if (LLVM_UNLIKELY(hasAddressSanitizer())) { \
     74    CreateAssert(checkHeapAddress(this, Ptr, Size), Name " was given unallocated memory address"); \
     75}
     76#else
     77#define CHECK_HEAP_ADDRESS(Ptr, Size, Name)
     78#endif
     79
     80static AllocaInst * resolveStackAddress(Value * Ptr) {
     81    for (;;) {
     82        if (GetElementPtrInst * gep = dyn_cast<GetElementPtrInst>(Ptr)) {
     83            Ptr = gep->getPointerOperand();
     84        } else if (CastInst * ci = dyn_cast<CastInst>(Ptr)) {
     85            Ptr = ci->getOperand(0);
     86        } else {
     87            return dyn_cast<AllocaInst>(Ptr);
     88        }
     89    }
     90}
     91
     92static Value * checkStackAddress(CBuilder * const b, Value * const Ptr, Value * const Size, AllocaInst * const Base) {
     93    DataLayout DL(b->getModule());
     94    IntegerType * const intPtrTy = cast<IntegerType>(DL.getIntPtrType(Ptr->getType()));
     95    Value * sz = ConstantExpr::getTrunc(ConstantExpr::getSizeOf(Base->getAllocatedType()), intPtrTy);
     96    if (dyn_cast_or_null<Constant>(Base->getArraySize()) && !cast<Constant>(Base->getArraySize())->isNullValue()) {
     97        sz = b->CreateMul(sz, b->CreateZExtOrTrunc(Base->getArraySize(), intPtrTy));
     98    }
     99    Value * const p = b->CreatePtrToInt(Ptr, intPtrTy);
     100    Value * const s = b->CreatePtrToInt(Base, intPtrTy);
     101    Value * const e = b->CreateAdd(s, b->CreateSub(sz, b->CreateZExtOrTrunc(Size, intPtrTy)));
     102    return b->CreateAnd(b->CreateICmpUGE(p, s), b->CreateICmpULE(p, e));
     103}
     104
     105#define CHECK_ADDRESS(Ptr, Size, Name) \
     106    CreateAssert(Ptr, Name " was given a null address"); \
     107    if (AllocaInst * Base = resolveStackAddress(Ptr)) { \
     108        CreateAssert(checkStackAddress(this, Ptr, Size, Base), Name " was given an invalid stack address"); \
     109    } else { \
     110        CHECK_HEAP_ADDRESS(Ptr, Size, Name) \
     111    }
     112
    54113Value * CBuilder::CreateURem(Value * const number, Value * const divisor, const Twine & Name) {
    55114    if (ConstantInt * c = dyn_cast<ConstantInt>(divisor)) {
     
    59118            return CreateAnd(number, ConstantInt::get(divisor->getType(), d - 1), Name);
    60119        }
    61     } else {
    62         CreateAssert(divisor, "CreateURem divisor cannot be 0!");
    63     }
     120    }
     121    CreateAssert(divisor, "CreateURem divisor cannot be 0!");
    64122    return Insert(BinaryOperator::CreateURem(number, divisor), Name);
    65123}
     
    83141Value * CBuilder::CreateUDivCeil(Value * const number, Value * const divisor, const Twine & Name) {
    84142    assert (number->getType() == divisor->getType());
     143    Type * const t = number->getType();
     144    Value * const n = CreateAdd(number, CreateSub(divisor, ConstantInt::get(t, 1)));
    85145    if (isa<ConstantInt>(divisor)) {
    86         return CreateUDivCeil(number, cast<ConstantInt>(divisor)->getZExtValue(), Name);
     146        const auto d = cast<ConstantInt>(divisor)->getZExtValue();
     147        if (is_power_2(d)) {
     148            if (d > 1) {
     149                return CreateLShr(n, ConstantInt::get(t, std::log2(d)), Name);
     150            } else {
     151                return number;
     152            }
     153        }
    87154    }
    88155    CreateAssert(divisor, "CreateUDivCeil divisor cannot be 0!");
    89     Constant * const one = ConstantInt::get(divisor->getType(), 1);
    90     return CreateUDiv(CreateAdd(number, CreateSub(divisor, one)), divisor, Name);
    91 }
    92 
    93 Value * CBuilder::CreateUDivCeil(Value * const number, const uint64_t divisor, const Twine & Name) {
    94     assert ("CreateUDivCeil divisor cannot be 0!" && divisor);
    95     Type * const t = number->getType();
    96     Value * const n = CreateAdd(number, ConstantInt::get(t, divisor - 1));
    97     if (is_power_2(divisor)) {
    98         if (divisor > 1) {
    99             return CreateLShr(n, ConstantInt::get(t, std::log2(divisor)), Name);
    100         } else {
    101             return number;
    102         }
    103     }
    104     Constant * const d = ConstantInt::get(t, divisor);
    105     CreateAssert(d, "CreateUDivCeil divisor cannot be 0!");
    106     return CreateUDiv(n, d, Name);
     156    return CreateUDiv(n, divisor, Name);
    107157}
    108158
     
    115165    Function * openFn = m->getFunction("open");
    116166    if (openFn == nullptr) {
    117         IntegerType * int32Ty = getInt32Ty();
    118         PointerType * int8PtrTy = getInt8PtrTy();
     167        IntegerType * const int32Ty = getInt32Ty();
     168        PointerType * const int8PtrTy = getInt8PtrTy();
    119169        openFn = cast<Function>(m->getOrInsertFunction("open",
    120170                                                         int32Ty, int8PtrTy, int32Ty, int32Ty, nullptr));
     
    125175// ssize_t write(int fildes, const void *buf, size_t nbyte);
    126176Value * CBuilder::CreateWriteCall(Value * fileDescriptor, Value * buf, Value * nbyte) {
    127     PointerType * voidPtrTy = getVoidPtrTy();
     177    PointerType * const voidPtrTy = getVoidPtrTy();
    128178    Module * const m = getModule();
    129179    Function * write = m->getFunction("write");
    130180    if (write == nullptr) {
    131         IntegerType * sizeTy = getSizeTy();
    132         IntegerType * int32Ty = getInt32Ty();
     181        IntegerType * const sizeTy = getSizeTy();
     182        IntegerType * const int32Ty = getInt32Ty();
    133183        write = cast<Function>(m->getOrInsertFunction("write",
    134184#if LLVM_VERSION_INTEGER < LLVM_5_0_0
    135                                                         AttributeSet().addAttribute(getContext(), 2U, Attribute::NoAlias),
     185        AttributeSet().addAttribute(getContext(), 2U, Attribute::NoAlias),
    136186#else
    137                                                         AttributeList().addAttribute(getContext(), 2U, Attribute::NoAlias),
     187        AttributeList().addAttribute(getContext(), 2U, Attribute::NoAlias),
    138188#endif
    139                                                         sizeTy, int32Ty, voidPtrTy, sizeTy, nullptr));
     189        sizeTy, int32Ty, voidPtrTy, sizeTy, nullptr));
    140190    }
    141191    buf = CreatePointerCast(buf, voidPtrTy);
     192    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     193        CHECK_ADDRESS(buf, nbyte, "CreateWriteCall");
     194    }
    142195    return CreateCall(write, {fileDescriptor, buf, nbyte});
    143196}
    144197
    145198Value * CBuilder::CreateReadCall(Value * fileDescriptor, Value * buf, Value * nbyte) {
    146     PointerType * voidPtrTy = getVoidPtrTy();
     199    PointerType * const voidPtrTy = getVoidPtrTy();
    147200    Module * const m = getModule();
    148201    Function * readFn = m->getFunction("read");
    149202    if (readFn == nullptr) {
    150         IntegerType * sizeTy = getSizeTy();
    151         IntegerType * int32Ty = getInt32Ty();
     203        IntegerType * const sizeTy = getSizeTy();
     204        IntegerType * const int32Ty = getInt32Ty();
    152205        readFn = cast<Function>(m->getOrInsertFunction("read",
    153206#if LLVM_VERSION_INTEGER < LLVM_5_0_0
    154                                                          AttributeSet().addAttribute(getContext(), 2U, Attribute::NoAlias),
     207        AttributeSet().addAttribute(getContext(), 2U, Attribute::NoAlias),
    155208#else
    156                                                          AttributeList().addAttribute(getContext(), 2U, Attribute::NoAlias),
     209        AttributeList().addAttribute(getContext(), 2U, Attribute::NoAlias),
    157210#endif
    158                                                          sizeTy, int32Ty, voidPtrTy, sizeTy, nullptr));
     211        sizeTy, int32Ty, voidPtrTy, sizeTy, nullptr));
    159212    }
    160213    buf = CreatePointerCast(buf, voidPtrTy);
     214    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     215        CHECK_ADDRESS(buf, nbyte, "CreateReadCall");
     216    }
    161217    return CreateCall(readFn, {fileDescriptor, buf, nbyte});
    162218}
     
    332388}
    333389
    334 Value * CBuilder::CreateCacheAlignedMalloc(Value * size) {
    335     const auto alignment = getCacheAlignment();
    336     if (LLVM_LIKELY(isa<Constant>(size))) {
    337         Constant * const align = ConstantInt::get(size->getType(), alignment, false);
    338         Constant * offset = ConstantExpr::getURem(cast<Constant>(size), align);
    339         if (!offset->isNullValue()) {
    340             size = ConstantExpr::getAdd(cast<Constant>(size), ConstantExpr::getSub(align, offset));
    341         }
    342     }
    343     return CreateAlignedMalloc(size, alignment);
    344 }
    345 
    346390Value * CBuilder::CreateAlignedMalloc(Value * size, const unsigned alignment) {
    347     if (LLVM_UNLIKELY((alignment & (alignment - 1)) != 0)) {
     391    if (LLVM_UNLIKELY(!is_power_2(alignment))) {
    348392        report_fatal_error("CreateAlignedMalloc: alignment must be a power of 2");
    349393    }
     
    351395    IntegerType * const sizeTy = getSizeTy();
    352396    PointerType * const voidPtrTy = getVoidPtrTy();
    353 
     397    ConstantInt * const align = ConstantInt::get(sizeTy, alignment);
     398    ConstantInt * const alignMask = ConstantInt::get(sizeTy, alignment - 1);
    354399    size = CreateZExtOrTrunc(size, sizeTy);
    355     ConstantInt * const align = ConstantInt::get(sizeTy, alignment);
    356     if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
    357         CreateAssertZero(CreateURem(size, align), "CreateAlignedMalloc: size must be an integral multiple of alignment.");
    358     }
     400    Value * const offset = CreateAnd(size, alignMask);
     401    size = CreateSelect(CreateIsNull(offset), size, CreateAdd(size, CreateXor(offset, alignMask)));
     402    CreateAssertZero(CreateURem(size, align), "CreateAlignedMalloc: size must be an integral multiple of alignment.");
    359403    Value * ptr = nullptr;
    360404    if (hasAlignedAlloc()) {
     
    374418            f->setCallingConv(CallingConv::C);
    375419            f->setReturnDoesNotAlias();
    376 #if LLVM_VERSION_INTEGER < LLVM_5_0_0
     420            #if LLVM_VERSION_INTEGER < LLVM_5_0_0
    377421            f->setDoesNotAlias(1);
    378 #endif
     422            #endif
    379423        }
    380424        Value * handle = CreateAlloca(voidPtrTy);
     
    466510    }
    467511    Value * ptr = CreateCall(fMMap, {addr, size, prot, flags, fd, offset});
    468     if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
     512    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    469513        DataLayout DL(m);
    470514        IntegerType * const intTy = getIntPtrTy(DL);
     
    556600        ConstantInt * const flags = ConstantInt::get(intTy, MREMAP_MAYMOVE);
    557601        ptr = CreateCall(fMRemap, {addr, oldSize, newSize, flags});
    558         if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
     602        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    559603            Value * success = CreateICmpNE(CreatePtrToInt(addr, intTy), ConstantInt::getAllOnesValue(intTy)); // MAP_FAILED = -1
    560604            CreateAssert(success, "CreateMRemap: mremap failed to allocate memory");
     
    578622    }
    579623    len = CreateZExtOrTrunc(len, sizeTy);
    580     if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
     624    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    581625        DataLayout DL(getModule());
    582626        IntegerType * const intPtrTy = getIntPtrTy(DL);
     
    592636}
    593637
     638Value * CBuilder::CreateMProtect(Value * addr, Value * size, const int protect) {
     639    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     640        // mprotect() changes the access protections for the calling process's
     641        // memory pages containing any part of the address range in the interval
     642        // [addr, addr+len-1].  addr must be aligned to a page boundary.
     643
     644        // mprotect(): POSIX.1-2001, POSIX.1-2008, SVr4.  POSIX says that the
     645        // behavior of mprotect() is unspecified if it is applied to a region of
     646        // memory that was not obtained via mmap(2).
     647
     648        // On Linux, it is always permissible to call mprotect() on any address
     649        // in a process's address space (except for the kernel vsyscall area).
     650        // In particular, it can be used to change existing code mappings to be
     651        // writable.
     652
     653//        Triple T(mTriple);
     654//        if (!T.isOSLinux()) {
     655//            DataLayout DL(getModule());
     656//            IntegerType * const intPtrTy = getIntPtrTy(DL);
     657//            Value * a = CreatePtrToInt(addr, intPtrTy);
     658//            Constant * const pageSize = ConstantInt::get(intPtrTy, getpagesize());
     659//            CreateAssertZero(CreateURem(a, pageSize), "CreateMProtect: addr must be aligned to page boundary on non-Linux architectures");
     660//        }
     661    }
     662
     663    IntegerType * const sizeTy = getSizeTy();
     664    PointerType * const voidPtrTy = getVoidPtrTy();
     665    IntegerType * const int32Ty = getInt32Ty();
     666
     667    Module * const m = getModule();
     668    Function * mprotectFunc = m->getFunction("mprotect");
     669    if (LLVM_UNLIKELY(mprotectFunc == nullptr)) {
     670        FunctionType * const fty = FunctionType::get(sizeTy, {voidPtrTy, sizeTy, int32Ty}, false);
     671        mprotectFunc = Function::Create(fty, Function::ExternalLinkage, "mprotect", m);
     672    }
     673    addr = CreatePointerCast(addr, voidPtrTy);
     674    size = CreateZExtOrTrunc(size, sizeTy);
     675    return CreateCall(mprotectFunc, {addr, size, ConstantInt::get(int32Ty, (int)protect)});
     676
     677}
     678
    594679IntegerType * CBuilder::getIntAddrTy() const {
    595680    return TypeBuilder<intptr_t, false>::get(getContext());
    596681}
    597682
    598 PointerType * CBuilder::getVoidPtrTy() const {
    599     return TypeBuilder<void *, true>::get(getContext());
     683PointerType * CBuilder::getVoidPtrTy(const unsigned AddressSpace) const {
     684    return PointerType::get(Type::getVoidTy(getContext()), AddressSpace);
    600685}
    601686
     
    644729    }
    645730    ptr = CreatePointerCast(ptr, voidPtrTy);
     731    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     732        CHECK_ADDRESS(ptr, CreateMul(size, nitems), "CreateFReadCall");
     733    }
    646734    return CreateCall(fReadFunc, {ptr, size, nitems, stream});
    647735}
     
    658746    }
    659747    ptr = CreatePointerCast(ptr, voidPtrTy);
     748    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     749        CHECK_ADDRESS(ptr, CreateMul(size, nitems), "CreateFReadCall");
     750    }
    660751    return CreateCall(fWriteFunc, {ptr, size, nitems, stream});
    661752}
     
    9921083        }
    9931084        IRBuilder<>::CreateCall(function, {assertion, GetString(failureMessage), trace, depth});
     1085    } else { // if assertions are not enabled, make it a compiler assumption.
     1086        IRBuilder<>::CreateAssumption(assertion);
    9941087    }
    9951088}
     
    10291122
    10301123Value * CBuilder::CreateCountForwardZeroes(Value * value, const bool isZeroUndefined) {
    1031     if (isZeroUndefined) {
     1124    if (LLVM_UNLIKELY(isZeroUndefined && codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    10321125        CreateAssert(value, "CreateCountForwardZeroes: value cannot be zero!");
    10331126    }
     
    10371130
    10381131Value * CBuilder::CreateCountReverseZeroes(Value * value, const bool isZeroUndefined) {
    1039     if (isZeroUndefined) {
     1132    if (LLVM_UNLIKELY(isZeroUndefined && codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    10401133        CreateAssert(value, "CreateCountReverseZeroes: value cannot be zero!");
    10411134    }
     
    10921185}
    10931186
    1094 #ifdef HAS_ADDRESS_SANITIZER
    1095 Value * checkHeapAddress(CBuilder * const b, Value * const Ptr) {
    1096     Module * const m = b->getModule();
    1097     PointerType * const voidPtrTy = b->getVoidPtrTy();
    1098     IntegerType * const sizeTy = b->getSizeTy();
    1099     Function * isPoisoned = m->getFunction("__asan_region_is_poisoned");
    1100     if (LLVM_UNLIKELY(isPoisoned == nullptr)) {
    1101         isPoisoned = Function::Create(FunctionType::get(voidPtrTy, {voidPtrTy, sizeTy}, false), Function::ExternalLinkage, "__asan_region_is_poisoned", m);
    1102         isPoisoned->setCallingConv(CallingConv::C);
    1103         isPoisoned->setReturnDoesNotAlias();
    1104         isPoisoned->setDoesNotAlias(1);
    1105     } \
    1106     Value * const addr = b->CreatePointerCast(Ptr, voidPtrTy);
    1107     ConstantInt * const size = ConstantInt::get(sizeTy, Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits() / 8);
    1108     Value * check = b->CreateCall(isPoisoned, { addr, size });
    1109     return b->CreateICmpEQ(check, ConstantPointerNull::get(cast<PointerType>(isPoisoned->getReturnType())));
    1110 }
    1111 #define CHECK_HEAP_ADDRESS(Ptr, Name) \
    1112 if (LLVM_UNLIKELY(hasAddressSanitizer())) { \
    1113     CreateAssert(checkHeapAddress(this, Ptr), Name " was given unallocated memory address"); \
    1114 }
    1115 #else
    1116 #define CHECK_HEAP_ADDRESS(Ptr, Name)
    1117 #endif
    1118 
    1119 static AllocaInst * resolveStackAddress(Value * Ptr) {
    1120     while (isa<GetElementPtrInst>(Ptr)) {
    1121         Ptr = cast<GetElementPtrInst>(Ptr)->getPointerOperand();
    1122     }
    1123     return dyn_cast<AllocaInst>(Ptr);
    1124 }
    1125 
    1126 static Value * checkStackAddress(CBuilder * const b, Value * const Ptr, AllocaInst * const Base) {
    1127     DataLayout DL(b->getModule());
    1128     IntegerType * const intPtrTy = cast<IntegerType>(DL.getIntPtrType(Ptr->getType()));
    1129     Value * sz = ConstantExpr::getSizeOf(Base->getAllocatedType());
    1130     sz = b->CreateZExtOrTrunc(sz, intPtrTy);
    1131     if (dyn_cast_or_null<Constant>(Base->getArraySize()) && !cast<Constant>(Base->getArraySize())->isNullValue()) {
    1132         sz = b->CreateMul(sz, b->CreateZExtOrTrunc(Base->getArraySize(), intPtrTy));
    1133     }
    1134     Value * const p = b->CreatePtrToInt(Ptr, intPtrTy);
    1135     Value * const s = b->CreatePtrToInt(Base, intPtrTy);
    1136     Value * const e = b->CreateAdd(s, sz);
    1137     return b->CreateAnd(b->CreateICmpUGE(p, s), b->CreateICmpULT(p, e));
    1138 }
    1139 
    1140 #define CHECK_ADDRESS(Ptr, Name) \
    1141 if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) { \
    1142     CreateAssert(Ptr, Name " was given a null address"); \
    1143     if (AllocaInst * Base = resolveStackAddress(Ptr)) { \
    1144         CreateAssert(checkStackAddress(this, Ptr, Base), Name " was given an invalid stack address"); \
    1145     } else { \
    1146         CHECK_HEAP_ADDRESS(Ptr, Name) \
    1147     } \
    1148 }
    1149 
    11501187LoadInst * CBuilder::CreateLoad(Value *Ptr, const char * Name) {
    1151     CHECK_ADDRESS(Ptr, "CreateLoad");
     1188    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     1189        CHECK_ADDRESS(Ptr, ConstantExpr::getSizeOf(Ptr->getType()->getPointerElementType()), "CreateLoad");
     1190    }
    11521191    return IRBuilder<>::CreateLoad(Ptr, Name);
    11531192}
    11541193
    11551194LoadInst * CBuilder::CreateLoad(Value * Ptr, const Twine & Name) {
    1156     CHECK_ADDRESS(Ptr, "CreateLoad");
     1195    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     1196        CHECK_ADDRESS(Ptr, ConstantExpr::getSizeOf(Ptr->getType()->getPointerElementType()), "CreateLoad");
     1197    }
    11571198    return IRBuilder<>::CreateLoad(Ptr, Name);
    11581199}
    11591200
    1160 LoadInst * CBuilder::CreateLoad(Type *Ty, Value *Ptr, const Twine & Name) {
    1161     CHECK_ADDRESS(Ptr, "CreateLoad");
     1201LoadInst * CBuilder::CreateLoad(Type * Ty, Value *Ptr, const Twine & Name) {
     1202    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     1203        CHECK_ADDRESS(Ptr, ConstantExpr::getSizeOf(Ty), "CreateLoad");
     1204    }
    11621205    return IRBuilder<>::CreateLoad(Ty, Ptr, Name);
    11631206}
    11641207
    1165 LoadInst * CBuilder::CreateLoad(Value *Ptr, bool isVolatile, const Twine & Name) {   
    1166     CHECK_ADDRESS(Ptr, "CreateLoad");
     1208LoadInst * CBuilder::CreateLoad(Value * Ptr, bool isVolatile, const Twine & Name) {
     1209    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     1210        CHECK_ADDRESS(Ptr, ConstantExpr::getSizeOf(Ptr->getType()->getPointerElementType()), "CreateLoad");
     1211    }
    11671212    return IRBuilder<>::CreateLoad(Ptr, isVolatile, Name);
    11681213}
    11691214
    11701215StoreInst * CBuilder::CreateStore(Value * Val, Value * Ptr, bool isVolatile) {
    1171     assert (Val->getType()->getPointerTo() == Ptr->getType());
    1172     CHECK_ADDRESS(Ptr, "CreateStore");
     1216    assert (Val->getType() == Ptr->getType()->getPointerElementType());
     1217    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     1218        CHECK_ADDRESS(Ptr, ConstantExpr::getSizeOf(Val->getType()), "CreateStore");
     1219    }
    11731220    return IRBuilder<>::CreateStore(Val, Ptr, isVolatile);
    11741221}
    11751222
    11761223inline bool CBuilder::hasAddressSanitizer() const {
    1177     return codegen::DebugOptionIsSet(codegen::EnableAsserts) && mDriver && mDriver->hasExternalFunction("__asan_region_is_poisoned");
     1224    return mDriver && mDriver->hasExternalFunction("__asan_region_is_poisoned");
    11781225}
    11791226
    11801227LoadInst * CBuilder::CreateAlignedLoad(Value * Ptr, unsigned Align, const char * Name) {
    1181     if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
     1228    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    11821229        DataLayout DL(getModule());
    11831230        IntegerType * const intPtrTy = cast<IntegerType>(DL.getIntPtrType(Ptr->getType()));
     
    11911238
    11921239LoadInst * CBuilder::CreateAlignedLoad(Value * Ptr, unsigned Align, const Twine & Name) {
    1193     if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
     1240    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    11941241        DataLayout DL(getModule());
    11951242        IntegerType * const intPtrTy = cast<IntegerType>(DL.getIntPtrType(Ptr->getType()));
     
    12031250
    12041251LoadInst * CBuilder::CreateAlignedLoad(Value * Ptr, unsigned Align, bool isVolatile, const Twine & Name) {
    1205     if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
     1252    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    12061253        DataLayout DL(getModule());
    12071254        IntegerType * const intPtrTy = cast<IntegerType>(DL.getIntPtrType(Ptr->getType()));
     
    12151262
    12161263StoreInst * CBuilder::CreateAlignedStore(Value * Val, Value * Ptr, unsigned Align, bool isVolatile) {
    1217     if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
     1264    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    12181265        DataLayout DL(getModule());
    12191266        IntegerType * const intPtrTy = cast<IntegerType>(DL.getIntPtrType(Ptr->getType()));
     
    12281275CallInst * CBuilder::CreateMemMove(Value * Dst, Value * Src, Value *Size, unsigned Align, bool isVolatile,
    12291276                                   MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) {
    1230     if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
     1277    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     1278        CHECK_ADDRESS(Src, Size, "CreateMemMove: Src");
     1279        CHECK_ADDRESS(Dst, Size, "CreateMemMove: Dst");
     1280        // If the call to this intrinisic has an alignment value that is not 0 or 1, then the caller
     1281        // guarantees that both the source and destination pointers are aligned to that boundary.
     1282        if (Align > 1) {
     1283            DataLayout DL(getModule());
     1284            IntegerType * const intPtrTy = DL.getIntPtrType(getContext());
     1285            Value * intSrc = CreatePtrToInt(Src, intPtrTy);
     1286            Value * intDst = CreatePtrToInt(Dst, intPtrTy);
     1287            ConstantInt * align = ConstantInt::get(intPtrTy, Align);
     1288            CreateAssertZero(CreateURem(intSrc, align), "CreateMemMove: Src pointer is misaligned");
     1289            CreateAssertZero(CreateURem(intDst, align), "CreateMemMove: Dst pointer is misaligned");
     1290
     1291        }
     1292    }
     1293    return IRBuilder<>::CreateMemMove(Dst, Src, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag);
     1294}
     1295
     1296CallInst * CBuilder::CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile,
     1297                                  MDNode *TBAATag, MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) {
     1298    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     1299        CHECK_ADDRESS(Src, Size, "CreateMemCpy: Src");
     1300        CHECK_ADDRESS(Dst, Size, "CreateMemCpy: Dst");
    12311301        DataLayout DL(getModule());
    12321302        IntegerType * const intPtrTy = DL.getIntPtrType(getContext());
     1303        Value * intSrc = CreatePtrToInt(Src, intPtrTy);
    12331304        Value * intDst = CreatePtrToInt(Dst, intPtrTy);
    1234         Value * intSrc = CreatePtrToInt(Src, intPtrTy);
    12351305        // If the call to this intrinisic has an alignment value that is not 0 or 1, then the caller
    12361306        // guarantees that both the source and destination pointers are aligned to that boundary.
    12371307        if (Align > 1) {
    12381308            ConstantInt * align = ConstantInt::get(intPtrTy, Align);
    1239             CreateAssertZero(CreateURem(intDst, align), "CreateMemMove: Dst pointer is misaligned");
    1240             CreateAssertZero(CreateURem(intSrc, align), "CreateMemMove: Src pointer is misaligned");
    1241         }
    1242     }
    1243     return IRBuilder<>::CreateMemMove(Dst, Src, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag);
    1244 }
    1245 
    1246 CallInst * CBuilder::CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile,
    1247                                   MDNode *TBAATag, MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) {
    1248     if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
    1249         DataLayout DL(getModule());
    1250         IntegerType * const intPtrTy = DL.getIntPtrType(getContext());
    1251         Value * intDst = CreatePtrToInt(Dst, intPtrTy);
    1252         Value * intSrc = CreatePtrToInt(Src, intPtrTy);
    1253         // If the call to this intrinisic has an alignment value that is not 0 or 1, then the caller
    1254         // guarantees that both the source and destination pointers are aligned to that boundary.
    1255         if (Align > 1) {
    1256             ConstantInt * align = ConstantInt::get(intPtrTy, Align);
     1309            CreateAssertZero(CreateURem(intSrc, align), "CreateMemCpy: Src pointer is misaligned");
    12571310            CreateAssertZero(CreateURem(intDst, align), "CreateMemCpy: Dst pointer is misaligned");
    1258             CreateAssertZero(CreateURem(intSrc, align), "CreateMemCpy: Src pointer is misaligned");
    1259         }
    1260         Value * intSize = CreateZExtOrTrunc(Size, intSrc->getType());
     1311        }
     1312        Value * intSize = CreateZExtOrTrunc(Size, intPtrTy);
    12611313        Value * nonOverlapping = CreateOr(CreateICmpULT(CreateAdd(intSrc, intSize), intDst),
    12621314                                          CreateICmpULT(CreateAdd(intDst, intSize), intSrc));
     
    12641316    }
    12651317    return IRBuilder<>::CreateMemCpy(Dst, Src, Size, Align, isVolatile, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag);
     1318}
     1319
     1320llvm::CallInst * CBuilder::CreateMemSet(llvm::Value * Ptr, llvm::Value * Val, llvm::Value * Size, unsigned Align,
     1321                       bool isVolatile, llvm::MDNode * TBAATag, llvm::MDNode * ScopeTag, llvm::MDNode * NoAliasTag) {
     1322    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     1323        CHECK_ADDRESS(Ptr, Size, "CreateMemSet");
     1324    }
     1325    return IRBuilder<>::CreateMemSet(Ptr, Val, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag);
    12661326}
    12671327
  • icGREP/icgrep-devel/icgrep/IR_Gen/CBuilder.h

    r5746 r5755  
    5757    llvm::Value * CreateUDivCeil(llvm::Value * number, llvm::Value * divisor, const llvm::Twine &Name = "");
    5858   
    59     llvm::Value * CreateUDivCeil(llvm::Value * number, const uint64_t divisor, const llvm::Twine &Name = "");
    60 
    6159    // Round up to a multiple of divisor.
    6260    llvm::Value * CreateRoundUp(llvm::Value * number, llvm::Value * divisor, const llvm::Twine &Name = "");
    6361           
    6462    // Get minimum of two unsigned numbers
    65     llvm::Value * CreateUMin(llvm::Value * a, llvm::Value * b) {
     63    llvm::Value * CreateUMin(llvm::Value * const a, llvm::Value * const b) {
     64        if (a == nullptr) return b;
     65        if (b == nullptr) return a;
    6666        assert (a->getType() == b->getType());
    6767        return CreateSelect(CreateICmpULT(a, b), a, b);
     
    6969
    7070    // Get minimum of two signed numbers
    71     llvm::Value * CreateSMin(llvm::Value * a, llvm::Value * b) {
     71    llvm::Value * CreateSMin(llvm::Value * const a, llvm::Value * const b) {
     72        if (a == nullptr) return b;
     73        if (b == nullptr) return a;
    7274        assert (a->getType() == b->getType());
    7375        return CreateSelect(CreateICmpSLT(a, b), a, b);
     
    7577
    7678    // Get maximum of two unsigned numbers
    77     llvm::Value * CreateUMax(llvm::Value * a, llvm::Value * b) {
     79    llvm::Value * CreateUMax(llvm::Value * const a, llvm::Value * const b) {
     80        if (a == nullptr) return b;
     81        if (b == nullptr) return a;
    7882        assert (a->getType() == b->getType());
    7983        return CreateSelect(CreateICmpUGT(a, b), a, b);
     
    8185
    8286    // Get maximum of two signed numbers
    83     llvm::Value * CreateSMax(llvm::Value * a, llvm::Value * b) {
     87    llvm::Value * CreateSMax(llvm::Value * const a, llvm::Value * const b) {
     88        if (a == nullptr) return b;
     89        if (b == nullptr) return a;
    8490        assert (a->getType() == b->getType());
    8591        return CreateSelect(CreateICmpSGT(a, b), a, b);
    8692    }
    8793
    88     llvm::Value * CreateMalloc(llvm::Value * size);
    89 
    90     llvm::Value * CreateAlignedMalloc(llvm::Value * size, const unsigned alignment);
     94    llvm::Value * CreateMalloc(llvm::Value * const size);
     95
     96    llvm::Value * CreateAlignedMalloc(llvm::Value * const size, const unsigned alignment);
     97
     98    llvm::Value * CreateCacheAlignedMalloc(llvm::Value * const size) {
     99        return CreateAlignedMalloc(size, getCacheAlignment());
     100    }
    91101   
    92102    void CreateFree(llvm::Value * const ptr);
    93103
    94     llvm::Value * CreateRealloc(llvm::Value * ptr, llvm::Value * size);
    95 
    96     llvm::CallInst * CreateMemZero(llvm::Value * ptr, llvm::Value * size, const unsigned alignment = 1) {
     104    llvm::Value * CreateRealloc(llvm::Value * const ptr, llvm::Value * const size);
     105
     106    llvm::CallInst * CreateMemZero(llvm::Value * const ptr, llvm::Value * const size, const unsigned alignment = 1) {
    97107        return CreateMemSet(ptr, getInt8(0), size, alignment);
    98108    }
    99109
    100     llvm::AllocaInst * CreateCacheAlignedAlloca(llvm::Type * Ty, llvm::Value * ArraySize = nullptr) {
     110    llvm::AllocaInst * CreateAlignedAlloca(llvm::Type * const Ty, const unsigned alignment, llvm::Value * const ArraySize = nullptr) {
    101111        llvm::AllocaInst * instr = CreateAlloca(Ty, ArraySize);
    102         instr->setAlignment(getCacheAlignment());
     112        instr->setAlignment(alignment);
    103113        return instr;
    104114    }
    105115
    106     llvm::Value * CreateCacheAlignedMalloc(llvm::Value * size);
     116    llvm::AllocaInst * CreateCacheAlignedAlloca(llvm::Type * const Ty, llvm::Value * const ArraySize = nullptr) {
     117        return CreateAlignedAlloca(Ty, getCacheAlignment(), ArraySize);
     118    }
    107119
    108120    // stdio.h functions
     
    169181    llvm::Value * CreateMUnmap(llvm::Value * addr, llvm::Value * size);
    170182
     183    enum Protect {
     184        NONE = 0
     185        , READ = 1
     186        , WRITE = 2
     187        , EXEC = 4
     188    };
     189
     190    llvm::Value * CreateMProtect(llvm::Value * addr, llvm::Value * size, int protect);
     191
    171192    //  Posix thread (pthread.h) functions.
    172193    //
     
    203224    llvm::IntegerType * getIntAddrTy() const;
    204225   
    205     llvm::PointerType * getVoidPtrTy() const;
     226    llvm::PointerType * getVoidPtrTy(const unsigned AddressSpace = 0) const;
    206227   
    207228    llvm::PointerType * getFILEptrTy();
     
    310331                           llvm::MDNode *NoAliasTag = nullptr);
    311332
     333    llvm::CallInst * CreateMemSet(llvm::Value *Ptr, llvm::Value *Val, uint64_t Size, unsigned Align,
     334                           bool isVolatile = false, llvm::MDNode *TBAATag = nullptr,
     335                           llvm::MDNode *ScopeTag = nullptr,
     336                           llvm::MDNode *NoAliasTag = nullptr) {
     337        return CreateMemSet(Ptr, Val, getInt64(Size), Align, isVolatile, TBAATag, ScopeTag, NoAliasTag);
     338    }
     339
     340    llvm::CallInst * CreateMemSet(llvm::Value *Ptr, llvm::Value *Val, llvm::Value *Size, unsigned Align,
     341                           bool isVolatile = false, llvm::MDNode *TBAATag = nullptr,
     342                           llvm::MDNode *ScopeTag = nullptr,
     343                           llvm::MDNode *NoAliasTag = nullptr);
    312344
    313345    void setDriver(Driver * const driver) {
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_target.cpp

    r5734 r5755  
    1616
    1717using namespace kernel;
     18using namespace llvm;
     19
     20struct Features {
     21    bool hasAVX;
     22    bool hasAVX2;
     23    Features() : hasAVX(0), hasAVX2(0) { }
     24};
     25
     26Features getHostCPUFeatures() {
     27    Features hostCPUFeatures;
     28    StringMap<bool> features;
     29    if (sys::getHostCPUFeatures(features)) {
     30        hostCPUFeatures.hasAVX = features.count("avx");
     31        hostCPUFeatures.hasAVX2 = features.count("avx2");
     32    }
     33    return hostCPUFeatures;
     34}
     35
     36bool AVX2_available() {
     37    StringMap<bool> features;
     38    if (sys::getHostCPUFeatures(features)) {
     39        return features.count("avx2");
     40    }
     41    return false;
     42}
    1843
    1944namespace IDISA {
    2045   
    2146KernelBuilder * GetIDISA_Builder(llvm::LLVMContext & C) {
    22     const bool hasAVX2 = AVX2_available();
     47    const auto hostCPUFeatures = getHostCPUFeatures();
    2348    if (LLVM_LIKELY(codegen::BlockSize == 0)) {  // No BlockSize override: use processor SIMD width
    24         codegen::BlockSize = hasAVX2 ? 256 : 128;
     49        codegen::BlockSize = hostCPUFeatures.hasAVX2 ? 256 : 128;
    2550    }
    2651    else if (((codegen::BlockSize & (codegen::BlockSize - 1)) != 0) || (codegen::BlockSize < 64)) {
    2752        llvm::report_fatal_error("BlockSize must be a power of 2 and >=64");
    2853    }
    29     if (codegen::BlockSize >= 256) {
    30         if (hasAVX2) {
     54    if (codegen::BlockSize >= 128) {
     55        if (hostCPUFeatures.hasAVX2) {
    3156            return new KernelBuilderImpl<IDISA_AVX2_Builder>(C, codegen::BlockSize, codegen::BlockSize);
     57        } else if (hostCPUFeatures.hasAVX) {
     58            return new KernelBuilderImpl<IDISA_AVX_Builder>(C, codegen::BlockSize, codegen::BlockSize);
    3259        }
    3360    } else if (codegen::BlockSize == 64) {
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_target.h

    r5734 r5755  
    99namespace llvm { class LLVMContext; }
    1010namespace kernel { class KernelBuilder; }
    11 #include <string>
     11
     12extern bool AVX2_available();
    1213
    1314namespace IDISA {
  • icGREP/icgrep-devel/icgrep/UCD/unicode_set.cpp

    r5750 r5755  
    185185            const auto m = lengthOf(r) * QUAD_BITS;
    186186            if (LLVM_UNLIKELY(remaining < m)) {
    187                 return base + remaining;
     187                return (base * QUAD_BITS) + remaining;
    188188            }
    189189            base += m;
    190             remaining -= m;
     190            remaining -= m * QUAD_BITS;
    191191        } else { // if (typeOf(r) == Mixed) {
    192192            for (auto l = lengthOf(r); l; --l, ++qi) {
     
    199199                        const bitquad_t k = scan_forward_zeroes<bitquad_t>(q);
    200200                        if (remaining == 0) {
    201                             return base + k;
     201                            return (base * QUAD_BITS) + k;
    202202                        }
    203203                        q ^= static_cast<bitquad_t>(1) << k;
     
    205205                    }
    206206                }
    207                 base += QUAD_BITS;
     207                ++base;
    208208                remaining -= c;
    209209            }
     
    14121412 ** ------------------------------------------------------------------------------------------------------------- */
    14131413UnicodeSet::UnicodeSet(const UnicodeSet & other) noexcept
    1414 : mRuns(other.mRuns)
    1415 , mQuads(other.mQuads)
    1416 , mRunLength(other.mRunLength)
    1417 , mQuadLength(other.mQuadLength)
    1418 , mRunCapacity(0) // lazily ensure reallocation on modification
     1414: mRuns(nullptr)
     1415, mQuads(nullptr)
     1416, mRunLength(0)
     1417, mQuadLength(0)
     1418, mRunCapacity(0)
    14191419, mQuadCapacity(0) {
     1420    // lazily ensure reallocation on modification if and only if the source cannot modify it
     1421    if (other.mRunCapacity == 0) {
     1422        mRuns = other.mRuns;
     1423        mRunCapacity = 0;
     1424    } else {
     1425        mRuns = copyOf<run_t>(other.mRuns, other.mRunLength, GlobalAllocator);
     1426        mRunCapacity = other.mRunLength;
     1427    }
     1428    mRunLength = other.mRunLength;
     1429    if (other.mQuadCapacity == 0) {
     1430        mQuads = other.mQuads;
     1431        mQuadCapacity = 0;
     1432    } else {
     1433        mQuads = copyOf<bitquad_t>(other.mQuads, other.mQuadCapacity, GlobalAllocator);
     1434        mQuadCapacity = other.mQuadLength;
     1435    }
     1436    mQuadLength = other.mQuadLength;
    14201437    assert (verify(mRuns, other.mRunLength, mQuads, other.mQuadLength));
    14211438}
     
    14311448        GlobalAllocator.deallocate<bitquad_t>(mQuads, mQuadCapacity);
    14321449    }
    1433     mRuns = other.mRuns;
    1434     mQuads = other.mQuads;
     1450    // lazily ensure reallocation on modification if and only if the source cannot modify it
     1451    if (other.mRunCapacity == 0) {
     1452        mRuns = other.mRuns;
     1453        mRunCapacity = 0;
     1454    } else {
     1455        mRuns = copyOf<run_t>(other.mRuns, other.mRunLength, GlobalAllocator);
     1456        mRunCapacity = other.mRunLength;
     1457    }
    14351458    mRunLength = other.mRunLength;
     1459    if (other.mQuadCapacity == 0) {
     1460        mQuads = other.mQuads;
     1461        mQuadCapacity = 0;
     1462    } else {
     1463        mQuads = copyOf<bitquad_t>(other.mQuads, other.mQuadCapacity, GlobalAllocator);
     1464        mQuadCapacity = other.mQuadLength;
     1465    }
    14361466    mQuadLength = other.mQuadLength;
    1437     mRunCapacity = 0; // lazily ensure reallocation on modification
    1438     mQuadCapacity = 0;
    14391467    assert (verify(mRuns, mRunLength, mQuads, mQuadLength));
    14401468    return *this;
  • icGREP/icgrep-devel/icgrep/UCD/unicode_set.h

    r5748 r5755  
    223223    }
    224224
    225     inline quad_iterator quad_end() const {
     225    inline quad_iterator quad_end() const {       
    226226        return quad_iterator(mRuns + mRunLength, mRuns + mRunLength, mQuads + mQuadLength, mQuads + mQuadLength, Empty, 0);
    227227    }
  • icGREP/icgrep-devel/icgrep/array-test.cpp

    r5486 r5755  
    156156    const unsigned bufferSegments = codegen::BufferSegments;
    157157   
    158     auto ByteStream = pxDriver.addBuffer(make_unique<SourceBuffer>(iBuilder, byteStreamTy));
    159 
    160     auto mmapK = pxDriver.addKernelInstance(make_unique<MMapSourceKernel>(iBuilder, segmentSize));
     158    auto ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, byteStreamTy);
     159
     160    auto mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder, segmentSize);
    161161    mmapK->setInitialArguments({fileDecriptor});
    162162
    163163    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
    164164
    165     auto BasisBits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, byteStreamTy, segmentSize * bufferSegments));
    166 
    167     auto s2pk = pxDriver.addKernelInstance(make_unique<S2PKernel>(iBuilder));
     165    auto BasisBits = pxDriver.addBuffer<CircularBuffer>(iBuilder, byteStreamTy, segmentSize * bufferSegments);
     166
     167    auto s2pk = pxDriver.addKernelInstance<S2PKernel>(iBuilder, true);
    168168    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
    169169
    170     auto bm = pxDriver.addKernelInstance(make_unique<ParenthesisMatchingKernel>(iBuilder, count));
    171 
    172     auto matches = pxDriver.addBuffer(make_unique<ExpandableBuffer>(iBuilder, iBuilder->getStreamSetTy(count), segmentSize * bufferSegments));
    173 
    174     auto errors = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamTy(), segmentSize * bufferSegments));
     170    auto bm = pxDriver.addKernelInstance<ParenthesisMatchingKernel>(iBuilder, count);
     171
     172    auto matches = pxDriver.addBuffer<ExpandableBuffer>(iBuilder, iBuilder->getStreamSetTy(count), segmentSize * bufferSegments);
     173
     174    auto errors = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamTy(), segmentSize * bufferSegments);
    175175
    176176    pxDriver.makeKernelCall(bm, {BasisBits}, {matches, errors});
    177177
    178     auto printer = pxDriver.addKernelInstance(make_unique<PrintStreamSet>(iBuilder, std::vector<std::string>{"matches", "errors"}));
     178    auto printer = pxDriver.addKernelInstance<PrintStreamSet>(iBuilder, std::vector<std::string>{"matches", "errors"});
    179179    pxDriver.makeKernelCall(printer, {&matches, &errors}, {});
    180180
  • icGREP/icgrep-devel/icgrep/base64.cpp

    r5620 r5755  
    6565    const unsigned bufferSize = (4 * initSegSize * codegen::BufferSegments) / 3;
    6666
    67     StreamSetBuffer * ByteStream = pxDriver.addBuffer(make_unique<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8)));
    68 
    69     Kernel * mmapK = pxDriver.addKernelInstance(make_unique<MMapSourceKernel>(iBuilder, initSegSize));
     67    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
     68
     69    Kernel * mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder, initSegSize);
    7070    mmapK->setInitialArguments({fileDescriptor});
    7171    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
    7272   
    73     StreamSetBuffer * Expanded3_4Out = pxDriver.addBuffer(make_unique<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), bufferSize));
    74     Kernel * expandK = pxDriver.addKernelInstance(make_unique<expand3_4Kernel>(iBuilder));
     73    StreamSetBuffer * Expanded3_4Out = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), bufferSize);
     74    Kernel * expandK = pxDriver.addKernelInstance<expand3_4Kernel>(iBuilder);
    7575    pxDriver.makeKernelCall(expandK, {ByteStream}, {Expanded3_4Out});
    7676   
    77     StreamSetBuffer * Radix64out = pxDriver.addBuffer(make_unique<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), bufferSize));
    78     Kernel * radix64K = pxDriver.addKernelInstance(make_unique<radix64Kernel>(iBuilder));
     77    StreamSetBuffer * Radix64out = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), bufferSize);
     78    Kernel * radix64K = pxDriver.addKernelInstance<radix64Kernel>(iBuilder);
    7979    pxDriver.makeKernelCall(radix64K, {Expanded3_4Out}, {Radix64out});
    8080   
    8181    if (memAlignBuffering){
    82         auto Base64out = pxDriver.addExternalBuffer(make_unique<ExternalBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), outputStream));
    83         Kernel * base64K = pxDriver.addKernelInstance(make_unique<base64Kernel>(iBuilder));
     82        auto Base64out = pxDriver.addBuffer<ExternalBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), outputStream);
     83        Kernel * base64K = pxDriver.addKernelInstance<base64Kernel>(iBuilder);
    8484        pxDriver.makeKernelCall(base64K, {Radix64out}, {Base64out});
    8585    }
    8686    else {
    87         StreamSetBuffer * Base64out = pxDriver.addBuffer(make_unique<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), bufferSize));
    88         Kernel * base64K = pxDriver.addKernelInstance(make_unique<base64Kernel>(iBuilder));
     87        StreamSetBuffer * Base64out = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), bufferSize);
     88        Kernel * base64K = pxDriver.addKernelInstance<base64Kernel>(iBuilder);
    8989        pxDriver.makeKernelCall(base64K, {Radix64out}, {Base64out});
    9090       
    91         Kernel * outK = pxDriver.addKernelInstance(make_unique<StdOutKernel>(iBuilder, 8));
     91        Kernel * outK = pxDriver.addKernelInstance<StdOutKernel>(iBuilder, 8);
    9292        pxDriver.makeKernelCall(outK, {Base64out}, {});
    9393    }
  • icGREP/icgrep-devel/icgrep/editd/editd.cpp

    r5734 r5755  
    257257    idb->SetInsertPoint(BasicBlock::Create(m->getContext(), "entry", main,0));
    258258
    259     auto ChStream = pxDriver.addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(4)));
    260     auto mmapK = pxDriver.addKernelInstance(make_unique<MemorySourceKernel>(idb, inputType, segmentSize));
     259    auto ChStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(4));
     260    auto mmapK = pxDriver.addKernelInstance<MemorySourceKernel>(idb, inputType, segmentSize);
    261261    mmapK->setInitialArguments({inputStream, fileSize});
    262262    pxDriver.makeKernelCall(mmapK, {}, {ChStream});
    263263
    264     auto MatchResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(editDistance + 1), segmentSize * bufferSegments));
    265     auto editdk = pxDriver.addKernelInstance(make_unique<PatternKernel>(idb, patterns));
     264    auto MatchResults = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(editDistance + 1), segmentSize * bufferSegments);
     265    auto editdk = pxDriver.addKernelInstance<PatternKernel>(idb, patterns);
    266266    pxDriver.makeKernelCall(editdk, {ChStream}, {MatchResults});
    267267
    268     auto editdScanK = pxDriver.addKernelInstance(make_unique<editdScanKernel>(idb, editDistance));
     268    auto editdScanK = pxDriver.addKernelInstance<editdScanKernel>(idb, editDistance);
    269269    pxDriver.makeKernelCall(editdScanK, {MatchResults}, {});
    270270
     
    327327    iBuilder->SetInsertPoint(BasicBlock::Create(m->getContext(), "entry", main));
    328328
    329     auto ByteStream = pxDriver.addBuffer(make_unique<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8)));
    330 
    331     auto mmapK = pxDriver.addKernelInstance(make_unique<MMapSourceKernel>(iBuilder, segmentSize));
     329    auto ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
     330
     331    auto mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder, segmentSize);
    332332    mmapK->setInitialArguments({fileDescriptor});
    333333    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
    334334
    335     auto BasisBits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8), segmentSize * bufferSegments));
    336     auto s2pk = pxDriver.addKernelInstance(make_unique<S2PKernel>(iBuilder));
     335    auto BasisBits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8), segmentSize * bufferSegments);
     336    auto s2pk = pxDriver.addKernelInstance<S2PKernel>(iBuilder);
    337337    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
    338338
    339     auto CCResults = pxDriver.addExternalBuffer(make_unique<ExternalBuffer>(iBuilder, iBuilder->getStreamSetTy(4), outputStream));
    340     auto ccck = pxDriver.addKernelInstance(make_unique<PreprocessKernel>(iBuilder));
     339    auto CCResults = pxDriver.addBuffer<ExternalBuffer>(iBuilder, iBuilder->getStreamSetTy(4), outputStream);
     340    auto ccck = pxDriver.addKernelInstance<PreprocessKernel>(iBuilder);
    341341    pxDriver.makeKernelCall(ccck, {BasisBits}, {CCResults});
    342342
     
    368368    idb->SetInsertPoint(BasicBlock::Create(m->getContext(), "entry", main,0));
    369369
    370     auto ByteStream = pxDriver.addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, 8)));
    371 
    372     auto mmapK = pxDriver.addKernelInstance(make_unique<MMapSourceKernel>(idb, segmentSize));
     370    auto ByteStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, 8));
     371
     372    auto mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(idb, segmentSize);
    373373    mmapK->setInitialArguments({fileDescriptor});
    374374    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
    375375
    376     auto ChStream = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(4), segmentSize * bufferSegments));
    377     auto ccck = pxDriver.addKernelInstance(make_unique<kernel::DirectCharacterClassKernelBuilder>(idb, "ccc",
     376    auto ChStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(4), segmentSize * bufferSegments);
     377    auto ccck = pxDriver.addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "ccc",
    378378        std::vector<re::CC *>{re::makeCC(re::makeCC(0x41), re::makeCC(0x61)),
    379379                              re::makeCC(re::makeCC(0x43), re::makeCC(0x63)),
    380380                              re::makeCC(re::makeCC(0x54), re::makeCC(0x74)),
    381                               re::makeCC(re::makeCC(0x47), re::makeCC(0x67))}, 1));
     381                              re::makeCC(re::makeCC(0x47), re::makeCC(0x67))}, 1);
    382382    pxDriver.makeKernelCall(ccck, {ByteStream}, {ChStream});
    383383
     
    387387   
    388388    for(unsigned i = 0; i < n; ++i){
    389         auto MatchResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(editDistance + 1), segmentSize * bufferSegments));
    390         auto editdk = pxDriver.addKernelInstance(make_unique<PatternKernel>(idb, pattGroups[i]));
     389        auto MatchResults = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(editDistance + 1), segmentSize * bufferSegments);
     390        auto editdk = pxDriver.addKernelInstance<PatternKernel>(idb, pattGroups[i]);
    391391        pxDriver.makeKernelCall(editdk, {ChStream}, {MatchResults});
    392392        MatchResultsBufs[i] = MatchResults;
     
    394394    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
    395395    if (n > 1) {
    396         MergedResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(editDistance + 1), segmentSize * bufferSegments));
    397         kernel::Kernel * streamsMergeK = pxDriver.addKernelInstance(make_unique<kernel::StreamsMerge>(idb, editDistance + 1, n));
     396        MergedResults = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(editDistance + 1), segmentSize * bufferSegments);
     397        kernel::Kernel * streamsMergeK = pxDriver.addKernelInstance<kernel::StreamsMerge>(idb, editDistance + 1, n);
    398398        pxDriver.makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
    399399    }
    400400
    401     auto editdScanK = pxDriver.addKernelInstance(make_unique<editdScanKernel>(idb, editDistance));
     401    auto editdScanK = pxDriver.addKernelInstance<editdScanKernel>(idb, editDistance);
    402402    pxDriver.makeKernelCall(editdScanK, {MergedResults}, {});
    403403
     
    435435    idb->SetInsertPoint(BasicBlock::Create(m->getContext(), "entry", main,0));
    436436
    437     auto ChStream = pxDriver.addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(4)));
    438     auto mmapK = pxDriver.addKernelInstance(make_unique<MemorySourceKernel>(idb, inputType, segmentSize));
     437    auto ChStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(4));
     438    auto mmapK = pxDriver.addKernelInstance<MemorySourceKernel>(idb, inputType, segmentSize);
    439439    mmapK->setInitialArguments({inputStream, fileSize});
    440440    pxDriver.makeKernelCall(mmapK, {}, {ChStream});
    441441
    442     auto MatchResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(editDistance + 1), segmentSize * bufferSegments));
    443     auto editdk = pxDriver.addKernelInstance(make_unique<kernel::editdCPUKernel>(idb, editDistance, patternLen, groupSize));
     442    auto MatchResults = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(editDistance + 1), segmentSize * bufferSegments);
     443    auto editdk = pxDriver.addKernelInstance<kernel::editdCPUKernel>(idb, editDistance, patternLen, groupSize);
    444444
    445445    const unsigned numOfCarries = patternLen * (editDistance + 1) * 4 * groupSize;
     
    451451    pxDriver.makeKernelCall(editdk, {ChStream}, {MatchResults});
    452452
    453     auto editdScanK = pxDriver.addKernelInstance(make_unique<editdScanKernel>(idb, editDistance));
     453    auto editdScanK = pxDriver.addKernelInstance<editdScanKernel>(idb, editDistance);
    454454    pxDriver.makeKernelCall(editdScanK, {MatchResults}, {});
    455455
     
    547547    Function * const main = cast<Function>(M->getOrInsertFunction("Main", voidTy, inputTy, inputSizeTy, patternPtrTy, outputTy, stridesTy, nullptr));
    548548    main->setCallingConv(CallingConv::C);
    549     Function::arg_iterator args = main->arg_begin();
     549    auto args = main->arg_begin();
    550550
    551551    Value * const inputStream = &*(args++);
     
    573573    Value * inputSize = iBuilder->CreateLoad(inputSizePtr);
    574574
    575     StreamSetBuffer * CCStream = pxDriver.addBuffer(make_unique<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(4), 1));
    576     kernel::Kernel * sourceK = pxDriver.addKernelInstance(make_unique<kernel::MemorySourceKernel>(iBuilder, inputTy, segmentSize));
     575    auto CCStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(4), 1);
     576    auto sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(iBuilder, inputTy, segmentSize);
    577577    sourceK->setInitialArguments({inputThreadPtr, inputSize});
    578578    pxDriver.makeKernelCall(sourceK, {}, {CCStream});
    579579
    580     ExternalBuffer * ResultStream = pxDriver.addExternalBuffer(make_unique<ExternalBuffer>(iBuilder, iBuilder->getStreamSetTy(editDistance+1), resultStreamPtr, 1));
    581     kernel::Kernel * editdk = pxDriver.addKernelInstance(make_unique<kernel::editdGPUKernel>(iBuilder, editDistance, patternLen, groupSize));
     580    auto ResultStream = pxDriver.addBuffer<ExternalBuffer>(iBuilder, iBuilder->getStreamSetTy(editDistance+1), resultStreamPtr, 1);
     581    auto editdk = pxDriver.addKernelInstance<kernel::editdGPUKernel>(iBuilder, editDistance, patternLen, groupSize);
    582582     
    583583    const unsigned numOfCarries = patternLen * (editDistance + 1) * 4 * groupSize;
     
    682682    fileSize->setName("fileSize");
    683683
    684     StreamSetBuffer * MatchResults = pxDriver.addBuffer(make_unique<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(editDistance+1)));
    685     kernel::Kernel * sourceK = pxDriver.addKernelInstance(make_unique<kernel::MemorySourceKernel>(iBuilder, inputType, segmentSize * bufferSegments));
     684    StreamSetBuffer * MatchResults = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(editDistance+1));
     685    kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(iBuilder, inputType, segmentSize * bufferSegments);
    686686    sourceK->setInitialArguments({inputStream, fileSize});
    687687    pxDriver.makeKernelCall(sourceK, {}, {MatchResults});
    688688
    689     auto editdScanK = pxDriver.addKernelInstance(make_unique<editdScanKernel>(iBuilder, editDistance));
     689    auto editdScanK = pxDriver.addKernelInstance<editdScanKernel>(iBuilder, editDistance);
    690690    pxDriver.makeKernelCall(editdScanK, {MatchResults}, {});
    691691    pxDriver.LinkFunction(*editdScanK, "wrapped_report_pos", &wrapped_report_pos);
  • icGREP/icgrep-devel/icgrep/grep_engine.cpp

    r5748 r5755  
    102102    const unsigned encodingBits = 8;
    103103   
    104     StreamSetBuffer * BasisBits = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments));
    105     kernel::Kernel * s2pk = mGrepDriver->addKernelInstance(make_unique<kernel::S2PKernel>(idb));
     104    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments);
     105    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
    106106    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
    107107   
    108     StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
    109     kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance(make_unique<kernel::LineBreakKernelBuilder>(idb, encodingBits));
     108    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     109    kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance<kernel::LineBreakKernelBuilder>(idb, encodingBits);
    110110    mGrepDriver->makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
    111111   
    112     kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance(make_unique<kernel::RequiredStreams_UTF8>(idb));
    113     StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(4, 1), segmentSize * bufferSegments));
     112    kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
     113    StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(4, 1), segmentSize * bufferSegments);
    114114    mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
    115115   
     
    130130    for(unsigned i = 0; i < n; ++i){
    131131        const auto numOfCharacterClasses = charclasses[i].size();
    132         StreamSetBuffer * CharClasses = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments));
    133         kernel::Kernel * ccK = mGrepDriver->addKernelInstance(make_unique<kernel::CharClassesKernel>(idb, std::move(charclasses[i])));
     132        StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments);
     133        kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(charclasses[i]));
    134134        mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
    135         StreamSetBuffer * MatchResults = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
    136         kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance(make_unique<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses));
     135        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     136        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses);
    137137        mGrepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, RequiredStreams}, {MatchResults});
    138138        MatchResultsBufs[i] = MatchResults;
     
    140140    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
    141141    if (REs.size() > 1) {
    142         MergedResults = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
    143         kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance(make_unique<kernel::StreamsMerge>(idb, 1, REs.size()));
     142        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     143        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, REs.size());
    144144        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
    145145    }
     
    148148    if (mMoveMatchesToEOL) {
    149149        StreamSetBuffer * OriginalMatches = Matches;
    150         kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance(make_unique<kernel::MatchedLinesKernel>(idb));
    151         Matches = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
     150        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
     151        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
    152152        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
    153153    }
    154154   
    155155    if (InvertMatchFlag) {
    156         kernel::Kernel * invertK = mGrepDriver->addKernelInstance(make_unique<kernel::InvertMatchesKernel>(idb));
     156        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
    157157        StreamSetBuffer * OriginalMatches = Matches;
    158         Matches = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
     158        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
    159159        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
    160160    }
    161161    if (MaxCountFlag > 0) {
    162         kernel::Kernel * untilK = mGrepDriver->addKernelInstance(make_unique<kernel::UntilNkernel>(idb));
     162        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
    163163        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
    164164        StreamSetBuffer * AllMatches = Matches;
    165         Matches = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
     165        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
    166166        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
    167167    }
     
    191191    fileDescriptor->setName("fileDescriptor");
    192192   
    193     StreamSetBuffer * ByteStream = mGrepDriver->addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits)));
    194     kernel::Kernel * sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::FDSourceKernel>(idb, segmentSize));
     193    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
     194    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize);
    195195    sourceK->setInitialArguments({fileDescriptor});
    196196    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
     
    200200    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
    201201   
    202     kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance(make_unique<kernel::PopcountKernel>(idb));
     202    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
    203203    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
    204204    mGrepDriver->generatePipelineIR();
     
    312312    match_accumulator->setName("match_accumulator");
    313313   
    314     StreamSetBuffer * ByteStream = mGrepDriver->addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits)));
    315     kernel::Kernel * sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::FDSourceKernel>(idb, segmentSize));
     314    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
     315    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize);
    316316    sourceK->setInitialArguments({fileDescriptor});
    317317    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
     
    321321    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
    322322   
    323     kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance(make_unique<kernel::ScanMatchKernel>(idb));
     323    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
    324324    scanMatchK->setInitialArguments({match_accumulator});
    325325    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
  • icGREP/icgrep-devel/icgrep/icgrep-devel.files

    r5706 r5755  
    308308wc.cpp
    309309CMakeLists.txt
     310applications/cfg-validator/ANTLRv4Lexer.cpp
     311applications/cfg-validator/ANTLRv4Lexer.h
     312applications/cfg-validator/ANTLRv4Lexer.tokens
     313applications/cfg-validator/ANTLRv4Parser.cpp
     314applications/cfg-validator/ANTLRv4Parser.h
     315applications/cfg-validator/ANTLRv4Parser.tokens
     316applications/cfg-validator/ANTLRv4ParserBaseListener.cpp
     317applications/cfg-validator/ANTLRv4ParserBaseListener.h
     318applications/cfg-validator/ANTLRv4ParserListener.cpp
     319applications/cfg-validator/ANTLRv4ParserListener.h
     320applications/cfg-validator/LexBasic.cpp
     321applications/cfg-validator/LexBasic.h
     322applications/cfg-validator/LexBasic.tokens
     323applications/cfg-validator/ANTLRv4Lexer.cpp
     324applications/cfg-validator/ANTLRv4Lexer.h
     325applications/cfg-validator/ANTLRv4Parser.cpp
     326applications/cfg-validator/ANTLRv4Parser.h
     327applications/cfg-validator/ANTLRv4ParserBaseListener.cpp
     328applications/cfg-validator/ANTLRv4ParserBaseListener.h
     329applications/cfg-validator/ANTLRv4ParserListener.cpp
     330applications/cfg-validator/ANTLRv4ParserListener.h
     331kernels/attributes.cpp
     332kernels/processing_rate.cpp
  • icGREP/icgrep-devel/icgrep/icgrep-devel.includes

    r5706 r5755  
    22../boost/include/
    33../libllvm/include/
    4 editd
     4/usr/local/include/antlr4-runtime
     5applications/cfg-validator
    56kernels
    6 pablo/passes
    7 pablo
    8 toolchain
    9 pablo/analysis
    10 re
    11 combine/pugixml/src
    12 IR_Gen
    13 util
    14 combine
    15 UCD
    16 pablo/optimizers
    17 combine/icgrep-test
    18 cc
  • icGREP/icgrep-devel/icgrep/kernels/attributes.h

    r5706 r5755  
    22#define ATTRIBUTES_H
    33
     4#include <vector>
     5
    46namespace kernel {
    57
    68struct Attribute {
    79
     10    friend struct AttributeSet;
     11
    812    friend struct Binding;
    913
     
    1216        /** INPUT STREAM ATTRIBUTES **/
    1317
    14         BlockSize,
    15 
    16         // A BlockSize(K) attribute, where K=2^k for some value of k>=4 declares
    17         // that the layout of stream data items within the corresponding input
    18         // or output buffer is arranged in blocks of K items each.   In each
    19         // block, the data buffer contains K items of the first stream in the
    20         // set, followed by K items of the next stream in the set and so on,
    21         // up to and including K items of the last stream in the set.
    22 
    23         // (Note: this replaces the concept of swizzling and anticipates that
    24         // the pipeline will take on the role of automatically inserting the
    25         // swizzling code necessary).
    26 
    27         LookAhead,
     18        LookAhead, /// NOT DONE
    2819
    2920        // A LookAhead(n) attribute on an input stream set S declares that the kernel
     
    4637        // that holds a copy of the data at the physical start of buffer).
    4738
    48         LookBehind,
     39        LookBehind, /// NOT DONE
    4940
    5041        // A LookBehind(n) attribute on an input stream S declares that the kernel
     
    6051        // (Example: lz4d lookbehind(65536)).
    6152
    62         Principle,
    63 
    64         // One input stream can be declared as the principle input buffer for a kernel.
    65         // If a kernel has a principle input stream, when processing the final stride,
     53        Principal,
     54
     55        // One input stream can be declared as the principal input buffer for a kernel.
     56        // If a kernel has a principal input stream, when processing the final stride,
    6657        // a MultiBlockKernel assumes the item count of the principal is the correct
    6758        // one and zero extends / truncates all other input streams to match it.
    6859
     60        Deferred,
     61
     62        // Normally, the processed item count of fixed rate streams is automatically
     63        // updated by the MultiBlock kernel. However, some streams behave like Fixed
     64        // rate streams (in that they will always eventually process a Fixed amount of
     65        // data) but the kernel processes the data in unpredictable chunks. Rather than
     66        // declaring those as Unknown or Bounded rates, marking their rate calculation
     67        // as Deferred provides the pipeline with a stronger guarantee when it comes to
     68        // buffer size calculations.
     69
     70        Greedy,
     71
     72        // Normally, the available item count of fixed rate streams is equal to the
     73        // number of strides processed by the MultiBlock times its stride size for all
     74        // strides except for the final stride. Some kernels consume
     75
    6976        /** OUTPUT STREAM ATTRIBUTES **/
    7077
    7178        Add,
    7279
    73         // An Add(K) attribute states that K bits will be added to this stream after
     80        // An Add(K) attribute states that K items will be added to this stream after
    7481        // processing the final block.
    7582
     
    7986        // be rounded up to the nearest multiple of k
    8087
     88        /** INPUT/OUTPUT STREAM ATTRIBUTES **/
     89
     90        BlockSize, /// NOT DONE
     91
     92        // A BlockSize(K) attribute, where K=2^k for some value of k>=4 declares
     93        // that the layout of stream data items within the corresponding input
     94        // or output buffer is arranged in blocks of K items each.   In each
     95        // block, the data buffer contains K items of the first stream in the
     96        // set, followed by K items of the next stream in the set and so on,
     97        // up to and including K items of the last stream in the set.
     98
     99        // (Note: this replaces the concept of swizzling and anticipates that
     100        // the pipeline will take on the role of automatically inserting the
     101        // swizzling code necessary).
     102
    81103        /** KERNEL ATTRIBUTES **/
    82104
    83         SelectMinimumInputLength,
     105        SelectMinimumInputLength, /// NOT DONE
    84106
    85107        // If a kernel has multiple input streams and their final item count differs,
     
    88110
    89111        // NOTE: this is the default if a kernel does not have SelectMaximumInputLength
    90         // set and no PrincipleInputStream was declared.
    91 
    92         SelectMaximumInputLength,
     112        // set and no PrincipalInputStream was declared.
     113
     114        SelectMaximumInputLength, /// NOT DONE
    93115
    94116        // If a kernel has multiple input streams and their final item count differs,
     
    96118        // principal item length and zero-extend the streams accordingly.
    97119
     120        CanTerminate,
     121
     122        // Informs the pipeline that this kernel can pass a "termination" message to it,
     123        // in which case the pipeline will propagate the message to the subsequent
     124        // kernels and end the program once the final kernel has returned its result.
     125
     126        IndependentRegions,
     127
     128        // Some kernels can divide their processing into concrete non-overlapping regions
     129        // between a start and end position in which the data produced by a kernel. If a
     130        // kernel K is processed simultaneously by two threads, K_0 and K_1, and K_1 is
     131        // waiting for K_0 to finish and update its kernel state for K_1 to resume at, K_1 can
     132        // compute what its state will be and begin processing before K_0 is finished. This
     133        // requires the pipeline to intervene and call an optimized "output-less" instance
     134        // of the kernel prior to calling B.
     135
    98136    };
    99137
     
    102140    }
    103141
    104     bool isPrinciple() const {
    105         return mKind == KindId::Principle;
     142    bool isPrincipal() const {
     143        return mKind == KindId::Principal;
    106144    }
    107145
     
    110148    }
    111149
     150    bool isBlockSize() const {
     151        return mKind == KindId::BlockSize;
     152    }
     153
    112154    unsigned getAmount() const {
    113155        return mK;
     
    129171
    130172    friend Attribute Add1();
    131     friend Attribute Principle();
     173    friend Attribute Principal();
    132174    friend Attribute RoundUpTo(const unsigned);
    133175    friend Attribute LookBehind(const unsigned);
     176    friend Attribute Deferred();
    134177
    135178    Attribute(const KindId kind, const unsigned k) : mKind(kind), mK(k) { }
     
    138181
    139182    const KindId    mKind;
    140     const unsigned  mK;
    141 
     183    unsigned        mK;
    142184};
     185
     186struct AttributeSet : public std::vector<Attribute> {
     187
     188    using AttributeId = Attribute::KindId;
     189
     190    const AttributeSet & getAttributes() const {
     191        return *this;
     192    }
     193
     194    const Attribute & getAttribute(const unsigned i) const {
     195        return getAttributes()[i];
     196    }
     197
     198    void addAttribute(Attribute attribute);
     199
     200    bool hasAttributes() const {
     201        return !empty();
     202    }
     203
     204    bool hasAttribute(const AttributeId id) const;
     205
     206    AttributeSet() = default;
     207
     208    AttributeSet(std::initializer_list<Attribute> attrs) : std::vector<Attribute>(attrs) { }
     209};
     210
    143211
    144212inline Attribute Add1() {
     
    150218}
    151219
    152 inline Attribute Principle() {
    153     return Attribute(Attribute::KindId::Principle, 0);
     220inline Attribute Principal() {
     221    return Attribute(Attribute::KindId::Principal, 0);
    154222}
    155223
     
    158226}
    159227
     228inline Attribute Deferred() {
     229    return Attribute(Attribute::KindId::Deferred, 0);
     230}
    160231
    161232}
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r5706 r5755  
    369369        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    370370    }
    371     Value * delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
     371    Value * const delCount = partial_sum_popcount(iBuilder, mDeletionFieldWidth, iBuilder->simd_not(delMask));
    372372    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
    373373}
    374374
    375 DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount)
    376 : BlockOrientedKernel("del" + std::to_string(fw) + "_" + std::to_string(streamCount),
     375DeletionKernel::DeletionKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned fieldWidth, const unsigned streamCount)
     376: BlockOrientedKernel("del" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
    377377              {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
    378378               Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
    379379              {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
    380                Binding{iBuilder->getStreamSetTy(), "deletionCounts"}},
     380               Binding{iBuilder->getStreamSetTy(), "deletionCounts", FixedRate(), RoundUpTo(iBuilder->getBitBlockWidth())}},
    381381              {}, {}, {})
    382 , mDeletionFieldWidth(fw)
     382, mDeletionFieldWidth(fieldWidth)
    383383, mStreamCount(streamCount) {
    384384}
     
    626626        pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mFieldWidth-1));
    627627    }
    628     iBuilder->setScalarField("pendingOffset", pendingOffset);
    629     iBuilder->CallPrintInt("pendingOffset", pendingOffset);
    630 
    631    
     628    iBuilder->setScalarField("pendingOffset", pendingOffset);   
    632629    Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
    633630    Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
  • icGREP/icgrep-devel/icgrep/kernels/interface.cpp

    r5733 r5755  
    6767    args->setName("self");
    6868    (++args)->setName("doFinal");
    69 //    if (mHasPrincipleItemCount) {
     69//    if (mHasPrincipalItemCount) {
    7070//        (++args)->setName("principleAvailableItemCount");
    7171//    }
     
    142142}
    143143
    144 void Binding::addAttribute(Attribute attribute) {
    145     for (Attribute & attr : attributes) {
    146         if (attr.getKind() == attribute.getKind()) {
    147             return;
    148         }
    149     }
    150     attributes.emplace_back(attribute);
    151144}
    152 
    153 void KernelInterface::normalizeStreamProcessingRates() {
    154 
    155 }
    156 
    157 }
  • icGREP/icgrep-devel/icgrep/kernels/interface.h

    r5706 r5755  
    2525namespace kernel {
    2626
    27 struct Binding {
    28 
    29     friend class KernelInterface;
     27struct Binding : public AttributeSet {
    3028
    3129    Binding(llvm::Type * type, const std::string & name, ProcessingRate r = FixedRate(1))
    32     : type(type), name(name), rate(r), attributes() { }
     30    : AttributeSet()
     31    , mType(type), mName(name), mRate(std::move(r)) { }
    3332
    3433
    3534    Binding(llvm::Type * type, const std::string & name, ProcessingRate r, Attribute && attribute)
    36     : type(type), name(name), rate(r), attributes({std::move(attribute)}) { }
     35    : AttributeSet({std::move(attribute)})
     36    , mType(type), mName(name), mRate(std::move(r)) { }
    3737
    3838
    3939    Binding(llvm::Type * type, const std::string & name, ProcessingRate r, std::initializer_list<Attribute> attributes)
    40     : type(type), name(name), rate(r), attributes(attributes) { }
     40    : AttributeSet(attributes)
     41    , mType(type), mName(name), mRate(std::move(r)) { }
    4142
    4243    llvm::Type * getType() const {
    43         return type;
     44        return mType;
    4445    }
    4546
    4647    const std::string & getName() const {
    47         return name;
     48        return mName;
    4849    }
    4950
    5051    const ProcessingRate & getRate() const {
    51         return rate;
    52     }
    53 
    54     const Attribute & getAttribute(const unsigned i) const {
    55         return attributes[i];
    56     }
    57 
    58     const std::vector<Attribute> & getAttributes() const {
    59         return attributes;
    60     }
    61 
    62     void addAttribute(Attribute attribute);
    63 
    64     bool hasAttributes() const {
    65         return !attributes.empty();
     52        return mRate;
     53    }
     54
     55    ProcessingRate & getRate() {
     56        return mRate;
     57    }
     58
     59    bool isPrincipal() const {
     60        return hasAttribute(Attribute::KindId::Principal);
     61    }
     62
     63    bool notDeferred() const {
     64        return !hasAttribute(Attribute::KindId::Deferred);
    6665    }
    6766
    6867private:
    69     llvm::Type * const          type;
    70     const std::string           name;
    71     ProcessingRate              rate;
    72     std::vector<Attribute>      attributes;
     68    llvm::Type * const          mType;
     69    const std::string           mName;
     70    ProcessingRate              mRate;
    7371};
    7472
    75 class KernelInterface {
     73using Bindings = std::vector<Binding>;
     74
     75class KernelInterface : public AttributeSet {
    7676public:
    7777    /*
     
    9797
    9898    const Binding & getStreamInput(const unsigned i) const {
     99        assert (i < getNumOfStreamInputs());
    99100        return mStreamSetInputs[i];
    100101    }
     
    113114
    114115    const Binding & getStreamOutput(const unsigned i) const {
     116        assert (i < getNumOfStreamOutputs());
    115117        return mStreamSetOutputs[i];
    116118    }
     
    153155    void setInstance(llvm::Value * const instance);
    154156
    155     bool hasPrincipleItemCount() const {
    156         return mHasPrincipleItemCount;
     157    bool hasPrincipalItemCount() const {
     158        return mHasPrincipalItemCount;
    157159    }
    158160
     
    184186    , mModule(nullptr)
    185187    , mKernelStateType(nullptr)
    186     , mHasPrincipleItemCount(false)
     188    , mHasPrincipalItemCount(false)
    187189    , mKernelName(kernelName)
    188190    , mStreamSetInputs(stream_inputs)
     
    191193    , mScalarOutputs(scalar_outputs)
    192194    , mInternalScalars(internal_scalars) {
    193         normalizeStreamProcessingRates();
     195
    194196    }
    195197   
    196 private:
    197 
    198     void normalizeStreamProcessingRates();
    199 
    200198protected:
    201199
     
    203201    llvm::Module *                          mModule;
    204202    llvm::StructType *                      mKernelStateType;
    205     bool                                    mHasPrincipleItemCount;
     203    bool                                    mHasPrincipalItemCount;
    206204    const std::string                       mKernelName;
    207205    std::vector<llvm::Value *>              mInitialArguments;
     
    211209    std::vector<Binding>                    mScalarOutputs;
    212210    std::vector<Binding>                    mInternalScalars;
    213 
    214211};
    215212
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5743 r5755  
    2222#include <sstream>
    2323#include <kernels/kernel_builder.h>
    24 #include <boost/math/common_factor_rt.hpp>
     24#include <boost/math/common_factor.hpp>
    2525#include <llvm/Support/Debug.h>
    2626
     
    5050        report_fatal_error("Cannot add field " + name + " to " + getName() + " after kernel state finalized");
    5151    }
    52     if (LLVM_UNLIKELY(mKernelMap.count(name))) {
     52    if (LLVM_UNLIKELY(mKernelFieldMap.count(name))) {
    5353        report_fatal_error(getName() + " already contains scalar field " + name);
    5454    }
    5555    const auto index = mKernelFields.size();
    56     mKernelMap.emplace(name, index);
     56    mKernelFieldMap.emplace(name, index);
    5757    mKernelFields.push_back(type);
    5858    return index;
     
    189189        mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
    190190        assert (mKernelStateType);
    191     }   
     191    }
    192192}
    193193
     
    206206    if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
    207207        report_fatal_error("Kernel definition for " + getName() + " could not be found in the cache object");
    208     }   
    209 }
    210 
    211 /** ------------------------------------------------------------------------------------------------------------- *
    212  * @brief getItemsPerStride
    213  ** ------------------------------------------------------------------------------------------------------------- */
    214 std::pair<unsigned, unsigned> Kernel::getStreamRate(const Port p, const unsigned i) const {
    215     const ProcessingRate & rate = (p == Port::Input) ? mStreamSetInputs[i].getRate() : mStreamSetOutputs[i].getRate();
    216     unsigned min = 0, max = 0;
    217     if (rate.isFixed()) {
    218         min = max = rate.getRate();
    219     } else if (rate.isBounded()) {
    220         min = rate.getLowerBound();
    221         max = rate.getUpperBound();
    222     } else if (rate.isUnknown()) {
    223         min = rate.getLowerBound();
    224         max = 0;
    225     } else if (rate.isExactlyRelative()) {
    226         for (unsigned j = 0; j < mStreamSetInputs.size(); ++j) {
    227             if (mStreamSetInputs[j].getName() == rate.getReference()) {
    228                 std::tie(min, max) = getStreamRate(Port::Input, j);
    229                 min = (min * rate.getNumerator()) / rate.getDenominator();
    230                 assert (max == 0 || (max * rate.getNumerator()) % rate.getDenominator() == 0);
    231                 max = (max * rate.getNumerator()) / rate.getDenominator();
    232                 return std::make_pair(min, max);
    233             }
    234         }
    235         for (unsigned j = 0; j < mStreamSetOutputs.size(); ++j) {
    236             if (mStreamSetOutputs[j].getName() == rate.getReference()) {
    237                 assert (p == Port::Output);
    238                 std::tie(min, max) = getStreamRate(Port::Output, j);
    239                 min = (min * rate.getNumerator()) / rate.getDenominator();
    240                 assert (max == 0 || (max * rate.getNumerator()) % rate.getDenominator() == 0);
    241                 max = (max * rate.getNumerator()) / rate.getDenominator();
    242                 return std::make_pair(min, max);
    243             }
    244         }
    245         llvm_unreachable("Reference rate must be associated with an input or output!");
    246     }
    247     return std::make_pair(min, max);
     208    }
    248209}
    249210
     
    252213 ** ------------------------------------------------------------------------------------------------------------- */
    253214void Kernel::addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb) {
    254    
     215
     216    if (mStreamMap.empty()) {
     217        prepareStreamSetNameMap();
     218    }
     219
     220    normalizeStreamProcessingRates();
     221
    255222    const unsigned inputSetCount = mStreamSetInputs.size();
    256223    const unsigned outputSetCount = mStreamSetOutputs.size();
    257    
     224
    258225    assert (inputSetCount == mStreamSetInputBuffers.size());
    259226    assert (outputSetCount == mStreamSetOutputBuffers.size());
     
    293260    for (const auto & binding : mScalarOutputs) {
    294261        addScalar(binding.getType(), binding.getName());
    295     }
    296     if (mStreamMap.empty()) {
    297         prepareStreamSetNameMap();
    298262    }
    299263    for (const auto & binding : mInternalScalars) {
     
    388352    setInstance(&*(args++));
    389353    mIsFinal = &*(args++);
    390     mAvailablePrincipleItemCount = nullptr;
    391 //    if (mHasPrincipleItemCount) {
    392 //        mAvailablePrincipleItemCount = &*(args++);
    393 //    }
     354    mAvailablePrincipalItemCount = nullptr;
    394355    const auto n = mStreamSetInputs.size();
    395356    mAvailableItemCount.resize(n, nullptr);
    396357    for (unsigned i = 0; i < n; i++) {
    397 //        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    398 //        Value * itemCount = nullptr;
    399 //        if (rate.isFixed()) {
    400 //            itemCount = mAvailablePrincipleItemCount;
    401 //            if (rate.getRate() != 1) {
    402 //                itemCount = idb->CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getRate()));
    403 //            }
    404 //        } else if (rate.isBounded() || rate.isUnknown()) {
    405 //            itemCount = &*(args++);
    406 //        } else if (rate.isRelative()) {
    407 //            for (unsigned j = 0; j < i; ++j) {
    408 //                if (mStreamSetInputs[j].getName() == rate.getReference()) {
    409 //                    itemCount = mAvailableItemCount[j];
    410 //                    break;
    411 //                }
    412 //            }
    413 //            if (LLVM_UNLIKELY(itemCount == nullptr)) {
    414 //                report_fatal_error(mStreamSetInputs[i].getName() + " is declared before " + rate.getReference());
    415 //            }
    416 //            if (rate.getNumerator() != 1) {
    417 //                itemCount = idb->CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getNumerator()));
    418 //            }
    419 //            if (rate.getDenominator() != 1) {
    420 //                itemCount = idb->CreateUDiv(itemCount, ConstantInt::get(itemCount->getType(), rate.getDenominator()));
    421 //            }
    422 //        }
    423 //        assert (itemCount);
    424 //        mAvailableItemCount[i] = itemCount;
    425 
    426358        assert (args != mCurrentMethod->arg_end());
    427359        mAvailableItemCount[i] = &*(args++);
    428360    }
    429361    assert (args == mCurrentMethod->arg_end());
    430 
    431362    generateKernelMethod(idb); // must be overridden by the Kernel subtype
    432363    mIsFinal = nullptr;
     
    466397 ** ------------------------------------------------------------------------------------------------------------- */
    467398unsigned Kernel::getScalarIndex(const std::string & name) const {
    468     const auto f = mKernelMap.find(name);
    469     if (LLVM_UNLIKELY(f == mKernelMap.end())) {
     399    const auto f = mKernelFieldMap.find(name);
     400    if (LLVM_UNLIKELY(f == mKernelFieldMap.end())) {
    470401        assert (false);
    471402        report_fatal_error(getName() + " does not contain scalar: " + name);
     
    574505
    575506/** ------------------------------------------------------------------------------------------------------------- *
     507 * @brief getStreamPort
     508 ** ------------------------------------------------------------------------------------------------------------- */
     509const Binding & Kernel::getBinding(const std::string & name) const {
     510    Port port; unsigned index;
     511    std::tie(port, index) = getStreamPort(name);
     512    return (port == Port::Input) ? getStreamInput(index) : getStreamOutput(index);
     513}
     514
     515/** ------------------------------------------------------------------------------------------------------------- *
     516 * @brief normalizeRelativeToFixedProcessingRate
     517 ** ------------------------------------------------------------------------------------------------------------- */
     518bool Kernel::normalizeRelativeToFixedProcessingRate(const ProcessingRate & base, ProcessingRate & toUpdate) {
     519    if (base.isFixed()) {
     520        return true;
     521    } else if (LLVM_UNLIKELY(base.isRelative())) {
     522        const auto & ref = getBinding(base.getReference()).getRate();
     523        if (normalizeRelativeToFixedProcessingRate(ref, toUpdate)) {
     524            toUpdate.getRate() *= ref.getRate();
     525            return true;
     526        }
     527    }
     528    return false;
     529}
     530
     531/** ------------------------------------------------------------------------------------------------------------- *
     532 * @brief normalizeStreamProcessingRates
     533 *
     534 * If we allow a stream to be transitively relative to a fixed rate stream, it complicates detection of fixed
     535 * rate streams later. Find any such occurance and transform them. This implies, however, that a fixed rate
     536 * stream could have a rational processing rate (which should not occur normally.)
     537 ** ------------------------------------------------------------------------------------------------------------- */
     538inline void Kernel::normalizeStreamProcessingRates() {
     539    for (Binding & input : mStreamSetInputs) {
     540        normalizeRelativeToFixedProcessingRate(input.getRate(), input.getRate());
     541    }
     542    for (Binding & output : mStreamSetOutputs) {
     543        normalizeRelativeToFixedProcessingRate(output.getRate(), output.getRate());
     544    }
     545    // TODO: we want to consume whole units. Once the pipeline is able to schedule kernels based on their stride
     546    // and input/output rates, modify them here.
     547}
     548
     549/** ------------------------------------------------------------------------------------------------------------- *
    576550 * @brief generateKernelMethod
    577551 ** ------------------------------------------------------------------------------------------------------------- */
    578552void SegmentOrientedKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
    579 
    580     Constant * const log2BlockWidth = b->getSize(std::log2(b->getBitBlockWidth()));
    581 
    582553    const auto inputSetCount = mStreamSetInputs.size();
    583     mStreamSetInputBufferPtr.resize(inputSetCount);
     554    mStreamSetInputBaseAddress.resize(inputSetCount);
    584555    for (unsigned i = 0; i < inputSetCount; ++i) {
    585         const auto & name = mStreamSetInputs[i].getName();
    586         Value * ic = b->getProcessedItemCount(name);
    587         Value * const blockIndex = b->CreateLShr(ic, log2BlockWidth);
    588         mStreamSetInputBufferPtr[i] = b->getInputStreamPtr(name, blockIndex);
    589     }
    590 
     556        mStreamSetInputBaseAddress[i] = nullptr;
     557    }
    591558    const auto outputSetCount = mStreamSetOutputs.size();
    592     mStreamSetOutputBufferPtr.resize(outputSetCount);
     559    mStreamSetOutputBaseAddress.resize(outputSetCount);
    593560    for (unsigned i = 0; i < outputSetCount; ++i) {
    594         const auto & name = mStreamSetOutputs[i].getName();
    595         Value * ic = b->getProducedItemCount(name);
    596         Value * const blockIndex = b->CreateLShr(ic, log2BlockWidth);
    597         mStreamSetOutputBufferPtr[i] = b->getOutputStreamPtr(name, blockIndex);
    598     }
    599 
     561        mStreamSetOutputBaseAddress[i] = nullptr;
     562    }
    600563    generateDoSegmentMethod(b);
    601 
     564}
     565
     566/** ------------------------------------------------------------------------------------------------------------- *
     567 * @brief requiresBufferedFinalStride
     568 ** ------------------------------------------------------------------------------------------------------------- */
     569inline bool requiresBufferedFinalStride(const Binding & b) {
     570    if (LLVM_LIKELY(isa<ArrayType>(b.getType()))) {
     571        return b.getType()->getArrayNumElements() == 1;
     572    }
     573    return true;
     574}
     575
     576/** ------------------------------------------------------------------------------------------------------------- *
     577 * @brief getItemWidth
     578 ** ------------------------------------------------------------------------------------------------------------- */
     579inline unsigned getItemWidth(const Binding & b) {
     580    Type * ty = b.getType();
     581    if (LLVM_LIKELY(isa<ArrayType>(ty))) {
     582        ty = ty->getArrayElementType();
     583    }
     584    return cast<IntegerType>(ty->getVectorElementType())->getBitWidth();
     585}
     586
     587/** ------------------------------------------------------------------------------------------------------------- *
     588 * @brief getLowerBound
     589 ** ------------------------------------------------------------------------------------------------------------- */
     590ProcessingRate::RateValue MultiBlockKernel::getLowerBound(const ProcessingRate & rate) const {
     591    if (rate.isFixed() || rate.isBounded()) {
     592        return rate.getLowerBound();
     593    } else if (rate.isRelative()) {
     594        return rate.getRate() * getLowerBound(getBinding(rate.getReference()).getRate());
     595    } else { // if (rate.isUnknown())
     596        return 0;
     597    }
     598}
     599
     600/** ------------------------------------------------------------------------------------------------------------- *
     601 * @brief getUpperBound
     602 ** ------------------------------------------------------------------------------------------------------------- */
     603ProcessingRate::RateValue MultiBlockKernel::getUpperBound(const ProcessingRate &rate) const {
     604    if (rate.isFixed() || rate.isBounded()) {
     605        return rate.getUpperBound();
     606    } else if (rate.isRelative()) {
     607        return rate.getRate() * getUpperBound(getBinding(rate.getReference()).getRate());
     608    } else { // if (rate.isUnknown())
     609        return 0;
     610    }
     611}
     612
     613/** ------------------------------------------------------------------------------------------------------------- *
     614 * @brief getUpperBound
     615 ** ------------------------------------------------------------------------------------------------------------- */
     616bool MultiBlockKernel::isTransitivelyUnknownRate(const ProcessingRate & rate) const {
     617    if (rate.isUnknown()) {
     618        return true;
     619    } else if (rate.isDerived()) {
     620        return isTransitivelyUnknownRate(getBinding(rate.getReference()).getRate());
     621    }
     622    return false;
     623}
     624
     625/** ------------------------------------------------------------------------------------------------------------- *
     626 * @brief roundUp
     627 ** ------------------------------------------------------------------------------------------------------------- */
     628unsigned roundUp(const ProcessingRate::RateValue & r) {
     629    if (LLVM_LIKELY(r.denominator() == 1)) {
     630        return r.numerator();
     631    } else {
     632        return (r.numerator() + r.denominator() - 1) / r.denominator();
     633    }
     634}
     635
     636/** ------------------------------------------------------------------------------------------------------------- *
     637 * @brief getItemAlignment
     638 ** ------------------------------------------------------------------------------------------------------------- */
     639inline unsigned MultiBlockKernel::getItemAlignment(const Binding & binding) const {
     640    const auto & rate = binding.getRate();
     641    if (rate.isFixed()) {
     642        const auto & r = rate.getRate();
     643        const auto n = (r.numerator() * mStride);
     644        if (LLVM_LIKELY(r.denominator() == 1)) {
     645            return n;
     646        } else if (LLVM_LIKELY((n % r.denominator()) == 0)) {
     647            return n / r.denominator();
     648        }
     649    }
     650    return 1; // ∀x GCD(x, x + 1) = 1
     651}
     652
     653/** ------------------------------------------------------------------------------------------------------------- *
     654 * @brief getStrideSize
     655 ** ------------------------------------------------------------------------------------------------------------- */
     656llvm::Value * MultiBlockKernel::getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate) {
     657    // NOTE: if we ever support feedback loops, using upper bound could lead to a deadlock due to data starvation
     658    const auto r = getUpperBound(rate);
     659    if (r.numerator() == 0) {
     660        return nullptr;
     661    } else {
     662        assert ((r.numerator() * mStride) % r.denominator() == 0);
     663        return b->getSize((r.numerator() * mStride) / r.denominator());
     664    }
    602665}
    603666
     
    605668 * @brief generateKernelMethod
    606669 ** ------------------------------------------------------------------------------------------------------------- */
    607 void MultiBlockKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & kb) {
     670void MultiBlockKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
     671
     672    if (LLVM_UNLIKELY((mStride % b->getBitBlockWidth()) != 0)) {
     673        report_fatal_error(getName() + ": the Stride (" + std::to_string(mStride) + ") of MultiBlockKernel "
     674                           "must be a multiple of the BitBlockWidth (" + std::to_string(b->getBitBlockWidth()) + ")");
     675    }
    608676
    609677    const auto inputSetCount = mStreamSetInputs.size();
    610678    const auto outputSetCount = mStreamSetOutputs.size();
    611     const auto totalSetCount = inputSetCount + outputSetCount;
    612 
    613     // Scan through and see if any of our input streams is marked as the principle
    614 
    615     bool hasPrinciple = false;
    616     unsigned principleInput = 0;
    617 
    618     for (unsigned i = 0; i < inputSetCount; i++) {
    619         for (const auto attr : mStreamSetInputs[i].getAttributes()) {
    620             if (attr.isPrinciple()) {
    621                 hasPrinciple = true;
    622                 principleInput = i;
    623                 break;
     679
     680    // Define and allocate the temporary buffer area in the prolog.
     681    const auto alignment = b->getBitBlockWidth() / 8;
     682    Value * temporaryInputBuffer[inputSetCount];
     683    for (unsigned i = 0; i < inputSetCount; ++i) {
     684
     685        // TODO: if this is a fixed rate input stream and the pipeline guarantees it will not call the kernel unless
     686        // there is sufficient input and all buffers will be sized sufficiently for the input, we ought to be able to
     687        // avoid the temporary buffer checks.
     688
     689        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     690        Type * const ty = mStreamSetInputBuffers[i]->getStreamSetBlockType();
     691        const auto ub = getUpperBound(rate);
     692        if (ub.numerator() == 0) {
     693            report_fatal_error("MultiBlock kernels do not support unknown rate input streams or streams relative to an unknown rate input.");
     694        } else {           
     695            temporaryInputBuffer[i] = b->CreateAlignedAlloca(ty, alignment, b->getSize(roundUp(ub)));
     696            Type * const sty = temporaryInputBuffer[i]->getType()->getPointerElementType();
     697            b->CreateStore(Constant::getNullValue(sty), temporaryInputBuffer[i]);
     698        }       
     699    }
     700
     701    Value * temporaryOutputBuffer[outputSetCount];
     702    for (unsigned i = 0; i < outputSetCount; i++) {
     703        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     704        Type * const ty = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
     705        if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate))) {
     706            temporaryOutputBuffer[i] = nullptr;
     707        } else {           
     708            auto ub = getUpperBound(rate);
     709            if (LLVM_UNLIKELY(mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate))) {
     710                ub += mStreamSetOutputBuffers[i]->overflowSize();
    624711            }
     712            temporaryOutputBuffer[i] = b->CreateAlignedAlloca(ty, alignment, b->getSize(roundUp(ub)));
     713            Type * const sty = temporaryOutputBuffer[i]->getType()->getPointerElementType();
     714            b->CreateStore(Constant::getNullValue(sty), temporaryOutputBuffer[i]);
    625715        }
    626716    }
     
    634724    // to process, in which case we abort unless IsFinal was set.
    635725
     726    Constant * const ZERO = b->getSize(0);
     727    Constant * const ONE = b->getSize(1);
     728    Constant * const LOG_2_BLOCK_WIDTH = b->getSize(std::log2(b->getBitBlockWidth()));
     729    Constant * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
     730
    636731    // Now proceed with creation of the doSegment method.
    637     BasicBlock * const doSegmentLoop = kb->CreateBasicBlock("DoSegmentLoop");
    638     kb->CreateBr(doSegmentLoop);
     732    BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
     733
     734    b->CreateBr(segmentLoop);
    639735
    640736    /// DO SEGMENT LOOP
    641737
    642     kb->SetInsertPoint(doSegmentLoop);
    643 
    644     // For each input buffer, determine the processedItemCount, the block pointer for the
    645     // buffer block containing the next item, and the number of linearly available items.
    646 
    647     Value * processedItemCount[inputSetCount];
    648     Value * baseInputBuffer[inputSetCount];
    649     Value * unprocessed[inputSetCount];
    650     Value * linearlyAvailable[inputSetCount];
    651     Value * readableStrides[inputSetCount];
    652 
    653     Constant * const log2BlockWidth = kb->getSize(std::log2(kb->getBitBlockWidth()));
    654 
     738    b->SetInsertPoint(segmentLoop);
     739
     740    // For each input buffer, get the initial processed item count, base input pointer, and the number of
     741    // linearly available strides.
    655742    Value * numOfStrides = nullptr;
    656 
     743    mInitialAvailableItemCount.resize(inputSetCount);
     744    mInitialProcessedItemCount.resize(inputSetCount);
     745    mStreamSetInputBaseAddress.resize(inputSetCount);
     746    Value * inputStrideSize[inputSetCount];
    657747    for (unsigned i = 0; i < inputSetCount; i++) {
    658         const auto name = mStreamSetInputs[i].getName();
    659         const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    660 
    661         processedItemCount[i] = kb->getProcessedItemCount(name);
    662 
    663         assert (processedItemCount[i]->getType() == mAvailableItemCount[i]->getType());
    664 
    665         Value * const blockIndex = kb->CreateLShr(processedItemCount[i], log2BlockWidth);
    666         baseInputBuffer[i] = kb->getInputStreamPtr(name, blockIndex);
    667 
    668         if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
    669             kb->CreateAssert(kb->CreateICmpUGE(mAvailableItemCount[i], processedItemCount[i]),
    670                              "Processed item count cannot exceed the available item count");
    671         }
    672 
    673         unprocessed[i] = kb->CreateSub(mAvailableItemCount[i], processedItemCount[i]);
    674 
    675         //kb->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed[i]);
    676 
    677         // INVESTIGATE: If the input rate of this stream is constant and known a priori, we could
    678         // avoid checking whether it is linearly accessible. Should we have an attribute for this?
    679 
    680         linearlyAvailable[i] = kb->getLinearlyAccessibleItems(name, processedItemCount[i], unprocessed[i]);
    681 
    682         //kb->CallPrintInt(getName() + "_" + name + "_linearlyAvailable", linearlyAvailable[i]);
    683 
    684         readableStrides[i] = nullptr;
    685 
    686         if (rate.isFixed() || rate.isBounded()) {
    687             Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
    688             readableStrides[i] = kb->CreateUDiv(linearlyAvailable[i], maxStrideSize);
    689             if (numOfStrides) {
    690                 numOfStrides = kb->CreateUMin(numOfStrides, readableStrides[i]);
    691             } else {
    692                 numOfStrides = readableStrides[i];
     748        const auto & input = mStreamSetInputs[i];
     749        const auto & name = input.getName();
     750        const ProcessingRate & rate = input.getRate();
     751        Value * const ic = b->getProcessedItemCount(name);
     752        mInitialProcessedItemCount[i] = ic;
     753        b->CreateAssert(b->CreateICmpUGE(mAvailableItemCount[i], ic), "processed item count cannot exceed the available item count");
     754        assert (ic->getType() == mAvailableItemCount[i]->getType());
     755        Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], ic);
     756        mStreamSetInputBaseAddress[i]  = b->getBlockAddress(name, b->CreateLShr(ic, LOG_2_BLOCK_WIDTH));
     757        mInitialAvailableItemCount[i] = mAvailableItemCount[i];
     758        mAvailableItemCount[i] = b->getLinearlyAccessibleItems(name, ic, unprocessed);
     759        // Are our linearly accessible items sufficient for a stride?
     760        inputStrideSize[i] = getStrideSize(b, rate);
     761        Value * accessibleStrides = b->CreateUDiv(mAvailableItemCount[i], inputStrideSize[i]);
     762        if (!rate.isFixed() || requiresBufferedFinalStride(input)) {
     763
     764            // Since we trust that the pipeline won't call this kernel unless there is enough data to process a stride, whenever
     765            // we discover that there isn't enough linearly available data, optimistically copy the data to the temporary buffer.
     766
     767            BasicBlock * const entry = b->GetInsertBlock();
     768            BasicBlock * const copyFromBack = b->CreateBasicBlock(name + "CopyFromBack");
     769            BasicBlock * const copyFromFront = b->CreateBasicBlock(name + "CopyFromFront");
     770            BasicBlock * const resume = b->CreateBasicBlock(name + "Resume");
     771
     772            b->CreateUnlikelyCondBr(b->CreateICmpEQ(accessibleStrides, ZERO), copyFromBack, resume);
     773
     774            b->SetInsertPoint(copyFromBack);
     775            Value * const temporaryAvailable = b->CreateUMin(unprocessed, inputStrideSize[i]);
     776            b->CreateAssert(b->CreateICmpULE(mAvailableItemCount[i], temporaryAvailable), "linearly available cannot be greater than temporarily available");
     777            Value * const tempBufferPtr = temporaryInputBuffer[i];
     778            Value * const offset = b->CreateAnd(ic, BLOCK_WIDTH_MASK);
     779            const auto alignment = getItemAlignment(mStreamSetInputs[i]);
     780            b->CreateStreamCpy(name, tempBufferPtr, ZERO, mStreamSetInputBaseAddress[i] , offset, mAvailableItemCount[i], alignment);
     781            Value * const temporaryStrides = b->CreateSelect(b->CreateICmpULT(unprocessed, inputStrideSize[i]), ZERO, ONE);
     782            BasicBlock * const copyToBackEnd = b->GetInsertBlock();
     783            b->CreateCondBr(b->CreateICmpNE(mAvailableItemCount[i], temporaryAvailable), copyFromFront, resume);
     784
     785            b->SetInsertPoint(copyFromFront);
     786            Value * const remaining = b->CreateSub(temporaryAvailable, mAvailableItemCount[i]);
     787            Value * const baseAddress = b->getBaseAddress(name);
     788            b->CreateStreamCpy(name, tempBufferPtr, mAvailableItemCount[i], baseAddress, ZERO, remaining, alignment);
     789            BasicBlock * const copyToFrontEnd = b->GetInsertBlock();
     790            b->CreateBr(resume);
     791
     792            b->SetInsertPoint(resume);
     793            PHINode * const bufferPtr = b->CreatePHI(mStreamSetInputBaseAddress[i] ->getType(), 3);
     794            bufferPtr->addIncoming(mStreamSetInputBaseAddress[i] , entry);
     795            bufferPtr->addIncoming(tempBufferPtr, copyToBackEnd);
     796            bufferPtr->addIncoming(tempBufferPtr, copyToFrontEnd);
     797            mStreamSetInputBaseAddress[i] = bufferPtr;
     798
     799            PHINode * const phiAvailItemCount = b->CreatePHI(b->getSizeTy(), 3);
     800            phiAvailItemCount->addIncoming(mAvailableItemCount[i], entry);
     801            phiAvailItemCount->addIncoming(temporaryAvailable, copyToBackEnd);
     802            phiAvailItemCount->addIncoming(temporaryAvailable, copyToFrontEnd);
     803            mAvailableItemCount[i] = phiAvailItemCount;
     804
     805            PHINode * const phiNumOfStrides = b->CreatePHI(b->getSizeTy(), 2);
     806            phiNumOfStrides->addIncoming(accessibleStrides, entry);
     807            phiNumOfStrides->addIncoming(temporaryStrides, copyToBackEnd);
     808            phiNumOfStrides->addIncoming(temporaryStrides, copyToFrontEnd);
     809            accessibleStrides = phiNumOfStrides;
     810        }
     811        numOfStrides = b->CreateUMin(numOfStrides, accessibleStrides);
     812    }
     813
     814    // Now determine the linearly writeable strides
     815    Value * linearlyWritable[outputSetCount];
     816    Value * baseOutputBuffer[outputSetCount];
     817    Value * outputStrideSize[outputSetCount];
     818    mInitialProducedItemCount.resize(outputSetCount);
     819    mStreamSetOutputBaseAddress.resize(outputSetCount);
     820    for (unsigned i = 0; i < outputSetCount; i++) {
     821        const auto & output = mStreamSetOutputs[i];
     822        const auto & name = output.getName();
     823        const ProcessingRate & rate = output.getRate();
     824        Value * const ic = b->getProducedItemCount(name);
     825        baseOutputBuffer[i] = b->getBlockAddress(name, b->CreateLShr(ic, LOG_2_BLOCK_WIDTH));
     826        assert (baseOutputBuffer[i]->getType()->isPointerTy());
     827        linearlyWritable[i] = b->getLinearlyWritableItems(name, ic);
     828        mInitialProducedItemCount[i] = ic;
     829        outputStrideSize[i] = nullptr;
     830        if (temporaryOutputBuffer[i]) {
     831            outputStrideSize[i] = getStrideSize(b, rate);
     832            // Is the number of linearly writable items sufficient for a stride?
     833            Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);
     834            if (!rate.isFixed() || requiresBufferedFinalStride(output)) {
     835                Value * const requiresCopy = b->CreateICmpEQ(writableStrides, ZERO);
     836                assert (temporaryOutputBuffer[i]->getType() == baseOutputBuffer[i]->getType());
     837                baseOutputBuffer[i] = b->CreateSelect(requiresCopy, temporaryOutputBuffer[i], baseOutputBuffer[i]);
     838                writableStrides = b->CreateSelect(requiresCopy, ONE, writableStrides);
    693839            }
    694         }
    695     }
    696 
    697     //kb->CallPrintInt(getName() + "_numOfStrides", numOfStrides);
    698 
    699     // Now determine the linearly writeable blocks, based on available blocks reduced
    700     // by limitations of output buffer space.
    701 
    702     Value * producedItemCount[outputSetCount];
    703     Value * baseOutputBuffer[outputSetCount];
    704     Value * writableStrides[outputSetCount];
    705     Value * linearlyWritable[outputSetCount];
    706 
    707     for (unsigned i = 0; i < outputSetCount; i++) {
    708         const auto & name = mStreamSetOutputs[i].getName();
    709         const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    710         producedItemCount[i] = kb->getProducedItemCount(name);
    711 
    712         //kb->CallPrintInt(getName() + "_" + name + "_producedItemCount", producedItemCount[i]);
    713 
    714         Value * const blockIndex = kb->CreateLShr(producedItemCount[i], log2BlockWidth);
    715         baseOutputBuffer[i] = kb->getOutputStreamPtr(name, blockIndex);
    716         linearlyWritable[i] = nullptr;
    717         writableStrides[i] = nullptr;
    718         if (rate.isFixed() || rate.isBounded()) {
    719             linearlyWritable[i] = kb->getLinearlyWritableItems(name, producedItemCount[i]);
    720 
    721             //kb->CallPrintInt(getName() + "_" + name + "_linearlyWritable", linearlyWritable[i]);
    722 
    723             Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
    724             writableStrides[i] = kb->CreateUDiv(linearlyWritable[i], maxStrideSize);
    725             if (numOfStrides) {
    726                 numOfStrides = kb->CreateUMin(numOfStrides, writableStrides[i]);
    727             } else {
    728                 numOfStrides = writableStrides[i];
     840            numOfStrides = b->CreateUMin(numOfStrides, writableStrides);
     841            assert (temporaryOutputBuffer[i]->getType() == baseOutputBuffer[i]->getType());
     842        }
     843        mStreamSetOutputBaseAddress[i] = baseOutputBuffer[i];
     844    }
     845
     846    Value * const initiallyFinal = mIsFinal;
     847    if (LLVM_LIKELY(numOfStrides != nullptr)) {
     848        mIsFinal = b->CreateAnd(mIsFinal, b->CreateICmpEQ(numOfStrides, ZERO));
     849        Value * const processStride = b->CreateOr(b->CreateICmpNE(numOfStrides, ZERO), mIsFinal);
     850        b->CreateAssert(processStride, getName() + " does not have sufficient input data or output space for one stride");
     851        for (unsigned i = 0; i < inputSetCount; ++i) {
     852            const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     853            if (rate.isFixed() && mStreamSetInputs[i].notDeferred()) {
     854                mAvailableItemCount[i] = b->CreateSelect(mIsFinal, mAvailableItemCount[i], b->CreateMul(numOfStrides, inputStrideSize[i]));
    729855            }
    730856        }
    731857    }
    732858
    733     //kb->CallPrintInt(getName() + "_numOfStrides'", numOfStrides);
    734 
    735     for (unsigned i = 0; i < inputSetCount; i++) {
    736         const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    737         if (rate.isFixed()) {
    738             mAvailableItemCount[i] = kb->CreateMul(numOfStrides, kb->getSize(rate.getRate() * mStride));
    739         } else {
    740             mAvailableItemCount[i] = linearlyAvailable[i];
    741         }
    742 
    743         //kb->CallPrintInt(getName() + "_" + mStreamSetInputs[i].getName() + "_avail", mAvailableItemCount[i]);
    744     }
    745 
    746     // Define and allocate the temporary buffer area.
    747     Type * tempBuffers[totalSetCount];
    748     for (unsigned i = 0; i < inputSetCount; ++i) {
    749         Type * bufType = baseInputBuffer[i]->getType()->getPointerElementType();
    750         assert (baseInputBuffer[i]->getType()->getPointerAddressSpace() == 0);
    751         const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    752         unsigned count = 0;
    753         if (rate.isFixed()) {
    754             count = rate.getRate();
    755         } else if (rate.isBounded()) {
    756             count = rate.getUpperBound() + 2;
    757         }
    758         tempBuffers[i] = ArrayType::get(bufType, count);
    759     }
    760     for (unsigned i = 0; i < outputSetCount; i++) {
    761         Type * const bufType = baseOutputBuffer[i]->getType()->getPointerElementType();
    762         assert (baseOutputBuffer[i]->getType()->getPointerAddressSpace() == 0);
    763         const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    764         unsigned count = 0;
    765         if (rate.isFixed()) {
    766             count = rate.getRate();
    767         } else if (rate.isBounded()) {
    768             count = rate.getUpperBound() + 2;
    769         }
    770         tempBuffers[i + inputSetCount] = ArrayType::get(bufType, count);
    771     }
    772 
    773     Type * const tempParameterStructType = StructType::create(kb->getContext(), ArrayRef<Type *>(tempBuffers, totalSetCount));
    774 
    775     Value * const tempBufferArea = kb->CreateCacheAlignedAlloca(tempParameterStructType);
    776 
    777     BasicBlock * const temporaryBufferCheck = kb->CreateBasicBlock("temporaryBufferCheck");
    778     BasicBlock * const doMultiBlock = kb->CreateBasicBlock("doMultiBlock");
    779     BasicBlock * const copyToTemporaryBuffers = kb->CreateBasicBlock("copyToTemporaryBuffers");
    780     BasicBlock * const segmentDone = kb->CreateBasicBlock("segmentDone");
    781 
    782     Value * const hasFullStride = numOfStrides ? kb->CreateICmpNE(numOfStrides, kb->getSize(0)) : kb->getTrue();
    783     kb->CreateCondBr(hasFullStride, doMultiBlock, temporaryBufferCheck);
    784 
    785     // We use temporary buffers in 3 different cases that preclude full stride processing.
    786 
    787     //  (a) One or more input buffers does not have a sufficient number of input items linearly available.
    788     //  (b) One or more output buffers does not have sufficient linearly available buffer space.
    789     //  (c) We have processed all the full strides of input and only the final block remains.
    790 
    791     kb->SetInsertPoint(temporaryBufferCheck);
    792 
    793     // Even if we copy the input data into a linear arrays, is there enough data to perform this stride?
    794     // If not, proceed only if this is our final block.
    795     Value * hasFullFragmentedStride = nullptr;
    796     for (unsigned i = 0; i < inputSetCount; i++) {
    797         const ProcessingRate & r = mStreamSetInputs[i].getRate();
    798         if (r.isBounded() || (r.isUnknown() && r.getLowerBound() > 0)) {
    799             const auto l = r.isBounded() ? r.getUpperBound() : r.getLowerBound();
    800             Constant * const strideSize = kb->getSize(l * mStride);
    801             Value * enoughAvail = kb->CreateICmpUGE(unprocessed[i], strideSize);
    802             if (hasFullFragmentedStride) {
    803                 hasFullFragmentedStride = kb->CreateAnd(hasFullFragmentedStride, enoughAvail);
    804             } else {
    805                 hasFullFragmentedStride = enoughAvail;
    806             }
    807         }
    808     }
    809 
    810     Value * hasFragmentedOrFinalStride = nullptr;
    811     if (hasFullFragmentedStride) {
    812         hasFragmentedOrFinalStride = kb->CreateOr(hasFullFragmentedStride, mIsFinal);
    813         // Although this might be the final segment, we may have a full fragmented stride to process prior
    814         // to the actual final stride.
    815         mIsFinal = kb->CreateAnd(mIsFinal, kb->CreateNot(hasFullFragmentedStride));
    816     } else {
    817         hasFragmentedOrFinalStride = mIsFinal;
    818     }
    819     kb->CreateCondBr(hasFragmentedOrFinalStride, copyToTemporaryBuffers, segmentDone);
    820 
    821     /// COPY TO TEMPORARY BUFFERS
    822     kb->SetInsertPoint(copyToTemporaryBuffers);
    823 
    824     kb->CreateAlignedStore(Constant::getNullValue(tempParameterStructType), tempBufferArea, kb->getCacheAlignment());
    825 
    826     // For each input and output buffer, copy over necessary data starting from the last block boundary.
    827 
    828     Value * temporaryInputBuffer[inputSetCount];
    829     Value * temporaryAvailable[inputSetCount];
    830 
    831     for (unsigned i = 0; i < inputSetCount; i++) {
    832         temporaryInputBuffer[i] = baseInputBuffer[i];
    833         if (readableStrides[i]) {
    834             const auto name = mStreamSetInputs[i].getName();
    835             const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    836             assert (rate.getUpperBound() > 0);
    837             Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride);
    838             temporaryAvailable[i] = kb->CreateUMin(unprocessed[i], maxStrideSize);
    839 
    840             BasicBlock * entry = kb->GetInsertBlock();
    841             BasicBlock * copy = kb->CreateBasicBlock(name + "Copy");
    842             BasicBlock * resume = kb->CreateBasicBlock(name + "ResumeCopy");
    843             Value * const test = kb->CreateOr(kb->CreateICmpNE(readableStrides[i], kb->getSize(0)), mIsFinal);
    844             kb->CreateCondBr(test, resume, copy);
    845 
    846             kb->SetInsertPoint(copy);
    847             Value * const tempBufferPtr = kb->CreateGEP(tempBufferArea, {kb->getInt32(0), kb->getInt32(i), kb->getInt32(0)});
    848             assert (tempBufferPtr->getType() == baseInputBuffer[i]->getType());
    849             Value * const neededItems = linearlyAvailable[i];
    850             Value * const bytesCopied = kb->copy(name, tempBufferPtr, baseInputBuffer[i], neededItems);
    851             Value * const nextInputPtr = kb->getRawInputPointer(name, kb->getSize(0));
    852             Value * const remaining = kb->CreateSub(temporaryAvailable[i], neededItems);
    853             Value * nextBufPtr = kb->CreatePointerCast(tempBufferPtr, kb->getInt8PtrTy());
    854             nextBufPtr = kb->CreateGEP(nextBufPtr, bytesCopied);
    855             kb->copy(name, nextBufPtr, nextInputPtr, remaining);
    856 
    857             kb->CreateBr(resume);
    858 
    859             kb->SetInsertPoint(resume);
    860             PHINode * bufferPtr = kb->CreatePHI(baseInputBuffer[i]->getType(), 2);
    861             bufferPtr->addIncoming(baseInputBuffer[i], entry);
    862             bufferPtr->addIncoming(tempBufferPtr, copy);
    863             temporaryInputBuffer[i] = bufferPtr;
    864         }
    865     }
    866 
    867     Value * temporaryOutputBuffer[outputSetCount];
    868     for (unsigned i = 0; i < outputSetCount; i++) {
    869         temporaryOutputBuffer[i] = baseOutputBuffer[i];
    870         if (writableStrides[i]) {
    871             const auto name = mStreamSetOutputs[i].getName();
    872 
    873             BasicBlock * const entry = kb->GetInsertBlock();
    874             BasicBlock * const copy = kb->CreateBasicBlock(name + "Copy");
    875             BasicBlock * const resume = kb->CreateBasicBlock(name + "ResumeCopy");
    876 
    877             Value * const test = kb->CreateOr(kb->CreateICmpNE(writableStrides[i], kb->getSize(0)), mIsFinal);
    878             kb->CreateCondBr(test, resume, copy);
    879 
    880             kb->SetInsertPoint(copy);
    881             Value * const tempBufferPtr = kb->CreateGEP(tempBufferArea,  {kb->getInt32(0), kb->getInt32(inputSetCount + i), kb->getInt32(0)});
    882             assert (tempBufferPtr->getType() == baseOutputBuffer[i]->getType());
    883             Value * const itemsToCopy = kb->CreateAnd(producedItemCount[i], kb->getSize(kb->getBitBlockWidth() - 1));
    884             kb->copy(name, tempBufferPtr, baseOutputBuffer[i], itemsToCopy);
    885             kb->CreateBr(resume);
    886 
    887             kb->SetInsertPoint(resume);
    888             PHINode * bufferPtr = kb->CreatePHI(tempBufferPtr->getType(), 2);
    889             bufferPtr->addIncoming(baseOutputBuffer[i], entry);
    890             bufferPtr->addIncoming(tempBufferPtr, copy);
    891             temporaryOutputBuffer[i] = bufferPtr;
    892         }
    893     }
    894 
    895     kb->CreateBr(doMultiBlock);
    896     BasicBlock * const usingTemporaryBuffers = kb->GetInsertBlock();
    897     doMultiBlock->moveAfter(usingTemporaryBuffers);
    898 
    899     /// DO MULTI BLOCK
    900 
    901     //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
    902     //  Now prepare the doMultiBlock call.
    903     kb->SetInsertPoint(doMultiBlock);
    904 
    905     PHINode * const isFinal = kb->CreatePHI(mIsFinal->getType(), 2);
    906     isFinal->addIncoming(kb->getFalse(), doSegmentLoop);
    907     isFinal->addIncoming(mIsFinal, usingTemporaryBuffers);
    908     mIsFinal = isFinal;
    909 
    910     mStreamSetInputBufferPtr.resize(inputSetCount);
    911     for (unsigned i = 0; i < inputSetCount; ++i) {
    912         assert (baseInputBuffer[i] && temporaryInputBuffer[i]);
    913         if (baseInputBuffer[i] != temporaryInputBuffer[i]) {
    914             PHINode * const avail = kb->CreatePHI(kb->getSizeTy(), 2);
    915             avail->addIncoming(mAvailableItemCount[i], doSegmentLoop);
    916             avail->addIncoming(temporaryAvailable[i], usingTemporaryBuffers);
    917             mAvailableItemCount[i] = avail;
    918             PHINode * const bufferPtr = kb->CreatePHI(baseInputBuffer[i]->getType(), 2);
    919             bufferPtr->addIncoming(baseInputBuffer[i], doSegmentLoop);
    920             assert (baseInputBuffer[i]->getType() == temporaryInputBuffer[i]->getType());
    921             bufferPtr->addIncoming(temporaryInputBuffer[i], usingTemporaryBuffers);
    922             temporaryInputBuffer[i] = bufferPtr;
    923         }
    924         mStreamSetInputBufferPtr[i] = temporaryInputBuffer[i];
    925     }
    926 
    927     mStreamSetOutputBufferPtr.resize(outputSetCount);
    928     for (unsigned i = 0; i < outputSetCount; ++i) {
    929         assert (baseOutputBuffer[i] && temporaryOutputBuffer[i]);
    930         if (baseOutputBuffer[i] != temporaryOutputBuffer[i]) {
    931             PHINode * const bufferPtr = kb->CreatePHI(baseOutputBuffer[i]->getType(), 2);
    932             bufferPtr->addIncoming(baseOutputBuffer[i], doSegmentLoop);
    933             assert (baseOutputBuffer[i]->getType() == temporaryOutputBuffer[i]->getType());
    934             bufferPtr->addIncoming(temporaryOutputBuffer[i], usingTemporaryBuffers);
    935             temporaryOutputBuffer[i] = bufferPtr;
    936         }
    937         mStreamSetOutputBufferPtr[i] = temporaryOutputBuffer[i];
    938     }
    939 
    940     // Now use the generateMultiBlockLogic method of the MultiBlockKernelBuilder subtype to
    941     // provide the required multi-block kernel logic.
    942     generateMultiBlockLogic(kb, numOfStrides);
    943 
    944     // If we have no fixed rate inputs, we won't know when we're done parsing until we test
    945     // whether any input data was processed.
    946     bool mayMakeNoProgress = true;
    947 
    948     // Update the processed item count of any Fixed input or output stream. While doing so, also
    949     // calculate the LCM of their rates. The LCM is used to calculate the final item counts.
    950 
    951     unsigned rateLCM = 1;
     859    //  We have one or more blocks of input data and output buffer space for all stream sets.
     860    generateMultiBlockLogic(b, numOfStrides);
    952861
    953862    for (unsigned i = 0; i < inputSetCount; ++i) {
    954863        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    955         if (rate.isFixed()) {
    956             mayMakeNoProgress = false;
    957             rateLCM = lcm(rateLCM, rate.getRate());
    958             Value * const processed = mAvailableItemCount[i]; // kb->CreateMul(numOfStrides, kb->getSize(mStride * rate.getRate()));
    959             Value * const ic = kb->CreateAdd(processedItemCount[i], processed);
    960             kb->setProcessedItemCount(mStreamSetInputs[i].getName(), ic);
     864        if (rate.isFixed() && mStreamSetInputs[i].notDeferred()) {
     865            Value * const ic = b->CreateAdd(mInitialProcessedItemCount[i], mAvailableItemCount[i]);
     866            b->setProcessedItemCount(mStreamSetInputs[i].getName(), ic);
    961867        }
    962868    }
     
    965871        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    966872        if (rate.isFixed()) {
    967             rateLCM = lcm(rateLCM, rate.getRate());
    968             Value * const produced = kb->CreateMul(numOfStrides, kb->getSize(mStride * rate.getRate()));
    969             Value * const ic = kb->CreateAdd(producedItemCount[i], produced);
    970             kb->setProducedItemCount(mStreamSetOutputs[i].getName(), ic);
    971         }
    972     }
    973 
    974     BasicBlock * const finalStrideCheck = kb->CreateBasicBlock("finalStrideCheck");
    975     BasicBlock * const finalStrideAdjustment = kb->CreateBasicBlock("finalStrideAdjustment");
    976     BasicBlock * const standardCopyBack = kb->CreateBasicBlock("standardCopyBack");
    977     BasicBlock * const temporaryBufferCopyBack = kb->CreateBasicBlock("temporaryBufferCopyBack");
    978 
    979     kb->CreateLikelyCondBr(hasFullStride, standardCopyBack, finalStrideCheck);
    980 
    981 
    982     /// FINAL STRIDE CHECK
    983     kb->SetInsertPoint(finalStrideCheck);
    984     kb->CreateUnlikelyCondBr(mIsFinal, finalStrideAdjustment, temporaryBufferCopyBack);
     873            assert (mStreamSetOutputs[i].notDeferred());
     874            Value * const produced = b->CreateMul(numOfStrides, outputStrideSize[i]);
     875            Value * const ic = b->CreateAdd(mInitialProducedItemCount[i], produced);
     876            b->setProducedItemCount(mStreamSetOutputs[i].getName(), ic);
     877        }
     878    }
     879
     880    BasicBlock * const handleFinalBlock = b->CreateBasicBlock("HandleFinalBlock");
     881    BasicBlock * const temporaryBufferCopyBack = b->CreateBasicBlock("TemporaryBufferCopyBack");
     882    BasicBlock * const strideDone = b->CreateBasicBlock("MultiBlockDone");
     883
     884    b->CreateLikelyCondBr(b->CreateICmpNE(numOfStrides, ZERO), temporaryBufferCopyBack, handleFinalBlock);
     885
    985886
    986887    /// FINAL STRIDE ADJUSTMENT
    987     kb->SetInsertPoint(finalStrideAdjustment);
     888    b->SetInsertPoint(handleFinalBlock);
    988889
    989890    // If this is our final stride, adjust the Fixed output item counts. The main loop assumes that
     
    991892    // to calculate them based on the actual input item counts.
    992893
    993     // NOTE: This appears overly complex to avoid an integer overflow without reducing the maximum
    994     // integer size. For each Fixed output stream, this calculates:
    995 
    996     //       CEILING(MIN(Total Available Item Count / Fixed Input Rate) * Fixed Output Rate)
    997 
    998     Value * basePreviouslyProcessedItemCount = nullptr;
    999     Value * scaledInverseOfStrideItemCount = nullptr;
    1000 
     894    reviseFinalProducedItemCounts(b);
     895
     896    b->CreateBr(temporaryBufferCopyBack);
     897
     898    /// TEMPORARY BUFFER COPY BACK
     899    b->SetInsertPoint(temporaryBufferCopyBack);
     900
     901    // Copy back data to the actual output buffers.
     902    for (unsigned i = 0; i < outputSetCount; i++) {
     903        Value * const tempBuffer = temporaryOutputBuffer[i];
     904        if (LLVM_UNLIKELY(tempBuffer == nullptr)) {
     905            continue;
     906        }
     907        Value * const baseBuffer = baseOutputBuffer[i];
     908        assert ("stack overflow" && (tempBuffer->getType() == baseBuffer->getType()));
     909        const auto & name = mStreamSetOutputs[i].getName();
     910        BasicBlock * const copyToBack = b->CreateBasicBlock(name + "CopyToBack");
     911        BasicBlock * const copyToFront = b->CreateBasicBlock(name + "CopyToFront");
     912        BasicBlock * const resume = b->CreateBasicBlock(name + "ResumeCopyBack");
     913        // If we used a temporary buffer, copy it back to the original output buffer
     914        b->CreateCondBr(b->CreateICmpEQ(tempBuffer, baseBuffer), copyToBack, resume);
     915
     916        b->SetInsertPoint(copyToBack);       
     917        Value * const offset = b->CreateAnd(mInitialProducedItemCount[i], BLOCK_WIDTH_MASK);
     918        Value * const newProducedItemCount = b->getProducedItemCount(name);
     919        Value * const newlyProduced = b->CreateSub(newProducedItemCount, mInitialProducedItemCount[i]);
     920        Value * const toWrite = b->CreateUMin(newlyProduced, linearlyWritable[i]);
     921        const auto alignment = getItemAlignment(mStreamSetOutputs[i]);
     922        b->CreateStreamCpy(name, baseBuffer, offset, tempBuffer, ZERO, toWrite, alignment);
     923        // If we required a temporary output buffer, we will probably need to write to the beginning of the buffer as well.
     924        b->CreateLikelyCondBr(b->CreateICmpULT(toWrite, newlyProduced), copyToFront, resume);
     925
     926        b->SetInsertPoint(copyToFront);
     927        Value * const remaining = b->CreateSub(newlyProduced, toWrite);
     928        Value * const baseAddress = b->getBaseAddress(name);
     929        b->CreateStreamCpy(name, baseAddress, ZERO, tempBuffer, toWrite, remaining, alignment);
     930        b->CreateBr(resume);
     931
     932        b->SetInsertPoint(resume);
     933    }
     934
     935    strideDone->moveAfter(b->GetInsertBlock());
     936
     937    BasicBlock * const segmentDone = b->CreateBasicBlock("SegmentDone");
     938    //  We've dealt with the partial block processing and copied information back into the
     939    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
     940    if (hasNoTerminateAttribute()) {
     941        b->CreateCondBr(mIsFinal, segmentDone, strideDone);
     942    } else {
     943        BasicBlock * const setTermination = b->CreateBasicBlock("setTermination");
     944        b->CreateCondBr(mIsFinal, setTermination, strideDone);
     945
     946        b->SetInsertPoint(setTermination);
     947        b->setTerminationSignal();
     948        b->CreateBr(segmentDone);       
     949    }
     950
     951    /// STRIDE DONE
     952    b->SetInsertPoint(strideDone);
     953
     954    // do we have enough data for another stride?
     955    Value * pendingStrides = nullptr;
    1001956    for (unsigned i = 0; i < inputSetCount; ++i) {
    1002         const ProcessingRate & r = mStreamSetInputs[i].getRate();
    1003         if (r.isFixed()) {
    1004             assert (rateLCM % r.getRate() == 0);
    1005             Value * const a = kb->CreateMul(mAvailableItemCount[i], kb->getSize(rateLCM / r.getRate())); // unprocessed
    1006             Value * const p = kb->CreateUDiv(processedItemCount[i], kb->getSize(r.getRate()));
    1007             if (scaledInverseOfStrideItemCount) {
    1008                 scaledInverseOfStrideItemCount = kb->CreateUMin(scaledInverseOfStrideItemCount, a);
    1009                 basePreviouslyProcessedItemCount = kb->CreateUMin(basePreviouslyProcessedItemCount, p);
     957        Value * const processed = b->getProcessedItemCount(mStreamSetInputs[i].getName());
     958        Value * const remaining = b->CreateSub(mInitialAvailableItemCount[i], processed);
     959        Value * const remainingStrides = b->CreateUDiv(remaining, inputStrideSize[i]);
     960        pendingStrides = b->CreateUMin(pendingStrides, remainingStrides);
     961    }
     962
     963    // do we have enough room for another stride?
     964    for (unsigned i = 0; i < outputSetCount; ++i) {
     965        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     966        const auto & name = mStreamSetOutputs[i].getName();
     967        Value * const newProduced = b->getProducedItemCount(name);
     968        // If this output has a Fixed/Bounded rate, determine whether we have room for another stride.
     969        if (LLVM_LIKELY(outputStrideSize[i] != nullptr)) {
     970            Value * const unconsumed = b->CreateSub(newProduced, b->getConsumedItemCount(name));
     971            Value * const remaining = b->CreateSub(b->getCapacity(name), unconsumed);
     972            Value * const remainingStrides = b->CreateUDiv(remaining, outputStrideSize[i]);
     973            pendingStrides = b->CreateUMin(pendingStrides, remainingStrides);
     974        }
     975        // Do copybacks if necessary.
     976        if (mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate)) {
     977            b->CreateCopyBack(name, mInitialProducedItemCount[i], newProduced);
     978        }
     979    }
     980
     981    Value * const hasMoreStrides = b->CreateOr(b->CreateICmpNE(pendingStrides, ZERO), initiallyFinal);
     982    b->CreateCondBr(hasMoreStrides, segmentLoop, segmentDone);
     983
     984    /// SEGMENT DONE
     985    segmentDone->moveAfter(b->GetInsertBlock());
     986    b->SetInsertPoint(segmentDone);
     987
     988}
     989
     990/** ------------------------------------------------------------------------------------------------------------- *
     991 * @brief requiresCopyBack
     992 ** ------------------------------------------------------------------------------------------------------------- */
     993bool MultiBlockKernel::requiresCopyBack(const ProcessingRate & rate) const {
     994    if (rate.isBounded() || rate.isUnknown()) {
     995        return true;
     996    } else if (rate.isRelative()) {
     997        return requiresCopyBack(getBinding(rate.getReference()).getRate());
     998    }
     999    return false;
     1000}
     1001
     1002/** ------------------------------------------------------------------------------------------------------------- *
     1003 * @brief CreateUDivCeil
     1004 ** ------------------------------------------------------------------------------------------------------------- */
     1005inline Value * CreateUDivCeil(const std::unique_ptr<KernelBuilder> & b, Value * const number, const ProcessingRate::RateValue divisor, const Twine & Name = "") {
     1006    Constant * const n = ConstantInt::get(number->getType(), divisor.numerator());
     1007    if (LLVM_LIKELY(divisor.denominator() == 1)) {
     1008        return b->CreateUDivCeil(number, n, Name);
     1009    } else {
     1010        //   ⌊(num + ratio - 1) / ratio⌋
     1011        // = ⌊(num - 1) / (n/d)⌋ + (ratio/ratio)
     1012        // = ⌊(d * (num - 1)) / n⌋ + 1
     1013        Constant * const ONE = ConstantInt::get(number->getType(), 1);
     1014        Constant * const d = ConstantInt::get(number->getType(), divisor.denominator());
     1015        return b->CreateAdd(b->CreateUDiv(b->CreateMul(b->CreateSub(number, ONE), d), n), ONE, Name);
     1016    }
     1017}
     1018
     1019
     1020/** ------------------------------------------------------------------------------------------------------------- *
     1021 * @brief reviseFinalProducedItemCounts
     1022 ** ------------------------------------------------------------------------------------------------------------- */
     1023void MultiBlockKernel::reviseFinalProducedItemCounts(const std::unique_ptr<KernelBuilder> & b) {
     1024
     1025    if (LLVM_UNLIKELY(mStreamSetInputs.empty())) {
     1026        return;
     1027    }
     1028
     1029    const auto inputSetCount = mStreamSetInputs.size();
     1030
     1031    ProcessingRate::RateValue rateLCM(1);
     1032    unsigned first = 0;
     1033    unsigned last = inputSetCount;
     1034
     1035    for (unsigned i = 0; i < inputSetCount; ++i) {
     1036        const ProcessingRate & pr = mStreamSetInputs[i].getRate();
     1037        if (pr.isFixed()) {
     1038            rateLCM = lcm(rateLCM, pr.getRate());
     1039            if (mStreamSetInputs[i].isPrincipal()) {
     1040                assert ("A kernel cannot have multiple principle input streams" && (first == 0 && last == inputSetCount));
     1041                first = i;
     1042                last = i + 1;
     1043            }
     1044        }       
     1045    }
     1046
     1047    bool noFixedRateOutput = true;
     1048
     1049    for (const Binding & output : mStreamSetOutputs) {
     1050        const ProcessingRate & pr = output.getRate();
     1051        if (pr.isFixed()) {
     1052            rateLCM = lcm(rateLCM, pr.getRate());
     1053            noFixedRateOutput = false;
     1054        }
     1055    }
     1056
     1057    if (noFixedRateOutput) {
     1058        return;
     1059    }
     1060
     1061    Value * baseInitialProcessedItemCount = nullptr;
     1062    Value * scaledInverseOfAvailItemCount = nullptr;
     1063
     1064    // For each Fixed output stream, this calculates:
     1065
     1066    //    CEILING(MIN(Available Item Count / Fixed Input Rate) * Fixed Output Rate)
     1067
     1068    // But avoids the possibility of overflow errors (assuming that each processed item count does not overflow)
     1069
     1070    for (unsigned i = first; i < last; ++i) {
     1071        const ProcessingRate & pr = mStreamSetInputs[i].getRate();
     1072        if (pr.isFixed()) {
     1073            Value * p = mInitialProcessedItemCount[i];
     1074            Value * a = b->CreateSub(mInitialAvailableItemCount[i], p);
     1075            const auto & rate = pr.getRate();
     1076            if (LLVM_UNLIKELY(rateLCM != rate)) {
     1077                const auto factor = rateLCM / rate;
     1078                if (LLVM_UNLIKELY(factor.numerator() > 1)) {
     1079                    a = b->CreateMul(a, b->getSize(factor.numerator()));
     1080                }
     1081                if (LLVM_UNLIKELY(factor.denominator() > 1)) {
     1082                    a = b->CreateUDiv(a, b->getSize(factor.denominator()));
     1083                }
     1084            }
     1085            if (LLVM_UNLIKELY(rate.denominator() > 1)) {
     1086                p = b->CreateMul(p, b->getSize(rate.denominator()));
     1087            }
     1088            if (LLVM_UNLIKELY(rate.numerator() > 1)) {
     1089                p = b->CreateUDiv(p, b->getSize(rate.numerator()));
     1090            }
     1091            if (scaledInverseOfAvailItemCount) {
     1092                scaledInverseOfAvailItemCount = b->CreateUMin(scaledInverseOfAvailItemCount, a);
     1093                baseInitialProcessedItemCount = b->CreateUMin(baseInitialProcessedItemCount, p);
    10101094            } else {
    1011                 scaledInverseOfStrideItemCount = a;
    1012                 basePreviouslyProcessedItemCount = p;
     1095                scaledInverseOfAvailItemCount = a;
     1096                baseInitialProcessedItemCount = p;
    10131097            }
    10141098        }
    1015 //        const auto name = mStreamSetInputs[i].getName();
    1016 //        Value * const processed = kb->CreateAdd(processedItemCount[i], unprocessed[i]);
    1017 //        kb->setProcessedItemCount(name, processed);
    1018     }
    1019 
    1020     for (unsigned i = 0; i < outputSetCount; ++i) {
    1021         const auto name = mStreamSetOutputs[i].getName();
    1022         const ProcessingRate & r = mStreamSetOutputs[i].getRate();
     1099    }
     1100
     1101    for (const Binding & output : mStreamSetOutputs) {
     1102        const auto name = output.getName();
     1103        const ProcessingRate & pr = output.getRate();
    10231104        Value * produced = nullptr;
    1024         if (r.isFixed()) {
    1025             assert (rateLCM % r.getRate() == 0);
    1026             assert (basePreviouslyProcessedItemCount && scaledInverseOfStrideItemCount);
    1027             Value * const p = kb->CreateMul(basePreviouslyProcessedItemCount, kb->getSize(r.getRate()));
    1028             Value * const ic = kb->CreateUDivCeil(scaledInverseOfStrideItemCount, kb->getSize(rateLCM / r.getRate()));
    1029             produced = kb->CreateAdd(p, ic);
     1105        if (pr.isFixed() && output.notDeferred()) {
     1106            assert (baseInitialProcessedItemCount && scaledInverseOfAvailItemCount);
     1107            const auto rate = pr.getRate();
     1108            Value * p = baseInitialProcessedItemCount;
     1109            if (LLVM_UNLIKELY(rate.numerator() != 1)) {
     1110                p = b->CreateMul(p, b->getSize(rate.numerator()));
     1111            }
     1112            if (LLVM_UNLIKELY(rate.denominator() != 1)) {
     1113                p = b->CreateUDiv(p, b->getSize(rate.denominator()));
     1114            }
     1115            Value * const ic = CreateUDivCeil(b, scaledInverseOfAvailItemCount, rateLCM / pr.getRate());
     1116            produced = b->CreateAdd(p, ic);
    10301117        } else { // check if we have an attribute; if so, get the current produced count and adjust it
    10311118            bool noAttributes = true;
    1032             for (const Attribute & attr : mStreamSetOutputs[i].getAttributes()) {
     1119            for (const Attribute & attr : output.getAttributes()) {
    10331120                if (attr.isAdd() || attr.isRoundUpTo()) {
    10341121                    noAttributes = false;
     
    10391126                continue;
    10401127            }
    1041             produced = kb->getProducedItemCount(name);
    1042         }
    1043         for (const Attribute & attr : mStreamSetOutputs[i].getAttributes()) {
     1128            produced = b->getProducedItemCount(name);
     1129        }
     1130        for (const Attribute & attr : output.getAttributes()) {
    10441131            if (attr.isAdd()) {
    1045                 produced = kb->CreateAdd(produced, kb->getSize(attr.getAmount()));
     1132                produced = b->CreateAdd(produced, b->getSize(attr.getAmount()));
    10461133            } else if (attr.isRoundUpTo()) {
    1047                 produced = kb->CreateRoundUp(produced, kb->getSize(attr.getAmount()));
     1134                produced = b->CreateRoundUp(produced, b->getSize(attr.getAmount()));
    10481135            }
    10491136        }
    1050         kb->setProducedItemCount(name, produced);
    1051     }
    1052 
    1053     kb->CreateBr(temporaryBufferCopyBack);
    1054 
    1055     /// TEMPORARY BUFFER COPY BACK
    1056     kb->SetInsertPoint(temporaryBufferCopyBack);
    1057 
    1058     // Copy back data to the actual output buffers.
    1059     for (unsigned i = 0; i < outputSetCount; i++) {
    1060 
    1061         if (baseOutputBuffer[i] != temporaryOutputBuffer[i]) {
    1062 
    1063             const auto name = mStreamSetOutputs[i].getName();
    1064 
    1065             BasicBlock * const copy = kb->CreateBasicBlock(name + "CopyBack");
    1066             BasicBlock * const resume = kb->CreateBasicBlock(name + "ResumeCopyBack");
    1067             Value * const usedTemporary = kb->CreateICmpNE(temporaryOutputBuffer[i], baseOutputBuffer[i]);
    1068 
    1069             // If we used a temporary buffer ...
    1070             kb->CreateCondBr(usedTemporary, copy, resume);
    1071 
    1072             kb->SetInsertPoint(copy);
    1073             Value * bytesCopied = kb->copy(name, baseOutputBuffer[i], temporaryOutputBuffer[i], linearlyWritable[i]);
    1074             Value * nextOutputPtr = kb->getRawOutputPointer(name, kb->getSize(0));
    1075             Value * producedCount = kb->getProducedItemCount(name);
    1076 
    1077             Value * remaining = kb->CreateSub(producedCount, linearlyWritable[i]);
    1078             Value * nextBufPtr = kb->CreatePointerCast(temporaryOutputBuffer[i], kb->getInt8PtrTy());
    1079             nextBufPtr = kb->CreateGEP(nextBufPtr, bytesCopied);
    1080 
    1081             kb->copy(name, nextOutputPtr, nextBufPtr, remaining);
    1082             kb->CreateBr(resume);
    1083 
    1084             kb->SetInsertPoint(resume);
    1085         }
    1086     }
    1087 
    1088     //  We've dealt with the partial block processing and copied information back into the
    1089     //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
    1090     BasicBlock * setTermination = nullptr;
    1091     if (hasNoTerminateAttribute()) {
    1092         kb->CreateCondBr(mIsFinal, segmentDone, standardCopyBack);
    1093     } else {
    1094         setTermination = kb->CreateBasicBlock("setTermination");
    1095         kb->CreateCondBr(mIsFinal, setTermination, standardCopyBack);
    1096     }
    1097 
    1098     /// STANDARD COPY BACK
    1099     kb->SetInsertPoint(standardCopyBack);
    1100 
    1101     // Do copybacks if necessary.
    1102     for (unsigned i = 0; i < outputSetCount; i++) {
    1103         if (mStreamSetOutputBuffers[i]->supportsCopyBack()) {
    1104             const auto name = mStreamSetOutputs[i].getName();
    1105             Value * newProduced = kb->getProducedItemCount(name);
    1106             kb->CreateCopyBack(name, producedItemCount[i], newProduced);
    1107         }
    1108     }
    1109 
    1110     // If it is possible to make no progress, verify we processed some of the input. If we haven't,
    1111     // we're finished this segment.
    1112     if (mayMakeNoProgress) {
    1113         Value * madeProgress = nullptr;
    1114         for (unsigned i = 0; i < inputSetCount; ++i) {
    1115             Value * const processed = kb->getProcessedItemCount(mStreamSetInputs[i].getName());
    1116             Value * const progress = kb->CreateICmpNE(processed, processedItemCount[i]);
    1117             if (madeProgress) {
    1118                 madeProgress = kb->CreateOr(madeProgress, progress);
    1119             } else {
    1120                 madeProgress = progress;
    1121             }
    1122         }
    1123         assert (madeProgress);
    1124         kb->CreateCondBr(madeProgress, doSegmentLoop, segmentDone);
    1125     } else {
    1126         kb->CreateBr(doSegmentLoop);
    1127     }
    1128 
    1129     if (hasNoTerminateAttribute()) {
    1130         segmentDone->moveAfter(kb->GetInsertBlock());
    1131     } else {
    1132         /// SET TERMINATION
    1133         setTermination->moveAfter(kb->GetInsertBlock());
    1134         kb->SetInsertPoint(setTermination);
    1135         kb->setTerminationSignal();
    1136         kb->CreateBr(segmentDone);
    1137         segmentDone->moveAfter(setTermination);
    1138     }
    1139 
    1140     kb->SetInsertPoint(segmentDone);
    1141 
    1142 }
    1143 
    1144 //bool MultiBlockKernel::requiresCopyBack(const ProcessingRate & rate) const {
    1145 //    if (rate.isBounded() || rate.isUnknown()) {
    1146 //        return true;
    1147 //    } else if (rate.isDirectlyRelative()) {
    1148 //        Port port; unsigned i;
    1149 //        std::tie(port, i) = getStreamPort(rate.getReference());
    1150 //        const auto & binding = (port == Port::Input) ? mStreamSetInputs[i] : mStreamSetOutputs[i];
    1151 //        return requiresCopyBack(binding.getRate());
    1152 //    }
    1153 //    return false;
    1154 //}
    1155 
    1156 //  The default doSegment method dispatches to the doBlock routine for
    1157 //  each block of the given number of blocksToDo, and then updates counts.
    1158 
    1159 void BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * const numOfStrides) {
    1160 
    1161     BasicBlock * const entryBlock = idb->GetInsertBlock();
    1162     BasicBlock * const strideLoopCond = idb->CreateBasicBlock(getName() + "_strideLoopCond");
    1163     mStrideLoopBody = idb->CreateBasicBlock(getName() + "_strideLoopBody");
    1164     BasicBlock * const stridesDone = idb->CreateBasicBlock(getName() + "_stridesDone");
    1165     BasicBlock * const doFinalBlock = idb->CreateBasicBlock(getName() + "_doFinalBlock");
    1166     BasicBlock * const segmentDone = idb->CreateBasicBlock(getName() + "_segmentDone");
    1167 
    1168     Value * baseTarget = nullptr;
    1169     if (idb->supportsIndirectBr()) {
    1170         baseTarget = idb->CreateSelect(mIsFinal, BlockAddress::get(doFinalBlock), BlockAddress::get(segmentDone));
    1171     }
    1172 
    1173     Constant * const log2BlockSize = idb->getSize(std::log2(idb->getBitBlockWidth()));
    1174 
     1137        b->setProducedItemCount(name, produced);
     1138    }
     1139
     1140}
     1141
     1142/** ------------------------------------------------------------------------------------------------------------- *
     1143 * @brief generateMultiBlockLogic
     1144 ** ------------------------------------------------------------------------------------------------------------- */
     1145Value * BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
     1146
     1147    if (LLVM_UNLIKELY(mStride != b->getBitBlockWidth())) {
     1148        report_fatal_error(getName() + ": the Stride (" + std::to_string(mStride) + ") of BlockOrientedKernel "
     1149                           "equal to the BitBlockWidth (" + std::to_string(b->getBitBlockWidth()) + ")");
     1150    }
     1151
     1152    Constant * const LOG_2_BLOCK_WIDTH = b->getSize(std::log2(b->getBitBlockWidth()));
     1153
     1154    BasicBlock * const entryBlock = b->GetInsertBlock();
     1155    mStrideLoopBody = b->CreateBasicBlock(getName() + "_strideLoopBody");
     1156    BasicBlock * const stridesDone = b->CreateBasicBlock(getName() + "_stridesDone");
     1157    BasicBlock * const doFinalBlock = b->CreateBasicBlock(getName() + "_doFinalBlock");
     1158    BasicBlock * const segmentDone = b->CreateBasicBlock(getName() + "_segmentDone");
     1159    b->CreateAssert(b->CreateXor(b->CreateIsNotNull(numOfBlocks), mIsFinal),
     1160                    "numOfStrides cannot be 0 unless this is the final stride and must be 0 if it is");
    11751161    const auto inputSetCount = mStreamSetInputs.size();
    11761162    Value * baseProcessedIndex[inputSetCount];
    1177     for (unsigned i = 0; i < inputSetCount; ++i) {
     1163    Value * baseInputAddress[inputSetCount];
     1164    for (unsigned i = 0; i < inputSetCount; i++) {
    11781165        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    1179         if (rate.isFixed()) {
    1180             baseProcessedIndex[i] = nullptr;
    1181         } else {
    1182             Value * ic = idb->getProcessedItemCount(mStreamSetInputs[i].getName());
    1183             ic = idb->CreateLShr(ic, log2BlockSize);
    1184             baseProcessedIndex[i] = ic;
    1185         }
     1166        if (LLVM_UNLIKELY(!rate.isFixed())) {
     1167            Value * const ic = mInitialProcessedItemCount[i];
     1168            baseProcessedIndex[i] = b->CreateLShr(ic, LOG_2_BLOCK_WIDTH);
     1169        }
     1170        baseInputAddress[i] = mStreamSetInputBaseAddress[i];
    11861171    }
    11871172
    11881173    const auto outputSetCount = mStreamSetOutputs.size();
    11891174    Value * baseProducedIndex[outputSetCount];
     1175    Value * baseOutputAddress[inputSetCount];
     1176    for (unsigned i = 0; i < outputSetCount; i++) {
     1177        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
     1178        if (LLVM_UNLIKELY(!rate.isFixed())) {
     1179            Value * const ic = b->getProducedItemCount(mStreamSetOutputs[i].getName());
     1180            baseProducedIndex[i] = b->CreateLShr(ic, LOG_2_BLOCK_WIDTH);
     1181        }
     1182        baseOutputAddress[i] = mStreamSetOutputBaseAddress[i];
     1183    }
     1184
     1185    b->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, mStrideLoopBody);
     1186
     1187    /// BLOCK BODY
     1188
     1189    b->SetInsertPoint(mStrideLoopBody);
     1190
     1191    if (b->supportsIndirectBr()) {
     1192        Value * const baseTarget = BlockAddress::get(segmentDone);
     1193        mStrideLoopTarget = b->CreatePHI(baseTarget->getType(), 2, "strideTarget");
     1194        mStrideLoopTarget->addIncoming(baseTarget, entryBlock);
     1195    }
     1196
     1197    mStrideBlockIndex = b->CreatePHI(b->getSizeTy(), 2);
     1198    mStrideBlockIndex->addIncoming(b->getSize(0), entryBlock);
     1199
     1200    /// GENERATE DO BLOCK METHOD
     1201
     1202    for (unsigned i = 0; i < inputSetCount; ++i) {
     1203        Value * index = mStrideBlockIndex;
     1204        const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     1205        if (LLVM_UNLIKELY(!rate.isFixed())) {
     1206            Value * ic = b->getProcessedItemCount(mStreamSetInputs[i].getName());
     1207            index = b->CreateSub(b->CreateLShr(ic, LOG_2_BLOCK_WIDTH), baseProcessedIndex[i]);
     1208        }
     1209        mStreamSetInputBaseAddress[i] = b->CreateGEP(mStreamSetInputBaseAddress[i], index);
     1210    }
     1211
    11901212    for (unsigned i = 0; i < outputSetCount; ++i) {
     1213        Value * index = mStrideBlockIndex;
    11911214        const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    1192         if (rate.isFixed()) {
    1193             baseProducedIndex[i] = nullptr;
    1194         } else {
    1195             Value * ic = idb->getProducedItemCount(mStreamSetOutputs[i].getName());
    1196             ic = idb->CreateLShr(ic, log2BlockSize);
    1197             baseProducedIndex[i] = ic;
    1198         }
    1199     }
    1200 
    1201     Value * const numOfBlocksToProcess = idb->CreateMul(numOfStrides, idb->getSize(mStride / idb->getBitBlockWidth()));
    1202 
    1203     idb->CreateBr(strideLoopCond);
    1204 
    1205     /// BLOCK COND
    1206 
    1207     idb->SetInsertPoint(strideLoopCond);
    1208 
    1209     PHINode * branchTarget = nullptr;
    1210     if (baseTarget) {
    1211         branchTarget = idb->CreatePHI(baseTarget->getType(), 2, "branchTarget");
    1212         branchTarget->addIncoming(baseTarget, entryBlock);
    1213     }
    1214 
    1215     PHINode * const blockIndex = idb->CreatePHI(idb->getSizeTy(), 2, "index");
    1216     blockIndex->addIncoming(idb->getSize(0), entryBlock);
    1217 
    1218     for (unsigned i = 0; i < inputSetCount; ++i) {
    1219         Value * offset = blockIndex;
    1220         if (baseProcessedIndex[i]) {
    1221             offset = idb->getProcessedItemCount(mStreamSetInputs[i].getName());
    1222             offset = idb->CreateLShr(offset, log2BlockSize);
    1223             offset = idb->CreateSub(offset, baseProcessedIndex[i]);
    1224         }
    1225         mStreamSetInputBufferPtr[i] = idb->CreateGEP(mStreamSetInputBufferPtr[i], offset);
    1226     }
    1227 
    1228     for (unsigned i = 0; i < outputSetCount; ++i) {
    1229         Value * offset = blockIndex;
    1230         if (baseProducedIndex[i]) {
    1231             offset = idb->getProducedItemCount(mStreamSetOutputs[i].getName());
    1232             offset = idb->CreateLShr(offset, log2BlockSize);
    1233             offset = idb->CreateSub(offset, baseProducedIndex[i]);
    1234         }
    1235         mStreamSetOutputBufferPtr[i] = idb->CreateGEP(mStreamSetOutputBufferPtr[i], offset);
    1236     }
    1237 
    1238     Value * const notDone = idb->CreateICmpULT(blockIndex, numOfBlocksToProcess);
    1239     idb->CreateLikelyCondBr(notDone, mStrideLoopBody, stridesDone);
    1240 
    1241     /// BLOCK BODY
    1242 
    1243     idb->SetInsertPoint(mStrideLoopBody);
    1244 
    1245     if (idb->supportsIndirectBr()) {
    1246         mStrideLoopTarget = idb->CreatePHI(baseTarget->getType(), 2, "strideTarget");
    1247         mStrideLoopTarget->addIncoming(branchTarget, strideLoopCond);
    1248     }
    1249 
    1250     /// GENERATE DO BLOCK METHOD
    1251 
    1252     writeDoBlockMethod(idb);
    1253 
    1254     BasicBlock * const bodyEnd = idb->GetInsertBlock();
    1255     blockIndex->addIncoming(idb->CreateAdd(blockIndex, idb->getSize(1)), bodyEnd);
    1256     if (branchTarget) {
    1257         branchTarget->addIncoming(mStrideLoopTarget, bodyEnd);
    1258     }
    1259     idb->CreateBr(strideLoopCond);
     1215        if (LLVM_UNLIKELY(!rate.isFixed())) {
     1216            Value * ic = b->getProducedItemCount(mStreamSetOutputs[i].getName());
     1217            index = b->CreateSub(b->CreateLShr(ic, LOG_2_BLOCK_WIDTH), baseProducedIndex[i]);
     1218        }
     1219        mStreamSetOutputBaseAddress[i] = b->CreateGEP(mStreamSetOutputBaseAddress[i], index);
     1220    }
     1221
     1222    writeDoBlockMethod(b);
     1223
     1224    BasicBlock * const bodyEnd = b->GetInsertBlock();
     1225    if (mStrideLoopTarget) {
     1226        mStrideLoopTarget->addIncoming(mStrideLoopTarget, bodyEnd);
     1227    }
     1228
     1229    Value * const nextIndex = b->CreateAdd(mStrideBlockIndex, b->getSize(1));
     1230    mStrideBlockIndex->addIncoming(nextIndex, bodyEnd);
     1231    Value * const notDone = b->CreateICmpULT(nextIndex, numOfBlocks);
     1232    b->CreateCondBr(notDone, mStrideLoopBody, stridesDone);
    12601233
    12611234    stridesDone->moveAfter(bodyEnd);
     
    12631236    /// STRIDE DONE
    12641237
    1265     idb->SetInsertPoint(stridesDone);
     1238    b->SetInsertPoint(stridesDone);
    12661239
    12671240    // Now conditionally perform the final block processing depending on the doFinal parameter.
    1268     if (branchTarget) {
    1269         mStrideLoopBranch = idb->CreateIndirectBr(branchTarget, 3);
     1241    if (mStrideLoopTarget) {
     1242        mStrideLoopBranch = b->CreateIndirectBr(mStrideLoopTarget, 3);
    12701243        mStrideLoopBranch->addDestination(doFinalBlock);
    12711244        mStrideLoopBranch->addDestination(segmentDone);
    12721245    } else {
    1273         idb->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone);
     1246        b->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone);
    12741247    }
    12751248
    12761249    doFinalBlock->moveAfter(stridesDone);
    12771250
    1278     idb->SetInsertPoint(doFinalBlock);
    1279 
    1280     Value * remainingItems = nullptr;
     1251    /// DO FINAL BLOCK
     1252
     1253    b->SetInsertPoint(doFinalBlock);
    12811254    for (unsigned i = 0; i < inputSetCount; ++i) {
    1282         const ProcessingRate & r = mStreamSetInputs[i].getRate();
    1283         if (r.isFixed()) {
    1284             Value * ic = idb->CreateUDiv(mAvailableItemCount[i], idb->getSize(r.getRate()));
    1285             if (remainingItems) {
    1286                 remainingItems = idb->CreateUMax(remainingItems, ic);
    1287             } else {
    1288                 remainingItems = ic;
    1289             }
    1290         }
    1291     }
    1292 
    1293     writeFinalBlockMethod(idb, remainingItems);
    1294 
    1295     idb->CreateBr(segmentDone);
    1296 
    1297     segmentDone->moveAfter(idb->GetInsertBlock());
    1298 
    1299     idb->SetInsertPoint(segmentDone);
     1255        mStreamSetInputBaseAddress[i] = baseInputAddress[i];
     1256    }
     1257
     1258    for (unsigned i = 0; i < outputSetCount; ++i) {
     1259        mStreamSetOutputBaseAddress[i] = baseOutputAddress[i];
     1260    }
     1261
     1262    writeFinalBlockMethod(b, getRemainingItems(b));
     1263
     1264    b->CreateBr(segmentDone);
     1265
     1266    segmentDone->moveAfter(b->GetInsertBlock());
     1267
     1268    b->SetInsertPoint(segmentDone);
    13001269
    13011270    // Update the branch prediction metadata to indicate that the likely target will be segmentDone
    1302     if (branchTarget) {
    1303         MDBuilder mdb(idb->getContext());
     1271    if (mStrideLoopTarget) {
     1272        MDBuilder mdb(b->getContext());
    13041273        const auto destinations = mStrideLoopBranch->getNumDestinations();
    13051274        uint32_t weights[destinations];
     
    13111280    }
    13121281
    1313 }
    1314 
    1315 inline void BlockOrientedKernel::writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb) {
     1282    return numOfBlocks;
     1283}
     1284
     1285/** ------------------------------------------------------------------------------------------------------------- *
     1286 * @brief getRemainingItems
     1287 ** ------------------------------------------------------------------------------------------------------------- */
     1288Value * BlockOrientedKernel::getRemainingItems(const std::unique_ptr<KernelBuilder> & b) {
     1289    Value * remainingItems = nullptr;
     1290    const auto count = mStreamSetInputs.size();
     1291    if (count == 1) {
     1292        return mAvailableItemCount[0];
     1293    } else {
     1294        for (unsigned i = 0; i < count; i++) {
     1295            if (mStreamSetInputs[i].isPrincipal()) {
     1296                return mAvailableItemCount[i];
     1297            }
     1298        }
     1299        for (unsigned i = 0; i < count; ++i) {
     1300            const ProcessingRate & r = mStreamSetInputs[i].getRate();
     1301            if (r.isFixed()) {
     1302                Value * ic = CreateUDivCeil(b, mAvailableItemCount[i], r.getRate());
     1303                if (remainingItems) {
     1304                    remainingItems = b->CreateUMin(remainingItems, ic);
     1305                } else {
     1306                    remainingItems = ic;
     1307                }
     1308            }
     1309        }
     1310    }
     1311    return remainingItems;
     1312}
     1313
     1314/** ------------------------------------------------------------------------------------------------------------- *
     1315 * @brief writeDoBlockMethod
     1316 ** ------------------------------------------------------------------------------------------------------------- */
     1317inline void BlockOrientedKernel::writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    13161318
    13171319    Value * const self = getInstance();
    13181320    Function * const cp = mCurrentMethod;
    1319     auto ip = idb->saveIP();
     1321    auto ip = b->saveIP();
    13201322    std::vector<Value *> availableItemCount(0);
    13211323
    13221324    /// Check if the do block method is called and create the function if necessary
    1323     if (!idb->supportsIndirectBr()) {
     1325    if (!b->supportsIndirectBr()) {
    13241326
    13251327        std::vector<Type *> params;
     
    13301332        }
    13311333
    1332         FunctionType * const type = FunctionType::get(idb->getVoidTy(), params, false);
    1333         mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, idb->getModule());
     1334        FunctionType * const type = FunctionType::get(b->getVoidTy(), params, false);
     1335        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, b->getModule());
    13341336        mCurrentMethod->setCallingConv(CallingConv::C);
    13351337        mCurrentMethod->setDoesNotThrow();
     
    13431345        assert (availableItemCount.size() == mAvailableItemCount.size());
    13441346        mAvailableItemCount.swap(availableItemCount);
    1345         idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
    1346     }
    1347 
    1348     generateDoBlockMethod(idb); // must be implemented by the BlockOrientedKernelBuilder subtype
    1349 
    1350     if (!idb->supportsIndirectBr()) {
     1347        b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
     1348    }
     1349
     1350    generateDoBlockMethod(b); // must be implemented by the BlockOrientedKernelBuilder subtype
     1351
     1352    if (!b->supportsIndirectBr()) {
    13511353        // Restore the DoSegment function state then call the DoBlock method
    1352         idb->CreateRetVoid();
     1354        b->CreateRetVoid();
    13531355        mDoBlockMethod = mCurrentMethod;
    1354         idb->restoreIP(ip);
     1356        b->restoreIP(ip);
    13551357        setInstance(self);
    13561358        mCurrentMethod = cp;
    13571359        mAvailableItemCount.swap(availableItemCount);
    1358         CreateDoBlockMethodCall(idb);
    1359     }
    1360 
    1361 }
    1362 
    1363 inline void BlockOrientedKernel::writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * remainingItems) {
     1360        CreateDoBlockMethodCall(b);
     1361    }
     1362
     1363}
     1364
     1365/** ------------------------------------------------------------------------------------------------------------- *
     1366 * @brief writeFinalBlockMethod
     1367 ** ------------------------------------------------------------------------------------------------------------- */
     1368inline void BlockOrientedKernel::writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingItems) {
    13641369
    13651370    Value * const self = getInstance();
    13661371    Function * const cp = mCurrentMethod;
    13671372    Value * const remainingItemCount = remainingItems;
    1368     auto ip = idb->saveIP();
     1373    auto ip = b->saveIP();
    13691374    std::vector<Value *> availableItemCount(0);
    13701375
    1371     if (!idb->supportsIndirectBr()) {
     1376    if (!b->supportsIndirectBr()) {
    13721377        std::vector<Type *> params;
    13731378        params.reserve(2 + mAvailableItemCount.size());
    13741379        params.push_back(self->getType());
    1375         params.push_back(idb->getSizeTy());
     1380        params.push_back(b->getSizeTy());
    13761381        for (Value * avail : mAvailableItemCount) {
    13771382            params.push_back(avail->getType());
    13781383        }
    1379         FunctionType * const type = FunctionType::get(idb->getVoidTy(), params, false);
    1380         mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, idb->getModule());
     1384        FunctionType * const type = FunctionType::get(b->getVoidTy(), params, false);
     1385        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, b->getModule());
    13811386        mCurrentMethod->setCallingConv(CallingConv::C);
    13821387        mCurrentMethod->setDoesNotThrow();
     
    13921397        assert (availableItemCount.size() == mAvailableItemCount.size());
    13931398        mAvailableItemCount.swap(availableItemCount);
    1394         idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
    1395     }
    1396 
    1397     generateFinalBlockMethod(idb, remainingItems); // may be implemented by the BlockOrientedKernel subtype
    1398 
    1399     if (!idb->supportsIndirectBr()) {
    1400         idb->CreateRetVoid();
    1401         idb->restoreIP(ip);
     1399        b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
     1400    }
     1401
     1402    generateFinalBlockMethod(b, remainingItems); // may be implemented by the BlockOrientedKernel subtype
     1403
     1404    if (!b->supportsIndirectBr()) {
     1405        b->CreateRetVoid();
     1406        b->restoreIP(ip);
    14021407        setInstance(self);
    14031408        mAvailableItemCount.swap(availableItemCount);
     
    14071412        args.push_back(self);
    14081413        args.push_back(remainingItemCount);
    1409         for (Value * avail : mAvailableItemCount) {
    1410             args.push_back(avail);
    1411         }
    1412         idb->CreateCall(mCurrentMethod, args);
     1414        args.insert(args.end(), mAvailableItemCount.begin(), mAvailableItemCount.end());
     1415        b->CreateCall(mCurrentMethod, args);
    14131416        mCurrentMethod = cp;
    14141417    }
     
    14161419}
    14171420
    1418 //  The default finalBlock method simply dispatches to the doBlock routine.
    1419 void BlockOrientedKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * /* remainingItems */) {
    1420     CreateDoBlockMethodCall(idb);
    1421 }
    1422 
    1423 void BlockOrientedKernel::CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & idb) {
    1424     if (idb->supportsIndirectBr()) {
    1425         BasicBlock * bb = idb->CreateBasicBlock("resume");
     1421/** ------------------------------------------------------------------------------------------------------------- *
     1422 * @brief generateFinalBlockMethod
     1423 ** ------------------------------------------------------------------------------------------------------------- */
     1424void BlockOrientedKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * /* remainingItems */) {
     1425    //  The default finalBlock method ignores remainingItems and simply dispatches to the doBlock routine.
     1426    CreateDoBlockMethodCall(b); // either branches into the stride loop body (indirect-branch mode) or calls the generated DoBlock function
     1427}
     1428
     1429void BlockOrientedKernel::CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & b) {
     1430    if (b->supportsIndirectBr()) {
     1431        BasicBlock * const bb = b->CreateBasicBlock("resume");
    14261432        mStrideLoopBranch->addDestination(bb);
    1427         mStrideLoopTarget->addIncoming(BlockAddress::get(bb), idb->GetInsertBlock());
    1428         idb->CreateBr(mStrideLoopBody);
    1429         bb->moveAfter(idb->GetInsertBlock());
    1430         idb->SetInsertPoint(bb);
     1433        BasicBlock * const current = b->GetInsertBlock();
     1434        mStrideLoopTarget->addIncoming(BlockAddress::get(bb), current);
     1435        mStrideBlockIndex->addIncoming(b->getSize(0), current);
     1436        b->CreateBr(mStrideLoopBody);
     1437        bb->moveAfter(current);
     1438        b->SetInsertPoint(bb);
    14311439    } else {
    14321440        std::vector<Value *> args;
    14331441        args.reserve(1 + mAvailableItemCount.size());
    14341442        args.push_back(getInstance());
    1435         for (Value * avail : mAvailableItemCount) {
    1436             args.push_back(avail);
    1437         }
    1438         idb->CreateCall(mDoBlockMethod, args);
     1443        args.insert(args.end(), mAvailableItemCount.begin(), mAvailableItemCount.end());
     1444        b->CreateCall(mDoBlockMethod, args);
    14391445    }
    14401446}
    14411447
    14421448static inline std::string annotateKernelNameWithDebugFlags(std::string && name) {
    1443     if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) {
     1449    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    14441450        name += "_EA";
    14451451    }
     
    14501456// CONSTRUCTOR
    14511457Kernel::Kernel(std::string && kernelName,
    1452                std::vector<Binding> && stream_inputs,
    1453                std::vector<Binding> && stream_outputs,
    1454                std::vector<Binding> && scalar_parameters,
    1455                std::vector<Binding> && scalar_outputs,
    1456                std::vector<Binding> && internal_scalars)
     1458               Bindings && stream_inputs,
     1459               Bindings && stream_outputs,
     1460               Bindings && scalar_parameters,
     1461               Bindings && scalar_outputs,
     1462               Bindings && internal_scalars)
    14571463: KernelInterface(annotateKernelNameWithDebugFlags(std::move(kernelName))
    14581464                  , std::move(stream_inputs), std::move(stream_outputs)
     
    14601466                  , std::move(internal_scalars))
    14611467, mCurrentMethod(nullptr)
    1462 , mAvailablePrincipleItemCount(nullptr)
     1468, mAvailablePrincipalItemCount(nullptr)
    14631469, mNoTerminateAttribute(false)
    14641470, mIsGenerated(false)
     
    14731479}
    14741480
     1481// MULTI-BLOCK KERNEL CONSTRUCTOR: forwards the kernel name and every binding list to the Kernel base constructor; the body is empty.
     1482MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
     1483                                   Bindings && stream_inputs,
     1484                                   Bindings && stream_outputs,
     1485                                   Bindings && scalar_parameters,
     1486                                   Bindings && scalar_outputs,
     1487                                   Bindings && internal_scalars)
     1488: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
     1489
     1490}
     1491
    14751492// CONSTRUCTOR
    14761493BlockOrientedKernel::BlockOrientedKernel(std::string && kernelName,
    1477                                          std::vector<Binding> && stream_inputs,
    1478                                          std::vector<Binding> && stream_outputs,
    1479                                          std::vector<Binding> && scalar_parameters,
    1480                                          std::vector<Binding> && scalar_outputs,
    1481                                          std::vector<Binding> && internal_scalars)
     1494                                         Bindings && stream_inputs,
     1495                                         Bindings && stream_outputs,
     1496                                         Bindings && scalar_parameters,
     1497                                         Bindings && scalar_outputs,
     1498                                         Bindings && internal_scalars)
    14821499: MultiBlockKernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
    14831500, mDoBlockMethod(nullptr)
    14841501, mStrideLoopBody(nullptr)
    14851502, mStrideLoopBranch(nullptr)
    1486 , mStrideLoopTarget(nullptr) {
    1487 
    1488 }
    1489 
    1490 // MULTI-BLOCK KERNEL CONSTRUCTOR
    1491 MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
    1492                                    std::vector<Binding> && stream_inputs,
    1493                                    std::vector<Binding> && stream_outputs,
    1494                                    std::vector<Binding> && scalar_parameters,
    1495                                    std::vector<Binding> && scalar_outputs,
    1496                                    std::vector<Binding> && internal_scalars)
    1497 : Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
     1503, mStrideLoopTarget(nullptr)
     1504, mStrideBlockIndex(nullptr) {
    14981505
    14991506}
     
    15011508// CONSTRUCTOR
    15021509SegmentOrientedKernel::SegmentOrientedKernel(std::string && kernelName,
    1503                                              std::vector<Binding> && stream_inputs,
    1504                                              std::vector<Binding> && stream_outputs,
    1505                                              std::vector<Binding> && scalar_parameters,
    1506                                              std::vector<Binding> && scalar_outputs,
    1507                                              std::vector<Binding> && internal_scalars)
     1510                                             Bindings && stream_inputs,
     1511                                             Bindings && stream_outputs,
     1512                                             Bindings && scalar_parameters,
     1513                                             Bindings && scalar_outputs,
     1514                                             Bindings && internal_scalars)
    15081515: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
    15091516
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r5706 r5755  
    1111
    1212namespace llvm { class BasicBlock; }
     13namespace llvm { class Constant; }
    1314namespace llvm { class Function; }
    1415namespace llvm { class IntegerType; }
     
    2627class Kernel : public KernelInterface {
    2728    friend class KernelBuilder;
    28 public:
    29     enum class Port { Input, Output };
    30 
    31     using StreamPort = std::pair<Port, unsigned>;
    32 
    33 protected:
    34 
    35     using KernelMap = boost::container::flat_map<std::string, unsigned>;
    36     using StreamMap = boost::container::flat_map<std::string, StreamPort>;
    37     using StreamSetBuffers = std::vector<parabix::StreamSetBuffer *>;
    38     using Kernels = std::vector<Kernel *>;
     29protected:
    3930
    4031    static const std::string DO_BLOCK_SUFFIX;
     
    5243public:
    5344   
     45    enum class Port { Input, Output };
     46    using StreamPort = std::pair<Port, unsigned>;
     47    using StreamMap = boost::container::flat_map<std::string, StreamPort>;
     48    using KernelFieldMap = boost::container::flat_map<std::string, unsigned>;
     49    using StreamSetBuffers = std::vector<parabix::StreamSetBuffer *>;
     50
    5451    // Kernel Signatures and Module IDs
    5552    //
    5653    // A kernel signature uniquely identifies a kernel and its full functionality.
    5754    // In the event that a particular kernel instance is to be generated and compiled
    58     // to produce object code, and we have a cached kernel object code instance with 
    59     // the same signature and targetting the same IDISA architecture, then the cached 
     55    // to produce object code, and we have a cached kernel object code instance with
     56    // the same signature and targeting the same IDISA architecture, then the cached
    6057    // object code may safely be used to avoid recompilation.
    6158    //
     
    6461    // Kernel developers should take responsibility for designing appropriate signature
    6562    // mechanisms that are short, inexpensive to compute and guarantee uniqueness
    66     // based on the semantics of the kernel. 
     63    // based on the semantics of the kernel.
    6764    //
    6865    // If no other mechanism is available, the default makeSignature() method uses the
     
    8784    // be added, the default method for preparing kernel state may be used.
    8885
    89        
    90     bool isCachable() const override { return false; }
    9186
    9287    std::string makeSignature(const std::unique_ptr<KernelBuilder> & idb) override;
     
    9590    virtual bool hasSignature() const { return true; }
    9691
    97     // Create a module stub for the kernel, populated only with its Module ID.     
    98     //
     92    bool isCachable() const override { return false; }
    9993
    10094    void bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs);
    101 
    102     StreamPort getStreamPort(const std::string & name) const;
    10395
    10496    llvm::Module * setModule(llvm::Module * const module);
     
    122114    }
    123115
     116    StreamPort getStreamPort(const std::string & name) const;
     117
     118    const Binding & getBinding(const std::string & name) const;
     119
    124120    const StreamSetBuffers & getStreamSetInputBuffers() const {
    125121        return mStreamSetInputBuffers;
     
    127123
    128124    const parabix::StreamSetBuffer * getStreamSetInputBuffer(const unsigned i) const {
     125        assert (i < mStreamSetInputBuffers.size());
     126        assert (mStreamSetInputBuffers[i]);
    129127        return mStreamSetInputBuffers[i];
     128    }
     129
     130    const parabix::StreamSetBuffer * getInputStreamSetBuffer(const std::string & name) const {
     131        const auto port = getStreamPort(name);
     132        assert (port.first == Port::Input);
     133        return getStreamSetInputBuffer(port.second);
    130134    }
    131135
     
    134138    }
    135139
     140    const Binding & getStreamInput(const unsigned i) const {
     141        return KernelInterface::getStreamInput(i);
     142    }
     143
     144    const Binding & getStreamInput(const std::string & name) const {
     145        const auto port = getStreamPort(name);
     146        assert (port.first == Port::Input);
     147        return KernelInterface::getStreamInput(port.second);
     148    }
     149
    136150    const parabix::StreamSetBuffer * getStreamSetOutputBuffer(const unsigned i) const {
     151        assert (i < mStreamSetOutputBuffers.size());
     152        assert (mStreamSetOutputBuffers[i]);
    137153        return mStreamSetOutputBuffers[i];
     154    }
     155
     156    const parabix::StreamSetBuffer * getOutputStreamSetBuffer(const std::string & name) const {
     157        const auto port = getStreamPort(name);
     158        assert (port.first == Port::Output);
     159        return getStreamSetOutputBuffer(port.second);
     160    }
     161
     162    const Binding & getStreamOutput(const unsigned i) const {
     163        return KernelInterface::getStreamOutput(i);
     164    }
     165
     166    const Binding & getStreamOutput(const std::string & name) const {
     167        const auto port = getStreamPort(name);
     168        assert (port.first == Port::Output);
     169        return KernelInterface::getStreamOutput(port.second);
    138170    }
    139171   
     
    144176    //
    145177   
    146     unsigned getKernelStride() const { return mStride; }
     178    unsigned getStride() const { return mStride; }
    147179   
    148180    virtual ~Kernel() = 0;
     
    156188protected:
    157189
    158     void setKernelStride(unsigned stride) { mStride = stride; }
    159 
    160190    virtual void addInternalKernelProperties(const std::unique_ptr<KernelBuilder> & idb) { }
    161191
     
    163193
    164194    // Constructor
    165     Kernel(std::string && kernelName,
    166                   std::vector<Binding> && stream_inputs,
    167                   std::vector<Binding> && stream_outputs,
    168                   std::vector<Binding> && scalar_parameters,
    169                   std::vector<Binding> && scalar_outputs,
    170                   std::vector<Binding> && internal_scalars);
     195    Kernel(std::string && kernelName, Bindings && stream_inputs,
     196          Bindings && stream_outputs,
     197          Bindings && scalar_parameters,
     198          Bindings && scalar_outputs,
     199          Bindings && internal_scalars);
    171200
    172201    void setNoTerminateAttribute(const bool noTerminate = true) {
     
    174203    }
    175204
    176     llvm::Value * getPrincipleItemCount() const {
    177         return mAvailablePrincipleItemCount;
     205    llvm::Value * getPrincipalItemCount() const {
     206        return mAvailablePrincipalItemCount;
    178207    }
    179208
     
    201230
    202231    void callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb);
    203 
    204 
    205     std::pair<unsigned, unsigned> getStreamRate(const Port p, const unsigned i) const;
    206 
    207     const parabix::StreamSetBuffer * getInputStreamSetBuffer(const std::string & name) const {
    208         const auto port = getStreamPort(name);
    209         assert (port.first == Port::Input);
    210         assert (port.second < mStreamSetInputBuffers.size());
    211         assert (mStreamSetInputBuffers[port.second]);
    212         return mStreamSetInputBuffers[port.second];
    213     }
    214 
    215     const parabix::StreamSetBuffer * getOutputStreamSetBuffer(const std::string & name) const {
    216         const auto port = getStreamPort(name);
    217         assert (port.first == Port::Output);
    218         assert (port.second < mStreamSetOutputBuffers.size());
    219         assert (mStreamSetOutputBuffers[port.second]);
    220         return mStreamSetOutputBuffers[port.second];
    221     }
    222232
    223233    const parabix::StreamSetBuffer * getAnyStreamSetBuffer(const std::string & name) const {
     
    235245    }
    236246
    237     llvm::Value * getStreamSetInputBufferPtr(const unsigned i) const {
    238         return mStreamSetInputBufferPtr[i];
    239     }
    240 
    241     llvm::Value * getStreamSetOutputBufferPtr(const unsigned i) const {
    242         return mStreamSetOutputBufferPtr[i];
    243     }
     247    void setStride(unsigned stride) { mStride = stride; }
    244248
    245249private:
    246250
    247251    void addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb);
     252
     253    llvm::Value * getStreamSetInputAddress(const std::string & name) const {
     254        const Kernel::StreamPort p = getStreamPort(name);
     255        assert (p.first == Port::Input);
     256        return mStreamSetInputBaseAddress[p.second];
     257    }
     258
     259    llvm::Value * getStreamSetOutputAddress(const std::string & name) const {
     260        const Kernel::StreamPort p = getStreamPort(name);
     261        assert (p.first == Port::Output);
     262        return mStreamSetOutputBaseAddress[p.second];
     263    }
    248264
    249265    llvm::Value * getAvailableItemCount(const unsigned i) const {
     
    251267    }
    252268
     269    void normalizeStreamProcessingRates();
     270
     271    bool normalizeRelativeToFixedProcessingRate(const ProcessingRate & base, ProcessingRate & toUpdate);
     272
    253273protected:
    254274
    255275    llvm::Function *                    mCurrentMethod;
    256     llvm::Value *                       mAvailablePrincipleItemCount;
     276    llvm::Value *                       mAvailablePrincipalItemCount;
    257277    bool                                mNoTerminateAttribute;
    258278    bool                                mIsGenerated;
     
    260280    llvm::Value *                       mIsFinal;
    261281    llvm::Value *                       mOutputScalarResult;
    262 
    263 
    264282    std::vector<llvm::Value *>          mAvailableItemCount;
    265283
     284    KernelFieldMap                      mKernelFieldMap;
    266285    std::vector<llvm::Type *>           mKernelFields;
    267     KernelMap                           mKernelMap;
     286
    268287    StreamMap                           mStreamMap;
     288
    269289    StreamSetBuffers                    mStreamSetInputBuffers;
    270     std::vector<llvm::Value *>          mStreamSetInputBufferPtr;
     290    std::vector<llvm::Value *>          mStreamSetInputBaseAddress;
    271291    StreamSetBuffers                    mStreamSetOutputBuffers;
    272     std::vector<llvm::Value *>          mStreamSetOutputBufferPtr;
    273 
     292    std::vector<llvm::Value *>          mStreamSetOutputBaseAddress;
    274293};
    275294
     295using Kernels = std::vector<Kernel *>;
     296
    276297class SegmentOrientedKernel : public Kernel {
    277298protected:
    278299
    279300    SegmentOrientedKernel(std::string && kernelName,
    280                           std::vector<Binding> && stream_inputs,
    281                           std::vector<Binding> && stream_outputs,
    282                           std::vector<Binding> && scalar_parameters,
    283                           std::vector<Binding> && scalar_outputs,
    284                           std::vector<Binding> && internal_scalars);
     301                          Bindings && stream_inputs,
     302                          Bindings && stream_outputs,
     303                          Bindings && scalar_parameters,
     304                          Bindings && scalar_outputs,
     305                          Bindings && internal_scalars);
    285306protected:
    286307
    287308    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
    288309
    289     virtual void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) = 0;
     310    virtual void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & b) = 0;
    290311
    291312};
     
    386407
    387408    MultiBlockKernel(std::string && kernelName,
    388                      std::vector<Binding> && stream_inputs,
    389                      std::vector<Binding> && stream_outputs,
    390                      std::vector<Binding> && scalar_parameters,
    391                      std::vector<Binding> && scalar_outputs,
    392                      std::vector<Binding> && internal_scalars);
     409                     Bindings && stream_inputs,
     410                     Bindings && stream_outputs,
     411                     Bindings && scalar_parameters,
     412                     Bindings && scalar_outputs,
     413                     Bindings && internal_scalars);
    393414
    394415    // Each multi-block kernel subtype must provide its own logic for handling
     
    399420    // exit the RetVoid instruction will be added to complete the method.
    400421    //
    401     virtual void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * const numOfStrides) = 0;
     422    virtual llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) = 0;
    402423
    403424private:
     
    406427    // method of the multi-block kernel builder makes all the necessary arrangements
    407428    // to translate doSegment calls into a minimal sequence of doMultiBlock calls.
    408     void generateKernelMethod(const std::unique_ptr<KernelBuilder> & kb) final;
     429    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
     430
     431    unsigned getItemAlignment(const Binding & binding) const;
     432
     433    ProcessingRate::RateValue getLowerBound(const ProcessingRate &rate) const;
     434
     435    ProcessingRate::RateValue getUpperBound(const ProcessingRate & rate) const;
     436
     437    bool isTransitivelyUnknownRate(const ProcessingRate & rate) const;
     438
     439    llvm::Value * getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate);
    409440
    410441    bool requiresCopyBack(const ProcessingRate & rate) const;
    411442
     443    void reviseFinalProducedItemCounts(const std::unique_ptr<KernelBuilder> & b);
     444
     445protected:
     446
     447    std::vector<llvm::Value *>      mInitialAvailableItemCount;
     448    std::vector<llvm::Value *>      mInitialProcessedItemCount;
     449    std::vector<llvm::Value *>      mInitialProducedItemCount;
     450
    412451};
    413452
     
    416455protected:
    417456
    418     void CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & idb);
     457    void CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & b);
    419458
    420459    // Each kernel builder subtype must provide its own logic for generating
    421460    // doBlock calls.
    422     virtual void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb) = 0;
     461    virtual void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) = 0;
    423462
    424463    // Each kernel builder subtype must also specify the logic for processing the
     
    429468    // not be overridden.
    430469
    431     virtual void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * remainingItems);
    432 
    433     void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * const numOfStrides) final;
     470    virtual void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
    434471
    435472    BlockOrientedKernel(std::string && kernelName,
    436                         std::vector<Binding> && stream_inputs,
    437                         std::vector<Binding> && stream_outputs,
    438                         std::vector<Binding> && scalar_parameters,
    439                         std::vector<Binding> && scalar_outputs,
    440                         std::vector<Binding> && internal_scalars);
     473                        Bindings && stream_inputs,
     474                        Bindings && stream_outputs,
     475                        Bindings && scalar_parameters,
     476                        Bindings && scalar_outputs,
     477                        Bindings && internal_scalars);
    441478
    442479private:
    443480
    444     void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb);
    445 
    446     void writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * remainingItems);
     481    llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
     482
     483    void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b);
     484
     485    void writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
     486
     487    llvm::Value * getRemainingItems(const std::unique_ptr<KernelBuilder> & b);
    447488
    448489private:
    449490
    450     llvm::Function *        mDoBlockMethod;
    451     llvm::BasicBlock *      mStrideLoopBody;
    452     llvm::IndirectBrInst *  mStrideLoopBranch;
    453     llvm::PHINode *         mStrideLoopTarget;
     491    llvm::Function *            mDoBlockMethod;
     492    llvm::BasicBlock *          mStrideLoopBody;
     493    llvm::IndirectBrInst *      mStrideLoopBranch;
     494    llvm::PHINode *             mStrideLoopTarget;
     495    llvm::PHINode *             mStrideBlockIndex;
    454496};
    455497
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r5706 r5755  
    44#include <kernels/streamset.h>
    55#include <llvm/Support/raw_ostream.h>
     6#include <llvm/IR/Module.h>
    67
    78using namespace llvm;
    89using namespace parabix;
    910
    10 using Value = Value;
     11inline static bool is_power_2(const uint64_t n) {
     12    return ((n & (n - 1)) == 0) && n;
     13}
    1114
    1215namespace kernel {
     
    1417using Port = Kernel::Port;
    1518
    16 Value * KernelBuilder::getScalarFieldPtr(llvm::Value * instance, Value * const index) {
    17     assert (instance);
    18     CreateAssert(instance, "getScalarFieldPtr: instance cannot be null!");
     19Value * KernelBuilder::getScalarFieldPtr(llvm::Value * const instance, Value * const index) {
     20    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     21        CreateAssert(instance, "getScalarFieldPtr: instance cannot be null!");
     22    }
    1923    return CreateGEP(instance, {getInt32(0), index});
    2024}
    2125
    22 Value * KernelBuilder::getScalarFieldPtr(llvm::Value * instance, const std::string & fieldName) {
    23     return getScalarFieldPtr(instance, getInt32(mKernel->getScalarIndex(fieldName)));
    24 }
    25 
    26 llvm::Value * KernelBuilder::getScalarFieldPtr(llvm::Value * index) {
     26Value * KernelBuilder::getScalarFieldPtr(llvm::Value * const handle, const std::string & fieldName) {
     27    return getScalarFieldPtr(handle, getInt32(mKernel->getScalarIndex(fieldName)));
     28}
     29
     30llvm::Value * KernelBuilder::getScalarFieldPtr(llvm::Value * const index) {
    2731    return getScalarFieldPtr(mKernel->getInstance(), index);
    2832}
     
    4246Value * KernelBuilder::getStreamHandle(const std::string & name) {
    4347    Value * const ptr = getScalarField(name + Kernel::BUFFER_PTR_SUFFIX);
    44     CreateAssert(ptr, name + " cannot be null!");
     48    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     49        CreateAssert(ptr, name + " handle cannot be null!");
     50    }
    4551    return ptr;
    4652}
     
    5864}
    5965
    60 inline const Binding & getBinding(const Kernel * k, const std::string & name) {
    61     Port port; unsigned index;
    62     std::tie(port, index) = k->getStreamPort(name);
    63     if (port == Port::Input) {
    64         return k->getStreamInput(index);
    65     } else {
    66         return k->getStreamOutput(index);
    67     }
    68 }
    69 
    7066Value * KernelBuilder::getInternalItemCount(const std::string & name, const std::string & suffix) {
    71     const ProcessingRate & rate = getBinding(mKernel, name).getRate();
     67    const ProcessingRate & rate = mKernel->getBinding(name).getRate();
    7268    Value * itemCount = nullptr;
    73     if (rate.isExactlyRelative()) {
     69    if (LLVM_UNLIKELY(rate.isRelative())) {
    7470        Port port; unsigned index;
    7571        std::tie(port, index) = mKernel->getStreamPort(rate.getReference());
     
    7975            itemCount = getProducedItemCount(rate.getReference());
    8076        }
    81         if (rate.getNumerator() != 1) {
    82             itemCount = CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getNumerator()));
    83         }
    84         if (rate.getDenominator() != 1) {
    85             itemCount = CreateExactUDiv(itemCount, ConstantInt::get(itemCount->getType(), rate.getDenominator()));
     77        const auto & r = rate.getRate();
     78        if (r.numerator() != 1) {
     79            itemCount = CreateMul(itemCount, ConstantInt::get(itemCount->getType(), r.numerator()));
     80        }
     81        if (r.denominator() != 1) {
     82            itemCount = CreateExactUDiv(itemCount, ConstantInt::get(itemCount->getType(), r.denominator()));
    8683        }
    8784    } else {
     
    9289
    9390void KernelBuilder::setInternalItemCount(const std::string & name, const std::string & suffix, llvm::Value * const value) {
    94     const ProcessingRate & rate = getBinding(mKernel, name).getRate();
     91    const ProcessingRate & rate = mKernel->getBinding(name).getRate();
    9592    if (LLVM_UNLIKELY(rate.isDerived())) {
    9693        report_fatal_error("Cannot set item count: " + name + " is a Derived rate");
     
    139136}
    140137
    141 Value * KernelBuilder::copy(const std::string & name, Value * target, Value * source, Value * itemsToCopy, const unsigned alignment) {
     138//Value * KernelBuilder::getLinearlyCopyableItems(const std::string & name, Value * fromPosition, bool reverse) {
     139//    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     140//    return buf->getLinearlyCopyableItems(this, getStreamHandle(name), fromPosition, reverse);
     141//}
     142
     143/** ------------------------------------------------------------------------------------------------------------- *
     144 * @brief isConstantZero
     145 ** ------------------------------------------------------------------------------------------------------------- */
     146inline bool isConstantZero(Value * const v) {
     147    return isa<ConstantInt>(v) && cast<ConstantInt>(v)->isNullValue();
     148}
     149
     150/** ------------------------------------------------------------------------------------------------------------- *
     151 * @brief isConstantOne
     152 ** ------------------------------------------------------------------------------------------------------------- */
     153inline bool isConstantOne(Value * const v) {
     154    return isa<ConstantInt>(v) && cast<ConstantInt>(v)->isOne();
     155}
     156
     157/** ------------------------------------------------------------------------------------------------------------- *
     158 * @brief getItemWidth
     159 ** ------------------------------------------------------------------------------------------------------------- */
     160inline unsigned getItemWidth(const Type * ty) {
     161    if (LLVM_LIKELY(isa<ArrayType>(ty))) {
     162        ty = ty->getArrayElementType();
     163    }
     164    return cast<IntegerType>(ty->getVectorElementType())->getBitWidth();
     165}
     166
     167/** ------------------------------------------------------------------------------------------------------------- *
     168 * @brief getFieldWidth
     169 ** ------------------------------------------------------------------------------------------------------------- */
     170inline unsigned getFieldWidth(const unsigned bitWidth, const unsigned blockWidth) {
     171    for (unsigned k = 16; k < blockWidth; k *= 2) {
     172        if ((bitWidth & (k - 1)) != 0) {
     173            return k / 2;
     174        }
     175    }
     176    return blockWidth;
     177}
     178
     179/** ------------------------------------------------------------------------------------------------------------- *
     180 * @brief CreateStreamCpy
     181 ** ------------------------------------------------------------------------------------------------------------- */
     182void KernelBuilder::CreateStreamCpy(const std::string & name, Value * target, Value * targetOffset, Value * source, Value * sourceOffset, Value * itemsToCopy, const unsigned itemAlignment) {
     183
     184    assert (target && targetOffset);
     185    assert (source && sourceOffset);
     186    assert (target->getType() == source->getType());
     187    assert (target->getType()->isPointerTy());
     188
    142189    const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
    143     return buf->copy(this, getStreamHandle(name), target, source, itemsToCopy, alignment);
     190
     191    const auto itemWidth = getItemWidth(buf->getBaseType());
     192    assert ("invalid item width" && is_power_2(itemWidth));
     193    const auto blockWidth = getBitBlockWidth();
     194
     195    const auto fieldWidth = getFieldWidth(itemWidth * itemAlignment, blockWidth);
     196    assert ("overflow error" && is_power_2(fieldWidth) && (itemWidth <= fieldWidth));
     197
     198    assert (isConstantZero(targetOffset) || isConstantZero(sourceOffset));
     199
     200    IntegerType * const fieldWidthTy = getIntNTy(fieldWidth / 8);
     201
     202    const auto alignment = fieldWidth / 8;
     203
     204    if (LLVM_LIKELY(itemWidth < fieldWidth)) {
     205        Constant * const factor = getSize(fieldWidth / itemWidth);
     206        CreateAssertZero(CreateURem(targetOffset, factor), "target offset is not a multiple of its field width");
     207        targetOffset = CreateUDiv(targetOffset, factor);
     208        CreateAssertZero(CreateURem(sourceOffset, factor), "source offset is not a multiple of its field width");
     209        sourceOffset = CreateUDiv(sourceOffset, factor);
     210    }
     211
     212    /*
     213
     214       Streams are conceptually modelled as:
     215
     216                                            BLOCKS
     217
     218                                      A     B     C     D
     219           STREAM SET ELEMENT   1  |aaaaa|bbbbb|ccccc|dddd |
     220                                2  |eeeee|fffff|ggggg|hhhh |
     221                                3  |iiiii|jjjjj|kkkkk|llll |
     222
     223       But the memory layout is actually:
     224
     225           A_1   A_2   A_3   B_1   B_2   B_3   C_1   C_2   C_3   D_1   D_2   D_3
     226
     227         |aaaaa|eeeee|iiiii|bbbbb|fffff|jjjjj|ccccc|ggggg|kkkkk|dddd |hhhh |llll |
     228
     229
     230       So if we're copying the entire stream set block or our stream set has one element, we can use memcpy.
     231
     232    */
     233
     234    Value * const n = buf->getStreamSetCount(this, getStreamHandle(name));
     235    if (fieldWidth == blockWidth || isConstantOne(n) || (isConstantZero(targetOffset) && isConstantZero(sourceOffset))) {
     236        PointerType * const fieldWidthPtrTy = fieldWidthTy->getPointerTo();
     237        if (isConstantOne(n)) {
     238            if (LLVM_LIKELY(itemWidth < 8)) {
     239                itemsToCopy = CreateUDivCeil(itemsToCopy, getSize(8 / itemWidth));
     240            } else if (LLVM_UNLIKELY(itemWidth > 8)) {
     241                itemsToCopy = CreateMul(itemsToCopy, getSize(itemWidth / 8));
     242            }
     243        } else {
     244            itemsToCopy = CreateMul(CreateUDivCeil(itemsToCopy, getSize(blockWidth / (8 * itemWidth))), n);
     245        }
     246        target = CreateGEP(CreatePointerCast(target, fieldWidthPtrTy), targetOffset);
     247        source = CreateGEP(CreatePointerCast(source, fieldWidthPtrTy), sourceOffset);
     248        CreateMemCpy(target, source, itemsToCopy, alignment);
     249
     250    } else { // either the target offset or source offset is non-zero but not both
     251
     252        VectorType * const blockTy = getBitBlockType();
     253        PointerType * const blockPtrTy = blockTy->getPointerTo();
     254
     255        target = CreatePointerCast(target, blockPtrTy);
     256        source = CreatePointerCast(source, blockPtrTy);
     257
     258        VectorType * const shiftTy = VectorType::get(fieldWidthTy, blockWidth / fieldWidth);
     259        Constant * const width = getSize(blockWidth / itemWidth);
     260        BasicBlock * const entry = GetInsertBlock();
     261
     262
     263        if (isConstantZero(targetOffset)) {
     264
     265            /*
     266                                                BLOCKS
     267
     268                                          A     B     C     D
     269               SOURCE STREAM        1  |aaa--|bbbBB|cccCC|  dDD|
     270                                    2  |eee--|fffFF|gggGG|  hHH|
     271                                    3  |iii--|jjjJJ|kkkKK|  lLL|
     272
     273
     274                                          A     B     C     D
     275               TARGET STREAM        1  |BBaaa|CCbbb|DDccc|    d|
     276                                    2  |FFeee|GGfff|HHggg|    h|
     277                                    3  |JJiii|KKjjj|LLkkk|    l|
     278             */
     279
     280            Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, width), n);
     281            Value * const offset = CreateURem(sourceOffset, width);
     282            Value * const remaining = CreateSub(width, offset);
     283            Value * const trailing = CreateURem(CreateAdd(sourceOffset, itemsToCopy), width);
     284
     285            BasicBlock * const streamCopy = CreateBasicBlock(name + "StreamCopy");
     286            BasicBlock * const streamCopyRemaining = CreateBasicBlock(name + "StreamCopyRemaining");
     287            BasicBlock * const streamCopyEnd = CreateBasicBlock(name + "StreamCopyEnd");
     288
     289            CreateCondBr(CreateICmpNE(blocksToCopy, getSize(0)), streamCopy, streamCopyRemaining);
     290
     291            SetInsertPoint(streamCopy);
     292            PHINode * const i = CreatePHI(getSizeTy(), 2);
     293            i->addIncoming(n, entry);
     294            Value * prior = CreateAlignedLoad(CreateGEP(source, CreateSub(i, n)), alignment);
     295            prior = CreateLShr(CreateBitCast(prior, shiftTy), offset);
     296            Value * value = CreateAlignedLoad(CreateGEP(source, i), alignment);
     297            value = CreateShl(CreateBitCast(value, shiftTy), remaining);
     298            Value * const result = CreateBitCast(CreateOr(value, prior), blockTy);
     299            CreateAlignedStore(result, CreateGEP(target, i), alignment);
     300            Value * const next_i = CreateAdd(i, getSize(1));
     301            i->addIncoming(next_i, streamCopy);
     302            CreateCondBr(CreateICmpNE(next_i, blocksToCopy), streamCopy, streamCopyRemaining);
     303
     304            SetInsertPoint(streamCopyRemaining);
     305            PHINode * const j = CreatePHI(getSizeTy(), 2);
     306            j->addIncoming(getSize(0), streamCopy);
     307            Value * k = CreateAdd(blocksToCopy, j);
     308            Value * final = CreateAlignedLoad(CreateGEP(source, k), alignment);
     309            final = CreateLShr(CreateBitCast(prior, shiftTy), trailing);
     310            CreateAlignedStore(final, CreateGEP(target, k), alignment);
     311            Value * const next_j = CreateAdd(i, getSize(1));
     312            i->addIncoming(next_j, streamCopyRemaining);
     313            CreateCondBr(CreateICmpNE(next_j, n), streamCopyRemaining, streamCopyEnd);
     314
     315            SetInsertPoint(streamCopyEnd);
     316
     317        } else if (isConstantZero(sourceOffset)) {
     318
     319            /*
     320                                                BLOCKS
     321
     322                                          A     B     C     D
     323               SOURCE STREAM        1  |AAAaa|BBBbb|CCCcc|    d|
     324                                    2  |EEEee|FFFff|GGGgg|    h|
     325                                    3  |IIIii|JJJjj|KKKkk|    l|
     326
     327
     328                                          A     B     C     D
     329               TARGET STREAM        1  |aa---|bbAAA|ccBBB| dCCC|
     330                                    2  |ee---|ffEEE|ggFFF| hGGG|
     331                                    3  |ii---|jjIII|kkJJJ| lKKK|
     332
     333            */
     334
     335            BasicBlock * const streamCopy = CreateBasicBlock(name + "StreamCopy");
     336            BasicBlock * const streamCopyRemainingCond = CreateBasicBlock(name + "StreamCopyRemainingCond");
     337            BasicBlock * const streamCopyRemaining = CreateBasicBlock(name + "StreamCopyRemaining");
     338            BasicBlock * const streamCopyEnd = CreateBasicBlock(name + "StreamCopyEnd");
     339
     340            Value * const offset = CreateURem(targetOffset, width);
     341            Value * const copied = CreateSub(width, offset);
     342            Value * const mask = CreateLShr(Constant::getAllOnesValue(shiftTy), copied);
     343
     344            SetInsertPoint(streamCopy);
     345            PHINode * const i = CreatePHI(getSizeTy(), 2);
     346            i->addIncoming(getSize(0), entry);
     347            Value * targetValue = CreateAlignedLoad(CreateGEP(target, i), alignment);
     348            targetValue = CreateAnd(CreateBitCast(targetValue, shiftTy), mask);
     349            Value * sourceValue = CreateAlignedLoad(CreateGEP(source, i), alignment);
     350            sourceValue = CreateShl(CreateBitCast(sourceValue, shiftTy), offset);
     351            CreateAlignedStore(CreateOr(sourceValue, targetValue), CreateGEP(source, i), alignment);
     352            Value * const next_i = CreateAdd(i, getSize(1));
     353            i->addIncoming(next_i, streamCopy);
     354            CreateCondBr(CreateICmpNE(next_i, n), streamCopy, streamCopyRemainingCond);
     355
     356            SetInsertPoint(streamCopyRemainingCond);
     357            Value * const blocksToCopy = CreateMul(CreateUDiv(CreateSub(itemsToCopy, copied), width), n);
     358            CreateCondBr(CreateICmpULT(copied, itemsToCopy), streamCopyRemaining, streamCopyEnd);
     359
     360            SetInsertPoint(streamCopyRemaining);
     361            PHINode * const j = CreatePHI(getSizeTy(), 2);
     362            j->addIncoming(n, entry);
     363            Value * prior = CreateAlignedLoad(CreateGEP(source, CreateSub(j, n)), alignment);
     364            prior = CreateShl(CreateBitCast(prior, shiftTy), offset);
     365            Value * value = CreateAlignedLoad(CreateGEP(source, j), alignment);
     366            value = CreateLShr(CreateBitCast(value, shiftTy), copied);
     367            Value * const result = CreateBitCast(CreateOr(value, prior), blockTy);
     368            CreateAlignedStore(result, CreateGEP(target, j), alignment);
     369            Value * const next_j = CreateAdd(j, getSize(1));
     370            j->addIncoming(next_j, streamCopy);
     371            CreateCondBr(CreateICmpNE(next_j, blocksToCopy), streamCopyRemaining, streamCopyEnd);
     372
     373            SetInsertPoint(streamCopyEnd);
     374        }
     375
     376    }
    144377}
    145378
    146379void KernelBuilder::CreateCopyBack(const std::string & name, llvm::Value * from, llvm::Value * to) {
    147380    const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
    148     return buf->genCopyBackLogic(this, getStreamHandle(name), from, to, name);
     381    buf->genCopyBackLogic(this, getStreamHandle(name), from, to, name);
    149382}
    150383
     
    157390}
    158391
    159 inline Value * KernelBuilder::computeBlockIndex(Value * itemCount) {
    160     const auto divisor = getBitBlockWidth();
    161     if (LLVM_LIKELY((divisor & (divisor - 1)) == 0)) {
    162         return CreateLShr(itemCount, std::log2(divisor));
    163     } else {
    164         return CreateUDiv(itemCount, getSize(divisor));
    165     }
    166 }
    167 
    168 Value * KernelBuilder::getInputStreamPtr(const std::string & name, Value * const blockIndex) {
    169 //    Value * const blockIndex = computeBlockIndex(getProcessedItemCount(name));
    170     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    171     return buf->getBlockAddress(this, getStreamHandle(name), blockIndex);
    172 }
    173 
    174392Value * KernelBuilder::getInputStreamBlockPtr(const std::string & name, Value * streamIndex) {
    175     const Kernel::StreamPort p = mKernel->getStreamPort(name);
    176     if (LLVM_UNLIKELY(p.first == Port::Output)) {
    177         report_fatal_error(name + " is not an input stream set");
    178     }
    179     Value * const addr = mKernel->getStreamSetInputBufferPtr(p.second);
    180     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    181     return buf->getStreamBlockPtr(this, getStreamHandle(name), addr, streamIndex, true);
     393    Value * const addr = mKernel->getStreamSetInputAddress(name);
     394    if (addr) {
     395        return CreateGEP(addr, {getInt32(0), streamIndex});
     396    } else {
     397        const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     398        Value * const blockIndex = CreateLShr(getProcessedItemCount(name), std::log2(getBitBlockWidth()));
     399        return buf->getStreamBlockPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, true);
     400    }
    182401}
    183402
     
    187406
    188407Value * KernelBuilder::getInputStreamPackPtr(const std::string & name, Value * streamIndex, Value * packIndex) {
    189     const Kernel::StreamPort p = mKernel->getStreamPort(name);
    190     if (LLVM_UNLIKELY(p.first == Port::Output)) {
    191         report_fatal_error(name + " is not an input stream set");
    192     }
    193     Value * const addr = mKernel->getStreamSetInputBufferPtr(p.second);
    194     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    195     return buf->getStreamPackPtr(this, getStreamHandle(name), addr, streamIndex, packIndex, true);
     408    Value * const addr = mKernel->getStreamSetInputAddress(name);
     409    if (addr) {
     410        return CreateGEP(addr, {getInt32(0), streamIndex, packIndex});
     411    } else {
     412        const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     413        Value * const blockIndex = CreateLShr(getProcessedItemCount(name), std::log2(getBitBlockWidth()));
     414        return buf->getStreamPackPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, packIndex, true);
     415    }
    196416}
    197417
    198418Value * KernelBuilder::loadInputStreamPack(const std::string & name, Value * streamIndex, Value * packIndex) {
     419
     420
     421
    199422    return CreateBlockAlignedLoad(getInputStreamPackPtr(name, streamIndex, packIndex));
    200423}
     
    206429
    207430Value * KernelBuilder::getAdjustedInputStreamBlockPtr(Value * blockAdjustment, const std::string & name, Value * streamIndex) {
    208     const Kernel::StreamPort p = mKernel->getStreamPort(name);
    209     if (LLVM_UNLIKELY(p.first == Port::Output)) {
    210         report_fatal_error(name + " is not an input stream set");
    211     }
    212     Value * const addr = mKernel->getStreamSetInputBufferPtr(p.second);
    213     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    214     return buf->getStreamBlockPtr(this, getStreamHandle(name), CreateGEP(addr, blockAdjustment), streamIndex, true);
    215 }
    216 
    217 Value * KernelBuilder::getOutputStreamPtr(const std::string & name, Value * const blockIndex) {
    218 //    Value * const blockIndex = computeBlockIndex(getProducedItemCount(name));
    219     const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    220     return buf->getBlockAddress(this, getStreamHandle(name), blockIndex);
     431    Value * const addr = mKernel->getStreamSetInputAddress(name);
     432    if (addr) {
     433        return CreateGEP(addr, {blockAdjustment, streamIndex});
     434    } else {
     435        const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     436        Value * blockIndex = CreateLShr(getProcessedItemCount(name), std::log2(getBitBlockWidth()));
     437        blockIndex = CreateAdd(blockIndex, blockAdjustment);
     438        return buf->getStreamBlockPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, true);
     439    }
    221440}
    222441
    223442Value * KernelBuilder::getOutputStreamBlockPtr(const std::string & name, Value * streamIndex) {
    224     const Kernel::StreamPort p = mKernel->getStreamPort(name);
    225     if (LLVM_UNLIKELY(p.first == Port::Input)) {
    226         report_fatal_error(name + " is not an output stream set");
    227     }
    228     Value * addr = mKernel->getStreamSetOutputBufferPtr(p.second);
    229     const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    230     return buf->getStreamBlockPtr(this, getStreamHandle(name), addr, streamIndex, true);
     443    Value * const addr = mKernel->getStreamSetOutputAddress(name);
     444    if (addr) {
     445        return CreateGEP(addr, {getInt32(0), streamIndex});
     446    } else {
     447        const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     448        Value * const blockIndex = CreateLShr(getProducedItemCount(name), std::log2(getBitBlockWidth()));
     449        return buf->getStreamBlockPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, false);
     450    }
    231451}
    232452
     
    236456
    237457Value * KernelBuilder::getOutputStreamPackPtr(const std::string & name, Value * streamIndex, Value * packIndex) {
    238     const Kernel::StreamPort p = mKernel->getStreamPort(name);
    239     if (LLVM_UNLIKELY(p.first == Port::Input)) {
    240         report_fatal_error(name + " is not an output stream set");
    241     }
    242     Value * addr = mKernel->getStreamSetOutputBufferPtr(p.second);
    243     const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    244     return buf->getStreamPackPtr(this, getStreamHandle(name), addr, streamIndex, packIndex, false);
     458    Value * const addr = mKernel->getStreamSetOutputAddress(name);
     459    if (addr) {
     460        return CreateGEP(addr, {getInt32(0), streamIndex, packIndex});
     461    } else {
     462        const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     463        Value * const blockIndex = CreateLShr(getProducedItemCount(name), std::log2(getBitBlockWidth()));
     464        return buf->getStreamPackPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, packIndex, false);
     465    }
    245466}
    246467
     
    280501}
    281502
    282 
    283503Value * KernelBuilder::getCapacity(const std::string & name) {
    284504    return mKernel->getAnyStreamSetBuffer(name)->getCapacity(this, getStreamHandle(name));
     
    289509}
    290510
     511Value * KernelBuilder::getBlockAddress(const std::string & name, Value * blockIndex) {
     512    const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
     513    return buf->getBlockAddress(this, getStreamHandle(name), blockIndex);
     514}
     515
     516void KernelBuilder::protectOutputStream(const std::string & name, const bool readOnly) {
     517    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     518    Value * const handle = getStreamHandle(name);
     519    Value * const base = buf->getBaseAddress(this, handle);
     520    Value * sz = ConstantExpr::getSizeOf(buf->getType());
     521    sz = CreateMul(sz, getInt64(buf->getBufferBlocks()));
     522    sz = CreateMul(sz, CreateZExt(buf->getStreamSetCount(this, handle), getInt64Ty()));
     523    CreateMProtect(base, sz, readOnly ? CBuilder::READ : (CBuilder::READ | CBuilder::WRITE));
     524}
    291525   
    292526CallInst * KernelBuilder::createDoSegmentCall(const std::vector<Value *> & args) {
    293 //    Function * const doSegment = mKernel->getDoSegmentFunction(getModule());
    294 //    assert (doSegment->getArgumentList().size() == args.size());
    295 //    return CreateCall(doSegment, args);
    296527    return mKernel->makeDoSegmentCall(*this, args);
    297528}
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.h

    r5706 r5755  
    4444    }
    4545
    46     llvm::Value * getProcessedItemCount(const std::string & name) {
     46    llvm::Value * getProcessedItemCount(const std::string & name) {       
    4747        return getInternalItemCount(name, Kernel::PROCESSED_ITEM_COUNT_SUFFIX);
    4848    }
     
    7171    // use in implementing kernels.
    7272
    73     llvm::Value * getInputStreamPtr(const std::string & name, llvm::Value * const blockIndex);
    74 
    7573    llvm::Value * getInputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex);
    7674
     
    8280
    8381    llvm::Value * getInputStreamSetCount(const std::string & name);
    84 
    85     llvm::Value * getOutputStreamPtr(const std::string & name, llvm::Value * const blockIndex);
    8682
    8783    llvm::Value * getOutputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex);
     
    10399    llvm::Value * getBaseAddress(const std::string & name);
    104100
     101    llvm::Value * getBlockAddress(const std::string & name, llvm::Value * const blockIndex);
     102
    105103    void CreateCopyBack(const std::string & name, llvm::Value * from, llvm::Value * to);
    106104
     
    121119    llvm::Value * getLinearlyWritableItems(const std::string & name, llvm::Value * fromPos, bool reverse = false);
    122120   
    123     llvm::Value * copy(const std::string & name, llvm::Value * target, llvm::Value * source, llvm::Value * itemsToCopy, const unsigned alignment = 0);
     121    void CreateStreamCpy(const std::string & name, llvm::Value * const target, llvm::Value * const targetOffset, llvm::Value * const source, llvm::Value * const sourceOffset, llvm::Value * const itemsToCopy, const unsigned itemAlignment);
    124122
    125123    llvm::BasicBlock * CreateConsumerWait();
     
    143141    }
    144142
     143    void protectOutputStream(const std::string & name, const bool readOnly);
     144
    145145protected:
    146146
     
    158158
    159159    void setInternalItemCount(const std::string & name, const std::string & suffix, llvm::Value * const value);
    160 
    161 private:
    162 
    163     llvm::Value * computeBlockIndex(llvm::Value * itemCount);
    164160
    165161protected:
  • icGREP/icgrep-devel/icgrep/kernels/linebreak_kernel.cpp

    r5706 r5755  
    2121
    2222LineBreakKernelBuilder::LineBreakKernelBuilder(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned basisBitsCount)
    23 : PabloKernel(b, "lb", {Binding{b->getStreamSetTy(basisBitsCount), "basis"}}, {Binding{b->getStreamSetTy(1), "linebreak", FixedRate(), Add1()}}) {
     23: PabloKernel(b, "lb",
     24    {Binding{b->getStreamSetTy(basisBitsCount), "basis", FixedRate(), Principal()}},
     25    {Binding{b->getStreamSetTy(1), "linebreak", FixedRate(), Add1()}}) {
    2426
    2527}
  • icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.cpp

    r5706 r5755  
    1111using namespace kernel;
    1212
    13 Value * getInputPtr(const std::unique_ptr<KernelBuilder> & iBuilder, Value * blockStartPtr, Value * offset) {
    14     return iBuilder->CreateGEP(
    15             iBuilder->CreatePointerCast(blockStartPtr, iBuilder->getInt32Ty()->getPointerTo()),
    16             offset
    17             );
    18 }
     13Value * LZ4ByteStreamDecoderKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * numOfStrides) {
    1914
    20 Value * selectMin(const std::unique_ptr<KernelBuilder> & iBuilder, Value * a, Value * b) {
    21     return iBuilder->CreateSelect(iBuilder->CreateICmpULT(a, b), a, b);
    22 }
     15    BasicBlock * entry_block = b->GetInsertBlock();
     16    BasicBlock * loopBody = b->CreateBasicBlock("bytestream_block_loop_body");
     17    BasicBlock * loopExit = b->CreateBasicBlock("bytestream_block_loop_exit");
     18    Type * const i32PtrTy = b->getInt32Ty()->getPointerTo();
     19    Type * const sizeTy = b->getSizeTy();
     20    assert (mBufferSize > 0);
     21    Value * bufferSize = b->getSize(mBufferSize);
     22    Value * bufferSizeMask = b->getSize(mBufferSize - 1);
     23    Value * const iterations = b->getAvailableItemCount("literalIndexes");
     24    Value * const inputBufferBasePtr = b->getRawInputPointer("inputStream", b->getInt32(0));
     25    Value * const outputBufferBasePtr = b->getRawOutputPointer("outputStream", b->getInt32(0));
     26    Value * baseLiteralStartPtr = b->getInputStreamBlockPtr("literalIndexes", b->getSize(0));
     27    baseLiteralStartPtr = b->CreatePointerCast(baseLiteralStartPtr, i32PtrTy);
     28    Value * baseLiteralLengthPtr = b->getInputStreamBlockPtr("literalIndexes", b->getSize(1));
     29    baseLiteralLengthPtr = b->CreatePointerCast(baseLiteralLengthPtr, i32PtrTy);
     30    Value * baseMatchOffsetPtr = b->getInputStreamBlockPtr("matchIndexes", b->getSize(0));
     31    baseMatchOffsetPtr = b->CreatePointerCast(baseMatchOffsetPtr, i32PtrTy);
     32    Value * baseMatchLengthPtr = b->getInputStreamBlockPtr("matchIndexes", b->getSize(1));
     33    baseMatchLengthPtr = b->CreatePointerCast(baseMatchLengthPtr, i32PtrTy);
     34    b->CreateBr(loopBody);
    2335
    24 void LZ4ByteStreamDecoderKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
    25     BasicBlock * entry_block = iBuilder->GetInsertBlock();
    26     BasicBlock * loopBody = iBuilder->CreateBasicBlock("bytestream_block_loop_body");
    27     BasicBlock * loopExit = iBuilder->CreateBasicBlock("bytestream_block_loop_exit");
    28 
    29     Value * bufferSize = iBuilder->getSize(mBufferSize);
    30     Value * bufferSizeMask = iBuilder->CreateSub(bufferSize, iBuilder->getSize(1));
    31     Value * iterations = selectMin(iBuilder,
    32             iBuilder->getSize(iBuilder->getBitBlockWidth()),
    33             iBuilder->CreateSub(iBuilder->getAvailableItemCount("literalIndexes"), iBuilder->getProcessedItemCount("literalIndexes")));
    34     Value * inputBufferBasePtr = iBuilder->getRawInputPointer("inputStream", iBuilder->getSize(0));
    35     Value * outputBufferBasePtr = iBuilder->getRawOutputPointer("outputStream", iBuilder->getSize(0));
    36     iBuilder->CreateBr(loopBody);
    37 
    38     iBuilder->SetInsertPoint(loopBody);
    39     PHINode * phiInputIndex = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2, "inputIndex");
    40     phiInputIndex->addIncoming(iBuilder->getSize(0), entry_block);
     36    b->SetInsertPoint(loopBody);
     37    PHINode * phiInputIndex = b->CreatePHI(sizeTy, 2, "inputIndex");
     38    phiInputIndex->addIncoming(b->getSize(0), entry_block);
    4139
    4240    // =================================================
    4341    // Indexes extraction.
    44     Value * literalStartPtr = getInputPtr(iBuilder,
    45             iBuilder->getInputStreamBlockPtr("literalIndexes", iBuilder->getSize(0)), phiInputIndex);
    46     Value * literalLengthPtr = getInputPtr(iBuilder,
    47             iBuilder->getInputStreamBlockPtr("literalIndexes", iBuilder->getSize(1)), phiInputIndex);
    48     Value * matchOffsetPtr = getInputPtr(iBuilder,
    49             iBuilder->getInputStreamBlockPtr("matchIndexes", iBuilder->getSize(0)), phiInputIndex);
    50     Value * matchLengthPtr = getInputPtr(iBuilder,
    51             iBuilder->getInputStreamBlockPtr("matchIndexes", iBuilder->getSize(1)), phiInputIndex);
    52     Value * literalStart = iBuilder->CreateZExt(iBuilder->CreateLoad(literalStartPtr), iBuilder->getSizeTy());
    53     Value * literalLength = iBuilder->CreateZExt(iBuilder->CreateLoad(literalLengthPtr), iBuilder->getSizeTy());
    54     Value * matchOffset = iBuilder->CreateZExt(iBuilder->CreateLoad(matchOffsetPtr), iBuilder->getSizeTy());
    55     Value * matchLength = iBuilder->CreateZExt(iBuilder->CreateLoad(matchLengthPtr), iBuilder->getSizeTy());
    5642
    57 //    iBuilder->CallPrintInt(" ----- literalStart", literalStart);
    58 //    iBuilder->CallPrintInt(" ----- literalLength", literalLength);
    59 //    iBuilder->CallPrintInt(" ----- matchOffset", matchOffset);
    60 //    iBuilder->CallPrintInt(" ----- matchLength", matchLength);
    6143
    62 //#if 0
    63 //    Value * processedItem = iBuilder->CreateAdd(iBuilder->getProcessedItemCount("literalIndexes"), phiInputIndex);
    64 //    iBuilder->CallPrintInt("ProccessedItem", processedItem);
    65 //    iBuilder->CallPrintInt("LiteralStart", literalStart);
    66 //    iBuilder->CallPrintInt("LiteralLength", literalLength);
    67 //    iBuilder->CallPrintInt("MatchOffset", matchOffset);
    68 //    iBuilder->CallPrintInt("MatchLength", matchLength);
    69 //#endif
     44    Value * literalStartPtr = b->CreateGEP(baseLiteralStartPtr, phiInputIndex);
     45    Value * literalLengthPtr = b->CreateGEP(baseLiteralLengthPtr, phiInputIndex);
     46    Value * matchOffsetPtr = b->CreateGEP(baseMatchOffsetPtr, phiInputIndex);
     47    Value * matchLengthPtr = b->CreateGEP(baseMatchLengthPtr, phiInputIndex);
     48
     49    Value * literalStart = b->CreateZExt(b->CreateLoad(literalStartPtr), sizeTy);
     50    Value * literalLength = b->CreateZExt(b->CreateLoad(literalLengthPtr), sizeTy);
     51    Value * matchOffset = b->CreateZExt(b->CreateLoad(matchOffsetPtr), sizeTy);
     52    Value * matchLength = b->CreateZExt(b->CreateLoad(matchLengthPtr), sizeTy);
    7053
    7154    // =================================================
    7255    // Literals.
    73     Value * outputItems = iBuilder->getProducedItemCount("outputStream");
    74     Value * bufferOffset = iBuilder->CreateAnd(outputItems, bufferSizeMask);
    75     Value * remainingBuffer = iBuilder->CreateSub(bufferSize, bufferOffset);
    76     Value * copyLength1 = selectMin(iBuilder, remainingBuffer, literalLength);
    77     iBuilder->CreateMemCpy(
    78             iBuilder->CreateGEP(outputBufferBasePtr, bufferOffset),
    79             iBuilder->CreateGEP(inputBufferBasePtr, literalStart),
     56    Value * outputItems = b->getProducedItemCount("outputStream");
     57    Value * bufferOffset = b->CreateAnd(outputItems, bufferSizeMask);
     58    Value * remainingBuffer = b->CreateSub(bufferSize, bufferOffset);
     59    Value * copyLength1 = b->CreateUMin(remainingBuffer, literalLength);
     60    b->CreateMemCpy(
     61            b->CreateGEP(outputBufferBasePtr, bufferOffset),
     62            b->CreateGEP(inputBufferBasePtr, literalStart),
    8063            copyLength1, 1);    // no alignment guaranteed
    8164    // Potential wrap around.
    82     iBuilder->CreateMemCpy(
     65    b->CreateMemCpy(
    8366            outputBufferBasePtr,
    84             iBuilder->CreateGEP(inputBufferBasePtr, iBuilder->CreateAdd(literalStart, copyLength1)),
    85             iBuilder->CreateSub(literalLength, copyLength1), 1); // Buffer start is aligned.
     67            b->CreateGEP(inputBufferBasePtr, b->CreateAdd(literalStart, copyLength1)),
     68            b->CreateSub(literalLength, copyLength1), 1); // Buffer start is aligned.
    8669    // NOTE: Test case reported non-8-byte alignment
    87     outputItems = iBuilder->CreateAdd(outputItems, literalLength);
     70    outputItems = b->CreateAdd(outputItems, literalLength);
    8871
    8972    // =================================================
     
    9275    // [cur, cur+matchLength] sequentially, with two ranges potentially overlapping.
    9376    // If matchOffset is at least 4, we copy 4 bytes at a time; otherwise, one byte at a time.
    94     Value * matchStart = iBuilder->CreateSub(outputItems, matchOffset);
    95     Value * baseSrcOffset = iBuilder->CreateAnd(matchStart, bufferSizeMask);
    96     Value * baseDstOffset = iBuilder->CreateAnd(outputItems, bufferSizeMask);
    97     Value * copyStep = iBuilder->CreateSelect(
    98             iBuilder->CreateICmpULT(matchOffset, iBuilder->getSize(4)),
    99             iBuilder->getSize(1),
    100             iBuilder->getSize(4)
    101             );
    102     BasicBlock * cpyLoopCond = iBuilder->CreateBasicBlock("matchcopy_loop_cond");
    103     BasicBlock * cpyLoopBody = iBuilder->CreateBasicBlock("matchcopy_loop_body");
    104     BasicBlock * cpyLoopExit = iBuilder->CreateBasicBlock("matchcopy_loop_exit");
    105     iBuilder->CreateBr(cpyLoopCond);
     77    Value * matchStart = b->CreateSub(outputItems, matchOffset);
     78    Value * baseSrcOffset = b->CreateAnd(matchStart, bufferSizeMask);
     79    Value * baseDstOffset = b->CreateAnd(outputItems, bufferSizeMask);
     80    Value * const copyStep = b->CreateSelect(
     81            b->CreateICmpULT(matchOffset, b->getSize(4)),
     82            b->getSize(1),
     83            b->getSize(4));
     84    BasicBlock * cpyLoopCond = b->CreateBasicBlock("matchcopy_loop_cond");
     85    BasicBlock * cpyLoopBody = b->CreateBasicBlock("matchcopy_loop_body");
     86    BasicBlock * cpyLoopExit = b->CreateBasicBlock("matchcopy_loop_exit");
     87    b->CreateBr(cpyLoopCond);
    10688
    107     iBuilder->SetInsertPoint(cpyLoopCond);
    108     PHINode * phiSrcOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3, "srcOffset");
    109     PHINode * phiDstOffset = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3, "dstOffset");
    110     PHINode * phiIter = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3, "iterator");
     89    b->SetInsertPoint(cpyLoopCond);
     90    PHINode * phiSrcOffset = b->CreatePHI(sizeTy, 3, "srcOffset");
     91    PHINode * phiDstOffset = b->CreatePHI(sizeTy, 3, "dstOffset");
     92    PHINode * phiIter = b->CreatePHI(sizeTy, 3, "iterator");
    11193    phiSrcOffset->addIncoming(baseSrcOffset, loopBody);
    11294    phiDstOffset->addIncoming(baseDstOffset, loopBody);
    113     phiIter->addIncoming(iBuilder->getSize(0), loopBody);
    114     iBuilder->CreateCondBr(
    115             iBuilder->CreateICmpUGE(phiIter, matchLength),
     95    phiIter->addIncoming(b->getSize(0), loopBody);
     96    b->CreateCondBr(
     97            b->CreateICmpUGE(phiIter, matchLength),
    11698            cpyLoopExit,
    11799            cpyLoopBody
    118100            );
    119101
    120     iBuilder->SetInsertPoint(cpyLoopBody);
     102    b->SetInsertPoint(cpyLoopBody);
    121103//#ifndef NDEBUG
    122104//    iBuilder->CallPrintIntToStderr("srcOffset", phiSrcOffset);
    123105//    iBuilder->CallPrintIntToStderr("dstOffset", phiDstOffset);
    124106//#endif
    125     BasicBlock * reachingBufferEnd_then = iBuilder->CreateBasicBlock("matchcopy_reaching_buf_end_then");
    126     BasicBlock * reachingBufferEnd_else = iBuilder->CreateBasicBlock("matchcopy_reaching_buf_end_else");
    127     Value * distSrcEnd = iBuilder->CreateSub(bufferSize, phiSrcOffset);
    128     Value * distDstEnd = iBuilder->CreateSub(bufferSize, phiDstOffset);
    129     Value * minDist = selectMin(iBuilder, distSrcEnd, distDstEnd);
    130     iBuilder->CreateUnlikelyCondBr(
    131             iBuilder->CreateICmpULE(minDist, iBuilder->getSize(4)),
     107    BasicBlock * reachingBufferEnd_then = b->CreateBasicBlock("matchcopy_reaching_buf_end_then");
     108    BasicBlock * reachingBufferEnd_else = b->CreateBasicBlock("matchcopy_reaching_buf_end_else");
     109    Value * distSrcEnd = b->CreateSub(bufferSize, phiSrcOffset);
     110    Value * distDstEnd = b->CreateSub(bufferSize, phiDstOffset);
     111    Value * minDist = b->CreateUMin(distSrcEnd, distDstEnd);
     112    b->CreateUnlikelyCondBr(
     113            b->CreateICmpULE(minDist, b->getSize(4)),
    132114            reachingBufferEnd_then,
    133115            reachingBufferEnd_else
    134116            );
    135117
    136     iBuilder->SetInsertPoint(reachingBufferEnd_then);
    137     Value * src8 = iBuilder->CreateGEP(outputBufferBasePtr, phiSrcOffset);
    138     Value * dst8 = iBuilder->CreateGEP(outputBufferBasePtr, phiDstOffset);
    139     iBuilder->CreateStore(iBuilder->CreateLoad(src8), dst8);
    140     Value * newSrcOffset = iBuilder->CreateAnd(
    141             iBuilder->CreateAdd(phiSrcOffset, iBuilder->getSize(1)),
     118    b->SetInsertPoint(reachingBufferEnd_then);
     119    Value * src8 = b->CreateGEP(outputBufferBasePtr, phiSrcOffset);
     120    Value * dst8 = b->CreateGEP(outputBufferBasePtr, phiDstOffset);
     121    b->CreateStore(b->CreateLoad(src8), dst8);
     122    Value * newSrcOffset = b->CreateAnd(
     123            b->CreateAdd(phiSrcOffset, b->getSize(1)),
    142124            bufferSizeMask
    143125            );
    144     Value * newDstOffset = iBuilder->CreateAnd(
    145             iBuilder->CreateAdd(phiDstOffset, iBuilder->getSize(1)),
     126    Value * newDstOffset = b->CreateAnd(
     127            b->CreateAdd(phiDstOffset, b->getSize(1)),
    146128            bufferSizeMask
    147129            );
    148130    phiSrcOffset->addIncoming(newSrcOffset, reachingBufferEnd_then);
    149131    phiDstOffset->addIncoming(newDstOffset, reachingBufferEnd_then);
    150     phiIter->addIncoming(iBuilder->CreateAdd(phiIter, iBuilder->getSize(1)), reachingBufferEnd_then);
    151     iBuilder->CreateBr(cpyLoopCond);
     132    phiIter->addIncoming(b->CreateAdd(phiIter, b->getSize(1)), reachingBufferEnd_then);
     133    b->CreateBr(cpyLoopCond);
    152134
    153     iBuilder->SetInsertPoint(reachingBufferEnd_else);
     135    b->SetInsertPoint(reachingBufferEnd_else);
    154136    // Copy 4 bytes at a time (regardless of step length).
    155     Value * src32 = iBuilder->CreatePointerCast(
    156             iBuilder->CreateGEP(outputBufferBasePtr, phiSrcOffset),
    157             iBuilder->getInt32Ty()->getPointerTo());
    158     Value * dst32 = iBuilder->CreatePointerCast(
    159             iBuilder->CreateGEP(outputBufferBasePtr, phiDstOffset),
    160             iBuilder->getInt32Ty()->getPointerTo());
     137    Value * src32 = b->CreatePointerCast(
     138            b->CreateGEP(outputBufferBasePtr, phiSrcOffset),
     139            b->getInt32Ty()->getPointerTo());
     140    Value * dst32 = b->CreatePointerCast(
     141            b->CreateGEP(outputBufferBasePtr, phiDstOffset),
     142            b->getInt32Ty()->getPointerTo());
    161143    // Force unaligned load/store of an int32.
    162     iBuilder->CreateAlignedStore(iBuilder->CreateAlignedLoad(src32, 1), dst32, 1);
    163     newSrcOffset = iBuilder->CreateAnd(
    164             iBuilder->CreateAdd(phiSrcOffset, copyStep),
     144    b->CreateAlignedStore(b->CreateAlignedLoad(src32, 1), dst32, 1);
     145    newSrcOffset = b->CreateAnd(
     146            b->CreateAdd(phiSrcOffset, copyStep),
    165147            bufferSizeMask
    166148            );
    167     newDstOffset = iBuilder->CreateAnd(
    168             iBuilder->CreateAdd(phiDstOffset, copyStep),
     149    newDstOffset = b->CreateAnd(
     150            b->CreateAdd(phiDstOffset, copyStep),
    169151            bufferSizeMask
    170152            );
    171153    phiSrcOffset->addIncoming(newSrcOffset, reachingBufferEnd_else);
    172154    phiDstOffset->addIncoming(newDstOffset, reachingBufferEnd_else);
    173     phiIter->addIncoming(iBuilder->CreateAdd(phiIter, copyStep), reachingBufferEnd_else);
    174     iBuilder->CreateBr(cpyLoopCond);
     155    phiIter->addIncoming(b->CreateAdd(phiIter, copyStep), reachingBufferEnd_else);
     156    b->CreateBr(cpyLoopCond);
    175157
    176     iBuilder->SetInsertPoint(cpyLoopExit);
    177     outputItems = iBuilder->CreateAdd(outputItems, matchLength);
    178     iBuilder->setProducedItemCount("outputStream", outputItems);
     158    b->SetInsertPoint(cpyLoopExit);
     159    outputItems = b->CreateAdd(outputItems, matchLength);
     160    b->setProducedItemCount("outputStream", outputItems);
    179161
    180     Value * newInputIndex = iBuilder->CreateAdd(phiInputIndex, iBuilder->getSize(1));
     162    Value * newInputIndex = b->CreateAdd(phiInputIndex, b->getSize(1));
    181163    phiInputIndex->addIncoming(newInputIndex, cpyLoopExit);
    182     iBuilder->CreateUnlikelyCondBr(
    183             iBuilder->CreateICmpEQ(newInputIndex, iterations),
     164    b->CreateUnlikelyCondBr(
     165            b->CreateICmpEQ(newInputIndex, iterations),
    184166            loopExit,
    185167            loopBody
    186168            );
    187169
    188     iBuilder->SetInsertPoint(loopExit);
    189 //#ifndef NDEBUG
    190 //    iBuilder->CallPrintInt("Decompressed bytes", iBuilder->getProducedItemCount("outputStream"));
    191 //#endif
     170    b->SetInsertPoint(loopExit);
     171    return numOfStrides;
    192172}
    193173
    194174
    195175LZ4ByteStreamDecoderKernel::LZ4ByteStreamDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, size_t bufferSize)
    196 : BlockOrientedKernel("lz4ByteStreamDecoder",
     176: MultiBlockKernel("lz4ByteStreamDecoder",
    197177    // Inputs
    198178    {Binding{iBuilder->getStreamSetTy(2, 32), "literalIndexes"},
    199179     Binding{iBuilder->getStreamSetTy(2, 32), "matchIndexes"},
    200      Binding{iBuilder->getStreamSetTy(1, 8), "inputStream", UnknownRate(), LookBehind(65536)}},
     180     Binding{iBuilder->getStreamSetTy(1, 8), "inputStream", FixedRate(), { Deferred(), LookBehind(65536) }}},
    201181    // Outputs
    202182    {Binding{iBuilder->getStreamSetTy(1, 8), "outputStream", UnknownRate()}},
  • icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.h

    r5440 r5755  
    1414namespace kernel {
    1515
    16 class LZ4ByteStreamDecoderKernel : public BlockOrientedKernel {
     16class LZ4ByteStreamDecoderKernel final : public MultiBlockKernel {
    1717public:
    1818    LZ4ByteStreamDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, size_t bufferSize);
    1919protected:
    20     void generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
     20    llvm::Value * generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * numOfStrides) override;
    2121private:
    2222    size_t mBufferSize;
  • icGREP/icgrep-devel/icgrep/kernels/lz4_index_decoder.cpp

    r5706 r5755  
    1818
    1919#define printRTDebugMsg(MSG) \
    20     if (DEBUG_RT_PRINT) iBuilder->CallPrintMsgToStderr(MSG)
     20    if (DEBUG_RT_PRINT) b->CallPrintMsgToStderr(MSG)
    2121
    2222#define printRTDebugInt(NAME, X) \
    23     if (DEBUG_RT_PRINT) iBuilder->CallPrintIntToStderr(NAME, X)
     23    if (DEBUG_RT_PRINT) b->CallPrintIntToStderr(NAME, X)
    2424
    2525#define printGlobalPos() \
    26     printRTDebugInt("GlobalPos", iBuilder->CreateAdd(blockStartPos, iBuilder->CreateLoad(sOffset)))
     26    printRTDebugInt("GlobalPos", b->CreateAdd(blockStartPos, b->CreateLoad(sOffset)))
    2727
    2828namespace {
    2929
    30 Value * generateBitswap(const std::unique_ptr<KernelBuilder> & iBuilder, Value * v) {
    31     Value * bswapFunc = Intrinsic::getDeclaration(iBuilder->getModule(),
     30Value * generateBitswap(const std::unique_ptr<KernelBuilder> & b, Value * v) {
     31    Value * bswapFunc = Intrinsic::getDeclaration(b->getModule(),
    3232            Intrinsic::bswap, v->getType());
    33     return iBuilder->CreateCall(bswapFunc, {v});
    34 }
    35 
    36 Value * selectMin(const std::unique_ptr<KernelBuilder> & iBuilder, Value * a, Value * b) {
    37     return iBuilder->CreateSelect(iBuilder->CreateICmpULT(a, b), a, b);
    38 }
    39 
    40 Value * createStackVar(const std::unique_ptr<KernelBuilder> & iBuilder, Type * type, StringRef name, Value * initializer = nullptr) {
    41     Value * var = iBuilder->CreateAlloca(type, nullptr, name);
     33    return b->CreateCall(bswapFunc, {v});
     34}
     35
     36Value * createStackVar(const std::unique_ptr<KernelBuilder> & b, Type * type, StringRef name, Value * initializer = nullptr) {
     37    Value * var = b->CreateAlloca(type, nullptr, name);
    4238    if (initializer) {
    43         iBuilder->CreateStore(initializer, var);
     39        b->CreateStore(initializer, var);
    4440    } else {
    45         iBuilder->CreateStore(ConstantInt::get(type, 0), var);
     41        b->CreateStore(ConstantInt::get(type, 0), var);
    4642    }
    4743    return var;
    4844}
    4945
    50 void incStackVar(const std::unique_ptr<KernelBuilder> & iBuilder, Value * svar, Value * increment = nullptr) {
    51     Value * value = iBuilder->CreateLoad(svar);
     46void incStackVar(const std::unique_ptr<KernelBuilder> & b, Value * svar, Value * increment = nullptr) {
     47    Value * value = b->CreateLoad(svar);
    5248    if (increment) {
    53         value = iBuilder->CreateAdd(value, increment);
     49        value = b->CreateAdd(value, increment);
    5450    } else {
    55         value = iBuilder->CreateAdd(value, ConstantInt::get(value->getType(), 1));
     51        value = b->CreateAdd(value, ConstantInt::get(value->getType(), 1));
    5652    }
    57     iBuilder->CreateStore(value, svar);
    58 }
    59 
    60 Value * getOutputPtr(const std::unique_ptr<KernelBuilder> & iBuilder, Value * blockStartPtr, Value * offset) {
    61     return iBuilder->CreateGEP(
    62             iBuilder->CreatePointerCast(blockStartPtr, iBuilder->getInt32Ty()->getPointerTo()),
     53    b->CreateStore(value, svar);
     54}
     55
     56Value * getOutputPtr(const std::unique_ptr<KernelBuilder> & b, Value * blockStartPtr, Value * offset) {
     57    return b->CreateGEP(
     58            b->CreatePointerCast(blockStartPtr, b->getInt32Ty()->getPointerTo()),
    6359            offset
    6460            );
     
    7066 * Get the offset within the current word.
    7167 */
    72 Value * LZ4IndexDecoderKernel::getWordOffset(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
    73     Value * offset = iBuilder->CreateLoad(sOffset);
     68Value * LZ4IndexDecoderKernel::getWordOffset(const std::unique_ptr<kernel::KernelBuilder> & b) {
     69    Value * offset = b->CreateLoad(sOffset);
    7470    IntegerType * type = cast<IntegerType>(offset->getType());
    7571    Constant * mask = ConstantInt::get(type, wordWidth - 1);
    76     return iBuilder->CreateAnd(offset, mask);
     72    return b->CreateAnd(offset, mask);
    7773}
    7874
     
    8076 * Get the offset of the start of the current word.
    8177 */
    82 Value * LZ4IndexDecoderKernel::getWordStartOffset(const std::unique_ptr<KernelBuilder> & iBuilder) {
    83     Value * offset = iBuilder->CreateLoad(sOffset);
     78Value * LZ4IndexDecoderKernel::getWordStartOffset(const std::unique_ptr<KernelBuilder> & b) {
     79    Value * offset = b->CreateLoad(sOffset);
    8480    IntegerType * type = cast<IntegerType>(offset->getType());
    8581    Constant * mask = ConstantExpr::getNeg(ConstantInt::get(type, wordWidth));
    86     return iBuilder->CreateAnd(offset, mask);
     82    return b->CreateAnd(offset, mask);
    8783}
    8884
     
    9187 * If offset is not provided, load the current byte by default.
    9288 */
    93 Value * LZ4IndexDecoderKernel::loadRawByte(const std::unique_ptr<KernelBuilder> & iBuilder, Value * offset) {
    94     Value * blockStartPtr = iBuilder->CreatePointerCast(
    95             iBuilder->getInputStreamBlockPtr("byteStream", iBuilder->getInt32(0)),
    96             iBuilder->getInt8PtrTy()
     89Value * LZ4IndexDecoderKernel::loadRawByte(const std::unique_ptr<KernelBuilder> & b, Value * offset) {
     90    Value * blockStartPtr = b->CreatePointerCast(
     91            b->getInputStreamBlockPtr("byteStream", b->getInt32(0)),
     92            b->getInt8PtrTy()
    9793            );
    9894    if (offset == nullptr) {
    99         offset = iBuilder->CreateLoad(sOffset);
     95        offset = b->CreateLoad(sOffset);
    10096    }
    101     Value * ptr = iBuilder->CreateGEP(blockStartPtr, offset);
    102     return iBuilder->CreateLoad(ptr);
     97    Value * ptr = b->CreateGEP(blockStartPtr, offset);
     98    return b->CreateLoad(ptr);
    10399}
    104100
     
    110106 * cleared  = ....111
    111107 */
    112 void LZ4IndexDecoderKernel::setExtenderUntilOffset(const std::unique_ptr<KernelBuilder> & iBuilder) {
     108void LZ4IndexDecoderKernel::setExtenderUntilOffset(const std::unique_ptr<KernelBuilder> & b) {
    113109    // Little-endian, offset counts from LSB
    114110    // extender = extender ^ ~((1 << offset) -1)
    115     Value * extender = iBuilder->CreateLoad(sExtender);
    116     Value * wordOffset = iBuilder->CreateZExt(
    117             getWordOffset(iBuilder),
    118             iBuilder->getSizeTy()
    119             );
    120     Value * one = iBuilder->getSize(1);
    121     Value * mask = iBuilder->CreateSub(
    122             iBuilder->CreateShl(one, wordOffset),
     111    Value * extender = b->CreateLoad(sExtender);
     112    Value * wordOffset = b->CreateZExt(
     113            getWordOffset(b),
     114            b->getSizeTy()
     115            );
     116    Value * one = b->getSize(1);
     117    Value * mask = b->CreateSub(
     118            b->CreateShl(one, wordOffset),
    123119            one);
    124     extender = iBuilder->CreateOr(extender, mask);
    125     iBuilder->CreateStore(extender, sExtender);
     120    extender = b->CreateOr(extender, mask);
     121    b->CreateStore(extender, sExtender);
    126122}
    127123
     
    131127 * Called when we potentially reach a new word.  Usually followed by setExtenderUntilOffset.
    132128 */
    133 void LZ4IndexDecoderKernel::loadCurrentExtender(const std::unique_ptr<KernelBuilder> & iBuilder) {
    134     Value * offset = iBuilder->CreateLoad(sOffset);
     129void LZ4IndexDecoderKernel::loadCurrentExtender(const std::unique_ptr<KernelBuilder> & b) {
     130    Value * offset = b->CreateLoad(sOffset);
    135131    IntegerType * type = cast<IntegerType>(offset->getType());
    136132    ConstantInt * shift = ConstantInt::get(type, std::log2(wordWidth));
    137     Value * shiftedOffset = iBuilder->CreateLShr(offset, shift);
    138     Value * extender = iBuilder->CreateExtractElement(extenders, shiftedOffset);
    139     iBuilder->CreateStore(extender, sExtender);
    140 }
    141 
    142 
    143 void LZ4IndexDecoderKernel::generateProduceOutput(const std::unique_ptr<KernelBuilder> &iBuilder) {
    144     Value * producedItem = iBuilder->getProducedItemCount("literalIndexes");
     133    Value * shiftedOffset = b->CreateLShr(offset, shift);
     134    Value * extender = b->CreateExtractElement(extenders, shiftedOffset);
     135    b->CreateStore(extender, sExtender);
     136}
     137
     138
     139void LZ4IndexDecoderKernel::generateProduceOutput(const std::unique_ptr<KernelBuilder> &b) {
     140    Value * producedItem = b->getProducedItemCount("literalIndexes");
    145141
    146142//#ifndef NDEBUG
    147 //    iBuilder->CallPrintInt("ProducedItem", producedItem);
     143//    b->CallPrintInt("ProducedItem", producedItem);
    148144//    // LiteralStart is adjusted to be relative to the block start, so that
    149145//    // the output can be compared against that of the reference implementation.
    150 //    Value * literalStart = iBuilder->CreateSub(iBuilder->getScalarField("LiteralStart"), iBuilder->getScalarField("LZ4BlockStart"));
    151 //    iBuilder->CallPrintInt("LiteralStart", literalStart);
    152 //    iBuilder->CallPrintInt("LiteralLength", iBuilder->getScalarField("LiteralLength"));
    153 //    iBuilder->CallPrintInt("MatchOffset", iBuilder->getScalarField("MatchOffset"));
    154 //    iBuilder->CallPrintInt("MatchLength", iBuilder->getScalarField("MatchLength"));
     146//    Value * literalStart = b->CreateSub(b->getScalarField("LiteralStart"), b->getScalarField("LZ4BlockStart"));
     147//    b->CallPrintInt("LiteralStart", literalStart);
     148//    b->CallPrintInt("LiteralLength", b->getScalarField("LiteralLength"));
     149//    b->CallPrintInt("MatchOffset", b->getScalarField("MatchOffset"));
     150//    b->CallPrintInt("MatchLength", b->getScalarField("MatchLength"));
    155151//#endif
    156152    printRTDebugMsg("--------------");
    157153
    158     Value * outputOffset = iBuilder->CreateAnd(
    159             iBuilder->CreateTrunc(producedItem, iBuilder->getInt32Ty()),
    160             iBuilder->getInt32(iBuilder->getBitBlockWidth() - 1)
    161             );  // producedItem % blockWidth (as blockWidth is always a power of 2)
    162     Value * literalStartPtr = getOutputPtr(iBuilder,
    163             iBuilder->getOutputStreamBlockPtr("literalIndexes", iBuilder->getInt32(0)), outputOffset);
    164     Value * literalLengthPtr = getOutputPtr(iBuilder,
    165             iBuilder->getOutputStreamBlockPtr("literalIndexes", iBuilder->getInt32(1)), outputOffset);
    166     Value * matchOffsetPtr = getOutputPtr(iBuilder,
    167             iBuilder->getOutputStreamBlockPtr("matchIndexes", iBuilder->getInt32(0)), outputOffset);
    168     Value * matchLengthPtr = getOutputPtr(iBuilder,
    169             iBuilder->getOutputStreamBlockPtr("matchIndexes", iBuilder->getInt32(1)), outputOffset);
    170     iBuilder->CreateStore(iBuilder->getScalarField("LiteralStart"), literalStartPtr);
    171     iBuilder->CreateStore(iBuilder->getScalarField("LiteralLength"), literalLengthPtr);
    172     iBuilder->CreateStore(iBuilder->getScalarField("MatchOffset"), matchOffsetPtr);
    173     iBuilder->CreateStore(iBuilder->getScalarField("MatchLength"), matchLengthPtr);
    174     iBuilder->setProducedItemCount("literalIndexes", iBuilder->CreateAdd(producedItem, iBuilder->getSize(1)));
     154    Value * outputOffset = b->CreateAnd(b->CreateTrunc(producedItem, b->getInt32Ty()), b->getInt32(b->getBitBlockWidth() - 1));  // producedItem % blockWidth (as blockWidth is always a power of 2)
     155    Value * baseLiteralStartPtr = b->getOutputStreamBlockPtr("literalIndexes", b->getInt32(0));
     156
     157    Value * literalStartPtr = getOutputPtr(b, baseLiteralStartPtr, outputOffset);
     158    Value * literalLengthPtr = getOutputPtr(b,
     159            b->getOutputStreamBlockPtr("literalIndexes", b->getInt32(1)), outputOffset);
     160    Value * matchOffsetPtr = getOutputPtr(b,
     161            b->getOutputStreamBlockPtr("matchIndexes", b->getInt32(0)), outputOffset);
     162    Value * matchLengthPtr = getOutputPtr(b,
     163            b->getOutputStreamBlockPtr("matchIndexes", b->getInt32(1)), outputOffset);
     164
     165    b->CreateStore(b->getScalarField("LiteralStart"), literalStartPtr);
     166    b->CreateStore(b->getScalarField("LiteralLength"), literalLengthPtr);
     167    b->CreateStore(b->getScalarField("MatchOffset"), matchOffsetPtr);
     168    b->CreateStore(b->getScalarField("MatchLength"), matchLengthPtr);
     169    b->setProducedItemCount("literalIndexes", b->CreateAdd(producedItem, b->getSize(1)));
    175170    // matchIndexes has a fixed ratio of 1:1 w.r.t. literalIndexes.
    176171}
    177172
    178173
    179 void LZ4IndexDecoderKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
    180     BasicBlock * entry_block = iBuilder->GetInsertBlock();
    181     BasicBlock * exit_block = iBuilder->CreateBasicBlock("exit");
     174void LZ4IndexDecoderKernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
     175    BasicBlock * entry_block = b->GetInsertBlock();
     176    BasicBlock * exit_block = b->CreateBasicBlock("exit");
    182177
    183178    // %entry
    184     iBuilder->SetInsertPoint(entry_block);
     179    b->SetInsertPoint(entry_block);
    185180    printRTDebugMsg("entry");
    186181    // Global positions in the byte stream.
    187     Value * blockNo = iBuilder->getScalarField("BlockNo");
    188     blockStartPos = iBuilder->CreateMul(blockNo, iBuilder->getInt32(iBuilder->getBitBlockWidth()), "blockStartPos");
    189     extenders = iBuilder->CreateBitCast(
    190             iBuilder->loadInputStreamBlock("extenders", iBuilder->getInt32(0)),
    191             VectorType::get(iBuilder->getSizeTy(), iBuilder->getBitBlockWidth() / iBuilder->getSizeTy()->getBitWidth()),
     182    Value * blockNo = b->getScalarField("BlockNo");
     183    blockStartPos = b->CreateMul(blockNo, b->getInt32(b->getBitBlockWidth()), "blockStartPos");
     184    extenders = b->CreateBitCast(
     185            b->loadInputStreamBlock("extenders", b->getInt32(0)),
     186            VectorType::get(b->getSizeTy(), b->getBitBlockWidth() / b->getSizeTy()->getBitWidth()),
    192187            "extenders");
    193188    // Create a series of stack variables which will be promoted by mem2reg.
    194     sOffset = createStackVar(iBuilder, iBuilder->getInt32Ty(), "offset");
     189    sOffset = createStackVar(b, b->getInt32Ty(), "offset");
    195190    // tempLength has different meanings in different states.
    196     sTempLength = createStackVar(iBuilder, iBuilder->getInt32Ty(), "tempLength", iBuilder->getScalarField("TempLength"));
    197     sTempCount = createStackVar(iBuilder, iBuilder->getInt32Ty(), "tempCount", iBuilder->getScalarField("TempCount"));
    198     sState = createStackVar(iBuilder, iBuilder->getInt8Ty(), "state", iBuilder->getScalarField("State"));
    199     sExtender = createStackVar(iBuilder, iBuilder->getSizeTy(), "extender",
    200             iBuilder->CreateExtractElement(extenders, iBuilder->getInt32(0)));
    201 
    202     BasicBlock * skippingBytes = iBuilder->CreateBasicBlock("skipping_bytes");
    203     BasicBlock * dispatch = iBuilder->CreateBasicBlock("dispatch");
    204 
    205     iBuilder->CreateCondBr(
    206             iBuilder->CreateICmpUGT(iBuilder->getScalarField("BytesToSkip"), iBuilder->getInt32(0)),
     191    sTempLength = createStackVar(b, b->getInt32Ty(), "tempLength", b->getScalarField("TempLength"));
     192    sTempCount = createStackVar(b, b->getInt32Ty(), "tempCount", b->getScalarField("TempCount"));
     193    sState = createStackVar(b, b->getInt8Ty(), "state", b->getScalarField("State"));
     194    sExtender = createStackVar(b, b->getSizeTy(), "extender",
     195            b->CreateExtractElement(extenders, b->getInt32(0)));
     196
     197    BasicBlock * skippingBytes = b->CreateBasicBlock("skipping_bytes");
     198    BasicBlock * dispatch = b->CreateBasicBlock("dispatch");
     199
     200    b->CreateCondBr(
     201            b->CreateICmpUGT(b->getScalarField("BytesToSkip"), b->getInt32(0)),
    207202            skippingBytes, dispatch
    208203            );
    209204
    210205    // %skipping_bytes
    211     generateSkippingBytes(iBuilder, skippingBytes, exit_block);
     206    generateSkippingBytes(b, skippingBytes, exit_block);
    212207    // Insert point is at the end of skippingBytes.
    213     iBuilder->CreateBr(dispatch);
     208    b->CreateBr(dispatch);
    214209
    215210    // %dispatch
     
    217212
    218213    // %at_block_checksum
    219     BasicBlock * atBlockChecksum = iBuilder->CreateBasicBlock("at_block_checksum");
    220     generateAtBlockChecksum(iBuilder, atBlockChecksum, skippingBytes);
     214    BasicBlock * atBlockChecksum = b->CreateBasicBlock("at_block_checksum");
     215    generateAtBlockChecksum(b, atBlockChecksum, skippingBytes);
    221216 
    222217    // %at_block_size
    223     BasicBlock * atBlockSize = iBuilder->CreateBasicBlock("at_block_size");
    224     generateAtBlockSize(iBuilder, atBlockSize, skippingBytes, exit_block);
     218    BasicBlock * atBlockSize = b->CreateBasicBlock("at_block_size");
     219    generateAtBlockSize(b, atBlockSize, skippingBytes, exit_block);
    225220
    226221    // %at_token
    227     BasicBlock * atToken = iBuilder->CreateBasicBlock("at_token");
    228     generateAtToken(iBuilder, atToken, exit_block);
     222    BasicBlock * atToken = b->CreateBasicBlock("at_token");
     223    generateAtToken(b, atToken, exit_block);
    229224
    230225    // %extending_literal_length
    231     BasicBlock * extendingLiteralLen = iBuilder->CreateBasicBlock("extending_literal_length");
    232     generateExtendingLiteralLen(iBuilder, extendingLiteralLen, exit_block);
     226    BasicBlock * extendingLiteralLen = b->CreateBasicBlock("extending_literal_length");
     227    generateExtendingLiteralLen(b, extendingLiteralLen, exit_block);
    233228
    234229    // %at_literals
    235     BasicBlock * atLiterals = iBuilder->CreateBasicBlock("at_literals");
    236     generateAtLiterals(iBuilder, atLiterals);
    237     iBuilder->CreateBr(skippingBytes);
     230    BasicBlock * atLiterals = b->CreateBasicBlock("at_literals");
     231    generateAtLiterals(b, atLiterals);
     232    b->CreateBr(skippingBytes);
    238233
    239234    // %at_first_offset
     
    241236    // If the whole LZ4 block is done, process the (optional) checksum.
    242237    // Otherwise, go around to process the next sequence.
    243     BasicBlock * atOffset1 = iBuilder->CreateBasicBlock("at_first_offset");
    244     iBuilder->SetInsertPoint(atOffset1);
    245     Value * nowGlobalPos = iBuilder->CreateAdd(blockStartPos, iBuilder->CreateLoad(sOffset));
    246     BasicBlock * blockEnd_else = iBuilder->CreateBasicBlock("block_end_else");
     238    BasicBlock * atOffset1 = b->CreateBasicBlock("at_first_offset");
     239    b->SetInsertPoint(atOffset1);
     240    Value * nowGlobalPos = b->CreateAdd(blockStartPos, b->CreateLoad(sOffset));
     241    BasicBlock * blockEnd_else = b->CreateBasicBlock("block_end_else");
    247242    // Conditional branch inserted at the end of the last block.
    248     iBuilder->CreateUnlikelyCondBr(
    249             iBuilder->CreateICmpEQ(nowGlobalPos, iBuilder->getScalarField("LZ4BlockEnd")),
     243    b->CreateUnlikelyCondBr(
     244            b->CreateICmpEQ(nowGlobalPos, b->getScalarField("LZ4BlockEnd")),
    250245            atBlockChecksum, blockEnd_else
    251246            );
    252     generateAtFirstOffset(iBuilder, blockEnd_else, exit_block);
     247    generateAtFirstOffset(b, blockEnd_else, exit_block);
    253248
    254249    // %at_second_offset
    255     BasicBlock * atOffset2 = iBuilder->CreateBasicBlock("at_second_offset");