Ignore:
Timestamp:
Apr 24, 2018, 2:57:34 PM (15 months ago)
Author:
nmedfort
Message:

Restructured MultiBlock kernel. Removal of Swizzled buffers. Inclusion of PopCount rates / non-linear access. Modifications to several kernels to better align them with the kernel and pipeline changes.

Location:
icGREP/icgrep-devel/icgrep
Files:
2 added
44 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r5955 r5985  
    6868endif()
    6969
    70 SET(KERNEL_SRC kernels/attributes.cpp kernels/processing_rate.cpp kernels/interface.cpp kernels/kernel.cpp kernels/streamset.cpp kernels/kernel_builder.cpp)
     70SET(KERNEL_SRC kernels/attributes.cpp kernels/processing_rate.cpp kernels/interface.cpp kernels/kernel.cpp kernels/multiblock_kernel.cpp kernels/block_kernel.cpp kernels/streamset.cpp kernels/kernel_builder.cpp)
    7171SET(KERNEL_SRC ${KERNEL_SRC} kernels/source_kernel.cpp kernels/s2p_kernel.cpp kernels/deletion.cpp kernels/swizzle.cpp kernels/p2s_kernel.cpp kernels/stdout_kernel.cpp)
    7272
  • icGREP/icgrep-devel/icgrep/IR_Gen/CBuilder.cpp

    r5924 r5985  
    2222#include <unistd.h>
    2323#include <stdio.h>
     24#include <boost/format.hpp>
    2425
    2526#if defined(__i386__)
     
    9192}
    9293
     94static inline bool notConstantZeroArraySize(const AllocaInst * const Base) {
     95    if (const Constant * const as = dyn_cast_or_null<Constant>(Base->getArraySize())) {
     96        return !as->isNullValue();
     97    }
     98    return false;
     99}
     100
    93101static Value * checkStackAddress(CBuilder * const b, Value * const Ptr, Value * const Size, AllocaInst * const Base) {
    94102    DataLayout DL(b->getModule());
    95103    IntegerType * const intPtrTy = cast<IntegerType>(DL.getIntPtrType(Ptr->getType()));
    96104    Value * sz = ConstantExpr::getBitCast(ConstantExpr::getSizeOf(Base->getAllocatedType()), intPtrTy);
    97     if (dyn_cast_or_null<Constant>(Base->getArraySize()) && !cast<Constant>(Base->getArraySize())->isNullValue()) {
     105    if (notConstantZeroArraySize(Base)) {
    98106        sz = b->CreateMul(sz, b->CreateZExtOrTrunc(Base->getArraySize(), intPtrTy));
    99107    }
     
    141149}
    142150
    143 Value * CBuilder::CreateUDivCeil(Value * const number, Value * const divisor, const Twine & Name) {
     151Value * CBuilder::CreateCeilUDiv(Value * const number, Value * const divisor, const Twine & Name) {
    144152    assert (number->getType() == divisor->getType());
    145153    Type * const t = number->getType();
     
    155163        }
    156164    }
    157     CreateAssert(divisor, "CreateUDivCeil divisor cannot be 0!");
     165    CreateAssert(divisor, "CreateCeilUDiv divisor cannot be 0!");
    158166    return CreateUDiv(n, divisor, Name);
    159167}
    160168
    161169Value * CBuilder::CreateRoundUp(Value * const number, Value * const divisor, const Twine &Name) {
    162     return CreateMul(CreateUDivCeil(number, divisor), divisor, Name);
     170    return CreateMul(CreateCeilUDiv(number, divisor), divisor, Name);
    163171}
    164172
     
    835843
    836844void __report_failure(const char * msg, const uintptr_t * trace, const uint32_t n) {
     845    // TODO: look into boost stacktrace, available from version 1.65
    837846    raw_fd_ostream out(STDERR_FILENO, false);
    838847    if (trace) {
     
    842851            const auto pc = trace[i];
    843852            trace_string << format_hex(pc, 16) << "   ";
    844             const auto len = codegen::ProgramName.length() + 32;
    845             char cmd[len];
    846             snprintf(cmd, len,"addr2line -fpCe %s %p", codegen::ProgramName.data(), reinterpret_cast<void *>(pc));
    847             FILE * f = popen(cmd, "r");
     853            #ifdef __APPLE__
     854            const auto translator = "atos -o %s %p";
     855            #else
     856            const auto translator = "addr2line -fpCe %s %p";
     857            #endif
     858            const auto cmd = boost::format(translator) % codegen::ProgramName.data() % pc;
     859            FILE * const f = popen(cmd.str().data(), "r");
    848860            if (f) {
    849861                char buffer[1024] = {0};
     
    852864                }
    853865                pclose(f);
     866            } else { // TODO: internal default
     867
     868
    854869            }
    855870        }
  • icGREP/icgrep-devel/icgrep/IR_Gen/CBuilder.h

    r5919 r5985  
    5555    // Division with rounding up to the ceiling
    5656    // Equivalent to CreateUDiv(CreateAdd(number, CreateSub(divisor, ConstantInt::get(divisor->getType(), 1))), divisor)
    57     llvm::Value * CreateUDivCeil(llvm::Value * number, llvm::Value * divisor, const llvm::Twine &Name = "");
     57    llvm::Value * CreateCeilUDiv(llvm::Value * number, llvm::Value * divisor, const llvm::Twine &Name = "");
     58
     59    llvm::Value * CreateUDivCeil(llvm::Value * number, llvm::Value * divisor, const llvm::Twine &Name = "") {
     60        return CreateCeilUDiv(number, divisor, Name);
     61    }
    5862   
    5963    // Round up to a multiple of divisor.
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_builder.cpp

    r5979 r5985  
    171171    VectorType * const vecTy = cast<VectorType>(value->getType());
    172172    IntegerType * const intTy = getIntNTy(vecTy->getBitWidth());
     173    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     174        Type * const ty = shift->getType();
     175        Value * const scaled = CreateMul(shift, ConstantInt::get(ty, fw));
     176        Value * const inbounds = CreateICmpULE(scaled, ConstantInt::get(ty, vecTy->getBitWidth()));
     177        CreateAssert(inbounds, "shift exceeds vector width");
     178    }
    173179    value = CreateBitCast(value, intTy);
    174180    shift = CreateZExtOrTrunc(CreateMul(shift, ConstantInt::get(shift->getType(), fw)), intTy);
     
    179185    VectorType * const vecTy = cast<VectorType>(value->getType());
    180186    IntegerType * const intTy = getIntNTy(vecTy->getBitWidth());
     187    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     188        Type * const ty = shift->getType();
     189        Value * const scaled = CreateMul(shift, ConstantInt::get(ty, fw));
     190        Value * const inbounds = CreateICmpULE(scaled, ConstantInt::get(ty, vecTy->getBitWidth()));
     191        CreateAssert(inbounds, "shift exceeds vector width");
     192    }
    181193    value = CreateBitCast(value, intTy);
    182194    shift = CreateZExtOrTrunc(CreateMul(shift, ConstantInt::get(shift->getType(), fw)), intTy);
     
    614626}
    615627
     628Value * IDISA_Builder::bitblock_popcount(Value * const to_count) {
     629    const auto fieldWidth = getSizeTy()->getBitWidth();
     630    auto fields = (getBitBlockWidth() / fieldWidth);
     631    Value * fieldCounts = simd_popcount(fieldWidth, to_count);
     632    while (fields > 1) {
     633        fields /= 2;
     634        fieldCounts = CreateAdd(fieldCounts, mvmd_srli(fieldWidth, fieldCounts, fields));
     635    }
     636    return mvmd_extract(fieldWidth, fieldCounts, 0);
     637}
     638
    616639Value * IDISA_Builder::simd_and(Value * a, Value * b) {
    617640    return a->getType() == b->getType() ? CreateAnd(a, b) : CreateAnd(bitCast(a), bitCast(b));
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_builder.h

    r5972 r5985  
    160160    virtual llvm::Value * bitblock_set_bit(llvm::Value * pos);
    161161
     162    // returns a scalar with the popcount of this block
     163    llvm::Value * bitblock_popcount(llvm::Value * const to_count);
     164
    162165    virtual void CreateBaseFunctions() {}
    163166   
     
    172175    }
    173176
    174     static llvm::VectorType * getStreamTy(llvm::LLVMContext & C, const unsigned FieldWidth = 1) {
     177    static llvm::VectorType * LLVM_READNONE getStreamTy(llvm::LLVMContext & C, const unsigned FieldWidth = 1) {
    175178        return llvm::VectorType::get(llvm::IntegerType::getIntNTy(C, FieldWidth), 0);
    176179    }
    177180
    178     static llvm::ArrayType * getStreamSetTy(llvm::LLVMContext & C, const unsigned NumElements = 1, const unsigned FieldWidth = 1) {
     181    static llvm::ArrayType * LLVM_READNONE getStreamSetTy(llvm::LLVMContext & C, const unsigned NumElements = 1, const unsigned FieldWidth = 1) {
    179182        return llvm::ArrayType::get(getStreamTy(C, FieldWidth), NumElements);
    180183    }
  • icGREP/icgrep-devel/icgrep/base64.cpp

    r5856 r5985  
    2121#include <boost/interprocess/mapped_region.hpp>
    2222#include <boost/interprocess/anonymous_shared_memory.hpp>
     23#include <boost/math/common_factor_rt.hpp>
    2324#include <sys/stat.h>
    2425#include <fcntl.h>
     
    6263
    6364    //Round up to a multiple of 3.
    64     const auto bufferSize = ((codegen::SegmentSize * codegen::BufferSegments * codegen::ThreadNum + 2) / 3) * 3;
     65    const auto bufferSize = (codegen::SegmentSize * codegen::BufferSegments);
     66    const auto expandedSize = boost::lcm(boost::lcm(bufferSize, 3U), 4U);
    6567
    6668    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
     
    6971    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
    7072   
    71     const auto outputBufferSize = ((bufferSize + 2) / 3) * 4;
    72 
    73     StreamSetBuffer * Expanded3_4Out = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), outputBufferSize);
     73    StreamSetBuffer * Expanded3_4Out = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), expandedSize);
    7474    Kernel * expandK = pxDriver.addKernelInstance<expand3_4Kernel>(iBuilder);
    7575    pxDriver.makeKernelCall(expandK, {ByteStream}, {Expanded3_4Out});
    7676   
    77     StreamSetBuffer * Radix64out = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), outputBufferSize);
     77    StreamSetBuffer * Radix64out = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), bufferSize);
    7878    Kernel * radix64K = pxDriver.addKernelInstance<radix64Kernel>(iBuilder);
    7979    pxDriver.makeKernelCall(radix64K, {Expanded3_4Out}, {Radix64out});
     
    8484        pxDriver.makeKernelCall(base64K, {Radix64out}, {Base64out});
    8585    } else {
    86         StreamSetBuffer * Base64out = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), outputBufferSize);
     86        StreamSetBuffer * Base64out = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), bufferSize);
    8787        Kernel * base64K = pxDriver.addKernelInstance<base64Kernel>(iBuilder);
    8888        pxDriver.makeKernelCall(base64K, {Radix64out}, {Base64out});       
  • icGREP/icgrep-devel/icgrep/character_deletion.cpp

    r5939 r5985  
    6262    codegen::ParseCommandLineOptions(argc, argv, {&lz4dFlags, codegen::codegen_flags()});
    6363
    64 
    65     std::string fileName = inputFile;
     64    const std::string fileName = inputFile;
    6665
    6766    std::ifstream f(fileName, std::ios::binary | std::ios::ate);
     
    118117
    119118
    120 //    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks);
    121 //    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks);
    122     StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1);
    123     StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1);
    124     Kernel * delK = pxDriver.addKernelInstance<SwizzledDeleteByPEXTkernel>(iBuilder, 64, 8);
     119    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1);
     120    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1);
     121    Kernel * delK = pxDriver.addKernelInstance<SwizzledDeleteByPEXTkernel>(iBuilder, 8);
    125122    pxDriver.makeKernelCall(delK, {CharacterMarkerBuffer, BasisBits}, {u16Swizzle0, u16Swizzle1});
    126123
     
    141138    pxDriver.makeKernelCall(outK, {DecompressedByteStream}, {});
    142139
    143     /*
    144     Kernel * outK = pxDriver.addKernelInstance<FileSink>(iBuilder, 8);
    145     outK->setInitialArguments({iBuilder->GetString(outputFile)});
    146     pxDriver.makeKernelCall(outK, {DecompressedByteStream}, {});
    147     */
    148 
    149 
    150140    pxDriver.generatePipelineIR();
    151141
     
    159149    auto mainFunc = reinterpret_cast<MainFunctionType>(pxDriver.getMain());
    160150
    161 
    162151    mainFunc(fileBuffer, mFilesize);
    163152
  • icGREP/icgrep-devel/icgrep/character_deposit.cpp

    r5873 r5985  
    123123
    124124
    125 //    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1);
    126 //    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1);
     125    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1);
     126    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1);
    127127
    128     StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1, 2);
    129     StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1, 2);
     128//    StreamSetBuffer * u16Swizzle0 = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1, 2);
     129//    StreamSetBuffer * u16Swizzle1 = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(4), inputBufferBlocks, 1, 2);
    130130    Kernel * delK = pxDriver.addKernelInstance<SwizzledDeleteByPEXTkernel>(iBuilder, 64, 8);
    131131    pxDriver.makeKernelCall(delK, {CharacterMarkerBuffer, BasisBits}, {u16Swizzle0, u16Swizzle1});
    132132
    133133    StreamSetBuffer * depositedSwizzle0 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), outputBufferBlocks, 1);
    134     StreamSetBuffer * depositedSwizzle1 = pxDriver.addBuffer<SwizzledCopybackBuffer>(iBuilder, iBuilder->getStreamSetTy(4), outputBufferBlocks, 1);
    135 
    136     /*
    137     Kernel * pdep0K = pxDriver.addKernelInstance<PDEPkernel>(iBuilder, 4, 4, 64, "pdep0");
     134    Kernel * pdep0K = pxDriver.addKernelInstance<PDEPkernel>(iBuilder, 4, "pdep0");
    138135    pxDriver.makeKernelCall(pdep0K, {CharacterMarkerBuffer, u16Swizzle0}, {depositedSwizzle0});
    139136
    140     Kernel * pdep1K = pxDriver.addKernelInstance<PDEPkernel>(iBuilder, 4, 4, 64, "pdep1");
     137    Kernel * pdep1K = pxDriver.addKernelInstance<PDEPkernel>(iBuilder, 4, "pdep1");
    141138    pxDriver.makeKernelCall(pdep1K, {CharacterMarkerBuffer, u16Swizzle1}, {u16Swizzle1});
    142139    */
  • icGREP/icgrep-devel/icgrep/editd/editd_cpu_kernel.cpp

    r5706 r5985  
    9191
    9292void editdCPUKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * remainingBytes) {
    93     idb->setScalarField("EOFmask", idb->bitblock_mask_from(mAvailableItemCount[0]));
     93    idb->setScalarField("EOFmask", idb->bitblock_mask_from(remainingBytes));
    9494    CreateDoBlockMethodCall(idb);
    9595}
  • icGREP/icgrep-devel/icgrep/editd/editd_gpu_kernel.cpp

    r5706 r5985  
    9393
    9494void editdGPUKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * remainingBytes) {
    95     idb->setScalarField("EOFmask", idb->bitblock_mask_from(mAvailableItemCount[0]));
     95    idb->setScalarField("EOFmask", idb->bitblock_mask_from(remainingBytes));
    9696    CreateDoBlockMethodCall(idb);
    9797}
  • icGREP/icgrep-devel/icgrep/kernels/attributes.h

    r5967 r5985  
    33
    44#include <vector>
     5#include <llvm/Support/Compiler.h>
     6#include <assert.h>
    57
    68namespace kernel {
     
    6466        // buffer size calculations.
    6567
     68        ZeroExtend, /// NOT DONE
     69
     70        // If the available item count of an input stream it less than some other input
     71        // stream(s), it will be zero-extended to the length of the larger stream. If
     72        // this option is not set and the kernel does not have a MustExplicitlyTerminate
     73        // attribute, it will end once any input has been exhausted.
     74
    6675        IndependentRegionBegin, IndependentRegionEnd, /// NOT DONE
    6776
     
    96105        // is enough data to execute a stride rather than the upper bound.)
    97106
    98         DisableSufficientChecking,
    99 
    100         // Workaround attribute, force disable sufficient data or sufficient space checking in pipelilne, always assume that
    101         // the data or space is sufficient
    102 
    103107        /** OUTPUT STREAM ATTRIBUTES **/
    104108
     
    120124        // perform any operations accordingly
    121125
    122         BlockSize, /// NOT DONE
    123 
    124         // A BlockSize(K) attribute, where K=2^k for some value of k>=4 declares
    125         // that the layout of stream data items within the corresponding input
    126         // or output buffer is arranged in blocks of K items each.   In each
    127         // block, the data buffer contains K items of the first stream in the
    128         // set, followed by K items of the next stream in the set and so on,
    129         // up to and including K items of the last stream in the set.
    130 
    131         // (Note: this replaces the concept of swizzling and anticipates that
    132         // the pipeline will take on the role of automatically inserting the
    133         // swizzling code necessary).
     126        BlockSize,
     127
     128        // Typically a kernel assumes that each stream of a stream set is a linear sequence
     129        // of items. The BlockSize(K) attribute informs the kernel is actually divided into
     130        // (BlockWidth / K) elements and each "stream" actually contains K items of the
     131        // first stream followed by K elements of the second stream and so on. The notion
     132        // of produced/processed item count changes to suite. I.e., when typical kernels
     133        // report that they've processed/produced up to the i-th position, it means:
     134
     135        //                                         v
     136        //                 ...|AAAAAAAA AAAAAAAA AA*              |...
     137        //                 ...|BBBBBBBB BBBBBBBB BB*              |...
     138        //                 ...|CCCCCCCC CCCCCCCC CC*              |...
     139        //                 ...|DDDDDDDD DDDDDDDD DD*              |...
     140
     141        // However, if (BlockWidth / K) is 4, the same i-th position above is actually:
     142
     143        //                 ...|AAAAAAAA|BBBBBBBB|CCCCCCCC|DDDDDDDD|...
     144        //                 ...|AAAAAAAA|BBBBBBBB|CCCCCCCC|DDDDDDDD|...
     145        //                 ...|AA*     |BB*     |CC*     |DD*     |...
     146        //                 ...|        |        |        |        |...
     147
     148        // (Note: this replaces the concept of swizzling and anticipates that the pipeline
     149        // will take on the role of automatically inserting the swizzling code necessary).
    134150
    135151        ReverseRegionBegin, ReverseRegionEnd, /// NOT DONE
     
    141157        // something ambiguous after we've found its end position in some prior kernel.
    142158
    143 
    144         Swizzled,
    145 
    146         // Whether the input streamset is in swizzled form
    147159
    148160//        Here is a revised definition of SegmentedReverse:
     
    198210
    199211
     212        RequiresLinearAccess, PermitsNonLinearAccess,
     213
     214        // Indicates whether all unprocessed / consumed space is safely accessible by the
     215        // MultiBlockKernel code. By default, input streams and any output stream in which
     216        // we know a priori exactly how much data will be written into the overflow buffer
     217        // are opt-out and all others are opt-in. The reason is that writing non-linear
     218        // output at a non-Fixed rate be costly to manage. E.g.,
     219
     220        //                             BUFFER          v   OVERFLOW
     221        //                |?????############...........###|#####???|
     222        //                                 n           p  k    m
     223
     224        // Suppose from a given offset p, we write n items but only have space for k items
     225        // in the stream set buffer. Assuming we wrote more than one stride, we know that
     226        // there are (m - k) items in the overflow but may not know what our value of m is
     227        // unless we can derive the relationship between m and n a priori. The problem is
     228        // that the kernel will write the second stride's output at the (m - k)-th position
     229        // of the 0-th block and but final reported count will be n. We can safely mitigate
     230        // this in many ways:
     231
     232        // (1) when we detect that we could write into the overflow region of the buffer,
     233        // we can zero out the memory of both the overflow *and* the 0-th block of the
     234        // buffer then combine both by OR-ing the streams and writing them to the 0-th
     235        // block. The advantage is we require no extra memory but the disadvantage is that
     236        // the kernel is now relies on the pipeline to ensure that whenever we may write
     237        // into the overflow that the 0-th block is fully consumed.
     238
     239        // (2) the overflow region is equal to the size of the buffer (i.e., employ double
     240        // buffering.) The advantage of this is the kernel makes no assumptions about the
     241        // pipeline itself. The disadvantage is we could have to copy a lot of data if k
     242        // is very small and the amount we will copy is variable.
     243
     244        // (3) use stack allocated temporary buffers. This method has similar advantages /
     245        // disadvantages to 2 but trades heap space allocations for stack based ones.
     246
     247        // (4) force people writing kernels to record the number of items written each
     248        // stride. The advantage of this is it would be as cheap as (1) but requires the
     249        // kernel writer maintain the current stride index and that the kernel logic has
     250        // a natural breakpoint in the algorithm in which to record the number.
     251
    200252        /** KERNEL ATTRIBUTES **/
    201253
     
    226278        // in the pipeline have also terminated.
    227279
    228         MustConsumeAll,
     280        MustProcessAll,
    229281
    230282        //Workaround, the kernel will finish only when all of the inputs are consumed
     
    273325    friend struct Binding;
    274326    friend Attribute Add1();
     327    friend Attribute BlockSize(const unsigned k);
    275328    friend Attribute Principal();
    276329    friend Attribute AlwaysConsume();
    277     friend Attribute DisableSufficientChecking();
    278330    friend Attribute RoundUpTo(const unsigned);
    279331    friend Attribute LookAhead(const unsigned);
     
    283335    friend Attribute ConditionalRegionBegin();
    284336    friend Attribute ConditionalRegionEnd();
    285     friend Attribute Swizzled();
    286337    friend Attribute CanTerminateEarly();
    287338    friend Attribute MustExplicitlyTerminate();
    288     friend Attribute MustConsumeAll();
     339    friend Attribute RequiresLinearAccess();
     340    friend Attribute PermitsNonLinearAccess();
    289341
    290342    Attribute(const KindId kind, const unsigned k) : mKind(kind), mAmount(k) { }
     
    318370    Attribute & addAttribute(Attribute attribute);
    319371
    320     bool hasAttributes() const {
     372    bool LLVM_READNONE hasAttributes() const {
    321373        return !empty();
    322374    }
    323375
    324     bool hasAttribute(const AttributeId id) const {
     376    bool LLVM_READNONE hasAttribute(const AttributeId id) const {
    325377        return __findAttribute(id) != nullptr;
    326378    }
     
    337389
    338390};
    339 
    340391
    341392inline Attribute Add1() {
     
    351402}
    352403
    353 inline Attribute DisableSufficientChecking() {
    354     return Attribute(Attribute::KindId::DisableSufficientChecking, 0);
    355 }
    356 
    357404inline Attribute Principal() {
    358405    return Attribute(Attribute::KindId::Principal, 0);
     
    371418}
    372419
     420inline Attribute BlockSize(const unsigned k) {
     421    assert (k && ((k & (k - 1)) == 0));
     422    return Attribute(Attribute::KindId::BlockSize, k);
     423}
     424
     425inline Attribute Swizzled() {
     426    return BlockSize(64);
     427}
     428
     429inline Attribute RequiresLinearAccess() {
     430    return Attribute(Attribute::KindId::RequiresLinearAccess, 0);
     431}
     432
     433inline Attribute PermitsNonLinearAccess() {
     434    return Attribute(Attribute::KindId::PermitsNonLinearAccess, 0);
     435}
     436
    373437inline Attribute Misaligned() {
    374438    return Attribute(Attribute::KindId::Misaligned, 0);
     
    391455}
    392456
    393 inline Attribute MustConsumeAll() {
    394     return Attribute(Attribute::KindId::MustConsumeAll, 0);
    395 }
    396 
    397 inline Attribute Swizzled() {
    398     return Attribute(Attribute::KindId::Swizzled, 0);
    399 }
    400 
    401 }
    402 
     457}
    403458#endif // ATTRIBUTES_H
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r5926 r5985  
    4848}
    4949
    50 SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, unsigned PEXT_width)
    51 : BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount),
    52                   {Binding{iBuilder->getStreamSetTy(), "delMaskSet"}, Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"}},
     50SwizzledDeleteByPEXTkernel::SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width)
     51: BlockOrientedKernel("PEXTdel" + std::to_string(PEXT_width) + "_" + std::to_string(streamCount),
     52                  {Binding{b->getStreamSetTy(), "delMaskSet"}, Binding{b->getStreamSetTy(streamCount), "inputStreamSet"}},
    5353                  {}, {}, {}, {})
    54 , mDelCountFieldWidth(fw)
    5554, mStreamCount(streamCount)
    56 , mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
     55, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
    5756// add mSwizzleFactor - 1 to mStreamCount before dividing by mSwizzleFactor
    5857// to prevent rounding errors.
     
    6059, mPEXTWidth(PEXT_width)
    6160{
    62     assert((mDelCountFieldWidth > 0) && ((mDelCountFieldWidth & (mDelCountFieldWidth - 1)) == 0)
     61    assert((mPEXTWidth > 0) && ((mPEXTWidth & (mPEXTWidth - 1)) == 0)
    6362        && "mDelCountFieldWidth must be a power of 2");
    6463    assert(mSwizzleFactor > 1 && "mDelCountFieldWidth must be less than the block width");
     
    6665
    6766    // why, if we have 1 input stream, are there n output swizzle streams rather than 1 of n?
    68     mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1), "outputSwizzle0", BoundedRate(0, 1)});
    69     addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData0");
     67    Type * const outputTy = b->getStreamSetTy(mSwizzleFactor, 1);
     68
     69    mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle0", BoundedRate(0, 1), BlockSize(PEXT_width)}); // PopcountOfNot("delMaskSet")
     70    addScalar(b->getBitBlockType(), "pendingSwizzleData0");
    7071    for (unsigned i = 1; i < mSwizzleSetCount; i++) {
    71         mStreamSetOutputs.push_back(Binding{iBuilder->getStreamSetTy(mSwizzleFactor, 1),
    72             "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0")});
    73         addScalar(iBuilder->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
    74     }
    75     addScalar(iBuilder->getSizeTy(), "pendingOffset");
    76 }
    77 
    78 void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
     72        mStreamSetOutputs.push_back(Binding{outputTy, "outputSwizzle" + std::to_string(i), RateEqualTo("outputSwizzle0"), BlockSize(PEXT_width)});
     73        addScalar(b->getBitBlockType(), "pendingSwizzleData" + std::to_string(i));
     74    }
     75    addScalar(b->getSizeTy(), "pendingOffset");
     76}
     77
     78void SwizzledDeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    7979    // We use delMask to apply the same PEXT delete operation to each stream in the input stream set
    80     Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
    81     const auto masks = get_PEXT_masks(iBuilder, delMask);
    82     generateProcessingLoop(iBuilder, masks, delMask);
    83 }
    84 
    85 void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> &iBuilder, Value * remainingBytes) {
    86     const auto originalProducedItemCount = iBuilder->getProducedItemCount("outputSwizzle0");
    87     IntegerType * vecTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
    88     Value * remaining = iBuilder->CreateZExt(remainingBytes, vecTy);
    89     Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    90     Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
    91     const auto masks = get_PEXT_masks(iBuilder, delMask);
    92     generateProcessingLoop(iBuilder, masks, delMask);
    93 
    94     const auto newProducedItemCount = iBuilder->getProducedItemCount("outputSwizzle0");
    95     Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
    96     Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
    97    
    98     Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    99     Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
    100     Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
    101 
    102     const auto deltaOutputIndex = iBuilder->CreateSub(
    103             iBuilder->CreateUDiv(newProducedItemCount, iBuilder->getSize(iBuilder->getBitBlockWidth())),
    104             iBuilder->CreateUDiv(originalProducedItemCount, iBuilder->getSize(iBuilder->getBitBlockWidth()))
    105     );
    106     outputIndex = iBuilder->CreateAdd(outputIndex, iBuilder->CreateMul(deltaOutputIndex, iBuilder->getSize(iBuilder->getBitBlockWidth() / mDelCountFieldWidth)));
    107 
    108     Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
    109 
    110     // Write the pending data.
    111     for (unsigned i = 0; i < mSwizzleSetCount; i++) {
    112         Value * pendingData = iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i));
    113         Value * outputStreamPtr = iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), iBuilder->getInt32(0), outputIndex);
    114         iBuilder->CreateBlockAlignedStore(pendingData, outputStreamPtr);
    115     }
    116     iBuilder->setProducedItemCount("outputSwizzle0", iBuilder->CreateAdd(pendingOffset, outputProduced));
    117 }
    118 
    119 std::vector<Value *> SwizzledDeleteByPEXTkernel::get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
    120     // Del mask marks locations of bits we want to delete with 1 bits. Delete marked bits by extracting only the bits not marked in this way.
    121     // Apply the PEXT operation mPEXTWidth bits at a time (e.g. if block is 256 bits and mPEXTWidth is 64, apply 4 PEXT ops to full process block.
    122     Value * m = iBuilder->fwCast(mPEXTWidth, iBuilder->simd_not(del_mask));
    123     std::vector<Value *> masks;
    124     for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
    125         masks.push_back(iBuilder->CreateExtractElement(m, i));
    126     }
    127     return masks;
    128 }
    129 
    130 void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
    131                                                 Value * delMask) {
    132     Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask)); // delMask marks the positions we want to extract
    133     std::vector<Value *> counts;
    134     for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/ mPEXTWidth; i++) {
    135         // Store the deletion counts for each PEXT field
    136         counts.push_back(iBuilder->CreateExtractElement(delCount, i)); // Extract field i from SIMD register delCount
    137     }
    138 
    139     generatePEXTAndSwizzleLoop(iBuilder, masks, counts);
     80    Value * const delMask = b->loadInputStreamBlock("delMaskSet", b->getInt32(0));
     81    generateProcessingLoop(b, delMask, false);
     82}
     83
     84void SwizzledDeleteByPEXTkernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingBytes) {
     85    IntegerType * const vecTy = b->getIntNTy(b->getBitBlockWidth());
     86    Value * const remaining = b->CreateZExt(remainingBytes, vecTy);
     87    Value * const EOFMask = b->bitCast(b->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
     88    Value * const delMask = b->CreateOr(EOFMask, b->loadInputStreamBlock("delMaskSet", b->getInt32(0)));
     89    generateProcessingLoop(b, delMask, true);
    14090}
    14191
     
    14393What this function does in pseudo code:
    14494for (mSwizzleFactor)
    145         create a swizzle set containing mSwizzleFactor blocks
    146         apply PEXT to each block in the swizzle set
    147         store the swizzleSet in PEXTedSwizzleSets vector
    148        
     95    create a swizzle set containing mSwizzleFactor blocks
     96    apply PEXT to each block in the swizzle set
     97    store the swizzleSet in PEXTedSwizzleSets vector
     98
    14999for (each swizzle row i)
    150         for (each swizzle set j)
    151                 processes row i in swizzle set j
    152                 store output in pendingData[j]
     100    for (each swizzle set j)
     101        processes row i in swizzle set j
     102        store output in pendingData[j]
    153103*/
    154 void SwizzledDeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks,
    155                                                     std::vector<Value *> counts) {
    156     // For each of the k swizzle sets required to apply PEXT to all input streams
    157     std::vector<std::vector<Value *>> PEXTedSwizzleSets;
    158     for (unsigned j = 0; j < mSwizzleSetCount; ++j) {
    159     // Group input blocks together into input swizzle set. Input set should contain mSwizzleSetCount blocks (e.g. for U8U16 16/4=4).
    160     // Each block belongs to a different input stream.
    161         std::vector<Value *> input;
    162         unsigned streamSelectionIndex = j * mSwizzleFactor;
    163         for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
    164                 // Check if i > mStreamCount. If it is, add null streams until we get mSwizzleSetCount streams in the input vector
    165             if ( i >= mStreamCount) {
    166                                 input.push_back(iBuilder->allZeroes());
    167             } else {
    168                 input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
    169             }
    170         }
    171         // each partiallyCompressedSwizzleSet is obtained by applying PEXT to each of the blocks in input
    172         PEXTedSwizzleSets.push_back(apply_PEXT_deletion_with_swizzle(iBuilder, masks, input));
    173     }
    174         // Compress the PEXTedSwizzleSets
     104
     105void SwizzledDeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, Value * const delMask, const bool flush) {
     106
     107    // selectors marks the positions we want to keep
     108    Value * const selectors = b->CreateNot(delMask);
     109
     110    const auto swizzleSets = makeSwizzleSets(b, selectors);
     111
     112    // Compress the PEXTedSwizzleSets
    175113    // Output is written and committed to the output buffer one swizzle at a time.
    176     Constant * blockOffsetMask = iBuilder->getSize(iBuilder->getBitBlockWidth() - 1);
    177     Constant * outputIndexShift = iBuilder->getSize(std::log2(mDelCountFieldWidth));
    178    
    179     Value * outputProduced = iBuilder->getProducedItemCount("outputSwizzle0"); // All output groups have the same count.
    180     Value * producedOffset = iBuilder->CreateAnd(outputProduced, blockOffsetMask);
    181     Value * outputIndex = iBuilder->CreateLShr(producedOffset, outputIndexShift);
    182 
    183     // There may be pending data in the kernel state, for up to mDelCountFieldWidth-1 bits per stream.
    184     Value * pendingOffset = iBuilder->getScalarField("pendingOffset");
     114    ConstantInt * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
     115    ConstantInt * const PEXT_WIDTH = b->getSize(mPEXTWidth);
     116    ConstantInt * const LOG_2_PEXT_WIDTH = b->getSize(std::log2(mPEXTWidth));
     117    ConstantInt * const LOG_2_SWIZZLE_FACTOR = b->getSize(std::log2(mSwizzleFactor));
     118    ConstantInt * const PEXT_WIDTH_MASK = b->getSize(mPEXTWidth - 1);
     119
     120    // All output groups have the same count.
     121    Value * outputProduced = b->getProducedItemCount("outputSwizzle0");
     122    outputProduced = b->CreateAdd(outputProduced, b->getScalarField("pendingOffset"));
     123    Value * const producedOffset = b->CreateAnd(outputProduced, BLOCK_WIDTH_MASK);
     124    Value * outputIndex = b->CreateLShr(producedOffset, LOG_2_PEXT_WIDTH);
     125
    185126    // There is a separate vector of pending data for each swizzle group.
    186127    std::vector<Value *> pendingData;
    187 
    188128    for (unsigned i = 0; i < mSwizzleSetCount; i++) {
    189         pendingData.push_back(iBuilder->getScalarField("pendingSwizzleData" + std::to_string(i)));
    190     }
     129        pendingData.push_back(b->getScalarField("pendingSwizzleData" + std::to_string(i)));
     130    }
     131
     132    Value * const newItemCounts = b->simd_popcount(mPEXTWidth, selectors);
    191133
    192134    // For each row i
    193135    for (unsigned i = 0; i < mSwizzleFactor; i++) {
     136
    194137        // Generate code for each of the mSwizzleFactor fields making up a block.
    195138        // We load the count for the field and process all swizzle groups accordingly.
    196         Value * newItemCount = counts[i];
    197         //iBuilder->CallPrintInt("NeW ITeM COUNT!", newItemCount); //TODO remove
    198         Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mDelCountFieldWidth), pendingOffset);
    199         Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
    200        
     139        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
     140        Value * const newItemCount = b->CreateExtractElement(newItemCounts, i);
     141        Value * const pendingSpace = b->CreateSub(PEXT_WIDTH, pendingOffset);
     142        Value * const pendingSpaceFilled = b->CreateICmpUGE(newItemCount, pendingSpace);
     143
     144        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
     145        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
     146
    201147        // Data from the ith swizzle pack of each group is processed
    202148        // according to the same newItemCount, pendingSpace, ...
    203149        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    204             Value * newItems = PEXTedSwizzleSets[j][i];
    205             //iBuilder->CallPrintRegister("NeW ITeMS!", newItems); //TODO remove
     150            Value * const newItems = swizzleSets[j][i];
    206151            // Combine as many of the new items as possible into the pending group.
    207             Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mDelCountFieldWidth,
    208                 pendingOffset)));
    209             //iBuilder->CallPrintRegister("ComBineDGROUP", combinedGroup);
     152            Value * const shiftVector = b->simd_fill(mPEXTWidth, pendingOffset);
     153            Value * const shiftedItems = b->CreateShl(newItems, shiftVector);
     154            Value * const combinedGroup = b->CreateOr(pendingData[j], shiftedItems);
    210155            // To avoid an unpredictable branch, always store the combined group, whether full or not.
    211             iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), outputIndex));
    212            
     156            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(j), swizzleIndex, blockOffset);
     157            b->CreateBlockAlignedStore(combinedGroup, outputPtr);
    213158            // Any items in excess of the space available in the current pending group overflow for the next group.
    214             Value * overFlowGroup = iBuilder->CreateLShr(newItems, iBuilder->simd_fill(mDelCountFieldWidth, pendingSpace));
     159            Value * overFlowGroup = b->CreateLShr(newItems, b->simd_fill(mPEXTWidth, pendingSpace));
    215160            // If we filled the space, then the overflow group becomes the new pending group and the index is updated.
    216             pendingData[j] = iBuilder->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
    217         }
    218         outputIndex = iBuilder->CreateSelect(pendingSpaceFilled, iBuilder->CreateAdd(outputIndex, iBuilder->getSize(1)), outputIndex);
    219         pendingOffset = iBuilder->CreateAnd(iBuilder->CreateAdd(newItemCount, pendingOffset), iBuilder->getSize(mDelCountFieldWidth-1));
    220     }
    221    
    222     iBuilder->setScalarField("pendingOffset", pendingOffset);
    223     //iBuilder->CallPrintInt("pendingOffset", pendingOffset);
    224    
    225     Value * newlyProduced = iBuilder->CreateSub(iBuilder->CreateShl(outputIndex, outputIndexShift), producedOffset);
    226     Value * produced = iBuilder->CreateAdd(outputProduced, newlyProduced);
    227     for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    228         iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    229         //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
    230     }
    231     iBuilder->setProducedItemCount("outputSwizzle0", produced);
     161            pendingData[j] = b->CreateSelect(pendingSpaceFilled, overFlowGroup, combinedGroup);
     162        }
     163
     164        Value * const swizzleIncrement = b->CreateZExt(pendingSpaceFilled, b->getSizeTy());
     165        outputIndex = b->CreateAdd(outputIndex, swizzleIncrement);
     166
     167        outputProduced = b->CreateAdd(outputProduced, newItemCount);
     168    }
     169
     170    if (flush) { // incase we selected the overflow group on the final iteration
     171        Value * const swizzleIndex = b->CreateAnd(outputIndex, mSwizzleFactor - 1);
     172        Value * const blockOffset = b->CreateLShr(outputIndex, LOG_2_SWIZZLE_FACTOR);
     173        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     174            Value * const outputPtr = b->getOutputStreamBlockPtr("outputSwizzle" + std::to_string(i), swizzleIndex, blockOffset);
     175            b->CreateBlockAlignedStore(pendingData[i], outputPtr);
     176        }
     177    } else {
     178        for (unsigned i = 0; i < mSwizzleSetCount; i++) {
     179            b->setScalarField("pendingSwizzleData" + std::to_string(i), pendingData[i]);
     180        }
     181        Value * const pendingOffset = b->CreateAnd(outputProduced, PEXT_WIDTH_MASK);
     182        b->setScalarField("pendingOffset", pendingOffset);
     183        // unless this is our final stride, don't report partially written fields.
     184        outputProduced = b->CreateAnd(outputProduced, b->CreateNot(PEXT_WIDTH_MASK));
     185    }
     186
     187    b->setProducedItemCount("outputSwizzle0", outputProduced);
    232188}
    233189
     
    265221Swizzle 4:  lmnop000 23456000 LMNOP000 23456000
    266222
    267 Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the 
     223Now we can compress each 32-bit segment of swizzle 1 by 2, each 32 bit segment of swizzle 2 by 4, etc. Once we've completed the
    268224compression, we unswizzle to restore the 4 streams. The streams are now fully compressed!
    269225
    270226Args:
    271227    strms: the vector of blocks to apply PEXT operations to. strms[i] is the block associated with the ith input stream.
    272     masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces 
     228    masks: the PEXT deletion masks to apply to each block in strms (input mask is broken into PEXT width pieces, apply pieces
    273229        sequentially to PEXT a full block.)
    274230
     
    276232    output (vector of Value*): Swizzled, PEXTed version of strms. See example above.
    277233*/
    278 std::vector<Value *> SwizzledDeleteByPEXTkernel::apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder,
    279                                                              const std::vector<Value *> & masks, std::vector<Value *> strms) {
    280     Value * PEXT_func = nullptr;
     234
     235std::vector<std::vector<llvm::Value *>> SwizzledDeleteByPEXTkernel::makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const selectors) {
     236
     237    Constant * pext = nullptr;
    281238    if (mPEXTWidth == 64) {
    282         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
     239        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
    283240    } else if (mPEXTWidth == 32) {
    284         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
    285     }
    286    
    287     std::vector<Value *> output;     
    288     for (unsigned i = 0; i < strms.size(); i++) {
    289         Value * v = iBuilder->fwCast(mPEXTWidth, strms[i]);
    290         output.push_back(Constant::getNullValue(v->getType()));
    291     }
    292 
    293     // For each of the input streams
    294     for (unsigned j = 0; j < strms.size(); j++) {
    295         Value * v = iBuilder->fwCast(mPEXTWidth, strms[j]); // load stream j
    296         // Process the stream's block mPEXTWidth bits at a time (a PEXT operation can't do more than 64 bits at a time)
    297         for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
    298             Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (load mPEXTWidth bits)
    299             Value * PEXTed_field = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the segment we just loaded
    300             /*
    301              We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks.
    302              E.g.:
    303 
    304                *i*
    305             *j* a b c d strms[0]
    306                 e f g h
    307                 i j k l
    308                 m n o p
    309 
    310              Apply pext deletion at each position, then swizzle results:
    311                *j*
    312             *i* a` e` i` m` output[0]
    313                 b` f` j` n`
    314                 c` g` k` o` 
    315                 d` i` l` p`         
    316             */   
    317             output[i] = iBuilder->CreateInsertElement(output[i], PEXTed_field, j);
    318             /*
    319             numCompressedBits = 0
    320 
    321             for (each swizzleField position j)
    322                 for (each input swizzle i)
    323                     get PEXTed_field
    324                     Shift PEXTed_field left by "numCompressedBits" (in output[i])
    325                     OR PEXTed_field into output[i] (output[i] is output swizzle buffer for input swizzle i)
    326                 numCompressedBits += popcount(mask[i])
    327             */
    328         }
    329     }
    330    
    331     return output;
    332 }
    333 
    334 Value * SwizzledDeleteByPEXTkernel::apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
    335     Value * PEXT_func = nullptr;
    336     if (mPEXTWidth == 64) {
    337         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
    338     } else if (mPEXTWidth == 32) {
    339         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
    340     }
    341        
    342     Value * v = iBuilder->fwCast(mPEXTWidth, strm);
    343     Value * output = Constant::getNullValue(v->getType());
    344     for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/mPEXTWidth; i++) {
    345         Value * field = iBuilder->CreateExtractElement(v, i);
    346         Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
    347         output = iBuilder->CreateInsertElement(output, compressed, i);
    348     }
    349     return output;
     241        pext = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
     242    }
     243
     244    Value * const m = b->fwCast(mPEXTWidth, selectors);
     245
     246    std::vector<Value *> masks(mSwizzleFactor);
     247    for (unsigned i = 0; i < mSwizzleFactor; i++) {
     248        masks[i] = b->CreateExtractElement(m, i);
     249
     250    }
     251
     252    std::vector<std::vector<Value *>> swizzleSets;
     253    swizzleSets.reserve(mSwizzleSetCount);
     254
     255    VectorType * const vecTy = b->fwVectorType(mPEXTWidth);
     256
     257    UndefValue * const outputInitializer = UndefValue::get(vecTy);
     258
     259    std::vector<Value *> input(mSwizzleFactor);
     260    // For each of the k swizzle sets required to apply PEXT to all input streams
     261    for (unsigned i = 0; i < mSwizzleSetCount; ++i) {
     262
     263        for (unsigned j = 0; j < mSwizzleFactor; ++j) {
     264            const unsigned k = (i * mSwizzleFactor) + j;
     265            if (k < mStreamCount) {
     266                input[j] = b->CreateBitCast(b->loadInputStreamBlock("inputStreamSet", b->getInt32(k)), vecTy);
     267            } else {
     268                input[j] = Constant::getNullValue(vecTy);
     269            }
     270        }
     271
     272        // TODO: if a SIMD pext instruction exists, we should first swizzle the lanes
     273        // then splat the pext mask and apply it to each output row
     274
     275        std::vector<Value *> output(mSwizzleFactor, outputInitializer);
     276        // For each of the input streams
     277        for (unsigned j = 0; j < mSwizzleFactor; j++) {
     278            for (unsigned k = 0; k < mSwizzleFactor; k++) {
     279                // Load block j,k
     280                Value * const field = b->CreateExtractElement(input[j], k);
     281                // Apply PEXT deletion
     282                Value * const selected = b->CreateCall(pext, {field, masks[k]});
     283                // Then store it as our k,j-th output
     284                output[k] = b->CreateInsertElement(output[k], selected, j);
     285            }
     286        }
     287
     288        swizzleSets.emplace_back(output);
     289    }
     290
     291    return swizzleSets;
    350292}
    351293
     
    392334}
    393335
    394 const unsigned PEXT_width = 64;
    395 
    396 inline std::vector<Value *> get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, Value * del_mask) {
    397     Value * m = iBuilder->fwCast(PEXT_width, iBuilder->simd_not(del_mask));
    398     std::vector<Value *> masks;
    399     for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
    400         masks.push_back(iBuilder->CreateExtractElement(m, i));
    401     }
    402     return masks;
    403 }
    404 
    405 // Apply PEXT deletion to a collection of blocks and swizzle the result.
    406 // strms contains the blocks to process
    407 inline std::vector<Value *> apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, std::vector<Value *> strms) {
    408     Value * PEXT_func = nullptr;
    409     if (PEXT_width == 64) {
    410         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
    411     } else if (PEXT_width == 32) {
    412         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
    413     }
    414    
    415     std::vector<Value *> output;     
    416     for (unsigned i = 0; i < strms.size(); i++) {
    417         Value * v = iBuilder->fwCast(PEXT_width, strms[i]);
    418         output.push_back(Constant::getNullValue(v->getType()));
    419     }
    420 
    421     // For each of the input streams
    422     for (unsigned j = 0; j < strms.size(); j++) {
    423         Value * v = iBuilder->fwCast(PEXT_width, strms[j]); // load stream j
    424         // Process the stream's block in PEXT_width chunks (PEXT operation can't do more than 64 bits at a time)
    425         for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
    426             Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (fw of j is PEXT_width)
    427             Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the block segment we just loaded
    428             /*
    429              We loaded from input at index i within stream j's block. We store result in ouput within stream i's block at position j. This swizzles the output blocks . E.g.:
    430 
    431              a b c d
    432              e f g h
    433              i j k l
    434              m n o p
    435 
    436              Apply pext deletion at each position, then swizzle results:
    437 
    438              a` e` i` m`
    439              b` f` j` n`
    440              c` g` k` o` 
    441              d` i` l` p`         
    442             */     
    443             output[i] = iBuilder->CreateInsertElement(output[i], compressed, j);
    444         }
    445     }
    446    
    447     return output;
    448 }
    449 
    450 inline Value * apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * strm) {
    451     Value * PEXT_func = nullptr;
    452     if (PEXT_width == 64) {
    453         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
    454     } else if (PEXT_width == 32) {
    455         PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
    456     }
    457        
    458     Value * v = iBuilder->fwCast(PEXT_width, strm);
    459     Value * output = Constant::getNullValue(v->getType());
    460     for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
    461         Value * field = iBuilder->CreateExtractElement(v, i);
    462         Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]});
    463         output = iBuilder->CreateInsertElement(output, compressed, i);
    464     }
    465     return output;
    466 }
    467 
    468336// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
    469337// Kernel inputs: stream_count data streams plus one del_mask stream
     
    472340void DeleteByPEXTkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) {
    473341    Value * delMask = iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
    474     const auto masks = get_PEXT_masks(iBuilder, delMask);
    475     generateProcessingLoop(iBuilder, masks, delMask);
     342    generateProcessingLoop(iBuilder, delMask);
    476343}
    477344
     
    481348    Value * EOF_del = iBuilder->bitCast(iBuilder->CreateShl(Constant::getAllOnesValue(vecTy), remaining));
    482349    Value * delMask = iBuilder->CreateOr(EOF_del, iBuilder->loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
    483     const auto masks = get_PEXT_masks(iBuilder, delMask);
    484     generateProcessingLoop(iBuilder, masks, delMask);
    485 }
    486 
    487 void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks, Value * delMask) {
    488     if (mShouldSwizzle) {
    489         generatePEXTAndSwizzleLoop(iBuilder, masks);
    490     } else {
    491         generatePEXTLoop(iBuilder, masks);
    492     }
    493     //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
     350    generateProcessingLoop(iBuilder, delMask);
     351}
     352
     353void DeleteByPEXTkernel::generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, Value * delMask) {
     354    Constant * PEXT_func = nullptr;
     355    if (mPEXTWidth == 64) {
     356        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
     357    } else if (mPEXTWidth == 32) {
     358        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
     359    }
     360    std::vector<Value *> masks(mSwizzleFactor);
     361    Value * const m = iBuilder->fwCast(mSwizzleFactor, iBuilder->simd_not(delMask));
     362    for (unsigned i = 0; i < mSwizzleFactor; i++) {
     363        masks.push_back(iBuilder->CreateExtractElement(m, i));
     364    }
     365
     366    for (unsigned i = 0; i < mStreamCount; ++i) {
     367        Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i));
     368        Value * value = iBuilder->fwCast(mPEXTWidth, input);
     369        Value * output = UndefValue::get(value->getType());
     370        for (unsigned j = 0; j < mSwizzleFactor; j++) {
     371            Value * field = iBuilder->CreateExtractElement(value, j);
     372            Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[j]});
     373            output = iBuilder->CreateInsertElement(output, compressed, j);
     374        }
     375        iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(i), output);
     376    }
    494377    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
    495378    iBuilder->storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
    496379}
    497380
    498 void DeleteByPEXTkernel::generatePEXTLoop(const std::unique_ptr<KernelBuilder> &iBuilder, const std::vector<Value *> & masks) {
    499     for (unsigned j = 0; j < mStreamCount; ++j) {
    500         Value * input = iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
    501         Value * output = apply_PEXT_deletion(iBuilder, masks, input);
    502         iBuilder->storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    503     }
    504 }
    505 
    506 void DeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Value *> & masks) {
    507     // Group blocks together into input vector. Input should contain mStreamCount/mSwizzleFactor blocks (e.g. for U8U16 16/4=4)
    508     // mStreamCount/mSwizzleFactor -> (mStreamCount + mSwizzleFactor - 1) / mSwizzleFactor
    509     for (unsigned j = 0; j < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; ++j) {
    510         std::vector<Value *> input;
    511         unsigned streamSelectionIndex = j * mSwizzleFactor;
    512         for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
    513                 // Check if i > mStreamCount. If it is, add null streams until we get mStreamCount/mSwizzleFactor streams in the input vector
    514             if ( i >= mStreamCount) {
    515                                 input.push_back(iBuilder->allZeroes());
    516             } else {
    517                 input.push_back(iBuilder->loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
    518             }
    519         }
    520         std::vector<Value *> output = apply_PEXT_deletion_with_swizzle(iBuilder, masks, input);
    521         for (unsigned i = 0; i < mSwizzleFactor; i++) {
    522              iBuilder->storeOutputStreamBlock(std::string(mOutputSwizzleNameBase) + std::to_string(j), iBuilder->getInt32(i), output[i]);
    523         }
    524     }
    525 }
    526 
    527 DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, bool shouldSwizzle)
    528 : BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + (shouldSwizzle ? "swiz" : "noswiz"),
    529                   {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
    530                       Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
    531                   {}, {}, {}, {})
     381DeleteByPEXTkernel::DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount, unsigned PEXT_width)
     382: BlockOrientedKernel("PEXTdel" + std::to_string(fw) + "_" + std::to_string(streamCount) + "_" + std::to_string(PEXT_width),
     383              {Binding{b->getStreamSetTy(streamCount), "inputStreamSet"},
     384                  Binding{b->getStreamSetTy(), "delMaskSet"}},
     385              {}, {}, {}, {})
    532386, mDelCountFieldWidth(fw)
    533387, mStreamCount(streamCount)
    534 , mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
    535 , mShouldSwizzle(shouldSwizzle)
    536 {
    537     if(mShouldSwizzle) {       
    538         for (unsigned i = 0; i < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; i++) {
    539             mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mSwizzleFactor), std::string(mOutputSwizzleNameBase) + std::to_string(i));
    540         }
    541     } else {
    542         // No swizzling. Output results as single stream set
    543         mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mStreamCount), "outputStreamSet");
    544     }
    545     mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(), "deletionCounts");
     388, mSwizzleFactor(b->getBitBlockWidth() / PEXT_width)
     389, mPEXTWidth(PEXT_width) {
     390    mStreamSetOutputs.emplace_back(b->getStreamSetTy(mStreamCount), "outputStreamSet", PopcountOfNot("delMaskSet"));
     391    mStreamSetOutputs.emplace_back(b->getStreamSetTy(), "deletionCounts");
    546392}
    547393
     
    611457    for (unsigned i = 0; i < mSwizzleFactor; i++) {
    612458        Value * newItemCount = iBuilder->CreateLoad(iBuilder->CreateGEP(countStreamPtr, iBuilder->getInt32(i)));
    613     //iBuilder->CallPrintInt("newItemCount", newItemCount);
    614459        Value * pendingSpace = iBuilder->CreateSub(iBuilder->getSize(mFieldWidth), pendingOffset);
    615460        Value * pendingSpaceFilled = iBuilder->CreateICmpUGE(newItemCount, pendingSpace);
     
    619464        for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    620465            Value * newItems = iBuilder->loadInputStreamBlock("inputSwizzle" + std::to_string(j), iBuilder->getInt32(i));
    621         //iBuilder->CallPrintRegister("newItems", newItems);
    622466            // Combine as many of the new items as possible into the pending group.
    623467            Value * combinedGroup = iBuilder->CreateOr(pendingData[j], iBuilder->CreateShl(newItems, iBuilder->simd_fill(mFieldWidth, pendingOffset)));
    624             //iBuilder->CallPrintRegister("combinedGroup", combinedGroup);
    625             // To avoid an unpredictable branch, always store the combined group, whether full or not.
    626                
     468            // To avoid an unpredictable branch, always store the combined group, whether full or not.               
    627469            iBuilder->CreateBlockAlignedStore(combinedGroup, iBuilder->CreateGEP(outputStreamPtr[j], outputIndex));
    628470            // Any items in excess of the space available in the current pending group overflow for the next group.
     
    639481    for (unsigned j = 0; j < mSwizzleSetCount; j++) {
    640482        iBuilder->setScalarField("pendingSwizzleData" + std::to_string(j), pendingData[j]);
    641         //iBuilder->CallPrintRegister("pendingData[j]", pendingData[j]);
    642 
    643483    }
    644484    iBuilder->setProducedItemCount("outputSwizzle0", produced);
  • icGREP/icgrep-devel/icgrep/kernels/deletion.h

    r5540 r5985  
    2929class SwizzledDeleteByPEXTkernel final : public BlockOrientedKernel {
    3030public:
    31     SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, unsigned PEXT_width = 64);
     31    SwizzledDeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned streamCount, unsigned PEXT_width = 64);
    3232    bool isCachable() const override { return true; }
    3333    bool hasSignature() const override { return false; }
    3434protected:
    35     void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) override;
    36     void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * remainingBytes) override;
    37     std::vector<llvm::Value *> get_PEXT_masks(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * del_mask);
    38     void generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<llvm::Value *> & masks,
    39                                 llvm::Value * delMask);
    40     void generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<llvm::Value *> & masks, std::vector<llvm::Value *> counts);
    41     std::vector<llvm::Value *> apply_PEXT_deletion_with_swizzle(const std::unique_ptr<KernelBuilder> & iBuilder,
    42                                                                 const std::vector<llvm::Value *> & masks, std::vector<llvm::Value *> strms);
    43     llvm::Value * apply_PEXT_deletion(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<llvm::Value *> & masks,
    44                                       llvm::Value * strm);
     35    void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) override;
     36    void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingBytes) override;
    4537private:
    46     const unsigned mDelCountFieldWidth;
     38    void generateProcessingLoop(const std::unique_ptr<KernelBuilder> & b, llvm::Value * delMask, const bool flush);
     39    std::vector<std::vector<llvm::Value *>> makeSwizzleSets(const std::unique_ptr<KernelBuilder> & b, llvm::Value * delMask);
     40private:
    4741    const unsigned mStreamCount;
    4842    const unsigned mSwizzleFactor;
    4943    const unsigned mSwizzleSetCount;
    5044    const unsigned mPEXTWidth;
    51     static constexpr const char* mOutputSwizzleNameBase = "outputStreamSet";
    5245};
    5346
     
    6760class DeleteByPEXTkernel final : public BlockOrientedKernel {
    6861public:
    69     DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, bool shouldSwizzle);
     62    DeleteByPEXTkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned fw, unsigned streamCount, unsigned PEXT_width = 64);
    7063    bool isCachable() const override { return true; }
    7164    bool hasSignature() const override { return false; }
     
    7366    void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder) override;
    7467    void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * remainingBytes) override;
    75     void generatePEXTAndSwizzleLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<llvm::Value *> & masks);
    76     void generatePEXTLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<llvm::Value *> & masks);
    77     void generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<llvm::Value *> & masks, llvm::Value * delMask);
     68    void generateProcessingLoop(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * delMask);
    7869private:
    7970    const unsigned mDelCountFieldWidth;
    8071    const unsigned mStreamCount;
    8172    const unsigned mSwizzleFactor;
    82     const bool mShouldSwizzle;
    83     static constexpr const char* mOutputSwizzleNameBase = "outputStreamSet";
     73    const unsigned mPEXTWidth;
    8474};
    8575   
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp

    r5933 r5985  
    181181: PabloKernel(kb, "RequiredStreams_UTF8",
    182182// input
    183 {Binding{kb->getStreamSetTy(8), "basis"}, Binding{kb->getStreamSetTy(1), "lf", FixedRate(), LookAhead(1)}},
     183{Binding{kb->getStreamSetTy(8), "basis"},
     184 Binding{kb->getStreamSetTy(1), "lf", FixedRate(), LookAhead(1)}},
    184185// output
    185186{Binding{kb->getStreamSetTy(1), "nonFinal", FixedRate()},
  • icGREP/icgrep-devel/icgrep/kernels/interface.cpp

    r5755 r5985  
    1212#include <kernels/kernel_builder.h>
    1313
    14 static const auto INIT_SUFFIX = "_Init";
    15 
    16 static const auto DO_SEGMENT_SUFFIX = "_DoSegment";
    17 
    18 static const auto TERMINATE_SUFFIX = "_Terminate";
    1914
    2015using namespace llvm;
    2116
    2217namespace kernel {
     18
     19const static auto INIT_SUFFIX = "_Init";
     20const static auto DO_SEGMENT_SUFFIX = "_DoSegment";
     21const static auto TERMINATE_SUFFIX = "_Terminate";
    2322
    2423void KernelInterface::addKernelDeclarations(const std::unique_ptr<kernel::KernelBuilder> & idb) {
     
    6665    args = doSegment->arg_begin();
    6766    args->setName("self");
    68     (++args)->setName("doFinal");
    69 //    if (mHasPrincipalItemCount) {
    70 //        (++args)->setName("principleAvailableItemCount");
    71 //    }
     67    (++args)->setName("isFinal");
    7268    for (const Binding & input : mStreamSetInputs) {
    73         //const ProcessingRate & r = input.getRate();
    74         //if (!r.isDerived()) {
    75             (++args)->setName(input.getName() + "AvailableItems");
    76         //}
     69        (++args)->setName(input.getName() + "AvailableItems");
    7770    }
    7871
  • icGREP/icgrep-devel/icgrep/kernels/interface.h

    r5967 r5985  
    2626namespace kernel {
    2727
     28const static std::string LOGICAL_SEGMENT_NO_SCALAR = "segmentNo";
     29const static std::string PROCESSED_ITEM_COUNT_SUFFIX = "_processedItemCount";
     30const static std::string PRODUCED_ITEM_COUNT_SUFFIX = "_producedItemCount";
     31const static std::string CONSUMED_ITEM_COUNT_SUFFIX = "_consumedItemCount";
     32const static std::string NON_DEFERRED_ITEM_COUNT_SUFFIX = "_nonDeferredItemCount";
     33const static std::string TERMINATION_SIGNAL = "terminationSignal";
     34const static std::string BUFFER_SUFFIX = "_buffer";
     35const static std::string CONSUMER_SUFFIX = "_consumerLocks";
     36const static std::string CYCLECOUNT_SCALAR = "CPUcycles";
     37
    2838struct Binding : public AttributeSet {
    2939
     
    6676    }
    6777
    68     bool isMisaligned() const {
    69         return hasAttribute(AttributeId::Misaligned);
    70     }
    71 
    72     bool isSwizzled() const {
    73         return hasAttribute(AttributeId::Swizzled);
    74     }
    75 
    76     bool isDisableSufficientChecking() const {
    77         return hasAttribute(AttributeId::DisableSufficientChecking);
    78     }
    79 
    8078    unsigned const getLookahead() const {
    8179        return findAttribute(AttributeId::LookAhead).amount();
    8280    }
    8381
    84     bool nonDeferred() const {
    85         return !hasAttribute(AttributeId::Deferred);
     82    bool isDeferred() const {
     83        return hasAttribute(AttributeId::Deferred);
    8684    }
    8785
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5967 r5985  
    2222#include <sstream>
    2323#include <kernels/kernel_builder.h>
    24 #include <boost/math/common_factor.hpp>
    2524#include <llvm/Support/Debug.h>
    2625
    2726using namespace llvm;
    2827using namespace parabix;
    29 using namespace boost::math;
    3028
    3129namespace kernel {
    32 
    33 const std::string Kernel::DO_BLOCK_SUFFIX = "_DoBlock";
    34 const std::string Kernel::FINAL_BLOCK_SUFFIX = "_FinalBlock";
    35 const std::string Kernel::MULTI_BLOCK_SUFFIX = "_MultiBlock";
    36 const std::string Kernel::LOGICAL_SEGMENT_NO_SCALAR = "logicalSegNo";
    37 const std::string Kernel::PROCESSED_ITEM_COUNT_SUFFIX = "_processedItemCount";
    38 const std::string Kernel::CONSUMED_ITEM_COUNT_SUFFIX = "_consumedItemCount";
    39 const std::string Kernel::PRODUCED_ITEM_COUNT_SUFFIX = "_producedItemCount";
    40 const std::string Kernel::TERMINATION_SIGNAL = "terminationSignal";
    41 const std::string Kernel::BUFFER_PTR_SUFFIX = "_bufferPtr";
    42 const std::string Kernel::CONSUMER_SUFFIX = "_consumerLocks";
    43 const std::string Kernel::CYCLECOUNT_SCALAR = "CPUcycles";
    4430
    4531/** ------------------------------------------------------------------------------------------------------------- *
     
    5945}
    6046
    61 
    6247/** ------------------------------------------------------------------------------------------------------------- *
    6348 * @brief addUnnamedScalar
     
    7257}
    7358
    74 
    75 /** ------------------------------------------------------------------------------------------------------------- *
    76  * @brief prepareStreamSetNameMap
    77  ** ------------------------------------------------------------------------------------------------------------- */
    78 void Kernel::prepareStreamSetNameMap() {
    79     for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
    80         mStreamMap.emplace(mStreamSetInputs[i].getName(), std::make_pair(Port::Input, i));
    81     }
    82     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    83         mStreamMap.emplace(mStreamSetOutputs[i].getName(), std::make_pair(Port::Output, i));
    84     }
    85 }
    86 
    87 
    8859/** ------------------------------------------------------------------------------------------------------------- *
    8960 * @brief bindPorts
    9061 ** ------------------------------------------------------------------------------------------------------------- */
    9162void Kernel::bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs) {
    92     assert (mModule == nullptr);
    93     assert (mStreamSetInputBuffers.empty());
    94     assert (mStreamSetOutputBuffers.empty());
    9563
    9664    if (LLVM_UNLIKELY(mStreamSetInputs.size() != inputs.size())) {
     
    10068    }
    10169
     70    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
     71        mStreamMap.emplace(mStreamSetInputs[i].getName(), std::make_pair(Port::Input, i));
     72    }
     73
    10274    for (unsigned i = 0; i < inputs.size(); ++i) {
    10375        StreamSetBuffer * const buf = inputs[i];
    10476        if (LLVM_UNLIKELY(buf == nullptr)) {
    105             report_fatal_error(getName() + ": input stream set " + std::to_string(i)
    106                                + " cannot be null");
    107         }
     77            report_fatal_error(getName() + ": input stream " + std::to_string(i) + " cannot be null");
     78        }
     79       // const Binding & input = mStreamSetInputs[i];
     80       // verifyBufferSize(input, buf);
    10881        buf->addConsumer(this);
    10982    }
     
    11588    }
    11689
     90    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
     91        mStreamMap.emplace(mStreamSetOutputs[i].getName(), std::make_pair(Port::Output, i));
     92    }
     93
    11794    for (unsigned i = 0; i < outputs.size(); ++i) {
    11895        StreamSetBuffer * const buf = outputs[i];
     
    12097            report_fatal_error(getName() + ": output stream set " + std::to_string(i) + " cannot be null");
    12198        }
     99        const Binding & output = mStreamSetOutputs[i];
     100       // verifyBufferSize(output, buf);
    122101        if (LLVM_LIKELY(buf->getProducer() == nullptr)) {
    123102            buf->setProducer(this);
    124103        } else {
    125             report_fatal_error(getName() + ": output stream set " + std::to_string(i)
     104            report_fatal_error(getName() + ": output stream set " + output.getName()
    126105                               + " is already produced by kernel " + buf->getProducer()->getName());
    127106        }
     
    132111}
    133112
    134 
    135113/** ------------------------------------------------------------------------------------------------------------- *
    136114 * @brief getCacheName
    137115 ** ------------------------------------------------------------------------------------------------------------- */
    138 std::string Kernel::getCacheName(const std::unique_ptr<KernelBuilder> & idb) const {
     116std::string Kernel::getCacheName(const std::unique_ptr<KernelBuilder> & b) const {
    139117    std::stringstream cacheName;
    140     cacheName << getName() << '_' << idb->getBuilderUniqueName();
     118    cacheName << getName() << '_' << b->getBuilderUniqueName();
    141119    for (const StreamSetBuffer * b: mStreamSetInputBuffers) {
    142120        cacheName <<  ':' <<  b->getUniqueID();
     
    159137}
    160138
    161 
    162139/** ------------------------------------------------------------------------------------------------------------- *
    163140 * @brief makeModule
     
    174151 * @brief prepareKernel
    175152 ** ------------------------------------------------------------------------------------------------------------- */
    176 void Kernel::prepareKernel(const std::unique_ptr<KernelBuilder> & idb) {
    177     assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     153void Kernel::prepareKernel(const std::unique_ptr<KernelBuilder> & b) {
     154    assert ("KernelBuilder does not have a valid IDISA Builder" && b);
    178155    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
    179156        report_fatal_error(getName() + ": cannot prepare kernel after kernel state finalized");
    180157    }
    181     addBaseKernelProperties(idb);
    182     addInternalKernelProperties(idb);
     158    // verifyStreamSetDefinitions();
     159    addBaseKernelProperties(b);
     160    addInternalKernelProperties(b);
    183161    // NOTE: StructType::create always creates a new type even if an identical one exists.
    184162    if (LLVM_UNLIKELY(mModule == nullptr)) {
    185         makeModule(idb);
     163        makeModule(b);
    186164    }
    187165    mKernelStateType = mModule->getTypeByName(getName());
    188166    if (LLVM_LIKELY(mKernelStateType == nullptr)) {
    189         mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
     167        mKernelStateType = StructType::create(b->getContext(), mKernelFields, getName());
    190168        assert (mKernelStateType);
    191169    }
     
    196174 * @brief prepareCachedKernel
    197175 ** ------------------------------------------------------------------------------------------------------------- */
    198 void Kernel::prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb) {
    199     assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     176void Kernel::prepareCachedKernel(const std::unique_ptr<KernelBuilder> & b) {
     177    assert ("KernelBuilder does not have a valid IDISA Builder" && b);
    200178    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
    201179        report_fatal_error(getName() + ": cannot prepare kernel after kernel state finalized");
    202180    }
    203     assert (getModule());
    204     addBaseKernelProperties(idb);
     181    assert (getModule());   
     182    addBaseKernelProperties(b);
    205183    mKernelStateType = getModule()->getTypeByName(getName());
    206184    if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
     
    210188
    211189/** ------------------------------------------------------------------------------------------------------------- *
     190 * @brief containsFixedRate
     191 ** ------------------------------------------------------------------------------------------------------------- */
     192bool containsFixedRate(const Bindings & bindings) {
     193    for (const Binding & binding : bindings) {
     194        const ProcessingRate & rate = binding.getRate();
     195        if (rate.isFixed()) {
     196            return true;
     197        }
     198    }
     199    return false;
     200}
     201
     202/** ------------------------------------------------------------------------------------------------------------- *
    212203 * @brief addBaseKernelProperties
    213204 ** ------------------------------------------------------------------------------------------------------------- */
    214 void Kernel::addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb) {
    215 
    216     assert (mStreamMap.empty());
    217 
    218     prepareStreamSetNameMap();
    219 
    220     normalizeStreamProcessingRates();
     205void Kernel::addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & b) {
    221206
    222207    const unsigned inputSetCount = mStreamSetInputs.size();
     
    228213    if (mStride == 0) {
    229214        // Set the default kernel stride.
    230         mStride = idb->getBitBlockWidth();
    231     }
    232 
    233     IntegerType * const sizeTy = idb->getSizeTy();
    234 
     215        mStride = b->getBitBlockWidth();
     216    }
     217
     218    IntegerType * const sizeTy = b->getSizeTy();
     219
     220    addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
     221    addScalar(sizeTy, TERMINATION_SIGNAL);
     222    // TODO: if we had a way of easily calculating the number of processed/produced items of the
     223    // final stride of a non-deferred fixed rate stream, we could avoid storing the item counts.
    235224    for (unsigned i = 0; i < inputSetCount; i++) {
    236         const Binding & b = mStreamSetInputs[i];
    237         //const ProcessingRate & rate = b.getRate();
    238         //if (rate.isBounded() || rate.isUnknown()) {
    239             addScalar(sizeTy, b.getName() + PROCESSED_ITEM_COUNT_SUFFIX);
    240         //}
    241     }
    242 
     225        const Binding & input = mStreamSetInputs[i];
     226        addScalar(sizeTy, input.getName() + PROCESSED_ITEM_COUNT_SUFFIX);
     227        if (LLVM_UNLIKELY(input.isDeferred())) {
     228            addScalar(sizeTy, input.getName() + NON_DEFERRED_ITEM_COUNT_SUFFIX);
     229        }
     230    }
    243231    for (unsigned i = 0; i < outputSetCount; i++) {
    244         const Binding & b = mStreamSetOutputs[i];
    245         //const ProcessingRate & rate = b.getRate();
    246         //if (rate.isBounded() || rate.isUnknown()) {
    247             addScalar(sizeTy, b.getName() + PRODUCED_ITEM_COUNT_SUFFIX);
    248         //}
    249     }
    250 
     232        const Binding & output = mStreamSetOutputs[i];
     233        addScalar(sizeTy, output.getName() + PRODUCED_ITEM_COUNT_SUFFIX);
     234        if (LLVM_UNLIKELY(output.isDeferred())) {
     235            addScalar(sizeTy, output.getName() + NON_DEFERRED_ITEM_COUNT_SUFFIX);
     236        }
     237    }
    251238    for (unsigned i = 0; i < inputSetCount; i++) {
    252         mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetInputs[i].getName() + BUFFER_PTR_SUFFIX);
     239        mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetInputs[i].getName() + BUFFER_SUFFIX);
    253240    }
    254241    for (unsigned i = 0; i < outputSetCount; i++) {
    255         mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetOutputs[i].getName() + BUFFER_PTR_SUFFIX);
     242        mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetOutputs[i].getName() + BUFFER_SUFFIX);
    256243    }
    257244    for (const auto & binding : mScalarInputs) {
     
    264251        addScalar(binding.getType(), binding.getName());
    265252    }
    266     Type * const consumerSetTy = StructType::get(idb->getContext(), {sizeTy, sizeTy->getPointerTo()->getPointerTo()})->getPointerTo();
     253    Type * const consumerSetTy = StructType::get(b->getContext(), {sizeTy, sizeTy->getPointerTo()->getPointerTo()})->getPointerTo();
    267254    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    268255        addScalar(consumerSetTy, mStreamSetOutputs[i].getName() + CONSUMER_SUFFIX);
    269256    }
    270     addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
    271     addScalar(sizeTy, TERMINATION_SIGNAL);
    272257    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    273258        addScalar(sizeTy, mStreamSetOutputs[i].getName() + CONSUMED_ITEM_COUNT_SUFFIX);
     
    276261    // in normal execution, but when codegen::EnableCycleCounter is specified, pipelines
    277262    // will be able to add instrumentation to cached modules without recompilation.
    278     addScalar(idb->getInt64Ty(), CYCLECOUNT_SCALAR);
    279 
     263    addScalar(b->getInt64Ty(), CYCLECOUNT_SCALAR);
    280264}
    281265
     
    318302 * @brief callGenerateInitializeMethod
    319303 ** ------------------------------------------------------------------------------------------------------------- */
    320 inline void Kernel::callGenerateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    321     mCurrentMethod = getInitFunction(idb->getModule());
    322     idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
     304inline void Kernel::callGenerateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & b) {
     305    mCurrentMethod = getInitFunction(b->getModule());
     306    b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
    323307    Function::arg_iterator args = mCurrentMethod->arg_begin();
    324308    setInstance(&*(args++));
    325     idb->CreateStore(ConstantAggregateZero::get(mKernelStateType), getInstance());
     309    b->CreateStore(ConstantAggregateZero::get(mKernelStateType), getInstance());
    326310    for (const auto & binding : mScalarInputs) {
    327         idb->setScalarField(binding.getName(), &*(args++));
     311        b->setScalarField(binding.getName(), &*(args++));
    328312    }
    329313    for (const auto & binding : mStreamSetOutputs) {
    330         idb->setConsumerLock(binding.getName(), &*(args++));
    331     }
    332     generateInitializeMethod(idb);
    333     idb->CreateRetVoid();
     314        b->setConsumerLock(binding.getName(), &*(args++));
     315    }
     316    generateInitializeMethod(b);
     317    b->CreateRetVoid();
    334318}
    335319
     
    337321 * @brief callGenerateDoSegmentMethod
    338322 ** ------------------------------------------------------------------------------------------------------------- */
    339 inline void Kernel::callGenerateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    340     mCurrentMethod = getDoSegmentFunction(idb->getModule());
    341     idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
     323inline void Kernel::callGenerateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & b) {
     324    mCurrentMethod = getDoSegmentFunction(b->getModule());
     325    b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
    342326    auto args = mCurrentMethod->arg_begin();
    343327    setInstance(&*(args++));
    344328    mIsFinal = &*(args++);
    345     mAvailablePrincipalItemCount = nullptr;
    346329    const auto n = mStreamSetInputs.size();
    347330    mAvailableItemCount.resize(n, nullptr);
     
    351334    }
    352335    assert (args == mCurrentMethod->arg_end());
    353     generateKernelMethod(idb); // must be overridden by the Kernel subtype
     336    generateKernelMethod(b); // must be overridden by the Kernel subtype
    354337    mIsFinal = nullptr;
    355338    mAvailableItemCount.clear();
    356     idb->CreateRetVoid();
     339    b->CreateRetVoid();
    357340}
    358341
     
    361344 * @brief callGenerateFinalizeMethod
    362345 ** ------------------------------------------------------------------------------------------------------------- */
    363 inline void Kernel::callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb) {
    364     mCurrentMethod = getTerminateFunction(idb->getModule());
    365     idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
     346inline void Kernel::callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & b) {
     347    mCurrentMethod = getTerminateFunction(b->getModule());
     348    b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
    366349    auto args = mCurrentMethod->arg_begin();
    367350    setInstance(&*(args++));
    368     generateFinalizeMethod(idb); // may be overridden by the Kernel subtype
     351    generateFinalizeMethod(b); // may be overridden by the Kernel subtype
    369352    const auto n = mScalarOutputs.size();
    370353    if (n == 0) {
    371         idb->CreateRetVoid();
     354        b->CreateRetVoid();
    372355    } else {
    373356        Value * outputs[n];
    374357        for (unsigned i = 0; i < n; ++i) {
    375             outputs[i] = idb->getScalarField(mScalarOutputs[i].getName());
     358            outputs[i] = b->getScalarField(mScalarOutputs[i].getName());
    376359        }
    377360        if (n == 1) {
    378             idb->CreateRet(outputs[0]);
     361            b->CreateRet(outputs[0]);
    379362        } else {
    380             idb->CreateAggregateRet(outputs, n);
     363            b->CreateAggregateRet(outputs, n);
    381364        }
    382365    }
     
    390373    const auto f = mKernelFieldMap.find(name);
    391374    if (LLVM_UNLIKELY(f == mKernelFieldMap.end())) {
    392         assert (false);
     375        assert ("kernel does not contain the requested scalar" && false);
    393376        report_fatal_error(getName() + " does not contain scalar: " + name);
    394377    }
     
    413396 * @brief initializeInstance
    414397 ** ------------------------------------------------------------------------------------------------------------- */
    415 void Kernel::initializeInstance(const std::unique_ptr<KernelBuilder> & idb) {
    416     assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
     398void Kernel::initializeInstance(const std::unique_ptr<KernelBuilder> & b) {
     399    assert ("KernelBuilder does not have a valid IDISA Builder" && b);
    417400    if (LLVM_UNLIKELY(getInstance() == nullptr)) {
    418401        report_fatal_error("Cannot initialize " + getName() + " before calling createInstance()");
     
    449432    }
    450433    assert (mStreamSetOutputs.size() == mStreamSetOutputBuffers.size());
    451     IntegerType * const sizeTy = idb->getSizeTy();
     434    IntegerType * const sizeTy = b->getSizeTy();
    452435    PointerType * const sizePtrTy = sizeTy->getPointerTo();
    453436    PointerType * const sizePtrPtrTy = sizePtrTy->getPointerTo();
    454     StructType * const consumerTy = StructType::get(idb->getContext(), {sizeTy, sizePtrPtrTy});
     437    StructType * const consumerTy = StructType::get(b->getContext(), {sizeTy, sizePtrPtrTy});
    455438    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
    456439        const auto output = mStreamSetOutputBuffers[i];
    457440        const auto & consumers = output->getConsumers();
    458441        const auto n = consumers.size();
    459         AllocaInst * const outputConsumers = idb->CreateAlloca(consumerTy);
    460         Value * const consumerSegNoArray = idb->CreateAlloca(ArrayType::get(sizePtrTy, n));
     442        AllocaInst * const outputConsumers = b->CreateAlloca(consumerTy);
     443        Value * const consumerSegNoArray = b->CreateAlloca(ArrayType::get(sizePtrTy, n));
    461444        for (unsigned i = 0; i < n; ++i) {
    462445            Kernel * const consumer = consumers[i];
    463446            assert ("all instances must be created prior to initialization of any instance" && consumer->getInstance());
    464             idb->setKernel(consumer);
    465             Value * const segmentNoPtr = idb->getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR);
    466             idb->CreateStore(segmentNoPtr, idb->CreateGEP(consumerSegNoArray, { idb->getInt32(0), idb->getInt32(i) }));
    467         }
    468         idb->setKernel(this);
    469         Value * const consumerCountPtr = idb->CreateGEP(outputConsumers, {idb->getInt32(0), idb->getInt32(0)});
    470         idb->CreateStore(idb->getSize(n), consumerCountPtr);
    471         Value * const consumerSegNoArrayPtr = idb->CreateGEP(outputConsumers, {idb->getInt32(0), idb->getInt32(1)});
    472         idb->CreateStore(idb->CreatePointerCast(consumerSegNoArray, sizePtrPtrTy), consumerSegNoArrayPtr);
     447            b->setKernel(consumer);
     448            Value * const segmentNoPtr = b->getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR);
     449            b->CreateStore(segmentNoPtr, b->CreateGEP(consumerSegNoArray, { b->getInt32(0), b->getInt32(i) }));
     450        }
     451        b->setKernel(this);
     452        Value * const consumerCountPtr = b->CreateGEP(outputConsumers, {b->getInt32(0), b->getInt32(0)});
     453        b->CreateStore(b->getSize(n), consumerCountPtr);
     454        Value * const consumerSegNoArrayPtr = b->CreateGEP(outputConsumers, {b->getInt32(0), b->getInt32(1)});
     455        b->CreateStore(b->CreatePointerCast(consumerSegNoArray, sizePtrPtrTy), consumerSegNoArrayPtr);
    473456        args.push_back(outputConsumers);
    474457    }
    475     idb->CreateCall(getInitFunction(idb->getModule()), args);
     458    b->CreateCall(getInitFunction(b->getModule()), args);
    476459}
    477460
     
    490473    const auto f = mStreamMap.find(name);
    491474    if (LLVM_UNLIKELY(f == mStreamMap.end())) {
     475        assert (!mStreamMap.empty());
    492476        report_fatal_error(getName() + " does not contain stream set " + name);
    493477    }
     
    510494    if (rate.isFixed() || rate.isBounded()) {
    511495        return rate.getLowerBound();
    512     } else if (rate.isRelative()) {
    513         return rate.getRate() * getLowerBound(getBinding(rate.getReference()).getRate());
     496    } else if (rate.hasReference()) {
     497        return rate.getLowerBound() * getLowerBound(getBinding(rate.getReference()).getRate());
    514498    } else { // if (rate.isUnknown())
    515499        return 0;
     
    521505 ** ------------------------------------------------------------------------------------------------------------- */
    522506ProcessingRate::RateValue Kernel::getUpperBound(const ProcessingRate &rate) const {
    523     if (rate.isFixed() || rate.isBounded()) {
     507    if (rate.isFixed() || rate.isBounded() || rate.isPopCount()) {
    524508        return rate.getUpperBound();
    525     } else if (rate.isRelative()) {
    526         return rate.getRate() * getUpperBound(getBinding(rate.getReference()).getRate());
     509    } else if (rate.hasReference()) {
     510        return rate.getUpperBound() * getUpperBound(getBinding(rate.getReference()).getRate());
    527511    } else { // if (rate.isUnknown())
    528512        return 0;
     
    531515
    532516/** ------------------------------------------------------------------------------------------------------------- *
    533  * @brief normalizeRelativeToFixedProcessingRate
    534  ** ------------------------------------------------------------------------------------------------------------- */
    535 bool Kernel::normalizeRelativeToFixedProcessingRate(const ProcessingRate & base, ProcessingRate & toUpdate) {
    536     if (base.isFixed()) {
     517 * @brief verifyStreamSetDefinitions
     518 ** ------------------------------------------------------------------------------------------------------------- */
     519void Kernel::verifyStreamSetDefinitions() const {
     520    unsigned numOfPrincipalStreams = 0;
     521    for (unsigned i = 0; i < mStreamSetInputs.size(); ++i) {
     522        const Binding & input = mStreamSetInputs[i];
     523        const ProcessingRate & rate = input.getRate();
     524        // If a stream can be relative to a relative or fixed rate stream, it complicates the pipeline and
     525        // multiblock kernel. For now, report an error.
     526        if (LLVM_UNLIKELY(rate.hasReference())) {
     527            Port port; unsigned index;
     528            std::tie(port, index) = getStreamPort(rate.getReference());
     529            if (LLVM_UNLIKELY(port == Port::Output)) {
     530                report_fatal_error(getName() + ": input stream \"" + input.getName() + "\" cannot refer to an output stream");
     531            }
     532            if (LLVM_UNLIKELY(index >= i)) {
     533                report_fatal_error(getName() + ": \"" + input.getName() + "\" must be ordered after its reference stream");
     534            }
     535            if (rate.isRelative()) {
     536                const ProcessingRate & refRate = getStreamInput(index).getRate();
     537                if (LLVM_UNLIKELY(refRate.isRelative() || refRate.isFixed())) {
     538                    report_fatal_error(getName() + ": \"" + input.getName() + "\" cannot be relative to a fixed or relative rate stream");
     539                }
     540            }
     541        } else if (LLVM_UNLIKELY(rate.isUnknown())) {
     542            report_fatal_error(getName() + ": \"" + input.getName() + "\" cannot be an unknown rate");
     543        }
     544        if (LLVM_UNLIKELY(input.isPrincipal())) {
     545            ++numOfPrincipalStreams;
     546        }
     547        bool hasFixedOnlyAttribute = false;
     548        for (const Attribute & attr : input.getAttributes()) {
     549            switch (attr.getKind()) {
     550                case Attribute::KindId::Add:
     551                case Attribute::KindId::RoundUpTo:
     552                case Attribute::KindId::Deferred:
     553                    hasFixedOnlyAttribute = false;
     554                    break;
     555                default: break;
     556            }
     557        }
     558        if (rate.isFixed()) {
     559
     560
     561
     562        } else if (LLVM_UNLIKELY(hasFixedOnlyAttribute)) {
     563            report_fatal_error(getName() + ": Add, RoundUpTo and Deferred cannot be applied to non-Fixed rate input stream \"" + input.getName() + "\"");
     564        }
     565    }
     566    if (LLVM_UNLIKELY(numOfPrincipalStreams > 1)) {
     567        report_fatal_error(getName() + ": may only have one principal stream set");
     568    }
     569    for (unsigned i = 0; i < mStreamSetOutputs.size(); ++i) {
     570        const Binding & output = mStreamSetOutputs[i];
     571        const ProcessingRate & rate = output.getRate();
     572        if (LLVM_UNLIKELY(rate.hasReference())) {
     573            Port port; unsigned index;
     574            std::tie(port, index) = getStreamPort(rate.getReference());
     575            if (LLVM_UNLIKELY(rate.isPopCount() && port == Port::Output)) {
     576                report_fatal_error(getName() + ": the popcount rate of \"" + output.getName() + "\" cannot refer to another output stream");
     577            }
     578            if (LLVM_UNLIKELY(port == Port::Output && index >= i)) {
     579                report_fatal_error(getName() + ": \"" + output.getName() + "\" must be ordered after its reference stream");
     580            }
     581            if (rate.isRelative()) {
     582                const Binding & ref = (port == Port::Input) ? getStreamInput(index) : getStreamOutput(index);
     583                const ProcessingRate & refRate = ref.getRate();
     584                if (LLVM_UNLIKELY(refRate.isRelative() || refRate.isFixed())) {
     585                    report_fatal_error(getName() + ": \"" + output.getName() + "\" cannot be relative to a fixed or relative rate stream");
     586                }
     587            }
     588        }
     589        if (LLVM_UNLIKELY(output.isPrincipal())) {
     590            report_fatal_error(getName() + ": output stream \"" + output.getName() + "\" cannot be a principal stream");
     591        }
     592
     593        bool hasAddOrRoundUpTo = false;
     594        bool hasDeferred = false;
     595        for (const Attribute & attr : output.getAttributes()) {
     596            switch (attr.getKind()) {
     597                case Attribute::KindId::Add:
     598                case Attribute::KindId::RoundUpTo:
     599                    hasAddOrRoundUpTo = true;
     600                    break;
     601                case Attribute::KindId::Deferred:
     602                    hasDeferred = false;
     603                    break;
     604                default: break;
     605            }
     606        }
     607
     608        if (LLVM_UNLIKELY((hasAddOrRoundUpTo || hasDeferred) && !(rate.isFixed() || rate.isPopCount()))) {
     609            report_fatal_error(getName() + ": " + output.getName() + " cannot have an Add, RoundUpTo or Deferred attribute");
     610        }       
     611        if (LLVM_UNLIKELY(hasDeferred && hasAddOrRoundUpTo)) {
     612            report_fatal_error(getName() + ": cannot apply Add or RoundUpTo attributes to the Deferred output stream " + output.getName());
     613        }
     614    }
     615}
     616
     617/** ------------------------------------------------------------------------------------------------------------- *
     618 * @brief verifyBufferSize
     619 ** ------------------------------------------------------------------------------------------------------------- */
     620bool Kernel::verifyBufferSize(const Binding & binding, const StreamSetBuffer * const buffer) const {
     621    if (LLVM_UNLIKELY(isa<SourceBuffer>(buffer) || isa<ExternalBuffer>(buffer))) {
    537622        return true;
    538     } else if (LLVM_UNLIKELY(base.isRelative())) {
    539         const auto & ref = getBinding(base.getReference()).getRate();
    540         if (normalizeRelativeToFixedProcessingRate(ref, toUpdate)) {
    541             toUpdate.getRate() *= ref.getRate();
    542             return true;
    543         }
    544     }
    545     return false;
    546 }
    547 
    548 /** ------------------------------------------------------------------------------------------------------------- *
    549  * @brief normalizeStreamProcessingRates
    550  *
    551  * If we allow a stream to be transitively relative to a fixed rate stream, it complicates detection of fixed
    552  * rate streams later. Find any such occurance and transform them. This implies, however, that a fixed rate
    553  * stream could have a rational processing rate (which should not occur normally.)
    554  ** ------------------------------------------------------------------------------------------------------------- */
    555 inline void Kernel::normalizeStreamProcessingRates() {
    556     for (Binding & input : mStreamSetInputs) {
    557         normalizeRelativeToFixedProcessingRate(input.getRate(), input.getRate());
    558     }
    559     for (Binding & output : mStreamSetOutputs) {
    560         normalizeRelativeToFixedProcessingRate(output.getRate(), output.getRate());
    561     }
    562     // TODO: we want to consume whole units. Once the pipeline is able to schedule kernels based on their stride
    563     // and input/output rates, modify them here.
    564 }
    565 
    566 /** ------------------------------------------------------------------------------------------------------------- *
    567  * @brief generateKernelMethod
    568  ** ------------------------------------------------------------------------------------------------------------- */
    569 void SegmentOrientedKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
    570     const auto inputSetCount = mStreamSetInputs.size();
    571     mStreamSetInputBaseAddress.resize(inputSetCount);
    572     for (unsigned i = 0; i < inputSetCount; ++i) {
    573         mStreamSetInputBaseAddress[i] = nullptr;
    574     }
    575     const auto outputSetCount = mStreamSetOutputs.size();
    576     mStreamSetOutputBaseAddress.resize(outputSetCount);
    577     for (unsigned i = 0; i < outputSetCount; ++i) {
    578         mStreamSetOutputBaseAddress[i] = nullptr;
    579     }
    580     generateDoSegmentMethod(b);
    581 }
    582 
    583 /** ------------------------------------------------------------------------------------------------------------- *
    584  * @brief requiresBufferedFinalStride
    585  ** ------------------------------------------------------------------------------------------------------------- */
    586 inline bool LLVM_READNONE requiresBufferedFinalStride(const Binding & binding) {
    587     if (LLVM_LIKELY(isa<ArrayType>(binding.getType()))) {
    588         return binding.getType()->getArrayNumElements() == 1;
     623    }
     624    const ProcessingRate & rate = binding.getRate();
     625    if (requiresCopyBack(binding)) {
     626        const auto minOverflow = ceiling(rate.getUpperBound());
     627        if (LLVM_UNLIKELY(buffer->overflowSize() < minOverflow)) {
     628            report_fatal_error(getName() + ": " + binding.getName() + " requires " +
     629                               std::to_string(minOverflow) + " overflow blocks");
     630        }
     631    } else if (rate.isFixed() || binding.hasAttribute(Attribute::KindId::BlockSize)) {
     632        const auto r = rate.getUpperBound();
     633        if (LLVM_LIKELY(r.denominator() == 1)) {
     634            if (LLVM_UNLIKELY((buffer->getBufferBlocks() % r.numerator())) != 0) {
     635                report_fatal_error(getName() + ": " + binding.getName() + " requires a multiple of " +
     636                                   std::to_string(r.numerator()) + " buffer blocks");
     637                return false;
     638            }
     639        } else { // if (b % (n/d) != 0)
     640            const auto b = buffer->getBufferBlocks();
     641            const auto x = (b * r.denominator()) / r.numerator();
     642            if (LLVM_UNLIKELY((b * r.denominator()) != (r.numerator() * x))) {
     643                report_fatal_error(getName() + ": " + binding.getName() + " requires a multiple of " +
     644                                   std::to_string(r.numerator()) + "/" + std::to_string(r.denominator()) + " buffer blocks");
     645                return false;
     646            }
     647        }
    589648    }
    590649    return true;
    591650}
    592651
    593 /** ------------------------------------------------------------------------------------------------------------- *
    594  * @brief getItemWidth
    595  ** ------------------------------------------------------------------------------------------------------------- */
    596 inline unsigned LLVM_READNONE getItemWidth(const Binding & b) {
    597     Type * ty = b.getType();
    598     if (LLVM_LIKELY(isa<ArrayType>(ty))) {
    599         ty = ty->getArrayElementType();
    600     }
    601     return cast<IntegerType>(ty->getVectorElementType())->getBitWidth();
    602 }
    603 
    604 /** ------------------------------------------------------------------------------------------------------------- *
    605  * @brief isTransitivelyUnknownRate
    606  ** ------------------------------------------------------------------------------------------------------------- */
    607 bool LLVM_READNONE MultiBlockKernel::isTransitivelyUnknownRate(const ProcessingRate & rate) const {
    608     if (rate.isUnknown()) {
    609         return true;
    610     } else if (rate.isDerived()) {
    611         return isTransitivelyUnknownRate(getBinding(rate.getReference()).getRate());
    612     }
    613     return false;
    614 }
    615 
    616 /** ------------------------------------------------------------------------------------------------------------- *
    617  * @brief requiresTemporaryInputBuffer
    618  ** ------------------------------------------------------------------------------------------------------------- */
    619 inline bool LLVM_READNONE MultiBlockKernel::requiresTemporaryInputBuffer(const Binding & binding, const ProcessingRate & rate) const {
    620     if (requiresBufferedFinalStride(binding)) {
    621         return true;
    622     } else if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate))) {
    623         report_fatal_error("MultiBlock kernels do not support unknown rate input streams or streams relative to an unknown rate input.");
    624     } else {
    625         return !rate.isFixed();
    626     }
    627 }
    628 
    629 /** ------------------------------------------------------------------------------------------------------------- *
    630  * @brief requiresTemporaryOutputBuffer
    631  ** ------------------------------------------------------------------------------------------------------------- */
    632 inline bool LLVM_READNONE MultiBlockKernel::requiresTemporaryOutputBuffer(const Binding & binding, const ProcessingRate & rate) const {
    633     if (requiresBufferedFinalStride(binding)) {
     652
     653/** ------------------------------------------------------------------------------------------------------------- *
     654 * @brief requiresCopyBack
     655 ** ------------------------------------------------------------------------------------------------------------- */
     656bool Kernel::requiresCopyBack(const Binding & binding) const {
     657    const ProcessingRate & rate = binding.getRate();
     658    if (rate.isFixed() || binding.hasAttribute(Attribute::KindId::BlockSize)) {
     659        return false;
     660    } else if (rate.isRelative()) {
     661        return requiresCopyBack(getBinding(rate.getReference()));
     662    }
     663    return true;
     664}
     665
     666/** ------------------------------------------------------------------------------------------------------------- *
     667 * @brief requiresLinearAccess
     668 ** ------------------------------------------------------------------------------------------------------------- */
     669bool Kernel::requiresLinearAccess(const Binding & binding) const {
     670    return binding.hasAttribute(Attribute::KindId::RequiresLinearAccess);
     671}
     672
     673/** ------------------------------------------------------------------------------------------------------------- *
     674 * @brief strideOffsetIsTriviallyCalculable
     675 ** ------------------------------------------------------------------------------------------------------------- */
     676bool Kernel::strideOffsetIsTriviallyCalculable(const Binding & binding) const {
     677    if (requiresCopyBack(binding)) {
     678        const ProcessingRate & rate = binding.getRate();
     679        return rate.isPopCount() || rate.isNegatedPopCount();
     680    }
     681    return true;
     682}
     683
     684/** ------------------------------------------------------------------------------------------------------------- *
     685 * @brief permitsNonLinearAccess
     686 ** ------------------------------------------------------------------------------------------------------------- */
     687bool Kernel::permitsNonLinearAccess(const Binding & binding) const {
     688    if (LLVM_UNLIKELY(requiresLinearAccess(binding))) {
     689        return false;
     690    } else if (LLVM_UNLIKELY(binding.hasAttribute(Attribute::KindId::PermitsNonLinearAccess))) {
    634691        return true;
    635692    } else {
    636         return !(rate.isFixed() || isTransitivelyUnknownRate(rate));
    637     }
    638 }
    639 
    640 /** ------------------------------------------------------------------------------------------------------------- *
    641  * @brief getItemAlignment
    642  ** ------------------------------------------------------------------------------------------------------------- */
    643 inline unsigned LLVM_READNONE MultiBlockKernel::getItemAlignment(const Binding & binding) const {
    644     const auto & rate = binding.getRate();
    645     if (rate.isFixed() && binding.nonDeferred() && !binding.isMisaligned()) {
    646         const auto r = rate.getRate();
    647         auto n = (r.numerator() * mStride);
    648         if (LLVM_LIKELY(r.denominator() == 1)) {
    649             return n;
    650         } else if (LLVM_LIKELY((n % r.denominator()) == 0)) {
    651             return n / r.denominator();
    652         }
    653     }
    654     return 1; // ∀x GCD(x, x + 1) = 1
    655 }
    656 
    657 /** ------------------------------------------------------------------------------------------------------------- *
    658  * @brief getCopyAlignment
    659  ** ------------------------------------------------------------------------------------------------------------- */
    660 inline unsigned LLVM_READNONE MultiBlockKernel::getCopyAlignment(const Binding & binding) const {
    661     return ((getItemAlignment(binding) * getItemWidth(binding)) + 7) / 8;
    662 }
    663 
    664 /** ------------------------------------------------------------------------------------------------------------- *
    665  * @brief getStrideSize
    666  ** ------------------------------------------------------------------------------------------------------------- */
    667 llvm::Value * LLVM_READNONE MultiBlockKernel::getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate) {
    668     // NOTE: if we ever support feedback loops, using upper bound could lead to a deadlock due to data starvation
    669     const auto r = getUpperBound(rate);
    670     if (r.numerator() == 0) {
    671         return nullptr;
    672     } else {
    673         assert ((r.numerator() * mStride) % r.denominator() == 0);
    674         return b->getSize((r.numerator() * mStride) / r.denominator());
    675     }
    676 }
    677 
    678 // #define DEBUG_LOG
     693        return strideOffsetIsTriviallyCalculable(binding);
     694    }
     695}
     696
     697/** ------------------------------------------------------------------------------------------------------------- *
     698 * @brief mustClearOverflowPriorToCopyback
     699 ** ------------------------------------------------------------------------------------------------------------- */
     700bool Kernel::mustClearOverflowPriorToCopyback(const Binding & binding) const {
     701    return requiresCopyBack(binding) && permitsNonLinearAccess(binding) && !strideOffsetIsTriviallyCalculable(binding);
     702}
     703
     704/** ------------------------------------------------------------------------------------------------------------- *
     705 * @brief anyBindingRequiresLinearSpace
     706 ** ------------------------------------------------------------------------------------------------------------- */
     707bool Kernel::anyBindingRequiresLinearSpace() const {
     708    for (const Binding & input : mStreamSetInputs) {
     709        if (requiresLinearAccess(input)) {
     710            return true;
     711        }
     712    }
     713    for (const Binding & output : mStreamSetOutputs) {
     714        if (!permitsNonLinearAccess(output)) {
     715            return true;
     716        }
     717    }
     718    return false;
     719}
    679720
    680721/** ------------------------------------------------------------------------------------------------------------- *
    681722 * @brief generateKernelMethod
    682723 ** ------------------------------------------------------------------------------------------------------------- */
    683 void MultiBlockKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
    684 
    685     if (LLVM_UNLIKELY((mStride % b->getBitBlockWidth()) != 0)) {
    686         report_fatal_error(getName() + ": the Stride (" + std::to_string(mStride) + ") of MultiBlockKernel "
    687                            "must be a multiple of the BitBlockWidth (" + std::to_string(b->getBitBlockWidth()) + ")");
    688     }
    689 
    690     using RateValue = ProcessingRate::RateValue;
    691 
    692     const auto inputSetCount = mStreamSetInputs.size();
    693     const auto outputSetCount = mStreamSetOutputs.size();
    694 
    695     // Define and allocate the temporary buffer area in the prolog.   
    696     const auto blockAlignment = b->getBitBlockWidth() / 8;
    697     AllocaInst * temporaryInputBuffer[inputSetCount];
    698     for (unsigned i = 0; i < inputSetCount; ++i) {       
    699         const Binding & input = mStreamSetInputs[i];
    700         const ProcessingRate & rate = input.getRate();
    701         temporaryInputBuffer[i] = nullptr;
    702         if (requiresTemporaryInputBuffer(input, rate)) {
    703             Type * const ty = mStreamSetInputBuffers[i]->getStreamSetBlockType();
    704             auto ub = getUpperBound(rate);
    705             assert (ub != 0);
    706             if (LLVM_UNLIKELY(input.hasLookahead())) {
    707                 ub += RateValue(input.getLookahead(), mStride);
    708             }
    709             Value * arraySize = b->getInt64(ceiling(ub));
    710             if (input.isSwizzled()) {
    711                 // TODO workaround to use larger temporary buffer size for swizzled buffer
    712                 arraySize = b->CreateMul(arraySize, b->getSize(codegen::BufferSegments * codegen::ThreadNum * codegen::SegmentSize));
    713             }
    714 
    715             AllocaInst * const ptr = b->CreateAlignedAlloca(ty, blockAlignment, arraySize);
    716             assert (ptr->isStaticAlloca());
    717             temporaryInputBuffer[i] = ptr;
    718         }
    719     }
    720 
    721     AllocaInst * temporaryOutputBuffer[outputSetCount];
    722     for (unsigned i = 0; i < outputSetCount; i++) {
    723         const Binding & output = mStreamSetOutputs[i];
    724         const ProcessingRate & rate = output.getRate();
    725         temporaryOutputBuffer[i] = nullptr;
    726         if (requiresTemporaryOutputBuffer(output, rate)) {
    727             auto ub = getUpperBound(rate);
    728             if (ub > 0) {
    729                 if (LLVM_UNLIKELY(mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate))) {
    730                     ub += mStreamSetOutputBuffers[i]->overflowSize();
    731                 }
    732                 Type * const ty = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
    733                 Constant * const arraySize = b->getInt64(ceiling(ub));
    734                 AllocaInst * const ptr = b->CreateAlignedAlloca(ty, blockAlignment, arraySize);
    735                 assert (ptr->isStaticAlloca());
    736                 temporaryOutputBuffer[i] = ptr;
    737             }
    738         }
    739     }
    740 
    741     Constant * const ZERO = b->getSize(0);
    742     Constant * const ONE = b->getSize(1);
    743     Constant * const LOG_2_BLOCK_WIDTH = b->getSize(std::log2(b->getBitBlockWidth()));
    744     Constant * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
    745 
    746     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    747         Value * terminatedTwice = b->CreateAnd(mIsFinal, b->getTerminationSignal());
    748         Value * unprocessedData = nullptr;
    749         for (unsigned i = 0; i < inputSetCount; i++) {
    750             Value * processed = b->getProcessedItemCount(mStreamSetInputs[i].getName());
    751             Value * const check = b->CreateICmpNE(processed, mAvailableItemCount[i]);
    752             unprocessedData = unprocessedData ? b->CreateOr(unprocessedData, check) : check;
    753         }
    754         b->CreateAssertZero(b->CreateAnd(terminatedTwice, unprocessedData),
    755                             getName() + " was called after its termination with additional input data");
    756         b->CreateAssertZero(terminatedTwice,
    757                             getName() + " was called after its termination");
    758     }
    759 
    760     mInitialAvailableItemCount.assign(mAvailableItemCount.begin(), mAvailableItemCount.end());
    761     mInitialProcessedItemCount.resize(inputSetCount);
    762     mStreamSetInputBaseAddress.resize(inputSetCount);
    763 
    764     Value * const initiallyFinal = mIsFinal;
    765     #ifdef DEBUG_LOG
    766     b->CallPrintInt(getName() + "_initiallyFinal", initiallyFinal);
    767     #endif
    768     // Now proceed with creation of the doSegment method.
    769     BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
    770 
    771     b->CreateBr(segmentLoop);
    772 
    773     /// DO SEGMENT LOOP
    774 
    775     b->SetInsertPoint(segmentLoop);
    776 
    777     Value * numOfStrides = nullptr;
    778 
    779     // TODO: we don't want the our available output space to limit how many conditional blocks we
    780     // can check. When we have a conditional region, split computation of input/output strides and
    781     // check as many input strides as possible but leave the kernel in a state that respects our
    782     // available output space. NOTE: we know coming into this block that the pipeline or kernel has
    783     // ensured there is at least one stride worth of space.
    784 
    785 
    786     // For each input buffer, get the initial processed item count, base input pointer, and the number of
    787     // linearly available strides.
    788     Value * inputStrideSize[inputSetCount];
    789     Value * linearlyAccessible[inputSetCount];
    790     for (unsigned i = 0; i < inputSetCount; i++) {
    791         const Binding & input = mStreamSetInputs[i];
    792         const auto & name = input.getName();
    793         Value * const processed = b->getProcessedItemCount(name);
    794         #ifdef DEBUG_LOG
    795         b->CallPrintInt(getName() + "_" + name + "_avail", mAvailableItemCount[i]);
    796         b->CallPrintInt(getName() + "_" + name + "_processed0", processed);
    797         #endif
    798         mInitialProcessedItemCount[i] = processed;
    799         mStreamSetInputBaseAddress[i] = b->getBlockAddress(name, b->CreateLShr(processed, LOG_2_BLOCK_WIDTH));
    800         if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    801             b->CreateAssert(b->CreateICmpULE(processed, mAvailableItemCount[i]),
    802                             getName() + ": " + name + " processed item count exceeds its available item count");
    803         }
    804 
    805         Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], processed);
    806         #ifdef DEBUG_LOG
    807         b->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed);
    808         #endif
    809         Value * const accessible = b->getLinearlyAccessibleItems(name, processed, unprocessed);
    810         #ifdef DEBUG_LOG
    811         b->CallPrintInt(getName() + "_" + name + "_accessible", accessible);
    812         #endif
    813         mAvailableItemCount[i] = unprocessed;
    814         linearlyAccessible[i] = accessible;
    815 
    816         const auto ub = getUpperBound(input.getRate());
    817         inputStrideSize[i] = b->getSize(ceiling(ub * mStride));
    818         Value * accessibleStrides = b->CreateUDiv(accessible, inputStrideSize[i]);
    819 
    820         if (LLVM_UNLIKELY(input.hasAttribute(Attribute::KindId::AlwaysConsume))) {
    821             const auto lb = getLowerBound(input.getRate());
    822             Value * const lowerbound = b->getSize(ceiling(lb * mStride));
    823             Value * const lowerboundStrides = b->CreateZExt(b->CreateICmpUGE(unprocessed, lowerbound), b->getSizeTy());
    824             Value * const tryLowerbound = b->CreateICmpULT(accessibleStrides, lowerboundStrides);
    825             inputStrideSize[i] = b->CreateSelect(tryLowerbound, lowerbound, inputStrideSize[i]);
    826             accessibleStrides = b->CreateSelect(tryLowerbound, lowerboundStrides, accessibleStrides);
    827         }
    828 
    829         numOfStrides = b->CreateUMin(numOfStrides, accessibleStrides);
    830     }
    831 
    832     BasicBlock * const checkInputAvailability = b->CreateBasicBlock("CheckInputAvailability");
    833     BasicBlock * const selectOutputBuffers = b->CreateBasicBlock("SelectOutputBuffers");
    834     b->CreateLikelyCondBr(b->CreateICmpNE(numOfStrides, ZERO), selectOutputBuffers, checkInputAvailability);
    835 
    836     // Ensure that everything between S⌈P/S⌉ and S⌈n*(P + L)/S⌉ is linearly available, where S is the stride size,
    837     // P is the current processed position, L is the lookahead amount and n is our number of accessible strides ∈ ℤ+.
    838     b->SetInsertPoint(checkInputAvailability);
    839     Value * linearlyCopyable[inputSetCount];
    840     PHINode * selectedInputBuffer[inputSetCount];
    841     for (unsigned i = 0; i < inputSetCount; i++) {
    842         AllocaInst * const tempBuffer = temporaryInputBuffer[i];
    843         selectedInputBuffer[i] = nullptr;
    844         if (tempBuffer) {
    845 
    846             const Binding & input = mStreamSetInputs[i];
    847             const auto & name = input.getName();
    848             Value * const processed = mInitialProcessedItemCount[i];
    849             Value * const unprocessed = mAvailableItemCount[i];
    850             Value * const accessible = linearlyAccessible[i];
    851 
    852             BasicBlock * const entry = b->GetInsertBlock();
    853 
    854             Value * strideSize = inputStrideSize[i];
    855             if (LLVM_UNLIKELY(input.hasLookahead())) {
    856                 Constant * const lookahead = b->getSize(input.getLookahead());
    857                 strideSize = b->CreateAdd(strideSize, lookahead);
    858             }
    859 
    860             Value * const requiresCopy = b->CreateICmpULT(accessible, strideSize);
    861 
    862             BasicBlock * const resume = b->CreateBasicBlock(name + "Resume");
    863 
    864             BasicBlock * copyToBackEnd = NULL;
    865             BasicBlock * copyToFrontEnd = NULL;
    866             Value * isPartialStride = NULL;
    867             Value * newAvailable = NULL;
    868 
    869             if (input.isSwizzled()) {
    870                 // Copy at least one whole block for Swizzled input stream
    871                 BasicBlock * const copyFromBack = b->CreateBasicBlock(name + "CopyFromBack");
    872                 BasicBlock * const copyFromFront = b->CreateBasicBlock(name + "CopyFromFront");
    873 
    874                 b->CreateUnlikelyCondBr(requiresCopy, copyFromBack, resume);
    875 
    876                 b->SetInsertPoint(copyFromBack);
    877 
    878 
    879                 Value * const arraySize = b->CreateZExt(tempBuffer->getArraySize(), b->getInt64Ty());
    880                 Value * const temporarySize = b->CreateTrunc(b->CreateMul(arraySize, b->getInt64(mStride)), accessible->getType());
    881 
    882                 Value * const processedOffset = b->CreateAnd(processed, BLOCK_WIDTH_MASK);
    883                 Value * const copyable = b->CreateUMin(b->CreateAdd(unprocessed, processedOffset), temporarySize); // <- we only really need strideSize items
    884                 newAvailable = b->CreateSub(copyable, processedOffset);
    885 //                b->CallPrintInt("newAvailable", newAvailable);
    886 
    887                 Value * const bufferSize = b->CreateMul(ConstantExpr::getSizeOf(tempBuffer->getAllocatedType()), arraySize);
    888                 b->CreateMemZero(tempBuffer, bufferSize, blockAlignment);
    889 
    890 //                b->CallPrintInt("temporarySize", temporarySize);
    891 //                b->CallPrintInt("processed", processed);
    892 //                b->CallPrintInt("unprocessed", unprocessed);
    893 //                b->CallPrintInt("processedOffset", processedOffset);
    894 //                b->CallPrintInt("copyable", copyable);
    895 
    896 //                b->CallPrintInt("streamCpy1", b->getSize(0));
    897                 Value* BIT_BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
    898 
    899                 Value* copyAmount1 = b->CreateAdd(accessible, processedOffset);
    900                 Value* roundCopyAmount = b->CreateMul(b->CreateUDivCeil(copyAmount1, BIT_BLOCK_WIDTH), BIT_BLOCK_WIDTH);
    901                 b->CreateStreamCpy(name, tempBuffer, ZERO, mStreamSetInputBaseAddress[i], ZERO, roundCopyAmount, getItemAlignment(input));
    902 
    903                 copyToBackEnd = b->GetInsertBlock();
    904 
    905                 b->CreateCondBr(b->CreateICmpNE(copyable, b->CreateAdd(accessible, processedOffset)), copyFromFront, resume);
    906 
    907                 b->SetInsertPoint(copyFromFront);
    908                 Value * const remaining = b->CreateSub(copyable, b->CreateAdd(accessible, processedOffset));
    909                 Value * const baseAddress = b->getBaseAddress(name);
    910 //                b->CallPrintInt("streamCpy2", b->getSize(0));
    911 
    912                 auto castedTempBuffer = b->CreatePointerCast(tempBuffer, b->getBitBlockType()->getPointerTo());
    913 
    914                 auto p = b->CreateGEP(
    915                         castedTempBuffer,
    916                         b->CreateMul(
    917                                 b->CreateUDiv(b->CreateAdd(accessible, processedOffset), BIT_BLOCK_WIDTH),
    918                                 b->getSize(this->getAnyStreamSetBuffer(name)->getNumOfStreams())
    919                         )
    920                 );
    921 //                b->CreateStreamCpy(name, tempBuffer, b->CreateAdd(accessible, processedOffset), baseAddress, ZERO, remaining, getItemAlignment(input));
    922 
    923 
    924                 b->CreateStreamCpy(name, p, ZERO, baseAddress, ZERO, b->CreateMul(b->CreateUDivCeil(remaining, BIT_BLOCK_WIDTH), BIT_BLOCK_WIDTH), getItemAlignment(input));
    925                 isPartialStride = b->CreateICmpUGE(copyable, strideSize);
    926                 copyToFrontEnd = b->GetInsertBlock();
    927 
    928 
    929 
    930                 b->CreateBr(resume);
    931             } else {
    932                 BasicBlock * const copyFromBack = b->CreateBasicBlock(name + "CopyFromBack");
    933                 BasicBlock * const copyFromFront = b->CreateBasicBlock(name + "CopyFromFront");
    934 
    935                 b->CreateUnlikelyCondBr(requiresCopy, copyFromBack, resume);
    936 
    937                 b->SetInsertPoint(copyFromBack);
    938                 Value * const arraySize = b->CreateZExt(tempBuffer->getArraySize(), b->getInt64Ty());
    939                 Value * const temporarySize = b->CreateTrunc(b->CreateMul(arraySize, b->getInt64(mStride)), accessible->getType());
    940                 Value * const copyable = b->CreateUMin(unprocessed, temporarySize); // <- we only really need strideSize items
    941                 newAvailable = copyable;
    942                 Value * const offset = b->CreateAnd(processed, BLOCK_WIDTH_MASK);
    943 
    944                 Value * const bufferSize = b->CreateMul(ConstantExpr::getSizeOf(tempBuffer->getAllocatedType()), arraySize);
    945                 b->CreateMemZero(tempBuffer, bufferSize, blockAlignment);
    946 
    947                 b->CreateStreamCpy(name, tempBuffer, ZERO, mStreamSetInputBaseAddress[i], offset, accessible, getItemAlignment(input));
    948 //            b->CallPrintInt("gep", b->CreateGEP(mStreamSetInputBaseAddress[i], b->CreateUDiv(offset, b->getSize(this->getAnyStreamSetBuffer(name)->getNumOfStreams()))));
    949 //            b->CallPrintRegister(name + "_tempBuffer", b->CreateLoad(tempBuffer));
    950                 copyToBackEnd = b->GetInsertBlock();
    951                 b->CreateCondBr(b->CreateICmpNE(copyable, accessible), copyFromFront, resume);
    952 
    953                 b->SetInsertPoint(copyFromFront);
    954                 Value * const remaining = b->CreateSub(copyable, accessible);
    955                 Value * const baseAddress = b->getBaseAddress(name);
    956                 b->CreateStreamCpy(name, tempBuffer, accessible, baseAddress, ZERO, remaining, getItemAlignment(input));
    957                 isPartialStride = b->CreateICmpUGE(copyable, strideSize);
    958                 copyToFrontEnd = b->GetInsertBlock();
    959                 b->CreateBr(resume);
    960             }
    961 
    962             b->SetInsertPoint(resume);
    963             PHINode * const address = b->CreatePHI(tempBuffer->getType(), 3);
    964             address->addIncoming(mStreamSetInputBaseAddress[i], entry);
    965             address->addIncoming(tempBuffer, copyToBackEnd);
    966             address->addIncoming(tempBuffer, copyToFrontEnd);
    967             selectedInputBuffer[i] = address;
    968             PHINode * const available = b->CreatePHI(accessible->getType(), 3);
    969             available->addIncoming(accessible, entry);
    970             available->addIncoming(newAvailable, copyToBackEnd);
    971             available->addIncoming(newAvailable, copyToFrontEnd);
    972             linearlyCopyable[i] = available;
    973             PHINode * const finalStride = b->CreatePHI(b->getInt1Ty(), 3);
    974             finalStride->addIncoming(mIsFinal, entry);
    975             finalStride->addIncoming(b->getTrue(), copyToBackEnd);
    976             finalStride->addIncoming(isPartialStride, copyToFrontEnd);
    977             mIsFinal = finalStride;
    978             if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    979                 Value * const hasStride = b->CreateOr(initiallyFinal, b->CreateNot(finalStride));
    980                 b->CreateAssert(hasStride, getName() + ": " + name + " has insufficient input data for one stride");
    981             }
    982         }
    983     }
    984 
    985     BasicBlock * const endCheckInputAvailability = b->GetInsertBlock();
    986     selectOutputBuffers->moveAfter(endCheckInputAvailability);
    987     b->CreateBr(selectOutputBuffers);
    988 
    989     b->SetInsertPoint(selectOutputBuffers);
    990     PHINode * const final = b->CreatePHI(mIsFinal->getType(), 2);
    991     final->addIncoming(b->getFalse(), segmentLoop);
    992     final->addIncoming(mIsFinal, endCheckInputAvailability);
    993     mIsFinal = final;
    994     for (unsigned i = 0; i < inputSetCount; i++) {
    995         if (selectedInputBuffer[i]) {
    996             PHINode * const address = b->CreatePHI(selectedInputBuffer[i]->getType(), 2);
    997             address->addIncoming(mStreamSetInputBaseAddress[i], segmentLoop);
    998             address->addIncoming(selectedInputBuffer[i], endCheckInputAvailability);
    999             mStreamSetInputBaseAddress[i] = address;
    1000             PHINode * const accessible = b->CreatePHI(linearlyAccessible[i]->getType(), 2);
    1001             accessible->addIncoming(linearlyAccessible[i], segmentLoop);
    1002             accessible->addIncoming(linearlyCopyable[i], endCheckInputAvailability);
    1003             linearlyAccessible[i] = accessible;
    1004         }
    1005     }
    1006     PHINode * const strides = b->CreatePHI(numOfStrides->getType(), 2);
    1007     strides->addIncoming(numOfStrides, segmentLoop);
    1008     strides->addIncoming(ONE, endCheckInputAvailability);
    1009     numOfStrides = strides;
    1010 
    1011     // Now determine the linearly writeable strides
    1012     Value * outputStrideSize[outputSetCount];
    1013     Value * linearlyWritable[outputSetCount];
    1014     mInitialProducedItemCount.resize(outputSetCount);
    1015     mStreamSetOutputBaseAddress.resize(outputSetCount);
    1016     for (unsigned i = 0; i < outputSetCount; i++) {
    1017         const auto & output = mStreamSetOutputs[i];
    1018         const auto & name = output.getName();
    1019         Value * const produced = b->getProducedItemCount(name);
    1020         #ifdef DEBUG_LOG
    1021         b->CallPrintInt(getName() + "_" + name + "_produced0", produced);
    1022         #endif
    1023         Value * baseBuffer = b->getBlockAddress(name, b->CreateLShr(produced, LOG_2_BLOCK_WIDTH));
    1024         mInitialProducedItemCount[i] = produced;
    1025         mStreamSetOutputBaseAddress[i] = baseBuffer;
    1026         linearlyWritable[i] = nullptr;
    1027         // Is the number of linearly writable items sufficient for a stride?
    1028         outputStrideSize[i] = getStrideSize(b, output.getRate());
    1029         if (outputStrideSize[i]) {
    1030             linearlyWritable[i] = b->getLinearlyWritableItems(name, produced);
    1031             #ifdef DEBUG_LOG
    1032             b->CallPrintInt(getName() + "_" + name + "_writable", linearlyWritable[i]);
    1033             #endif
    1034             Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);
    1035             numOfStrides = b->CreateUMin(numOfStrides, writableStrides);
    1036             // Do we require a temporary buffer to write to?
    1037             AllocaInst * const tempBuffer = temporaryOutputBuffer[i];
    1038             if (tempBuffer) {
    1039                 assert (tempBuffer->getType() == baseBuffer->getType());
    1040                 BasicBlock * const entry = b->GetInsertBlock();
    1041                 BasicBlock * const prepareTempBuffer = b->CreateBasicBlock(name + "PrepareTempBuffer");
    1042                 BasicBlock * const resume = b->CreateBasicBlock(name + "Resume");
    1043                 Value * const requiresCopy = b->CreateICmpEQ(writableStrides, ZERO);
    1044                 b->CreateUnlikelyCondBr(requiresCopy, prepareTempBuffer, resume);
    1045                 // Clear the output buffer prior to using it
    1046                 b->SetInsertPoint(prepareTempBuffer);
    1047                 Value * const bufferSize = b->CreateMul(ConstantExpr::getSizeOf(tempBuffer->getAllocatedType()), tempBuffer->getArraySize());
    1048                 b->CreateMemZero(tempBuffer, bufferSize, blockAlignment);               
    1049                 b->CreateBr(resume);
    1050                 // Select the appropriate buffer / stride #
    1051                 b->SetInsertPoint(resume);
    1052                 PHINode * const phiBuffer = b->CreatePHI(baseBuffer->getType(), 3);
    1053                 phiBuffer->addIncoming(baseBuffer, entry);
    1054                 phiBuffer->addIncoming(tempBuffer, prepareTempBuffer);
    1055                 baseBuffer = phiBuffer;
    1056                 PHINode * const phiStrides = b->CreatePHI(b->getSizeTy(), 2);
    1057                 phiStrides->addIncoming(numOfStrides, entry);
    1058                 phiStrides->addIncoming(ONE, prepareTempBuffer);
    1059                 numOfStrides = phiStrides;
    1060             }
    1061             if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    1062                 b->CreateAssert(numOfStrides, getName() + ": " + name + " has insufficient output space for one stride");
    1063             }
    1064         }
    1065     }
    1066 
    1067     // Update the locally available item count to reflect the current state
    1068     for (unsigned i = 0; i < inputSetCount; i++) {
    1069         const Binding & input = mStreamSetInputs[i];
    1070         if (input.getRate().isFixed() && input.nonDeferred()) {
    1071             Value * const processable = b->CreateMul(numOfStrides, inputStrideSize[i]);
    1072             linearlyAccessible[i] = b->CreateSelect(mIsFinal, linearlyAccessible[i], processable);
    1073         }
    1074         mAvailableItemCount[i] = linearlyAccessible[i];
    1075         #ifdef DEBUG_LOG
    1076         b->CallPrintInt(getName() + "_" + input.getName() + "_accessible", linearlyAccessible[i]);
    1077         #endif
    1078     }
    1079 
    1080     //  We have one or more strides of input data and output buffer space for all stream sets.
    1081     generateMultiBlockLogic(b, numOfStrides);
    1082 
    1083     for (unsigned i = 0; i < inputSetCount; ++i) {
    1084         const auto & input = mStreamSetInputs[i];
    1085         const ProcessingRate & rate = input.getRate();
    1086         if (rate.isFixed() && input.nonDeferred()) {
    1087             Value * const ic = b->CreateAdd(mInitialProcessedItemCount[i], mAvailableItemCount[i]);
    1088             b->setProcessedItemCount(input.getName(), ic);
    1089         }
    1090         #ifdef DEBUG_LOG
    1091         b->CallPrintInt(getName() + "_" + input.getName() + "_processed", b->getProcessedItemCount(input.getName()));
    1092         #endif
    1093     }
    1094 
    1095     for (unsigned i = 0; i < outputSetCount; ++i) {
    1096         const auto & output = mStreamSetOutputs[i];
    1097         const ProcessingRate & rate = output.getRate();
    1098         if (rate.isFixed()) {
    1099             Value * const produced = b->CreateMul(numOfStrides, outputStrideSize[i]);
    1100             Value * const ic = b->CreateAdd(mInitialProducedItemCount[i], produced);
    1101             b->setProducedItemCount(output.getName(), ic);
    1102         }
    1103         #ifdef DEBUG_LOG
    1104         b->CallPrintInt(getName() + "_" + output.getName() + "_produced", b->getProducedItemCount(output.getName()));
    1105         #endif
    1106     }
    1107 
    1108     BasicBlock * const handleFinalBlock = b->CreateBasicBlock("HandleFinalBlock");
    1109     BasicBlock * const temporaryBufferCopyBack = b->CreateBasicBlock("TemporaryBufferCopyBack");
    1110     BasicBlock * const strideDone = b->CreateBasicBlock("MultiBlockDone");
    1111 
    1112     b->CreateUnlikelyCondBr(mIsFinal, handleFinalBlock, temporaryBufferCopyBack);
    1113 
    1114 
    1115     /// FINAL STRIDE ADJUSTMENT
    1116     b->SetInsertPoint(handleFinalBlock);
    1117 
    1118     // If this is our final stride, adjust the Fixed output item counts. The main loop assumes that
    1119     // the ITEM COUNT % FIXED RATE = 0 for all Fixed Input and Output streams. We correct that here
    1120     // to calculate them based on the actual input item counts.
    1121 
    1122     reviseFinalProducedItemCounts(b);
    1123 
    1124     b->CreateBr(temporaryBufferCopyBack);
    1125 
    1126     /// TEMPORARY BUFFER COPY BACK
    1127     b->SetInsertPoint(temporaryBufferCopyBack);
    1128 
    1129     // Copy back data to the actual output buffers.
    1130     for (unsigned i = 0; i < outputSetCount; i++) {
    1131         AllocaInst * const tempBuffer = temporaryOutputBuffer[i];
    1132         if (LLVM_UNLIKELY(tempBuffer == nullptr)) {
    1133             continue;
    1134         }
    1135         const auto & name = mStreamSetOutputs[i].getName();
    1136         Value * const produced = b->getProducedItemCount(name);
    1137         Value * const baseBuffer = mStreamSetOutputBaseAddress[i];
    1138         assert ("stack corruption likely" && (tempBuffer->getType() == baseBuffer->getType()));
    1139         //const auto & name = mStreamSetOutputs[i].getName();
    1140         BasicBlock * const copyToBack = b->CreateBasicBlock(name + "CopyToBack");
    1141         BasicBlock * const copyToFront = b->CreateBasicBlock(name + "CopyToFront");
    1142         BasicBlock * const resume = b->CreateBasicBlock(name + "ResumeCopyBack");
    1143         // If we used a temporary buffer, copy it back to the original output buffer
    1144         Value * const requiresCopy = b->CreateICmpEQ(tempBuffer, baseBuffer);
    1145         b->CreateCondBr(requiresCopy, copyToBack, resume);
    1146 
    1147         b->SetInsertPoint(copyToBack);       
    1148         Value * const offset = b->CreateAnd(mInitialProducedItemCount[i], BLOCK_WIDTH_MASK);
    1149         //Value * const newProducedItemCount = b->getProducedItemCount(name);
    1150         Value * const newlyProduced = b->CreateSub(produced, mInitialProducedItemCount[i]);
    1151 
    1152 
    1153         Value * const toWrite = b->CreateUMin(newlyProduced, linearlyWritable[i]);
    1154         const auto alignment = getItemAlignment(mStreamSetOutputs[i]);
    1155         b->CreateStreamCpy(name, baseBuffer, offset, tempBuffer, ZERO, toWrite, alignment);
    1156         // If we required a temporary output buffer, we will probably need to write to the beginning of the buffer as well.
    1157         b->CreateLikelyCondBr(b->CreateICmpULT(toWrite, newlyProduced), copyToFront, resume);
    1158 
    1159         b->SetInsertPoint(copyToFront);
    1160         Value * const remaining = b->CreateSub(newlyProduced, toWrite);
    1161         Value * const baseAddress = b->getBaseAddress(name);
    1162         b->CreateStreamCpy(name, baseAddress, ZERO, tempBuffer, toWrite, remaining, alignment);
    1163         b->CreateBr(resume);
    1164 
    1165         b->SetInsertPoint(resume);
    1166     }
    1167 
    1168     //  We've dealt with the partial block processing and copied information back into the
    1169     //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
    1170     BasicBlock * const segmentDone = b->CreateBasicBlock("SegmentDone");
    1171 
    1172     if (hasAttribute(Attribute::KindId::MustExplicitlyTerminate) || hasAttribute(Attribute::KindId::CanTerminateEarly)) {
    1173         mIsFinal = b->CreateOr(mIsFinal, b->getTerminationSignal());
    1174     }
    1175 
    1176     b->CreateCondBr(mIsFinal, segmentDone, strideDone);
    1177 
    1178     /// STRIDE DONE
    1179     strideDone->moveAfter(b->GetInsertBlock());
    1180     b->SetInsertPoint(strideDone);
    1181 
    1182     // do we have enough data for another stride?
    1183     Value * hasMoreStrides = b->getTrue();
    1184     for (unsigned i = 0; i < inputSetCount; ++i) {
    1185         const Binding & input = mStreamSetInputs[i];
    1186         const auto & name = input.getName();
    1187         Value * const avail = mInitialAvailableItemCount[i];
    1188         Value * const processed = b->getProcessedItemCount(name);
    1189 //        b->CallPrintInt(getName() + "_" + name + "_processed'", processed);
    1190 
    1191         if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    1192             b->CreateAssert(b->CreateICmpULE(processed, avail), getName() + ": " + name + " processed data exceeds available data");
    1193         }
    1194         Value * const remaining = b->CreateSub(avail, processed);
    1195         Value * strideSize = inputStrideSize[i];
    1196         if (LLVM_UNLIKELY(input.hasLookahead())) {
    1197             strideSize = b->CreateAdd(strideSize, b->getSize(input.getLookahead()));
    1198         }
    1199         Value * const hasRemainingStrides = b->CreateICmpUGE(remaining, strideSize);
    1200         hasMoreStrides = b->CreateAnd(hasMoreStrides, hasRemainingStrides);
    1201     }
    1202 
    1203     // even if we do not have enough input data for a full stride, if this is our final stride, allow it ...
    1204     hasMoreStrides = b->CreateOr(hasMoreStrides, initiallyFinal);
    1205 
    1206     // do we have enough room for another stride?
    1207     for (unsigned i = 0; i < outputSetCount; ++i) {
    1208         const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    1209         const auto & name = mStreamSetOutputs[i].getName();
    1210         Value * const produced = b->getProducedItemCount(name);
    1211 
    1212         // If this output has a Fixed/Bounded rate, determine whether we have room for another stride.
    1213         if (LLVM_LIKELY(outputStrideSize[i] != nullptr)) {
    1214             Value * const consumed = b->getConsumedItemCount(name);
    1215             if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    1216                 b->CreateAssert(b->CreateICmpULE(consumed, produced),
    1217                                 getName() + ": " + name + " consumed data exceeds produced data");
    1218             }
    1219             Value * const unconsumed = b->CreateSub(produced, consumed);
    1220 
    1221 //            b->CallPrintInt(getName() + "_" + name + "_unconsumed", unconsumed);
    1222 
    1223             Value * const capacity = b->getBufferedSize(name);
    1224 
    1225 //            b->CallPrintInt(getName() + "_" + name + "_capacity", capacity);
    1226 
    1227             if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    1228                 b->CreateAssert(b->CreateICmpULE(unconsumed, capacity),
    1229                                 getName() + ": " + name + " more data was written than its capacity allows");
    1230             }
    1231 
    1232 
    1233 
    1234             Value * const remaining = b->CreateSub(capacity, unconsumed);
    1235             Value * const hasRemainingStrides = b->CreateICmpUGE(remaining, outputStrideSize[i]);
    1236             hasMoreStrides = b->CreateAnd(hasMoreStrides, hasRemainingStrides);
    1237         }
    1238         // Do copybacks if necessary.
    1239         if (mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate)) {
    1240             BasicBlock * const copyBack = b->CreateBasicBlock(name + "CopyBack");
    1241             BasicBlock * const done = b->CreateBasicBlock(name + "CopyBackDone");
    1242 
    1243             Value * const bufferSize = b->getBufferedSize(name);
    1244             Value * const prior = b->CreateURem(mInitialProducedItemCount[i], bufferSize);
    1245             Value * const current = b->CreateURem(produced, bufferSize);
    1246             b->CreateUnlikelyCondBr(b->CreateICmpUGT(prior, current), copyBack, done);
    1247 
    1248             b->SetInsertPoint(copyBack);
    1249             const auto copyAlignment = getItemAlignment(mStreamSetOutputs[i]);
    1250             Value * const startOfBuffer = b->getBaseAddress(name);
    1251             Value * const offset = b->CreateUDiv(bufferSize, b->getSize(b->getBitBlockWidth()));
    1252             Value * const endOfBuffer = b->CreateGEP(startOfBuffer, offset);
    1253             b->CreateStreamCpy(name, startOfBuffer, ZERO, endOfBuffer, ZERO, current, copyAlignment);
    1254             b->CreateBr(done);
    1255 
    1256             b->SetInsertPoint(done);
    1257         }
    1258     }
    1259 
    1260     b->CreateCondBr(hasMoreStrides, segmentLoop, segmentDone);
    1261 
    1262     /// SEGMENT DONE
    1263     segmentDone->moveAfter(b->GetInsertBlock());
    1264     b->SetInsertPoint(segmentDone);
    1265 
    1266 }
    1267 
    1268 /** ------------------------------------------------------------------------------------------------------------- *
    1269  * @brief requiresCopyBack
    1270  ** ------------------------------------------------------------------------------------------------------------- */
    1271 bool MultiBlockKernel::requiresCopyBack(const ProcessingRate & rate) const {
    1272     if (rate.isBounded() || rate.isUnknown()) {
    1273         return true;
    1274     } else if (rate.isRelative()) {
    1275         return requiresCopyBack(getBinding(rate.getReference()).getRate());
    1276     }
    1277     return false;
    1278 }
    1279 
    1280 /** ------------------------------------------------------------------------------------------------------------- *
    1281  * @brief CreateUDivCeil
    1282  ** ------------------------------------------------------------------------------------------------------------- */
    1283 inline Value * CreateUDivCeil(const std::unique_ptr<KernelBuilder> & b, Value * const number, const ProcessingRate::RateValue divisor, const Twine & Name = "") {
    1284     Constant * const n = ConstantInt::get(number->getType(), divisor.numerator());
    1285     if (LLVM_LIKELY(divisor.denominator() == 1)) {
    1286         return b->CreateUDivCeil(number, n, Name);
    1287     } else {
    1288         //   âŒŠ(num + ratio - 1) / ratio⌋
    1289         // = ⌊(num - 1) / (n/d)⌋ + (ratio/ratio)
    1290         // = ⌊(d * (num - 1)) / n⌋ + 1
    1291         Constant * const ONE = ConstantInt::get(number->getType(), 1);
    1292         Constant * const d = ConstantInt::get(number->getType(), divisor.denominator());
    1293         return b->CreateAdd(b->CreateUDiv(b->CreateMul(b->CreateSub(number, ONE), d), n), ONE, Name);
    1294     }
    1295 }
    1296 
    1297 
    1298 /** ------------------------------------------------------------------------------------------------------------- *
    1299  * @brief reviseFinalProducedItemCounts
    1300  ** ------------------------------------------------------------------------------------------------------------- */
    1301 void MultiBlockKernel::reviseFinalProducedItemCounts(const std::unique_ptr<KernelBuilder> & b) {
    1302 
    1303     if (LLVM_UNLIKELY(mStreamSetInputs.empty())) {
    1304         return;
    1305     }
    1306 
    1307     const auto inputSetCount = mStreamSetInputs.size();
    1308 
    1309     ProcessingRate::RateValue rateLCM(1);
    1310     unsigned first = 0;
    1311     unsigned last = inputSetCount;
    1312 
    1313     bool hasFixedRateInput = false; // <- temporary workaround
    1314     for (unsigned i = 0; i < inputSetCount; ++i) {
    1315         const ProcessingRate & pr = mStreamSetInputs[i].getRate();
    1316         if (pr.isFixed()) {
    1317             rateLCM = lcm(rateLCM, pr.getRate());
    1318             hasFixedRateInput = true;
    1319             if (mStreamSetInputs[i].isPrincipal()) {
    1320                 assert ("A kernel cannot have multiple principle input streams" && (first == 0 && last == inputSetCount));
    1321                 first = i;
    1322                 last = i + 1;
    1323             }
    1324         }       
    1325     }
    1326 
    1327     bool noFixedRateOutput = true;
    1328 
    1329     for (const Binding & output : mStreamSetOutputs) {
    1330         const ProcessingRate & pr = output.getRate();
    1331         if (pr.isFixed()) {
    1332             rateLCM = lcm(rateLCM, pr.getRate());
    1333             noFixedRateOutput = false;
    1334         }
    1335     }
    1336 
    1337     if (noFixedRateOutput) {
    1338         return;
    1339     }
    1340 
    1341     Value * baseInitialProcessedItemCount = nullptr;
    1342     Value * scaledInverseOfAvailItemCount = nullptr;
    1343 
    1344     // For each Fixed output stream, this calculates:
    1345 
    1346     //    CEILING(MIN(Available Item Count / Fixed Input Rate) * Fixed Output Rate)
    1347 
    1348     // But avoids the possibility of overflow errors (assuming that each processed item count does not overflow)
    1349 
    1350     for (unsigned i = first; i < last; ++i) {
    1351         const ProcessingRate & pr = mStreamSetInputs[i].getRate();
    1352         if (pr.isFixed()) {
    1353             Value * p = mInitialProcessedItemCount[i];
    1354             Value * a = b->CreateSub(mInitialAvailableItemCount[i], p);
    1355             const auto & rate = pr.getRate();
    1356             if (LLVM_UNLIKELY(rateLCM != rate)) {
    1357                 const auto factor = rateLCM / rate;
    1358                 if (LLVM_UNLIKELY(factor.numerator() > 1)) {
    1359                     a = b->CreateMul(a, b->getSize(factor.numerator()));
    1360                 }
    1361                 if (LLVM_UNLIKELY(factor.denominator() > 1)) {
    1362                     a = b->CreateUDiv(a, b->getSize(factor.denominator()));
    1363                 }
    1364             }
    1365             if (LLVM_UNLIKELY(rate.denominator() > 1)) {
    1366                 p = b->CreateMul(p, b->getSize(rate.denominator()));
    1367             }
    1368             if (LLVM_UNLIKELY(rate.numerator() > 1)) {
    1369                 p = b->CreateUDiv(p, b->getSize(rate.numerator()));
    1370             }
    1371             if (scaledInverseOfAvailItemCount) {
    1372                 scaledInverseOfAvailItemCount = b->CreateUMin(scaledInverseOfAvailItemCount, a);
    1373                 baseInitialProcessedItemCount = b->CreateUMin(baseInitialProcessedItemCount, p);
    1374             } else {
    1375                 scaledInverseOfAvailItemCount = a;
    1376                 baseInitialProcessedItemCount = p;
    1377             }
    1378         }
    1379     }
    1380 
    1381     for (const Binding & output : mStreamSetOutputs) {
    1382         const auto name = output.getName();
    1383         const ProcessingRate & pr = output.getRate();
    1384         Value * produced = nullptr;
    1385         if (hasFixedRateInput && pr.isFixed() && output.nonDeferred()) {
    1386             assert (baseInitialProcessedItemCount && scaledInverseOfAvailItemCount);
    1387             const auto rate = pr.getRate();
    1388             Value * p = baseInitialProcessedItemCount;
    1389             if (LLVM_UNLIKELY(rate.numerator() != 1)) {
    1390                 p = b->CreateMul(p, b->getSize(rate.numerator()));
    1391             }
    1392             if (LLVM_UNLIKELY(rate.denominator() != 1)) {
    1393                 p = b->CreateUDiv(p, b->getSize(rate.denominator()));
    1394             }
    1395             Value * const ic = CreateUDivCeil(b, scaledInverseOfAvailItemCount, rateLCM / pr.getRate());
    1396             produced = b->CreateAdd(p, ic);
    1397             #ifdef DEBUG_LOG
    1398             b->CallPrintInt(getName() + "_" + name + "_produced'", produced);
    1399             #endif           
    1400         } else { // check if we have an attribute; if so, get the current produced count and adjust it
    1401             bool noAttributes = true;
    1402             for (const Attribute & attr : output.getAttributes()) {
    1403                 if (attr.isAdd() || attr.isRoundUpTo()) {
    1404                     noAttributes = false;
    1405                     break;
    1406                 }
    1407             }
    1408             if (noAttributes) {
    1409                 continue;
    1410             }
    1411             produced = b->getProducedItemCount(name);
    1412         }
    1413         for (const Attribute & attr : output.getAttributes()) {
    1414             if (attr.isAdd()) {
    1415                 produced = b->CreateAdd(produced, b->getSize(attr.amount()));
    1416             } else if (attr.isRoundUpTo()) {
    1417                 produced = b->CreateRoundUp(produced, b->getSize(attr.amount()));
    1418             }
    1419         }
    1420         #ifdef DEBUG_LOG
    1421         b->CallPrintInt(getName() + "_" + name + "_produced\"", produced);
    1422         #endif
    1423         b->setProducedItemCount(name, produced);
    1424     }
    1425 
    1426 }
    1427 
    1428 /** ------------------------------------------------------------------------------------------------------------- *
    1429  * @brief generateMultiBlockLogic
    1430  ** ------------------------------------------------------------------------------------------------------------- */
    1431 void BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
    1432 
    1433     if (LLVM_UNLIKELY(mStride != b->getBitBlockWidth())) {
    1434         report_fatal_error(getName() + ": the Stride (" + std::to_string(mStride) + ") of BlockOrientedKernel "
    1435                            "equal to the BitBlockWidth (" + std::to_string(b->getBitBlockWidth()) + ")");
    1436     }
    1437 
    1438     Constant * const LOG_2_BLOCK_WIDTH = b->getSize(std::log2(b->getBitBlockWidth()));
    1439 
    1440     BasicBlock * const entryBlock = b->GetInsertBlock();
    1441     mStrideLoopBody = b->CreateBasicBlock(getName() + "_strideLoopBody");
    1442     BasicBlock * const stridesDone = b->CreateBasicBlock(getName() + "_stridesDone");
    1443     BasicBlock * const doFinalBlock = b->CreateBasicBlock(getName() + "_doFinalBlock");
    1444     BasicBlock * const segmentDone = b->CreateBasicBlock(getName() + "_segmentDone");
    1445 
    1446     const auto inputSetCount = mStreamSetInputs.size();
    1447     Value * baseProcessedIndex[inputSetCount];
    1448     Value * baseInputAddress[inputSetCount];
    1449     for (unsigned i = 0; i < inputSetCount; i++) {
    1450         const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    1451         if (LLVM_UNLIKELY(!rate.isFixed())) {
    1452             Value * const ic = mInitialProcessedItemCount[i];
    1453             baseProcessedIndex[i] = b->CreateLShr(ic, LOG_2_BLOCK_WIDTH);
    1454         }
    1455         baseInputAddress[i] = mStreamSetInputBaseAddress[i];
    1456     }
    1457 
    1458     const auto outputSetCount = mStreamSetOutputs.size();
    1459     Value * baseProducedIndex[outputSetCount];
    1460     Value * baseOutputAddress[inputSetCount];
    1461     for (unsigned i = 0; i < outputSetCount; i++) {
    1462         const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    1463         if (LLVM_UNLIKELY(!rate.isFixed())) {
    1464             Value * const ic = b->getProducedItemCount(mStreamSetOutputs[i].getName());
    1465             baseProducedIndex[i] = b->CreateLShr(ic, LOG_2_BLOCK_WIDTH);
    1466         }
    1467         baseOutputAddress[i] = mStreamSetOutputBaseAddress[i];
    1468     }
    1469 
    1470     b->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, mStrideLoopBody);
    1471 
    1472     /// BLOCK BODY
    1473 
    1474     b->SetInsertPoint(mStrideLoopBody);
    1475 
    1476     if (b->supportsIndirectBr()) {
    1477         Value * const baseTarget = BlockAddress::get(segmentDone);
    1478         mStrideLoopTarget = b->CreatePHI(baseTarget->getType(), 2, "strideTarget");
    1479         mStrideLoopTarget->addIncoming(baseTarget, entryBlock);
    1480     }
    1481 
    1482     mStrideBlockIndex = b->CreatePHI(b->getSizeTy(), 2);
    1483     mStrideBlockIndex->addIncoming(b->getSize(0), entryBlock);
    1484 
    1485     /// GENERATE DO BLOCK METHOD
    1486 
    1487     for (unsigned i = 0; i < inputSetCount; ++i) {
    1488         Value * index = mStrideBlockIndex;
    1489         const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    1490         if (LLVM_UNLIKELY(!rate.isFixed())) {
    1491             Value * ic = b->getProcessedItemCount(mStreamSetInputs[i].getName());
    1492             index = b->CreateSub(b->CreateLShr(ic, LOG_2_BLOCK_WIDTH), baseProcessedIndex[i]);
    1493         }
    1494         mStreamSetInputBaseAddress[i] = b->CreateGEP(mStreamSetInputBaseAddress[i], index);
    1495     }
    1496 
    1497     for (unsigned i = 0; i < outputSetCount; ++i) {
    1498         Value * index = mStrideBlockIndex;
    1499         const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    1500         if (LLVM_UNLIKELY(!rate.isFixed())) {
    1501             Value * ic = b->getProducedItemCount(mStreamSetOutputs[i].getName());
    1502             index = b->CreateSub(b->CreateLShr(ic, LOG_2_BLOCK_WIDTH), baseProducedIndex[i]);
    1503         }
    1504         mStreamSetOutputBaseAddress[i] = b->CreateGEP(mStreamSetOutputBaseAddress[i], index);
    1505     }
    1506 
    1507     writeDoBlockMethod(b);
    1508 
    1509     BasicBlock * const bodyEnd = b->GetInsertBlock();
    1510     if (mStrideLoopTarget) {
    1511         mStrideLoopTarget->addIncoming(mStrideLoopTarget, bodyEnd);
    1512     }
    1513 
    1514     Value * const nextIndex = b->CreateAdd(mStrideBlockIndex, b->getSize(1));
    1515     mStrideBlockIndex->addIncoming(nextIndex, bodyEnd);
    1516     Value * const notDone = b->CreateICmpULT(nextIndex, numOfBlocks);
    1517     b->CreateCondBr(notDone, mStrideLoopBody, stridesDone);
    1518 
    1519     stridesDone->moveAfter(bodyEnd);
    1520 
    1521     /// STRIDE DONE
    1522 
    1523     b->SetInsertPoint(stridesDone);
    1524 
    1525     // Now conditionally perform the final block processing depending on the doFinal parameter.
    1526     if (mStrideLoopTarget) {
    1527         mStrideLoopBranch = b->CreateIndirectBr(mStrideLoopTarget, 3);
    1528         mStrideLoopBranch->addDestination(doFinalBlock);
    1529         mStrideLoopBranch->addDestination(segmentDone);
    1530     } else {
    1531         b->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone);
    1532     }
    1533 
    1534     doFinalBlock->moveAfter(stridesDone);
    1535 
    1536     /// DO FINAL BLOCK
    1537 
    1538     b->SetInsertPoint(doFinalBlock);
    1539     for (unsigned i = 0; i < inputSetCount; ++i) {
    1540         mStreamSetInputBaseAddress[i] = baseInputAddress[i];
    1541     }
    1542 
    1543     for (unsigned i = 0; i < outputSetCount; ++i) {
    1544         mStreamSetOutputBaseAddress[i] = baseOutputAddress[i];
    1545     }
    1546 
    1547     writeFinalBlockMethod(b, getRemainingItems(b));
    1548 
    1549     b->CreateBr(segmentDone);
    1550 
    1551     segmentDone->moveAfter(b->GetInsertBlock());
    1552 
    1553     b->SetInsertPoint(segmentDone);
    1554 
    1555     // Update the branch prediction metadata to indicate that the likely target will be segmentDone
    1556     if (mStrideLoopTarget) {
    1557         MDBuilder mdb(b->getContext());
    1558         const auto destinations = mStrideLoopBranch->getNumDestinations();
    1559         uint32_t weights[destinations];
    1560         for (unsigned i = 0; i < destinations; ++i) {
    1561             weights[i] = (mStrideLoopBranch->getDestination(i) == segmentDone) ? 100 : 1;
    1562         }
    1563         ArrayRef<uint32_t> bw(weights, destinations);
    1564         mStrideLoopBranch->setMetadata(LLVMContext::MD_prof, mdb.createBranchWeights(bw));
    1565     }
    1566 
    1567 }
    1568 
    1569 /** ------------------------------------------------------------------------------------------------------------- *
    1570  * @brief getRemainingItems
    1571  ** ------------------------------------------------------------------------------------------------------------- */
    1572 Value * BlockOrientedKernel::getRemainingItems(const std::unique_ptr<KernelBuilder> & b) {
    1573     Value * remainingItems = nullptr;
    1574     const auto count = mStreamSetInputs.size();
    1575     if (count == 1) {
    1576         return mAvailableItemCount[0];
    1577     } else {
    1578         for (unsigned i = 0; i < count; i++) {
    1579             if (mStreamSetInputs[i].isPrincipal()) {
    1580                 return mAvailableItemCount[i];
    1581             }
    1582         }
    1583         for (unsigned i = 0; i < count; ++i) {
    1584             const ProcessingRate & r = mStreamSetInputs[i].getRate();
    1585             if (r.isFixed()) {
    1586                 Value * ic = CreateUDivCeil(b, mAvailableItemCount[i], r.getRate());
    1587                 if (remainingItems) {
    1588                     remainingItems = b->CreateUMin(remainingItems, ic);
    1589                 } else {
    1590                     remainingItems = ic;
    1591                 }
    1592             }
    1593         }
    1594     }
    1595     return remainingItems;
    1596 }
    1597 
    1598 /** ------------------------------------------------------------------------------------------------------------- *
    1599  * @brief writeDoBlockMethod
    1600  ** ------------------------------------------------------------------------------------------------------------- */
    1601 inline void BlockOrientedKernel::writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) {
    1602 
    1603     Value * const self = getInstance();
    1604     Function * const cp = mCurrentMethod;
    1605     auto ip = b->saveIP();
    1606     std::vector<Value *> availableItemCount(0);
    1607 
    1608     /// Check if the do block method is called and create the function if necessary
    1609     if (!b->supportsIndirectBr()) {
    1610 
    1611         std::vector<Type *> params;
    1612         params.reserve(1 + mAvailableItemCount.size());
    1613         params.push_back(self->getType());
    1614         for (Value * avail : mAvailableItemCount) {
    1615             params.push_back(avail->getType());
    1616         }
    1617 
    1618         FunctionType * const type = FunctionType::get(b->getVoidTy(), params, false);
    1619         mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, b->getModule());
    1620         mCurrentMethod->setCallingConv(CallingConv::C);
    1621         mCurrentMethod->setDoesNotThrow();
    1622         auto args = mCurrentMethod->arg_begin();
    1623         args->setName("self");
    1624         setInstance(&*args);
    1625         availableItemCount.reserve(mAvailableItemCount.size());
    1626         while (++args != mCurrentMethod->arg_end()) {
    1627             availableItemCount.push_back(&*args);
    1628         }
    1629         assert (availableItemCount.size() == mAvailableItemCount.size());
    1630         mAvailableItemCount.swap(availableItemCount);
    1631         b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
    1632     }
    1633 
    1634     generateDoBlockMethod(b); // must be implemented by the BlockOrientedKernelBuilder subtype
    1635 
    1636     if (!b->supportsIndirectBr()) {
    1637         // Restore the DoSegment function state then call the DoBlock method
    1638         b->CreateRetVoid();
    1639         mDoBlockMethod = mCurrentMethod;
    1640         b->restoreIP(ip);
    1641         setInstance(self);
    1642         mCurrentMethod = cp;
    1643         mAvailableItemCount.swap(availableItemCount);
    1644         CreateDoBlockMethodCall(b);
    1645     }
    1646 
    1647 }
    1648 
    1649 /** ------------------------------------------------------------------------------------------------------------- *
    1650  * @brief writeFinalBlockMethod
    1651  ** ------------------------------------------------------------------------------------------------------------- */
    1652 inline void BlockOrientedKernel::writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * remainingItems) {
    1653 
    1654     Value * const self = getInstance();
    1655     Function * const cp = mCurrentMethod;
    1656     Value * const remainingItemCount = remainingItems;
    1657     auto ip = b->saveIP();
    1658     std::vector<Value *> availableItemCount(0);
    1659 
    1660     if (!b->supportsIndirectBr()) {
    1661         std::vector<Type *> params;
    1662         params.reserve(2 + mAvailableItemCount.size());
    1663         params.push_back(self->getType());
    1664         params.push_back(b->getSizeTy());
    1665         for (Value * avail : mAvailableItemCount) {
    1666             params.push_back(avail->getType());
    1667         }
    1668         FunctionType * const type = FunctionType::get(b->getVoidTy(), params, false);
    1669         mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, b->getModule());
    1670         mCurrentMethod->setCallingConv(CallingConv::C);
    1671         mCurrentMethod->setDoesNotThrow();
    1672         auto args = mCurrentMethod->arg_begin();
    1673         args->setName("self");
    1674         setInstance(&*args);
    1675         remainingItems = &*(++args);
    1676         remainingItems->setName("remainingItems");
    1677         availableItemCount.reserve(mAvailableItemCount.size());
    1678         while (++args != mCurrentMethod->arg_end()) {
    1679             availableItemCount.push_back(&*args);
    1680         }
    1681         assert (availableItemCount.size() == mAvailableItemCount.size());
    1682         mAvailableItemCount.swap(availableItemCount);
    1683         b->SetInsertPoint(BasicBlock::Create(b->getContext(), "entry", mCurrentMethod));
    1684     }
    1685 
    1686     #ifdef DEBUG_LOG
    1687     b->CallPrintInt(getName() + "_remainingItems", remainingItems);
    1688     #endif
    1689     generateFinalBlockMethod(b, remainingItems); // may be implemented by the BlockOrientedKernel subtype
    1690 
    1691     if (!b->supportsIndirectBr()) {
    1692         b->CreateRetVoid();
    1693         b->restoreIP(ip);
    1694         setInstance(self);
    1695         mAvailableItemCount.swap(availableItemCount);
    1696         // Restore the DoSegment function state then call the DoFinal method
    1697         std::vector<Value *> args;
    1698         args.reserve(2 + mAvailableItemCount.size());
    1699         args.push_back(self);
    1700         args.push_back(remainingItemCount);
    1701         args.insert(args.end(), mAvailableItemCount.begin(), mAvailableItemCount.end());
    1702         b->CreateCall(mCurrentMethod, args);
    1703         mCurrentMethod = cp;
    1704     }
    1705 
    1706 }
    1707 
    1708 /** ------------------------------------------------------------------------------------------------------------- *
    1709  * @brief generateFinalBlockMethod
    1710  ** ------------------------------------------------------------------------------------------------------------- */
    1711 void BlockOrientedKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, Value * /* remainingItems */) {
    1712     //  The default finalBlock method simply dispatches to the doBlock routine.
    1713     CreateDoBlockMethodCall(b);
    1714 }
    1715 
    1716 void BlockOrientedKernel::CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & b) {
    1717     if (b->supportsIndirectBr()) {
    1718         BasicBlock * const bb = b->CreateBasicBlock("resume");
    1719         mStrideLoopBranch->addDestination(bb);
    1720         BasicBlock * const current = b->GetInsertBlock();
    1721         mStrideLoopTarget->addIncoming(BlockAddress::get(bb), current);
    1722         mStrideBlockIndex->addIncoming(b->getSize(0), current);
    1723         b->CreateBr(mStrideLoopBody);
    1724         bb->moveAfter(current);
    1725         b->SetInsertPoint(bb);
    1726     } else {
    1727         std::vector<Value *> args;
    1728         args.reserve(1 + mAvailableItemCount.size());
    1729         args.push_back(getInstance());
    1730         args.insert(args.end(), mAvailableItemCount.begin(), mAvailableItemCount.end());
    1731         b->CreateCall(mDoBlockMethod, args);
    1732     }
     724void SegmentOrientedKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) {
     725    mTreatUnsafeKernelOperationsAsErrors = false;
     726    generateDoSegmentMethod(b);
    1733727}
    1734728
     
    1753747                  , std::move(internal_scalars))
    1754748, mCurrentMethod(nullptr)
    1755 , mAvailablePrincipalItemCount(nullptr)
    1756749, mStride(0)
     750, mTreatUnsafeKernelOperationsAsErrors(false)
    1757751, mIsFinal(nullptr)
    1758752, mOutputScalarResult(nullptr)
     
    1762756
    1763757Kernel::~Kernel() {
    1764 
    1765 }
    1766 
    1767 // MULTI-BLOCK KERNEL CONSTRUCTOR
    1768 MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
    1769                                    Bindings && stream_inputs,
    1770                                    Bindings && stream_outputs,
    1771                                    Bindings && scalar_parameters,
    1772                                    Bindings && scalar_outputs,
    1773                                    Bindings && internal_scalars)
    1774 : Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
    1775 
    1776 }
    1777 
    1778 // CONSTRUCTOR
    1779 BlockOrientedKernel::BlockOrientedKernel(std::string && kernelName,
    1780                                          Bindings && stream_inputs,
    1781                                          Bindings && stream_outputs,
    1782                                          Bindings && scalar_parameters,
    1783                                          Bindings && scalar_outputs,
    1784                                          Bindings && internal_scalars)
    1785 : MultiBlockKernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
    1786 , mDoBlockMethod(nullptr)
    1787 , mStrideLoopBody(nullptr)
    1788 , mStrideLoopBranch(nullptr)
    1789 , mStrideLoopTarget(nullptr)
    1790 , mStrideBlockIndex(nullptr) {
    1791758
    1792759}
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r5883 r5985  
    1010#include <boost/container/flat_map.hpp>
    1111
     12namespace llvm { class AllocaInst; }
    1213namespace llvm { class BasicBlock; }
    1314namespace llvm { class Constant; }
     
    2728class Kernel : public KernelInterface {
    2829    friend class KernelBuilder;
    29 protected:
    30 
    31     static const std::string DO_BLOCK_SUFFIX;
    32     static const std::string FINAL_BLOCK_SUFFIX;
    33     static const std::string MULTI_BLOCK_SUFFIX;
    34     static const std::string LOGICAL_SEGMENT_NO_SCALAR;
    35     static const std::string PROCESSED_ITEM_COUNT_SUFFIX;
    36     static const std::string CONSUMED_ITEM_COUNT_SUFFIX;
    37     static const std::string PRODUCED_ITEM_COUNT_SUFFIX;
    38     static const std::string TERMINATION_SIGNAL;
    39     static const std::string BUFFER_PTR_SUFFIX;
    40     static const std::string CONSUMER_SUFFIX;
    41     static const std::string CYCLECOUNT_SCALAR;
    42 
    4330public:
    4431   
     
    117104
    118105    ProcessingRate::RateValue getUpperBound(const ProcessingRate & rate) const;
     106
     107    bool requiresCopyBack(const Binding & binding) const;
     108
     109    bool strideOffsetIsTriviallyCalculable(const Binding & binding) const;
     110
     111    bool permitsNonLinearAccess(const Binding & binding) const;
     112
     113    bool requiresLinearAccess(const Binding & binding) const;
     114
     115    bool anyBindingRequiresLinearSpace() const;
    119116
    120117    const StreamSetBuffers & getStreamSetInputBuffers() const {
     
    198195    virtual ~Kernel() = 0;
    199196
    200     void prepareKernel(const std::unique_ptr<KernelBuilder> & idb);
    201 
    202     void prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb);
    203 
    204     std::string getCacheName(const std::unique_ptr<KernelBuilder> & idb) const;
    205 
    206 protected:
    207 
    208     virtual void addInternalKernelProperties(const std::unique_ptr<KernelBuilder> & idb) { }
     197    void prepareKernel(const std::unique_ptr<KernelBuilder> & b);
     198
     199    void prepareCachedKernel(const std::unique_ptr<KernelBuilder> & b);
     200
     201    std::string getCacheName(const std::unique_ptr<KernelBuilder> & b) const;
     202
     203    bool mayTerminateEarly() const {
     204        return hasAttribute(kernel::Attribute::KindId::CanTerminateEarly);
     205    }
     206
     207    bool mustExplicitlyTerminate() const {
     208        return hasAttribute(kernel::Attribute::KindId::MustExplicitlyTerminate);
     209    }
     210
     211protected:
     212
     213    virtual void addInternalKernelProperties(const std::unique_ptr<KernelBuilder> &) { }
    209214
    210215    void getDoSegmentFunctionArguments(const std::vector<llvm::Value *> & availItems) const;
     
    217222          Bindings && internal_scalars);
    218223
    219     llvm::Value * getPrincipalItemCount() const {
    220         return mAvailablePrincipalItemCount;
    221     }
    222 
    223224    unsigned getScalarIndex(const std::string & name) const;
    224225
    225     void prepareStreamSetNameMap();
    226    
    227226    void linkExternalMethods(const std::unique_ptr<kernel::KernelBuilder> &) override { }
    228227
    229     virtual void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) { }
    230    
    231     virtual void generateKernelMethod(const std::unique_ptr<KernelBuilder> & iBuilder) = 0;
    232 
    233     virtual void generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & iBuilder) { }
     228    virtual void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> &) { }
     229   
     230    virtual void generateKernelMethod(const std::unique_ptr<KernelBuilder> &) = 0;
     231
     232    virtual void generateFinalizeMethod(const std::unique_ptr<KernelBuilder> &) { }
    234233
    235234    // Add an additional scalar field to the KernelState struct.
     
    239238    unsigned addUnnamedScalar(llvm::Type * type);
    240239
    241     void callGenerateInitializeMethod(const std::unique_ptr<KernelBuilder> & idb);
    242 
    243     void callGenerateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & idb);
    244 
    245     void callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb);
     240    void callGenerateInitializeMethod(const std::unique_ptr<KernelBuilder> & b);
     241
     242    void callGenerateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & b);
     243
     244    void callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & b);
    246245
    247246    const parabix::StreamSetBuffer * getAnyStreamSetBuffer(const std::string & name) const {
     
    249248        std::tie(port, index) = getStreamPort(name);
    250249        if (port == Port::Input) {
    251             assert (index < mStreamSetInputBuffers.size());
    252             assert (mStreamSetInputBuffers[index]);
    253             return mStreamSetInputBuffers[index];
     250            return getStreamSetInputBuffer(index);
    254251        } else {
    255             assert (index < mStreamSetOutputBuffers.size());
    256             assert (mStreamSetOutputBuffers[index]);
    257             return mStreamSetOutputBuffers[index];
     252            return getStreamSetOutputBuffer(index);
    258253        }
    259254    }
     
    261256    void setStride(unsigned stride) { mStride = stride; }
    262257
     258    bool treatUnsafeKernelOperationsAsErrors() const { return mTreatUnsafeKernelOperationsAsErrors; }
     259
     260    bool mustClearOverflowPriorToCopyback(const Binding & binding) const;
     261
    263262private:
    264263
    265     void addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb);
    266 
    267     llvm::Value * getStreamSetInputAddress(const std::string & name) const {
    268         const Kernel::StreamPort p = getStreamPort(name);
    269         assert (p.first == Port::Input);
    270         return mStreamSetInputBaseAddress[p.second];
    271     }
    272 
    273     llvm::Value * getStreamSetOutputAddress(const std::string & name) const {
    274         const Kernel::StreamPort p = getStreamPort(name);
    275         assert (p.first == Port::Output);
    276         return mStreamSetOutputBaseAddress[p.second];
    277     }
     264    void addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & b);
    278265
    279266    llvm::Value * getAvailableItemCount(const unsigned i) const {
     267        assert (i < mAvailableItemCount.size());
    280268        return mAvailableItemCount[i];
    281269    }
    282270
    283     void normalizeStreamProcessingRates();
    284 
    285     bool normalizeRelativeToFixedProcessingRate(const ProcessingRate & base, ProcessingRate & toUpdate);
    286 
    287 protected:
    288 
    289     llvm::Function *                    mCurrentMethod;
    290     llvm::Value *                       mAvailablePrincipalItemCount;   
     271    bool verifyBufferSize(const Binding & binding, const parabix::StreamSetBuffer * const buffer) const;
     272
     273    void verifyStreamSetDefinitions() const;
     274
     275protected:
     276
     277    llvm::Function *                    mCurrentMethod;
    291278    unsigned                            mStride;
     279    bool                                mTreatUnsafeKernelOperationsAsErrors;
    292280    llvm::Value *                       mIsFinal;
    293281    llvm::Value *                       mOutputScalarResult;
     
    301289    StreamMap                           mStreamMap;
    302290
    303     // TODO: once the kernel no longer needs to be aware of what type of buffers its working with,
    304     // these should be removed from the Kernel class and put into the Pipeline
    305291    StreamSetBuffers                    mStreamSetInputBuffers;
    306     std::vector<llvm::Value *>          mStreamSetInputBaseAddress;
    307292    StreamSetBuffers                    mStreamSetOutputBuffers;
    308     std::vector<llvm::Value *>          mStreamSetOutputBaseAddress;
    309293};
    310294
     
    420404
    421405class MultiBlockKernel : public Kernel {
     406    friend class BlockOrientedKernel;
    422407protected:
    423408
     
    445430    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
    446431
    447     unsigned getItemAlignment(const Binding & binding) const;
    448 
    449     unsigned getCopyAlignment(const Binding & binding) const;
    450 
    451     bool isTransitivelyUnknownRate(const ProcessingRate & rate) const;
    452 
    453     bool requiresTemporaryInputBuffer(const Binding & binding, const ProcessingRate & rate) const;
    454 
    455     bool requiresTemporaryOutputBuffer(const Binding & binding, const ProcessingRate & rate) const;
    456 
    457     llvm::Value * getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate);
    458 
    459     bool requiresCopyBack(const ProcessingRate & rate) const;
    460 
    461     void reviseFinalProducedItemCounts(const std::unique_ptr<KernelBuilder> & b);
    462 
    463 protected:
     432    void writeMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b);
     433
     434    void checkInputStream(const std::unique_ptr<KernelBuilder> & b, const unsigned index);
     435
     436    void checkOutputStream(const std::unique_ptr<KernelBuilder> & b, const unsigned index);
     437
     438    llvm::Value * computePopCountRate(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate, llvm::Value * const maxItems);
     439
     440    llvm::Value * getPopCountRateItems(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate, llvm::Value * const strideIndex);
     441
     442    void prepareOverflowBuffers(const std::unique_ptr<KernelBuilder> & b);
     443
     444    void writeCopyBackLogic(const std::unique_ptr<KernelBuilder> & b);
     445
     446    void calculateDerivedItemCounts(const std::unique_ptr<KernelBuilder> & b);
     447
     448    void updateDerivedItemCounts(const std::unique_ptr<KernelBuilder> & b);
     449
     450    llvm::Value * hasAnotherStride(const std::unique_ptr<KernelBuilder> & b);
     451
     452    void updateFinalDerivedItemCounts(const std::unique_ptr<KernelBuilder> & b);
     453
     454    void checkTerminationSignal(const std::unique_ptr<KernelBuilder> & b);
     455
     456    static bool hasDerivedItemCount(const Binding & binding);
     457
     458protected:
     459
     460    llvm::Value *                   mInitiallyFinal;
     461    llvm::Value *                   mNumOfStrides;
     462    llvm::Value *                   mNumOfStridesInFinalSegment;
    464463
    465464    std::vector<llvm::Value *>      mInitialAvailableItemCount;
     465
    466466    std::vector<llvm::Value *>      mInitialProcessedItemCount;
     467    std::vector<llvm::Value *>      mAccessibleInputItems;
     468    std::vector<llvm::Value *>      mInputStrideLength;
     469    std::vector<llvm::Value *>      mPopCountRateArray;
     470
    467471    std::vector<llvm::Value *>      mInitialProducedItemCount;
    468 
     472    std::vector<llvm::Value *>      mWritableOutputItems;
     473    std::vector<llvm::Value *>      mOutputStrideLength;
    469474};
    470475
     
    495500                        Bindings && internal_scalars);
    496501
     502    llvm::Value * getRemainingItems(const std::unique_ptr<KernelBuilder> & b);
     503
    497504private:
    498505
     
    503510    void writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
    504511
    505     llvm::Value * getRemainingItems(const std::unique_ptr<KernelBuilder> & b);
     512    llvm::Value * incrementDerivedItemCounts(const std::unique_ptr<KernelBuilder> & b);
    506513
    507514private:
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r5967 r5985  
    4545
    4646Value * KernelBuilder::getStreamHandle(const std::string & name) {
    47     Value * const ptr = getScalarField(name + Kernel::BUFFER_PTR_SUFFIX);
     47    Value * const ptr = getScalarField(name + BUFFER_SUFFIX);
    4848    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    4949        CreateAssert(ptr, name + " handle cannot be null!");
     
    5353
    5454LoadInst * KernelBuilder::acquireLogicalSegmentNo() {
    55     return CreateAtomicLoadAcquire(getScalarFieldPtr(Kernel::LOGICAL_SEGMENT_NO_SCALAR));
    56 }
    57 
    58 void KernelBuilder::releaseLogicalSegmentNo(Value * nextSegNo) {
    59     CreateAtomicStoreRelease(nextSegNo, getScalarFieldPtr(Kernel::LOGICAL_SEGMENT_NO_SCALAR));
     55    return CreateAtomicLoadAcquire(getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR));
     56}
     57
     58void KernelBuilder::releaseLogicalSegmentNo(Value * const nextSegNo) {
     59    CreateAtomicStoreRelease(nextSegNo, getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR));
    6060}
    6161
    6262Value * KernelBuilder::getCycleCountPtr() {
    63     return getScalarFieldPtr(Kernel::CYCLECOUNT_SCALAR);
    64 }
    65 
    66 Value * KernelBuilder::getInternalItemCount(const std::string & name, const std::string & suffix) {
     63    return getScalarFieldPtr(CYCLECOUNT_SCALAR);
     64}
     65
     66Value * KernelBuilder::getNamedItemCount(const std::string & name, const std::string & suffix) {
    6767    const ProcessingRate & rate = mKernel->getBinding(name).getRate();
    6868    Value * itemCount = nullptr;
     
    7575            itemCount = getProducedItemCount(rate.getReference());
    7676        }
    77         const auto & r = rate.getRate();
    78         if (r.numerator() != 1) {
    79             itemCount = CreateMul(itemCount, ConstantInt::get(itemCount->getType(), r.numerator()));
    80         }
    81         if (r.denominator() != 1) {
    82             itemCount = CreateExactUDiv(itemCount, ConstantInt::get(itemCount->getType(), r.denominator()));
    83         }
     77        itemCount = CreateMul2(itemCount, rate.getRate());
    8478    } else {
    8579        itemCount = getScalarField(name + suffix);
     
    8882}
    8983
    90 void KernelBuilder::setInternalItemCount(const std::string & name, const std::string & suffix, llvm::Value * const value) {
     84void KernelBuilder::setNamedItemCount(const std::string & name, const std::string & suffix, llvm::Value * const value) {
    9185    const ProcessingRate & rate = mKernel->getBinding(name).getRate();
    92     if (LLVM_UNLIKELY(rate.isDerived())) {
    93         assert (false);
    94         report_fatal_error("Cannot set item count: " + name + " is a Derived rate");
     86    const auto safetyCheck = mKernel->treatUnsafeKernelOperationsAsErrors();
     87    if (LLVM_UNLIKELY(rate.isDerived() && safetyCheck)) {
     88        report_fatal_error("Cannot set item count: " + name + " is a derived rate stream");
    9589    }
    9690    if (codegen::DebugOptionIsSet(codegen::TraceCounts)) {
    9791        CallPrintIntToStderr(mKernel->getName() + ": " + name + suffix, value);
     92    }
     93    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts) && safetyCheck)) {
     94        Value * const current = getScalarField(name + suffix);
     95        CreateAssert(CreateICmpUGE(value, current), name + " " + suffix + " must be monotonically non-decreasing");
    9896    }
    9997    setScalarField(name + suffix, value);
     
    112110
    113111Value * KernelBuilder::getTerminationSignal() {
    114     return CreateICmpNE(getScalarField(Kernel::TERMINATION_SIGNAL), getSize(0));
     112    return CreateICmpNE(getScalarField(TERMINATION_SIGNAL), getSize(0));
    115113}
    116114
     
    120118        CallPrintIntToStderr(mKernel->getName() + ": setTerminationSignal", value);
    121119    }
    122     setScalarField(Kernel::TERMINATION_SIGNAL, CreateZExt(value, getSizeTy()));
     120    setScalarField(TERMINATION_SIGNAL, CreateZExt(value, getSizeTy()));
    123121}
    124122
     
    134132
    135133/** ------------------------------------------------------------------------------------------------------------- *
    136  * @brief isConstantZero
    137  ** ------------------------------------------------------------------------------------------------------------- */
    138 inline bool isConstantZero(Value * const v) {
    139     return isa<ConstantInt>(v) && cast<ConstantInt>(v)->isNullValue();
    140 }
    141 
    142 /** ------------------------------------------------------------------------------------------------------------- *
    143  * @brief isConstantOne
    144  ** ------------------------------------------------------------------------------------------------------------- */
    145 inline bool isConstantOne(Value * const v) {
    146     return isa<ConstantInt>(v) && cast<ConstantInt>(v)->isOne();
     134 * @brief CreatePrepareOverflow
     135 ** ------------------------------------------------------------------------------------------------------------- */
     136void KernelBuilder::CreatePrepareOverflow(const std::string & name) {
     137    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     138    assert (buf->supportsCopyBack());
     139    Constant * const overflowSize = ConstantExpr::getSizeOf(buf->getType());
     140    Value * const handle = getStreamHandle(name);
     141    // TODO: handle non constant stream set counts
     142    assert (isa<Constant>(buf->getStreamSetCount(this, handle)));
     143    Value * const base = buf->getBaseAddress(this, handle);
     144    Value * const overflow = buf->getOverflowAddress(this, handle);
     145    const auto blockSize = getBitBlockWidth() / 8;
     146    CreateMemZero(overflow, overflowSize, blockSize);
     147    CreateMemZero(base, overflowSize, blockSize);
    147148}
    148149
     
    150151 * @brief getItemWidth
    151152 ** ------------------------------------------------------------------------------------------------------------- */
    152 inline unsigned getItemWidth(const Type * ty) {
     153inline unsigned LLVM_READNONE getItemWidth(const Type * ty ) {
    153154    if (LLVM_LIKELY(isa<ArrayType>(ty))) {
    154155        ty = ty->getArrayElementType();
     
    157158}
    158159
    159 inline static unsigned ceil_log2(const unsigned v) {
    160     assert ("log2(0) is undefined!" && v != 0);
    161     return (sizeof(unsigned) * CHAR_BIT) - __builtin_clz(v - 1U);
    162 }
    163 
    164 /** ------------------------------------------------------------------------------------------------------------- *
    165  * @brief CreateStreamCpy
    166  ** ------------------------------------------------------------------------------------------------------------- */
    167 void KernelBuilder::CreateStreamCpy(const std::string & name, Value * target, Value * targetOffset, Value * source, Value * sourceOffset, Value * itemsToCopy, const unsigned itemAlignment) {
    168 
    169     assert (target && targetOffset);
    170     assert (source && sourceOffset);
    171     // assert (target->getType() == source->getType());
    172     assert (target->getType()->isPointerTy());
    173     assert (isConstantZero(targetOffset) || isConstantZero(sourceOffset));
    174     const StreamSetBuffer * const buffer = mKernel->getAnyStreamSetBuffer(name);
    175     const auto itemWidth = getItemWidth(buffer->getBaseType());
    176     assert ("invalid item width" && is_power_2(itemWidth));
    177     const auto blockWidth = getBitBlockWidth();
    178     // Although our item width may be n bits, if we know we're always processing m items per block, our field width
    179     // (w.r.t the stream copy) would be n*m. By taking this into account we can optimize and simplify the copy code.
    180     const auto fieldWidth = std::min(1U << ceil_log2(itemWidth * itemAlignment), blockWidth);
    181     assert ((blockWidth % fieldWidth) == 0);
    182 
    183     if (LLVM_LIKELY(itemWidth < fieldWidth)) {
    184         const auto factor = fieldWidth / itemWidth;
    185         Constant * const FACTOR = getSize(factor);
    186         if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    187             const auto kernelName = mKernel->getName()+ ": " + name;
    188             if (fieldWidth > 8) {
    189                 const auto alignment = (fieldWidth + 7) / 8;
    190                 ConstantInt * const ALIGNMENT = getSize(alignment);
    191                 CreateAssertZero(CreateURem(CreatePtrToInt(target, getSizeTy()), ALIGNMENT), kernelName + " target is misaligned (" + std::to_string(alignment) + ")");
    192                 CreateAssertZero(CreateURem(CreatePtrToInt(source, getSizeTy()), ALIGNMENT), kernelName + " source is misaligned (" + std::to_string(alignment) + ")");
    193             }
    194             CreateAssertZero(CreateURem(targetOffset, FACTOR), kernelName + " target offset is misaligned (" + std::to_string(factor) + ")");
    195             CreateAssertZero(CreateURem(sourceOffset, FACTOR), kernelName + " source offset is misaligned (" + std::to_string(factor) + ")");
    196         }
    197         targetOffset = CreateUDiv(targetOffset, FACTOR);
    198         sourceOffset = CreateUDiv(sourceOffset, FACTOR);
    199     }
    200 
    201     /*
    202        Streams are conceptually modelled as:
    203 
    204                                             BLOCKS
    205 
    206                                       A     B     C     D
    207            STREAM SET ELEMENT   1  |aaaaa|bbbbb|ccccc|dddd |
    208                                 2  |eeeee|fffff|ggggg|hhhh |
    209                                 3  |iiiii|jjjjj|kkkkk|llll |
    210 
    211        But the memory layout is actually:
    212 
    213            A_1   A_2   A_3   B_1   B_2   B_3   C_1   C_2   C_3   D_1   D_2   D_3
    214 
    215          |aaaaa|eeeee|iiiii|bbbbb|fffff|jjjjj|ccccc|ggggg|kkkkk|dddd |hhhh |llll |
    216 
    217 
    218        So if we're copying the entire stream set block or our stream set has one element, we can use memcpy.
    219 
    220        One compilication here is when the BlockSize of a stream is not equal to the BitBlockWidth.
    221 
    222 
    223     */
    224 
    225     Value * const n = buffer->getStreamSetCount(this, getStreamHandle(name));
    226     if (((isConstantOne(n) && fieldWidth >= 8) || fieldWidth == blockWidth || (isConstantZero(targetOffset) && isConstantZero(sourceOffset)))) {
    227         if (LLVM_LIKELY(itemWidth < 8)) {
    228             itemsToCopy = CreateUDivCeil(itemsToCopy, getSize(8 / itemWidth));
    229         } else if (LLVM_UNLIKELY(itemWidth > 8)) {
    230             itemsToCopy = CreateMul(itemsToCopy, getSize(itemWidth / 8));
    231         }
    232         if (!isConstantOne(n)) {
    233             itemsToCopy = CreateMul(itemsToCopy, n);
    234         }
    235         PointerType * const ptrTy = getIntNTy(fieldWidth)->getPointerTo();
    236         target = CreateGEP(CreatePointerCast(target, ptrTy), targetOffset);
    237         source = CreateGEP(CreatePointerCast(source, ptrTy), sourceOffset);
    238         const auto alignment = (fieldWidth + 7) / 8;
    239         CreateMemCpy(target, source, itemsToCopy, alignment);
    240 
    241     } else { // either the target offset or source offset is non-zero but not both
    242         VectorType * const blockTy = getBitBlockType();
    243         PointerType * const blockPtrTy = blockTy->getPointerTo();
    244         Constant * const BLOCK_WIDTH = getSize(blockWidth);
    245         target = CreatePointerCast(target, blockPtrTy);
    246         target = CreateGEP(target, CreateUDiv(targetOffset, BLOCK_WIDTH));
    247         source = CreatePointerCast(source, blockPtrTy);
    248         source = CreateGEP(source, CreateUDiv(sourceOffset, BLOCK_WIDTH));
    249         const auto alignment = blockWidth / 8;
    250         Constant * const ZERO = getSize(0);
    251         Constant * const ONE = getSize(1);
    252 
    253         BasicBlock * const entry = GetInsertBlock();
    254 
    255         // TODO: this code isn't correct. I was hoping to shift by fieldwidth units to give LLVM
    256         // the ability to better select
    257 
    258         if (isConstantZero(targetOffset)) {
    259 
    260             /*
    261                                                 BLOCKS
    262 
    263                                           A     B     C     D
    264                SOURCE STREAM        1  |aaa--|bbbBB|cccCC|  dDD|
    265                                     2  |eee--|fffFF|gggGG|  hHH|
    266                                     3  |iii--|jjjJJ|kkkKK|  lLL|
    267 
    268 
    269                                           A     B     C     D
    270                TARGET STREAM        1  |BBaaa|CCbbb|DDccc|    d|
    271                                     2  |FFeee|GGfff|HHggg|    h|
    272                                     3  |JJiii|KKjjj|LLkkk|    l|
    273             */
    274 
    275             sourceOffset = CreateURem(sourceOffset, BLOCK_WIDTH);
    276 
    277             Value * const borrowOffset = CreateSub(BLOCK_WIDTH, sourceOffset);
    278             BasicBlock * const streamCopy = CreateBasicBlock();
    279             BasicBlock * const streamCopyRemainingCond = CreateBasicBlock();
    280             BasicBlock * const streamCopyRemaining = CreateBasicBlock();
    281             BasicBlock * const streamCopyEnd = CreateBasicBlock();
    282 
    283             Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, BLOCK_WIDTH), n);
    284             CreateCondBr(CreateICmpNE(blocksToCopy, ZERO), streamCopy, streamCopyRemainingCond);
    285 
    286             SetInsertPoint(streamCopy);
    287             PHINode * const i = CreatePHI(getSizeTy(), 2);
    288             i->addIncoming(n, entry);
    289             Value * Ai = CreateAlignedLoad(CreateGEP(source, CreateSub(i, n)), alignment);
    290             Ai = mvmd_srl(fieldWidth, Ai, borrowOffset);
    291             Value * Bi = CreateAlignedLoad(CreateGEP(source, i), alignment);
    292             Bi = mvmd_sll(fieldWidth, Bi, sourceOffset);
    293             CreateAlignedStore(CreateOr(Bi, Ai), CreateGEP(target, i), alignment);
    294             Value * const next_i = CreateAdd(i, ONE);
    295             i->addIncoming(next_i, streamCopy);
    296             CreateCondBr(CreateICmpNE(next_i, blocksToCopy), streamCopy, streamCopyRemainingCond);
    297 
    298             SetInsertPoint(streamCopyRemainingCond);
    299             Value * const partialBlocksToCopy = CreateAdd(blocksToCopy, n);
    300             Value * const remainingItemsToCopy = CreateURem(itemsToCopy, BLOCK_WIDTH);
    301             CreateLikelyCondBr(CreateIsNotNull(remainingItemsToCopy), streamCopyRemaining, streamCopyEnd);
    302 
    303             SetInsertPoint(streamCopyRemaining);
    304             PHINode * const j = CreatePHI(getSizeTy(), 2);
    305             j->addIncoming(blocksToCopy, streamCopyRemainingCond);
    306             Value * Aj = CreateAlignedLoad(CreateGEP(source, j), alignment);
    307             Aj = mvmd_srl(fieldWidth, Aj, borrowOffset);
    308             CreateAlignedStore(Aj, CreateGEP(target, j), alignment);
    309             Value * const next_j = CreateAdd(j, ONE);
    310             j->addIncoming(next_j, streamCopyRemaining);
    311             CreateCondBr(CreateICmpNE(next_j, partialBlocksToCopy), streamCopyRemaining, streamCopyEnd);
    312 
    313             SetInsertPoint(streamCopyEnd);
    314 
    315         } else if (isConstantZero(sourceOffset)) {
    316 
    317             /*
    318                                                 BLOCKS
    319 
    320                                           A     B     C     D
    321                SOURCE STREAM        1  |AAAaa|BBBaa|CCCcc|    d|
    322                                     2  |EEEee|FFFff|GGGgg|    h|
    323                                     3  |IIIii|JJJjj|KKKkk|    l|
    324 
    325 
    326                                           A     B     C     D
    327                TARGET STREAM        1  |--XXX|-----|-----|-----|
    328                                     2  |--YYY|-----|-----|-----|
    329                                     3  |--ZZZ|-----|-----|-----|
    330 
    331                                           A     B     C     D
    332                OUTPUT STREAM        1  |aaXXX|bbAAA|ccBBB| dCCC|
    333                                     2  |eeYYY|ffEEE|ggFFF| hGGG|
    334                                     3  |iiZZZ|jjIII|kkJJJ| lKKK|
    335 
    336             */
    337 
    338             BasicBlock * const streamCopy = CreateBasicBlock();
    339             BasicBlock * const streamCopyRemainingCond = CreateBasicBlock();
    340             BasicBlock * const streamCopyRemaining = CreateBasicBlock();
    341             BasicBlock * const streamCopyEnd = CreateBasicBlock();
    342 
    343             targetOffset = CreateURem(targetOffset, BLOCK_WIDTH);
    344 
    345             Value * const carryOffset = CreateSub(BLOCK_WIDTH, targetOffset);
    346             Value * const mask = mvmd_srl(fieldWidth, Constant::getAllOnesValue(blockTy), carryOffset);
    347             CreateBr(streamCopy);
    348 
    349             SetInsertPoint(streamCopy);
    350             PHINode * const i = CreatePHI(getSizeTy(), 2);
    351             i->addIncoming(ZERO, entry);
    352             Value * A0 = CreateAlignedLoad(CreateGEP(target, i), alignment);
    353             A0 = CreateAnd(A0, mask);
    354             Value * Ai = CreateAlignedLoad(CreateGEP(source, i), alignment);
    355             Ai = mvmd_sll(fieldWidth, Ai, targetOffset);
    356             CreateAlignedStore(CreateOr(Ai, A0), CreateGEP(target, i), alignment);
    357             Value * const next_i = CreateAdd(i, ONE);
    358             i->addIncoming(next_i, streamCopy);
    359             CreateCondBr(CreateICmpNE(next_i, n), streamCopy, streamCopyRemainingCond);
    360 
    361             SetInsertPoint(streamCopyRemainingCond);
    362             Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, BLOCK_WIDTH), n);
    363             CreateCondBr(CreateICmpUGT(blocksToCopy, n), streamCopyRemaining, streamCopyEnd);
    364 
    365             SetInsertPoint(streamCopyRemaining);
    366             PHINode * const j = CreatePHI(getSizeTy(), 2);
    367             j->addIncoming(n, streamCopyRemainingCond);
    368             Value * Aj = CreateAlignedLoad(CreateGEP(source, CreateSub(j, n)), alignment);
    369             Aj = mvmd_srl(fieldWidth, Aj, carryOffset);
    370             Value * Bj = CreateAlignedLoad(CreateGEP(source, j), alignment);
    371             Bj = mvmd_sll(fieldWidth, Bj, targetOffset);
    372             CreateAlignedStore(CreateOr(Bj, Aj), CreateGEP(target, j), alignment);
    373             Value * const next_j = CreateAdd(j, ONE);
    374             j->addIncoming(next_j, streamCopyRemaining);
    375             CreateCondBr(CreateICmpNE(next_j, blocksToCopy), streamCopyRemaining, streamCopyEnd);
    376 
    377             SetInsertPoint(streamCopyEnd);
    378         }
    379     }
     160/** ------------------------------------------------------------------------------------------------------------- *
     161 * @brief CreateNonLinearCopyFromOverflow
     162 ** ------------------------------------------------------------------------------------------------------------- */
     163void KernelBuilder::CreateNonLinearCopyFromOverflow(const Binding & output, llvm::Value * const itemsToCopy, Value * overflowOffset) {
     164
     165    Value * const handle = getStreamHandle(output.getName());
     166    Type * const bitBlockPtrTy = getBitBlockType()->getPointerTo();
     167    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(output.getName());
     168    assert (buf->supportsCopyBack());
     169    Value * const target = CreatePointerCast(buf->getBaseAddress(this, handle), bitBlockPtrTy);
     170    Value * const source = CreatePointerCast(buf->getOverflowAddress(this, handle), bitBlockPtrTy);
     171    const auto blockSize = getBitBlockWidth() / 8;
     172    Constant * const BLOCK_WIDTH = getSize(getBitBlockWidth());
     173    Constant * const ITEM_WIDTH = getSize(getItemWidth(buf->getBaseType()));
     174    Value * const streamCount = buf->getStreamSetCount(this, handle);
     175
     176    // If we have a computed overflow position, the base and overflow regions were not speculatively zeroed out prior
     177    // to the kernel writing over them. To handle them, we compute a mask of valid items and exclude any bit not in
     178    // them before OR-ing together the streams.
     179    if (overflowOffset) {
     180
     181        overflowOffset = CreateMul(overflowOffset, ITEM_WIDTH);
     182        Value * targetMask = bitblock_mask_from(CreateURem(overflowOffset, BLOCK_WIDTH));
     183        Value * sourceMask = CreateNot(targetMask);
     184        Value * const overflowBlockCount = CreateUDiv(overflowOffset, BLOCK_WIDTH);
     185        Value * const blockOffset = CreateMul(overflowBlockCount, streamCount);
     186        Value * const fullCopyLength = CreateMul(blockOffset, getSize(blockSize));
     187        CreateMemCpy(target, source, fullCopyLength, blockSize);
     188
     189        BasicBlock * const partialCopyEntry = GetInsertBlock();
     190        BasicBlock * const partialCopyLoop = CreateBasicBlock();
     191        BasicBlock * const partialCopyExit = CreateBasicBlock();
     192
     193        Value * const partialBlockCount = CreateAdd(blockOffset, streamCount);
     194        CreateBr(partialCopyLoop);
     195
     196        SetInsertPoint(partialCopyLoop);
     197        PHINode * const blockIndex = CreatePHI(getSizeTy(), 2);
     198        blockIndex->addIncoming(blockOffset, partialCopyEntry);
     199        Value * const sourcePtr = CreateGEP(source, blockIndex);
     200        Value * sourceValue = CreateBlockAlignedLoad(sourcePtr);
     201        sourceValue = CreateAnd(sourceValue, sourceMask);
     202        Value * const targetPtr = CreateGEP(target, blockIndex);
     203        Value * targetValue = CreateBlockAlignedLoad(targetPtr);
     204        targetValue = CreateAnd(targetValue, targetMask);
     205        targetValue = CreateOr(targetValue, sourceValue);
     206        CreateBlockAlignedStore(targetValue, targetPtr);
     207        Value * const nextBlockIndex = CreateAdd(blockIndex, getSize(1));
     208        blockIndex->addIncoming(nextBlockIndex, partialCopyLoop);
     209        CreateCondBr(CreateICmpNE(nextBlockIndex, partialBlockCount), partialCopyLoop, partialCopyExit);
     210
     211        SetInsertPoint(partialCopyExit);
     212
     213    } else {
     214
     215        BasicBlock * const mergeCopyEntry = GetInsertBlock();
     216        BasicBlock * const mergeCopyLoop = CreateBasicBlock();
     217        BasicBlock * const mergeCopyExit = CreateBasicBlock();
     218
     219        Value * blocksToCopy = CreateCeilUDiv(itemsToCopy, BLOCK_WIDTH);
     220        blocksToCopy = CreateMul(blocksToCopy, ITEM_WIDTH);
     221        blocksToCopy = CreateMul(blocksToCopy, streamCount);
     222
     223        CreateBr(mergeCopyLoop);
     224
     225        SetInsertPoint(mergeCopyLoop);
     226        PHINode * const blockIndex = CreatePHI(getSizeTy(), 2);
     227        blockIndex->addIncoming(getSize(0), mergeCopyEntry);
     228        Value * const sourcePtr = CreateGEP(source, blockIndex);
     229        Value * const sourceValue = CreateBlockAlignedLoad(sourcePtr);
     230        Value * const targetPtr = CreateGEP(target, blockIndex);
     231        Value * targetValue = CreateBlockAlignedLoad(targetPtr);
     232        targetValue = CreateOr(targetValue, sourceValue);
     233        CreateBlockAlignedStore(targetValue, targetPtr);
     234        Value * const nextBlockIndex = CreateAdd(blockIndex, getSize(1));
     235        blockIndex->addIncoming(nextBlockIndex, mergeCopyLoop);
     236        CreateCondBr(CreateICmpNE(nextBlockIndex, blocksToCopy), mergeCopyLoop, mergeCopyExit);
     237
     238        SetInsertPoint(mergeCopyExit);
     239    }
     240
     241
     242
     243}
     244
     245/** ------------------------------------------------------------------------------------------------------------- *
     246 * @brief CreateCopyFromOverflow
     247 ** ------------------------------------------------------------------------------------------------------------- */
     248void KernelBuilder::CreateCopyFromOverflow(const Binding & output, llvm::Value * const itemsToCopy) {
     249
     250    Value * const handle = getStreamHandle(output.getName());
     251    Type * const bitBlockPtrTy = getBitBlockType()->getPointerTo();
     252    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(output.getName());
     253    assert (buf->supportsCopyBack());
     254    Value * const target = CreatePointerCast(buf->getBaseAddress(this, handle), bitBlockPtrTy);
     255    Value * const source = CreatePointerCast(buf->getOverflowAddress(this, handle), bitBlockPtrTy);
     256    Constant * const BLOCK_WIDTH = getSize(getBitBlockWidth());
     257    Constant * const ITEM_WIDTH = getSize(getItemWidth(buf->getBaseType()));
     258    Value * const streamCount = buf->getStreamSetCount(this, handle);
     259
     260    BasicBlock * const mergeCopyEntry = GetInsertBlock();
     261    BasicBlock * const mergeCopyLoop = CreateBasicBlock();
     262    BasicBlock * const mergeCopyExit = CreateBasicBlock();
     263
     264    Value * blocksToCopy = CreateCeilUDiv(itemsToCopy, BLOCK_WIDTH);
     265    blocksToCopy = CreateMul(blocksToCopy, ITEM_WIDTH);
     266    blocksToCopy = CreateMul(blocksToCopy, streamCount);
     267
     268    CreateBr(mergeCopyLoop);
     269
     270    SetInsertPoint(mergeCopyLoop);
     271    PHINode * const blockIndex = CreatePHI(getSizeTy(), 2);
     272    blockIndex->addIncoming(getSize(0), mergeCopyEntry);
     273    Value * const sourcePtr = CreateGEP(source, blockIndex);
     274    Value * const sourceValue = CreateBlockAlignedLoad(sourcePtr);
     275    Value * const targetPtr = CreateGEP(target, blockIndex);
     276    CreateBlockAlignedStore(sourceValue, targetPtr);
     277    Value * const nextBlockIndex = CreateAdd(blockIndex, getSize(1));
     278    blockIndex->addIncoming(nextBlockIndex, mergeCopyLoop);
     279    CreateCondBr(CreateICmpNE(nextBlockIndex, blocksToCopy), mergeCopyLoop, mergeCopyExit);
     280
     281    SetInsertPoint(mergeCopyExit);
     282}
     283
     284
     285/** ------------------------------------------------------------------------------------------------------------- *
     286 * @brief CreateCopyToOverflow
     287 ** ------------------------------------------------------------------------------------------------------------- */
     288void KernelBuilder::CreateCopyToOverflow(const std::string & name) {
     289    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     290    assert (buf->supportsCopyBack());
     291    Value * const handle = getStreamHandle(name);
     292    // TODO: handle non constant stream set counts
     293    assert (isa<Constant>(buf->getStreamSetCount(this, handle)));
     294    Value * const target = buf->getBaseAddress(this, handle);
     295    Value * const source = buf->getOverflowAddress(this, handle);
     296    Constant * const overflowSize = ConstantExpr::getSizeOf(buf->getType());
     297    CreateMemCpy(target, source, overflowSize, getBitBlockWidth() / 8);
    380298}
    381299
    382300Value * KernelBuilder::getConsumerLock(const std::string & name) {
    383     return getScalarField(name + Kernel::CONSUMER_SUFFIX);
    384 }
    385 
    386 void KernelBuilder::setConsumerLock(const std::string & name, Value * value) {
    387     setScalarField(name + Kernel::CONSUMER_SUFFIX, value);
    388 }
    389 
    390 Value * KernelBuilder::loadInputStreamBlock(const std::string & name, Value * streamIndex) {
    391     return CreateBlockAlignedLoad(getInputStreamBlockPtr(name, streamIndex));
    392 }
    393 
    394 Value * KernelBuilder::getInputStreamPackPtr(const std::string & name, Value * streamIndex, Value * packIndex) {
    395     Value * const addr = mKernel->getStreamSetInputAddress(name);
    396     if (addr) {
    397         return CreateGEP(addr, {getInt32(0), streamIndex, packIndex});
    398     } else {
    399         const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    400         Value * const blockIndex = CreateLShr(getProcessedItemCount(name), std::log2(getBitBlockWidth()));
    401         return buf->getStreamPackPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, packIndex, true);
    402     }
    403 }
    404 
    405 Value * KernelBuilder::loadInputStreamPack(const std::string & name, Value * streamIndex, Value * packIndex) {
    406     return CreateBlockAlignedLoad(getInputStreamPackPtr(name, streamIndex, packIndex));
     301    return getScalarField(name + CONSUMER_SUFFIX);
     302}
     303
     304void KernelBuilder::setConsumerLock(const std::string & name, Value * const value) {
     305    setScalarField(name + CONSUMER_SUFFIX, value);
     306}
     307
     308Value * KernelBuilder::getInputStreamBlockPtr(const std::string & name, Value * const streamIndex, Value * const blockOffset) {
     309    const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     310    Value * blockIndex = CreateLShr(getProcessedItemCount(name), std::log2(getBitBlockWidth()));
     311    if (blockOffset) {
     312        assert (blockOffset->getType() == blockIndex->getType());
     313        blockIndex = CreateAdd(blockIndex, blockOffset);
     314    }
     315    return buf->getStreamBlockPtr(this, getStreamHandle(name), streamIndex, blockIndex, true);
     316}
     317
     318Value * KernelBuilder::getInputStreamPackPtr(const std::string & name, Value * const streamIndex, Value * const packIndex, Value * const blockOffset) {
     319    const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     320    Value * blockIndex = CreateLShr(getProcessedItemCount(name), std::log2(getBitBlockWidth()));
     321    if (blockOffset) {
     322        assert (blockOffset->getType() == blockIndex->getType());
     323        blockIndex = CreateAdd(blockIndex, blockOffset);
     324    }
     325    return buf->getStreamPackPtr(this, getStreamHandle(name), streamIndex, blockIndex, packIndex, true);
     326}
     327
     328Value * KernelBuilder::loadInputStreamBlock(const std::string & name, Value * const streamIndex, Value * const blockOffset) {
     329    return CreateBlockAlignedLoad(getInputStreamBlockPtr(name, streamIndex, blockOffset));
     330}
     331
     332Value * KernelBuilder::loadInputStreamPack(const std::string & name, Value * const streamIndex, Value * const packIndex, Value * const blockOffset) {
     333    return CreateBlockAlignedLoad(getInputStreamPackPtr(name, streamIndex, packIndex, blockOffset));
    407334}
    408335
     
    412339}
    413340
    414 Value * KernelBuilder::getInputStreamBlockPtr(const std::string & name, Value * const streamIndex, Value * const blockOffset) {
    415     Value * const addr = mKernel->getStreamSetInputAddress(name);
    416     if (addr) {
    417         return CreateGEP(addr, {blockOffset ? blockOffset : getInt32(0), streamIndex});
    418     } else {
    419         const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    420         Value * blockIndex = CreateLShr(getProcessedItemCount(name), std::log2(getBitBlockWidth()));
    421         if (blockOffset) {
    422             assert (blockIndex->getType() == blockOffset->getType());
    423             blockIndex = CreateAdd(blockIndex, blockOffset);
    424         }
    425         return buf->getStreamBlockPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, true);
    426     }
    427 }
    428 
    429341Value * KernelBuilder::getOutputStreamBlockPtr(const std::string & name, Value * streamIndex, Value * const blockOffset) {
    430     Value * const addr = mKernel->getStreamSetOutputAddress(name);
    431     if (addr) {
    432         return CreateGEP(addr, {blockOffset ? blockOffset : getInt32(0), streamIndex});
    433     } else {
    434         const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    435         Value * blockIndex = CreateLShr(getProducedItemCount(name), std::log2(getBitBlockWidth()));
    436         if (blockOffset) {
    437             assert (blockIndex->getType() == blockOffset->getType());
    438             blockIndex = CreateAdd(blockIndex, blockOffset);
    439         }
    440         return buf->getStreamBlockPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, false);
    441     }
    442 }
    443 
    444 StoreInst * KernelBuilder::storeOutputStreamBlock(const std::string & name, Value * streamIndex, Value * toStore) {
    445     Value * const ptr = getOutputStreamBlockPtr(name, streamIndex);
     342    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     343    Value * blockIndex = CreateLShr(getProducedItemCount(name), std::log2(getBitBlockWidth()));
     344    if (blockOffset) {
     345        assert (blockOffset->getType() == blockIndex->getType());
     346        blockIndex = CreateAdd(blockIndex, blockOffset);
     347    }
     348    return buf->getStreamBlockPtr(this, getStreamHandle(name), streamIndex, blockIndex, false);
     349}
     350
     351Value * KernelBuilder::getOutputStreamPackPtr(const std::string & name, Value * streamIndex, Value * packIndex, llvm::Value * blockOffset) {
     352    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     353    Value * blockIndex = CreateLShr(getProducedItemCount(name), std::log2(getBitBlockWidth()));
     354    if (blockOffset) {
     355        assert (blockOffset->getType() == blockIndex->getType());
     356        blockIndex = CreateAdd(blockIndex, blockOffset);
     357    }
     358    return buf->getStreamPackPtr(this, getStreamHandle(name), streamIndex, blockIndex, packIndex, false);
     359}
     360
     361
     362StoreInst * KernelBuilder::storeOutputStreamBlock(const std::string & name, Value * streamIndex, llvm::Value * blockOffset, Value * toStore) {
     363    Value * const ptr = getOutputStreamBlockPtr(name, streamIndex, blockOffset);
    446364    Type * const storeTy = toStore->getType();
    447365    Type * const ptrElemTy = ptr->getType()->getPointerElementType();
     
    461379}
    462380
    463 Value * KernelBuilder::getOutputStreamPackPtr(const std::string & name, Value * streamIndex, Value * packIndex) {
    464     Value * const addr = mKernel->getStreamSetOutputAddress(name);
    465     if (addr) {
    466         return CreateGEP(addr, {getInt32(0), streamIndex, packIndex});
    467     } else {
    468         const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    469         Value * const blockIndex = CreateLShr(getProducedItemCount(name), std::log2(getBitBlockWidth()));
    470         return buf->getStreamPackPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, packIndex, false);
    471     }
    472 }
    473 
    474 StoreInst * KernelBuilder::storeOutputStreamPack(const std::string & name, Value * streamIndex, Value * packIndex, Value * toStore) {
    475     Value * const ptr = getOutputStreamPackPtr(name, streamIndex, packIndex);
     381StoreInst * KernelBuilder::storeOutputStreamPack(const std::string & name, Value * streamIndex, Value * packIndex, llvm::Value * blockOffset, Value * toStore) {
     382    Value * const ptr = getOutputStreamPackPtr(name, streamIndex, packIndex, blockOffset);
    476383    Type * const storeTy = toStore->getType();
    477384    Type * const ptrElemTy = ptr->getType()->getPointerElementType();
     
    530437}
    531438
    532 Value * KernelBuilder::getBlockAddress(const std::string & name, Value * blockIndex) {
    533     const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
    534     return buf->getBlockAddress(this, getStreamHandle(name), blockIndex);
    535 }
    536 
    537439void KernelBuilder::protectOutputStream(const std::string & name, const bool readOnly) {
    538440    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
     
    630532}
    631533
    632 }
     534/** ------------------------------------------------------------------------------------------------------------- *
     535 * @brief CreateUDiv2
     536 ** ------------------------------------------------------------------------------------------------------------- */
     537Value * KernelBuilder::CreateUDiv2(Value * const number, const ProcessingRate::RateValue & divisor, const Twine & Name) {
     538    if (divisor.numerator() == 1 && divisor.denominator() == 1) {
     539        return number;
     540    }
     541    Constant * const n = ConstantInt::get(number->getType(), divisor.numerator());
     542    if (LLVM_LIKELY(divisor.denominator() == 1)) {
     543        return CreateUDiv(number, n, Name);
     544    } else {
     545        Constant * const d = ConstantInt::get(number->getType(), divisor.denominator());
     546        return CreateUDiv(CreateMul(number, d), n);
     547    }
     548}
     549
     550/** ------------------------------------------------------------------------------------------------------------- *
     551 * @brief CreateCeilUDiv2
     552 ** ------------------------------------------------------------------------------------------------------------- */
     553Value * KernelBuilder::CreateCeilUDiv2(Value * const number, const ProcessingRate::RateValue & divisor, const Twine & Name) {
     554    if (divisor.numerator() == 1 && divisor.denominator() == 1) {
     555        return number;
     556    }
     557    Constant * const n = ConstantInt::get(number->getType(), divisor.numerator());
     558    if (LLVM_LIKELY(divisor.denominator() == 1)) {
     559        return CreateCeilUDiv(number, n, Name);
     560    } else {
     561        //   ⌊(num + ratio - 1) / ratio⌋
     562        // = ⌊(num - 1) / (n/d)⌋ + (ratio/ratio)
     563        // = ⌊(d * (num - 1)) / n⌋ + 1
     564        Constant * const ONE = ConstantInt::get(number->getType(), 1);
     565        Constant * const d = ConstantInt::get(number->getType(), divisor.denominator());
     566        return CreateAdd(CreateUDiv(CreateMul(CreateSub(number, ONE), d), n), ONE, Name);
     567    }
     568}
     569
     570/** ------------------------------------------------------------------------------------------------------------- *
     571 * @brief CreateMul2
     572 ** ------------------------------------------------------------------------------------------------------------- */
     573Value * KernelBuilder::CreateMul2(Value * const number, const ProcessingRate::RateValue & factor, const Twine & Name) {
     574    if (factor.numerator() == 1 && factor.denominator() == 1) {
     575        return number;
     576    }
     577    Constant * const n = ConstantInt::get(number->getType(), factor.numerator());
     578    if (LLVM_LIKELY(factor.denominator() == 1)) {
     579        return CreateMul(number, n, Name);
     580    } else {
     581        Constant * const d = ConstantInt::get(number->getType(), factor.denominator());
     582        return CreateUDiv(CreateMul(number, n), d, Name);
     583    }
     584}
     585
     586/** ------------------------------------------------------------------------------------------------------------- *
     587 * @brief CreateCeilUMul2
     588 ** ------------------------------------------------------------------------------------------------------------- */
     589Value * KernelBuilder::CreateCeilUMul2(Value * const number, const ProcessingRate::RateValue & factor, const Twine & Name) {
     590    if (factor.denominator() == 1) {
     591        return CreateMul2(number, factor, Name);
     592    }
     593    Constant * const n = ConstantInt::get(number->getType(), factor.numerator());
     594    Constant * const d = ConstantInt::get(number->getType(), factor.denominator());
     595    return CreateCeilUDiv(CreateMul(number, n), d, Name);
     596}
     597
     598}
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.h

    r5967 r5985  
    3434    llvm::LoadInst * acquireLogicalSegmentNo();
    3535
    36     void releaseLogicalSegmentNo(llvm::Value * nextSegNo);
     36    void releaseLogicalSegmentNo(llvm::Value * const nextSegNo);
    3737
    3838    llvm::Value * getProducedItemCount(const std::string & name) {
    39         return getInternalItemCount(name, Kernel::PRODUCED_ITEM_COUNT_SUFFIX);
     39        return getNamedItemCount(name, PRODUCED_ITEM_COUNT_SUFFIX);
    4040    }
    4141
    4242    void setProducedItemCount(const std::string & name, llvm::Value * value) {
    43         setInternalItemCount(name, Kernel::PRODUCED_ITEM_COUNT_SUFFIX, value);
     43        setNamedItemCount(name, PRODUCED_ITEM_COUNT_SUFFIX, value);
    4444    }
    4545
    4646    llvm::Value * getProcessedItemCount(const std::string & name) {       
    47         return getInternalItemCount(name, Kernel::PROCESSED_ITEM_COUNT_SUFFIX);
     47        return getNamedItemCount(name, PROCESSED_ITEM_COUNT_SUFFIX);
    4848    }
    4949
    5050    void setProcessedItemCount(const std::string & name, llvm::Value * value) {
    51         setInternalItemCount(name, Kernel::PROCESSED_ITEM_COUNT_SUFFIX, value);
     51        setNamedItemCount(name, PROCESSED_ITEM_COUNT_SUFFIX, value);
    5252    }
    5353
    5454    llvm::Value * getConsumedItemCount(const std::string & name) {
    55         return getInternalItemCount(name, Kernel::CONSUMED_ITEM_COUNT_SUFFIX);
     55        return getNamedItemCount(name, CONSUMED_ITEM_COUNT_SUFFIX);
    5656    }
    5757
    5858    void setConsumedItemCount(const std::string & name, llvm::Value * value) {
    59         setInternalItemCount(name, Kernel::CONSUMED_ITEM_COUNT_SUFFIX, value);
     59        setNamedItemCount(name, CONSUMED_ITEM_COUNT_SUFFIX, value);
     60    }
     61
     62    llvm::Value * getNonDeferredProcessedItemCount(const Binding & input) {
     63        return getNamedItemCount(input.getName(), input.isDeferred() ? NON_DEFERRED_ITEM_COUNT_SUFFIX : PROCESSED_ITEM_COUNT_SUFFIX);
     64    }
     65
     66    void setNonDeferredProcessedItemCount(const Binding & input, llvm::Value * value) {
     67        setNamedItemCount(input.getName(), input.isDeferred() ? NON_DEFERRED_ITEM_COUNT_SUFFIX : PROCESSED_ITEM_COUNT_SUFFIX, value);
     68    }
     69
     70    llvm::Value * getNonDeferredProducedItemCount(const Binding & output) {
     71        return getNamedItemCount(output.getName(), output.isDeferred() ? NON_DEFERRED_ITEM_COUNT_SUFFIX : PRODUCED_ITEM_COUNT_SUFFIX);
     72    }
     73
     74    void setNonDeferredProducedItemCount(const Binding & output, llvm::Value * value) {
     75        setNamedItemCount(output.getName(), output.isDeferred() ? NON_DEFERRED_ITEM_COUNT_SUFFIX : PRODUCED_ITEM_COUNT_SUFFIX, value);
    6076    }
    6177
     
    7793    llvm::Value * getInputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex, llvm::Value * blockOffset);
    7894
    79     llvm::Value * loadInputStreamBlock(const std::string & name, llvm::Value * streamIndex);
    80 
    81     llvm::Value * getInputStreamPackPtr(const std::string & name, llvm::Value * streamIndex, llvm::Value * packIndex);
    82 
    83     llvm::Value * loadInputStreamPack(const std::string & name, llvm::Value * streamIndex, llvm::Value * packIndex);
     95    llvm::Value * loadInputStreamBlock(const std::string & name, llvm::Value * streamIndex) {
     96        return loadInputStreamBlock(name, streamIndex, nullptr);
     97    }
     98
     99    llvm::Value * loadInputStreamBlock(const std::string & name, llvm::Value * streamIndex, llvm::Value * blockOffset);
     100
     101    llvm::Value * getInputStreamPackPtr(const std::string & name, llvm::Value * streamIndex, llvm::Value * packIndex) {
     102        return getInputStreamPackPtr(name, streamIndex, packIndex, nullptr);
     103    }
     104
     105    llvm::Value * getInputStreamPackPtr(const std::string & name, llvm::Value * streamIndex, llvm::Value * packIndex, llvm::Value * blockOffset);
     106
     107    llvm::Value * loadInputStreamPack(const std::string & name, llvm::Value * streamIndex, llvm::Value * packIndex) {
     108        return loadInputStreamPack(name, streamIndex, packIndex, nullptr);
     109    }
     110
     111    llvm::Value * loadInputStreamPack(const std::string & name, llvm::Value * streamIndex, llvm::Value * packIndex, llvm::Value * blockOffset);
    84112
    85113    llvm::Value * getInputStreamSetCount(const std::string & name);
     
    91119    llvm::Value * getOutputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex, llvm::Value * blockOffset);
    92120
    93     llvm::StoreInst * storeOutputStreamBlock(const std::string & name, llvm::Value * streamIndex, llvm::Value * toStore);
    94 
    95     llvm::Value * getOutputStreamPackPtr(const std::string & name, llvm::Value * streamIndex, llvm::Value * packIndex);
    96 
    97     llvm::StoreInst * storeOutputStreamPack(const std::string & name, llvm::Value * streamIndex, llvm::Value * packIndex, llvm::Value * toStore);
     121    llvm::StoreInst * storeOutputStreamBlock(const std::string & name, llvm::Value * streamIndex, llvm::Value * toStore) {
     122        return storeOutputStreamBlock(name, streamIndex, nullptr, toStore);
     123    }
     124
     125    llvm::StoreInst * storeOutputStreamBlock(const std::string & name, llvm::Value * streamIndex, llvm::Value * blockOffset, llvm::Value * toStore);
     126
     127    llvm::Value * getOutputStreamPackPtr(const std::string & name, llvm::Value * streamIndex, llvm::Value * packIndex) {
     128        return getOutputStreamPackPtr(name, streamIndex, packIndex, nullptr);
     129    }
     130
     131    llvm::Value * getOutputStreamPackPtr(const std::string & name, llvm::Value * streamIndex, llvm::Value * packIndex, llvm::Value * blockOffset);
     132
     133    llvm::StoreInst * storeOutputStreamPack(const std::string & name, llvm::Value * streamIndex, llvm::Value * packIndex, llvm::Value * toStore) {
     134        return storeOutputStreamPack(name, streamIndex, packIndex, nullptr, toStore);
     135    }
     136
     137    llvm::StoreInst * storeOutputStreamPack(const std::string & name, llvm::Value * streamIndex, llvm::Value * packIndex, llvm::Value * blockOffset, llvm::Value * toStore);
    98138
    99139    llvm::Value * getOutputStreamSetCount(const std::string & name);
     
    105145    llvm::Value * getBaseAddress(const std::string & name);
    106146
    107     llvm::Value * getBlockAddress(const std::string & name, llvm::Value * const blockIndex);
     147    void CreatePrepareOverflow(const std::string & name);
     148
     149    void CreateNonLinearCopyFromOverflow(const Binding & output, llvm::Value * itemsToCopy, llvm::Value * overflowOffset);
     150
     151    void CreateCopyFromOverflow(const Binding & output, llvm::Value * itemsToCopy);
     152
     153    void CreateCopyToOverflow(const std::string & name);
    108154
    109155    void setBaseAddress(const std::string & name, llvm::Value * addr);
     
    120166
    121167    llvm::Value * getLinearlyAccessibleItems(const std::string & name, llvm::Value * fromPos, llvm::Value * avail, bool reverse = false);
    122    
     168
    123169    llvm::Value * getLinearlyWritableItems(const std::string & name, llvm::Value * fromPos, bool reverse = false);
    124    
    125     void CreateStreamCpy(const std::string & name, llvm::Value * const target, llvm::Value * const targetOffset, llvm::Value * const source, llvm::Value * const sourceOffset, llvm::Value * const itemsToCopyFromOffset, const unsigned itemAlignment);
    126170
    127171    llvm::BasicBlock * CreateConsumerWait();
     
    148192
    149193    void doubleCapacity(const std::string & name);
     194
     195    // overloading wrongly substitutes this for CBuilder function. renamed for now until I can investigate why.
     196    llvm::Value * CreateUDiv2(llvm::Value * const number, const ProcessingRate::RateValue & divisor, const llvm::Twine & Name = "");
     197
     198    llvm::Value * CreateCeilUDiv2(llvm::Value * const number, const ProcessingRate::RateValue & divisor, const llvm::Twine & Name = "");
     199
     200    llvm::Value * CreateMul2(llvm::Value * const number, const ProcessingRate::RateValue & factor, const llvm::Twine & Name = "");
     201
     202    llvm::Value * CreateCeilUMul2(llvm::Value * const number, const ProcessingRate::RateValue & factor, const llvm::Twine & Name = "");
    150203
    151204protected:
     
    161214    llvm::Value * getScalarFieldPtr(llvm::Value * instance, const std::string & fieldName);
    162215
    163     llvm::Value * getInternalItemCount(const std::string & name, const std::string & suffix);
    164 
    165     void setInternalItemCount(const std::string & name, const std::string & suffix, llvm::Value * const value);
     216    llvm::Value * getNamedItemCount(const std::string & name, const std::string & suffix);
     217
     218    void setNamedItemCount(const std::string & name, const std::string & suffix, llvm::Value * const value);
    166219
    167220protected:
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_index_builder.cpp

    r5983 r5985  
    2626
    2727           // block data
    28            Binding{iBuilder->getStreamSetTy(1, 1), "isCompressed", BoundedRate(0, 1),
    29                    AlwaysConsume()},
    30            Binding{iBuilder->getStreamSetTy(1, 64), "blockStart", BoundedRate(0, 1),
    31                    AlwaysConsume()},
    32            Binding{iBuilder->getStreamSetTy(1, 64), "blockEnd", BoundedRate(0, 1),
    33                    AlwaysConsume()}
     28           Binding{iBuilder->getStreamSetTy(1, 1), "isCompressed", BoundedRate(0, 1), AlwaysConsume()},
     29           Binding{iBuilder->getStreamSetTy(1, 64), "blockStart", RateEqualTo("isCompressed"), AlwaysConsume()},
     30           Binding{iBuilder->getStreamSetTy(1, 64), "blockEnd", RateEqualTo("isCompressed"), AlwaysConsume()}
    3431
    3532    },
     
    9491        Value * newBlockDataIndex = iBuilder->CreateAdd(blockDataIndex, iBuilder->getInt64(1));
    9592        iBuilder->setScalarField("blockDataIndex", newBlockDataIndex);
    96         iBuilder->setProcessedItemCount("blockEnd", newBlockDataIndex);
    97         iBuilder->setProcessedItemCount("blockStart", newBlockDataIndex);
    9893        iBuilder->setProcessedItemCount("isCompressed", newBlockDataIndex);
     94//        iBuilder->setProcessedItemCount("blockEnd", newBlockDataIndex);
     95//        iBuilder->setProcessedItemCount("blockStart", newBlockDataIndex);
    9996
    10097        iBuilder->setProcessedItemCount("byteStream", blockEnd);
     
    132129
    133130        iBuilder->SetInsertPoint(extendLiteralLengthExit);
    134 //        PHINode* newCursorPos = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
    135 //        newCursorPos->addIncoming(a, extendLiteralLengthCon);
    136 //        newCursorPos->addIncoming(newCursorPos2, advanceFinishBlock);
    137 
    138131        PHINode* phiCursorPosAfterLiteral = iBuilder->CreatePHI(iBuilder->getInt64Ty(), 3);
    139132        phiCursorPosAfterLiteral->addIncoming(nextTokenPos, extendLiteralLengthCon);
     
    241234        );
    242235
    243         Value* matchOffset = iBuilder->CreateAdd(
    244                 iBuilder->CreateZExt(this->generateLoadSourceInputByte(iBuilder, offsetPos), iBuilder->getSizeTy()),
    245                 iBuilder->CreateShl(iBuilder->CreateZExt(this->generateLoadSourceInputByte(iBuilder, iBuilder->CreateAdd(offsetPos, iBuilder->getSize(1))), iBuilder->getSizeTy()), iBuilder->getSize(8))
    246         );
     236
    247237        iBuilder->setProducedItemCount("M0CountMarker", iBuilder->CreateAdd(iBuilder->getProducedItemCount("M0CountMarker"), iBuilder->getSize(1)));
    248238        this->markCircularOutputBitstream(iBuilder, "MatchOffsetMarker", offsetPos);
    249 //        iBuilder->CallPrintInt("offsetPos", offsetPos);
    250 //        iBuilder->CallPrintInt("matchOffset", matchOffset);
    251 
    252 
    253239        this->increaseScalarField(iBuilder, "m0OutputPos", matchLength);
    254240        this->setCircularOutputBitstream(iBuilder, "M0Marker", outputPos, outputEndPos);
     
    593579        targetValue = iBuilder->CreateOr(targetValue, iBuilder->CreateShl(INT8_1, byteOffset));
    594580        iBuilder->CreateStore(targetValue, outputTargetPtr);
    595 
    596         Value* a = iBuilder->CreateURem(iBuilder->CreateUDiv(pos, iBuilder->getSize(iBuilder->getBitBlockWidth())), iBuilder->getSize(this->getOutputStreamSetBuffer(bitstreamName)->getBufferBlocks()));
    597         Value* p = iBuilder->CreatePointerCast(iBuilder->getRawOutputPointer(bitstreamName, SIZE_0), iBuilder->getBitBlockType()->getPointerTo());
    598 //        iBuilder->CallPrintInt("--pos", pos);
    599 //        iBuilder->CallPrintRegister("aa", iBuilder->CreateLoad(iBuilder->CreateGEP(p, a)));
    600 
    601581    }
    602582
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_match_copy_kernel.cpp

    r5905 r5985  
    2424
    2525    Value *outputBasePtr = iBuilder->getOutputStreamBlockPtr(OUTPUT_STREAM_NAME, SIZE_ZERO);
    26     Value *itemsToDo = mAvailableItemCount[0];
     26    Value *itemsToDo = mAccessibleInputItems[0];
    2727    Value *copySize = iBuilder->CreateMul(outputBlocks, SIZE_BIT_BLOCK_WIDTH);
    2828    Value* actualCopySize = iBuilder->CreateUMin(itemsToDo, copySize);
     
    4646    Value *SIZE_ONE = iBuilder->getSize(1);
    4747    Value *m0EndInitOffset = iBuilder->CreateURem(iBuilder->getProcessedItemCount("m0End"), SIZE_BIT_BLOCK_WIDTH);
    48     Value *m0EndItemsToDo = mAvailableItemCount[2];
     48    Value *m0EndItemsToDo = mAccessibleInputItems[2];
    4949    Value *m0EndBasePtr = iBuilder->getInputStreamBlockPtr("m0End", SIZE_ZERO);
    5050    m0EndBasePtr = iBuilder->CreatePointerCast(m0EndBasePtr, iBuilder->getInt64Ty()->getPointerTo());
     
    8383    Constant *SIZE_BIT_BLOCK_WIDTH = iBuilder->getSize(iBuilder->getBitBlockWidth());
    8484
    85     Value *itemsToDo = mAvailableItemCount[0];
     85    Value *itemsToDo = mAccessibleInputItems[0];
    8686
    8787
     
    143143
    144144    Value *initM0StartProcessIndex = iBuilder->getProcessedItemCount("m0Start");
    145     Value *totalM0StartItemsCount = iBuilder->CreateAdd(initM0StartProcessIndex, mAvailableItemCount[1]);
     145    Value *totalM0StartItemsCount = iBuilder->CreateAdd(initM0StartProcessIndex, mAccessibleInputItems[1]);
    146146
    147147    Value *initMatchOffset = iBuilder->getScalarField("pendingMatchOffset");
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_multiple_pdep_kernel.cpp

    r5957 r5985  
    55#include "lz4_multiple_pdep_kernel.h"
    66#include <kernels/kernel_builder.h>
    7 #include <llvm/Support/raw_ostream.h>
    8 #include <iostream>
    9 #include <vector>
    10 
    117
    128using namespace llvm;
    139
    14 
    1510namespace kernel {
    1611
    17     LZ4MultiplePDEPkernel::LZ4MultiplePDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned streamSize, unsigned swizzleFactor, unsigned PDEP_width, std::string name)
    18             : MultiBlockKernel(name + "",
    19                                {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", BoundedRate(0, 1)}},
    20                                {},
    21                                {}, {}, {})
    22             , mSwizzleFactor(swizzleFactor)
    23             , mPDEPWidth(PDEP_width)
    24             , mStreamSize(streamSize)
    25     {
    26         assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
    27         assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
     12LZ4MultiplePDEPkernel::LZ4MultiplePDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned streamSize, unsigned swizzleFactor, unsigned PDEP_width, std::string name)
     13: MultiBlockKernel(name + "",
     14                   {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", FixedRate(), Principal()}},
     15                   {},
     16                   {}, {}, {})
     17, mSwizzleFactor(swizzleFactor)
     18, mPDEPWidth(PDEP_width)
     19, mStreamSize(streamSize)
     20{
     21    assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
     22    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
    2823
    29         mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet0", BoundedRate(0, 1), Swizzled()});
    30         mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet0", RateEqualTo("PDEPmarkerStream")});
     24    mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet0", PopcountOf("PDEPmarkerStream"), Swizzled()});
     25    mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet0"});
    3126
    32         for (unsigned i = 1; i < streamSize; i++) {
    33             mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet" + std::to_string(i), BoundedRate(0, 1), Swizzled()});
    34             mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet" + std::to_string(i), RateEqualTo("outputStreamSet0")});
     27    for (unsigned i = 1; i < streamSize; i++) {
     28        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet" + std::to_string(i), RateEqualTo("sourceStreamSet0"), Swizzled()});
     29        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet" + std::to_string(i)});
     30    }
     31}
     32
     33void LZ4MultiplePDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const blocksToDo) {
     34    BasicBlock * entry = kb->GetInsertBlock();
     35
     36    BasicBlock * loopBody = kb->CreateBasicBlock("loopBody");
     37    BasicBlock * terminate = kb->CreateBasicBlock("terminate");
     38    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
     39    Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet0");
     40    Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
     41
     42    Value * pdepWidth = kb->getSize(mPDEPWidth);
     43    Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
     44    Value * PDEP_func = nullptr;
     45    if (mPDEPWidth == 64) {
     46        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
     47    } else if (mPDEPWidth == 32) {
     48        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
     49    }
     50    kb->CreateBr(loopBody);
     51
     52    kb->SetInsertPoint(loopBody);
     53    // The following PHINodes' values can come from entry or processBlock
     54    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
     55    PHINode * updatedProcessedSourceBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
     56    blockOffsetPhi->addIncoming(kb->getSize(0), entry);
     57    updatedProcessedSourceBitsPhi->addIncoming(processedSourceBits, entry);
     58    Value * updatedProcessedSourceBits = updatedProcessedSourceBitsPhi;
     59    Value * PDEP_ms_blk = kb->loadInputStreamBlock("PDEPmarkerStream", kb->getInt32(0), blockOffsetPhi);
     60
     61    const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);
     62    const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
     63
     64    // For each mask extracted from the PDEP marker block
     65    for (unsigned i = 0; i < mSwizzleFactor; i++) {
     66        // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
     67
     68        Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedSourceBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
     69        Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedSourceBits, blockWidth), pdepWidth);
     70        Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedSourceBits);
     71
     72
     73        Value * shift_amount = kb->CreateURem(updatedProcessedSourceBits, pdepWidth);
     74
     75        for (unsigned iStreamIndex = 0; iStreamIndex < mStreamSize; iStreamIndex++) {
     76            Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
     77            Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
     78
     79            // Load current and next BitBlocks/swizzles
     80            Value* current_swizzle_ptr = kb->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(iStreamIndex), current_swizzle_idx, current_blk_idx);
     81
     82            Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
     83
     84
     85            Value* next_swizzle_ptr = kb->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(iStreamIndex), next_swizzle_idx, next_blk_idx);
     86            Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
     87
     88            // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
     89            Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
     90
     91            Value * borrowed_bits = kb->CreateShl(next_swizzle,
     92                                                  kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
     93            Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
     94
     95            Value * segments = kb->fwCast(mPDEPWidth, combined);
     96
     97            Value * result_swizzle = Constant::getNullValue(segments->getType());
     98            // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
     99
     100            Value * PDEP_mask = PDEP_masks[i];
     101            for (unsigned j = 0; j < mSwizzleFactor; j++) {
     102                Value * source_field = kb->CreateExtractElement(segments, j);
     103                Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask});
     104                result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j);
     105
     106            }
     107
     108            // Store the result
     109            Value* outputPos = kb->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(iStreamIndex), kb->getSize(i), blockOffsetPhi);
     110
     111            kb->CreateBlockAlignedStore(result_swizzle, outputPos);
    35112        }
     113
     114        updatedProcessedSourceBits = kb->CreateAdd(updatedProcessedSourceBits, mask_popcounts[i]);
    36115    }
    37116
    38     void LZ4MultiplePDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
    39         BasicBlock * entry = kb->GetInsertBlock();
    40 //        kb->CallPrintInt("--------------" + this->getName() + " doMultiBlock Start:", kb->getSize(0));
    41         BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
    42         BasicBlock * checkSourceCount = kb->CreateBasicBlock("checkSourceCount");
    43         BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
    44         BasicBlock * terminate = kb->CreateBasicBlock("terminate");
     117    updatedProcessedSourceBitsPhi->addIncoming(updatedProcessedSourceBits, loopBody);
     118    blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), loopBody);
     119    Value * haveRemBlocks = kb->CreateICmpNE(blockOffsetPhi, blocksToDo);
     120    kb->CreateCondBr(haveRemBlocks, loopBody, terminate);
    45121
    46         Value * itemsToDo = mAvailableItemCount[0];
    47         Value * sourceItemsAvail = mAvailableItemCount[1]; //TODO need to be calculated from numOfStrides
     122    kb->SetInsertPoint(terminate);
     123    kb->setProcessedItemCount("sourceStreamSet0", updatedProcessedSourceBitsPhi);
    48124
    49         Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
    50         Value * blocksToDo = kb->CreateSelect(mIsFinal, kb->CreateUDivCeil(itemsToDo, blockWidth), kb->CreateUDiv(itemsToDo, blockWidth));
    51         Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet0");
    52         Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
     125}
    53126
    54         Value * pdepWidth = kb->getSize(mPDEPWidth);
    55         Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
    56         Value * PDEP_func = nullptr;
    57         if (mPDEPWidth == 64) {
    58             PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
    59         } else if (mPDEPWidth == 32) {
    60             PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
    61         }
    62         kb->CreateBr(checkLoopCond);
     127std::vector<Value *> LZ4MultiplePDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
     128    Value * pop_counts = kb->simd_popcount(field_width, blk);
     129    std::vector<Value *> counts;
     130    for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
     131        // Store the pop counts for each blk_width field in blk
     132        counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
     133    }
     134    return counts;
     135}
    63136
    64         kb->SetInsertPoint(checkLoopCond);
    65         // The following PHINodes' values can come from entry or processBlock
    66         PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    67         PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
    68         PHINode * updatedProcessedSourceBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    69         PHINode * sourceItemsRemaining = kb->CreatePHI(kb->getSizeTy(), 2);
    70         blocksToDoPhi->addIncoming(blocksToDo, entry);
    71         blockOffsetPhi->addIncoming(kb->getSize(0), entry);
    72         updatedProcessedSourceBitsPhi->addIncoming(processedSourceBits, entry);
    73         sourceItemsRemaining->addIncoming(sourceItemsAvail, entry);
     137std::vector<Value *> LZ4MultiplePDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
     138    // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
     139    // Split the PDEP marker stream block into mPDEPWidth segments.
     140    Value * masks = kb->fwCast(mask_width, PDEP_ms_blk);
     141    std::vector<Value *> PDEP_masks;
     142    for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
     143        PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
     144    }
     145    return PDEP_masks;
     146}
    74147
    75         Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
    76         kb->CreateCondBr(haveRemBlocks, checkSourceCount, terminate);
    77 
    78         kb->SetInsertPoint(checkSourceCount);
    79         // Extract the values we will use in the main processing loop
    80         Value * updatedProcessedSourceBits = updatedProcessedSourceBitsPhi;
    81         Value * updatedSourceItems = sourceItemsRemaining;
    82         Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->getInputStreamBlockPtr("PDEPmarkerStream", kb->getInt32(0), blockOffsetPhi));
    83 
    84         const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);
    85         const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
    86 
    87         Value * total_count = mask_popcounts[0];
    88         for (unsigned j = 1; j < mask_popcounts.size(); j++) {
    89             total_count = kb->CreateAdd(total_count, mask_popcounts[j]);
    90         }
    91 //    kb->CallPrintInt("total_count", total_count);
    92 //    kb->CallPrintInt("sourceItemsRemaining", sourceItemsRemaining);
    93         // Do not check popcount in final block, since there may be some useless pdep marker in the end
    94         kb->CreateCondBr(kb->CreateOr(kb->CreateICmpULE(total_count, sourceItemsRemaining), mIsFinal), processBlock, terminate);
    95         kb->SetInsertPoint(processBlock);
    96 
    97         // For each mask extracted from the PDEP marker block
    98         for (unsigned i = 0; i < mSwizzleFactor; i++) {
    99             // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
    100 
    101             Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedSourceBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
    102             Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedSourceBits, blockWidth), pdepWidth);
    103             Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedSourceBits);
    104 
    105 
    106             Value * shift_amount = kb->CreateURem(updatedProcessedSourceBits, pdepWidth);
    107 
    108             for (unsigned iStreamIndex = 0; iStreamIndex < mStreamSize; iStreamIndex++) {
    109                 Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
    110                 Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
    111 
    112                 // Load current and next BitBlocks/swizzles
    113                 Value* current_swizzle_ptr = kb->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(iStreamIndex), current_swizzle_idx, current_blk_idx);
    114 
    115                 Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
    116 
    117 
    118                 Value* next_swizzle_ptr = kb->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(iStreamIndex), next_swizzle_idx, next_blk_idx);
    119                 Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
    120 
    121                 // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
    122                 Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
    123 
    124                 Value * borrowed_bits = kb->CreateShl(next_swizzle,
    125                                                       kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
    126                 Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
    127 
    128                 Value * segments = kb->fwCast(mPDEPWidth, combined);
    129 
    130 //                kb->CallPrintInt("current_swizzle_idx", current_swizzle_idx);
    131 //                kb->CallPrintInt("next_swizzle_idx", next_swizzle_idx);
    132 //                if (iStreamIndex == 1) {
    133 //                    kb->CallPrintInt("current_swizzle_ptr"  + std::to_string(iStreamIndex) , current_swizzle_ptr);
    134 //                    kb->CallPrintRegister("current_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), current_swizzle);
    135 //
    136 //                    kb->CallPrintRegister("next_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), next_swizzle);
    137 //                    kb->CallPrintRegister("segments_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), segments);
    138 //                }
    139 
    140                 Value * result_swizzle = Constant::getNullValue(segments->getType());
    141                 // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
    142 
    143                 Value * PDEP_mask = PDEP_masks[i];
    144                 for (unsigned j = 0; j < mSwizzleFactor; j++) {
    145                     Value * source_field = kb->CreateExtractElement(segments, j);
    146                     Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask});
    147                     result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j);
    148 
    149                 }
    150 
    151                 // Store the result
    152                 Value* outputPos = kb->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(iStreamIndex), kb->getSize(i), blockOffsetPhi);
    153 
    154                 kb->CreateBlockAlignedStore(result_swizzle, outputPos);
    155             }
    156 
    157             updatedProcessedSourceBits = kb->CreateAdd(updatedProcessedSourceBits, mask_popcounts[i]);
    158             updatedSourceItems = kb->CreateSub(updatedSourceItems, mask_popcounts[i]);
    159         }
    160 
    161         updatedProcessedSourceBitsPhi->addIncoming(updatedProcessedSourceBits, processBlock);
    162         blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
    163         blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
    164         sourceItemsRemaining->addIncoming(updatedSourceItems, processBlock);
    165         kb->CreateBr(checkLoopCond);
    166 
    167         kb->SetInsertPoint(terminate);
    168         for (unsigned i = 0; i < mStreamSize; i++) {
    169             kb->setProcessedItemCount("sourceStreamSet" + std::to_string(i), updatedProcessedSourceBitsPhi);
    170         }
    171 
    172         Value* processedBlock = kb->CreateSub(blocksToDo, blocksToDoPhi);
    173 //        kb->CallPrintInt("blocksToDoPhi", blocksToDoPhi);
    174 
    175         kb->setProcessedItemCount("PDEPmarkerStream",
    176                                   kb->CreateSelect(mIsFinal,
    177                                                    kb->CreateAdd(kb->getProcessedItemCount("PDEPmarkerStream"), itemsToDo),
    178                                                    kb->CreateAdd(kb->getProcessedItemCount("PDEPmarkerStream"),kb->CreateMul(processedBlock, blockWidth))
    179                                   )
    180         );
    181     }
    182 
    183     std::vector<Value *> LZ4MultiplePDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
    184         Value * pop_counts = kb->simd_popcount(field_width, blk);
    185         std::vector<Value *> counts;
    186         for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
    187             // Store the pop counts for each blk_width field in blk
    188             counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
    189         }
    190         return counts;
    191     }
    192 
    193     std::vector<Value *> LZ4MultiplePDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
    194         // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
    195         // Split the PDEP marker stream block into mPDEPWidth segments.
    196         Value * masks = kb->fwCast(mask_width, PDEP_ms_blk);
    197         std::vector<Value *> PDEP_masks;
    198         for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
    199             PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
    200         }
    201         return PDEP_masks;
    202     }
    203148}
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_swizzled_match_copy_kernel.cpp

    r5982 r5985  
    248248// Inputs
    249249{
    250                                    Binding{iBuilder->getStreamSetTy(1, 1), "MatchOffsetMarker", BoundedRate(0, 1), {DisableSufficientChecking()}},
    251                                    Binding{iBuilder->getStreamSetTy(1, 1), "M0Marker", BoundedRate(0, 1), {DisableSufficientChecking()}},
    252                                    Binding{iBuilder->getStreamSetTy(1, 1), "M0CountMarker", BoundedRate(0, 1), {DisableSufficientChecking()}},
     250                                   Binding{iBuilder->getStreamSetTy(1, 1), "MatchOffsetMarker", BoundedRate(0, 1)},
     251                                   Binding{iBuilder->getStreamSetTy(1, 1), "M0Marker", BoundedRate(0, 1)},
     252                                   Binding{iBuilder->getStreamSetTy(1, 1), "M0CountMarker", BoundedRate(0, 1)},
    253253                                   Binding{iBuilder->getStreamSetTy(1, 8), "byteStream", BoundedRate(0, 1)}
    254254},
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r5870 r5985  
    1616// input stream sets
    1717{Binding{b->getStreamSetTy(), "marker", FixedRate(), Principal()},
    18 Binding{b->getStreamSetTy(swizzleFactor), "source", FixedRate(), Deferred()}},
     18Binding{b->getStreamSetTy(swizzleFactor), "source", PopcountOf("marker"), BlockSize(b->getBitBlockWidth() / swizzleFactor) }},
    1919// output stream set
    20 {Binding{b->getStreamSetTy(swizzleFactor), "output"}},
    21 {}, {},
    22 // internal scalars
    23 {Binding{b->getBitBlockType(), "buffer"},
    24 Binding{b->getSizeTy(), "buffered"},
    25 Binding{b->getSizeTy(), "sourceOffset"}})
     20{Binding{b->getStreamSetTy(swizzleFactor), "output", FixedRate(), BlockSize(b->getBitBlockWidth() / swizzleFactor)}},
     21{}, {}, {})
    2622, mSwizzleFactor(swizzleFactor) {
    2723
     
    4541    }
    4642
    47     // We store an internal source offset here because this kernel processes items in an unusual way.
    48     // The pipeline and multiblock assume that if we report we're on the i-th bit of a stream we have
    49     // fully processed all of the bits up to the i-th position.
     43    Constant * const ZERO = b->getSize(0);
     44    Value * const sourceItemCount = b->getProcessedItemCount("source");
    5045
    51     //                                             v
    52     //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
    53     //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
    54     //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
    55     //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
    56 
    57     // However, this kernel divides the stream into K elements and fully consumes a single stream of
    58     // the stream set before consuming the next one. So the same i-th position above is actually:
    59 
    60     //                |XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|
    61     //                |XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|
    62     //                |XX          |XX          |XX          |XX          |
    63     //                |            |            |            |            |
    64 
    65     // In the future, we may want the pipeline and multiblock to understand this style of processing
    66     // but for now, we hide it by delaying writing the actual processed offset until we've fully
    67     // processed the entire block.
    68 
    69     Value * const initialBuffer = b->getScalarField("buffer");
    70     Value * const initialBufferSize = b->getScalarField("buffered");
    71     Value * const initialSourceOffset = b->getScalarField("sourceOffset");
     46    Value * const initialSourceOffset = b->CreateURem(sourceItemCount, BLOCK_WIDTH);
    7247    b->CreateBr(processBlock);
    7348
    7449    b->SetInsertPoint(processBlock);
    7550    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
    76     strideIndex->addIncoming(b->getSize(0), entry);
    77     PHINode * const bufferPhi = b->CreatePHI(initialBuffer->getType(), 2);
    78     bufferPhi->addIncoming(initialBuffer, entry);
     51    strideIndex->addIncoming(ZERO, entry);
     52    PHINode * const bufferPhi = b->CreatePHI(b->getBitBlockType(), 2);
     53    bufferPhi->addIncoming(Constant::getNullValue(b->getBitBlockType()), entry);
    7954    PHINode * const sourceOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
    8055    sourceOffsetPhi->addIncoming(initialSourceOffset, entry);
    8156    PHINode * const bufferSizePhi = b->CreatePHI(b->getSizeTy(), 2);
    82     bufferSizePhi->addIncoming(initialBufferSize, entry);
     57    bufferSizePhi->addIncoming(ZERO, entry);
    8358
    8459    // Extract the values we will use in the main processing loop
    85     Value * const markerStream = b->getInputStreamBlockPtr("marker", b->getInt32(0), strideIndex);
    86     Value * const selectors = b->fwCast(pdepWidth, b->CreateBlockAlignedLoad(markerStream));
     60    Value * const markerStream = b->getInputStreamBlockPtr("marker", ZERO, strideIndex);
     61    Value * const markerValue = b->CreateBlockAlignedLoad(markerStream);
     62    Value * const selectors = b->fwCast(pdepWidth, markerValue);
    8763    Value * const numOfSelectors = b->simd_popcount(pdepWidth, selectors);
    88 
    89     // If we run out of source items here, it is a failure of the MultiBlockKernel and/or PipelineGenerator
    90     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    91         Value * requiredSourceItems = b->CreateExtractElement(numOfSelectors, b->getInt64(0));
    92         for (unsigned i = 1; i < mSwizzleFactor; i++) {
    93             requiredSourceItems = b->CreateAdd(requiredSourceItems, b->CreateExtractElement(numOfSelectors, b->getInt64(i)));
    94         }
    95         Value * const availableSourceItems = b->getAvailableItemCount("source");
    96         Value * const unreadSourceItems = b->CreateSub(availableSourceItems, sourceOffsetPhi);
    97         Value * const hasSufficientSourceItems = b->CreateICmpULE(requiredSourceItems, unreadSourceItems);
    98         b->CreateAssert(hasSufficientSourceItems, getName() + " has insufficient source items for the given marker stream");
    99     }
    10064
    10165    // For each element of the marker block
     
    10670
    10771        // How many bits will we deposit?
    108         Value * const required = b->CreateExtractElement(numOfSelectors, b->getInt32(i));
     72        Value * const required = b->CreateExtractElement(numOfSelectors, b->getSize(i));
    10973
    11074        // Aggressively enqueue any additional bits
     
    11478
    11579        b->SetInsertPoint(enqueueBits);
    116         PHINode * const bufferSize2 = b->CreatePHI(bufferSize->getType(), 2);
    117         bufferSize2->addIncoming(bufferSize, entry);
    118         PHINode * const sourceOffset2 = b->CreatePHI(sourceOffset->getType(), 2);
    119         sourceOffset2->addIncoming(sourceOffset, entry);
    120         PHINode * const buffer2 = b->CreatePHI(buffer->getType(), 2);
    121         buffer2->addIncoming(buffer, entry);
     80        PHINode * const updatedBufferSize = b->CreatePHI(bufferSize->getType(), 2);
     81        updatedBufferSize->addIncoming(bufferSize, entry);
     82        PHINode * const updatedSourceOffset = b->CreatePHI(sourceOffset->getType(), 2);
     83        updatedSourceOffset->addIncoming(sourceOffset, entry);
     84        PHINode * const updatedBuffer = b->CreatePHI(buffer->getType(), 2);
     85        updatedBuffer->addIncoming(buffer, entry);
    12286
    123         // Calculate the block and swizzle index of the "current" swizzle
    124         Value * const block_index = b->CreateUDiv(sourceOffset2, BLOCK_WIDTH);
    125         Value * const stream_index = b->CreateUDiv(b->CreateURem(sourceOffset2, BLOCK_WIDTH), PDEP_WIDTH);
    126         Value * const ptr = b->getInputStreamBlockPtr("source", stream_index, block_index);
    127         Value * const swizzle = b->CreateBlockAlignedLoad(ptr);
    128         Value * const swizzleOffset = b->CreateURem(sourceOffset2, PDEP_WIDTH);
     87        // Calculate the block and swizzle index of the current swizzle row
     88        Value * const blockOffset = b->CreateUDiv(updatedSourceOffset, BLOCK_WIDTH);
     89        Value * const swizzleIndex = b->CreateUDiv(b->CreateURem(updatedSourceOffset, BLOCK_WIDTH), PDEP_WIDTH);
     90        Value * const swizzle = b->CreateBlockAlignedLoad(b->getInputStreamBlockPtr("source", swizzleIndex, blockOffset));
     91        Value * const swizzleOffset = b->CreateURem(updatedSourceOffset, PDEP_WIDTH);
    12992
    13093        // Shift the swizzle to the right to clear off any used bits ...
     
    13396
    13497        // ... then to the left to align the bits with the buffer and combine them.
    135         Value * const bufferShift = b->simd_fill(pdepWidth, bufferSize2);
     98        Value * const bufferShift = b->simd_fill(pdepWidth, updatedBufferSize);
    13699        Value * const pendingBits = b->CreateShl(unreadBits, bufferShift);
    137         buffer = b->CreateOr(buffer, pendingBits);
    138         buffer2->addIncoming(buffer, enqueueBits);
    139100
    140         // Update the buffer size by the number of bits we have actually enqueued
    141         Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), bufferSize2);
     101        buffer = b->CreateOr(updatedBuffer, pendingBits);
     102        updatedBuffer->addIncoming(buffer, enqueueBits);
     103
     104        // Update the buffer size with the number of bits we have actually enqueued
     105        Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), updatedBufferSize);
    142106        bufferSize = b->CreateUMin(maxBufferSize, PDEP_WIDTH);
     107        updatedBufferSize->addIncoming(bufferSize, enqueueBits);
     108
    143109        // ... and increment the source offset by the number we actually inserted
    144         sourceOffset = b->CreateAdd(sourceOffset2, b->CreateSub(bufferSize, bufferSize2));
    145         bufferSize2->addIncoming(bufferSize, enqueueBits);
    146         sourceOffset2->addIncoming(sourceOffset, enqueueBits);
     110        Value * const inserted = b->CreateSub(bufferSize, updatedBufferSize);
     111        sourceOffset = b->CreateAdd(updatedSourceOffset, inserted);
     112        updatedSourceOffset->addIncoming(sourceOffset, enqueueBits);
     113
     114        // INVESTIGATE: we can branch at most once here. I'm not sure whether the potential
     115        // branch misprediction is better or worse than always filling from two swizzles to
     116        // ensure that we have enough bits to deposit.
    147117        BasicBlock * const depositBits = b->CreateBasicBlock();
    148         b->CreateCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
     118        b->CreateUnlikelyCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
    149119
    150120        b->SetInsertPoint(depositBits);
     121
    151122        // Apply PDEP to each element of the combined swizzle using the current PDEP mask
    152123        Value * result = UndefValue::get(buffer->getType());
     
    159130
    160131        // Store the result
    161         Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output", b->getInt32(i), strideIndex);
     132        Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output", b->getSize(i), strideIndex);
    162133        b->CreateBlockAlignedStore(result, outputStreamPtr);
    163134
     
    177148
    178149    b->SetInsertPoint(finishedStrides);
    179     Value * const sourceItemsProcessed = b->CreateMul(b->CreateUDiv(sourceOffset, BLOCK_WIDTH), BLOCK_WIDTH);
    180     b->setProcessedItemCount("source", b->CreateAdd(b->getProcessedItemCount("source"), sourceItemsProcessed));
    181     b->setScalarField("buffer", buffer);
    182     b->setScalarField("buffered", bufferSize);
    183     b->setScalarField("sourceOffset", b->CreateURem(sourceOffset, BLOCK_WIDTH));
    184150}
    185151
  • icGREP/icgrep-devel/icgrep/kernels/processing_rate.h

    r5782 r5985  
    2828
    2929    enum class KindId {
    30         Fixed, Bounded, Unknown, Relative, PopCount
     30        Fixed, Bounded, Unknown, Relative, PopCount, NegatedPopCount
    3131    };
    3232
     
    3636
    3737    RateValue getRate() const {
    38         assert (isFixed() || isRelative());
    3938        return mLowerBound;
    4039    }
    4140
    4241    RateValue getLowerBound() const {
    43         assert (isFixed() || isBounded() || isUnknown());
    4442        return mLowerBound;
    4543    }
    4644
    4745    RateValue getUpperBound() const {
    48         assert (isFixed() || isBounded());
    49         assert (isFixed() ? mUpperBound == mLowerBound : mUpperBound > mLowerBound);
    5046        return mUpperBound;
    5147    }
    5248
    5349    const std::string & getReference() const {
    54         assert (isRelative());
     50        assert (hasReference());
    5551        return mReference;
    5652    }
     
    7268    }
    7369
     70    bool isNegatedPopCount() const {
     71        return mKind == KindId::NegatedPopCount;
     72    }
     73
    7474    bool isUnknown() const {
    7575        return mKind == KindId::Unknown;
    7676    }
    7777
     78    bool hasReference() const {
     79        return isRelative() || isPopCount() || isNegatedPopCount();
     80    }
     81
    7882    bool isDerived() const {
    79         return isRelative(); // isFixed() ||
     83        return isRelative();
    8084    }
    8185
     
    9296    friend ProcessingRate UnknownRate(const unsigned);
    9397    friend ProcessingRate RateEqualTo(std::string);
    94     friend ProcessingRate PopcountOf(std::string, const ProcessingRate::RateValue);
     98    friend ProcessingRate PopcountOf(std::string);
     99    friend ProcessingRate PopcountOfNot(std::string);
    95100
    96101    ProcessingRate(ProcessingRate &&) = default;
     
    99104
    100105protected:   
    101     ProcessingRate(const KindId k, const unsigned n, const unsigned m, const std::string && ref = "") : mKind(k), mLowerBound(n), mUpperBound(m), mReference(ref) {}
    102     ProcessingRate(const KindId k, const RateValue n, const RateValue m, const std::string && ref = "") : mKind(k), mLowerBound(n), mUpperBound(m), mReference(ref) {}
     106    ProcessingRate(const KindId k, const RateValue lb, const RateValue ub, const std::string && ref = "")
     107    : mKind(k)
     108    , mLowerBound(lb)
     109    , mUpperBound(ub)
     110    , mReference(ref) {
     111        assert (isFixed() ? mUpperBound == mLowerBound : (isBounded() ? mUpperBound > mLowerBound :  mUpperBound >= mLowerBound));
     112    }
    103113private:
    104     KindId mKind;
    105     RateValue mLowerBound;
    106     RateValue mUpperBound;
    107     std::string mReference;
     114    const KindId mKind;
     115    const RateValue mLowerBound;
     116    const RateValue mUpperBound;
     117    const std::string mReference;
    108118};
    109119
     
    131141
    132142inline ProcessingRate RateEqualTo(std::string ref) {
    133     return ProcessingRate(ProcessingRate::KindId::Relative, 1, 0, std::move(ref));
     143    return ProcessingRate(ProcessingRate::KindId::Relative, 1, 1, std::move(ref));
    134144}
    135145
    136 inline ProcessingRate PopcountOf(std::string ref, const ProcessingRate::RateValue ratio = ProcessingRate::RateValue{1}) {
    137     return ProcessingRate(ProcessingRate::KindId::PopCount, ratio, ProcessingRate::RateValue{0}, std::move(ref));
     146inline ProcessingRate PopcountOf(std::string ref) {
     147    return ProcessingRate(ProcessingRate::KindId::PopCount, 0, 1, std::move(ref));
     148}
     149
     150inline ProcessingRate PopcountOfNot(std::string ref) {
     151    return ProcessingRate(ProcessingRate::KindId::NegatedPopCount, 0, 1, std::move(ref));
    138152}
    139153
  • icGREP/icgrep-devel/icgrep/kernels/radix64.cpp

    r5831 r5985  
    3939// of bytes to the actual output stream.
    4040
    41 void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
    42 
    43     BasicBlock * expand2_3entry = iBuilder->GetInsertBlock();
    44     BasicBlock * expand_3_4_loop = iBuilder->CreateBasicBlock("expand_3_4_loop");
    45     BasicBlock * expand3_4_exit = iBuilder->CreateBasicBlock("expand3_4_exit");
    46    
     41//void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &b, Value * const numOfStrides) {
     42
     43//    BasicBlock * expand2_3entry = b->GetInsertBlock();
     44//    BasicBlock * expand_3_4_loop = b->CreateBasicBlock("expand_3_4_loop");
     45//    BasicBlock * expand3_4_exit = b->CreateBasicBlock("expand3_4_exit");
     46
     47//    // Determine the required shufflevector constants.
     48//    const unsigned PACK_SIZE = b->getBitBlockWidth()/8;
     49
     50//    ConstantInt * const ZERO = b->getSize(0);
     51//    ConstantInt * const ONE = b->getSize(1);
     52//    ConstantInt * const THREE = b->getSize(3);
     53//    ConstantInt * const FOUR = b->getSize(4);
     54//    ConstantInt * const SEVEN = b->getSize(7);
     55
     56//    // Construct a list of indexes in the form
     57//    // 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, ...
     58//    unsigned sourceByteIndex = 0;
     59//    unsigned expand3_4_index[PACK_SIZE];
     60//    for (unsigned i = 0; i < PACK_SIZE; i++) {
     61//        expand3_4_index[i] = sourceByteIndex;
     62//        if (i % 4 != 2) sourceByteIndex++;
     63//    }
     64//    unsigned const expand3_4_offset[4] = {PACK_SIZE, 3*PACK_SIZE/4, PACK_SIZE/2, PACK_SIZE/4};
     65//    Value * expand_3_4_shuffle[4];
     66//    for (unsigned j = 0; j < 4; j++) {
     67//        std::vector<Constant *> Idxs;
     68//        for (unsigned i = 0; i < PACK_SIZE; i++) {
     69//            Idxs.push_back(ConstantInt::get(b->getInt32Ty(), expand3_4_offset[j] + expand3_4_index[i]));
     70//        }
     71//        expand_3_4_shuffle[j] = ConstantVector::get(Idxs);
     72//    }
     73
     74
     75
     76//    Constant * triplePackSize = b->getSize(3 * PACK_SIZE); // 3 packs per loop.
     77//    UndefValue * undefPack = UndefValue::get(b->fwVectorType(8));
     78
     79//    Value * const numOfBlocks = b->CreateMul(numOfStrides, b->getSize(8));
     80
     81//    Value * itemsToDo = mAvailableItemCount[0];
     82
     83//    // The main loop processes 3 packs of data at a time.
     84//    b->CreateBr(expand_3_4_loop);
     85
     86//    b->SetInsertPoint(expand_3_4_loop);
     87//    PHINode * loopItemsRemain = b->CreatePHI(b->getSizeTy(), 2);
     88//    PHINode * strideOffset = b->CreatePHI(b->getSizeTy(), 2);
     89//    loopItemsRemain->addIncoming(itemsToDo, expand2_3entry);
     90//    strideOffset->addIncoming(ZERO, expand2_3entry);
     91
     92//    Value * const baseInputOffset = b->CreateMul(strideOffset, THREE);
     93//    Value * const baseOutputOffset = b->CreateMul(strideOffset, FOUR);
     94//    Value * carryOver = undefPack;
     95//    for (unsigned i = 0; i < 3; ++i) {
     96//        ConstantInt * const index = b->getSize(i);
     97//        Value * const inputOffset = b->CreateAdd(baseInputOffset, index);
     98//        Value * const inputPackIndex = b->CreateAnd(inputOffset, SEVEN);
     99//        Value * const inputBlockOffset = b->CreateLShr(inputOffset, THREE);
     100//        Value * const input = b->fwCast(8, b->loadInputStreamPack("sourceStream", ZERO, inputPackIndex, inputBlockOffset));
     101//        Value * const expanded = b->CreateShuffleVector(carryOver, input, expand_3_4_shuffle[i]);
     102//        Value * const outputOffset = b->CreateAdd(baseOutputOffset, index);
     103//        Value * const outputPackIndex = b->CreateAnd(outputOffset, SEVEN);
     104//        Value * const outputBlockOffset = b->CreateLShr(outputOffset, THREE);
     105//        b->storeOutputStreamPack("expand34Stream", ZERO, outputPackIndex, outputBlockOffset, b->bitCast(expanded));
     106//        carryOver = input;
     107//    }
     108//    Value * expanded = b->CreateShuffleVector(carryOver, undefPack, expand_3_4_shuffle[3]);
     109//    Value * outputOffset = b->CreateAdd(baseOutputOffset, THREE);
     110//    Value * const outputPackIndex = b->CreateAnd(outputOffset, SEVEN);
     111//    Value * const outputBlockOffset = b->CreateLShr(outputOffset, THREE);
     112//    b->storeOutputStreamPack("expand34Stream", ZERO, outputPackIndex, outputBlockOffset, b->bitCast(expanded));
     113
     114//    Value * remainingItems = b->CreateSub(loopItemsRemain, triplePackSize);
     115
     116//    loopItemsRemain->addIncoming(remainingItems, expand_3_4_loop);
     117//    Value * const nextStrideOffset = b->CreateAdd(strideOffset, ONE);
     118//    strideOffset->addIncoming(nextStrideOffset, expand_3_4_loop);
     119
     120//    //Value * continueLoop = b->CreateICmpSGT(remainingItems, ZERO);
     121//    Value * continueLoop = b->CreateICmpULT(nextStrideOffset, numOfBlocks);
     122//    b->CreateCondBr(continueLoop, expand_3_4_loop, expand3_4_exit);
     123
     124//    b->SetInsertPoint(expand3_4_exit);
     125
     126//}
     127
     128void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &b, Value * const numOfStrides) {
     129
     130    BasicBlock * expand2_3entry = b->GetInsertBlock();
     131    BasicBlock * expand_3_4_loop = b->CreateBasicBlock("expand_3_4_loop");
     132    BasicBlock * expand3_4_exit = b->CreateBasicBlock("expand3_4_exit");
     133
    47134    // Determine the required shufflevector constants.
    48     const unsigned PACK_SIZE = iBuilder->getBitBlockWidth()/8;
    49    
     135    const unsigned PACK_SIZE = b->getBitBlockWidth()/8;
     136
     137    ConstantInt * const ZERO = b->getSize(0);
     138    ConstantInt * const ONE = b->getSize(1);
     139    ConstantInt * const THREE = b->getSize(3);
     140    ConstantInt * const FOUR = b->getSize(4);
     141    ConstantInt * const SEVEN = b->getSize(7);
     142
    50143    // Construct a list of indexes in the form
    51144    // 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, ...
     
    61154        std::vector<Constant *> Idxs;
    62155        for (unsigned i = 0; i < PACK_SIZE; i++) {
    63             Idxs.push_back(ConstantInt::get(iBuilder->getInt32Ty(), expand3_4_offset[j] + expand3_4_index[i]));
     156            Idxs.push_back(ConstantInt::get(b->getInt32Ty(), expand3_4_offset[j] + expand3_4_index[i]));
    64157        }
    65158        expand_3_4_shuffle[j] = ConstantVector::get(Idxs);
    66159    }
    67160
    68     Constant * triplePackSize = iBuilder->getSize(3 * PACK_SIZE); // 3 packs per loop.
    69     UndefValue * undefPack = UndefValue::get(iBuilder->fwVectorType(8));
    70    
    71     const unsigned packAlign = iBuilder->getBitBlockWidth()/8;
    72 
    73     Value * itemsToDo = mAvailableItemCount[0];
    74 
    75     Value * sourceStream = iBuilder->getInputStreamBlockPtr("sourceStream", iBuilder->getInt32(0));
    76     Value * expandedStream = iBuilder->getOutputStreamBlockPtr("expand34Stream", iBuilder->getInt32(0));
    77 
     161    UndefValue * undefPack = UndefValue::get(b->fwVectorType(8));
     162    Value * const numOfBlocks = b->CreateMul(numOfStrides, b->getSize(8));
    78163    // The main loop processes 3 packs of data at a time.
    79     // The initial pack offsets may be nonzero.
    80     sourceStream = iBuilder->CreatePointerCast(sourceStream, iBuilder->getInt8PtrTy());
    81     expandedStream = iBuilder->CreatePointerCast(expandedStream, iBuilder->getInt8PtrTy());
    82     Value * offset = iBuilder->CreateURem(iBuilder->getProcessedItemCount("sourceStream"), iBuilder->getSize(iBuilder->getBitBlockWidth()));
    83     Value * sourcePackPtr = iBuilder->CreatePointerCast(iBuilder->CreateGEP(sourceStream, offset), iBuilder->getBitBlockType()->getPointerTo());
    84     offset = iBuilder->CreateURem(iBuilder->getProducedItemCount("expand34Stream"), iBuilder->getSize(iBuilder->getBitBlockWidth()));
    85     Value * outputPackPtr = iBuilder->CreatePointerCast(iBuilder->CreateGEP(expandedStream, offset), iBuilder->getBitBlockType()->getPointerTo());
    86     iBuilder->CreateCondBr(iBuilder->CreateICmpSGT(itemsToDo, iBuilder->getSize(0)), expand_3_4_loop, expand3_4_exit);
    87    
    88     iBuilder->SetInsertPoint(expand_3_4_loop);
    89     PHINode * loopInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
    90     PHINode * loopOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
    91     PHINode * loopItemsRemain = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
    92 
    93     loopInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
    94     loopOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
    95     loopItemsRemain->addIncoming(itemsToDo, expand2_3entry);
    96 
    97 
    98     // Step 1 of the main loop.
    99     Value * pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopInput_ptr, packAlign));
    100     Value * expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
    101     iBuilder->CreateBlockAlignedStore(expand0, loopOutput_ptr);
    102     // Step 2 of the main loop.
    103     Value * inPack1_ptr = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(1));
    104     Value * outPack1_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(1));
    105     Value * pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
    106     Value * expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1, expand_3_4_shuffle[1]));
    107     iBuilder->CreateBlockAlignedStore(expand1, outPack1_ptr);
    108     // Step 3 of the main loop.
    109     Value * inPack2_ptr = iBuilder->CreateGEP(loopInput_ptr, iBuilder->getInt32(2));
    110     Value * outPack2_ptr = iBuilder->CreateGEP(loopOutput_ptr, iBuilder->getInt32(2));
    111     Value * pack2 = iBuilder->fwCast(8, iBuilder