Changeset 5856 for icGREP/icgrep-devel


Ignore:
Timestamp:
Feb 2, 2018, 2:49:08 PM (14 months ago)
Author:
nmedfort
Message:

Revised pipeline structure to better control I/O rates

Location:
icGREP/icgrep-devel/icgrep
Files:
4 added
22 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/base64.cpp

    r5771 r5856  
    6262
    6363    //Round up to a multiple of 3.
    64     const unsigned initSegSize = ((codegen::SegmentSize + 2)/3) * 3;
    65     const unsigned bufferSize = (4 * initSegSize * codegen::BufferSegments) / 3;
     64    const auto bufferSize = ((codegen::SegmentSize * codegen::BufferSegments * codegen::ThreadNum + 2) / 3) * 3;
    6665
    6766    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
    68 
    69     Kernel * mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder, initSegSize);
     67    Kernel * mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder);
    7068    mmapK->setInitialArguments({fileDescriptor});
    7169    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
    7270   
    73     StreamSetBuffer * Expanded3_4Out = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), bufferSize);
     71    const auto outputBufferSize = ((bufferSize + 2) / 3) * 4;
     72
     73    StreamSetBuffer * Expanded3_4Out = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), outputBufferSize);
    7474    Kernel * expandK = pxDriver.addKernelInstance<expand3_4Kernel>(iBuilder);
    7575    pxDriver.makeKernelCall(expandK, {ByteStream}, {Expanded3_4Out});
    7676   
    77     StreamSetBuffer * Radix64out = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), bufferSize);
     77    StreamSetBuffer * Radix64out = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), outputBufferSize);
    7878    Kernel * radix64K = pxDriver.addKernelInstance<radix64Kernel>(iBuilder);
    7979    pxDriver.makeKernelCall(radix64K, {Expanded3_4Out}, {Radix64out});
     
    8383        Kernel * base64K = pxDriver.addKernelInstance<base64Kernel>(iBuilder);
    8484        pxDriver.makeKernelCall(base64K, {Radix64out}, {Base64out});
    85     }
    86     else {
    87         StreamSetBuffer * Base64out = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), bufferSize);
     85    } else {
     86        StreamSetBuffer * Base64out = pxDriver.addBuffer<DynamicBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), outputBufferSize);
    8887        Kernel * base64K = pxDriver.addKernelInstance<base64Kernel>(iBuilder);
    89         pxDriver.makeKernelCall(base64K, {Radix64out}, {Base64out});
    90        
     88        pxDriver.makeKernelCall(base64K, {Radix64out}, {Base64out});       
    9189        Kernel * outK = pxDriver.addKernelInstance<StdOutKernel>(iBuilder, 8);
    9290        pxDriver.makeKernelCall(outK, {Base64out}, {});
  • icGREP/icgrep-devel/icgrep/character_deletion.cpp

    r5853 r5856  
    103103    iBuilder->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", main, 0));
    104104
    105 
    106105    // GeneratePipeline
    107106    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
    108107    StreamSetBuffer * BasisBits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), inputBufferBlocks);
    109108
    110     kernel::Kernel * sourceK = pxDriver.addKernelInstance<MemorySourceKernel>(iBuilder, iBuilder->getInt8PtrTy(), codegen::SegmentSize);
     109    kernel::Kernel * sourceK = pxDriver.addKernelInstance<MemorySourceKernel>(iBuilder, iBuilder->getInt8PtrTy());
    111110    sourceK->setInitialArguments({inputStream, fileSize});
    112111    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
     112
    113113    Kernel * s2pk = pxDriver.addKernelInstance<S2PKernel>(iBuilder, /*aligned = */ true);
    114114    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
    115 
    116115
    117116    StreamSetBuffer * const CharacterMarkerBuffer = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), inputBufferBlocks);
     
    152151    pxDriver.generatePipelineIR();
    153152
     153    pxDriver.deallocateBuffers();
     154
    154155    iBuilder->CreateRetVoid();
    155156
  • icGREP/icgrep-devel/icgrep/editd/editd.cpp

    r5847 r5856  
    258258
    259259    auto ChStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(4));
    260     auto mmapK = pxDriver.addKernelInstance<MemorySourceKernel>(idb, inputType, segmentSize);
     260    auto mmapK = pxDriver.addKernelInstance<MemorySourceKernel>(idb, inputType);
    261261    mmapK->setInitialArguments({inputStream, fileSize});
    262262    pxDriver.makeKernelCall(mmapK, {}, {ChStream});
     
    329329    auto ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
    330330
    331     auto mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder, segmentSize);
     331    auto mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder);
    332332    mmapK->setInitialArguments({fileDescriptor});
    333333    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
     
    371371    auto ByteStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, 8));
    372372
    373     auto mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(idb, segmentSize);
     373    auto mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(idb);
    374374    mmapK->setInitialArguments({fileDescriptor});
    375375    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
     
    437437
    438438    auto ChStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(4));
    439     auto mmapK = pxDriver.addKernelInstance<MemorySourceKernel>(idb, inputType, segmentSize);
     439    auto mmapK = pxDriver.addKernelInstance<MemorySourceKernel>(idb, inputType);
    440440    mmapK->setInitialArguments({inputStream, fileSize});
    441441    pxDriver.makeKernelCall(mmapK, {}, {ChStream});
     
    666666    Module * M = iBuilder->getModule();
    667667
    668     const unsigned segmentSize = codegen::SegmentSize;
    669     const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
    670 
    671668    Type * mBitBlockType = iBuilder->getBitBlockType();
    672669    Type * const size_ty = iBuilder->getSizeTy();
     
    684681
    685682    StreamSetBuffer * MatchResults = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(editDistance+1));
    686     kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(iBuilder, inputType, segmentSize * bufferSegments);
     683    kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(iBuilder, inputType);
    687684    sourceK->setInitialArguments({inputStream, fileSize});
    688685    pxDriver.makeKernelCall(sourceK, {}, {MatchResults});
  • icGREP/icgrep-devel/icgrep/grep_engine.cpp

    r5841 r5856  
    151151        REs[i] = regular_expression_passes(REs[i]);
    152152        if (CC_Multiplexing) {
    153             const std::vector<const re::CC *> UnicodeSets = re::collectUnicodeSets(REs[i]);
    154 
     153            const auto UnicodeSets = re::collectUnicodeSets(REs[i]);
    155154            StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    156155            if (UnicodeSets.size() <= 1) {
     
    166165                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
    167166                mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
     167//                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
     168//                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
    168169                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], std::vector<cc::Alphabet *>{mpx.get()});
    169170                mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams, CharClasses}, {MatchResults});
     
    219220    Module * M = idb->getModule();
    220221
    221     const auto segmentSize = codegen::SegmentSize;
    222     const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
    223 
    224222    const unsigned encodingBits = 8;
    225223
     
    233231
    234232    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
    235     kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
     233    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
    236234    sourceK->setInitialArguments({fileDescriptor});
    237235    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
     
    315313    Module * M = idb->getModule();
    316314
    317     const auto segmentSize = codegen::SegmentSize;
    318     const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
    319315    const unsigned encodingBits = 8;
    320316
     
    330326
    331327    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
    332     kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
     328    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
    333329    sourceK->setInitialArguments({fileDescriptor});
    334330    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
  • icGREP/icgrep-devel/icgrep/kernels/attributes.h

    r5793 r5856  
    203203        // a MultiBlock kernel will select the *maximum* input item count as its
    204204        // principal item length and zero-extend the streams accordingly.
     205
     206        CanTerminateEarly,
     207
     208        // Indicates that this kernel can call setTerminationSignal() to terminate the
     209        // kernel prior to processing all of its input streams.
    205210
    206211    };
     
    255260    friend Attribute ConditionalRegionBegin();
    256261    friend Attribute ConditionalRegionEnd();
     262    friend Attribute CanTerminateEarly();
    257263
    258264    Attribute(const KindId kind, const unsigned k) : mKind(kind), mAmount(k) { }
     
    343349}
    344350
     351inline Attribute CanTerminateEarly() {
     352    return Attribute(Attribute::KindId::CanTerminateEarly, 0);
     353}
     354
    345355}
    346356
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5852 r5856  
    761761    mStreamSetInputBaseAddress.resize(inputSetCount);
    762762
     763    Value * const initiallyFinal = mIsFinal;
     764
     765//    b->CallPrintInt(getName() + "_initiallyFinal", initiallyFinal);
     766
    763767    // Now proceed with creation of the doSegment method.
    764768    BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
     
    787791        const auto & name = input.getName();
    788792        Value * const processed = b->getProcessedItemCount(name);
     793
     794//        b->CallPrintInt(getName() + "_" + name + "_avail", mAvailableItemCount[i]);
     795//        b->CallPrintInt(getName() + "_" + name + "_processed", processed);
     796
    789797        mInitialProcessedItemCount[i] = processed;
    790798        mStreamSetInputBaseAddress[i] = b->getBlockAddress(name, b->CreateLShr(processed, LOG_2_BLOCK_WIDTH));
     
    793801                            getName() + ": " + name + " processed item count exceeds its available item count");
    794802        }
    795         Value * const unprocessed = b->CreateNUWSub(mAvailableItemCount[i], processed);
     803
     804        Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], processed);       
     805//        b->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed);
     806
     807        Value * const accessible = b->getLinearlyAccessibleItems(name, processed, unprocessed);
     808//        b->CallPrintInt(getName() + "_" + name + "_accessible", accessible);
     809
    796810        mAvailableItemCount[i] = unprocessed;
    797         Value * const accessible = b->getLinearlyAccessibleItems(name, processed, unprocessed);
     811
    798812        linearlyAccessible[i] = accessible;
    799813        inputStrideSize[i] = getStrideSize(b, input.getRate());
     
    809823    // P is the current processed position, L is the lookahead amount and n is our number of accessible strides ∈ ℤ+.
    810824    b->SetInsertPoint(checkInputAvailability);
    811     Value * const initiallyFinal = mIsFinal;
    812825    Value * linearlyCopyable[inputSetCount];
    813826    PHINode * selectedInputBuffer[inputSetCount];
     
    831844            if (LLVM_UNLIKELY(input.hasLookahead())) {
    832845                Constant * const lookahead = b->getSize(input.getLookahead());
    833                 strideSize = b->CreateNUWAdd(strideSize, lookahead);
     846                strideSize = b->CreateAdd(strideSize, lookahead);
    834847            }
    835848            Value * const requiresCopy = b->CreateICmpULT(accessible, strideSize);
     
    913926        const auto & name = output.getName();
    914927        Value * const produced = b->getProducedItemCount(name);
     928//        b->CallPrintInt(getName() + "_" + name + "_produced", produced);
     929
    915930        Value * baseBuffer = b->getBlockAddress(name, b->CreateLShr(produced, LOG_2_BLOCK_WIDTH));
    916931        mInitialProducedItemCount[i] = produced;
     
    935950                b->SetInsertPoint(prepareTempBuffer);
    936951                Value * const bufferSize = b->CreateMul(ConstantExpr::getSizeOf(tempBuffer->getAllocatedType()), tempBuffer->getArraySize());
    937                 b->CreateMemZero(tempBuffer, bufferSize, blockAlignment);
     952                b->CreateMemZero(tempBuffer, bufferSize, blockAlignment);               
    938953                b->CreateBr(resume);
    939954                // Select the appropriate buffer / stride #
     
    971986        const ProcessingRate & rate = input.getRate();
    972987        if (rate.isFixed() && input.nonDeferred()) {
     988//            b->CallPrintInt(getName() + "_" + input.getName() + "_processed (+)", mAvailableItemCount[i]);
    973989            Value * const ic = b->CreateAdd(mInitialProcessedItemCount[i], mAvailableItemCount[i]);
    974990            b->setProcessedItemCount(input.getName(), ic);
     
    981997        if (rate.isFixed()) {
    982998            Value * const produced = b->CreateMul(numOfStrides, outputStrideSize[i]);
    983             Value * const ic = b->CreateNUWAdd(mInitialProducedItemCount[i], produced);
     999            Value * const ic = b->CreateAdd(mInitialProducedItemCount[i], produced);
     1000//            b->CallPrintInt(getName() + "_" + output.getName() + "_produced (+)", produced);
    9841001            b->setProducedItemCount(output.getName(), ic);
    9851002        }
     
    10301047        Value * const offset = b->CreateAnd(mInitialProducedItemCount[i], BLOCK_WIDTH_MASK);
    10311048        //Value * const newProducedItemCount = b->getProducedItemCount(name);
    1032         Value * const newlyProduced = b->CreateNUWSub(produced, mInitialProducedItemCount[i]);
     1049        Value * const newlyProduced = b->CreateSub(produced, mInitialProducedItemCount[i]);
    10331050        Value * const toWrite = b->CreateUMin(newlyProduced, linearlyWritable[i]);
    10341051        const auto alignment = getItemAlignment(mStreamSetOutputs[i]);
     
    10381055
    10391056        b->SetInsertPoint(copyToFront);
    1040         Value * const remaining = b->CreateNUWSub(newlyProduced, toWrite);
     1057        Value * const remaining = b->CreateSub(newlyProduced, toWrite);
    10411058        Value * const baseAddress = b->getBaseAddress(name);
    10421059        b->CreateStreamCpy(name, baseAddress, ZERO, tempBuffer, toWrite, remaining, alignment);
     
    10481065    //  We've dealt with the partial block processing and copied information back into the
    10491066    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
    1050     BasicBlock * const setTermination = b->CreateBasicBlock("setTermination");
    1051     b->CreateCondBr(mIsFinal, setTermination, strideDone);
    1052     b->SetInsertPoint(setTermination);
    1053     b->setTerminationSignal();
    10541067    BasicBlock * const segmentDone = b->CreateBasicBlock("SegmentDone");
    1055     b->CreateBr(segmentDone);
     1068    if (canTerminateEarly()) {
     1069        mIsFinal = b->CreateOr(mIsFinal, b->getTerminationSignal());
     1070    }
     1071    b->CreateCondBr(mIsFinal, segmentDone, strideDone);
    10561072
    10571073    /// STRIDE DONE
     
    10661082        Value * const avail = mInitialAvailableItemCount[i];
    10671083        Value * const processed = b->getProcessedItemCount(name);
     1084//        b->CallPrintInt(getName() + "_" + name + "_processed'", processed);
     1085
    10681086        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    10691087            b->CreateAssert(b->CreateICmpULE(processed, avail), getName() + ": " + name + " processed data exceeds available data");
    10701088        }
    1071         Value * remaining = b->CreateSub(avail, processed);
     1089        Value * const remaining = b->CreateSub(avail, processed);
    10721090        Value * strideSize = inputStrideSize[i];
    10731091        if (LLVM_UNLIKELY(input.hasLookahead())) {
    1074             strideSize = b->CreateNUWAdd(strideSize, b->getSize(input.getLookahead()));
     1092            strideSize = b->CreateAdd(strideSize, b->getSize(input.getLookahead()));
    10751093        }
    10761094        Value * const hasRemainingStrides = b->CreateICmpUGE(remaining, strideSize);
    10771095        hasMoreStrides = b->CreateAnd(hasMoreStrides, hasRemainingStrides);
    10781096    }
     1097
    10791098    // even if we do not have enough input data for a full stride, if this is our final stride, allow it ...
    10801099    hasMoreStrides = b->CreateOr(hasMoreStrides, initiallyFinal);
     
    10931112            }
    10941113            Value * const unconsumed = b->CreateSub(produced, consumed);
    1095             Value * const capacity = b->getCapacity(name);
     1114            Value * const capacity = b->getBufferedSize(name);
    10961115            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    10971116                b->CreateAssert(b->CreateICmpULE(unconsumed, capacity),
     
    11131132
    11141133            b->SetInsertPoint(copyBack);
    1115             Value * const baseAddress = b->getBaseAddress(name);
    1116             const StreamSetBuffer * const buf = this->getAnyStreamSetBuffer(name);
    1117             const auto numOfStreams = buf->getNumOfStreams();
    1118             const auto itemWidth = getItemWidth(this->getBinding(name));
    1119 
    1120             const auto sizeByBit = b->CreateMul(b->CreateMul(b->getSize(itemWidth), bufferSize), b->getSize(numOfStreams));
    1121             const auto sizeByByte = b->CreateUDiv(sizeByBit, b->getSize(8));
    1122             const auto sourcePtr = b->CreateGEP(b->CreatePointerCast(baseAddress, b->getInt8PtrTy()), sizeByByte);
    1123             const auto targetPtr = b->CreatePointerCast(baseAddress, b->getInt8PtrTy());
    1124 
    1125             const auto itemsToBeCopyByBit = b->CreateMul(b->CreateMul(b->getSize(itemWidth), current), b->getSize(numOfStreams));
    1126             const auto itemsToBeCopyByByte = b->CreateUDiv(itemsToBeCopyByBit, b->getSize(8));
    1127             b->CreateMemCpy(targetPtr, sourcePtr, itemsToBeCopyByByte, 8);
    1128 
     1134            const auto copyAlignment = getItemAlignment(mStreamSetOutputs[i]);
     1135            Value * const startOfBuffer = b->getBaseAddress(name);
     1136            Value * const offset = b->CreateUDiv(bufferSize, b->getSize(b->getBitBlockWidth()));
     1137            Value * const endOfBuffer = b->CreateGEP(startOfBuffer, offset);
     1138            b->CreateStreamCpy(name, startOfBuffer, ZERO, endOfBuffer, ZERO, current, copyAlignment);
    11291139            b->CreateBr(done);
    11301140
     
    16181628, mCurrentMethod(nullptr)
    16191629, mAvailablePrincipalItemCount(nullptr)
    1620 , mIsGenerated(false)
    16211630, mStride(0)
    16221631, mIsFinal(nullptr)
    1623 , mOutputScalarResult(nullptr) {
     1632, mOutputScalarResult(nullptr)
     1633, mIsGenerated(false) {
    16241634
    16251635}
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r5831 r5856  
    203203
    204204    std::string getCacheName(const std::unique_ptr<KernelBuilder> & idb) const;
     205
     206    bool canTerminateEarly() const { return hasAttribute(Attribute::KindId::CanTerminateEarly); }
    205207
    206208protected:
     
    288290
    289291    llvm::Function *                    mCurrentMethod;
    290     llvm::Value *                       mAvailablePrincipalItemCount;
    291     bool                                mIsGenerated;
     292    llvm::Value *                       mAvailablePrincipalItemCount;   
    292293    unsigned                            mStride;
    293294    llvm::Value *                       mIsFinal;
    294295    llvm::Value *                       mOutputScalarResult;
     296    mutable bool                        mIsGenerated;
     297
    295298    std::vector<llvm::Value *>          mAvailableItemCount;
    296299
     
    300303    StreamMap                           mStreamMap;
    301304
     305    // TODO: once the kernel no longer needs to be aware of what type of buffers it's working with,
     306    // these should be removed from the Kernel class and put into the Pipeline
    302307    StreamSetBuffers                    mStreamSetInputBuffers;
    303308    std::vector<llvm::Value *>          mStreamSetInputBaseAddress;
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r5852 r5856  
    9898    const ProcessingRate & rate = mKernel->getBinding(name).getRate();
    9999    if (LLVM_UNLIKELY(rate.isDerived())) {
     100        assert (false);
    100101        report_fatal_error("Cannot set item count: " + name + " is a Derived rate");
    101102    }
     
    237238    Type * const fieldWidthTy = getIntNTy(fieldWidth);
    238239
    239     Value * const n = buf->getStreamSetCount(this, getStreamHandle(name));
     240    Value * n = buf->getStreamSetCount(this, getStreamHandle(name));
    240241
    241242    if (isConstantOne(n) || fieldWidth == blockWidth || (isConstantZero(targetOffset) && isConstantZero(sourceOffset))) {
    242         if (isConstantOne(n)) {
    243             if (LLVM_LIKELY(itemWidth < 8)) {
    244                 itemsToCopy = CreateUDivCeil(itemsToCopy, getSize(8 / itemWidth));
    245             } else if (LLVM_UNLIKELY(itemWidth > 8)) {
    246                 itemsToCopy = CreateMul(itemsToCopy, getSize(itemWidth / 8));
    247             }
    248         } else {
    249             if (LLVM_LIKELY(blockWidth > (itemWidth * 8))) {
    250                 itemsToCopy = CreateUDivCeil(itemsToCopy, getSize(blockWidth / (8 * itemWidth)));
    251             } else if (LLVM_LIKELY(blockWidth < (itemWidth * 8))) {
    252                 itemsToCopy = CreateUDivCeil(CreateMul(itemsToCopy, getSize(8)), getSize(blockWidth / itemWidth));
    253             }
     243        if (LLVM_LIKELY(itemWidth < 8)) {
     244            itemsToCopy = CreateUDivCeil(itemsToCopy, getSize(8 / itemWidth));
     245        } else if (LLVM_UNLIKELY(itemWidth > 8)) {
     246            itemsToCopy = CreateMul(itemsToCopy, getSize(itemWidth / 8));
     247        }
     248        if (!isConstantOne(n)) {
    254249            itemsToCopy = CreateMul(itemsToCopy, n);
    255250        }
     
    260255
    261256    } else { // either the target offset or source offset is non-zero but not both
     257
    262258        VectorType * const blockTy = getBitBlockType();
    263259        PointerType * const blockPtrTy = blockTy->getPointerTo();
     
    448444
    449445StoreInst * KernelBuilder::storeOutputStreamBlock(const std::string & name, Value * streamIndex, Value * toStore) {
    450     return CreateBlockAlignedStore(toStore, getOutputStreamBlockPtr(name, streamIndex));
     446    Value * const ptr = getOutputStreamBlockPtr(name, streamIndex);
     447    Type * const storeTy = toStore->getType();
     448    Type * const ptrElemTy = ptr->getType()->getPointerElementType();
     449    if (LLVM_UNLIKELY(storeTy != ptrElemTy)) {
     450        if (LLVM_LIKELY(storeTy->canLosslesslyBitCastTo(ptrElemTy))) {
     451            toStore = CreateBitCast(toStore, ptrElemTy);
     452        } else {
     453            std::string tmp;
     454            raw_string_ostream out(tmp);
     455            out << "invalid type conversion when calling storeOutputStreamBlock on " <<  name << ": ";
     456            ptrElemTy->print(out);
     457            out << " vs. ";
     458            storeTy->print(out);
     459        }
     460    }
     461    return CreateBlockAlignedStore(toStore, ptr);
    451462}
    452463
     
    463474
    464475StoreInst * KernelBuilder::storeOutputStreamPack(const std::string & name, Value * streamIndex, Value * packIndex, Value * toStore) {
    465     return CreateBlockAlignedStore(toStore, getOutputStreamPackPtr(name, streamIndex, packIndex));
     476    Value * const ptr = getOutputStreamPackPtr(name, streamIndex, packIndex);
     477    Type * const storeTy = toStore->getType();
     478    Type * const ptrElemTy = ptr->getType()->getPointerElementType();
     479    if (LLVM_UNLIKELY(storeTy != ptrElemTy)) {
     480        if (LLVM_LIKELY(storeTy->canLosslesslyBitCastTo(ptrElemTy))) {
     481            toStore = CreateBitCast(toStore, ptrElemTy);
     482        } else {
     483            std::string tmp;
     484            raw_string_ostream out(tmp);
     485            out << "invalid type conversion when calling storeOutputStreamPack on " <<  name << ": ";
     486            ptrElemTy->print(out);
     487            out << " vs. ";
     488            storeTy->print(out);
     489        }
     490    }
     491    return CreateBlockAlignedStore(toStore, ptr);
    466492}
    467493
     
    548574}
    549575
     576void KernelBuilder::doubleCapacity(const std::string & name) {
     577    const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
     578    return buf->doubleCapacity(this, getStreamHandle(name));
     579}
     580
    550581BasicBlock * KernelBuilder::CreateConsumerWait() {
    551582    const auto consumers = mKernel->getStreamOutputs();
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.h

    r5830 r5856  
    123123    llvm::Value * getLinearlyWritableItems(const std::string & name, llvm::Value * fromPos, bool reverse = false);
    124124   
    125     void CreateStreamCpy(const std::string & name, llvm::Value * const target, llvm::Value * const targetOffset, llvm::Value * const source, llvm::Value * const sourceOffset, llvm::Value * const itemsToCopy, const unsigned itemAlignment);
     125    void CreateStreamCpy(const std::string & name, llvm::Value * const target, llvm::Value * const targetOffset, llvm::Value * const source, llvm::Value * const sourceOffset, llvm::Value * const itemsToCopy, const unsigned itemAlignment);   
    126126
    127127    llvm::BasicBlock * CreateConsumerWait();
     
    146146
    147147    void protectOutputStream(const std::string & name, const bool readOnly);
     148
     149    void doubleCapacity(const std::string & name);
    148150
    149151protected:
  • icGREP/icgrep-devel/icgrep/kernels/linebreak_kernel.cpp

    r5847 r5856  
    7777    PabloAST * removedCRLF = crb.createAnd(LineBreak, crb.createNot(CRLF));
    7878    crb.createAssign(LineBreak, removedCRLF);
     79
     80
    7981    // Record the CR marker of any CR+LF
    8082    pb.createAssign(pb.createExtract(getOutput(1), ZERO), CRLF);
  • icGREP/icgrep-devel/icgrep/kernels/source_kernel.cpp

    r5771 r5856  
    99#include <sys/stat.h>
    1010#include <fcntl.h>
     11#include <toolchain/toolchain.h>
    1112
    1213using namespace llvm;
     
    2829}
    2930
    30 void MMapSourceKernel::generateInitializeMethod(Function * const fileSizeMethod, const unsigned codeUnitWidth, const unsigned segmentSize, const std::unique_ptr<KernelBuilder> & kb) {
    31     BasicBlock * const emptyFile = kb->CreateBasicBlock("EmptyFile");
     31void MMapSourceKernel::generateInitializeMethod(Function * const fileSizeMethod, const unsigned codeUnitWidth, const std::unique_ptr<KernelBuilder> & kb) {
     32
     33    BasicBlock * const emptyFile = kb->CreateBasicBlock("emptyFile");
    3234    BasicBlock * const nonEmptyFile = kb->CreateBasicBlock("NonEmptyFile");
    3335    BasicBlock * const exit = kb->CreateBasicBlock("Exit");
     
    3537    Value * const fd = kb->getScalarField("fileDescriptor");
    3638    assert (fileSizeMethod);
    37     Value * fileSize = kb->CreateCall(fileSizeMethod, fd);
    38     fileSize = kb->CreateZExtOrTrunc(fileSize, sizeTy);
     39    Value * fileSize = kb->CreateZExtOrTrunc(kb->CreateCall(fileSizeMethod, fd), sizeTy);
     40    kb->CreateLikelyCondBr(kb->CreateIsNotNull(fileSize), nonEmptyFile, emptyFile);
     41
     42    kb->SetInsertPoint(nonEmptyFile);
     43    PointerType * const codeUnitPtrTy = kb->getIntNTy(codeUnitWidth)->getPointerTo();
     44    Value * const fileBuffer = kb->CreatePointerCast(kb->CreateFileSourceMMap(fd, fileSize), codeUnitPtrTy);
     45    kb->setScalarField("buffer", fileBuffer);   
     46    kb->setBaseAddress("sourceBuffer", fileBuffer);   
     47    kb->CreateMAdvise(fileBuffer, fileSize, CBuilder::ADVICE_WILLNEED);
    3948    if (codeUnitWidth > 8) {
    4049        fileSize = kb->CreateUDiv(fileSize, kb->getSize(codeUnitWidth / 8));
    4150    }
    42     Value * const isEmpty = kb->CreateICmpEQ(fileSize, ConstantInt::getNullValue(fileSize->getType()));
    43     kb->CreateUnlikelyCondBr(isEmpty, emptyFile, nonEmptyFile);
    44     // we cannot mmap a 0 length file; just create a 1-page sized fake file buffer for simplicity
     51    kb->setBufferedSize("sourceBuffer", fileSize);
     52    kb->setScalarField("fileSize", fileSize);
     53    kb->setProducedItemCount("sourceBuffer", fileSize);
     54    kb->setCapacity("sourceBuffer", fileSize);
     55    kb->CreateBr(exit);
     56
    4557    kb->SetInsertPoint(emptyFile);
    46     Constant * const readSize = kb->getSize(segmentSize);
    47     Value * const fakeFileBuffer = kb->CreateAnonymousMMap(readSize);
     58    kb->setTerminationSignal();
    4859    kb->CreateBr(exit);
    4960
    50     kb->SetInsertPoint(nonEmptyFile);
    51     Value * fileBackedBuffer = kb->CreateFileSourceMMap(fd, fileSize);
    52     kb->CreateBr(exit);
    53 
    5461    kb->SetInsertPoint(exit);
    55     PHINode * const buffer = kb->CreatePHI(fileBackedBuffer->getType(), 2);
    56     buffer->addIncoming(fakeFileBuffer, emptyFile);
    57     buffer->addIncoming(fileBackedBuffer, nonEmptyFile);
    58     PHINode * const bufferSize = kb->CreatePHI(sizeTy, 2);
    59     bufferSize->addIncoming(readSize, emptyFile);
    60     bufferSize->addIncoming(fileSize, nonEmptyFile);
    61 
    62     PointerType * const codeUnitPtrTy = kb->getIntNTy(codeUnitWidth)->getPointerTo();
    63     Value * bufferPtr = kb->CreatePointerCast(buffer, codeUnitPtrTy);
    64     kb->setScalarField("buffer", bufferPtr);
    65     kb->setScalarField("fileSize", fileSize);
    66 
    67     kb->setBaseAddress("sourceBuffer", bufferPtr);
    68     kb->setBufferedSize("sourceBuffer", bufferSize);
    69     kb->setCapacity("sourceBuffer", fileSize);
    70     kb->CreateMAdvise(buffer, fileSize, CBuilder::ADVICE_WILLNEED);
    71 
    72 }
    73 
    74 void MMapSourceKernel::generateDoSegmentMethod(const unsigned codeUnitWidth, const unsigned segmentSize, const std::unique_ptr<KernelBuilder> & kb) {
     62}
     63
     64void MMapSourceKernel::generateDoSegmentMethod(const unsigned codeUnitWidth, const std::unique_ptr<KernelBuilder> & kb) {
    7565
    7666    BasicBlock * dropPages = kb->CreateBasicBlock("dropPages");
    77     BasicBlock * processSegment = kb->CreateBasicBlock("produceData");
     67    BasicBlock * checkRemaining = kb->CreateBasicBlock("checkRemaining");
    7868    BasicBlock * setTermination = kb->CreateBasicBlock("setTermination");
    79     BasicBlock * mmapSourceExit = kb->CreateBasicBlock("mmapSourceExit");
    80 
    81     Constant * const readSize = kb->getSize(segmentSize);
     69    BasicBlock * exit = kb->CreateBasicBlock("mmapSourceExit");
     70
     71    Value * const fileSize = kb->getScalarField("fileSize");
    8272    Constant * const pageSize = kb->getSize(getpagesize());
    8373
     
    8575    consumed = kb->CreateMul(consumed, kb->getSize(codeUnitWidth / 8));
    8676    consumed = kb->CreateAnd(consumed, ConstantExpr::getNeg(pageSize));
    87 
    8877    Value * const consumedBuffer = kb->getRawOutputPointer("sourceBuffer", consumed);
    8978    Value * const readableBuffer = kb->getScalarField("buffer");
     
    9180
    9281    // avoid calling madvise unless an actual page table change could occur
    93     kb->CreateLikelyCondBr(kb->CreateIsNotNull(unnecessaryBytes), processSegment, dropPages);
     82    kb->CreateLikelyCondBr(kb->CreateIsNotNull(unnecessaryBytes), dropPages, checkRemaining);
    9483
    9584    kb->SetInsertPoint(dropPages);
     
    9786    kb->CreateMAdvise(readableBuffer, unnecessaryBytes, CBuilder::ADVICE_DONTNEED);
    9887    kb->setScalarField("buffer", kb->CreateGEP(readableBuffer, unnecessaryBytes));
    99     kb->CreateBr(processSegment);
     88    kb->CreateBr(checkRemaining);
    10089
    10190    // determine whether or not we've exhausted the file buffer
    102     kb->SetInsertPoint(processSegment);
    103     Value * const fileSize = kb->getScalarField("fileSize");
    104     Value * const produced = kb->CreateAdd(kb->getProducedItemCount("sourceBuffer"), readSize);
    105     Value * const lessThanFullSegment = kb->CreateICmpULT(fileSize, produced);
    106     kb->CreateUnlikelyCondBr(lessThanFullSegment, setTermination, mmapSourceExit);
     91    kb->SetInsertPoint(checkRemaining);
     92    Value * const remaining = kb->CreateSub(fileSize, consumed);
     93    Value * const lastPage = kb->CreateICmpULE(remaining, pageSize);
     94    kb->CreateUnlikelyCondBr(lastPage, setTermination, exit);
    10795
    10896    kb->SetInsertPoint(setTermination);
    10997    kb->setTerminationSignal();
    110     kb->CreateBr(mmapSourceExit);
     98    kb->CreateBr(exit);
    11199
    112100    // finally, set the "produced" count to reflect current position in the file
    113     kb->SetInsertPoint(mmapSourceExit);
    114     PHINode * itemsRead = kb->CreatePHI(produced->getType(), 2);
    115     itemsRead->addIncoming(produced, processSegment);
    116     itemsRead->addIncoming(fileSize, setTermination);
    117     kb->setProducedItemCount("sourceBuffer", itemsRead);
    118 
     101    kb->SetInsertPoint(exit);
    119102}
    120103
     
    123106}
    124107
    125 MMapSourceKernel::MMapSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned stridesPerSegment, const unsigned codeUnitWidth)
    126 : SegmentOrientedKernel("mmap_source" + std::to_string(stridesPerSegment) + "@" + std::to_string(codeUnitWidth),
    127 {},
    128 {Binding{kb->getStreamSetTy(1, codeUnitWidth), "sourceBuffer", FixedRate(), Deferred()}},
    129 {Binding{kb->getInt32Ty(), "fileDescriptor"}},
    130 {Binding{kb->getSizeTy(), "fileSize"}}, {Binding{kb->getIntNTy(codeUnitWidth)->getPointerTo(), "buffer"}})
    131 , mStridesPerSegment(stridesPerSegment)
    132 , mCodeUnitWidth(codeUnitWidth)
    133 , mFileSizeFunction(nullptr) {
    134 
    135 }
    136 
    137108/// READ SOURCE KERNEL
    138109
    139 void ReadSourceKernel::generateInitializeMethod(const unsigned codeUnitWidth, const unsigned segmentSize, const std::unique_ptr<KernelBuilder> & b) {
     110void ReadSourceKernel::generateInitializeMethod(const unsigned codeUnitWidth, const unsigned stride, const std::unique_ptr<KernelBuilder> & b) {
    140111    const unsigned pageSize = getpagesize();
    141     const auto bufferSize = std::max(pageSize * 8, segmentSize * 4);
     112    const auto bufferSize = std::max(pageSize * 8, codegen::SegmentSize * stride * 4);
    142113    ConstantInt * const bufferItems = b->getSize(bufferSize);
    143114    const auto codeUnitSize = codeUnitWidth / 8;
     
    150121}
    151122
    152 void ReadSourceKernel::generateDoSegmentMethod(const unsigned codeUnitWidth, const unsigned segmentSize, const std::unique_ptr<KernelBuilder> & b) {
     123void ReadSourceKernel::generateDoSegmentMethod(const unsigned codeUnitWidth, const unsigned stride, const std::unique_ptr<KernelBuilder> & b) {
    153124
    154125    const unsigned pageSize = getpagesize();
    155     ConstantInt * const itemsToRead = b->getSize(std::max(pageSize, segmentSize * 2));
     126    ConstantInt * const itemsToRead = b->getSize(std::max(pageSize, codegen::SegmentSize * stride * 2));
    156127    ConstantInt * const codeUnitBytes = b->getSize(codeUnitWidth / 8);
    157     ConstantInt * const itemsPerSegment = b->getSize(segmentSize);
     128    ConstantInt * const itemsPerSegment = b->getSize(codegen::SegmentSize * stride);
    158129
    159130    BasicBlock * const entry = b->GetInsertBlock();
     
    213184    b->CreateMemCpy(expandedBuffer, unreadData, remainingBytes, blockSize);
    214185    b->setScalarField("buffer", expandedBuffer);
    215     b->setCapacity("sourceBuffer", expandedCapacity); 
     186    b->setCapacity("sourceBuffer", expandedCapacity);
    216187    b->CreateFree(baseBuffer);
    217188    b->CreateBr(prepareBuffer);
     
    236207    b->CreateUnlikelyCondBr(b->CreateICmpULT(itemsBuffered, itemsPending), setTermination, readExit);
    237208
    238     // ... set the termination signal.   
     209    // ... set the termination signal.
    239210    b->SetInsertPoint(setTermination);
    240211    Value * const bytesToZero = b->CreateMul(b->CreateSub(itemsPending, itemsBuffered), codeUnitBytes);
     
    254225void ReadSourceKernel::freeBuffer(const std::unique_ptr<KernelBuilder> & kb) {
    255226    kb->CreateFree(kb->getScalarField("buffer"));
    256 }
    257 
    258 ReadSourceKernel::ReadSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned stridesPerSegment, const unsigned codeUnitWidth)
    259 : SegmentOrientedKernel("read_source"  + std::to_string(stridesPerSegment) + "@" + std::to_string(codeUnitWidth)
    260 , {}
    261 , {Binding{b->getStreamSetTy(1, codeUnitWidth), "sourceBuffer", FixedRate(), Deferred()}}
    262 , {Binding{b->getInt32Ty(), "fileDescriptor"}}
    263 , {}
    264 , {Binding{b->getIntNTy(codeUnitWidth)->getPointerTo(), "buffer"}})
    265 , mStridesPerSegment(stridesPerSegment)
    266 , mCodeUnitWidth(codeUnitWidth) {
    267 
    268227}
    269228
     
    296255    kb->CreateCondBr(kb->CreateICmpEQ(kb->getScalarField("fileDescriptor"), kb->getInt32(STDIN_FILENO)), initializeRead, initializeMMap);
    297256    kb->SetInsertPoint(initializeRead);
    298     ReadSourceKernel::generateInitializeMethod(mCodeUnitWidth, mStride * mStridesPerSegment, kb);
     257    ReadSourceKernel::generateInitializeMethod(mCodeUnitWidth, getStride(), kb);
    299258    kb->CreateBr(initializeDone);
    300259    kb->SetInsertPoint(initializeMMap);
    301     MMapSourceKernel::generateInitializeMethod(mFileSizeFunction, mCodeUnitWidth, mStride * mStridesPerSegment, kb);
     260    MMapSourceKernel::generateInitializeMethod(mFileSizeFunction, mCodeUnitWidth, kb);
    302261    kb->CreateBr(initializeDone);
    303262    kb->SetInsertPoint(initializeDone);
     
    311270    kb->CreateCondBr(kb->CreateICmpEQ(kb->getScalarField("fileDescriptor"), kb->getInt32(STDIN_FILENO)), DoSegmentRead, DoSegmentMMap);
    312271    kb->SetInsertPoint(DoSegmentRead);
    313     ReadSourceKernel::generateDoSegmentMethod(mCodeUnitWidth, mStride * mStridesPerSegment, kb);
     272    ReadSourceKernel::generateDoSegmentMethod(mCodeUnitWidth, getStride(), kb);
    314273    kb->CreateBr(DoSegmentDone);
    315274    kb->SetInsertPoint(DoSegmentMMap);
    316     MMapSourceKernel::generateDoSegmentMethod(mCodeUnitWidth, mStride * mStridesPerSegment, kb);
     275    MMapSourceKernel::generateDoSegmentMethod(mCodeUnitWidth, kb);
    317276    kb->CreateBr(DoSegmentDone);
    318277    kb->SetInsertPoint(DoSegmentDone);
    319278}
    320279
    321 FDSourceKernel::FDSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned stridesPerSegment, const unsigned codeUnitWidth)
    322 : SegmentOrientedKernel("FD_source" + std::to_string(stridesPerSegment) + "@" + std::to_string(codeUnitWidth)
    323 , {}
    324 , {Binding{kb->getStreamSetTy(1, codeUnitWidth), "sourceBuffer", FixedRate(), Deferred()}}
    325 , {Binding{kb->getInt32Ty(), "fileDescriptor"}}
    326 , {}
    327 , {Binding{kb->getIntNTy(codeUnitWidth)->getPointerTo(), "buffer"}, Binding{kb->getSizeTy(), "fileSize"}})
    328 , mStridesPerSegment(stridesPerSegment)
    329 , mCodeUnitWidth(codeUnitWidth)
    330 , mFileSizeFunction(nullptr) {
    331 
    332 }
    333280
    334281/// MEMORY SOURCE KERNEL
     
    341288    kb->setBufferedSize("sourceBuffer", fileItems);
    342289    kb->setCapacity("sourceBuffer", fileItems);
     290    kb->setProducedItemCount("sourceBuffer", fileItems);
     291    kb->setTerminationSignal();
    343292}
    344293
    345294void MemorySourceKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) {
    346295
    347     BasicBlock * entryBlock = kb->GetInsertBlock();
    348     BasicBlock * setTermination = kb->CreateBasicBlock("setTermination");
    349     BasicBlock * mmapSourceExit = kb->CreateBasicBlock("sourceExit");
    350 
    351     ConstantInt * const segmentItems = kb->getSize(mStride * mStridesPerSegment);
    352     Value * const fileItems = kb->getBufferedSize("sourceBuffer");
    353     Value * const produced = kb->CreateAdd(kb->getProducedItemCount("sourceBuffer"), segmentItems);
    354 
    355     kb->CreateUnlikelyCondBr(kb->CreateICmpULT(fileItems, produced), setTermination, mmapSourceExit);
    356 
    357     kb->SetInsertPoint(setTermination);
    358     kb->setTerminationSignal();
    359     kb->CreateBr(mmapSourceExit);
    360 
    361     kb->SetInsertPoint(mmapSourceExit);
    362     PHINode * itemsRead = kb->CreatePHI(produced->getType(), 2);
    363     itemsRead->addIncoming(produced, entryBlock);
    364     itemsRead->addIncoming(fileItems, setTermination);
    365     kb->setProducedItemCount("sourceBuffer", itemsRead);
    366 }
    367 
    368 MemorySourceKernel::MemorySourceKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, Type * const type, const unsigned stridesPerSegment, const unsigned codeUnitWidth)
     296}
     297
     298MMapSourceKernel::MMapSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned codeUnitWidth)
     299: SegmentOrientedKernel("mmap_source@" + std::to_string(codeUnitWidth),
     300{},
     301{Binding{b->getStreamSetTy(1, codeUnitWidth), "sourceBuffer", FixedRate(), Deferred()}},
     302{Binding{b->getInt32Ty(), "fileDescriptor"}},
     303{Binding{b->getSizeTy(), "fileSize"}}, {Binding{b->getIntNTy(codeUnitWidth)->getPointerTo(), "buffer"}})
     304, mCodeUnitWidth(codeUnitWidth)
     305, mFileSizeFunction(nullptr) {
     306    addAttribute(CanTerminateEarly());
     307}
     308
     309
     310ReadSourceKernel::ReadSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned codeUnitWidth)
     311: SegmentOrientedKernel("read_source" + std::to_string(codegen::SegmentSize) + "@" + std::to_string(codeUnitWidth)
     312, {}
     313, {Binding{b->getStreamSetTy(1, codeUnitWidth), "sourceBuffer", FixedRate(), Deferred()}}
     314, {Binding{b->getInt32Ty(), "fileDescriptor"}}
     315, {}
     316, {Binding{b->getIntNTy(codeUnitWidth)->getPointerTo(), "buffer"}})
     317, mCodeUnitWidth(codeUnitWidth) {
     318    addAttribute(CanTerminateEarly());
     319}
     320
     321
     322FDSourceKernel::FDSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned codeUnitWidth)
     323: SegmentOrientedKernel("FD_source@" + std::to_string(codeUnitWidth)
     324, {}
     325, {Binding{kb->getStreamSetTy(1, codeUnitWidth), "sourceBuffer", FixedRate(), Deferred()}}
     326, {Binding{kb->getInt32Ty(), "fileDescriptor"}}
     327, {}
     328, {Binding{kb->getIntNTy(codeUnitWidth)->getPointerTo(), "buffer"}, Binding{kb->getSizeTy(), "fileSize"}})
     329, mCodeUnitWidth(codeUnitWidth)
     330, mFileSizeFunction(nullptr) {
     331    addAttribute(CanTerminateEarly());
     332}
     333
     334MemorySourceKernel::MemorySourceKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, Type * const type, const unsigned codeUnitWidth)
    369335: SegmentOrientedKernel("memory_source",
    370336    {},
    371337    {Binding{kb->getStreamSetTy(1, codeUnitWidth), "sourceBuffer"}},
    372338    {Binding{cast<PointerType>(type), "fileSource"}, Binding{kb->getSizeTy(), "fileSize"}}, {}, {})
    373 , mStridesPerSegment(stridesPerSegment)
    374339, mCodeUnitWidth(codeUnitWidth) {
    375 
    376 }
    377 
    378 }
     340    addAttribute(CanTerminateEarly());
     341}
     342
     343}
  • icGREP/icgrep-devel/icgrep/kernels/source_kernel.h

    r5761 r5856  
    1414    friend class FDSourceKernel;
    1515public:
    16     MMapSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned stridesPerSegment = 1, const unsigned codeUnitWidth = 8);
     16    MMapSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned codeUnitWidth = 8);
    1717    bool isCachable() const override { return true; }
    1818    bool hasSignature() const override { return false; }
     
    2121    }
    2222    void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
    23         generateInitializeMethod(mFileSizeFunction, mCodeUnitWidth, mStride * mStridesPerSegment, iBuilder);
     23        generateInitializeMethod(mFileSizeFunction, mCodeUnitWidth, iBuilder);
    2424    }
    2525    void generateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
    26         generateDoSegmentMethod(mCodeUnitWidth, mStride * mStridesPerSegment, iBuilder);
     26        generateDoSegmentMethod(mCodeUnitWidth, iBuilder);
    2727    }
    2828    void generateFinalizeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
     
    3131protected:
    3232    static llvm::Function * linkFileSizeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    33     static void generateInitializeMethod(llvm::Function * fileSize, const unsigned codeUnitWidth, const unsigned segmentSize, const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    34     static void generateDoSegmentMethod(const unsigned codeUnitWidth, const unsigned segmentSize, const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     33    static void generateInitializeMethod(llvm::Function * fileSize, const unsigned codeUnitWidth, const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     34    static void generateDoSegmentMethod(const unsigned codeUnitWidth, const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    3535    static void unmapSourceBuffer(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    3636protected:
    37     const unsigned mStridesPerSegment;
    3837    const unsigned mCodeUnitWidth;
    3938    llvm::Function * mFileSizeFunction;
     
    4342    friend class FDSourceKernel;
    4443public:
    45     ReadSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned stridesPerSegment = 1, const unsigned codeUnitWidth = 8);
     44    ReadSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned codeUnitWidth = 8);
    4645    bool isCachable() const override { return true; }
    4746    bool hasSignature() const override { return false; }
    4847    void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
    49         generateInitializeMethod(mCodeUnitWidth, mStride * mStridesPerSegment, iBuilder);
     48        generateInitializeMethod(mCodeUnitWidth, getStride(), iBuilder);
    5049    }
    5150    void generateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
    52         generateDoSegmentMethod(mCodeUnitWidth, mStride * mStridesPerSegment, iBuilder);
     51        generateDoSegmentMethod(mCodeUnitWidth, getStride(), iBuilder);
    5352    }
    5453    void generateFinalizeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override {
     
    5655    }
    5756protected:
    58     static void generateInitializeMethod(const unsigned codeUnitWidth, const unsigned segmentSize, const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    59     static void generateDoSegmentMethod(const unsigned codeUnitWidth, const unsigned segmentSize, const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     57    static void generateInitializeMethod(const unsigned codeUnitWidth, const unsigned stride, const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
     58    static void generateDoSegmentMethod(const unsigned codeUnitWidth, const unsigned stride, const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    6059    static void freeBuffer(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    6160private:
    62     const unsigned mStridesPerSegment;
    6361    const unsigned mCodeUnitWidth;
    6462};
     
    6664class FDSourceKernel final : public SegmentOrientedKernel {
    6765public:
    68     FDSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned stridesPerSegment = 1, const unsigned codeUnitWidth = 8);
     66    FDSourceKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const unsigned codeUnitWidth = 8);
    6967    bool isCachable() const override { return true; }
    7068    bool hasSignature() const override { return false; }
     
    7472    void generateFinalizeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
    7573protected:
    76     const unsigned mStridesPerSegment;
    7774    const unsigned mCodeUnitWidth;
    7875    llvm::Function * mFileSizeFunction;
     
    8178class MemorySourceKernel final : public SegmentOrientedKernel {
    8279public:
    83     MemorySourceKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Type * const type, const unsigned stridesPerSegment = 1, const unsigned codeUnitWidth = 8);
     80    MemorySourceKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Type * const type, const unsigned codeUnitWidth = 8);
    8481    bool hasSignature() const override { return false; }
    8582protected:
     
    8784    void generateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
    8885private:
    89     const unsigned mStridesPerSegment;
    9086    const unsigned mCodeUnitWidth;
    9187};
  • icGREP/icgrep-devel/icgrep/kernels/streamset.cpp

    r5836 r5856  
    116116    }
    117117    return b->getSize(count);
     118}
     119
     120void StreamSetBuffer::doubleCapacity(IDISA::IDISA_Builder * const /* b */, Value */* handle */) const {
     121    report_fatal_error("doubleCapacity is not supported by this buffer type");
    118122}
    119123
     
    702706//  consumer and producer positions.
    703707//
    704 void DynamicBuffer::doubleCapacity(IDISA::IDISA_Builder * const b, Value * const handle) {
     708void DynamicBuffer::doubleCapacity(IDISA::IDISA_Builder * const b, Value * const handle) const {
    705709    size_t numStreams = 1;
    706710    if (isa<ArrayType>(mBaseType)) {
  • icGREP/icgrep-devel/icgrep/kernels/streamset.h

    r5843 r5856  
    112112    virtual llvm::Value * getLinearlyWritableItems(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * fromPosition, llvm::Value * consumed, bool reverse = false) const;
    113113   
     114    virtual void doubleCapacity(IDISA::IDISA_Builder * const b, llvm::Value * handle) const;
     115
    114116    bool supportsCopyBack() const {
    115117        return mOverflowBlocks != 0;
     
    312314   
    313315// Dynamically allocated circular buffers: TODO: add copyback, swizzle support, dynamic allocation, producer, consumer, length
    314 class DynamicBuffer: public StreamSetBuffer {
     316class DynamicBuffer final : public StreamSetBuffer {
    315317public:
    316318    static inline bool classof(const StreamSetBuffer * b) {return b->getBufferKind() == BufferKind::DynamicBuffer;}
     
    324326    void allocateBuffer(const std::unique_ptr<kernel::KernelBuilder> & b) override;
    325327
    326     void releaseBuffer(const std::unique_ptr<kernel::KernelBuilder> & kb) const override;
     328    void releaseBuffer(const std::unique_ptr<kernel::KernelBuilder> & b) const override;
    327329
    328330    llvm::Value * getRawItemPointer(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * absolutePosition) const override;
     
    330332    llvm::Value * getBufferedSize(IDISA::IDISA_Builder * const b, llvm::Value * handle) const override;
    331333   
    332     void doubleCapacity(IDISA::IDISA_Builder * const b, llvm::Value * handle);
     334    void doubleCapacity(IDISA::IDISA_Builder * const b, llvm::Value * handle)  const final;
    333335
    334336protected:
  • icGREP/icgrep-devel/icgrep/kernels/until_n.cpp

    r5832 r5856  
    4444    Constant * const PACK_SIZE = b->getSize(packSize);
    4545    Constant * const PACKS_PER_BLOCK = b->getSize(packsPerBlock);
    46     VectorType * const vTy = VectorType::get(sizeTy, packsPerBlock);
    47     Value * const ZEROES = Constant::getNullValue(vTy);
     46    const auto blocksPerStride = getStride() / b->getBitBlockWidth();
     47    Constant * const BLOCKS_PER_STRIDE = b->getSize(blocksPerStride);
     48    const auto maximumBlocksPerIteration = packSize / packsPerBlock;
     49    Constant * const MAXIMUM_BLOCKS_PER_ITERATION = b->getSize(maximumBlocksPerIteration);
     50    VectorType * const packVectorTy = VectorType::get(sizeTy, packsPerBlock);
     51    Value * const ZEROES = Constant::getNullValue(packVectorTy);
    4852
    4953    BasicBlock * const entry = b->GetInsertBlock();
     54    Value * const numOfBlocks = b->CreateMul(numOfStrides, BLOCKS_PER_STRIDE);
    5055    BasicBlock * const strideLoop = b->CreateBasicBlock("strideLoop");
     56    b->CreateBr(strideLoop);
    5157
    52     Value * const allAvailableItems = b->getAvailableItemCount("bits");
     58    b->SetInsertPoint(strideLoop);
     59    PHINode * const baseBlockIndex = b->CreatePHI(b->getSizeTy(), 2);
     60    baseBlockIndex->addIncoming(ZERO, entry);
     61    PHINode * const blocksRemaining = b->CreatePHI(b->getSizeTy(), 2);
     62    blocksRemaining->addIncoming(numOfBlocks, entry);
     63    Value * const blocksToDo = b->CreateUMin(blocksRemaining, MAXIMUM_BLOCKS_PER_ITERATION);
     64    BasicBlock * const iteratorLoop = b->CreateBasicBlock("iteratorLoop");
     65    BasicBlock * const checkForMatches = b->CreateBasicBlock("checkForMatches");
     66    b->CreateBr(iteratorLoop);
    5367
    54     b->CreateBr(strideLoop);
    55     b->SetInsertPoint(strideLoop);
    56     PHINode * const strideIndex = b->CreatePHI(sizeTy, 2);
    57     strideIndex->addIncoming(ZERO, entry);
    5868
    59     const auto n = (packSize * packSize) / b->getBitBlockWidth();
    60     Value * groupMask = nullptr;
    61     Value * const baseOffset = b->CreateMul(strideIndex, b->getSize(n));
    62     for (unsigned i = 0; i < n; ++i) {
    63         Value * offset = b->CreateNUWAdd(baseOffset, b->getSize(i));
    64         Value * inputPtr = b->getInputStreamBlockPtr("bits", ZERO, offset);
    65         Value * inputValue = b->CreateBlockAlignedLoad(inputPtr);
    66         Value * outputPtr = b->getOutputStreamBlockPtr("uptoN", ZERO, offset);
    67         b->CreateBlockAlignedStore(inputValue, outputPtr);
    68         Value * markers = b->CreateNot(b->simd_eq(packSize, inputValue, ZEROES));
    69         Value * blockMask = b->CreateZExtOrTrunc(b->hsimd_signmask(packSize, markers), sizeTy);
    70         if (i) {
    71             blockMask = b->CreateShl(blockMask, i * packsPerBlock);
    72             groupMask = b->CreateOr(groupMask, blockMask);
    73         } else {
    74             groupMask = blockMask;
    75         }
    76     }
     69    // Construct the outer iterator mask indicating whether any markers are in the stream.
     70    b->SetInsertPoint(iteratorLoop);
     71    PHINode * const groupMaskPhi = b->CreatePHI(b->getSizeTy(), 2);
     72    groupMaskPhi->addIncoming(ZERO, strideLoop);
     73    PHINode * const localIndex = b->CreatePHI(b->getSizeTy(), 2);
     74    localIndex->addIncoming(ZERO, strideLoop);
     75    Value * const blockIndex = b->CreateAdd(baseBlockIndex, localIndex);
     76    Value * inputPtr = b->getInputStreamBlockPtr("bits", ZERO, blockIndex);
     77    Value * inputValue = b->CreateBlockAlignedLoad(inputPtr);
     78    Value * outputPtr = b->getOutputStreamBlockPtr("uptoN", ZERO, blockIndex);
     79    b->CreateBlockAlignedStore(inputValue, outputPtr);
     80    Value * const inputPackValue = b->CreateNot(b->simd_eq(packSize, inputValue, ZEROES));
     81    Value * iteratorMask = b->CreateZExtOrTrunc(b->hsimd_signmask(packSize, inputPackValue), sizeTy);
     82    iteratorMask = b->CreateShl(iteratorMask, b->CreateMul(localIndex, PACKS_PER_BLOCK));
     83    iteratorMask = b->CreateOr(groupMaskPhi, iteratorMask);
     84    groupMaskPhi->addIncoming(iteratorMask, iteratorLoop);
     85    Value * const nextLocalIndex = b->CreateAdd(localIndex, ONE);
     86    localIndex->addIncoming(nextLocalIndex, iteratorLoop);
     87    b->CreateCondBr(b->CreateICmpNE(nextLocalIndex, blocksToDo), iteratorLoop, checkForMatches);
     88
     89    // Now check whether we have any matches
     90    b->SetInsertPoint(checkForMatches);
    7791
    7892    BasicBlock * const processGroups = b->CreateBasicBlock("processGroups");
    7993    BasicBlock * const nextStride = b->CreateBasicBlock("nextStride");
    80 
    81     b->CreateLikelyCondBr(b->CreateIsNull(groupMask), nextStride, processGroups);
     94    b->CreateLikelyCondBr(b->CreateIsNull(iteratorMask), nextStride, processGroups);
    8295
    8396    b->SetInsertPoint(processGroups);
     
    90103    PHINode * const observed = b->CreatePHI(initiallyObserved->getType(), 2);
    91104    observed->addIncoming(initiallyObserved, processGroups);
    92     PHINode * const groupMarkers = b->CreatePHI(groupMask->getType(), 2);
    93     groupMarkers->addIncoming(groupMask, processGroups);
     105    PHINode * const groupMarkers = b->CreatePHI(iteratorMask->getType(), 2);
     106    groupMarkers->addIncoming(iteratorMask, processGroups);
    94107
    95108    Value * const groupIndex = b->CreateZExtOrTrunc(b->CreateCountForwardZeroes(groupMarkers), sizeTy);
    96     Value * const blockIndex = b->CreateNUWAdd(baseOffset, b->CreateUDiv(groupIndex, PACKS_PER_BLOCK));
     109    Value * const blockIndex2 = b->CreateAdd(baseBlockIndex, b->CreateUDiv(groupIndex, PACKS_PER_BLOCK));
    97110    Value * const packOffset = b->CreateURem(groupIndex, PACKS_PER_BLOCK);
    98     Value * const groupPtr = b->getInputStreamBlockPtr("bits", ZERO, blockIndex);
    99     Value * const groupValue = b->CreateBlockAlignedLoad(groupPtr);
    100     Value * const packBits = b->CreateExtractElement(b->CreateBitCast(groupValue, vTy), packOffset);
    101     //Type * packPtrTy = packTy->getPointerTo();
    102     //Value * const packPtr = b->CreateGEP(b->CreatePointerCast(groupPtr, packPtrTy), packOffset);
    103     //Value * const packBits = b->CreateLoad(packPtr);
     111    Value * const groupPtr2 = b->getInputStreamBlockPtr("bits", ZERO, blockIndex2);
     112    Value * const groupValue = b->CreateBlockAlignedLoad(groupPtr2);
     113    Value * const packBits = b->CreateExtractElement(b->CreateBitCast(groupValue, packVectorTy), packOffset);
    104114    Value * const packCount = b->CreateZExtOrTrunc(b->CreatePopcount(packBits), sizeTy);
    105     Value * const observedUpTo = b->CreateNUWAdd(observed, packCount);
    106 
     115    Value * const observedUpTo = b->CreateAdd(observed, packCount);
    107116    BasicBlock * const haveNotSeenEnough = b->CreateBasicBlock("haveNotSeenEnough");
    108117    BasicBlock * const seenNOrMore = b->CreateBasicBlock("seenNOrMore");
     
    122131        b->CreateAssert(b->CreateICmpUGT(N, observed), "N must be greater than observed count!");
    123132    }
    124     Value * const bitsToFind = b->CreateNUWSub(N, observed);
     133    Value * const bitsToFind = b->CreateSub(N, observed);
    125134    BasicBlock * const findNthBit = b->CreateBasicBlock("findNthBit");
    126135    BasicBlock * const foundNthBit = b->CreateBasicBlock("foundNthBit");
     
    134143    Value * const nextRemainingBits = b->CreateResetLowestBit(remainingBits);
    135144    remainingBits->addIncoming(nextRemainingBits, findNthBit);
    136     Value * const nextRemainingBitsToFind = b->CreateNUWSub(remainingBitsToFind, ONE);
     145    Value * const nextRemainingBitsToFind = b->CreateSub(remainingBitsToFind, ONE);
    137146    remainingBitsToFind->addIncoming(nextRemainingBitsToFind, findNthBit);
    138147    b->CreateLikelyCondBr(b->CreateIsNull(nextRemainingBitsToFind), foundNthBit, findNthBit);
     
    140149    // If we've found the n-th bit, end the segment after clearing the markers
    141150    b->SetInsertPoint(foundNthBit);
    142     Value * const inputPtr = b->getInputStreamBlockPtr("bits", ZERO, blockIndex);
    143     Value * const inputValue = b->CreateBlockAlignedLoad(inputPtr);
     151
     152    Value * const inputPtr2 = b->getInputStreamBlockPtr("bits", ZERO, blockIndex2);
     153    Value * const inputValue2 = b->CreateBlockAlignedLoad(inputPtr2);
    144154    Value * const packPosition = b->CreateZExtOrTrunc(b->CreateCountForwardZeroes(remainingBits), sizeTy);
    145     Value * const basePosition = b->CreateNUWMul(packOffset, PACK_SIZE);
    146     Value * const blockOffset = b->CreateNUWAdd(b->CreateOr(basePosition, packPosition), ONE);
     155    Value * const basePosition = b->CreateMul(packOffset, PACK_SIZE);
     156    Value * const blockOffset = b->CreateAdd(b->CreateOr(basePosition, packPosition), ONE);
    147157    Value * const mask = b->CreateNot(b->bitblock_mask_from(blockOffset));
    148     Value * const maskedInputValue = b->CreateAnd(inputValue, mask);
    149     Value * const outputPtr = b->getOutputStreamBlockPtr("uptoN", ZERO, blockIndex);
    150     b->CreateBlockAlignedStore(maskedInputValue, outputPtr);
    151     Value * const positionOfNthItem = b->CreateNUWAdd(b->CreateMul(blockIndex, b->getSize(b->getBitBlockWidth())), blockOffset);
     158    Value * const maskedInputValue = b->CreateAnd(inputValue2, mask);
     159    Value * const outputPtr2 = b->getOutputStreamBlockPtr("uptoN", ZERO, blockIndex2);
     160    b->CreateBlockAlignedStore(maskedInputValue, outputPtr2);
     161    Value * const positionOfNthItem = b->CreateAdd(b->CreateMul(blockIndex2, b->getSize(b->getBitBlockWidth())), blockOffset);
    152162    b->setTerminationSignal();
    153163    BasicBlock * const segmentDone = b->CreateBasicBlock("segmentDone");
     
    157167
    158168    b->SetInsertPoint(nextStride);
    159     Value * const nextStrideIndex = b->CreateNUWAdd(strideIndex, ONE);
    160     strideIndex->addIncoming(nextStrideIndex, nextStride);
    161     b->CreateLikelyCondBr(b->CreateICmpEQ(nextStrideIndex, numOfStrides), segmentDone, strideLoop);
     169    blocksRemaining->addIncoming(b->CreateSub(blocksRemaining, MAXIMUM_BLOCKS_PER_ITERATION), nextStride);
     170    baseBlockIndex->addIncoming(b->CreateAdd(baseBlockIndex, MAXIMUM_BLOCKS_PER_ITERATION), nextStride);
     171    b->CreateLikelyCondBr(b->CreateICmpULE(blocksRemaining, MAXIMUM_BLOCKS_PER_ITERATION), segmentDone, strideLoop);
    162172
    163173    b->SetInsertPoint(segmentDone);
    164174    PHINode * const produced = b->CreatePHI(sizeTy, 2);
    165175    produced->addIncoming(positionOfNthItem, foundNthBit);
    166     produced->addIncoming(allAvailableItems, nextStride);
     176    produced->addIncoming(b->getAvailableItemCount("bits"), nextStride);
    167177    Value * producedCount = b->getProducedItemCount("uptoN");
    168     producedCount = b->CreateNUWAdd(producedCount, produced);
     178    producedCount = b->CreateAdd(producedCount, produced);
    169179    b->setProducedItemCount("uptoN", producedCount);
    170180
    171181}
    172182
    173 unsigned LLVM_READNONE calculateRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
    174     const unsigned packSize = b->getSizeTy()->getBitWidth();
    175     return (packSize * packSize) / b->getBitBlockWidth();
    176 }
    177 
    178183UntilNkernel::UntilNkernel(const std::unique_ptr<kernel::KernelBuilder> & b)
    179 : MultiBlockKernel("UntilN_" + std::to_string(calculateRate(b)),
     184: MultiBlockKernel("UntilN",
    180185// inputs
    181 {Binding{b->getStreamSetTy(), "bits", FixedRate(calculateRate(b))}},
     186{Binding{b->getStreamSetTy(), "bits"}},
    182187// outputs
    183 {Binding{b->getStreamSetTy(), "uptoN", BoundedRate(0, calculateRate(b))}},
     188{Binding{b->getStreamSetTy(), "uptoN", BoundedRate(0, 1)}},
    184189// input scalar
    185190{Binding{b->getSizeTy(), "N"}}, {},
    186191// internal state
    187192{Binding{b->getSizeTy(), "observed"}}) {
    188 
     193    addAttribute(CanTerminateEarly());
    189194}
    190195
  • icGREP/icgrep-devel/icgrep/lz4d.cpp

    r5755 r5856  
    8282
    8383   
    84     kernel::Kernel * sourceK = pxDriver.addKernelInstance<MemorySourceKernel>(iBuilder, iBuilder->getInt8PtrTy(), segmentSize);
     84    kernel::Kernel * sourceK = pxDriver.addKernelInstance<MemorySourceKernel>(iBuilder, iBuilder->getInt8PtrTy());
    8585    sourceK->setInitialArguments({inputStream, fileSize});
    8686    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
  • icGREP/icgrep-devel/icgrep/preprocess.cpp

    r5755 r5856  
    5757
    5858    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
    59     kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(iBuilder, iBuilder->getInt8PtrTy(), segmentSize);
     59    kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(iBuilder, iBuilder->getInt8PtrTy());
    6060    sourceK->setInitialArguments({inputStream, fileSize});
    6161    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
  • icGREP/icgrep-devel/icgrep/toolchain/cpudriver.cpp

    r5841 r5856  
    9191
    9292void ParabixDriver::makeKernelCall(Kernel * kernel, const std::vector<StreamSetBuffer *> & inputs, const std::vector<StreamSetBuffer *> & outputs) {
    93     assert ("addKernelCall or makeKernelCall was already run on this kernel." && (kernel->getModule() == nullptr));
     93    assert ("makeKernelCall was already run on this kernel." && (kernel->getModule() == nullptr));
    9494    mPipeline.emplace_back(kernel);
    9595    kernel->bindPorts(inputs, outputs);
  • icGREP/icgrep-devel/icgrep/toolchain/driver.h

    r5755 r5856  
    4141    }
    4242
    43     void addKernelCall(kernel::Kernel & kb, const std::vector<parabix::StreamSetBuffer *> & inputs, const std::vector<parabix::StreamSetBuffer *> & outputs) {
    44         return makeKernelCall(&kb, inputs, outputs);
    45     }
    46 
    4743    virtual void makeKernelCall(kernel::Kernel * kb, const std::vector<parabix::StreamSetBuffer *> & inputs, const std::vector<parabix::StreamSetBuffer *> & outputs) = 0;
    4844
  • icGREP/icgrep-devel/icgrep/toolchain/grep_pipeline.cpp

    r5836 r5856  
    5858   
    5959    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, 8));
    60     kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy(), segmentSize);
     60    kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy());
    6161    sourceK->setInitialArguments({buffer, length});
    6262    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
  • icGREP/icgrep-devel/icgrep/toolchain/pipeline.cpp

    r5845 r5856  
    1111#include <boost/container/flat_set.hpp>
    1212#include <boost/container/flat_map.hpp>
     13#include <boost/graph/adjacency_list.hpp>
    1314#include <kernels/kernel_builder.h>
     15
     16#include <llvm/Support/raw_ostream.h>
    1417
    1518using namespace kernel;
    1619using namespace parabix;
    1720using namespace llvm;
     21using namespace boost;
     22using namespace boost::container;
    1823
    1924using Port = Kernel::Port;
    20 
    21 template <typename Value>
    22 using StreamSetBufferMap = boost::container::flat_map<const StreamSetBuffer *, Value>;
    23 
    24 template <typename Value>
    25 using FlatSet = boost::container::flat_set<Value>;
    2625
    2726Function * makeThreadFunction(const std::unique_ptr<kernel::KernelBuilder> & b, const std::string & name) {
     
    3231}
    3332
    34 void applyOutputBufferExpansions(const std::unique_ptr<KernelBuilder> & b, const Kernel * kernel);
    35 
    36 void handleInsufficientData(const std::unique_ptr<KernelBuilder> & b, Value * const produced, Value * const final, BasicBlock * const entry, const Kernel * const consumer,  const Binding & input, const StreamSetBuffer * const buffer);
    37 
    38 bool requiresCopyBack(const Kernel * k, const ProcessingRate & rate);
     33struct PipelineGenerator {
     34
     35    template <typename Value>
     36    using StreamSetBufferMap = flat_map<const StreamSetBuffer *, Value>;
     37
     38    using RateValue = ProcessingRate::RateValue;
     39
     40    struct Channel {
     41        Channel() = default;
     42        Channel(const RateValue & rate, const StreamSetBuffer * const buffer)
     43        : rate(rate), buffer(buffer) { }
     44
     45        RateValue               rate;
     46        const StreamSetBuffer * buffer;
     47    };
     48
     49    using Graph = adjacency_list<vecS, vecS, bidirectionalS, const Kernel *, Channel, vecS>;
     50
     51    using Map = flat_map<const Kernel *, Graph::vertex_descriptor>;
     52
     53    void initialize(const std::vector<Kernel *> & kernels);
     54
     55    Value * executeKernel(const std::unique_ptr<KernelBuilder> & b, const Kernel * const kernel, PHINode * const segNo, Value * const finished);
     56
     57    void applyOutputBufferExpansions(const std::unique_ptr<KernelBuilder> & b, const Kernel * kernel);
     58
     59    void updateProducedAndConsumedCounts(const std::unique_ptr<KernelBuilder> & b, const Kernel * kernel);
     60
     61private:
     62
     63    Graph   G;
     64    Map     M;
     65
     66    StreamSetBufferMap<Value *>         producedItemCount;
     67    StreamSetBufferMap<Value *>         consumedItemCount;
     68    StreamSetBufferMap<const Kernel *>  lastConsumer;
     69};
    3970
    4071/** ------------------------------------------------------------------------------------------------------------- *
     
    87118    Value * const segOffset = b->CreateLoad(b->CreateGEP(threadStruct, {b->getInt32(0), b->getInt32(1)}));
    88119
    89     BasicBlock * const segmentLoop = BasicBlock::Create(b->getContext(), "segmentLoop", threadFunc);
     120    PipelineGenerator G;
     121
     122    BasicBlock * const segmentLoop = b->CreateBasicBlock("segmentLoop");
    90123    b->CreateBr(segmentLoop);
    91124
    92125    b->SetInsertPoint(segmentLoop);
     126    G.initialize(kernels);
    93127    PHINode * const segNo = b->CreatePHI(b->getSizeTy(), 2, "segNo");
    94128    segNo->addIncoming(segOffset, entryBlock);
    95 
    96     BasicBlock * const exitThreadBlock = BasicBlock::Create(b->getContext(), "exitThread", threadFunc);
    97 
    98     StreamSetBufferMap<Value *> producedItemCount;
    99     StreamSetBufferMap<Value *> consumedItemCount;
    100     StreamSetBufferMap<Kernel *> lastUsedKernel;
     129    Value * finished = nullptr;
    101130
    102131    Value * cycleCountStart = nullptr;
     
    106135    }
    107136
    108     Value * terminated = nullptr;
    109 
    110137    const bool serialize = codegen::DebugOptionIsSet(codegen::SerializeThreads);
    111138
    112     for (Kernel * const kernel : kernels) {
    113         const auto & inputs = kernel->getStreamInputs();
    114         for (unsigned i = 0; i < inputs.size(); ++i) {
    115             const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
    116             auto f = lastUsedKernel.find(buffer);
    117             if (f == lastUsedKernel.end()) {
    118                 lastUsedKernel.emplace(buffer, kernel);
    119             } else {
    120                 f->second = kernel;
    121             }
    122         }
    123     }
    124 
    125139    for (unsigned k = 0; k < n; ++k) {
    126140
    127         const auto & kernel = kernels[k];
    128 
    129         BasicBlock * const kernelWait = BasicBlock::Create(b->getContext(), kernel->getName() + "Wait", threadFunc);
    130 
     141        const Kernel * const kernel = kernels[k];
     142
     143        BasicBlock * const kernelWait = b->CreateBasicBlock(kernel->getName() + "Wait");
    131144        b->CreateBr(kernelWait);
    132145
    133         BasicBlock * const kernelCheck = BasicBlock::Create(b->getContext(), kernel->getName() + "Check", threadFunc);
    134 
    135         BasicBlock * const kernelBody = BasicBlock::Create(b->getContext(), kernel->getName() + "Do", threadFunc);
    136 
    137         BasicBlock * const kernelEnd = BasicBlock::Create(b->getContext(), kernel->getName() + "End", threadFunc);
    138 
    139146        b->SetInsertPoint(kernelWait);
    140 
    141147        b->setKernel(kernels[serialize ? (n - 1) : k]);
    142148        Value * const processedSegmentCount = b->acquireLogicalSegmentNo();
    143149        b->setKernel(kernel);
    144 
    145150        assert (processedSegmentCount->getType() == segNo->getType());
    146         Value * const ready = b->CreateICmpEQ(segNo, processedSegmentCount);       
     151        Value * const ready = b->CreateICmpEQ(segNo, processedSegmentCount);
     152
     153        BasicBlock * const kernelCheck = b->CreateBasicBlock(kernel->getName() + "Check");
    147154        b->CreateCondBr(ready, kernelCheck, kernelWait);
    148155
    149156        b->SetInsertPoint(kernelCheck);
    150         b->CreateUnlikelyCondBr(b->getTerminationSignal(), kernelEnd, kernelBody);
    151 
    152         // Execute the kernel segment
    153         b->SetInsertPoint(kernelBody);
    154         const auto & inputs = kernel->getStreamInputs();
    155         Value * const isFinal = b->CreateOr(terminated ? terminated : b->getFalse(), b->getTerminationSignal());
    156         std::vector<Value *> args = {kernel->getInstance(), isFinal};
    157         for (unsigned i = 0; i < inputs.size(); ++i) {
    158             const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
    159             const auto f = producedItemCount.find(buffer);
    160             assert (f != producedItemCount.end());
    161             Value * const produced = f->second;
    162             args.push_back(produced);
    163             handleInsufficientData(b, produced, isFinal, kernelEnd, kernel, inputs[i], buffer);
    164         }
    165 
    166         b->setKernel(kernel);
    167         b->createDoSegmentCall(args);
    168         b->CreateBr(kernelEnd);
    169 
    170         b->SetInsertPoint(kernelEnd);
    171 
    172         Value * const finished = b->getTerminationSignal();
    173         if (terminated) { // all kernels must terminate
    174             terminated = b->CreateAnd(terminated, finished);
    175         } else {
    176             terminated = finished;
    177         }
    178 
    179         const auto & outputs = kernel->getStreamOutputs();
    180         for (unsigned i = 0; i < outputs.size(); ++i) {           
    181             Value * const produced = b->getProducedItemCount(outputs[i].getName());
    182             const StreamSetBuffer * const buf = kernel->getStreamSetOutputBuffer(i);
    183             assert (producedItemCount.count(buf) == 0);
    184             producedItemCount.emplace(buf, produced);
    185         }
    186         for (unsigned i = 0; i < inputs.size(); ++i) {
    187             Value * const processedItemCount = b->getProcessedItemCount(inputs[i].getName());
    188             const StreamSetBuffer * const buf = kernel->getStreamSetInputBuffer(i);           
    189             auto f = consumedItemCount.find(buf);
    190             if (f == consumedItemCount.end()) {
    191                 consumedItemCount.emplace(buf, processedItemCount);
    192             } else {
    193                 assert (f->second);
    194                 f->second = b->CreateUMin(processedItemCount, f->second);
    195             }
    196         }
    197 
    198         for (auto i = lastUsedKernel.begin(); i != lastUsedKernel.end(); i++) {
    199             if (i->second == kernel) {
    200                 const StreamSetBuffer * const buffer = i->first;
    201                 Kernel * const producerKernel = buffer->getProducer();
    202                 const auto & binding = producerKernel->getStreamOutput(buffer);
    203                 if (LLVM_UNLIKELY(binding.getRate().isDerived())) {
    204                     continue;
    205                 }
    206                 auto f = consumedItemCount.find(buffer);
    207                 if (f != consumedItemCount.end()) {
    208                     const Kernel* tempKernel = b->getKernel();
    209                     b->setKernel(producerKernel);
    210                     b->setConsumedItemCount(binding.getName(), f->second);
    211                     b->setKernel(tempKernel);
    212                 }
    213             }
    214         }
    215 
     157
     158        finished = G.executeKernel(b, kernel, segNo, finished);
    216159
    217160        if (DebugOptionIsSet(codegen::EnableCycleCounter)) {
     
    225168    }
    226169
    227     exitThreadBlock->moveAfter(b->GetInsertBlock());
    228 
    229170    segNo->addIncoming(b->CreateAdd(segNo, b->getSize(codegen::ThreadNum)), b->GetInsertBlock());
    230171
    231     b->CreateUnlikelyCondBr(terminated, exitThreadBlock, segmentLoop);
    232 
    233     b->SetInsertPoint(exitThreadBlock);
     172    BasicBlock * const segmentExit = b->CreateBasicBlock("segmentExit");
     173    b->CreateUnlikelyCondBr(finished, segmentExit, segmentLoop);
     174
     175    b->SetInsertPoint(segmentExit);
    234176
    235177    // only call pthread_exit() within spawned threads; otherwise it'll be equivalent to calling exit() within the process
    236     BasicBlock * const exitThread = BasicBlock::Create(b->getContext(), "ExitThread", threadFunc);
    237     BasicBlock * const exitFunction = BasicBlock::Create(b->getContext(), "ExitProcessFunction", threadFunc);
    238 
    239     Value * const exitCond = b->CreateICmpEQ(segOffset, ConstantInt::getNullValue(segOffset->getType()));
    240     b->CreateCondBr(exitCond, exitFunction, exitThread);
     178    BasicBlock * const exitThread = b->CreateBasicBlock("ExitThread");
     179    BasicBlock * const exitFunction = b->CreateBasicBlock("ExitProcessFunction");
     180
     181    b->CreateCondBr(b->CreateIsNull(segOffset), exitFunction, exitThread);
    241182    b->SetInsertPoint(exitThread);
    242183    b->CreatePThreadExitCall(nullVoidPtrVal);
     
    317258}
    318259
     260
    319261/** ------------------------------------------------------------------------------------------------------------- *
    320262 * @brief generatePipelineLoop
     
    322264void generatePipelineLoop(const std::unique_ptr<KernelBuilder> & b, const std::vector<Kernel *> & kernels) {
    323265
    324     BasicBlock * entryBlock = b->GetInsertBlock();
    325     Function * main = entryBlock->getParent();
    326 
    327266    // Create the basic blocks for the loop.
    328     BasicBlock * const pipelineLoop = BasicBlock::Create(b->getContext(), "pipelineLoop", main);
    329     BasicBlock * const pipelineExit = BasicBlock::Create(b->getContext(), "pipelineExit", main);
    330 
    331     StreamSetBufferMap<Value *> producedItemCount;
    332     StreamSetBufferMap<Value *> consumedItemCount;
    333     StreamSetBufferMap<Kernel *> lastUsedKernel;
     267    BasicBlock * const entryBlock = b->GetInsertBlock();
     268    BasicBlock * const pipelineLoop = b->CreateBasicBlock("pipelineLoop");
     269    BasicBlock * const pipelineExit = b->CreateBasicBlock("pipelineExit");
     270
     271    PipelineGenerator G;
    334272
    335273    b->CreateBr(pipelineLoop);
     274
    336275    b->SetInsertPoint(pipelineLoop);
    337    
     276    G.initialize(kernels);
     277    PHINode * const segNo = b->CreatePHI(b->getSizeTy(), 2, "segNo");
     278    segNo->addIncoming(b->getSize(0), entryBlock);
     279    Value * finished = nullptr;
     280
    338281    Value * cycleCountStart = nullptr;
    339282    Value * cycleCountEnd = nullptr;
     
    341284        cycleCountStart = b->CreateReadCycleCounter();
    342285    }
    343     Value * terminated = nullptr;
    344286
    345287    for (Kernel * const kernel : kernels) {
    346         const auto & inputs = kernel->getStreamInputs();
    347         for (unsigned i = 0; i < inputs.size(); ++i) {
    348             const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
    349             auto f = lastUsedKernel.find(buffer);
    350             if (f == lastUsedKernel.end()) {
    351                 lastUsedKernel.emplace(buffer, kernel);
    352             } else {
    353                 f->second = kernel;
    354             }
    355         }
    356     }
    357 
    358     for (Kernel * const kernel : kernels) {
    359288
    360289        b->setKernel(kernel);
    361290
    362         BasicBlock * const entry = b->GetInsertBlock();
    363         BasicBlock * const kernelCode = BasicBlock::Create(b->getContext(), kernel->getName(), main);
    364         BasicBlock * const kernelExit = BasicBlock::Create(b->getContext(), kernel->getName() + "_exit", main);
    365 
    366         b->CreateUnlikelyCondBr(b->getTerminationSignal(), kernelExit, kernelCode);
    367 
    368         b->SetInsertPoint(kernelCode);
    369         const auto & inputs = kernel->getStreamInputs();
    370         const auto & outputs = kernel->getStreamOutputs();
    371 
    372         Value * const isFinal = terminated ? terminated : b->getFalse();
    373 
    374         std::vector<Value *> args = {kernel->getInstance(), isFinal};
    375 
    376         const auto name = kernel->getName();
    377         for (unsigned i = 0; i < inputs.size(); ++i) {
    378             const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
    379             const auto f = producedItemCount.find(buffer);
    380             if (LLVM_UNLIKELY(f == producedItemCount.end())) {
    381                 report_fatal_error(kernel->getName() + " uses stream set " + inputs[i].getName() + " prior to its definition");
    382             }
    383             Value * const produced = f->second;
    384             args.push_back(produced);
    385             handleInsufficientData(b, produced, isFinal, pipelineLoop, kernel, inputs[i], buffer);
    386         }
    387 
    388         applyOutputBufferExpansions(b, kernel);
    389 
    390         b->createDoSegmentCall(args);
    391 
    392         BasicBlock * const kernelFinished = b->GetInsertBlock();
    393         Value * const finished = b->getTerminationSignal();
    394         b->CreateBr(kernelExit);
    395 
    396         b->SetInsertPoint(kernelExit);
    397         PHINode * const finishedPhi = b->CreatePHI(b->getInt1Ty(), 2);
    398         finishedPhi->addIncoming(b->getTrue(), entry);
    399         finishedPhi->addIncoming(finished, kernelFinished);
    400         if (terminated) { // All kernels must agree that we've terminated.
    401             terminated = b->CreateAnd(terminated, finishedPhi);
    402         } else {
    403             terminated = finishedPhi;
    404         }
    405 
    406         for (unsigned i = 0; i < outputs.size(); ++i) {
    407             Value * const produced = b->getProducedItemCount(outputs[i].getName());
    408             const StreamSetBuffer * const buf = kernel->getStreamSetOutputBuffer(i);
    409             assert (producedItemCount.count(buf) == 0);
    410             producedItemCount.emplace(buf, produced);
    411         }
    412 
    413         for (unsigned i = 0; i < inputs.size(); ++i) {
    414             Value * const processed = b->getProcessedItemCount(inputs[i].getName());
    415             const StreamSetBuffer * const buf = kernel->getStreamSetInputBuffer(i);
    416             auto f = consumedItemCount.find(buf);
    417             if (f == consumedItemCount.end()) {
    418                 consumedItemCount.emplace(buf, processed);
    419             } else {
    420                 f->second = b->CreateUMin(processed, f->second);
    421             }
    422         }
    423 
    424         for (auto i = lastUsedKernel.begin(); i != lastUsedKernel.end(); i++) {
    425             if (i->second == kernel) {
    426                 const StreamSetBuffer * const buffer = i->first;
    427                 Kernel * const producerKernel = buffer->getProducer();
    428                 const auto & binding = producerKernel->getStreamOutput(buffer);
    429                 if (LLVM_UNLIKELY(binding.getRate().isDerived())) {
    430                     continue;
    431                 }
    432                 auto f = consumedItemCount.find(buffer);
    433                 if (f != consumedItemCount.end()) {
    434                     const Kernel* tempKernel = b->getKernel();
    435                     b->setKernel(producerKernel);
    436                     b->setConsumedItemCount(binding.getName(), f->second);
    437                     b->setKernel(tempKernel);
    438                 }
    439             }
    440         }
     291        finished = G.executeKernel(b, kernel, segNo, finished);
    441292
    442293        if (LLVM_UNLIKELY(DebugOptionIsSet(codegen::EnableCycleCounter))) {
     
    446297            cycleCountStart = cycleCountEnd;
    447298        }
    448 //        Value * const segNo = b->acquireLogicalSegmentNo();
    449 //        Value * nextSegNo = b->CreateAdd(segNo, b->getSize(1));
    450 //        b->releaseLogicalSegmentNo(nextSegNo);
    451     }
    452 
    453     b->CreateCondBr(terminated, pipelineExit, pipelineLoop);
     299    }
     300
     301    segNo->addIncoming(b->CreateAdd(segNo, b->getSize(1)), b->GetInsertBlock());
     302    b->CreateCondBr(finished, pipelineExit, pipelineLoop);
    454303
    455304    pipelineExit->moveAfter(b->GetInsertBlock());
     
    481330
    482331/** ------------------------------------------------------------------------------------------------------------- *
     332 * @brief initialize
     333 ** ------------------------------------------------------------------------------------------------------------- */
     334void PipelineGenerator::initialize(const std::vector<Kernel *> & kernels) {
     335
     336    // Our goal when building G is *not* to model the dataflow of our program but instead to
     337    // determine the minimum number of sufficient data tests needed to ensure each kernel has
     338    // enough data to progress.
     339
     340    // For example, suppose we have kernels A, B and C, and that B has a fixed input and fixed
     341    // output rate. C also has a fixed input rate but A does *not* have a fixed output rate.
     342    // C must test whether it has enough input from B as B is not guaranteed to have enough
     343    // input from A. Moreover if C is dependent on B, C could be skipped entirely.
     344
     345    // Note: we cannot simply test the output of A for both B and C. In our data-parallel
     346    // pipeline A's state may change by the time we process C.
     347
     348    for (const Kernel * const consumer : kernels) {
     349        const auto v = add_vertex(consumer, G);
     350        M.emplace(consumer, v);
     351        const auto & inputs = consumer->getStreamInputs();
     352        for (unsigned i = 0; i < inputs.size(); ++i) {
     353
     354            const auto buffer = consumer->getStreamSetInputBuffer(i);
     355            const Kernel * const producer = buffer->getProducer();
     356            const Binding & output = producer->getStreamOutput(buffer);
     357            if (output.getRate().isRelative()) continue;
     358
     359            const Binding & input = inputs[i];
     360            auto ub_in = consumer->getUpperBound(input.getRate()) * consumer->getStride();
     361            if (input.hasLookahead()) {
     362                ub_in += input.getLookahead();
     363            }
     364
     365            const auto lb_out = producer->getLowerBound(output.getRate()) * producer->getStride();
     366
     367            const auto rate = lb_out / ub_in;
     368            const auto f = M.find(producer); assert (f != M.end());
     369            const auto u = f->second;
     370            // If we have multiple inputs from the same kernel, we only need to consider the "slowest" one
     371            bool slowest = true;
     372            if (lb_out > 0) {
     373                for (const auto e : make_iterator_range(in_edges(v, G))) {
     374                    if (source(e, G) == u) {
     375                        Channel & p = G[e];
     376                        slowest = false;
     377                        if (rate < p.rate) {
     378                            p.rate = rate;
     379                            p.buffer = buffer;
     380                        }
     381                        break;
     382                    }
     383                }
     384            }
     385            if (slowest) {
     386                add_edge(u, v, Channel{rate, buffer}, G);
     387            }
     388        }
     389    }
     390
     391    // Take a transitive closure of G but whenever we attempt to insert an edge into the closure
     392    // that already exists, check instead whether the rate of our proposed edge is <= the existing
     393    // edge's rate. If so, the data availability is transitively guaranteed.
     394    for (const auto u : make_iterator_range(vertices(G))) {
     395        for (auto ei : make_iterator_range(in_edges(u, G))) {
     396            const auto v = source(ei, G);
     397            const Channel & pu = G[ei];           
     398            for (auto ej : make_iterator_range(out_edges(u, G))) {               
     399                const auto w = target(ej, G);
     400                const auto ratio = RateValue(G[u]->getStride(), G[w]->getStride());
     401                const auto rate = pu.rate * ratio;
     402                bool insert = true;
     403                for (auto ek : make_iterator_range(in_edges(w, G))) {
     404                    if (source(ek, G) == v) {
     405                        Channel & pw = G[ek];
     406                        if (rate <= pw.rate && pw.rate > 0) {
     407                            pw.buffer = nullptr;
     408                        }
     409                        insert = false;
     410                        break;
     411                    }
     412                }
     413                if (insert) {
     414                    add_edge(v, w, Channel{rate, nullptr}, G);
     415                }
     416            }
     417        }
     418    }
     419
     420    // remove any closure edges from G
     421    remove_edge_if([&](const Graph::edge_descriptor e) { return G[e].buffer == nullptr; }, G);
     422
     423    // If a kernel has no 'necessary to check' inputs then we can remove every output with a rate >= 1 from G
     424    for (const auto u : make_iterator_range(vertices(G))) {
     425        if (in_degree(u, G) == 0) {
     426            remove_out_edge_if(u, [&](const Graph::edge_descriptor e) { return G[e].rate >= RateValue{1, 1}; }, G);
     427        }
     428    }
     429
     430    // iterate through each kernel in order and determine which kernel last used a particular buffer
     431    for (Kernel * const kernel : kernels) {
     432        const auto & inputs = kernel->getStreamInputs();
     433        for (unsigned i = 0; i < inputs.size(); ++i) {
     434            lastConsumer[kernel->getStreamSetInputBuffer(i)] = kernel;
     435        }
     436    }
     437
     438}
     439
     440/** ------------------------------------------------------------------------------------------------------------- *
     441 * @brief executeKernel
         *
         * Emits the IR that runs one kernel for the pipeline segment numbered segNo.
         *
         * The emitted code:
         *  - branches straight to the kernel's exit block if the termination signal is already set;
         *  - for every stream input, looks up the item count produced into its buffer, computes an upper limit on
         *    the items the kernel may see this segment and passes min(produced, limit) as an argument;
         *  - for every in-edge of this kernel's vertex in G that refers to the input's buffer, checks that at least
         *    one stride of data is available (or that this is the final segment) and otherwise leaves via the exit
         *    block without calling the kernel;
         *  - expands dynamic output buffers, calls the kernel's do-segment function and stores the resulting
         *    "final" state as the termination signal.
         *
         * @param b         kernel builder used to emit the IR
         * @param kernel    the kernel to execute; must have an entry in M
         * @param segNo     the current segment number
         * @param finished  "finished" state supplied by the caller, or nullptr when there is none
         * @return a PHI node in the exit block. It is false when the kernel was skipped for lack of input, the
         *         final state when the kernel ran, and finished (or true if finished is nullptr) when the kernel
         *         had already terminated on entry.
     442 ** ------------------------------------------------------------------------------------------------------------- */
     443Value *PipelineGenerator::executeKernel(const std::unique_ptr<KernelBuilder> & b, const Kernel * const kernel, PHINode * const segNo, Value * const finished) {
     444
     445    const auto & inputs = kernel->getStreamInputs();
     446
            // args[0] = kernel instance, args[1] = isFinal flag, args[2 + i] = available item count for input i
     447    std::vector<Value *> args(2 + inputs.size());
     448
            // M maps the kernel to its vertex u in G, which is used below to query the kernel's in-edges
     449    const auto f = M.find(kernel); assert (f != M.end());
     450    const auto u = f->second;
     451
     452    BasicBlock * const kernelEntry = b->GetInsertBlock();
     453    BasicBlock * const kernelCode = b->CreateBasicBlock(kernel->getName());
     454    BasicBlock * const kernelExit = b->CreateBasicBlock(kernel->getName() + "_exit");
     455
            // Skip the kernel entirely if the termination signal is already set
     456    b->CreateUnlikelyCondBr(b->getTerminationSignal(), kernelExit, kernelCode);
     457
     458    b->SetInsertPoint(kernelExit);
     459    PHINode * const terminated = b->CreatePHI(b->getInt1Ty(), 2);
     460    // The edge from kernelEntry is only taken when the termination signal was already set on entry. In that case
            // the result is the caller-supplied "finished" state, or true if none was supplied.
     461    terminated->addIncoming(finished ? finished : b->getTrue(), kernelEntry);
            // The initial "isFinal" state is the caller-supplied "finished" state, or false if none was supplied.
     462    Value * isFinal = finished ? finished : b->getFalse();
     463
     464    b->SetInsertPoint(kernelCode);
     465    for (unsigned i = 0; i < inputs.size(); ++i) {
     466
     467        const Binding & input = inputs[i];
     468
     469        const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
     470
     471        const auto name = input.getName();
     472
                // producedItemCount is filled in by updateProducedAndConsumedCounts after each kernel is emitted, so a
                // missing entry means no earlier kernel has written to this buffer.
     473        const auto p = producedItemCount.find(buffer);
     474        if (LLVM_UNLIKELY(p == producedItemCount.end())) {
     475            report_fatal_error(kernel->getName() + " uses stream set " + name + " prior to its definition");
     476        }
     477        Value * const produced = p->second;
                // strideLength: upper bound on the items consumed per stride (rounded up);
                // segmentLength: that bound scaled by the number of strides per segment.
     478        const auto ub = kernel->getUpperBound(input.getRate()); assert (ub > 0);
     479        const auto strideLength = ceiling(ub * kernel->getStride()) ;
     480        Constant * const segmentLength = b->getSize(strideLength * codegen::SegmentSize);
     481
     482        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     483            b->CreateAssert(b->CreateICmpULE(segmentLength, b->getCapacity(name)),
     484                            kernel->getName() + ": " + name + " upper bound of segment length exceeds buffer capacity");
     485        }
     486
     487        Value * limit = nullptr;
     488        if (input.getRate().isFixed()) {
     489            // If the input is deferred, simply adding the segment length to the processed item count may result in
     490            // setting a limit that is too low. Instead, just calculate the limit of all fixed rates from the segment no.
     491            limit = b->CreateMul(b->CreateAdd(segNo, b->getSize(1)), segmentLength);
     492        } else {
     493            Value * const processed = b->getProcessedItemCount(name);
     494            limit = b->CreateAdd(processed, segmentLength);
     495        }
     496
     497        // TODO: currently, if we produce the exact amount as our limit states, we will have to process one additional segment
     498        // before we can consider this kernel finished. We ought to be able to avoid doing so in some cases but need to prove
     499        // it's always safe to do so.
     500
                // consumingAll holds when everything produced so far lies below the limit; the kernel is given
                // min(produced, limit) items and can only be final if this holds for every input.
     501        Value * const consumingAll = b->CreateICmpULT(produced, limit);
     502        args[i + 2] = b->CreateSelect(consumingAll, produced, limit);
     503        isFinal = b->CreateAnd(isFinal, consumingAll);
     504
     505        // Check for available input (if it's both computable and not guaranteed to be sufficient by the processing rates)
     506        for (auto e : make_iterator_range(in_edges(u, G))) {
                    // note: this p (a copy of the edge property) shadows the iterator p declared above
     507            const auto p = G[e];
     508            if (p.buffer == buffer) {
     509                BasicBlock * const sufficient = b->CreateBasicBlock(name + "_hasSufficientData");
     510
     511                Constant * const sl = b->getSize(strideLength);
     512
                        // Compute the number of produced items that have not been processed yet.
                        // NOTE(review): for fixed rates the limit above scales segNo by segmentLength whereas this
                        // scales it by the stride length only -- confirm that this is intended.
     513                Value * remaining = nullptr;
     514                if (input.getRate().isFixed()) {
     515                    remaining = b->CreateMul(segNo, sl);
     516                } else {
     517                    remaining = b->getProcessedItemCount(name);
     518                }
     519                remaining = b->CreateSub(produced, remaining);
     520
                        // Proceed if at least one stride of data remains or this is the final segment; otherwise jump to
                        // kernelExit, reporting "not terminated" through the PHI node.
     521                Value * const hasSufficientData = b->CreateOr(b->CreateICmpUGE(remaining, sl), isFinal);
     522                terminated->addIncoming(b->getFalse(), b->GetInsertBlock());
     523                b->CreateLikelyCondBr(hasSufficientData, sufficient, kernelExit);
     524                b->SetInsertPoint(sufficient);
     525            }
     526        }
     527    }
     528
     529    applyOutputBufferExpansions(b, kernel);
     530
     531    args[0] = kernel->getInstance();
     532    args[1] = isFinal;
     533
     534    b->createDoSegmentCall(args);
     535
            // A kernel without inputs, or one that can terminate early, may have set the termination signal itself
            // during the call above; fold that into the final state.
     536    if (inputs.empty() || kernel->canTerminateEarly()) {
     537        isFinal = b->CreateOr(isFinal, b->getTerminationSignal());
     538    }
     539    b->setTerminationSignal(isFinal);
     540//    b->CallPrintInt(kernel->getName() + "_finished", isFinal);
            // Place the exit block after the last block emitted for this kernel and fall through to it
     541    BasicBlock * const kernelFinished = b->GetInsertBlock();
     542    kernelExit->moveAfter(kernelFinished);
     543    b->CreateBr(kernelExit);
     544
     545    b->SetInsertPoint(kernelExit);
     546    terminated->addIncoming(isFinal, kernelFinished);
     547
     548    updateProducedAndConsumedCounts(b, kernel);
     549
     550    return terminated;
     551}
     552
     553/** ------------------------------------------------------------------------------------------------------------- *
    483554 * @brief applyOutputBufferExpansions
    484555 ** ------------------------------------------------------------------------------------------------------------- */
    485 void applyOutputBufferExpansions(const std::unique_ptr<KernelBuilder> & b, const std::string & name, DynamicBuffer * const db, const uint64_t baseSize) {
    486     BasicBlock * const doExpand = BasicBlock::Create(b->getContext(), name + "Expand", b->GetInsertBlock()->getParent());
    487     BasicBlock * const nextBlock = b->GetInsertBlock()->getNextNode();
    488     doExpand->moveAfter(b->GetInsertBlock());
    489     BasicBlock * const bufferReady = b->CreateBasicBlock(name + "Ready");
    490     bufferReady->moveAfter(doExpand);
    491     if (nextBlock) nextBlock->moveAfter(bufferReady);
    492 
    493     Value * const handle = db->getStreamSetHandle();
    494 
    495     Value * const produced = b->getProducedItemCount(name);
    496     Value * const consumed = b->getConsumedItemCount(name);
    497     Value * const required = b->CreateAdd(b->CreateSub(produced, consumed), b->getSize(2 * baseSize));
    498 
    499     b->CreateCondBr(b->CreateICmpUGT(required, db->getCapacity(b.get(), handle)), doExpand, bufferReady);
    500 
    501     b->SetInsertPoint(doExpand);
    502     db->doubleCapacity(b.get(), handle);
    503     // Ensure that capacity is sufficient by successive doubling, if necessary.
    504     b->CreateCondBr(b->CreateICmpUGT(required, db->getBufferedSize(b.get(), handle)), doExpand, bufferReady);
    505 
    506     b->SetInsertPoint(bufferReady);
    507 }
    508 
    509 void applyOutputBufferExpansions(const std::unique_ptr<KernelBuilder> & b, const Kernel * k) {
     556void PipelineGenerator::applyOutputBufferExpansions(const std::unique_ptr<KernelBuilder> & b, const Kernel * k) {
    510557    const auto & outputs = k->getStreamSetOutputBuffers();
    511558    for (unsigned i = 0; i < outputs.size(); i++) {
    512559        if (isa<DynamicBuffer>(outputs[i])) {
    513             const auto ub = k->getUpperBound(k->getStreamOutput(i).getRate());
    514             const auto baseSize = (ub.numerator() * k->getStride() + ub.denominator() - 1) / ub.denominator();
     560            const auto baseSize = ceiling(k->getUpperBound(k->getStreamOutput(i).getRate()) * k->getStride() * codegen::SegmentSize);
    515561            if (LLVM_LIKELY(baseSize > 0)) {
     562
    516563                const auto & name = k->getStreamOutput(i).getName();
    517                 applyOutputBufferExpansions(b, name, cast<DynamicBuffer>(outputs[i]), baseSize);
     564
     565                BasicBlock * const doExpand = b->CreateBasicBlock(name + "Expand");
     566                BasicBlock * const nextBlock = b->GetInsertBlock()->getNextNode();
     567                doExpand->moveAfter(b->GetInsertBlock());
     568                BasicBlock * const bufferReady = b->CreateBasicBlock(name + "Ready");
     569                bufferReady->moveAfter(doExpand);
     570                if (nextBlock) nextBlock->moveAfter(bufferReady);
     571
     572                Value * const produced = b->getProducedItemCount(name);
     573                Value * const consumed = b->getConsumedItemCount(name);
     574                Value * const required = b->CreateAdd(b->CreateSub(produced, consumed), b->getSize(2 * baseSize));
     575
     576                b->CreateCondBr(b->CreateICmpUGT(required, b->getBufferedSize(name)), doExpand, bufferReady);
     577                b->SetInsertPoint(doExpand);
     578
     579                b->doubleCapacity(name);
     580                // Ensure that capacity is sufficient by successive doubling, if necessary.
     581                b->CreateCondBr(b->CreateICmpUGT(required, b->getBufferedSize(name)), doExpand, bufferReady);
     582
     583                b->SetInsertPoint(bufferReady);
     584
    518585            }
    519586        }
     
    522589
    523590/** ------------------------------------------------------------------------------------------------------------- *
    524  * @brief handleInsufficientData
     591 * @brief updateProducedAndConsumedCounts
    525592 ** ------------------------------------------------------------------------------------------------------------- */
    526 inline void handleInsufficientData(const std::unique_ptr<KernelBuilder> & b, Value * const produced, Value * const final, BasicBlock * const insufficient,
    527                                    const Kernel * const consumer,  const Binding & input, const StreamSetBuffer * const buffer) {
    528     const Kernel * const producer = buffer->getProducer();
    529     const Binding & output = producer->getStreamOutput(buffer);
    530     const auto consumedRate = consumer->getUpperBound(input.getRate()) * consumer->getStride();
    531     if (consumedRate > 0) {
    532         auto producedRate = producer->getLowerBound(output.getRate()) * producer->getStride();
    533         if (LLVM_UNLIKELY(input.hasLookahead())) {
    534             producedRate -= input.getLookahead();
    535         }
    536         if (LLVM_UNLIKELY(producedRate < consumedRate)) {
    537             const auto name = input.getName();
    538             BasicBlock * const sufficient = BasicBlock::Create(b->getContext(), name + "IsSufficient", b->GetInsertBlock()->getParent());
    539             Value * const processed = b->getProcessedItemCount(name);
    540 
    541             if (LLVM_UNLIKELY(DebugOptionIsSet(codegen::EnableAsserts))) {
    542                 b->CreateAssert(b->CreateICmpULE(processed, produced), input.getName() + ": processed cannot exceed produced");
    543             }
    544             Value * const unread = b->CreateSub(produced, processed);
    545             Constant * const amount = ConstantInt::get(unread->getType(), ceiling(consumedRate));
    546             Value * const cond = b->CreateOr(b->CreateICmpUGE(unread, amount), final);
    547             b->CreateLikelyCondBr(cond, sufficient, insufficient);
    548             b->SetInsertPoint(sufficient);
    549         }
    550     }
     593void PipelineGenerator::updateProducedAndConsumedCounts(const std::unique_ptr<KernelBuilder> & b, const Kernel * kernel) {
            // Records the item counts of the given kernel after it has been emitted:
            //  - for each input buffer, the minimum processed item count over the kernels seen so far is kept in
            //    consumedItemCount, and written to the producer's consumed count once the last consumer is reached;
            //  - for each output buffer, the produced item count is stored in producedItemCount.
     594
     595    const auto & inputs = kernel->getStreamInputs();
     596    for (unsigned i = 0; i < inputs.size(); ++i) {
     597        Value * const processed = b->getProcessedItemCount(inputs[i].getName());
     598
     599        const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
     600        auto f = consumedItemCount.find(buffer);
     601        Value * consumed = processed;
                // The first kernel seen for a buffer records its processed count; later ones take the unsigned minimum
     602        if (f == consumedItemCount.end()) {
     603            consumedItemCount.emplace(buffer, consumed);
     604        } else {
     605            consumed = b->CreateUMin(consumed, f->second);
     606            f->second = consumed;
     607        }
     608
     609        // If this kernel is the last consumer of an input buffer, update the consumed count for that buffer.
     610        const auto c = lastConsumer.find(buffer);
     611        assert (c != lastConsumer.end());
     612        if (c->second == kernel) {
     613            Kernel * const producer = buffer->getProducer();
     614            const auto & output = producer->getStreamOutput(buffer);
                    // No consumed count is written for an output with a relative rate
     615            if (output.getRate().isRelative()) continue;
                    // Switch the builder to the producer while setting the count, then switch back to this kernel
     616            b->setKernel(producer);
     617
     618            b->setConsumedItemCount(output.getName(), consumed);
     619            b->setKernel(kernel);
     620        }
     621    }
     622
            // Record the produced item count of every output buffer; each buffer may be recorded only once
     623    const auto & outputs = kernel->getStreamOutputs();
     624    for (unsigned i = 0; i < outputs.size(); ++i) {
     625        Value * const produced = b->getProducedItemCount(outputs[i].getName());
     626        const StreamSetBuffer * const buf = kernel->getStreamSetOutputBuffer(i);
     627        assert (producedItemCount.count(buf) == 0);
     628        producedItemCount.emplace(buf, produced);
     629    }
     630
    551631}
    552632
    553 /** ------------------------------------------------------------------------------------------------------------- *
    554  * @brief requiresCopyBack
    555  ** ------------------------------------------------------------------------------------------------------------- */
    556 bool requiresCopyBack(const Kernel * k, const ProcessingRate & rate) {
    557     if (rate.isBounded() || rate.isUnknown()) {
    558         return true;
    559     } else if (rate.isRelative()) {
    560         return requiresCopyBack(k, k->getBinding(rate.getReference()).getRate());
    561     }
    562     return false;
    563 }
     633
  • icGREP/icgrep-devel/icgrep/u8u16.cpp

    r5853 r5856  
    284284    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
    285285   
    286     Kernel * mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder, segmentSize);
     286    Kernel * mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder);
    287287    mmapK->setInitialArguments({fileDecriptor});
    288288    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
     
    368368    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
    369369   
    370     Kernel * mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder, segmentSize);
     370    Kernel * mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder);
    371371    mmapK->setInitialArguments({fileDecriptor});
    372372    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
Note: See TracChangeset for help on using the changeset viewer.