Changeset 5793


Ignore:
Timestamp:
Dec 20, 2017, 11:42:53 AM (6 months ago)
Author:
nmedfort
Message:

Bug fix for pipeline: it was terminating too early when there was insufficient output space to process all of the input for a kernel.

Location:
icGREP/icgrep-devel/icgrep
Files:
26 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r5778 r5793  
    248248  COMMAND ./run_all "${CMAKE_BINARY_DIR}/u8u16 -segment-size=16 -enable-segment-pipeline-parallel")
    249249
    250 #add_test(
    251 #  NAME lz4d_test
    252 #  WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/../QA/lz4d
    253 #  COMMAND ./run_all ${CMAKE_BINARY_DIR}/lz4d)
     250add_test(
     251  NAME lz4d_test
     252  WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/../QA/lz4d
     253  COMMAND ./run_all ${CMAKE_BINARY_DIR}/lz4d)
    254254
    255255add_test(
  • icGREP/icgrep-devel/icgrep/editd/editd.cpp

    r5755 r5793  
    339339    auto CCResults = pxDriver.addBuffer<ExternalBuffer>(iBuilder, iBuilder->getStreamSetTy(4), outputStream);
    340340    auto ccck = pxDriver.addKernelInstance<PreprocessKernel>(iBuilder);
     341    // NOTE: CCResults are never consumed because they are written directly into an external buffer. This may make analysis difficult.
    341342    pxDriver.makeKernelCall(ccck, {BasisBits}, {CCResults});
    342343
  • icGREP/icgrep-devel/icgrep/grep_engine.cpp

    r5787 r5793  
    101101    const unsigned encodingBits = 8;
    102102
    103     StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments + 1);
     103    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments);
    104104    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
    105105    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
    106106
    107     StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments + 1);
     107    StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
    108108    kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, encodingBits);
    109109    mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
  • icGREP/icgrep-devel/icgrep/kernels/alignedprint.cpp

    r5626 r5793  
    270270PrintableBits::PrintableBits(const std::unique_ptr<kernel::KernelBuilder> & builder)
    271271: BlockOrientedKernel("PrintableBits", {Binding{builder->getStreamSetTy(1), "bitStream"}}, {Binding{builder->getStreamSetTy(1, 8), "byteStream"}}, {}, {}, {}) {
    272     setNoTerminateAttribute(true);
     272
    273273}
    274274
    275275SelectStream::SelectStream(const std::unique_ptr<kernel::KernelBuilder> & builder, unsigned sizeInputStreamSet, unsigned streamIndex)
    276276: BlockOrientedKernel("SelectStream", {Binding{builder->getStreamSetTy(sizeInputStreamSet), "bitStreams"}}, {Binding{builder->getStreamSetTy(1, 1), "bitStream"}}, {}, {}, {}), mSizeInputStreamSet(sizeInputStreamSet), mStreamIndex(streamIndex) {
    277     setNoTerminateAttribute(true);
    278277
    279278}
     
    281280ExpandOrSelectStreams::ExpandOrSelectStreams(const std::unique_ptr<kernel::KernelBuilder> & builder, unsigned sizeInputStreamSet, unsigned sizeOutputStreamSet)
    282281: BlockOrientedKernel("ExpandOrSelectStreams", {Binding{builder->getStreamSetTy(sizeInputStreamSet), "bitStreams"}}, {Binding{builder->getStreamSetTy(sizeOutputStreamSet), "outputbitStreams"}}, {}, {}, {}), mSizeInputStreamSet(sizeInputStreamSet), mSizeOutputStreamSet(sizeOutputStreamSet) {
    283     setNoTerminateAttribute(true);
    284282
    285283}
  • icGREP/icgrep-devel/icgrep/kernels/attributes.h

    r5782 r5793  
    105105        /** INPUT/OUTPUT STREAM ATTRIBUTES **/
    106106
     107        Misaligned,
     108
     109        // Assume that we cannot statically compute the alignment of this stream set and
     110        // perform any operations accordingly
     111
    107112        BlockSize, /// NOT DONE
    108113
     
    198203        // a MultiBlock kernel will select the *maximum* input item count as it's
    199204        // principle item length and zero-extend the streams accordingly.
    200 
    201         CanTerminate,
    202 
    203         // Informs the pipeline that this kernel can pass a "termination" message to it.
    204         // in which case the pipeline will propogate the message to the subsequent
    205         // kernels and end the program once the final kernel has returned its result.
    206205
    207206    };
     
    253252    friend Attribute LookBehind(const unsigned);
    254253    friend Attribute Deferred();
     254    friend Attribute Misaligned();
    255255    friend Attribute ConditionalRegionBegin();
    256256    friend Attribute ConditionalRegionEnd();
     
    331331}
    332332
     333inline Attribute Misaligned() {
     334    return Attribute(Attribute::KindId::Misaligned, 0);
     335}
     336
    333337inline Attribute ConditionalRegionBegin() {
    334338    return Attribute(Attribute::KindId::ConditionalRegionBegin, 0);
  • icGREP/icgrep-devel/icgrep/kernels/evenodd.cpp

    r5440 r5793  
    2020EvenOddKernel::EvenOddKernel(const std::unique_ptr<kernel::KernelBuilder> & builder)
    2121: BlockOrientedKernel("EvenOdd", {Binding{builder->getStreamSetTy(8, 1), "BasisBits"}}, {Binding{builder->getStreamSetTy(2, 1), "even_odd"}}, {}, {}, {}) {
    22     setNoTerminateAttribute(true);
    2322
    2423}
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp

    r5782 r5793  
    229229InvertMatchesKernel::InvertMatchesKernel(const std::unique_ptr<kernel::KernelBuilder> & builder)
    230230: BlockOrientedKernel("Invert",
    231     // Inputs
    232     {Binding{builder->getStreamSetTy(1, 1), "matchedLines"}, Binding{builder->getStreamSetTy(1, 1), "lineBreaks"}},
    233     // Outputs
    234     {Binding{builder->getStreamSetTy(1, 1), "nonMatches"}},
    235     // Input/Output Scalars and internal state
    236     {}, {}, {}) {
    237     setNoTerminateAttribute(true);   
     231// Inputs
     232{Binding{builder->getStreamSetTy(1, 1), "matchedLines"}, Binding{builder->getStreamSetTy(1, 1), "lineBreaks"}},
     233// Outputs
     234{Binding{builder->getStreamSetTy(1, 1), "nonMatches"}},
     235// Input/Output Scalars and internal state
     236{}, {}, {}) {
     237
    238238}
    239239
  • icGREP/icgrep-devel/icgrep/kernels/interface.h

    r5782 r5793  
    6363    bool hasLookahead() const {
    6464        return hasAttribute(AttributeId::LookAhead);
     65    }
     66
     67    bool isMisaligned() const {
     68        return hasAttribute(AttributeId::Misaligned);
    6569    }
    6670
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5783 r5793  
    269269    }
    270270    addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
    271     addScalar(idb->getInt1Ty(), TERMINATION_SIGNAL);
     271    addScalar(sizeTy, TERMINATION_SIGNAL);
    272272    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    273273        addScalar(sizeTy, mStreamSetOutputs[i].getName() + CONSUMED_ITEM_COUNT_SUFFIX);
     
    628628inline unsigned MultiBlockKernel::getItemAlignment(const Binding & binding) const {
    629629    const auto & rate = binding.getRate();
    630     if (rate.isFixed() && binding.nonDeferred()) {
     630    if (rate.isFixed() && binding.nonDeferred() && !binding.isMisaligned()) {
    631631        const auto r = rate.getRate();
    632632        auto n = (r.numerator() * mStride);
     
    664664    }
    665665
    666     using AttributeId = kernel::Attribute::KindId;
    667666    using RateValue = ProcessingRate::RateValue;
    668667
     
    725724    Constant * const BLOCK_WIDTH_MASK = b->getSize(b->getBitBlockWidth() - 1);
    726725
     726    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     727        Value * terminatedTwice = b->CreateAnd(mIsFinal, b->getTerminationSignal());
     728        Value * unprocessedData = nullptr;
     729        for (unsigned i = 0; i < inputSetCount; i++) {
     730            Value * processed = b->getProcessedItemCount(mStreamSetInputs[i].getName());
     731            Value * const check = b->CreateICmpNE(processed, mAvailableItemCount[i]);
     732            unprocessedData = unprocessedData ? b->CreateOr(unprocessedData, check) : check;
     733        }
     734        b->CreateAssertZero(b->CreateAnd(terminatedTwice, unprocessedData),
     735                            getName() + " was called after its termination with additional input data");
     736        b->CreateAssertZero(terminatedTwice,
     737                            getName() + " was called after its termination");
     738    }
     739
    727740    // Now proceed with creation of the doSegment method.
    728741    BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
     
    745758        const auto & name = input.getName();
    746759        const ProcessingRate & rate = input.getRate();
    747         Value * processed = b->getProcessedItemCount(name);
    748         //b->CallPrintInt(getName() + "_" + name + "_processed", processed);
     760        Value * const processed = b->getProcessedItemCount(name);
    749761
    750762        mInitialProcessedItemCount[i] = processed;
    751763        Value * baseBuffer  = b->getBlockAddress(name, b->CreateLShr(processed, LOG_2_BLOCK_WIDTH));
    752764
    753         if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    754             b->CreateAssert(b->CreateICmpULT(processed, mAvailableItemCount[i]), "processed item count must be less than the available item count");
    755         }
     765        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {           
     766            b->CreateAssert(b->CreateICmpULE(processed, mAvailableItemCount[i]),
     767                            getName() + ": " + name + " processed item count exceeds its available item count");
     768        }
     769
     770        // Ensure that everything between S⌈P/S⌉, and S⌈n*(P + L)/S⌉ is linearly available, where S is the stride size,
     771        // P is the current processed position, L is the lookahead amount and n ∈ ℤ+.
    756772
    757773        Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], processed);
    758         //b->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed);
    759 
    760774        Value * avail = b->getLinearlyAccessibleItems(name, processed, unprocessed);
    761         //b->CallPrintInt(getName() + "_" + name + "_avail", avail);
    762 
    763 
    764         // Ensure that everything between S⌈P/S⌉, and S⌈n*(P + L)/S⌉ is linearly available, where S is
    765         // the stride size, P is the current processed position, L is the lookahead amount and n ∈ ℤ+.
    766 
    767775        Value * remaining = avail;
    768776        if (LLVM_UNLIKELY(input.hasLookahead())) {
    769777            Constant * const lookahead = b->getSize(input.getLookahead());
    770778            remaining = b->CreateSelect(b->CreateICmpULT(lookahead, remaining), b->CreateSub(remaining, lookahead), ZERO);
    771             //b->CallPrintInt(getName() + "_" + name + "_remaining", remaining);
    772779        }
    773780
    774781        inputStrideSize[i] = getStrideSize(b, rate);
    775 
    776782        Value * accessibleStrides = b->CreateUDiv(remaining, inputStrideSize[i]);
    777 
    778         //b->CallPrintInt(getName() + "_" + name + "_accessibleStrides", accessibleStrides);
    779 
    780783        AllocaInst * const tempBuffer = temporaryInputBuffer[i];
    781784        if (tempBuffer) {
     
    795798            Value * const temporarySize = b->CreateTrunc(b->CreateMul(arraySize, b->getInt64(mStride)), unprocessed->getType());
    796799            Value * const temporaryAvailable = b->CreateUMin(unprocessed, temporarySize);
    797             if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    798                 b->CreateAssert(b->CreateICmpULE(avail, temporaryAvailable),
    799                                 "linearly available item count cannot exceed the temporarily available item count");
    800             }
    801800            Value * const offset = b->CreateAnd(processed, BLOCK_WIDTH_MASK);
    802801            Value * const bufferSize = b->CreateMul(ConstantExpr::getSizeOf(tempBuffer->getAllocatedType()), arraySize);
     
    816815
    817816            b->SetInsertPoint(resume);
    818             PHINode * const bufferPtr = b->CreatePHI(baseBuffer->getType(), 3);
    819             bufferPtr->addIncoming(baseBuffer , entry);
     817            PHINode * const bufferPtr = b->CreatePHI(baseBuffer->getType(), 4);
     818            bufferPtr->addIncoming(baseBuffer, entry);
    820819            bufferPtr->addIncoming(tempBuffer, copyToBackEnd);
    821820            bufferPtr->addIncoming(tempBuffer, copyToFrontEnd);
    822821            baseBuffer = bufferPtr;
    823822
    824             PHINode * const phiAvailItemCount = b->CreatePHI(b->getSizeTy(), 3);
     823            PHINode * const phiAvailItemCount = b->CreatePHI(b->getSizeTy(), 4);
    825824            phiAvailItemCount->addIncoming(avail, entry);
    826825            phiAvailItemCount->addIncoming(temporaryAvailable, copyToBackEnd);
     
    828827            avail = phiAvailItemCount;
    829828
    830             PHINode * const phiStrides = b->CreatePHI(b->getSizeTy(), 2);
     829            PHINode * const phiStrides = b->CreatePHI(b->getSizeTy(), 4);
    831830            phiStrides->addIncoming(accessibleStrides, entry);
    832831            phiStrides->addIncoming(temporaryStrides, copyToBackEnd);
     
    849848        const ProcessingRate & rate = output.getRate();
    850849        Value * const produced = b->getProducedItemCount(name);
    851 
    852         //b->CallPrintInt(getName() + "_" + name + "_produced", produced);
    853 
    854850        Value * baseBuffer = b->getBlockAddress(name, b->CreateLShr(produced, LOG_2_BLOCK_WIDTH));
    855851        assert (baseBuffer->getType()->isPointerTy());
    856852        linearlyWritable[i] = b->getLinearlyWritableItems(name, produced);
    857 
    858         //b->CallPrintInt(getName() + "_" + name + "_linearlyWritable", linearlyWritable[i]);
    859 
    860853        outputStrideSize[i] = getStrideSize(b, rate);
    861854        // Is the number of linearly writable items sufficient for a stride?
     
    863856            AllocaInst * const tempBuffer = temporaryOutputBuffer[i];
    864857            Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);
    865             //b->CallPrintInt(getName() + "_" + name + "_writableStrides", writableStrides);
    866 
    867 
    868858            // Do we require a temporary buffer to write to?
    869859            if (tempBuffer) {
    870860                assert (tempBuffer->getType() == baseBuffer->getType());
    871861                BasicBlock * const entry = b->GetInsertBlock();
    872                 BasicBlock * const clearBuffer = b->CreateBasicBlock(name + "ClearTemporaryBuffer");
     862                BasicBlock * const prepareTempBuffer = b->CreateBasicBlock(name + "PrepareTempBuffer");
    873863                BasicBlock * const resume = b->CreateBasicBlock(name + "Resume");
    874864                Value * const requiresCopy = b->CreateICmpEQ(writableStrides, ZERO);
    875                 b->CreateUnlikelyCondBr(requiresCopy, clearBuffer, resume);
     865                b->CreateUnlikelyCondBr(requiresCopy, prepareTempBuffer, resume);
    876866                // Clear the output buffer prior to using it
    877                 b->SetInsertPoint(clearBuffer);
     867                b->SetInsertPoint(prepareTempBuffer);
    878868                Value * const bufferSize = b->CreateMul(ConstantExpr::getSizeOf(tempBuffer->getAllocatedType()), tempBuffer->getArraySize());
    879869                b->CreateMemZero(tempBuffer, bufferSize, blockAlignment);
     
    883873                PHINode * const phiBuffer = b->CreatePHI(baseBuffer->getType(), 3);
    884874                phiBuffer->addIncoming(baseBuffer, entry);
    885                 phiBuffer->addIncoming(tempBuffer, clearBuffer);
     875                phiBuffer->addIncoming(tempBuffer, prepareTempBuffer);
    886876                baseBuffer = phiBuffer;
    887877                PHINode * const phiStrides = b->CreatePHI(b->getSizeTy(), 2);
    888878                phiStrides->addIncoming(writableStrides, entry);
    889                 phiStrides->addIncoming(ONE, clearBuffer);
     879                phiStrides->addIncoming(ONE, prepareTempBuffer);
    890880                writableStrides = phiStrides;
    891881            }
     
    964954            continue;
    965955        }
     956
    966957        Value * const baseBuffer = mStreamSetOutputBaseAddress[i];
    967958        assert ("stack corruption likely" && (tempBuffer->getType() == baseBuffer->getType()));
     
    994985    //  We've dealt with the partial block processing and copied information back into the
    995986    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
    996     if (hasNoTerminateAttribute()) {
    997         b->CreateCondBr(mIsFinal, segmentDone, strideDone);
    998     } else {
    999         BasicBlock * const setTermination = b->CreateBasicBlock("setTermination");
    1000         b->CreateCondBr(mIsFinal, setTermination, strideDone);
    1001 
    1002         b->SetInsertPoint(setTermination);
    1003         b->setTerminationSignal();
    1004         b->CreateBr(segmentDone);       
    1005     }
     987    BasicBlock * const setTermination = b->CreateBasicBlock("setTermination");
     988    b->CreateCondBr(mIsFinal, setTermination, strideDone);
     989
     990    b->SetInsertPoint(setTermination);
     991    b->setTerminationSignal();
     992    b->CreateBr(segmentDone);
    1006993
    1007994    /// STRIDE DONE
    1008995    strideDone->moveAfter(b->GetInsertBlock());
    1009996    b->SetInsertPoint(strideDone);
    1010 
    1011     b->CreateAssertZero(mIsFinal, "stride done cannot process the final block");
    1012997
    1013998    // do we have enough data for another stride?
     
    10191004        Value * const processed = b->getProcessedItemCount(name);
    10201005        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    1021             b->CreateAssert(b->CreateICmpULE(processed, avail), getName() + "." + name + ": processed data exceeds available data");
     1006            b->CreateAssert(b->CreateICmpULE(processed, avail), getName() + ": " + name + " processed data exceeds available data");
    10221007        }
    10231008        Value * remaining = b->CreateSub(avail, processed);
    1024         if (LLVM_UNLIKELY(input.hasAttribute(AttributeId::LookAhead))) {
    1025             Constant * const lookahead = b->getSize(input.findAttribute(AttributeId::LookAhead).amount());
     1009        if (LLVM_UNLIKELY(input.hasLookahead())) {
     1010            Constant * const lookahead = b->getSize(input.getLookahead());
    10261011            remaining = b->CreateSelect(b->CreateICmpULT(lookahead, remaining), b->CreateSub(remaining, lookahead), ZERO);
    10271012        }
     
    10421027            Value * const consumed = b->getConsumedItemCount(name);
    10431028            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    1044                 b->CreateAssert(b->CreateICmpULE(consumed, produced), getName() + "." + name + ": consumed data exceeds produced data");
     1029                b->CreateAssert(b->CreateICmpULE(consumed, produced),
     1030                                getName() + ": " + name + " consumed data exceeds produced data");
    10451031            }
    10461032            Value * const unconsumed = b->CreateSub(produced, consumed);
    10471033            Value * const capacity = b->getCapacity(name);
    10481034            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    1049                 b->CreateAssert(b->CreateICmpULE(unconsumed, capacity), getName() + "." + name + ": unconsumed data exceeds capacity");
     1035                b->CreateAssert(b->CreateICmpULE(unconsumed, capacity),
     1036                                getName() + ": " + name + " unconsumed data exceeds capacity");
    10501037            }
    10511038            Value * const remaining = b->CreateSub(capacity, unconsumed);
    10521039            Value * const remainingStrides = b->CreateUDiv(remaining, outputStrideSize[i]);
    10531040            Value * const hasRemainingStrides = b->CreateICmpNE(remainingStrides, ZERO);
     1041
    10541042            hasMoreStrides = b->CreateAnd(hasMoreStrides, hasRemainingStrides);
    10551043        }
    10561044        // Do copybacks if necessary.
    10571045        if (mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate)) {
    1058             b->CreateCopyBack(name, mInitialProducedItemCount[i], produced);
    1059         }
    1060     }
    1061 
    1062     // b->CreateAssertZero(b->CreateOr(b->CreateNot(initiallyFinal), hasMoreStrides), getName() + " does not have enough output space for the final stride");
     1046            BasicBlock * const copyBack = b->CreateBasicBlock(name + "CopyBack");
     1047            BasicBlock * const done = b->CreateBasicBlock(name + "CopyBackDone");
     1048
     1049            Value * const bufferSize = b->getBufferedSize(name);
     1050            Value * const prior = b->CreateURem(mInitialProducedItemCount[i], bufferSize);
     1051            Value * const current = b->CreateURem(produced, bufferSize);
     1052            b->CreateUnlikelyCondBr(b->CreateICmpUGT(prior, current), copyBack, done);
     1053
     1054            b->SetInsertPoint(copyBack);
     1055            Value * const baseAddress = b->getBaseAddress(name);
     1056            const auto copyAlignment = getItemAlignment(mStreamSetOutputs[i]);
     1057            b->CreateStreamCpy(name, baseAddress, ZERO, baseAddress, bufferSize, current, copyAlignment);
     1058            b->CreateBr(done);
     1059
     1060            b->SetInsertPoint(done);
     1061        }
     1062    }
    10631063
    10641064    b->CreateCondBr(hasMoreStrides, segmentLoop, segmentDone);
     
    13441344    }
    13451345
    1346     writeFinalBlockMethod(b, getRemainingItems(b));
     1346    Value * const remainingItems = getRemainingItems(b);
     1347
     1348//    b->CallPrintInt(getName() + "_remainingItems", remainingItems);
     1349
     1350    writeFinalBlockMethod(b, remainingItems);
    13471351
    13481352    b->CreateBr(segmentDone);
     
    15511555, mCurrentMethod(nullptr)
    15521556, mAvailablePrincipalItemCount(nullptr)
    1553 , mNoTerminateAttribute(false)
    15541557, mIsGenerated(false)
    15551558, mStride(0)
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r5782 r5793  
    110110    void finalizeInstance(const std::unique_ptr<kernel::KernelBuilder> & idb) final;
    111111
    112     bool hasNoTerminateAttribute() const {
    113         return mNoTerminateAttribute;
    114     }
    115 
    116112    StreamPort getStreamPort(const std::string & name) const;
    117113
     
    220216          Bindings && scalar_outputs,
    221217          Bindings && internal_scalars);
    222 
    223     void setNoTerminateAttribute(const bool noTerminate = true) {
    224         mNoTerminateAttribute = noTerminate;
    225     }
    226218
    227219    llvm::Value * getPrincipalItemCount() const {
     
    297289    llvm::Function *                    mCurrentMethod;
    298290    llvm::Value *                       mAvailablePrincipalItemCount;
    299     bool                                mNoTerminateAttribute;
    300291    bool                                mIsGenerated;
    301292    unsigned                            mStride;
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r5782 r5793  
    118118
    119119Value * KernelBuilder::getTerminationSignal() {
    120     if (mKernel->hasNoTerminateAttribute()) {
    121         return getFalse();
    122     }
    123     return getScalarField(Kernel::TERMINATION_SIGNAL);
     120    return CreateICmpNE(getScalarField(Kernel::TERMINATION_SIGNAL), getSize(0));
    124121}
    125122
    126123void KernelBuilder::setTerminationSignal(llvm::Value * const value) {
    127     assert (!mKernel->hasNoTerminateAttribute());
    128124    assert (value->getType() == getInt1Ty());
    129125    if (codegen::DebugOptionIsSet(codegen::TraceCounts)) {
    130126        CallPrintIntToStderr(mKernel->getName() + ": setTerminationSignal", value);
    131127    }
    132     setScalarField(Kernel::TERMINATION_SIGNAL, value);
     128    setScalarField(Kernel::TERMINATION_SIGNAL, CreateZExt(value, getSizeTy()));
    133129}
    134130
     
    142138    return buf->getLinearlyWritableItems(this, getStreamHandle(name), fromPosition, getConsumedItemCount(name), reverse);
    143139}
    144 
    145 //Value * KernelBuilder::getLinearlyCopyableItems(const std::string & name, Value * fromPosition, bool reverse) {
    146 //    const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
    147 //    return buf->getLinearlyCopyableItems(this, getStreamHandle(name), fromPosition, reverse);
    148 //}
    149140
    150141/** ------------------------------------------------------------------------------------------------------------- *
     
    196187
    197188    const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
     189
    198190    const auto itemWidth = getItemWidth(buf->getBaseType());
    199191    assert ("invalid item width" && is_power_2(itemWidth));
     
    202194    // (w.r.t the stream copy) would be n*m. By taking this into account we can optimize and simplify the copy code.
    203195    const auto fieldWidth = getFieldWidth(itemWidth * itemAlignment, blockWidth);
    204 
    205 //    CallPrintInt(mKernel->getName() + "_" + name + "_target", target);
    206 //    CallPrintInt(mKernel->getName() + "_" + name + "_targetOffset", targetOffset);
    207 //    CallPrintInt(mKernel->getName() + "_" + name + "_source", source);
    208 //    CallPrintInt(mKernel->getName() + "_" + name + "_sourceOffset", sourceOffset);
    209 //    CallPrintInt(mKernel->getName() + "_" + name + "_itemsToCopy", itemsToCopy);
    210 
     196    const auto alignment = (fieldWidth + 7) / 8;
    211197    if (LLVM_LIKELY(itemWidth < fieldWidth)) {
    212         Constant * const factor = getSize(fieldWidth / itemWidth);
    213         CreateAssertZero(CreateURem(targetOffset, factor), "target offset is not a multiple of its field width");
    214         targetOffset = CreateUDiv(targetOffset, factor);
    215         CreateAssertZero(CreateURem(sourceOffset, factor), "source offset is not a multiple of its field width");
    216         sourceOffset = CreateUDiv(sourceOffset, factor);
     198        const auto factor = fieldWidth / itemWidth;
     199        Constant * const FACTOR = getSize(factor);
     200        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     201            ConstantInt * const ALIGNMENT = getSize(alignment);
     202            const auto kernelName = mKernel->getName()+ ": " + name;
     203            CreateAssertZero(CreateURem(CreatePtrToInt(target, getSizeTy()), ALIGNMENT), kernelName + " target is misaligned (" + std::to_string(alignment) + ")");
     204            CreateAssertZero(CreateURem(targetOffset, FACTOR), kernelName + " target offset is misaligned (" + std::to_string(factor) + ")");
     205            CreateAssertZero(CreateURem(CreatePtrToInt(source, getSizeTy()), ALIGNMENT), kernelName + " source is misaligned (" + std::to_string(alignment) + ")");
     206            CreateAssertZero(CreateURem(sourceOffset, FACTOR), kernelName + " source offset is misaligned (" + std::to_string(factor) + ")");
     207        }
     208        targetOffset = CreateUDiv(targetOffset, FACTOR);
     209        sourceOffset = CreateUDiv(sourceOffset, FACTOR);
    217210    }
    218211
     
    240233
    241234    */
    242 
    243     const auto alignment = (fieldWidth + 7) / 8;
    244235
    245236    Type * const fieldWidthTy = getIntNTy(fieldWidth);
     
    396387}
    397388
    398 void KernelBuilder::CreateCopyBack(const std::string & name, llvm::Value * from, llvm::Value * to) {
    399     const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
    400     buf->genCopyBackLogic(this, getStreamHandle(name), from, to, name);
    401 }
    402 
    403389Value * KernelBuilder::getConsumerLock(const std::string & name) {
    404390    return getScalarField(name + Kernel::CONSUMER_SUFFIX);
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.h

    r5782 r5793  
    101101    llvm::Value * getBlockAddress(const std::string & name, llvm::Value * const blockIndex);
    102102
    103     void CreateCopyBack(const std::string & name, llvm::Value * from, llvm::Value * to);
    104 
    105103    void setBaseAddress(const std::string & name, llvm::Value * addr);
    106104
  • icGREP/icgrep-devel/icgrep/kernels/linebreak_kernel.cpp

    r5782 r5793  
    5858    PabloAST * const LF = pb.createExtract(getInput(1), ZERO, "LF");
    5959    PabloAST * const CR = ccc.compileCC(makeCC(0x0D));
    60     PabloAST * const LF_VT_FF_CR = ccc.compileCC(makeCC(0x0A, 0x0D));
     60    PabloAST * const LF_VT_FF_CR = ccc.compileCC("LF,VT,FF,CR", makeCC(0x0A, 0x0D), pb);
    6161    Var * const LineBreak = pb.createVar("LineBreak", LF_VT_FF_CR);
    6262
     
    9393    it3.createAssign(LineBreak, it3.createOr(LineBreak, LS_PS));
    9494
    95     PabloAST * unterminatedLineAtEOF = pb.createAtEOF(pb.createAdvance(pb.createNot(LineBreak), 1));
     95    PabloAST * unterminatedLineAtEOF = pb.createAtEOF(pb.createAdvance(pb.createNot(LineBreak), 1), "unterminatedLineAtEOF");
    9696    pb.createAssign(pb.createExtract(getOutput(0), ZERO), pb.createOr(LineBreak, unterminatedLineAtEOF, "EOL"));
    9797}
  • icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.cpp

    r5755 r5793  
    175175LZ4ByteStreamDecoderKernel::LZ4ByteStreamDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, size_t bufferSize)
    176176: MultiBlockKernel("lz4ByteStreamDecoder",
    177     // Inputs
    178     {Binding{iBuilder->getStreamSetTy(2, 32), "literalIndexes"},
    179      Binding{iBuilder->getStreamSetTy(2, 32), "matchIndexes"},
    180      Binding{iBuilder->getStreamSetTy(1, 8), "inputStream", FixedRate(), { Deferred(), LookBehind(65536) }}},
    181     // Outputs
    182     {Binding{iBuilder->getStreamSetTy(1, 8), "outputStream", UnknownRate()}},
    183     // Arguments
    184     {},
    185     {},
    186     {}),
    187  mBufferSize(bufferSize) {
    188     setNoTerminateAttribute(true);
     177// Inputs
     178{Binding{iBuilder->getStreamSetTy(2, 32), "literalIndexes"},
     179 Binding{iBuilder->getStreamSetTy(2, 32), "matchIndexes"},
     180 Binding{iBuilder->getStreamSetTy(1, 8), "inputStream", FixedRate(), { Deferred(), Misaligned(), LookBehind(65536) }}},
     181// Outputs
     182{Binding{iBuilder->getStreamSetTy(1, 8), "outputStream", UnknownRate()}},
     183// Arguments
     184{},
     185{},
     186{})
     187, mBufferSize(bufferSize) {
     188
    189189}
     190
     191
  • icGREP/icgrep-devel/icgrep/kernels/lz4_index_decoder.cpp

    r5755 r5793  
    684684LZ4IndexDecoderKernel::LZ4IndexDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & b)
    685685: BlockOrientedKernel("lz4IndexDecoder",
    686     // Inputs
    687     {Binding{b->getStreamSetTy(1, 8), "byteStream"},
    688      Binding{b->getStreamSetTy(1, 1), "extenders"}},
    689     // Outputs: literal start, literal length, match offset, match length
    690     {Binding{b->getStreamSetTy(2, 32), "literalIndexes", UnknownRate()},
    691      Binding{b->getStreamSetTy(2, 32), "matchIndexes", RateEqualTo("literalIndexes")}},
    692     // Arguments
    693     {Binding{b->getInt1Ty(), "hasBlockChecksum"}},
    694     {},
    695     // Internal states:
    696     {Binding{b->getInt32Ty(), "BlockNo"},
    697      Binding{b->getInt8Ty(), "State"},
    698      Binding{b->getInt32Ty(), "LZ4BlockStart"},
    699      Binding{b->getInt32Ty(), "LZ4BlockEnd"},
    700      Binding{b->getInt32Ty(), "BytesToSkip"},
    701      Binding{b->getInt32Ty(), "TempLength"},
    702      Binding{b->getInt32Ty(), "TempCount"},
    703      Binding{b->getInt32Ty(), "LiteralStart"},
    704      Binding{b->getInt32Ty(), "LiteralLength"},
    705      Binding{b->getInt32Ty(), "MatchOffset"},
    706      Binding{b->getInt32Ty(), "MatchLength"}})
     686// Inputs
     687{Binding{b->getStreamSetTy(1, 8), "byteStream", FixedRate(), Misaligned()},
     688 Binding{b->getStreamSetTy(1, 1), "extenders"}},
     689// Outputs: literal start, literal length, match offset, match length
     690{Binding{b->getStreamSetTy(2, 32), "literalIndexes", UnknownRate()},
     691 Binding{b->getStreamSetTy(2, 32), "matchIndexes", RateEqualTo("literalIndexes")}},
     692// Arguments
     693{Binding{b->getInt1Ty(), "hasBlockChecksum"}},
     694{},
     695// Internal states:
     696{Binding{b->getInt32Ty(), "BlockNo"},
     697 Binding{b->getInt8Ty(), "State"},
     698 Binding{b->getInt32Ty(), "LZ4BlockStart"},
     699 Binding{b->getInt32Ty(), "LZ4BlockEnd"},
     700 Binding{b->getInt32Ty(), "BytesToSkip"},
     701 Binding{b->getInt32Ty(), "TempLength"},
     702 Binding{b->getInt32Ty(), "TempCount"},
     703 Binding{b->getInt32Ty(), "LiteralStart"},
     704 Binding{b->getInt32Ty(), "LiteralLength"},
     705 Binding{b->getInt32Ty(), "MatchOffset"},
     706 Binding{b->getInt32Ty(), "MatchLength"}})
    707707, wordWidth{b->getSizeTy()->getBitWidth()} {
    708     setNoTerminateAttribute(true);
    709 }
     708
     709}
  • icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp

    r5757 r5793  
    208208    {Binding{b->getStreamSetTy(8, 1), "basisBits"}}, {}, {}, {}),
    209209  mAligned(aligned) {
    210     setNoTerminateAttribute(true);
    211 }
    212 
    213 }
     210    if (!aligned) {
     211        mStreamSetInputs[0].addAttribute(Misaligned());
     212    }
     213}
     214}
  • icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.cpp

    r5755 r5793  
    1515namespace kernel {
    1616
    17 Value * StdOutKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) {
    18     Value * codeUnitBuffer = iBuilder->getInputStreamBlockPtr("codeUnitBuffer", iBuilder->getInt32(0));
    19     codeUnitBuffer = iBuilder->CreatePointerCast(codeUnitBuffer, iBuilder->getInt8PtrTy());
     17Value * StdOutKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) {
     18    Value * codeUnitBuffer = b->getInputStreamBlockPtr("codeUnitBuffer", b->getInt32(0));
     19    codeUnitBuffer = b->CreatePointerCast(codeUnitBuffer, b->getInt8PtrTy());
    2020    Value * bytesToDo = mAvailableItemCount[0];
    2121    if (LLVM_UNLIKELY(mCodeUnitWidth > 8)) {
    22         bytesToDo = iBuilder->CreateMul(bytesToDo, iBuilder->getSize(mCodeUnitWidth / 8));
     22        bytesToDo = b->CreateMul(bytesToDo, b->getSize(mCodeUnitWidth / 8));
    2323    } else if (LLVM_UNLIKELY(mCodeUnitWidth < 8)) {
    24         bytesToDo = iBuilder->CreateUDiv(bytesToDo, iBuilder->getSize(8 / mCodeUnitWidth));
     24        bytesToDo = b->CreateUDiv(bytesToDo, b->getSize(8 / mCodeUnitWidth));
    2525    }
    26     iBuilder->CreateWriteCall(iBuilder->getInt32(1), codeUnitBuffer, bytesToDo);
     26    b->CreateWriteCall(b->getInt32(1), codeUnitBuffer, bytesToDo);
    2727    return numOfStrides;
    2828}
    2929
    30 StdOutKernel::StdOutKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned codeUnitWidth)
    31 : MultiBlockKernel("stdout", {Binding{iBuilder->getStreamSetTy(1, codeUnitWidth), "codeUnitBuffer"}}, {}, {}, {}, {})
     30StdOutKernel::StdOutKernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned codeUnitWidth)
     31: MultiBlockKernel("stdout", {Binding{b->getStreamSetTy(1, codeUnitWidth), "codeUnitBuffer"}}, {}, {}, {}, {})
    3232, mCodeUnitWidth(codeUnitWidth) {
    33     setNoTerminateAttribute(true);
    3433    // setKernelStride(getpagesize());
    3534}
    3635
    37 void FileSink::generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
    38     BasicBlock * setTerminationOnFailure = iBuilder->CreateBasicBlock("setTerminationOnFailure");
    39     BasicBlock * fileSinkInitExit = iBuilder->CreateBasicBlock("fileSinkInitExit");
    40     Value * fileName = iBuilder->getScalarField("fileName");
    41     Value * fileNameLength = iBuilder->CreateStrlenCall(fileName);
     36void FileSink::generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & b) {
     37    BasicBlock * setTerminationOnFailure = b->CreateBasicBlock("setTerminationOnFailure");
     38    BasicBlock * fileSinkInitExit = b->CreateBasicBlock("fileSinkInitExit");
     39    Value * fileName = b->getScalarField("fileName");
     40    Value * fileNameLength = b->CreateStrlenCall(fileName);
    4241    // Make a temporary file name template with the characters "XXXXXX" appended
    4342    // as required by mkstemp.
    44     Constant * suffixPlusNullLength = iBuilder->getSize(7);
    45     Value * tmpFileNamePtr = iBuilder->CreatePointerCast(iBuilder->CreateMalloc(iBuilder->CreateAdd(fileNameLength, suffixPlusNullLength)), iBuilder->getInt8PtrTy());
    46     iBuilder->setScalarField("tmpFileName", tmpFileNamePtr);
    47     iBuilder->CreateMemCpy(tmpFileNamePtr, fileName, fileNameLength, 1);
     43    Constant * suffixPlusNullLength = b->getSize(7);
     44    Value * tmpFileNamePtr = b->CreatePointerCast(b->CreateMalloc(b->CreateAdd(fileNameLength, suffixPlusNullLength)), b->getInt8PtrTy());
     45    b->setScalarField("tmpFileName", tmpFileNamePtr);
     46    b->CreateMemCpy(tmpFileNamePtr, fileName, fileNameLength, 1);
    4847#ifdef BACKUP_OLDFILE
    49     iBuilder->CreateMemCpy(iBuilder->CreateGEP(tmpFileNamePtr, fileNameLength), iBuilder->GetString(".saved"), suffixPlusNullLength, 1);
    50     iBuilder->CreateRenameCall(fileName, tmpFileNamePtr);
     48    b->CreateMemCpy(b->CreateGEP(tmpFileNamePtr, fileNameLength), b->GetString(".saved"), suffixPlusNullLength, 1);
     49    b->CreateRenameCall(fileName, tmpFileNamePtr);
    5150#else
    52     iBuilder->CreateUnlinkCall(fileName);
     51    b->CreateUnlinkCall(fileName);
    5352#endif
    54     iBuilder->CreateMemCpy(iBuilder->CreateGEP(tmpFileNamePtr, fileNameLength), iBuilder->GetString("XXXXXX"), suffixPlusNullLength, 1);
    55     Value * fileDes = iBuilder->CreateMkstempCall(tmpFileNamePtr);
    56     iBuilder->setScalarField("fileDes", fileDes);
    57     Value * failure = iBuilder->CreateICmpEQ(fileDes, iBuilder->getInt32(-1));
    58     iBuilder->CreateCondBr(failure, setTerminationOnFailure, fileSinkInitExit);
    59     iBuilder->SetInsertPoint(setTerminationOnFailure);
    60     iBuilder->setTerminationSignal();
    61     iBuilder->CreateBr(fileSinkInitExit);
    62     iBuilder->SetInsertPoint(fileSinkInitExit);
     53    b->CreateMemCpy(b->CreateGEP(tmpFileNamePtr, fileNameLength), b->GetString("XXXXXX"), suffixPlusNullLength, 1);
     54    Value * fileDes = b->CreateMkstempCall(tmpFileNamePtr);
     55    b->setScalarField("fileDes", fileDes);
     56    Value * failure = b->CreateICmpEQ(fileDes, b->getInt32(-1));
     57    b->CreateCondBr(failure, setTerminationOnFailure, fileSinkInitExit);
     58
     59    b->SetInsertPoint(setTerminationOnFailure);
     60    b->setTerminationSignal();
     61    b->CreateBr(fileSinkInitExit);
     62
     63    b->SetInsertPoint(fileSinkInitExit);
    6364}
    6465
    65 Value * FileSink::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, Value * const numOfStrides) {
    66     BasicBlock * const closeFile = iBuilder->CreateBasicBlock("closeFile");
    67     BasicBlock * const fileOutExit = iBuilder->CreateBasicBlock("fileOutExit");
    68 
    69     Value * const fileDes = iBuilder->getScalarField("fileDes");
    70     Value * codeUnitBuffer = iBuilder->getInputStreamBlockPtr("codeUnitBuffer", iBuilder->getInt32(0));
    71     codeUnitBuffer = iBuilder->CreatePointerCast(codeUnitBuffer, iBuilder->getInt8PtrTy());
     66Value * FileSink::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfStrides) {
     67    Value * const fileDes = b->getScalarField("fileDes");
     68    Value * codeUnitBuffer = b->getInputStreamBlockPtr("codeUnitBuffer", b->getInt32(0));
     69    codeUnitBuffer = b->CreatePointerCast(codeUnitBuffer, b->getInt8PtrTy());
    7270    Value * bytesToDo = mAvailableItemCount[0];
    7371    if (LLVM_UNLIKELY(mCodeUnitWidth > 8)) {
    74         bytesToDo = iBuilder->CreateMul(bytesToDo, iBuilder->getSize(mCodeUnitWidth / 8));
     72        bytesToDo = b->CreateMul(bytesToDo, b->getSize(mCodeUnitWidth / 8));
    7573    } else if (LLVM_UNLIKELY(mCodeUnitWidth < 8)) {
    76         bytesToDo = iBuilder->CreateUDiv(bytesToDo, iBuilder->getSize(8 / mCodeUnitWidth));
     74        bytesToDo = b->CreateUDiv(bytesToDo, b->getSize(8 / mCodeUnitWidth));
    7775    }   
    78     iBuilder->CreateWriteCall(fileDes, codeUnitBuffer, bytesToDo);
    79     iBuilder->CreateUnlikelyCondBr(mIsFinal, closeFile, fileOutExit);
    80 
    81     iBuilder->SetInsertPoint(closeFile);   
    82     iBuilder->CreateCloseCall(fileDes);
    83     Value * newFileNamePtr = iBuilder->getScalarField("fileName");
    84     Value * tmpFileNamePtr = iBuilder->getScalarField("tmpFileName");
    85     iBuilder->CreateRenameCall(tmpFileNamePtr, newFileNamePtr);
    86     iBuilder->CreateFree(tmpFileNamePtr);   
    87     iBuilder->CreateBr(fileOutExit);
    88    
    89     iBuilder->SetInsertPoint(fileOutExit);
     76    b->CreateWriteCall(fileDes, codeUnitBuffer, bytesToDo);
    9077    return numOfStrides;
    9178}
    9279
    93 FileSink::FileSink(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned codeUnitWidth)
     80void FileSink::generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & b) {
     81    Value * const fileDes = b->getScalarField("fileDes");
     82    b->CreateCloseCall(fileDes);
     83    Value * newFileNamePtr = b->getScalarField("fileName");
     84    Value * tmpFileNamePtr = b->getScalarField("tmpFileName");
     85    b->CreateRenameCall(tmpFileNamePtr, newFileNamePtr);
     86    b->CreateFree(tmpFileNamePtr);
     87}
     88
     89FileSink::FileSink(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned codeUnitWidth)
    9490: MultiBlockKernel("filesink" + std::to_string(codeUnitWidth),
    95 {Binding{iBuilder->getStreamSetTy(1, codeUnitWidth), "codeUnitBuffer"}},
     91{Binding{b->getStreamSetTy(1, codeUnitWidth), "codeUnitBuffer"}},
    9692{},
    97 {Binding{iBuilder->getInt8PtrTy(), "fileName"}}, {}, {Binding{iBuilder->getInt8PtrTy(), "tmpFileName"}, Binding{iBuilder->getInt32Ty(), "fileDes"}})
     93{Binding{b->getInt8PtrTy(), "fileName"}}, {}, {Binding{b->getInt8PtrTy(), "tmpFileName"}, Binding{b->getInt32Ty(), "fileDes"}})
    9894, mCodeUnitWidth(codeUnitWidth) {
    9995    // setKernelStride(getpagesize());
  • icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.h

    r5755 r5793  
    2828    void generateInitializeMethod(const std::unique_ptr<KernelBuilder> & iBuilder) override;
    2929    llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
     30    void generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & b) override;
    3031private:
    3132    const unsigned mCodeUnitWidth;
  • icGREP/icgrep-devel/icgrep/kernels/streamset.cpp

    r5782 r5793  
    163163    }
    164164    consumed = b->CreateURem(consumed, bufferSize);
    165     Value * const limit = b->CreateSelect(b->CreateICmpULE(consumed, fromPosition), bufferSize, consumed);
     165    Constant * capacity = bufferSize;
     166    if (mOverflowBlocks) {
     167        capacity = ConstantInt::get(fromPosition->getType(), (mBufferBlocks + mOverflowBlocks) * b->getStride());
     168    }
     169    Value * const limit = b->CreateSelect(b->CreateICmpULE(consumed, fromPosition), capacity, consumed);
    166170    return b->CreateNUWSub(limit, fromPosition);
    167171}
     
    219223}
    220224
    221 void StreamSetBuffer::genCopyBackLogic(IDISA::IDISA_Builder * const b, Value * const handle, Value * priorProduced, Value * newProduced, const std::string Name) const {
    222     report_fatal_error("Copy back not supported for this buffer type:" + Name);
    223 }
    224 
    225225// Source File Buffer
    226226
     
    338338}
    339339
    340 Value * CircularBuffer::getLinearlyCopyableItems(IDISA::IDISA_Builder * const b, Value * const handle, Value * fromPosition, Value * availItems, bool reverse) const {
    341 //    Constant * bufSize = ConstantInt::get(priorProduced->getType(), mBufferBlocks * b->getBitBlockWidth());
    342 //    Value * from = b->CreateURem(fromPosition, bufSize);
    343 //    Value * avail = b->CreateURem(availItems, bufSize);
    344 //    Value * wraparound = b->CreateICmpUGT(from, avail);
    345 
    346 
    347     return nullptr;
    348 }
    349 
    350340Value * CircularBuffer::getRawItemPointer(IDISA::IDISA_Builder * const b, Value * const handle, Value * absolutePosition) const {
    351341    Value * ptr = getBaseAddress(b, handle);
     
    372362}
    373363
    374 Value * CircularCopybackBuffer::getLinearlyWritableItems(IDISA::IDISA_Builder * const b, Value * const handle, Value * fromPosition, Value * consumed, bool reverse) const {
    375     Value * writableProper = StreamSetBuffer::getLinearlyWritableItems(b, handle, fromPosition, consumed, reverse);
    376     if (reverse) return writableProper;
    377     return b->CreateAdd(writableProper, b->getSize(mOverflowBlocks * b->getBitBlockWidth()));
    378 }
    379 
    380 void CircularCopybackBuffer::genCopyBackLogic(IDISA::IDISA_Builder * const b, Value * const handle, Value * priorProduced, Value * newProduced, const std::string Name) const {
    381     assert (priorProduced->getType() == newProduced->getType());
    382     Constant * bufSize = ConstantInt::get(priorProduced->getType(), mBufferBlocks * b->getBitBlockWidth());
    383     Value * priorBufPos = b->CreateURem(priorProduced, bufSize);
    384     Value * newBufPos = b->CreateURem(newProduced, bufSize);
    385     BasicBlock * copyBack = b->CreateBasicBlock(Name + "_circularCopyBack");
    386     BasicBlock * done = b->CreateBasicBlock(Name + "_circularCopyBackDone");
    387     Value * wraparound = b->CreateICmpUGT(priorBufPos, newBufPos);
    388     b->CreateCondBr(wraparound, copyBack, done);
    389 
    390     b->SetInsertPoint(copyBack);
    391     Value * const baseAddress = getBaseAddress(b, handle);
    392     Value * overflowAddress = b->CreateGEP(baseAddress, b->getInt32(mBufferBlocks));
    393     // copyStream(b, baseAddress, b->getSize(0), overflowAddress, b->getSize(0), newBufPos);
    394     createBlockAlignedCopy(b, baseAddress, overflowAddress, newBufPos);
    395     b->CreateBr(done);
    396 
    397     b->SetInsertPoint(done);
    398 }
    399 
    400364
    401365// SwizzledCopybackBuffer Buffer
    402 
    403366void SwizzledCopybackBuffer::allocateBuffer(const std::unique_ptr<kernel::KernelBuilder> & b) {
    404367    Type * const ty = getType();
     
    450413}
    451414
    452 Value * SwizzledCopybackBuffer::getLinearlyWritableItems(IDISA::IDISA_Builder * const b, Value * const handle, Value * fromPosition, Value *consumed, bool reverse) const {
    453     Value * writableProper = StreamSetBuffer::getLinearlyWritableItems(b, handle, fromPosition, consumed, reverse);
    454     if (reverse) return writableProper;
    455     return b->CreateAdd(writableProper, b->getSize(mOverflowBlocks * b->getBitBlockWidth()));
    456 }
    457 
    458 void SwizzledCopybackBuffer::genCopyBackLogic(IDISA::IDISA_Builder * const b, Value * const handle, Value * priorProduced, Value * newProduced, const std::string Name) const {
    459     assert (priorProduced->getType() == newProduced->getType());
    460     Constant * bufSize = ConstantInt::get(priorProduced->getType(), mBufferBlocks * b->getBitBlockWidth());
    461     Value * priorBufPos = b->CreateURem(priorProduced, bufSize);
    462     Value * newBufPos = b->CreateURem(newProduced, bufSize);
    463     BasicBlock * copyBack = b->CreateBasicBlock(Name + "_swizzledCopyBack");
    464     BasicBlock * done = b->CreateBasicBlock(Name + "_swizzledCopyBackDone");
    465     Value * wraparound = b->CreateICmpUGT(priorBufPos, newBufPos);
    466     b->CreateCondBr(wraparound, copyBack, done);
    467     b->SetInsertPoint(copyBack);
    468     Value * overFlowAreaPtr = b->CreateGEP(handle, b->getSize(mBufferBlocks));
    469     createBlockAlignedCopy(b, handle, overFlowAreaPtr, newBufPos);
    470     b->CreateBr(done);
    471     b->SetInsertPoint(done);
    472 }
    473 
    474415// Expandable Buffer
    475416
     
    707648    Value * ptr = b->CreateGEP(handle, {b->getInt32(0), b->getInt32(int(Field::WorkingBlocks))});
    708649    return b->CreateMul(b->CreateLoad(ptr), b->getSize(b->getBitBlockWidth()));
    709 }
    710 
    711 void DynamicBuffer::genCopyBackLogic(IDISA::IDISA_Builder * const b, Value * const handle, Value * priorProducedCount, Value * newProducedCount, const std::string Name) const {
    712     assert (priorProducedCount->getType() == newProducedCount->getType());   
    713     Value * workingBlocks = b->CreateLoad(b->CreateGEP(handle, {b->getInt32(0), b->getInt32(int(DynamicBuffer::Field::WorkingBlocks))}));
    714     assert (workingBlocks->getType() == newProducedCount->getType());
    715     Value * bufSize = b->CreateMul(workingBlocks, ConstantInt::get(workingBlocks->getType(), b->getBitBlockWidth()));
    716     Value * priorBufPos = b->CreateURem(priorProducedCount, bufSize);
    717     Value * newBufPos = b->CreateURem(newProducedCount, bufSize);
    718     BasicBlock * copyBack = b->CreateBasicBlock(Name + "_dynamicCopyBack");
    719     BasicBlock * done = b->CreateBasicBlock(Name + "_dynamicCopyBackDone");
    720 
    721     Value * wraparound = b->CreateICmpUGT(priorBufPos, newBufPos);
    722     b->CreateCondBr(wraparound, copyBack, done);
    723 
    724     b->SetInsertPoint(copyBack);
    725     Value * bufBasePtr = getBaseAddress(b, handle);
    726     Value * overFlowAreaPtr = b->CreateGEP(bufBasePtr, workingBlocks);
    727     createBlockAlignedCopy(b, bufBasePtr, overFlowAreaPtr, newBufPos);
    728     b->CreateBr(done);
    729 
    730     b->SetInsertPoint(done);
    731650}
    732651
  • icGREP/icgrep-devel/icgrep/kernels/streamset.h

    r5782 r5793  
    8989    virtual llvm::Value * getLinearlyAccessibleItems(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * fromPos, llvm::Value * avail, bool reverse = false) const;
    9090
    91     virtual llvm::Value * getLinearlyCopyableItems(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * fromPos, llvm::Value * avail, bool reverse = false) const {
    92         return getLinearlyAccessibleItems(b, handle, fromPos, avail, reverse);
    93     }
    94    
    9591    void createBlockCopy(IDISA::IDISA_Builder * const b, llvm::Value * targetBlockPtr, llvm::Value * sourceBlockPtr, llvm::Value * blocksToCopy) const;
    9692
     
    106102        return mOverflowBlocks;
    107103    }
    108 
    109     virtual void genCopyBackLogic(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * priorProduced, llvm::Value * newProduced, const std::string) const;
    110104   
    111105    virtual ~StreamSetBuffer() = 0;
     
    221215    llvm::Value * getRawItemPointer(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * absolutePosition) const final;
    222216
    223     llvm::Value * getLinearlyCopyableItems(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * fromPos, llvm::Value * avail, bool reverse = false) const final;
    224 
    225217protected:
    226218
     
    243235   
    244236    CircularCopybackBuffer(const std::unique_ptr<kernel::KernelBuilder> & b, llvm::Type * type, size_t bufferBlocks, size_t overflowBlocks, unsigned AddressSpace = 0);
    245    
    246     llvm::Value * getLinearlyWritableItems(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * fromPosition, llvm::Value * consumed, bool reverse = false) const override;
    247    
    248     void allocateBuffer(const std::unique_ptr<kernel::KernelBuilder> & b) override;
    249 
    250     void genCopyBackLogic(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * priorProduced, llvm::Value * newProduced, const std::string) const override;
     237       
     238    void allocateBuffer(const std::unique_ptr<kernel::KernelBuilder> & b) override;
    251239
    252240};
     
    260248    void createBlockAlignedCopy(IDISA::IDISA_Builder * const b, llvm::Value * targetBlockPtr, llvm::Value * sourceBlockPtr, llvm::Value * itemsToCopy, const unsigned alignment = 1) const override;
    261249
    262     llvm::Value * getLinearlyWritableItems(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * fromPosition, llvm::Value * consumed, bool reverse = false) const override;
    263    
    264     void allocateBuffer(const std::unique_ptr<kernel::KernelBuilder> & b) override;
    265 
    266     void genCopyBackLogic(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * priorProduced, llvm::Value * newProduced, const std::string) const override;
     250    void allocateBuffer(const std::unique_ptr<kernel::KernelBuilder> & b) override;
    267251
    268252protected:
     
    330314   
    331315    void doubleCapacity(IDISA::IDISA_Builder * const b, llvm::Value * handle);
    332 
    333     void genCopyBackLogic(IDISA::IDISA_Builder * const b, llvm::Value * handle, llvm::Value * priorProduced, llvm::Value * newProduced, const std::string) const override;
    334316
    335317protected:
  • icGREP/icgrep-devel/icgrep/toolchain/cpudriver.cpp

    r5773 r5793  
    119119        k->initializeInstance(iBuilder);
    120120    }
    121     if (codegen::PipelineParallel) {
    122         generateParallelPipeline(iBuilder, mPipeline);
    123     } else if (codegen::SegmentPipelineParallel) {
     121    if (codegen::SegmentPipelineParallel) {
    124122        generateSegmentParallelPipeline(iBuilder, mPipeline);
    125123    } else {
  • icGREP/icgrep-devel/icgrep/toolchain/grep_pipeline.cpp

    r5782 r5793  
    6262    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
    6363   
    64     StreamSetBuffer * BasisBits = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize + 1);
     64    StreamSetBuffer * BasisBits = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize);
    6565    kernel::Kernel * s2pk = pxDriver.addKernelInstance<kernel::S2PKernel>(idb);
    6666    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
    6767   
    6868    kernel::Kernel * linefeedK = pxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, 8);
    69     StreamSetBuffer * LineFeedStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize + 1);
     69    StreamSetBuffer * LineFeedStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
    7070    pxDriver.makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
    7171
  • icGREP/icgrep-devel/icgrep/toolchain/pipeline.cpp

    r5782 r5793  
    8585    Value * const segOffset = b->CreateLoad(b->CreateGEP(threadStruct, {b->getInt32(0), b->getInt32(1)}));
    8686
    87     BasicBlock * segmentLoop = BasicBlock::Create(b->getContext(), "segmentLoop", threadFunc);
     87    BasicBlock * const segmentLoop = BasicBlock::Create(b->getContext(), "segmentLoop", threadFunc);
    8888    b->CreateBr(segmentLoop);
    8989
     
    9191    PHINode * const segNo = b->CreatePHI(b->getSizeTy(), 2, "segNo");
    9292    segNo->addIncoming(segOffset, entryBlock);
    93 
    94     Value * terminated = b->getFalse();
    95     Value * const nextSegNo = b->CreateAdd(segNo, b->getSize(1));
    9693
    9794    BasicBlock * const exitThreadBlock = BasicBlock::Create(b->getContext(), "exitThread", threadFunc);
     
    106103    }
    107104
     105    Value * terminated = nullptr;
     106
     107    const bool serialize = codegen::DebugOptionIsSet(codegen::SerializeThreads);
     108
    108109    for (unsigned k = 0; k < n; ++k) {
    109110
     
    114115        b->CreateBr(kernelWait);
    115116
     117        BasicBlock * const kernelCheck = BasicBlock::Create(b->getContext(), kernel->getName() + "Check", threadFunc);
     118
    116119        BasicBlock * const kernelBody = BasicBlock::Create(b->getContext(), kernel->getName() + "Do", threadFunc);
    117120
     121        BasicBlock * const kernelEnd = BasicBlock::Create(b->getContext(), kernel->getName() + "End", threadFunc);
     122
    118123        b->SetInsertPoint(kernelWait);
    119         const unsigned waitIdx = codegen::DebugOptionIsSet(codegen::SerializeThreads) ? (n - 1) : k;
    120 
    121         b->setKernel(kernels[waitIdx]);
     124
     125        b->setKernel(kernels[serialize ? (n - 1) : k]);
    122126        Value * const processedSegmentCount = b->acquireLogicalSegmentNo();
    123127        b->setKernel(kernel);
    124128
    125129        assert (processedSegmentCount->getType() == segNo->getType());
    126         Value * const ready = b->CreateICmpEQ(segNo, processedSegmentCount);
    127 
    128         if (kernel->hasNoTerminateAttribute()) {
    129             b->CreateCondBr(ready, kernelBody, kernelWait);
    130         } else { // If the kernel was terminated in a previous segment then the pipeline is done.
    131             BasicBlock * kernelTerminated = BasicBlock::Create(b->getContext(), kernel->getName() + "Terminated", threadFunc, 0);
    132             BasicBlock * exitBlock = BasicBlock::Create(b->getContext(), kernel->getName() + "Exit", threadFunc, 0);
    133             b->CreateCondBr(ready, kernelTerminated, kernelWait);
    134 
    135             b->SetInsertPoint(kernelTerminated);
    136             Value * terminationSignal = b->getTerminationSignal();
    137             b->CreateCondBr(terminationSignal, exitBlock, kernelBody);
    138             b->SetInsertPoint(exitBlock);
    139             b->releaseLogicalSegmentNo(nextSegNo); // Ensure that the next thread will also exit.
    140             b->CreateBr(exitThreadBlock);
    141         }
    142 
    143         BasicBlock * const kernelEnd = BasicBlock::Create(b->getContext(), kernel->getName() + "End", threadFunc);
     130        Value * const ready = b->CreateICmpEQ(segNo, processedSegmentCount);       
     131        b->CreateCondBr(ready, kernelCheck, kernelWait);
     132
     133        b->SetInsertPoint(kernelCheck);
     134        b->CreateUnlikelyCondBr(b->getTerminationSignal(), kernelEnd, kernelBody);
    144135
    145136        // Execute the kernel segment
    146137        b->SetInsertPoint(kernelBody);
    147138        const auto & inputs = kernel->getStreamInputs();
    148         std::vector<Value *> args = {kernel->getInstance(), terminated};
     139        Value * const isFinal = b->CreateOr(terminated ? terminated : b->getFalse(), b->getTerminationSignal());
     140        std::vector<Value *> args = {kernel->getInstance(), isFinal};
    149141        for (unsigned i = 0; i < inputs.size(); ++i) {
    150142            const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
     
    153145            Value * const produced = f->second;
    154146            args.push_back(produced);
    155             handleInsufficientData(b, produced, terminated, kernelEnd, kernel, inputs[i], buffer);
     147            handleInsufficientData(b, produced, isFinal, kernelEnd, kernel, inputs[i], buffer);
    156148        }
    157149
     
    162154        b->SetInsertPoint(kernelEnd);
    163155
    164         if (!kernel->hasNoTerminateAttribute()) {
    165             terminated = b->CreateOr(terminated, b->getTerminationSignal());
     156        Value * const finished = b->getTerminationSignal();
     157        if (terminated) { // all kernels must terminate
     158            terminated = b->CreateAnd(terminated, finished);
     159        } else {
     160            terminated = finished;
    166161        }
    167162
     
    190185            b->CreateStore(b->CreateAdd(b->CreateLoad(counterPtr), b->CreateSub(cycleCountEnd, cycleCountStart)), counterPtr);
    191186            cycleCountStart = cycleCountEnd;
    192         }       
    193         b->releaseLogicalSegmentNo(nextSegNo);
     187        }
     188
     189        b->releaseLogicalSegmentNo(b->CreateAdd(segNo, b->getSize(1)));
    194190    }
    195191
     
    213209
    214210    segNo->addIncoming(b->CreateAdd(segNo, b->getSize(codegen::ThreadNum)), b->GetInsertBlock());
     211    if (LLVM_UNLIKELY(terminated == nullptr)) {
     212        report_fatal_error("error: at least one kernel must have a termination signal");
     213    }
    215214    b->CreateUnlikelyCondBr(terminated, exitThreadBlock, segmentLoop);
    216215
     
    240239    // -------------------------------------------------------------------------------------------------------------------------
    241240    const unsigned threads = codegen::ThreadNum - 1;
    242     assert (codegen::ThreadNum > 1);
     241    assert (codegen::ThreadNum > 0);
    243242    Type * const pthreadsTy = ArrayType::get(sizeTy, threads);
    244243    AllocaInst * const pthreads = b->CreateAlloca(pthreadsTy);
     
    279278    }
    280279   
    281     if (DebugOptionIsSet(codegen::EnableCycleCounter)) {
     280    if (LLVM_UNLIKELY(DebugOptionIsSet(codegen::EnableCycleCounter))) {
     281        for (const Kernel * kernel : kernels) {
     282            b->setKernel(kernel);
     283            const auto & inputs = kernel->getStreamInputs();
     284            const auto & outputs = kernel->getStreamOutputs();
     285            Value * items = nullptr;
     286            if (inputs.empty()) {
     287                items = b->getProducedItemCount(outputs[0].getName());
     288            } else {
     289                items = b->getProcessedItemCount(inputs[0].getName());
     290            }
     291            Value * fItems = b->CreateUIToFP(items, b->getDoubleTy());
     292            Value * cycles = b->CreateLoad(b->getCycleCountPtr());
     293            Value * fCycles = b->CreateUIToFP(cycles, b->getDoubleTy());
     294            const auto formatString = kernel->getName() + ": %7.2e items processed; %7.2e CPU cycles,  %6.2f cycles per item.\n";
     295            Value * stringPtr = b->CreatePointerCast(b->GetString(formatString), b->getInt8PtrTy());
     296            b->CreateCall(b->GetDprintf(), {b->getInt32(2), stringPtr, fItems, fCycles, b->CreateFDiv(fCycles, fItems)});
     297        }
     298    }
     299   
     300}
     301
     302/** ------------------------------------------------------------------------------------------------------------- *
     303 * @brief generatePipelineLoop
     304 ** ------------------------------------------------------------------------------------------------------------- */
     305void generatePipelineLoop(const std::unique_ptr<KernelBuilder> & b, const std::vector<Kernel *> & kernels) {
     306
     307    BasicBlock * entryBlock = b->GetInsertBlock();
     308    Function * main = entryBlock->getParent();
     309
     310    // Create the basic blocks for the loop.
     311    BasicBlock * pipelineLoop = BasicBlock::Create(b->getContext(), "pipelineLoop", main);
     312    BasicBlock * pipelineExit = BasicBlock::Create(b->getContext(), "pipelineExit", main);
     313
     314    StreamSetBufferMap<Value *> producedItemCount;
     315    StreamSetBufferMap<Value *> consumedItemCount;
     316
     317    b->CreateBr(pipelineLoop);
     318    b->SetInsertPoint(pipelineLoop);
     319   
     320    Value * cycleCountStart = nullptr;
     321    Value * cycleCountEnd = nullptr;
     322    if (LLVM_UNLIKELY(DebugOptionIsSet(codegen::EnableCycleCounter))) {
     323        cycleCountStart = b->CreateReadCycleCounter();
     324    }
     325    Value * terminated = nullptr;
     326
     327    for (Kernel * const kernel : kernels) {
     328
     329        b->setKernel(kernel);
     330        const auto & inputs = kernel->getStreamInputs();
     331        const auto & outputs = kernel->getStreamOutputs();
     332
     333        Value * const isFinal = terminated ? terminated : b->getFalse();
     334
     335        std::vector<Value *> args = {kernel->getInstance(), isFinal};
     336
     337        for (unsigned i = 0; i < inputs.size(); ++i) {
     338            const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
     339            const auto f = producedItemCount.find(buffer);
     340            if (LLVM_UNLIKELY(f == producedItemCount.end())) {
     341                report_fatal_error(kernel->getName() + " uses stream set " + inputs[i].getName() + " prior to its definition");
     342            }
     343            Value * const produced = f->second;
     344            args.push_back(produced);
     345            handleInsufficientData(b, produced, isFinal, pipelineLoop, kernel, inputs[i], buffer);
     346        }
     347
     348        applyOutputBufferExpansions(b, kernel);
     349
     350        b->createDoSegmentCall(args);
     351
     352        Value * const finished = b->getTerminationSignal();
     353        if (terminated) {
     354            // All kernels must agree that we've terminated.
     355            terminated = b->CreateAnd(terminated, finished);
     356        } else {
     357            terminated = finished;
     358        }
     359
     360        for (unsigned i = 0; i < outputs.size(); ++i) {
     361            Value * const produced = b->getProducedItemCount(outputs[i].getName());
     362            const StreamSetBuffer * const buf = kernel->getStreamSetOutputBuffer(i);
     363            assert (producedItemCount.count(buf) == 0);
     364            producedItemCount.emplace(buf, produced);
     365        }
     366
     367        for (unsigned i = 0; i < inputs.size(); ++i) {
     368            Value * const processed = b->getProcessedItemCount(inputs[i].getName());
     369            const StreamSetBuffer * const buf = kernel->getStreamSetInputBuffer(i);
     370            auto f = consumedItemCount.find(buf);
     371            if (f == consumedItemCount.end()) {
     372                consumedItemCount.emplace(buf, processed);
     373            } else {
     374                f->second = b->CreateUMin(processed, f->second);
     375            }
     376        }
     377
     378        if (LLVM_UNLIKELY(DebugOptionIsSet(codegen::EnableCycleCounter))) {
     379            cycleCountEnd = b->CreateReadCycleCounter();
     380            Value * counterPtr = b->getCycleCountPtr();
     381            b->CreateStore(b->CreateAdd(b->CreateLoad(counterPtr), b->CreateSub(cycleCountEnd, cycleCountStart)), counterPtr);
     382            cycleCountStart = cycleCountEnd;
     383        }
     384//        Value * const segNo = b->acquireLogicalSegmentNo();
     385//        Value * nextSegNo = b->CreateAdd(segNo, b->getSize(1));
     386//        b->releaseLogicalSegmentNo(nextSegNo);
     387    }
     388
     389    for (const auto consumed : consumedItemCount) {
     390        const StreamSetBuffer * const buffer = consumed.first;
     391        Kernel * const kernel = buffer->getProducer();
     392        const auto & binding = kernel->getStreamOutput(buffer);
     393        if (LLVM_UNLIKELY(binding.getRate().isDerived())) {
     394            continue;
     395        }
     396        b->setKernel(kernel);
     397        b->setConsumedItemCount(binding.getName(), consumed.second);
     398    }
     399
     400    if (LLVM_UNLIKELY(terminated == nullptr)) {
     401        report_fatal_error("error: at least one kernel must have a termination signal");
     402    }
     403    b->CreateCondBr(terminated, pipelineExit, pipelineLoop);
     404
     405    pipelineExit->moveAfter(b->GetInsertBlock());
     406
     407    b->SetInsertPoint(pipelineExit);
     408
     409    if (LLVM_UNLIKELY(DebugOptionIsSet(codegen::EnableCycleCounter))) {
    282410        for (unsigned k = 0; k < kernels.size(); k++) {
    283411            auto & kernel = kernels[k];
     
    299427        }
    300428    }
    301    
    302 }
    303 
    304 
    305 /** ------------------------------------------------------------------------------------------------------------- *
    306  * @brief generateParallelPipeline
    307  ** ------------------------------------------------------------------------------------------------------------- */
    308 void generateParallelPipeline(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Kernel *> &kernels) {
    309 
    310     Module * const m = iBuilder->getModule();
    311     IntegerType * const sizeTy = iBuilder->getSizeTy();
    312     PointerType * const voidPtrTy = iBuilder->getVoidPtrTy();
    313     ConstantInt * bufferSegments = ConstantInt::get(sizeTy, codegen::BufferSegments - 1);
    314     ConstantInt * segmentItems = ConstantInt::get(sizeTy, codegen::SegmentSize * iBuilder->getBitBlockWidth());
    315     Constant * const nullVoidPtrVal = ConstantPointerNull::getNullValue(voidPtrTy);
    316 
    317     const unsigned n = kernels.size();
    318 
    319     Type * const pthreadsTy = ArrayType::get(sizeTy, n);
    320     AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
    321     Value * threadIdPtr[n];
    322     for (unsigned i = 0; i < n; ++i) {
    323         threadIdPtr[i] = iBuilder->CreateGEP(pthreads, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
    324     }
    325 
    326     Value * instance[n];
    327     Type * structTypes[n];
    328     for (unsigned i = 0; i < n; ++i) {
    329         instance[i] = kernels[i]->getInstance();
    330         structTypes[i] = instance[i]->getType();
    331     }
    332 
    333     Type * const sharedStructType = StructType::get(m->getContext(), ArrayRef<Type *>{structTypes, n});
    334 
    335 
    336     AllocaInst * sharedStruct = iBuilder->CreateAlloca(sharedStructType);
    337     for (unsigned i = 0; i < n; ++i) {
    338         Value * ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
    339         iBuilder->CreateStore(instance[i], ptr);
    340     }
    341 
    342     for (auto & kernel : kernels) {
    343         iBuilder->setKernel(kernel);
    344         iBuilder->releaseLogicalSegmentNo(iBuilder->getSize(0));
    345     }
    346 
    347     // GENERATE THE PRODUCING AND CONSUMING KERNEL MAPS
    348     StreamSetBufferMap<unsigned> producingKernel;
    349     StreamSetBufferMap<std::vector<unsigned>> consumingKernels;
    350     for (unsigned id = 0; id < n; ++id) {
    351         const auto & kernel = kernels[id];
    352         const auto & inputs = kernel->getStreamInputs();
    353         const auto & outputs = kernel->getStreamOutputs();
    354         // add any outputs from this kernel to the producing kernel map
    355         for (unsigned j = 0; j < outputs.size(); ++j) {
    356             const StreamSetBuffer * const buf = kernel->getStreamSetOutputBuffer(j);
    357             if (LLVM_UNLIKELY(producingKernel.count(buf) != 0)) {
    358                 report_fatal_error(kernel->getName() + " redefines stream set " + outputs[j].getName());
    359             }
    360             producingKernel.emplace(buf, id);
    361         }
    362         // and any inputs to the consuming kernels list
    363         for (unsigned j = 0; j < inputs.size(); ++j) {
    364             const StreamSetBuffer * const buf = kernel->getStreamSetInputBuffer(j);
    365             auto f = consumingKernels.find(buf);
    366             if (f == consumingKernels.end()) {
    367                 if (LLVM_UNLIKELY(producingKernel.count(buf) == 0)) {
    368                     report_fatal_error(kernel->getName() + " uses stream set " + inputs[j].getName() + " prior to its definition");
    369                 }
    370                 consumingKernels.emplace(buf, std::vector<unsigned>{ id });
    371             } else {
    372                 f->second.push_back(id);
    373             }
    374         }
    375     }
    376 
    377     const auto ip = iBuilder->saveIP();
    378 
    379     // GENERATE UNIQUE PIPELINE PARALLEL THREAD FUNCTION FOR EACH KERNEL
    380     FlatSet<unsigned> kernelSet;
    381     kernelSet.reserve(n);
    382 
    383     Function * thread_functions[n];
    384     Value * producerSegNo[n];
    385     for (unsigned id = 0; id < n; id++) {
    386         const auto & kernel = kernels[id];
    387 
    388         iBuilder->setKernel(kernel);
    389 
    390         const auto & inputs = kernel->getStreamInputs();
    391 
    392         Function * const threadFunc = makeThreadFunction(iBuilder, "ppt:" + kernel->getName());
    393         auto ai = threadFunc->arg_begin();
    394        
    395          // Create the basic blocks for the thread function.
    396         BasicBlock * entryBlock = BasicBlock::Create(iBuilder->getContext(), "entry", threadFunc);
    397         BasicBlock * outputCheckBlock = BasicBlock::Create(iBuilder->getContext(), "outputCheck", threadFunc);
    398         BasicBlock * inputCheckBlock = BasicBlock::Create(iBuilder->getContext(), "inputCheck", threadFunc);
    399         BasicBlock * doSegmentBlock = BasicBlock::Create(iBuilder->getContext(), "doSegment", threadFunc);
    400         BasicBlock * exitThreadBlock = BasicBlock::Create(iBuilder->getContext(), "exitThread", threadFunc);
    401 
    402         iBuilder->SetInsertPoint(entryBlock);
    403 
    404         Value * const sharedStruct = iBuilder->CreateBitCast(&*(ai), sharedStructType->getPointerTo());
    405 
    406         for (unsigned k = 0; k < n; k++) {
    407             Value * const ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(k)});
    408             kernels[k]->setInstance(iBuilder->CreateLoad(ptr));
    409         }
    410 
    411         iBuilder->CreateBr(outputCheckBlock);
    412 
    413         // Check whether the output buffers are ready for more data
    414         iBuilder->SetInsertPoint(outputCheckBlock);
    415         PHINode * segNo = iBuilder->CreatePHI(iBuilder->getSizeTy(), 3, "segNo");
    416         segNo->addIncoming(iBuilder->getSize(0), entryBlock);
    417         segNo->addIncoming(segNo, outputCheckBlock);
    418 
    419         Value * outputWaitCond = iBuilder->getTrue();
    420         for (const StreamSetBuffer * buf : kernel->getStreamSetOutputBuffers()) {
    421             const auto & list = consumingKernels[buf];
    422             assert(std::is_sorted(list.begin(), list.end()));
    423             kernelSet.insert(list.begin(), list.end());
    424         }
    425         for (unsigned k : kernelSet) {
    426             iBuilder->setKernel(kernels[k]);
    427             Value * consumerSegNo = iBuilder->acquireLogicalSegmentNo();
    428             assert (consumerSegNo->getType() == segNo->getType());
    429             Value * consumedSegNo = iBuilder->CreateAdd(consumerSegNo, bufferSegments);
    430             outputWaitCond = iBuilder->CreateAnd(outputWaitCond, iBuilder->CreateICmpULE(segNo, consumedSegNo));
    431         }
    432         kernelSet.clear();
    433         iBuilder->setKernel(kernel);
    434         iBuilder->CreateCondBr(outputWaitCond, inputCheckBlock, outputCheckBlock);
    435 
    436         // Check whether the input buffers have enough data for this kernel to begin
    437         iBuilder->SetInsertPoint(inputCheckBlock);
    438         for (const StreamSetBuffer * buf : kernel->getStreamSetInputBuffers()) {
    439             kernelSet.insert(producingKernel[buf]);
    440         }
    441 
    442         Value * inputWaitCond = iBuilder->getTrue();
    443         for (unsigned k : kernelSet) {
    444             iBuilder->setKernel(kernels[k]);
    445             producerSegNo[k] = iBuilder->acquireLogicalSegmentNo();
    446             assert (producerSegNo[k]->getType() == segNo->getType());
    447             inputWaitCond = iBuilder->CreateAnd(inputWaitCond, iBuilder->CreateICmpULT(segNo, producerSegNo[k]));
    448         }
    449         iBuilder->setKernel(kernel);
    450         iBuilder->CreateCondBr(inputWaitCond, doSegmentBlock, inputCheckBlock);
    451 
    452         // Process the segment
    453         iBuilder->SetInsertPoint(doSegmentBlock);
    454 
    455         Value * const nextSegNo = iBuilder->CreateAdd(segNo, iBuilder->getSize(1));
    456         Value * terminated = nullptr;
    457         if (kernelSet.empty()) {
    458             // if this kernel has no input streams, the kernel itself must decide when it terminates.
    459             terminated = iBuilder->getTerminationSignal();
    460         } else {
    461             // ... otherwise the kernel terminates only when it exhausts all of its input streams
    462             terminated = iBuilder->getTrue();
    463             for (unsigned k : kernelSet) {
    464                 iBuilder->setKernel(kernels[k]);
    465                 terminated = iBuilder->CreateAnd(terminated, iBuilder->getTerminationSignal());
    466                 terminated = iBuilder->CreateAnd(terminated, iBuilder->CreateICmpEQ(nextSegNo, producerSegNo[k]));
    467             }
    468             kernelSet.clear();
    469             iBuilder->setKernel(kernel);
    470         }
    471 
    472         std::vector<Value *> args = {kernel->getInstance(), terminated};
    473         args.insert(args.end(), inputs.size(), iBuilder->CreateMul(segmentItems, segNo));
    474 
    475         iBuilder->createDoSegmentCall(args);
    476         segNo->addIncoming(nextSegNo, doSegmentBlock);
    477         iBuilder->releaseLogicalSegmentNo(nextSegNo);
    478 
    479         iBuilder->CreateCondBr(terminated, exitThreadBlock, outputCheckBlock);
    480 
    481         iBuilder->SetInsertPoint(exitThreadBlock);
    482 
    483         iBuilder->CreatePThreadExitCall(nullVoidPtrVal);
    484 
    485         iBuilder->CreateRetVoid();
    486 
    487         thread_functions[id] = threadFunc;
    488     }
    489 
    490     iBuilder->restoreIP(ip);
    491 
    492     for (unsigned i = 0; i < n; ++i) {
    493         kernels[i]->setInstance(instance[i]);
    494     }
    495 
    496     for (unsigned i = 0; i < n; ++i) {
    497         iBuilder->CreatePThreadCreateCall(threadIdPtr[i], nullVoidPtrVal, thread_functions[i], sharedStruct);
    498     }
    499 
    500     AllocaInst * const status = iBuilder->CreateAlloca(voidPtrTy);
    501     for (unsigned i = 0; i < n; ++i) {
    502         Value * threadId = iBuilder->CreateLoad(threadIdPtr[i]);
    503         iBuilder->CreatePThreadJoinCall(threadId, status);
    504     }
    505 }
    506 
    507 /** ------------------------------------------------------------------------------------------------------------- *
    508  * @brief generatePipelineLoop
    509  ** ------------------------------------------------------------------------------------------------------------- */
    510 void generatePipelineLoop(const std::unique_ptr<KernelBuilder> & b, const std::vector<Kernel *> & kernels) {
    511 
    512     BasicBlock * entryBlock = b->GetInsertBlock();
    513     Function * main = entryBlock->getParent();
    514 
    515     // Create the basic blocks for the loop.
    516     BasicBlock * pipelineLoop = BasicBlock::Create(b->getContext(), "pipelineLoop", main);
    517     BasicBlock * pipelineExit = BasicBlock::Create(b->getContext(), "pipelineExit", main);
    518 
    519     StreamSetBufferMap<Value *> producedItemCount;
    520     StreamSetBufferMap<Value *> consumedItemCount;
    521 
    522     b->CreateBr(pipelineLoop);
    523     b->SetInsertPoint(pipelineLoop);
    524    
    525     Value * cycleCountStart = nullptr;
    526     Value * cycleCountEnd = nullptr;
    527     if (DebugOptionIsSet(codegen::EnableCycleCounter)) {
    528         cycleCountStart = b->CreateReadCycleCounter();
    529     }
    530     Value * terminated = b->getFalse();
    531 
    532     for (Kernel * const kernel : kernels) {
    533 
    534         b->setKernel(kernel);
    535         const auto & inputs = kernel->getStreamInputs();
    536         const auto & outputs = kernel->getStreamOutputs();
    537 
    538         std::vector<Value *> args = {kernel->getInstance(), terminated};
    539 
    540         for (unsigned i = 0; i < inputs.size(); ++i) {
    541             const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
    542             const auto f = producedItemCount.find(buffer);
    543             if (LLVM_UNLIKELY(f == producedItemCount.end())) {
    544                 report_fatal_error(kernel->getName() + " uses stream set " + inputs[i].getName() + " prior to its definition");
    545             }
    546             Value * const produced = f->second;
    547             args.push_back(produced);
    548             handleInsufficientData(b, produced, terminated, pipelineLoop, kernel, inputs[i], buffer);
    549         }
    550 
    551         applyOutputBufferExpansions(b, kernel);
    552 
    553         b->createDoSegmentCall(args);
    554 
    555         if (!kernel->hasNoTerminateAttribute()) {
    556             Value * terminatedSignal = b->getTerminationSignal();
    557             terminated = b->CreateOr(terminated, terminatedSignal);
    558         }
    559         for (unsigned i = 0; i < outputs.size(); ++i) {
    560             Value * const produced = b->getProducedItemCount(outputs[i].getName());
    561             const StreamSetBuffer * const buf = kernel->getStreamSetOutputBuffer(i);
    562             assert (producedItemCount.count(buf) == 0);
    563             producedItemCount.emplace(buf, produced);
    564         }
    565 
    566         for (unsigned i = 0; i < inputs.size(); ++i) {
    567             Value * const processed = b->getProcessedItemCount(inputs[i].getName());
    568             const StreamSetBuffer * const buf = kernel->getStreamSetInputBuffer(i);
    569             auto f = consumedItemCount.find(buf);
    570             if (f == consumedItemCount.end()) {
    571                 consumedItemCount.emplace(buf, processed);
    572             } else {
    573                 f->second = b->CreateUMin(processed, f->second);
    574             }
    575         }
    576 
    577         if (DebugOptionIsSet(codegen::EnableCycleCounter)) {
    578             cycleCountEnd = b->CreateReadCycleCounter();
    579             Value * counterPtr = b->getCycleCountPtr();
    580             b->CreateStore(b->CreateAdd(b->CreateLoad(counterPtr), b->CreateSub(cycleCountEnd, cycleCountStart)), counterPtr);
    581             cycleCountStart = cycleCountEnd;
    582         }
    583 //        Value * const segNo = b->acquireLogicalSegmentNo();
    584 //        Value * nextSegNo = b->CreateAdd(segNo, b->getSize(1));
    585 //        b->releaseLogicalSegmentNo(nextSegNo);
    586     }
    587 
    588     for (const auto consumed : consumedItemCount) {
    589         const StreamSetBuffer * const buffer = consumed.first;
    590         Kernel * const kernel = buffer->getProducer();
    591         const auto & binding = kernel->getStreamOutput(buffer);
    592         if (LLVM_UNLIKELY(binding.getRate().isDerived())) {
    593             continue;
    594         }
    595         b->setKernel(kernel);
    596         b->setConsumedItemCount(binding.getName(), consumed.second);
    597     }
    598 
    599     b->CreateCondBr(terminated, pipelineExit, pipelineLoop);
    600 
    601     b->SetInsertPoint(pipelineExit);
    602 
    603     if (DebugOptionIsSet(codegen::EnableCycleCounter)) {
    604         for (unsigned k = 0; k < kernels.size(); k++) {
    605             auto & kernel = kernels[k];
    606             b->setKernel(kernel);
    607             const auto & inputs = kernel->getStreamInputs();
    608             const auto & outputs = kernel->getStreamOutputs();
    609             Value * items = nullptr;
    610             if (inputs.empty()) {
    611                 items = b->getProducedItemCount(outputs[0].getName());
    612             } else {
    613                 items = b->getProcessedItemCount(inputs[0].getName());
    614             }
    615             Value * fItems = b->CreateUIToFP(items, b->getDoubleTy());
    616             Value * cycles = b->CreateLoad(b->getCycleCountPtr());
    617             Value * fCycles = b->CreateUIToFP(cycles, b->getDoubleTy());
    618             const auto formatString = kernel->getName() + ": %7.2e items processed; %7.2e CPU cycles,  %6.2f cycles per item.\n";
    619             Value * stringPtr = b->CreatePointerCast(b->GetString(formatString), b->getInt8PtrTy());
    620             b->CreateCall(b->GetDprintf(), {b->getInt32(2), stringPtr, fItems, fCycles, b->CreateFDiv(fCycles, fItems)});
    621         }
    622     }
     429
    623430}
    624431
     
    671478    const Kernel * const producer = buffer->getProducer();
    672479    const Binding & output = producer->getStreamOutput(buffer);
    673     auto producedRate = producer->getLowerBound(output.getRate()) * producer->getStride();
    674480    const auto consumedRate = consumer->getUpperBound(input.getRate()) * consumer->getStride();
    675     if (LLVM_UNLIKELY(input.hasLookahead())) {
    676         producedRate -= input.getLookahead();
    677 //        const auto amount = input.getLookahead();
    678 //        const auto strides = ((amount + consumer->getStride() - 1) / consumer->getStride());
    679 //        consumedRate += strides * consumer->getStride();
    680     }
    681     if (LLVM_UNLIKELY(producedRate < consumedRate)) {
    682         const auto name = input.getName();
    683         BasicBlock * const sufficient = BasicBlock::Create(b->getContext(), name + "IsSufficient", b->GetInsertBlock()->getParent());
    684         Value * const processed = b->getProcessedItemCount(name);
    685         Value * const unread = b->CreateSub(produced, processed);
    686         Constant * const amount = ConstantInt::get(unread->getType(), ceiling(consumedRate));
    687         Value * const cond = b->CreateOr(b->CreateICmpUGE(unread, amount), final);
    688         b->CreateLikelyCondBr(cond, sufficient, insufficient);
    689         b->SetInsertPoint(sufficient);
     481    if (consumedRate > 0) {
     482        auto producedRate = producer->getLowerBound(output.getRate()) * producer->getStride();
     483        if (LLVM_UNLIKELY(input.hasLookahead())) {
     484            producedRate -= input.getLookahead();
     485        }
     486        if (LLVM_UNLIKELY(producedRate < consumedRate)) {
     487            const auto name = input.getName();
     488            BasicBlock * const sufficient = BasicBlock::Create(b->getContext(), name + "IsSufficient", b->GetInsertBlock()->getParent());
     489            Value * const processed = b->getProcessedItemCount(name);
     490
     491            if (LLVM_UNLIKELY(DebugOptionIsSet(codegen::EnableAsserts))) {
     492                b->CreateAssert(b->CreateICmpULE(processed, produced), input.getName() + ": processed cannot exceed produced");
     493            }
     494            Value * const unread = b->CreateSub(produced, processed);
     495            Constant * const amount = ConstantInt::get(unread->getType(), ceiling(consumedRate));
     496            Value * const cond = b->CreateOr(b->CreateICmpUGE(unread, amount), final);
     497            b->CreateLikelyCondBr(cond, sufficient, insufficient);
     498            b->SetInsertPoint(sufficient);
     499        }
    690500    }
    691501}
  • icGREP/icgrep-devel/icgrep/toolchain/pipeline.h

    r5706 r5793  
    99#include <memory>
    1010
    11 namespace IDISA { class IDISA_Builder; }
    1211namespace kernel { class Kernel; }
    1312namespace kernel { class KernelBuilder; }
    1413
    1514void generateSegmentParallelPipeline(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const std::vector<kernel::Kernel *> & kernels);
    16 void generateParallelPipeline(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const std::vector<kernel::Kernel *> & kernels);
    1715void generatePipelineLoop(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const std::vector<kernel::Kernel *> & kernels);
    1816
  • icGREP/icgrep-devel/icgrep/toolchain/toolchain.cpp

    r5773 r5793  
    7676
    7777
    78 static cl::opt<bool, true> pipelineParallelOption("enable-pipeline-parallel", cl::location(PipelineParallel), cl::init(false),
    79                                                   cl::desc("Enable multithreading with pipeline parallelism."), cl::cat(CodeGenOptions));
    80    
    8178static cl::opt<bool, true> segmentPipelineParallelOption("enable-segment-pipeline-parallel", cl::location(SegmentPipelineParallel),
    8279                                                         cl::desc("Enable multithreading with segment pipeline parallelism."), cl::cat(CodeGenOptions));
     
    8986
    9087CodeGenOpt::Level OptLevel;
    91 
    92 bool PipelineParallel;
    9388
    9489bool SegmentPipelineParallel;
  • icGREP/icgrep-devel/icgrep/toolchain/toolchain.h

    r5773 r5793  
    5454bool LLVM_READONLY DebugOptionIsSet(const DebugFlags flag);
    5555
    56 extern bool PipelineParallel;
    5756extern bool SegmentPipelineParallel;
    5857   
Note: See TracChangeset for help on using the changeset viewer.