Ignore:
Timestamp:
Jan 14, 2018, 3:30:04 PM (20 months ago)
Author:
nmedfort
Message:

Potential bug fix for 32-bit

Location:
icGREP/icgrep-devel/icgrep/kernels
Files:
12 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5793 r5831  
    593593 * @brief requiresBufferedFinalStride
    594594 ** ------------------------------------------------------------------------------------------------------------- */
    595 inline bool requiresBufferedFinalStride(const Binding & binding) {
     595inline bool LLVM_READNONE requiresBufferedFinalStride(const Binding & binding) {
    596596    if (LLVM_LIKELY(isa<ArrayType>(binding.getType()))) {
    597597        return binding.getType()->getArrayNumElements() == 1;
     
    603603 * @brief getItemWidth
    604604 ** ------------------------------------------------------------------------------------------------------------- */
    605 inline unsigned getItemWidth(const Binding & b) {
     605inline unsigned LLVM_READNONE getItemWidth(const Binding & b) {
    606606    Type * ty = b.getType();
    607607    if (LLVM_LIKELY(isa<ArrayType>(ty))) {
     
    612612
    613613/** ------------------------------------------------------------------------------------------------------------- *
    614  * @brief getUpperBound
    615  ** ------------------------------------------------------------------------------------------------------------- */
    616 bool MultiBlockKernel::isTransitivelyUnknownRate(const ProcessingRate & rate) const {
     614 * @brief isTransitivelyUnknownRate
     615 ** ------------------------------------------------------------------------------------------------------------- */
     616bool LLVM_READNONE MultiBlockKernel::isTransitivelyUnknownRate(const ProcessingRate & rate) const {
    617617    if (rate.isUnknown()) {
    618618        return true;
     
    624624
    625625/** ------------------------------------------------------------------------------------------------------------- *
     626 * @brief requiresTemporaryInputBuffer
     627 ** ------------------------------------------------------------------------------------------------------------- */
     628inline bool LLVM_READNONE MultiBlockKernel::requiresTemporaryInputBuffer(const Binding & binding, const ProcessingRate & rate) const {
     629    if (requiresBufferedFinalStride(binding)) {
     630        return true;
     631    } else if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate))) {
     632        report_fatal_error("MultiBlock kernels do not support unknown rate input streams or streams relative to an unknown rate input.");
     633    } else {
     634        return !rate.isFixed();
     635    }
     636}
     637
     638/** ------------------------------------------------------------------------------------------------------------- *
     639 * @brief requiresTemporaryOutputBuffer
     640 ** ------------------------------------------------------------------------------------------------------------- */
     641inline bool LLVM_READNONE MultiBlockKernel::requiresTemporaryOutputBuffer(const Binding & binding, const ProcessingRate & rate) const {
     642    if (requiresBufferedFinalStride(binding)) {
     643        return true;
     644    } else {
     645        return !(rate.isFixed() || isTransitivelyUnknownRate(rate));
     646    }
     647}
     648
     649/** ------------------------------------------------------------------------------------------------------------- *
    626650 * @brief getItemAlignment
    627651 ** ------------------------------------------------------------------------------------------------------------- */
    628 inline unsigned MultiBlockKernel::getItemAlignment(const Binding & binding) const {
     652inline unsigned LLVM_READNONE MultiBlockKernel::getItemAlignment(const Binding & binding) const {
    629653    const auto & rate = binding.getRate();
    630654    if (rate.isFixed() && binding.nonDeferred() && !binding.isMisaligned()) {
     
    641665
    642666/** ------------------------------------------------------------------------------------------------------------- *
     667 * @brief getCopyAlignment
     668 ** ------------------------------------------------------------------------------------------------------------- */
     669inline unsigned LLVM_READNONE MultiBlockKernel::getCopyAlignment(const Binding & binding) const {
     670    return ((getItemAlignment(binding) * getItemWidth(binding)) + 7) / 8;
     671}
     672
     673/** ------------------------------------------------------------------------------------------------------------- *
    643674 * @brief getStrideSize
    644675 ** ------------------------------------------------------------------------------------------------------------- */
    645 llvm::Value * MultiBlockKernel::getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate) {
     676llvm::Value * LLVM_READNONE MultiBlockKernel::getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate) {
    646677    // NOTE: if we ever support feedback loops, using upper bound could lead to a deadlock due to data starvation
    647678    const auto r = getUpperBound(rate);
     
    669700    const auto outputSetCount = mStreamSetOutputs.size();
    670701
    671     // Define and allocate the temporary buffer area in the prolog.
     702    // Define and allocate the temporary buffer area in the prolog.   
    672703    const auto blockAlignment = b->getBitBlockWidth() / 8;
    673704    AllocaInst * temporaryInputBuffer[inputSetCount];
    674     for (unsigned i = 0; i < inputSetCount; ++i) {
     705    for (unsigned i = 0; i < inputSetCount; ++i) {       
    675706        const Binding & input = mStreamSetInputs[i];
    676707        const ProcessingRate & rate = input.getRate();
    677         if (isTransitivelyUnknownRate(rate)) {
    678             report_fatal_error("MultiBlock kernels do not support unknown rate input streams or streams relative to an unknown rate input.");
    679         } else if (rate.isFixed() && !requiresBufferedFinalStride(input)) {
    680             temporaryInputBuffer[i] = nullptr;
    681         } else {
     708        temporaryInputBuffer[i] = nullptr;
     709        if (requiresTemporaryInputBuffer(input, rate)) {
    682710            Type * const ty = mStreamSetInputBuffers[i]->getStreamSetBlockType();
    683711            auto ub = getUpperBound(rate);
     
    696724        const Binding & output = mStreamSetOutputs[i];
    697725        const ProcessingRate & rate = output.getRate();
    698         if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate) || (rate.isFixed() && !requiresBufferedFinalStride(output)))) {
    699             temporaryOutputBuffer[i] = nullptr;
    700         } else {           
     726        temporaryOutputBuffer[i] = nullptr;
     727        if (requiresTemporaryOutputBuffer(output, rate)) {
    701728            auto ub = getUpperBound(rate);
    702729            if (LLVM_UNLIKELY(mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate))) {
     
    710737        }
    711738    }
    712 
    713     // Now we iteratively process these blocks using the doMultiBlock method.
    714     // In each iteration, we check how many linearly accessible / writable
    715     // items can be processed with our current input / output buffers. If we
    716     // cannot support an full stride, we check whether (a) there is enough
    717     // input data to process but it is not linearly accessible, in which case
    718     // we move the data into temporary buffers or (b) there is not enough data
    719     // to process, in which case we abort unless IsFinal was set.
    720739
    721740    Constant * const ZERO = b->getSize(0);
     
    738757    }
    739758
    740     // Now proceed with creation of the doSegment method.
    741     BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
    742 
    743     b->CreateBr(segmentLoop);
    744 
    745     /// DO SEGMENT LOOP
    746 
    747     b->SetInsertPoint(segmentLoop);
    748 
    749     // For each input buffer, get the initial processed item count, base input pointer, and the number of
    750     // linearly available strides.
    751     Value * numOfStrides = nullptr;
    752759    mInitialAvailableItemCount.assign(mAvailableItemCount.begin(), mAvailableItemCount.end());
    753760    mInitialProcessedItemCount.resize(inputSetCount);
    754761    mStreamSetInputBaseAddress.resize(inputSetCount);
     762
     763    // Now proceed with creation of the doSegment method.
     764    BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
     765
     766    b->CreateBr(segmentLoop);
     767
     768    /// DO SEGMENT LOOP
     769
     770    b->SetInsertPoint(segmentLoop);
     771
     772    Value * numOfStrides = nullptr;
     773
     774    // TODO: we don't want our available output space to limit how many conditional blocks we
     775    // can check. When we have a conditional region, split computation of input/output strides and
     776    // check as many input strides as possible but leave the kernel in a state that respects our
     777    // available output space. NOTE: we know coming into this block that the pipeline or kernel has
     778    // ensured there is at least one stride worth of space.
     779
     780
     781    // For each input buffer, get the initial processed item count, base input pointer, and the number of
     782    // linearly available strides.
    755783    Value * inputStrideSize[inputSetCount];
     784    Value * linearlyAccessible[inputSetCount];
    756785    for (unsigned i = 0; i < inputSetCount; i++) {
    757786        const Binding & input = mStreamSetInputs[i];
    758787        const auto & name = input.getName();
    759         const ProcessingRate & rate = input.getRate();
    760788        Value * const processed = b->getProcessedItemCount(name);
    761 
    762789        mInitialProcessedItemCount[i] = processed;
    763         Value * baseBuffer  = b->getBlockAddress(name, b->CreateLShr(processed, LOG_2_BLOCK_WIDTH));
    764 
    765         if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {           
     790        mStreamSetInputBaseAddress[i] = b->getBlockAddress(name, b->CreateLShr(processed, LOG_2_BLOCK_WIDTH));
     791        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    766792            b->CreateAssert(b->CreateICmpULE(processed, mAvailableItemCount[i]),
    767793                            getName() + ": " + name + " processed item count exceeds its available item count");
    768794        }
    769 
    770         // Ensure that everything between S⌈P/S⌉, and S⌈n*(P + L)/S⌉ is linearly available, where S is the stride size,
    771         // P is the current processed position, L is the lookahead amount and n ∈ ℤ+.
    772 
    773         Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], processed);
    774         Value * avail = b->getLinearlyAccessibleItems(name, processed, unprocessed);
    775         Value * remaining = avail;
    776         if (LLVM_UNLIKELY(input.hasLookahead())) {
    777             Constant * const lookahead = b->getSize(input.getLookahead());
    778             remaining = b->CreateSelect(b->CreateICmpULT(lookahead, remaining), b->CreateSub(remaining, lookahead), ZERO);
    779         }
    780 
    781         inputStrideSize[i] = getStrideSize(b, rate);
    782         Value * accessibleStrides = b->CreateUDiv(remaining, inputStrideSize[i]);
     795        Value * const unprocessed = b->CreateNUWSub(mAvailableItemCount[i], processed);
     796        mAvailableItemCount[i] = unprocessed;
     797        Value * const accessible = b->getLinearlyAccessibleItems(name, processed, unprocessed);
     798        linearlyAccessible[i] = accessible;
     799        inputStrideSize[i] = getStrideSize(b, input.getRate());
     800        Value * const accessibleStrides = b->CreateUDiv(accessible, inputStrideSize[i]);
     801        numOfStrides = b->CreateUMin(numOfStrides, accessibleStrides);
     802    }
     803
     804    BasicBlock * const checkInputAvailability = b->CreateBasicBlock("CheckInputAvailability");
     805    BasicBlock * const selectOutputBuffers = b->CreateBasicBlock("SelectOutputBuffers");
     806    b->CreateLikelyCondBr(b->CreateICmpNE(numOfStrides, ZERO), selectOutputBuffers, checkInputAvailability);
     807
     808    // Ensure that everything between S⌈P/S⌉ and S⌈n*(P + L)/S⌉ is linearly available, where S is the stride size,
     809    // P is the current processed position, L is the lookahead amount and n is our number of accessible strides ∈ ℤ+.
     810    b->SetInsertPoint(checkInputAvailability);
     811    Value * const initiallyFinal = mIsFinal;
     812    Value * linearlyCopyable[inputSetCount];
     813    PHINode * selectedInputBuffer[inputSetCount];
     814    for (unsigned i = 0; i < inputSetCount; i++) {
    783815        AllocaInst * const tempBuffer = temporaryInputBuffer[i];
     816        selectedInputBuffer[i] = nullptr;
    784817        if (tempBuffer) {
    785818
    786             // Since we trust that the pipeline won't call this kernel unless there is enough data to process a stride, whenever
    787             // we discover that there isn't enough linearly available data, optimistically copy the data to the temporary buffer.
     819            const Binding & input = mStreamSetInputs[i];
     820            const auto & name = input.getName();
     821            Value * const processed = mInitialProcessedItemCount[i];
     822            Value * const unprocessed = mAvailableItemCount[i];
     823            Value * const accessible = linearlyAccessible[i];
    788824
    789825            BasicBlock * const entry = b->GetInsertBlock();
     
    792828            BasicBlock * const resume = b->CreateBasicBlock(name + "Resume");
    793829
    794             b->CreateUnlikelyCondBr(b->CreateICmpEQ(accessibleStrides, ZERO), copyFromBack, resume);
     830            Value * strideSize = inputStrideSize[i];
     831            if (LLVM_UNLIKELY(input.hasLookahead())) {
     832                Constant * const lookahead = b->getSize(input.getLookahead());
     833                strideSize = b->CreateNUWAdd(strideSize, lookahead);
     834            }
     835            Value * const requiresCopy = b->CreateICmpULT(accessible, strideSize);
     836            b->CreateUnlikelyCondBr(requiresCopy, copyFromBack, resume);
    795837
    796838            b->SetInsertPoint(copyFromBack);
    797839            Value * const arraySize = b->CreateZExt(tempBuffer->getArraySize(), b->getInt64Ty());
    798             Value * const temporarySize = b->CreateTrunc(b->CreateMul(arraySize, b->getInt64(mStride)), unprocessed->getType());
    799             Value * const temporaryAvailable = b->CreateUMin(unprocessed, temporarySize);
     840            Value * const temporarySize = b->CreateTrunc(b->CreateMul(arraySize, b->getInt64(mStride)), accessible->getType());
     841            Value * const copyable = b->CreateUMin(unprocessed, temporarySize); // <- we only really need strideSize items
    800842            Value * const offset = b->CreateAnd(processed, BLOCK_WIDTH_MASK);
    801843            Value * const bufferSize = b->CreateMul(ConstantExpr::getSizeOf(tempBuffer->getAllocatedType()), arraySize);
    802844            b->CreateMemZero(tempBuffer, bufferSize, blockAlignment);
    803             const auto copyAlignment = getItemAlignment(mStreamSetInputs[i]);
    804             b->CreateStreamCpy(name, tempBuffer, ZERO, baseBuffer, offset, avail, copyAlignment);
    805             Value * const temporaryStrides = b->CreateSelect(b->CreateICmpULT(unprocessed, inputStrideSize[i]), ZERO, ONE);
     845            b->CreateStreamCpy(name, tempBuffer, ZERO, mStreamSetInputBaseAddress[i], offset, accessible, getItemAlignment(input));
    806846            BasicBlock * const copyToBackEnd = b->GetInsertBlock();
    807             b->CreateCondBr(b->CreateICmpNE(temporaryAvailable, unprocessed), copyFromFront, resume);
     847            b->CreateCondBr(b->CreateICmpNE(copyable, accessible), copyFromFront, resume);
    808848
    809849            b->SetInsertPoint(copyFromFront);
    810             Value * const remaining = b->CreateSub(temporaryAvailable, avail);
     850            Value * const remaining = b->CreateSub(copyable, accessible);
    811851            Value * const baseAddress = b->getBaseAddress(name);
    812             b->CreateStreamCpy(name, tempBuffer, avail, baseAddress, ZERO, remaining, copyAlignment);
     852            b->CreateStreamCpy(name, tempBuffer, accessible, baseAddress, ZERO, remaining, getItemAlignment(input));
     853            Value * const isPartialStride = b->CreateICmpUGE(copyable, strideSize);
    813854            BasicBlock * const copyToFrontEnd = b->GetInsertBlock();
    814855            b->CreateBr(resume);
    815856
    816857            b->SetInsertPoint(resume);
    817             PHINode * const bufferPtr = b->CreatePHI(baseBuffer->getType(), 4);
    818             bufferPtr->addIncoming(baseBuffer, entry);
    819             bufferPtr->addIncoming(tempBuffer, copyToBackEnd);
    820             bufferPtr->addIncoming(tempBuffer, copyToFrontEnd);
    821             baseBuffer = bufferPtr;
    822 
    823             PHINode * const phiAvailItemCount = b->CreatePHI(b->getSizeTy(), 4);
    824             phiAvailItemCount->addIncoming(avail, entry);
    825             phiAvailItemCount->addIncoming(temporaryAvailable, copyToBackEnd);
    826             phiAvailItemCount->addIncoming(temporaryAvailable, copyToFrontEnd);
    827             avail = phiAvailItemCount;
    828 
    829             PHINode * const phiStrides = b->CreatePHI(b->getSizeTy(), 4);
    830             phiStrides->addIncoming(accessibleStrides, entry);
    831             phiStrides->addIncoming(temporaryStrides, copyToBackEnd);
    832             phiStrides->addIncoming(temporaryStrides, copyToFrontEnd);
    833             accessibleStrides = phiStrides;
    834         }
    835         mAvailableItemCount[i] = avail;
    836         mStreamSetInputBaseAddress[i] = baseBuffer;
    837         numOfStrides = b->CreateUMin(numOfStrides, accessibleStrides);
    838     }
     858            PHINode * const address = b->CreatePHI(tempBuffer->getType(), 3);
     859            address->addIncoming(mStreamSetInputBaseAddress[i], entry);
     860            address->addIncoming(tempBuffer, copyToBackEnd);
     861            address->addIncoming(tempBuffer, copyToFrontEnd);
     862            selectedInputBuffer[i] = address;
     863            PHINode * const available = b->CreatePHI(accessible->getType(), 3);
     864            available->addIncoming(accessible, entry);
     865            available->addIncoming(copyable, copyToBackEnd);
     866            available->addIncoming(copyable, copyToFrontEnd);
     867            linearlyCopyable[i] = available;
     868            PHINode * const finalStride = b->CreatePHI(b->getInt1Ty(), 3);
     869            finalStride->addIncoming(mIsFinal, entry);
     870            finalStride->addIncoming(b->getTrue(), copyToBackEnd);
     871            finalStride->addIncoming(isPartialStride, copyToFrontEnd);
     872            mIsFinal = finalStride;
     873            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     874                Value * const hasStride = b->CreateOr(initiallyFinal, b->CreateNot(finalStride));
     875                b->CreateAssert(hasStride, getName() + ": " + name + " has insufficient input data for one stride");
     876            }
     877        }
     878    }
     879
     880    BasicBlock * const endCheckInputAvailability = b->GetInsertBlock();
     881    selectOutputBuffers->moveAfter(endCheckInputAvailability);
     882    b->CreateBr(selectOutputBuffers);
     883
     884    b->SetInsertPoint(selectOutputBuffers);
     885    PHINode * const final = b->CreatePHI(mIsFinal->getType(), 2);
     886    final->addIncoming(b->getFalse(), segmentLoop);
     887    final->addIncoming(mIsFinal, endCheckInputAvailability);
     888    mIsFinal = final;
     889    for (unsigned i = 0; i < inputSetCount; i++) {
     890        if (selectedInputBuffer[i]) {
     891            PHINode * const address = b->CreatePHI(selectedInputBuffer[i]->getType(), 2);
     892            address->addIncoming(mStreamSetInputBaseAddress[i], segmentLoop);
     893            address->addIncoming(selectedInputBuffer[i], endCheckInputAvailability);
     894            mStreamSetInputBaseAddress[i] = address;
     895            PHINode * const accessible = b->CreatePHI(linearlyAccessible[i]->getType(), 2);
     896            accessible->addIncoming(linearlyAccessible[i], segmentLoop);
     897            accessible->addIncoming(linearlyCopyable[i], endCheckInputAvailability);
     898            linearlyAccessible[i] = accessible;
     899        }
     900    }
     901    PHINode * const strides = b->CreatePHI(numOfStrides->getType(), 2);
     902    strides->addIncoming(numOfStrides, segmentLoop);
     903    strides->addIncoming(ONE, endCheckInputAvailability);
     904    numOfStrides = strides;
    839905
    840906    // Now determine the linearly writeable strides
     907    Value * outputStrideSize[outputSetCount];
    841908    Value * linearlyWritable[outputSetCount];
    842     Value * outputStrideSize[outputSetCount];
    843909    mInitialProducedItemCount.resize(outputSetCount);
    844910    mStreamSetOutputBaseAddress.resize(outputSetCount);
     
    846912        const auto & output = mStreamSetOutputs[i];
    847913        const auto & name = output.getName();
    848         const ProcessingRate & rate = output.getRate();
    849914        Value * const produced = b->getProducedItemCount(name);
    850915        Value * baseBuffer = b->getBlockAddress(name, b->CreateLShr(produced, LOG_2_BLOCK_WIDTH));
    851         assert (baseBuffer->getType()->isPointerTy());
    852         linearlyWritable[i] = b->getLinearlyWritableItems(name, produced);
    853         outputStrideSize[i] = getStrideSize(b, rate);
     916        mInitialProducedItemCount[i] = produced;
     917        mStreamSetOutputBaseAddress[i] = baseBuffer;
     918
    854919        // Is the number of linearly writable items sufficient for a stride?
     920        outputStrideSize[i] = getStrideSize(b, output.getRate());
    855921        if (outputStrideSize[i]) {
     922            linearlyWritable[i] = b->getLinearlyWritableItems(name, produced);
     923            Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);
     924            numOfStrides = b->CreateUMin(numOfStrides, writableStrides);
     925            // Do we require a temporary buffer to write to?
    856926            AllocaInst * const tempBuffer = temporaryOutputBuffer[i];
    857             Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);
    858             // Do we require a temporary buffer to write to?
    859927            if (tempBuffer) {
    860928                assert (tempBuffer->getType() == baseBuffer->getType());
     
    876944                baseBuffer = phiBuffer;
    877945                PHINode * const phiStrides = b->CreatePHI(b->getSizeTy(), 2);
    878                 phiStrides->addIncoming(writableStrides, entry);
     946                phiStrides->addIncoming(numOfStrides, entry);
    879947                phiStrides->addIncoming(ONE, prepareTempBuffer);
    880                 writableStrides = phiStrides;
    881             }
    882             numOfStrides = b->CreateUMin(numOfStrides, writableStrides);
    883         }
    884         mInitialProducedItemCount[i] = produced;
    885         mStreamSetOutputBaseAddress[i] = baseBuffer;
    886     }
    887 
    888     BasicBlock * const segmentDone = b->CreateBasicBlock("SegmentDone");
    889 
    890     Value * const initiallyFinal = mIsFinal;
    891     if (LLVM_LIKELY(numOfStrides != nullptr)) {
    892         mIsFinal = b->CreateAnd(mIsFinal, b->CreateICmpEQ(numOfStrides, ZERO));
    893         if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    894             Value * const hasStride = b->CreateOr(b->CreateICmpNE(numOfStrides, ZERO), mIsFinal);
    895             b->CreateAssert(hasStride, getName() + " has insufficient input data or output space for one stride");
    896         }
    897         for (unsigned i = 0; i < inputSetCount; ++i) {
    898             const auto & input = mStreamSetInputs[i];
    899             const ProcessingRate & rate = input.getRate();
    900             if (rate.isFixed() && input.nonDeferred()) {
    901                 mAvailableItemCount[i] = b->CreateSelect(mIsFinal, mAvailableItemCount[i], b->CreateMul(numOfStrides, inputStrideSize[i]));
    902             }
    903         }
    904     }
    905 
    906     //  We have one or more blocks of input data and output buffer space for all stream sets.
     948                numOfStrides = phiStrides;
     949            }
     950            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     951                b->CreateAssert(numOfStrides, getName() + ": " + name + " has insufficient output space for one stride");
     952            }
     953        }
     954    }
     955
     956    // Update the locally available item count to reflect the current state
     957    for (unsigned i = 0; i < inputSetCount; i++) {
     958        const Binding & input = mStreamSetInputs[i];
     959        if (input.getRate().isFixed() && input.nonDeferred()) {
     960            Value * const processable = b->CreateMul(numOfStrides, inputStrideSize[i]);
     961            linearlyAccessible[i] = b->CreateSelect(mIsFinal, linearlyAccessible[i], processable);
     962        }
     963        mAvailableItemCount[i] = linearlyAccessible[i];
     964    }
     965
     966    //  We have one or more strides of input data and output buffer space for all stream sets.
    907967    generateMultiBlockLogic(b, numOfStrides);
    908968
     
    920980        const ProcessingRate & rate = output.getRate();
    921981        if (rate.isFixed()) {
    922             assert (output.nonDeferred());
    923982            Value * const produced = b->CreateMul(numOfStrides, outputStrideSize[i]);
    924             Value * const ic = b->CreateAdd(mInitialProducedItemCount[i], produced);
     983            Value * const ic = b->CreateNUWAdd(mInitialProducedItemCount[i], produced);
    925984            b->setProducedItemCount(output.getName(), ic);
    926985        }
     
    9501009    // Copy back data to the actual output buffers.
    9511010    for (unsigned i = 0; i < outputSetCount; i++) {
     1011
    9521012        AllocaInst * const tempBuffer = temporaryOutputBuffer[i];
    9531013        if (LLVM_UNLIKELY(tempBuffer == nullptr)) {
     
    9551015        }
    9561016
     1017        const auto & name = mStreamSetOutputs[i].getName();
     1018        Value * const produced = b->getProducedItemCount(name);
    9571019        Value * const baseBuffer = mStreamSetOutputBaseAddress[i];
    9581020        assert ("stack corruption likely" && (tempBuffer->getType() == baseBuffer->getType()));
    959         const auto & name = mStreamSetOutputs[i].getName();
     1021        //const auto & name = mStreamSetOutputs[i].getName();
    9601022        BasicBlock * const copyToBack = b->CreateBasicBlock(name + "CopyToBack");
    9611023        BasicBlock * const copyToFront = b->CreateBasicBlock(name + "CopyToFront");
    9621024        BasicBlock * const resume = b->CreateBasicBlock(name + "ResumeCopyBack");
    9631025        // If we used a temporary buffer, copy it back to the original output buffer
    964         b->CreateCondBr(b->CreateICmpEQ(tempBuffer, baseBuffer), copyToBack, resume);
     1026        Value * const requiresCopy = b->CreateICmpEQ(tempBuffer, baseBuffer);
     1027        b->CreateCondBr(requiresCopy, copyToBack, resume);
    9651028
    9661029        b->SetInsertPoint(copyToBack);       
    9671030        Value * const offset = b->CreateAnd(mInitialProducedItemCount[i], BLOCK_WIDTH_MASK);
    968         Value * const newProducedItemCount = b->getProducedItemCount(name);
    969         Value * const newlyProduced = b->CreateSub(newProducedItemCount, mInitialProducedItemCount[i]);
     1031        //Value * const newProducedItemCount = b->getProducedItemCount(name);
     1032        Value * const newlyProduced = b->CreateNUWSub(produced, mInitialProducedItemCount[i]);
    9701033        Value * const toWrite = b->CreateUMin(newlyProduced, linearlyWritable[i]);
    9711034        const auto alignment = getItemAlignment(mStreamSetOutputs[i]);
     
    9751038
    9761039        b->SetInsertPoint(copyToFront);
    977         Value * const remaining = b->CreateSub(newlyProduced, toWrite);
     1040        Value * const remaining = b->CreateNUWSub(newlyProduced, toWrite);
    9781041        Value * const baseAddress = b->getBaseAddress(name);
    9791042        b->CreateStreamCpy(name, baseAddress, ZERO, tempBuffer, toWrite, remaining, alignment);
     
    9871050    BasicBlock * const setTermination = b->CreateBasicBlock("setTermination");
    9881051    b->CreateCondBr(mIsFinal, setTermination, strideDone);
    989 
    9901052    b->SetInsertPoint(setTermination);
    9911053    b->setTerminationSignal();
     1054    BasicBlock * const segmentDone = b->CreateBasicBlock("SegmentDone");
    9921055    b->CreateBr(segmentDone);
    9931056
     
    10071070        }
    10081071        Value * remaining = b->CreateSub(avail, processed);
     1072        Value * strideSize = inputStrideSize[i];
    10091073        if (LLVM_UNLIKELY(input.hasLookahead())) {
    1010             Constant * const lookahead = b->getSize(input.getLookahead());
    1011             remaining = b->CreateSelect(b->CreateICmpULT(lookahead, remaining), b->CreateSub(remaining, lookahead), ZERO);
    1012         }
    1013         Value * const remainingStrides = b->CreateUDiv(remaining, inputStrideSize[i]);
    1014         Value * const hasRemainingStrides = b->CreateICmpNE(remainingStrides, ZERO);
     1074            strideSize = b->CreateNUWAdd(strideSize, b->getSize(input.getLookahead()));
     1075        }
     1076        Value * const hasRemainingStrides = b->CreateICmpUGE(remaining, strideSize);
    10151077        hasMoreStrides = b->CreateAnd(hasMoreStrides, hasRemainingStrides);
    10161078    }
     
    10371099            }
    10381100            Value * const remaining = b->CreateSub(capacity, unconsumed);
    1039             Value * const remainingStrides = b->CreateUDiv(remaining, outputStrideSize[i]);
    1040             Value * const hasRemainingStrides = b->CreateICmpNE(remainingStrides, ZERO);
    1041 
     1101            Value * const hasRemainingStrides = b->CreateICmpUGE(remaining, outputStrideSize[i]);
    10421102            hasMoreStrides = b->CreateAnd(hasMoreStrides, hasRemainingStrides);
    10431103        }
     
    12251285 * @brief generateMultiBlockLogic
    12261286 ** ------------------------------------------------------------------------------------------------------------- */
    1227 Value * BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
     1287void BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
    12281288
    12291289    if (LLVM_UNLIKELY(mStride != b->getBitBlockWidth())) {
     
    12391299    BasicBlock * const doFinalBlock = b->CreateBasicBlock(getName() + "_doFinalBlock");
    12401300    BasicBlock * const segmentDone = b->CreateBasicBlock(getName() + "_segmentDone");
    1241     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    1242         b->CreateAssert(b->CreateXor(b->CreateIsNotNull(numOfBlocks), mIsFinal),
    1243                         "numOfStrides cannot be 0 unless this is the final stride and must be 0 if it is");
    1244     }
     1301
    12451302    const auto inputSetCount = mStreamSetInputs.size();
    12461303    Value * baseProcessedIndex[inputSetCount];
     
    13441401    }
    13451402
    1346     Value * const remainingItems = getRemainingItems(b);
    1347 
    1348 //    b->CallPrintInt(getName() + "_remainingItems", remainingItems);
    1349 
    1350     writeFinalBlockMethod(b, remainingItems);
     1403    writeFinalBlockMethod(b, getRemainingItems(b));
    13511404
    13521405    b->CreateBr(segmentDone);
     
    13681421    }
    13691422
    1370     return numOfBlocks;
    13711423}
    13721424
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r5793 r5831  
    433433    // exit the RetVoid instruction will be added to complete the method.
    434434    //
    435     virtual llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) = 0;
     435    virtual void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) = 0;
    436436
    437437private:
     
    444444    unsigned getItemAlignment(const Binding & binding) const;
    445445
     446    unsigned getCopyAlignment(const Binding & binding) const;
     447
    446448    bool isTransitivelyUnknownRate(const ProcessingRate & rate) const;
     449
     450    bool requiresTemporaryInputBuffer(const Binding & binding, const ProcessingRate & rate) const;
     451
     452    bool requiresTemporaryOutputBuffer(const Binding & binding, const ProcessingRate & rate) const;
    447453
    448454    llvm::Value * getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate);
     
    488494private:
    489495
    490     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
     496    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
    491497
    492498    void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b);
  • icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.cpp

    r5793 r5831  
    1111using namespace kernel;
    1212
    13 Value * LZ4ByteStreamDecoderKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * numOfStrides) {
     13void LZ4ByteStreamDecoderKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * numOfStrides) {
    1414
    1515    BasicBlock * entry_block = b->GetInsertBlock();
     
    169169
    170170    b->SetInsertPoint(loopExit);
    171     return numOfStrides;
    172171}
    173172
  • icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.h

    r5755 r5831  
    1818    LZ4ByteStreamDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, size_t bufferSize);
    1919protected:
    20     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * numOfStrides) override;
     20    void generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * numOfStrides) override;
    2121private:
    2222    size_t mBufferSize;
  • icGREP/icgrep-devel/icgrep/kernels/radix64.cpp

    r5755 r5831  
    3939// of bytes to the actual output stream.
    4040
    41 Value * expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
     41void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
    4242
    4343    BasicBlock * expand2_3entry = iBuilder->GetInsertBlock();
     
    131131    iBuilder->SetInsertPoint(expand3_4_exit);
    132132
    133     return numOfStrides;
    134133}
    135134
  • icGREP/icgrep-devel/icgrep/kernels/radix64.h

    r5755 r5831  
    2525    bool hasSignature() const override { return false; }
    2626private:
    27     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides) override;
     27    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides) override;
    2828};
    2929
  • icGREP/icgrep-devel/icgrep/kernels/scanmatchgen.cpp

    r5782 r5831  
    2121namespace kernel {
    2222
    23 Value * ScanMatchKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
     23void ScanMatchKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, Value * const numOfStrides) {
    2424
    2525    Module * const m = iBuilder->getModule();
     
    4747    Value * line_break = iBuilder->getInputStreamBlockPtr("lineBreak", iBuilder->getInt32(0));
    4848
    49     Value * blocksToDo = iBuilder->CreateAdd(numOfStrides, iBuilder->CreateZExt(mIsFinal, numOfStrides->getType()));
    50     blocksToDo = iBuilder->CreateMul(blocksToDo, iBuilder->getSize(mStride / iBuilder->getBitBlockWidth()));
     49    Value * const blocksToDo = iBuilder->CreateMul(numOfStrides, iBuilder->getSize(mStride / iBuilder->getBitBlockWidth()));
    5150   
    5251    Value * match_result_ptr = iBuilder->CreateBitCast(match_result, scanwordVectorType->getPointerTo());
     
    205204
    206205    iBuilder->SetInsertPoint(scanReturn);
    207     return numOfStrides;
    208206}
    209207
  • icGREP/icgrep-devel/icgrep/kernels/scanmatchgen.h

    r5755 r5831  
    2020    bool hasSignature() const override { return false; }
    2121private:
    22     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
     22    void generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
    2323};
    2424
  • icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.cpp

    r5793 r5831  
    1515namespace kernel {
    1616
    17 Value * StdOutKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) {
     17void StdOutKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) {
    1818    Value * codeUnitBuffer = b->getInputStreamBlockPtr("codeUnitBuffer", b->getInt32(0));
    1919    codeUnitBuffer = b->CreatePointerCast(codeUnitBuffer, b->getInt8PtrTy());
     
    2525    }
    2626    b->CreateWriteCall(b->getInt32(1), codeUnitBuffer, bytesToDo);
    27     return numOfStrides;
    2827}
    2928
     
    6463}
    6564
    66 Value * FileSink::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfStrides) {
     65void FileSink::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfStrides) {
    6766    Value * const fileDes = b->getScalarField("fileDes");
    6867    Value * codeUnitBuffer = b->getInputStreamBlockPtr("codeUnitBuffer", b->getInt32(0));
     
    7574    }   
    7675    b->CreateWriteCall(fileDes, codeUnitBuffer, bytesToDo);
    77     return numOfStrides;
    7876}
    7977
  • icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.h

    r5793 r5831  
    1616    StdOutKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned codeUnitWidth);
    1717private:
    18     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
     18    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
    1919private:
    2020    const unsigned mCodeUnitWidth;
     
    2727protected:
    2828    void generateInitializeMethod(const std::unique_ptr<KernelBuilder> & iBuilder) override;
    29     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
     29    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
    3030    void generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & b) override;
    3131private:
  • icGREP/icgrep-devel/icgrep/kernels/until_n.cpp

    r5830 r5831  
    1717namespace kernel {
    1818
    19 const unsigned packSize = 64;
    20    
    21 llvm::Value * UntilNkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) {
     19void UntilNkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) {
    2220
    2321/* 
     
    3937*/
    4038
     39    const unsigned packSize = b->getSizeTy()->getBitWidth();
    4140    Constant * const ZERO = b->getSize(0);
    4241    Constant * const ONE = b->getSize(1);
     
    101100    //Value * const packPtr = b->CreateGEP(b->CreatePointerCast(groupPtr, packPtrTy), packOffset);
    102101    //Value * const packBits = b->CreateLoad(packPtr);
    103     Value * const packCount = b->CreatePopcount(packBits);
     102    Value * const packCount = b->CreateZExtOrTrunc(b->CreatePopcount(packBits), b->getSizeTy());
    104103    Value * const observedUpTo = b->CreateNUWAdd(observed, packCount);
    105104
     
    173172    b->setProducedItemCount("uptoN", producedCount);
    174173
    175     return numOfStrides;
     174}
     175
     176unsigned LLVM_READNONE calculateRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
     177    const unsigned packSize = b->getSizeTy()->getBitWidth();
     178    return (packSize * packSize) / b->getBitBlockWidth();
    176179}
    177180
    178181UntilNkernel::UntilNkernel(const std::unique_ptr<kernel::KernelBuilder> & b)
    179 : MultiBlockKernel("UntilN",
     182: MultiBlockKernel("UntilN_" + std::to_string(calculateRate(b)),
    180183// inputs
    181 {Binding{b->getStreamSetTy(), "bits", FixedRate((packSize * packSize) / b->getBitBlockWidth())}},
     184{Binding{b->getStreamSetTy(), "bits", FixedRate(calculateRate(b))}},
    182185// outputs
    183 {Binding{b->getStreamSetTy(), "uptoN", BoundedRate(0, (packSize * packSize) / b->getBitBlockWidth())}},
     186{Binding{b->getStreamSetTy(), "uptoN", BoundedRate(0, calculateRate(b))}},
    184187// input scalar
    185188{Binding{b->getSizeTy(), "N"}}, {},
  • icGREP/icgrep-devel/icgrep/kernels/until_n.h

    r5830 r5831  
    1414    UntilNkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    1515private:
    16     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
     16    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
    1717
    1818};
Note: See TracChangeset for help on using the changeset viewer.