Ignore:
Timestamp:
Dec 9, 2017, 5:05:16 PM (17 months ago)
Author:
nmedfort
Message:

Minor changes and, hopefully, a fix for the bug exposed by the base64 test

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5761 r5771  
    680680    // Define and allocate the temporary buffer area in the prolog.
    681681    const auto blockAlignment = b->getBitBlockWidth() / 8;
    682     Value * temporaryInputBuffer[inputSetCount];
     682    AllocaInst * temporaryInputBuffer[inputSetCount];
    683683    for (unsigned i = 0; i < inputSetCount; ++i) {
    684 
    685         // TODO: if this is a fixed rate input stream and the pipeline guarantees it will not call the kernel unless
    686         // there is sufficient input and all buffers will be sized sufficiently for the input, we ought to be able to
    687         // avoid the temporary buffer checks.
    688 
    689         const ProcessingRate & rate = mStreamSetInputs[i].getRate();
    690         Type * const ty = mStreamSetInputBuffers[i]->getStreamSetBlockType();
    691         const auto ub = getUpperBound(rate);
    692         if (ub.numerator() == 0) {
     684        const auto & input = mStreamSetInputs[i];
     685        const ProcessingRate & rate = input.getRate();
     686        if (isTransitivelyUnknownRate(rate)) {
    693687            report_fatal_error("MultiBlock kernels do not support unknown rate input streams or streams relative to an unknown rate input.");
    694         } else {           
    695             temporaryInputBuffer[i] = b->CreateAlignedAlloca(ty, blockAlignment, b->getSize(roundUp(ub)));
    696             Type * const sty = temporaryInputBuffer[i]->getType()->getPointerElementType();
    697             b->CreateStore(Constant::getNullValue(sty), temporaryInputBuffer[i]);
    698         }       
    699     }
    700 
    701     Value * temporaryOutputBuffer[outputSetCount];
     688        } else if (rate.isFixed() && input.nonDeferred() && !requiresBufferedFinalStride(input)) {
     689            temporaryInputBuffer[i] = nullptr;
     690        } else {
     691            Type * const ty = mStreamSetInputBuffers[i]->getStreamSetBlockType();
     692            const auto ub = getUpperBound(rate);
     693            Constant * arraySize = b->getInt64(roundUp(ub));
     694            AllocaInst * const ptr = b->CreateAlignedAlloca(ty, blockAlignment, arraySize);
     695            assert (ptr->isStaticAlloca());
     696            temporaryInputBuffer[i] = ptr;
     697        }
     698    }
     699
     700    AllocaInst * temporaryOutputBuffer[outputSetCount];
    702701    for (unsigned i = 0; i < outputSetCount; i++) {
    703         const ProcessingRate & rate = mStreamSetOutputs[i].getRate();
    704         Type * const ty = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
    705         if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate))) {
     702        const auto & output = mStreamSetOutputs[i];
     703        const ProcessingRate & rate = output.getRate();
     704        if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate) || (rate.isFixed() && output.nonDeferred() && !requiresBufferedFinalStride(output)))) {
    706705            temporaryOutputBuffer[i] = nullptr;
    707706        } else {           
     
    710709                ub += mStreamSetOutputBuffers[i]->overflowSize();
    711710            }
    712             temporaryOutputBuffer[i] = b->CreateAlignedAlloca(ty, blockAlignment, b->getSize(roundUp(ub)));
    713             Type * const sty = temporaryOutputBuffer[i]->getType()->getPointerElementType();
    714             b->CreateStore(Constant::getNullValue(sty), temporaryOutputBuffer[i]);
     711            Type * const ty = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
     712            Constant * arraySize = b->getInt64(roundUp(ub));
     713            AllocaInst * const ptr = b->CreateAlignedAlloca(ty, blockAlignment, arraySize);
     714            assert (ptr->isStaticAlloca());
     715            temporaryOutputBuffer[i] = ptr;
    715716        }
    716717    }
     
    751752        Value * const ic = b->getProcessedItemCount(name);
    752753        mInitialProcessedItemCount[i] = ic;
    753         b->CreateAssert(b->CreateICmpUGE(mAvailableItemCount[i], ic), "processed item count cannot exceed the available item count");
     754        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     755            b->CreateAssert(b->CreateICmpUGE(mAvailableItemCount[i], ic),
     756                            "processed item count cannot exceed the available item count");
     757        }
    754758        assert (ic->getType() == mAvailableItemCount[i]->getType());
    755759        Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], ic);
    756 
    757         mStreamSetInputBaseAddress[i]  = b->getBlockAddress(name, b->CreateLShr(ic, LOG_2_BLOCK_WIDTH));
     760        Value * baseBuffer  = b->getBlockAddress(name, b->CreateLShr(ic, LOG_2_BLOCK_WIDTH));
    758761        mInitialAvailableItemCount[i] = mAvailableItemCount[i];
    759762        mAvailableItemCount[i] = b->getLinearlyAccessibleItems(name, ic, unprocessed);
     
    761764        // Are our linearly accessible items sufficient for a stride?
    762765        inputStrideSize[i] = getStrideSize(b, rate);
    763 
    764766        Value * accessibleStrides = b->CreateUDiv(mAvailableItemCount[i], inputStrideSize[i]);
    765         if (!rate.isFixed() || (requiresBufferedFinalStride(input) && input.nonDeferred())) {
     767        AllocaInst * const tempBuffer = temporaryInputBuffer[i];
     768        if (tempBuffer) {
    766769
    767770            // Since we trust that the pipeline won't call this kernel unless there is enough data to process a stride, whenever
     
    777780            b->SetInsertPoint(copyFromBack);
    778781            Value * const temporaryAvailable = b->CreateUMin(unprocessed, inputStrideSize[i]);
    779 
    780             b->CreateAssert(b->CreateICmpULE(mAvailableItemCount[i], temporaryAvailable), "linearly available cannot be greater than temporarily available");
    781             Value * const tempBufferPtr = temporaryInputBuffer[i];
     782            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     783                b->CreateAssert(b->CreateICmpULE(mAvailableItemCount[i], temporaryAvailable),
     784                                "linearly available cannot be greater than temporarily available");
     785            }
    782786            Value * const offset = b->CreateAnd(ic, BLOCK_WIDTH_MASK);
     787            Value * const bufferSize = b->CreateMul(ConstantExpr::getSizeOf(tempBuffer->getAllocatedType()), tempBuffer->getArraySize());
     788            b->CreateMemZero(tempBuffer, bufferSize, blockAlignment);
    783789            const auto copyAlignment = getItemAlignment(mStreamSetInputs[i]);
    784             b->CreateMemZero(tempBufferPtr, ConstantExpr::getSizeOf(tempBufferPtr->getType()), blockAlignment);
    785             b->CreateStreamCpy(name, tempBufferPtr, ZERO, mStreamSetInputBaseAddress[i] , offset, mAvailableItemCount[i], copyAlignment);
     790            b->CreateStreamCpy(name, tempBuffer, ZERO, baseBuffer, offset, mAvailableItemCount[i], copyAlignment);
    786791            Value * const temporaryStrides = b->CreateSelect(b->CreateICmpULT(unprocessed, inputStrideSize[i]), ZERO, ONE);
    787792            BasicBlock * const copyToBackEnd = b->GetInsertBlock();
     
    791796            Value * const remaining = b->CreateSub(temporaryAvailable, mAvailableItemCount[i]);
    792797            Value * const baseAddress = b->getBaseAddress(name);
    793             b->CreateStreamCpy(name, tempBufferPtr, mAvailableItemCount[i], baseAddress, ZERO, remaining, copyAlignment);
     798            b->CreateStreamCpy(name, tempBuffer, mAvailableItemCount[i], baseAddress, ZERO, remaining, copyAlignment);
    794799            BasicBlock * const copyToFrontEnd = b->GetInsertBlock();
    795800            b->CreateBr(resume);
    796801
    797802            b->SetInsertPoint(resume);
    798             PHINode * const bufferPtr = b->CreatePHI(mStreamSetInputBaseAddress[i] ->getType(), 3);
    799             bufferPtr->addIncoming(mStreamSetInputBaseAddress[i] , entry);
    800             bufferPtr->addIncoming(tempBufferPtr, copyToBackEnd);
    801             bufferPtr->addIncoming(tempBufferPtr, copyToFrontEnd);
    802             mStreamSetInputBaseAddress[i] = bufferPtr;
     803            PHINode * const bufferPtr = b->CreatePHI(baseBuffer->getType(), 3);
     804            bufferPtr->addIncoming(baseBuffer , entry);
     805            bufferPtr->addIncoming(tempBuffer, copyToBackEnd);
     806            bufferPtr->addIncoming(tempBuffer, copyToFrontEnd);
     807            baseBuffer = bufferPtr;
    803808
    804809            PHINode * const phiAvailItemCount = b->CreatePHI(b->getSizeTy(), 3);
     
    808813            mAvailableItemCount[i] = phiAvailItemCount;
    809814
    810             PHINode * const phiNumOfStrides = b->CreatePHI(b->getSizeTy(), 2);
    811             phiNumOfStrides->addIncoming(accessibleStrides, entry);
    812             phiNumOfStrides->addIncoming(temporaryStrides, copyToBackEnd);
    813             phiNumOfStrides->addIncoming(temporaryStrides, copyToFrontEnd);
    814             accessibleStrides = phiNumOfStrides;
    815         }
     815            PHINode * const phiStrides = b->CreatePHI(b->getSizeTy(), 2);
     816            phiStrides->addIncoming(accessibleStrides, entry);
     817            phiStrides->addIncoming(temporaryStrides, copyToBackEnd);
     818            phiStrides->addIncoming(temporaryStrides, copyToFrontEnd);
     819            accessibleStrides = phiStrides;
     820        }
     821
     822        mStreamSetInputBaseAddress[i] = baseBuffer;
    816823        numOfStrides = b->CreateUMin(numOfStrides, accessibleStrides);
    817824    }
     
    819826    // Now determine the linearly writeable strides
    820827    Value * linearlyWritable[outputSetCount];
    821     Value * baseOutputBuffer[outputSetCount];
    822828    Value * outputStrideSize[outputSetCount];
    823829    mInitialProducedItemCount.resize(outputSetCount);
     
    828834        const ProcessingRate & rate = output.getRate();
    829835        Value * const ic = b->getProducedItemCount(name);
    830         baseOutputBuffer[i] = b->getBlockAddress(name, b->CreateLShr(ic, LOG_2_BLOCK_WIDTH));
    831         assert (baseOutputBuffer[i]->getType()->isPointerTy());
    832         linearlyWritable[i] = b->getLinearlyWritableItems(name, ic);
     836        Value * baseBuffer = b->getBlockAddress(name, b->CreateLShr(ic, LOG_2_BLOCK_WIDTH));
     837        assert (baseBuffer->getType()->isPointerTy());
     838        linearlyWritable[i] = b->getLinearlyWritableItems(name, ic);       
     839        outputStrideSize[i] = getStrideSize(b, rate);
     840        // Is the number of linearly writable items sufficient for a stride?
     841        if (outputStrideSize[i]) {
     842            AllocaInst * const tempBuffer = temporaryOutputBuffer[i];
     843            Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);
     844            // Do we require a temporary buffer to write to?
     845            if (tempBuffer) {
     846                assert (tempBuffer->getType() == baseBuffer->getType());
     847                BasicBlock * const entry = b->GetInsertBlock();
     848                BasicBlock * const useTemporary = b->CreateBasicBlock(name + "UseTemporary");
     849                BasicBlock * const resume = b->CreateBasicBlock(name + "Resume");
     850                Value * const requiresCopy = b->CreateICmpEQ(writableStrides, ZERO);
     851
     852                b->CreateUnlikelyCondBr(requiresCopy, useTemporary, resume);
     853
     854                // Clear the buffer after use since we may end up reusing it within the same stride
     855                b->SetInsertPoint(useTemporary);
     856                Value * const bufferSize = b->CreateMul(ConstantExpr::getSizeOf(tempBuffer->getAllocatedType()), tempBuffer->getArraySize());
     857                b->CreateMemZero(tempBuffer, bufferSize, blockAlignment);
     858                b->CreateBr(resume);
     859
     860                b->SetInsertPoint(resume);
     861                PHINode * const phiBuffer = b->CreatePHI(baseBuffer->getType(), 3);
     862                phiBuffer->addIncoming(baseBuffer, entry);
     863                phiBuffer->addIncoming(tempBuffer, useTemporary);
     864                baseBuffer = phiBuffer;
     865                PHINode * const phiStrides = b->CreatePHI(b->getSizeTy(), 2);
     866                phiStrides->addIncoming(writableStrides, entry);
     867                phiStrides->addIncoming(ONE, useTemporary);
     868                writableStrides = phiStrides;
     869
     870            }
     871            numOfStrides = b->CreateUMin(numOfStrides, writableStrides);
     872        }
    833873        mInitialProducedItemCount[i] = ic;
    834         outputStrideSize[i] = nullptr;
    835         if (temporaryOutputBuffer[i]) {
    836             outputStrideSize[i] = getStrideSize(b, rate);
    837             // Is the number of linearly writable items sufficient for a stride?
    838             Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);
    839             if (!rate.isFixed() || requiresBufferedFinalStride(output)) {
    840                 Value * const requiresCopy = b->CreateICmpEQ(writableStrides, ZERO);
    841                 assert (temporaryOutputBuffer[i]->getType() == baseOutputBuffer[i]->getType());
    842                 baseOutputBuffer[i] = b->CreateSelect(requiresCopy, temporaryOutputBuffer[i], baseOutputBuffer[i]);
    843                 writableStrides = b->CreateSelect(requiresCopy, ONE, writableStrides);
    844             }
    845             numOfStrides = b->CreateUMin(numOfStrides, writableStrides);
    846             assert (temporaryOutputBuffer[i]->getType() == baseOutputBuffer[i]->getType());
    847         }
    848         mStreamSetOutputBaseAddress[i] = baseOutputBuffer[i];
     874        mStreamSetOutputBaseAddress[i] = baseBuffer;
    849875    }
    850876
     
    854880    if (LLVM_LIKELY(numOfStrides != nullptr)) {
    855881        mIsFinal = b->CreateAnd(mIsFinal, b->CreateICmpEQ(numOfStrides, ZERO));
    856         Value * const hasStride = b->CreateOr(b->CreateICmpNE(numOfStrides, ZERO), mIsFinal);
    857         b->CreateAssert(hasStride, getName() + " has insufficient input data or output space for one stride");
     882        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     883            Value * const hasStride = b->CreateOr(b->CreateICmpNE(numOfStrides, ZERO), mIsFinal);
     884            b->CreateAssert(hasStride, getName() + " has insufficient input data or output space for one stride");
     885        }
    858886        for (unsigned i = 0; i < inputSetCount; ++i) {
    859887            const ProcessingRate & rate = mStreamSetInputs[i].getRate();
     
    908936    // Copy back data to the actual output buffers.
    909937    for (unsigned i = 0; i < outputSetCount; i++) {
    910         Value * const tempBuffer = temporaryOutputBuffer[i];
     938        AllocaInst * const tempBuffer = temporaryOutputBuffer[i];
    911939        if (LLVM_UNLIKELY(tempBuffer == nullptr)) {
    912940            continue;
    913941        }
    914         Value * const baseBuffer = baseOutputBuffer[i];
     942        Value * const baseBuffer = mStreamSetOutputBaseAddress[i];
    915943        assert ("stack corruption likely" && (tempBuffer->getType() == baseBuffer->getType()));
    916944        const auto & name = mStreamSetOutputs[i].getName();
    917945        BasicBlock * const copyToBack = b->CreateBasicBlock(name + "CopyToBack");
    918946        BasicBlock * const copyToFront = b->CreateBasicBlock(name + "CopyToFront");
    919         BasicBlock * const clearBuffer = b->CreateBasicBlock(name + "ClearBuffer");
    920947        BasicBlock * const resume = b->CreateBasicBlock(name + "ResumeCopyBack");
    921948        // If we used a temporary buffer, copy it back to the original output buffer
     
    930957        b->CreateStreamCpy(name, baseBuffer, offset, tempBuffer, ZERO, toWrite, alignment);
    931958        // If we required a temporary output buffer, we will probably need to write to the beginning of the buffer as well.
    932         b->CreateLikelyCondBr(b->CreateICmpULT(toWrite, newlyProduced), copyToFront, clearBuffer);
     959        b->CreateLikelyCondBr(b->CreateICmpULT(toWrite, newlyProduced), copyToFront, resume);
    933960
    934961        b->SetInsertPoint(copyToFront);
     
    936963        Value * const baseAddress = b->getBaseAddress(name);
    937964        b->CreateStreamCpy(name, baseAddress, ZERO, tempBuffer, toWrite, remaining, alignment);
    938         b->CreateBr(clearBuffer);
    939         // Clear the buffer after use since we may end up reusing it within the same stride
    940         b->SetInsertPoint(clearBuffer);
    941 
    942965        b->CreateBr(resume);
    943966
     
    970993        Value * const avail = mInitialAvailableItemCount[i];
    971994        Value * const processed = b->getProcessedItemCount(name);
    972         b->CreateAssert(b->CreateICmpULE(processed, avail), name + ": processed data cannot exceed available data");
     995        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     996            b->CreateAssert(b->CreateICmpULE(processed, avail), name + ": processed data cannot exceed available data");
     997        }
    973998        Value * const remaining = b->CreateSub(avail, processed);
    974999        Value * const remainingStrides = b->CreateUDiv(remaining, inputStrideSize[i]);
     
    9871012        if (LLVM_LIKELY(outputStrideSize[i] != nullptr)) {
    9881013            Value * const consumed = b->getConsumedItemCount(name);
    989             b->CreateAssert(b->CreateICmpULE(consumed, produced), name + ": consumed data cannot exceed produced data");
     1014            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     1015                b->CreateAssert(b->CreateICmpULE(consumed, produced), name + ": consumed data cannot exceed produced data");
     1016            }
    9901017            Value * const unconsumed = b->CreateSub(produced, consumed);
    9911018            Value * const capacity = b->getCapacity(name);
    992             b->CreateAssert(b->CreateICmpULE(unconsumed, capacity), name + ": unconsumed data cannot exceed capacity");
     1019            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     1020                b->CreateAssert(b->CreateICmpULE(unconsumed, capacity), name + ": unconsumed data cannot exceed capacity");
     1021            }
    9931022            Value * const remaining = b->CreateSub(capacity, unconsumed);
    9941023            Value * const remainingStrides = b->CreateUDiv(remaining, outputStrideSize[i]);
     
    11811210    BasicBlock * const doFinalBlock = b->CreateBasicBlock(getName() + "_doFinalBlock");
    11821211    BasicBlock * const segmentDone = b->CreateBasicBlock(getName() + "_segmentDone");
    1183     b->CreateAssert(b->CreateXor(b->CreateIsNotNull(numOfBlocks), mIsFinal),
    1184                     "numOfStrides cannot be 0 unless this is the final stride and must be 0 if it is");
     1212    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     1213        b->CreateAssert(b->CreateXor(b->CreateIsNotNull(numOfBlocks), mIsFinal),
     1214                        "numOfStrides cannot be 0 unless this is the final stride and must be 0 if it is");
     1215    }
    11851216    const auto inputSetCount = mStreamSetInputs.size();
    11861217    Value * baseProcessedIndex[inputSetCount];
Note: See TracChangeset for help on using the changeset viewer.