Changeset 5831 for icGREP


Ignore:
Timestamp:
Jan 14, 2018, 3:30:04 PM (13 months ago)
Author:
nmedfort
Message:

Potential bug fix for 32-bit

Location:
icGREP/icgrep-devel/icgrep
Files:
14 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/grep_engine.cpp

    r5824 r5831  
    9797//
    9898// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
    99 //#define USE_DIRECT_LF_BUILDER 1
     99
     100unsigned LLVM_READNONE calculateMaxCountRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
     101    const unsigned packSize = b->getSizeTy()->getBitWidth();
     102    return (packSize * packSize) / b->getBitBlockWidth();
     103}
    100104
    101105std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
     
    103107    const unsigned segmentSize = codegen::SegmentSize;
    104108    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
     109    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
     110    const unsigned baseBufferSize = segmentSize * (MaxCountFlag > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments);
    105111    const unsigned encodingBits = 8;
    106112
    107     StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments);
     113    StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     114
     115    #ifdef USE_DIRECT_LF_BUILDER
     116    kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, encodingBits);
     117    mGrepDriver->makeKernelCall(linefeedK, {ByteStream}, {LineFeedStream});
     118    #endif
     119
     120    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
    108121    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
    109122    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
    110123
    111     StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
    112 #ifdef USE_DIRECT_LF_BUILDER
    113     kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::DirectLineFeedBuilder>(idb);
    114     mGrepDriver->makeKernelCall(linefeedK, {ByteStream}, {LineFeedStream});
    115 #else
     124    #ifndef USE_DIRECT_LF_BUILDER
    116125    kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, encodingBits);
    117126    mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
    118 #endif
    119 
    120     StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
    121     StreamSetBuffer * CRLFStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     127    #endif
     128
     129    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     130    StreamSetBuffer * CRLFStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    122131    kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance<kernel::LineBreakKernelBuilder>(idb, encodingBits);
    123132    mGrepDriver->makeKernelCall(linebreakK, {BasisBits, LineFeedStream}, {LineBreakStream, CRLFStream});
    124133
    125134    kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
    126     StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(3, 1), segmentSize * bufferSegments);
     135    StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(3, 1), baseBufferSize);
    127136    mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
    128137
    129138    const auto n = REs.size();
    130139    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
    131 
    132140    for(unsigned i = 0; i < n; ++i) {
    133141        REs[i] = resolveModesAndExternalSymbols(REs[i]);
     
    137145#ifdef USE_MULTIPLEX_CC
    138146        const std::vector<const re::CC *> UnicodeSets = re::collectUnicodeSets(REs[i]);
     147
     148        StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    139149        if (UnicodeSets.size() <= 1) {
    140             StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
    141150            kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]);
    142151            mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
    143152            MatchResultsBufs[i] = MatchResults;
    144         }
    145         else {
     153        } else {
    146154            mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
    147155            REs[i] = transformCCs(mpx.get(), REs[i]);
    148156            std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
    149157            auto numOfCharacterClasses = mpx_basis.size();
    150             StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments);
     158            StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
    151159            kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
    152             mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
    153             StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     160            mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});           
    154161            kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], std::vector<cc::Alphabet *>{mpx.get()});
    155162            mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams, CharClasses}, {MatchResults});
     
    157164        }
    158165#else
    159         StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     166        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    160167        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]);
    161168        mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
     
    165172    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
    166173    if (REs.size() > 1) {
    167         MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     174        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    168175        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, REs.size());
    169176        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
     
    174181        StreamSetBuffer * OriginalMatches = Matches;
    175182        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
    176         Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     183        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    177184        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
    178185    }
     
    181188        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
    182189        StreamSetBuffer * OriginalMatches = Matches;
    183         Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     190        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    184191        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
    185192    }
     
    187194        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
    188195        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
    189         StreamSetBuffer * AllMatches = Matches;
    190         Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     196        StreamSetBuffer * const AllMatches = Matches;
     197        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    191198        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
    192199    }
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5793 r5831  
    593593 * @brief requiresBufferedFinalStride
    594594 ** ------------------------------------------------------------------------------------------------------------- */
    595 inline bool requiresBufferedFinalStride(const Binding & binding) {
     595inline bool LLVM_READNONE requiresBufferedFinalStride(const Binding & binding) {
    596596    if (LLVM_LIKELY(isa<ArrayType>(binding.getType()))) {
    597597        return binding.getType()->getArrayNumElements() == 1;
     
    603603 * @brief getItemWidth
    604604 ** ------------------------------------------------------------------------------------------------------------- */
    605 inline unsigned getItemWidth(const Binding & b) {
     605inline unsigned LLVM_READNONE getItemWidth(const Binding & b) {
    606606    Type * ty = b.getType();
    607607    if (LLVM_LIKELY(isa<ArrayType>(ty))) {
     
    612612
    613613/** ------------------------------------------------------------------------------------------------------------- *
    614  * @brief getUpperBound
    615  ** ------------------------------------------------------------------------------------------------------------- */
    616 bool MultiBlockKernel::isTransitivelyUnknownRate(const ProcessingRate & rate) const {
     614 * @brief isTransitivelyUnknownRate
     615 ** ------------------------------------------------------------------------------------------------------------- */
     616bool LLVM_READNONE MultiBlockKernel::isTransitivelyUnknownRate(const ProcessingRate & rate) const {
    617617    if (rate.isUnknown()) {
    618618        return true;
     
    624624
    625625/** ------------------------------------------------------------------------------------------------------------- *
     626 * @brief requiresTemporaryInputBuffer
     627 ** ------------------------------------------------------------------------------------------------------------- */
     628inline bool LLVM_READNONE MultiBlockKernel::requiresTemporaryInputBuffer(const Binding & binding, const ProcessingRate & rate) const {
     629    if (requiresBufferedFinalStride(binding)) {
     630        return true;
     631    } else if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate))) {
     632        report_fatal_error("MultiBlock kernels do not support unknown rate input streams or streams relative to an unknown rate input.");
     633    } else {
     634        return !rate.isFixed();
     635    }
     636}
     637
     638/** ------------------------------------------------------------------------------------------------------------- *
     639 * @brief requiresTemporaryOutputBuffer
     640 ** ------------------------------------------------------------------------------------------------------------- */
     641inline bool LLVM_READNONE MultiBlockKernel::requiresTemporaryOutputBuffer(const Binding & binding, const ProcessingRate & rate) const {
     642    if (requiresBufferedFinalStride(binding)) {
     643        return true;
     644    } else {
     645        return !(rate.isFixed() || isTransitivelyUnknownRate(rate));
     646    }
     647}
     648
     649/** ------------------------------------------------------------------------------------------------------------- *
    626650 * @brief getItemAlignment
    627651 ** ------------------------------------------------------------------------------------------------------------- */
    628 inline unsigned MultiBlockKernel::getItemAlignment(const Binding & binding) const {
     652inline unsigned LLVM_READNONE MultiBlockKernel::getItemAlignment(const Binding & binding) const {
    629653    const auto & rate = binding.getRate();
    630654    if (rate.isFixed() && binding.nonDeferred() && !binding.isMisaligned()) {
     
    641665
    642666/** ------------------------------------------------------------------------------------------------------------- *
     667 * @brief getCopyAlignment
     668 ** ------------------------------------------------------------------------------------------------------------- */
     669inline unsigned LLVM_READNONE MultiBlockKernel::getCopyAlignment(const Binding & binding) const {
     670    return ((getItemAlignment(binding) * getItemWidth(binding)) + 7) / 8;
     671}
     672
     673/** ------------------------------------------------------------------------------------------------------------- *
    643674 * @brief getStrideSize
    644675 ** ------------------------------------------------------------------------------------------------------------- */
    645 llvm::Value * MultiBlockKernel::getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate) {
     676llvm::Value * LLVM_READNONE MultiBlockKernel::getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate) {
    646677    // NOTE: if we ever support feedback loops, using upper bound could lead to a deadlock due to data starvation
    647678    const auto r = getUpperBound(rate);
     
    669700    const auto outputSetCount = mStreamSetOutputs.size();
    670701
    671     // Define and allocate the temporary buffer area in the prolog.
     702    // Define and allocate the temporary buffer area in the prolog.   
    672703    const auto blockAlignment = b->getBitBlockWidth() / 8;
    673704    AllocaInst * temporaryInputBuffer[inputSetCount];
    674     for (unsigned i = 0; i < inputSetCount; ++i) {
     705    for (unsigned i = 0; i < inputSetCount; ++i) {       
    675706        const Binding & input = mStreamSetInputs[i];
    676707        const ProcessingRate & rate = input.getRate();
    677         if (isTransitivelyUnknownRate(rate)) {
    678             report_fatal_error("MultiBlock kernels do not support unknown rate input streams or streams relative to an unknown rate input.");
    679         } else if (rate.isFixed() && !requiresBufferedFinalStride(input)) {
    680             temporaryInputBuffer[i] = nullptr;
    681         } else {
     708        temporaryInputBuffer[i] = nullptr;
     709        if (requiresTemporaryInputBuffer(input, rate)) {
    682710            Type * const ty = mStreamSetInputBuffers[i]->getStreamSetBlockType();
    683711            auto ub = getUpperBound(rate);
     
    696724        const Binding & output = mStreamSetOutputs[i];
    697725        const ProcessingRate & rate = output.getRate();
    698         if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate) || (rate.isFixed() && !requiresBufferedFinalStride(output)))) {
    699             temporaryOutputBuffer[i] = nullptr;
    700         } else {           
     726        temporaryOutputBuffer[i] = nullptr;
     727        if (requiresTemporaryOutputBuffer(output, rate)) {
    701728            auto ub = getUpperBound(rate);
    702729            if (LLVM_UNLIKELY(mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate))) {
     
    710737        }
    711738    }
    712 
    713     // Now we iteratively process these blocks using the doMultiBlock method.
    714     // In each iteration, we check how many linearly accessible / writable
    715     // items can be processed with our current input / output buffers. If we
    716     // cannot support an full stride, we check whether (a) there is enough
    717     // input data to process but it is not linearly accessible, in which case
    718     // we move the data into temporary buffers or (b) there is not enough data
    719     // to process, in which case we abort unless IsFinal was set.
    720739
    721740    Constant * const ZERO = b->getSize(0);
     
    738757    }
    739758
    740     // Now proceed with creation of the doSegment method.
    741     BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
    742 
    743     b->CreateBr(segmentLoop);
    744 
    745     /// DO SEGMENT LOOP
    746 
    747     b->SetInsertPoint(segmentLoop);
    748 
    749     // For each input buffer, get the initial processed item count, base input pointer, and the number of
    750     // linearly available strides.
    751     Value * numOfStrides = nullptr;
    752759    mInitialAvailableItemCount.assign(mAvailableItemCount.begin(), mAvailableItemCount.end());
    753760    mInitialProcessedItemCount.resize(inputSetCount);
    754761    mStreamSetInputBaseAddress.resize(inputSetCount);
     762
     763    // Now proceed with creation of the doSegment method.
     764    BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
     765
     766    b->CreateBr(segmentLoop);
     767
     768    /// DO SEGMENT LOOP
     769
     770    b->SetInsertPoint(segmentLoop);
     771
     772    Value * numOfStrides = nullptr;
     773
     774    // TODO: we don't want the our available output space to limit how many conditional blocks we
     775    // can check. When we have a conditional region, split computation of input/output strides and
     776    // check as many input strides as possible but leave the kernel in a state that respects our
     777    // available output space. NOTE: we know coming into this block that the pipeline or kernel has
     778    // ensured there is at least one stride worth of space.
     779
     780
     781    // For each input buffer, get the initial processed item count, base input pointer, and the number of
     782    // linearly available strides.
    755783    Value * inputStrideSize[inputSetCount];
     784    Value * linearlyAccessible[inputSetCount];
    756785    for (unsigned i = 0; i < inputSetCount; i++) {
    757786        const Binding & input = mStreamSetInputs[i];
    758787        const auto & name = input.getName();
    759         const ProcessingRate & rate = input.getRate();
    760788        Value * const processed = b->getProcessedItemCount(name);
    761 
    762789        mInitialProcessedItemCount[i] = processed;
    763         Value * baseBuffer  = b->getBlockAddress(name, b->CreateLShr(processed, LOG_2_BLOCK_WIDTH));
    764 
    765         if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {           
     790        mStreamSetInputBaseAddress[i] = b->getBlockAddress(name, b->CreateLShr(processed, LOG_2_BLOCK_WIDTH));
     791        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    766792            b->CreateAssert(b->CreateICmpULE(processed, mAvailableItemCount[i]),
    767793                            getName() + ": " + name + " processed item count exceeds its available item count");
    768794        }
    769 
    770         // Ensure that everything between S⌈P/S⌉, and S⌈n*(P + L)/S⌉ is linearly available, where S is the stride size,
    771         // P is the current processed position, L is the lookahead amount and n ∈ ℤ+.
    772 
    773         Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], processed);
    774         Value * avail = b->getLinearlyAccessibleItems(name, processed, unprocessed);
    775         Value * remaining = avail;
    776         if (LLVM_UNLIKELY(input.hasLookahead())) {
    777             Constant * const lookahead = b->getSize(input.getLookahead());
    778             remaining = b->CreateSelect(b->CreateICmpULT(lookahead, remaining), b->CreateSub(remaining, lookahead), ZERO);
    779         }
    780 
    781         inputStrideSize[i] = getStrideSize(b, rate);
    782         Value * accessibleStrides = b->CreateUDiv(remaining, inputStrideSize[i]);
     795        Value * const unprocessed = b->CreateNUWSub(mAvailableItemCount[i], processed);
     796        mAvailableItemCount[i] = unprocessed;
     797        Value * const accessible = b->getLinearlyAccessibleItems(name, processed, unprocessed);
     798        linearlyAccessible[i] = accessible;
     799        inputStrideSize[i] = getStrideSize(b, input.getRate());
     800        Value * const accessibleStrides = b->CreateUDiv(accessible, inputStrideSize[i]);
     801        numOfStrides = b->CreateUMin(numOfStrides, accessibleStrides);
     802    }
     803
     804    BasicBlock * const checkInputAvailability = b->CreateBasicBlock("CheckInputAvailability");
     805    BasicBlock * const selectOutputBuffers = b->CreateBasicBlock("SelectOutputBuffers");
     806    b->CreateLikelyCondBr(b->CreateICmpNE(numOfStrides, ZERO), selectOutputBuffers, checkInputAvailability);
     807
     808    // Ensure that everything between S⌈P/S⌉ and S⌈n*(P + L)/S⌉ is linearly available, where S is the stride size,
     809    // P is the current processed position, L is the lookahead amount and n is our number of accessible strides ∈ ℤ+.
     810    b->SetInsertPoint(checkInputAvailability);
     811    Value * const initiallyFinal = mIsFinal;
     812    Value * linearlyCopyable[inputSetCount];
     813    PHINode * selectedInputBuffer[inputSetCount];
     814    for (unsigned i = 0; i < inputSetCount; i++) {
    783815        AllocaInst * const tempBuffer = temporaryInputBuffer[i];
     816        selectedInputBuffer[i] = nullptr;
    784817        if (tempBuffer) {
    785818
    786             // Since we trust that the pipeline won't call this kernel unless there is enough data to process a stride, whenever
    787             // we discover that there isn't enough linearly available data, optimistically copy the data to the temporary buffer.
     819            const Binding & input = mStreamSetInputs[i];
     820            const auto & name = input.getName();
     821            Value * const processed = mInitialProcessedItemCount[i];
     822            Value * const unprocessed = mAvailableItemCount[i];
     823            Value * const accessible = linearlyAccessible[i];
    788824
    789825            BasicBlock * const entry = b->GetInsertBlock();
     
    792828            BasicBlock * const resume = b->CreateBasicBlock(name + "Resume");
    793829
    794             b->CreateUnlikelyCondBr(b->CreateICmpEQ(accessibleStrides, ZERO), copyFromBack, resume);
     830            Value * strideSize = inputStrideSize[i];
     831            if (LLVM_UNLIKELY(input.hasLookahead())) {
     832                Constant * const lookahead = b->getSize(input.getLookahead());
     833                strideSize = b->CreateNUWAdd(strideSize, lookahead);
     834            }
     835            Value * const requiresCopy = b->CreateICmpULT(accessible, strideSize);
     836            b->CreateUnlikelyCondBr(requiresCopy, copyFromBack, resume);
    795837
    796838            b->SetInsertPoint(copyFromBack);
    797839            Value * const arraySize = b->CreateZExt(tempBuffer->getArraySize(), b->getInt64Ty());
    798             Value * const temporarySize = b->CreateTrunc(b->CreateMul(arraySize, b->getInt64(mStride)), unprocessed->getType());
    799             Value * const temporaryAvailable = b->CreateUMin(unprocessed, temporarySize);
     840            Value * const temporarySize = b->CreateTrunc(b->CreateMul(arraySize, b->getInt64(mStride)), accessible->getType());
     841            Value * const copyable = b->CreateUMin(unprocessed, temporarySize); // <- we only really need strideSize items
    800842            Value * const offset = b->CreateAnd(processed, BLOCK_WIDTH_MASK);
    801843            Value * const bufferSize = b->CreateMul(ConstantExpr::getSizeOf(tempBuffer->getAllocatedType()), arraySize);
    802844            b->CreateMemZero(tempBuffer, bufferSize, blockAlignment);
    803             const auto copyAlignment = getItemAlignment(mStreamSetInputs[i]);
    804             b->CreateStreamCpy(name, tempBuffer, ZERO, baseBuffer, offset, avail, copyAlignment);
    805             Value * const temporaryStrides = b->CreateSelect(b->CreateICmpULT(unprocessed, inputStrideSize[i]), ZERO, ONE);
     845            b->CreateStreamCpy(name, tempBuffer, ZERO, mStreamSetInputBaseAddress[i], offset, accessible, getItemAlignment(input));
    806846            BasicBlock * const copyToBackEnd = b->GetInsertBlock();
    807             b->CreateCondBr(b->CreateICmpNE(temporaryAvailable, unprocessed), copyFromFront, resume);
     847            b->CreateCondBr(b->CreateICmpNE(copyable, accessible), copyFromFront, resume);
    808848
    809849            b->SetInsertPoint(copyFromFront);
    810             Value * const remaining = b->CreateSub(temporaryAvailable, avail);
     850            Value * const remaining = b->CreateSub(copyable, accessible);
    811851            Value * const baseAddress = b->getBaseAddress(name);
    812             b->CreateStreamCpy(name, tempBuffer, avail, baseAddress, ZERO, remaining, copyAlignment);
     852            b->CreateStreamCpy(name, tempBuffer, accessible, baseAddress, ZERO, remaining, getItemAlignment(input));
     853            Value * const isPartialStride = b->CreateICmpUGE(copyable, strideSize);
    813854            BasicBlock * const copyToFrontEnd = b->GetInsertBlock();
    814855            b->CreateBr(resume);
    815856
    816857            b->SetInsertPoint(resume);
    817             PHINode * const bufferPtr = b->CreatePHI(baseBuffer->getType(), 4);
    818             bufferPtr->addIncoming(baseBuffer, entry);
    819             bufferPtr->addIncoming(tempBuffer, copyToBackEnd);
    820             bufferPtr->addIncoming(tempBuffer, copyToFrontEnd);
    821             baseBuffer = bufferPtr;
    822 
    823             PHINode * const phiAvailItemCount = b->CreatePHI(b->getSizeTy(), 4);
    824             phiAvailItemCount->addIncoming(avail, entry);
    825             phiAvailItemCount->addIncoming(temporaryAvailable, copyToBackEnd);
    826             phiAvailItemCount->addIncoming(temporaryAvailable, copyToFrontEnd);
    827             avail = phiAvailItemCount;
    828 
    829             PHINode * const phiStrides = b->CreatePHI(b->getSizeTy(), 4);
    830             phiStrides->addIncoming(accessibleStrides, entry);
    831             phiStrides->addIncoming(temporaryStrides, copyToBackEnd);
    832             phiStrides->addIncoming(temporaryStrides, copyToFrontEnd);
    833             accessibleStrides = phiStrides;
    834         }
    835         mAvailableItemCount[i] = avail;
    836         mStreamSetInputBaseAddress[i] = baseBuffer;
    837         numOfStrides = b->CreateUMin(numOfStrides, accessibleStrides);
    838     }
     858            PHINode * const address = b->CreatePHI(tempBuffer->getType(), 3);
     859            address->addIncoming(mStreamSetInputBaseAddress[i], entry);
     860            address->addIncoming(tempBuffer, copyToBackEnd);
     861            address->addIncoming(tempBuffer, copyToFrontEnd);
     862            selectedInputBuffer[i] = address;
     863            PHINode * const available = b->CreatePHI(accessible->getType(), 3);
     864            available->addIncoming(accessible, entry);
     865            available->addIncoming(copyable, copyToBackEnd);
     866            available->addIncoming(copyable, copyToFrontEnd);
     867            linearlyCopyable[i] = available;
     868            PHINode * const finalStride = b->CreatePHI(b->getInt1Ty(), 3);
     869            finalStride->addIncoming(mIsFinal, entry);
     870            finalStride->addIncoming(b->getTrue(), copyToBackEnd);
     871            finalStride->addIncoming(isPartialStride, copyToFrontEnd);
     872            mIsFinal = finalStride;
     873            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     874                Value * const hasStride = b->CreateOr(initiallyFinal, b->CreateNot(finalStride));
     875                b->CreateAssert(hasStride, getName() + ": " + name + " has insufficient input data for one stride");
     876            }
     877        }
     878    }
     879
     880    BasicBlock * const endCheckInputAvailability = b->GetInsertBlock();
     881    selectOutputBuffers->moveAfter(endCheckInputAvailability);
     882    b->CreateBr(selectOutputBuffers);
     883
     884    b->SetInsertPoint(selectOutputBuffers);
     885    PHINode * const final = b->CreatePHI(mIsFinal->getType(), 2);
     886    final->addIncoming(b->getFalse(), segmentLoop);
     887    final->addIncoming(mIsFinal, endCheckInputAvailability);
     888    mIsFinal = final;
     889    for (unsigned i = 0; i < inputSetCount; i++) {
     890        if (selectedInputBuffer[i]) {
     891            PHINode * const address = b->CreatePHI(selectedInputBuffer[i]->getType(), 2);
     892            address->addIncoming(mStreamSetInputBaseAddress[i], segmentLoop);
     893            address->addIncoming(selectedInputBuffer[i], endCheckInputAvailability);
     894            mStreamSetInputBaseAddress[i] = address;
     895            PHINode * const accessible = b->CreatePHI(linearlyAccessible[i]->getType(), 2);
     896            accessible->addIncoming(linearlyAccessible[i], segmentLoop);
     897            accessible->addIncoming(linearlyCopyable[i], endCheckInputAvailability);
     898            linearlyAccessible[i] = accessible;
     899        }
     900    }
     901    PHINode * const strides = b->CreatePHI(numOfStrides->getType(), 2);
     902    strides->addIncoming(numOfStrides, segmentLoop);
     903    strides->addIncoming(ONE, endCheckInputAvailability);
     904    numOfStrides = strides;
    839905
    840906    // Now determine the linearly writeable strides
     907    Value * outputStrideSize[outputSetCount];
    841908    Value * linearlyWritable[outputSetCount];
    842     Value * outputStrideSize[outputSetCount];
    843909    mInitialProducedItemCount.resize(outputSetCount);
    844910    mStreamSetOutputBaseAddress.resize(outputSetCount);
     
    846912        const auto & output = mStreamSetOutputs[i];
    847913        const auto & name = output.getName();
    848         const ProcessingRate & rate = output.getRate();
    849914        Value * const produced = b->getProducedItemCount(name);
    850915        Value * baseBuffer = b->getBlockAddress(name, b->CreateLShr(produced, LOG_2_BLOCK_WIDTH));
    851         assert (baseBuffer->getType()->isPointerTy());
    852         linearlyWritable[i] = b->getLinearlyWritableItems(name, produced);
    853         outputStrideSize[i] = getStrideSize(b, rate);
     916        mInitialProducedItemCount[i] = produced;
     917        mStreamSetOutputBaseAddress[i] = baseBuffer;
     918
    854919        // Is the number of linearly writable items sufficient for a stride?
     920        outputStrideSize[i] = getStrideSize(b, output.getRate());
    855921        if (outputStrideSize[i]) {
     922            linearlyWritable[i] = b->getLinearlyWritableItems(name, produced);
     923            Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);
     924            numOfStrides = b->CreateUMin(numOfStrides, writableStrides);
     925            // Do we require a temporary buffer to write to?
    856926            AllocaInst * const tempBuffer = temporaryOutputBuffer[i];
    857             Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);
    858             // Do we require a temporary buffer to write to?
    859927            if (tempBuffer) {
    860928                assert (tempBuffer->getType() == baseBuffer->getType());
     
    876944                baseBuffer = phiBuffer;
    877945                PHINode * const phiStrides = b->CreatePHI(b->getSizeTy(), 2);
    878                 phiStrides->addIncoming(writableStrides, entry);
     946                phiStrides->addIncoming(numOfStrides, entry);
    879947                phiStrides->addIncoming(ONE, prepareTempBuffer);
    880                 writableStrides = phiStrides;
    881             }
    882             numOfStrides = b->CreateUMin(numOfStrides, writableStrides);
    883         }
    884         mInitialProducedItemCount[i] = produced;
    885         mStreamSetOutputBaseAddress[i] = baseBuffer;
    886     }
    887 
    888     BasicBlock * const segmentDone = b->CreateBasicBlock("SegmentDone");
    889 
    890     Value * const initiallyFinal = mIsFinal;
    891     if (LLVM_LIKELY(numOfStrides != nullptr)) {
    892         mIsFinal = b->CreateAnd(mIsFinal, b->CreateICmpEQ(numOfStrides, ZERO));
    893         if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    894             Value * const hasStride = b->CreateOr(b->CreateICmpNE(numOfStrides, ZERO), mIsFinal);
    895             b->CreateAssert(hasStride, getName() + " has insufficient input data or output space for one stride");
    896         }
    897         for (unsigned i = 0; i < inputSetCount; ++i) {
    898             const auto & input = mStreamSetInputs[i];
    899             const ProcessingRate & rate = input.getRate();
    900             if (rate.isFixed() && input.nonDeferred()) {
    901                 mAvailableItemCount[i] = b->CreateSelect(mIsFinal, mAvailableItemCount[i], b->CreateMul(numOfStrides, inputStrideSize[i]));
    902             }
    903         }
    904     }
    905 
    906     //  We have one or more blocks of input data and output buffer space for all stream sets.
     948                numOfStrides = phiStrides;
     949            }
     950            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     951                b->CreateAssert(numOfStrides, getName() + ": " + name + " has insufficient output space for one stride");
     952            }
     953        }
     954    }
     955
     956    // Update the locally available item count to reflect the current state
     957    for (unsigned i = 0; i < inputSetCount; i++) {
     958        const Binding & input = mStreamSetInputs[i];
     959        if (input.getRate().isFixed() && input.nonDeferred()) {
     960            Value * const processable = b->CreateMul(numOfStrides, inputStrideSize[i]);
     961            linearlyAccessible[i] = b->CreateSelect(mIsFinal, linearlyAccessible[i], processable);
     962        }
     963        mAvailableItemCount[i] = linearlyAccessible[i];
     964    }
     965
     966    //  We have one or more strides of input data and output buffer space for all stream sets.
    907967    generateMultiBlockLogic(b, numOfStrides);
    908968
     
    920980        const ProcessingRate & rate = output.getRate();
    921981        if (rate.isFixed()) {
    922             assert (output.nonDeferred());
    923982            Value * const produced = b->CreateMul(numOfStrides, outputStrideSize[i]);
    924             Value * const ic = b->CreateAdd(mInitialProducedItemCount[i], produced);
     983            Value * const ic = b->CreateNUWAdd(mInitialProducedItemCount[i], produced);
    925984            b->setProducedItemCount(output.getName(), ic);
    926985        }
     
    9501009    // Copy back data to the actual output buffers.
    9511010    for (unsigned i = 0; i < outputSetCount; i++) {
     1011
    9521012        AllocaInst * const tempBuffer = temporaryOutputBuffer[i];
    9531013        if (LLVM_UNLIKELY(tempBuffer == nullptr)) {
     
    9551015        }
    9561016
     1017        const auto & name = mStreamSetOutputs[i].getName();
     1018        Value * const produced = b->getProducedItemCount(name);
    9571019        Value * const baseBuffer = mStreamSetOutputBaseAddress[i];
    9581020        assert ("stack corruption likely" && (tempBuffer->getType() == baseBuffer->getType()));
    959         const auto & name = mStreamSetOutputs[i].getName();
     1021        //const auto & name = mStreamSetOutputs[i].getName();
    9601022        BasicBlock * const copyToBack = b->CreateBasicBlock(name + "CopyToBack");
    9611023        BasicBlock * const copyToFront = b->CreateBasicBlock(name + "CopyToFront");
    9621024        BasicBlock * const resume = b->CreateBasicBlock(name + "ResumeCopyBack");
    9631025        // If we used a temporary buffer, copy it back to the original output buffer
    964         b->CreateCondBr(b->CreateICmpEQ(tempBuffer, baseBuffer), copyToBack, resume);
     1026        Value * const requiresCopy = b->CreateICmpEQ(tempBuffer, baseBuffer);
     1027        b->CreateCondBr(requiresCopy, copyToBack, resume);
    9651028
    9661029        b->SetInsertPoint(copyToBack);       
    9671030        Value * const offset = b->CreateAnd(mInitialProducedItemCount[i], BLOCK_WIDTH_MASK);
    968         Value * const newProducedItemCount = b->getProducedItemCount(name);
    969         Value * const newlyProduced = b->CreateSub(newProducedItemCount, mInitialProducedItemCount[i]);
     1031        //Value * const newProducedItemCount = b->getProducedItemCount(name);
     1032        Value * const newlyProduced = b->CreateNUWSub(produced, mInitialProducedItemCount[i]);
    9701033        Value * const toWrite = b->CreateUMin(newlyProduced, linearlyWritable[i]);
    9711034        const auto alignment = getItemAlignment(mStreamSetOutputs[i]);
     
    9751038
    9761039        b->SetInsertPoint(copyToFront);
    977         Value * const remaining = b->CreateSub(newlyProduced, toWrite);
     1040        Value * const remaining = b->CreateNUWSub(newlyProduced, toWrite);
    9781041        Value * const baseAddress = b->getBaseAddress(name);
    9791042        b->CreateStreamCpy(name, baseAddress, ZERO, tempBuffer, toWrite, remaining, alignment);
     
    9871050    BasicBlock * const setTermination = b->CreateBasicBlock("setTermination");
    9881051    b->CreateCondBr(mIsFinal, setTermination, strideDone);
    989 
    9901052    b->SetInsertPoint(setTermination);
    9911053    b->setTerminationSignal();
     1054    BasicBlock * const segmentDone = b->CreateBasicBlock("SegmentDone");
    9921055    b->CreateBr(segmentDone);
    9931056
     
    10071070        }
    10081071        Value * remaining = b->CreateSub(avail, processed);
     1072        Value * strideSize = inputStrideSize[i];
    10091073        if (LLVM_UNLIKELY(input.hasLookahead())) {
    1010             Constant * const lookahead = b->getSize(input.getLookahead());
    1011             remaining = b->CreateSelect(b->CreateICmpULT(lookahead, remaining), b->CreateSub(remaining, lookahead), ZERO);
    1012         }
    1013         Value * const remainingStrides = b->CreateUDiv(remaining, inputStrideSize[i]);
    1014         Value * const hasRemainingStrides = b->CreateICmpNE(remainingStrides, ZERO);
     1074            strideSize = b->CreateNUWAdd(strideSize, b->getSize(input.getLookahead()));
     1075        }
     1076        Value * const hasRemainingStrides = b->CreateICmpUGE(remaining, strideSize);
    10151077        hasMoreStrides = b->CreateAnd(hasMoreStrides, hasRemainingStrides);
    10161078    }
     
    10371099            }
    10381100            Value * const remaining = b->CreateSub(capacity, unconsumed);
    1039             Value * const remainingStrides = b->CreateUDiv(remaining, outputStrideSize[i]);
    1040             Value * const hasRemainingStrides = b->CreateICmpNE(remainingStrides, ZERO);
    1041 
     1101            Value * const hasRemainingStrides = b->CreateICmpUGE(remaining, outputStrideSize[i]);
    10421102            hasMoreStrides = b->CreateAnd(hasMoreStrides, hasRemainingStrides);
    10431103        }
     
    12251285 * @brief generateMultiBlockLogic
    12261286 ** ------------------------------------------------------------------------------------------------------------- */
    1227 Value * BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
     1287void BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
    12281288
    12291289    if (LLVM_UNLIKELY(mStride != b->getBitBlockWidth())) {
     
    12391299    BasicBlock * const doFinalBlock = b->CreateBasicBlock(getName() + "_doFinalBlock");
    12401300    BasicBlock * const segmentDone = b->CreateBasicBlock(getName() + "_segmentDone");
    1241     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    1242         b->CreateAssert(b->CreateXor(b->CreateIsNotNull(numOfBlocks), mIsFinal),
    1243                         "numOfStrides cannot be 0 unless this is the final stride and must be 0 if it is");
    1244     }
     1301
    12451302    const auto inputSetCount = mStreamSetInputs.size();
    12461303    Value * baseProcessedIndex[inputSetCount];
     
    13441401    }
    13451402
    1346     Value * const remainingItems = getRemainingItems(b);
    1347 
    1348 //    b->CallPrintInt(getName() + "_remainingItems", remainingItems);
    1349 
    1350     writeFinalBlockMethod(b, remainingItems);
     1403    writeFinalBlockMethod(b, getRemainingItems(b));
    13511404
    13521405    b->CreateBr(segmentDone);
     
    13681421    }
    13691422
    1370     return numOfBlocks;
    13711423}
    13721424
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r5793 r5831  
    433433    // exit the RetVoid instruction will be added to complete the method.
    434434    //
    435     virtual llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) = 0;
     435    virtual void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) = 0;
    436436
    437437private:
     
    444444    unsigned getItemAlignment(const Binding & binding) const;
    445445
     446    unsigned getCopyAlignment(const Binding & binding) const;
     447
    446448    bool isTransitivelyUnknownRate(const ProcessingRate & rate) const;
     449
     450    bool requiresTemporaryInputBuffer(const Binding & binding, const ProcessingRate & rate) const;
     451
     452    bool requiresTemporaryOutputBuffer(const Binding & binding, const ProcessingRate & rate) const;
    447453
    448454    llvm::Value * getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate);
     
    488494private:
    489495
    490     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
     496    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
    491497
    492498    void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b);
  • icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.cpp

    r5793 r5831  
    1111using namespace kernel;
    1212
    13 Value * LZ4ByteStreamDecoderKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * numOfStrides) {
     13void LZ4ByteStreamDecoderKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * numOfStrides) {
    1414
    1515    BasicBlock * entry_block = b->GetInsertBlock();
     
    169169
    170170    b->SetInsertPoint(loopExit);
    171     return numOfStrides;
    172171}
    173172
  • icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.h

    r5755 r5831  
    1818    LZ4ByteStreamDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, size_t bufferSize);
    1919protected:
    20     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * numOfStrides) override;
     20    void generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * numOfStrides) override;
    2121private:
    2222    size_t mBufferSize;
  • icGREP/icgrep-devel/icgrep/kernels/radix64.cpp

    r5755 r5831  
    3939// of bytes to the actual output stream.
    4040
    41 Value * expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
     41void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
    4242
    4343    BasicBlock * expand2_3entry = iBuilder->GetInsertBlock();
     
    131131    iBuilder->SetInsertPoint(expand3_4_exit);
    132132
    133     return numOfStrides;
    134133}
    135134
  • icGREP/icgrep-devel/icgrep/kernels/radix64.h

    r5755 r5831  
    2525    bool hasSignature() const override { return false; }
    2626private:
    27     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides) override;
     27    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides) override;
    2828};
    2929
  • icGREP/icgrep-devel/icgrep/kernels/scanmatchgen.cpp

    r5782 r5831  
    2121namespace kernel {
    2222
    23 Value * ScanMatchKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
     23void ScanMatchKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, Value * const numOfStrides) {
    2424
    2525    Module * const m = iBuilder->getModule();
     
    4747    Value * line_break = iBuilder->getInputStreamBlockPtr("lineBreak", iBuilder->getInt32(0));
    4848
    49     Value * blocksToDo = iBuilder->CreateAdd(numOfStrides, iBuilder->CreateZExt(mIsFinal, numOfStrides->getType()));
    50     blocksToDo = iBuilder->CreateMul(blocksToDo, iBuilder->getSize(mStride / iBuilder->getBitBlockWidth()));
     49    Value * const blocksToDo = iBuilder->CreateMul(numOfStrides, iBuilder->getSize(mStride / iBuilder->getBitBlockWidth()));
    5150   
    5251    Value * match_result_ptr = iBuilder->CreateBitCast(match_result, scanwordVectorType->getPointerTo());
     
    205204
    206205    iBuilder->SetInsertPoint(scanReturn);
    207     return numOfStrides;
    208206}
    209207
  • icGREP/icgrep-devel/icgrep/kernels/scanmatchgen.h

    r5755 r5831  
    2020    bool hasSignature() const override { return false; }
    2121private:
    22     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
     22    void generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
    2323};
    2424
  • icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.cpp

    r5793 r5831  
    1515namespace kernel {
    1616
    17 Value * StdOutKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) {
     17void StdOutKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) {
    1818    Value * codeUnitBuffer = b->getInputStreamBlockPtr("codeUnitBuffer", b->getInt32(0));
    1919    codeUnitBuffer = b->CreatePointerCast(codeUnitBuffer, b->getInt8PtrTy());
     
    2525    }
    2626    b->CreateWriteCall(b->getInt32(1), codeUnitBuffer, bytesToDo);
    27     return numOfStrides;
    2827}
    2928
     
    6463}
    6564
    66 Value * FileSink::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfStrides) {
     65void FileSink::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfStrides) {
    6766    Value * const fileDes = b->getScalarField("fileDes");
    6867    Value * codeUnitBuffer = b->getInputStreamBlockPtr("codeUnitBuffer", b->getInt32(0));
     
    7574    }   
    7675    b->CreateWriteCall(fileDes, codeUnitBuffer, bytesToDo);
    77     return numOfStrides;
    7876}
    7977
  • icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.h

    r5793 r5831  
    1616    StdOutKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned codeUnitWidth);
    1717private:
    18     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
     18    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
    1919private:
    2020    const unsigned mCodeUnitWidth;
     
    2727protected:
    2828    void generateInitializeMethod(const std::unique_ptr<KernelBuilder> & iBuilder) override;
    29     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
     29    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
    3030    void generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & b) override;
    3131private:
  • icGREP/icgrep-devel/icgrep/kernels/until_n.cpp

    r5830 r5831  
    1717namespace kernel {
    1818
    19 const unsigned packSize = 64;
    20    
    21 llvm::Value * UntilNkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) {
     19void UntilNkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) {
    2220
    2321/* 
     
    3937*/
    4038
     39    const unsigned packSize = b->getSizeTy()->getBitWidth();
    4140    Constant * const ZERO = b->getSize(0);
    4241    Constant * const ONE = b->getSize(1);
     
    101100    //Value * const packPtr = b->CreateGEP(b->CreatePointerCast(groupPtr, packPtrTy), packOffset);
    102101    //Value * const packBits = b->CreateLoad(packPtr);
    103     Value * const packCount = b->CreatePopcount(packBits);
     102    Value * const packCount = b->CreateZExtOrTrunc(b->CreatePopcount(packBits), b->getSizeTy());
    104103    Value * const observedUpTo = b->CreateNUWAdd(observed, packCount);
    105104
     
    173172    b->setProducedItemCount("uptoN", producedCount);
    174173
    175     return numOfStrides;
     174}
     175
     176unsigned LLVM_READNONE calculateRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
     177    const unsigned packSize = b->getSizeTy()->getBitWidth();
     178    return (packSize * packSize) / b->getBitBlockWidth();
    176179}
    177180
    178181UntilNkernel::UntilNkernel(const std::unique_ptr<kernel::KernelBuilder> & b)
    179 : MultiBlockKernel("UntilN",
     182: MultiBlockKernel("UntilN_" + std::to_string(calculateRate(b)),
    180183// inputs
    181 {Binding{b->getStreamSetTy(), "bits", FixedRate((packSize * packSize) / b->getBitBlockWidth())}},
     184{Binding{b->getStreamSetTy(), "bits", FixedRate(calculateRate(b))}},
    182185// outputs
    183 {Binding{b->getStreamSetTy(), "uptoN", BoundedRate(0, (packSize * packSize) / b->getBitBlockWidth())}},
     186{Binding{b->getStreamSetTy(), "uptoN", BoundedRate(0, calculateRate(b))}},
    184187// input scalar
    185188{Binding{b->getSizeTy(), "N"}}, {},
  • icGREP/icgrep-devel/icgrep/kernels/until_n.h

    r5830 r5831  
    1414    UntilNkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    1515private:
    16     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
     16    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
    1717
    1818};
  • icGREP/icgrep-devel/icgrep/pablo/pablo_compiler.cpp

    r5828 r5831  
    4242using TypeId = PabloAST::ClassTypeId;
    4343
    44 inline static unsigned getAlignment(const Value * const type) {
    45     return type->getType()->getPrimitiveSizeInBits() / 8;
     44inline static unsigned getAlignment(const Type * const type) {
     45    return type->getPrimitiveSizeInBits() / 8;
     46}
     47
     48inline static unsigned getAlignment(const Value * const expr) {
     49    return getAlignment(expr->getType());
    4650}
    4751
    4852inline static unsigned getPointerElementAlignment(const Value * const ptr) {
    49     return ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits() / 8;
     53    return getAlignment(ptr->getType()->getPointerElementType());
    5054}
    5155
     
    672676                    lhvStreamIndex = compileExpression(b, cast<Extract>(lh)->getIndex());
    673677                } else {
    674                     baseLhv = compileExpression(b, lh, false);
     678                    baseLhv = compileExpression(b, lh);
    675679                }
    676680
     
    682686                    rhvStreamIndex = compileExpression(b, cast<Extract>(rh)->getIndex());
    683687                } else {
    684                     baseRhv = compileExpression(b, rh, false);
     688                    baseRhv = compileExpression(b, rh);
    685689                }
    686690
     
    688692
    689693                if (LLVM_UNLIKELY(typeId == TypeId::Add || typeId == TypeId::Subtract)) {
    690 
    691 
    692694
    693695                    value = b->CreateAlloca(vTy, b->getInt32(n));
     
    700702                        } else {
    701703                            lhv = getPointerToVar(b, cast<Var>(lh), lhvStreamIndex, index);
    702                             lhv = b->CreateAlignedLoad(lhv, getAlignment(lhv));
     704                            lhv = b->CreateBlockAlignedLoad(lhv);
    703705                        }
    704706                        lhv = b->CreateBitCast(lhv, vTy);
     
    709711                        } else {
    710712                            rhv = getPointerToVar(b, cast<Var>(rh), rhvStreamIndex, index);
    711                             rhv = b->CreateAlignedLoad(rhv, getAlignment(rhv));
     713                            rhv = b->CreateBlockAlignedLoad(rhv);
    712714                        }
    713715                        rhv = b->CreateBitCast(rhv, vTy);
     
    716718                        if (typeId == TypeId::Add) {
    717719                            result = b->CreateAdd(lhv, rhv);
    718                         } else {
     720                        } else { // if (typeId == TypeId::Subtract) {
    719721                            result = b->CreateSub(lhv, rhv);
    720722                        }
    721723                        b->CreateAlignedStore(result, b->CreateGEP(value, {b->getInt32(0), b->getInt32(i)}), getAlignment(result));
    722724                    }
    723 
    724 
    725725
    726726                } else {
     
    735735                        } else {
    736736                            lhv = getPointerToVar(b, cast<Var>(lh), lhvStreamIndex, index);
    737                             lhv = b->CreateAlignedLoad(lhv, getAlignment(lhv));
     737                            lhv = b->CreateBlockAlignedLoad(lhv);
    738738                        }
    739739                        lhv = b->CreateBitCast(lhv, vTy);
     
    744744                        } else {
    745745                            rhv = getPointerToVar(b, cast<Var>(rh), rhvStreamIndex, index);
    746                             rhv = b->CreateAlignedLoad(rhv, getAlignment(rhv));
     746                            rhv = b->CreateBlockAlignedLoad(rhv);
    747747                        }
    748748                        rhv = b->CreateBitCast(rhv, vTy);
     
    764764                            default: llvm_unreachable("invalid vector operator id");
    765765                        }
    766                         Value * const mask = b->CreateBitCast(b->hsimd_signmask(n, comp), fw);
     766                        Value * const mask = b->CreateZExtOrTrunc(b->hsimd_signmask(n, comp), fw);
    767767                        value = b->mvmd_insert(m, value, mask, i);
    768768                    }
Note: See TracChangeset for help on using the changeset viewer.