Ignore:
Timestamp:
Jun 9, 2017, 12:44:00 PM (2 years ago)
Author:
cameron
Message:

Fix for read_source kernel; stride attribute for multiblock kernels

Location:
icGREP/icgrep-devel/icgrep/kernels
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5493 r5497  
    655655    const unsigned outputSetCount = mStreamSetOutputs.size();
    656656    const unsigned totalSetCount = inputSetCount + outputSetCount;
     657   
     658    unsigned itemsPerStride[totalSetCount];
    657659    bool isDerived[totalSetCount];
    658     unsigned itemsPerPrincipalBlock[totalSetCount];
    659660   
    660     for (unsigned i = 0; i < inputSetCount; i++) {
     661    if (mStride == 0) mStride = bitBlockWidth;
     662
     663    itemsPerStride[0] = mStride;
     664    isDerived[0] = true;
     665   
     666    for (unsigned i = 1; i < inputSetCount; i++) {
    661667        auto & rate = mStreamSetInputs[i].rate;
    662668        std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
    663669        if (rate.isExact()) {
    664670            if (refSet.empty()) {
    665                 itemsPerPrincipalBlock[i] = rate.calculateRatio(bitBlockWidth);
     671                itemsPerStride[i] = rate.calculateRatio(itemsPerStride[0]);
    666672                isDerived[i] = true;
    667673                continue;
     
    672678                assert (port == Port::Input && ssIdx < i);
    673679                if (isDerived[ssIdx]) {
    674                     itemsPerPrincipalBlock[i] = rate.calculateRatio(itemsPerPrincipalBlock[ssIdx]);
     680                    itemsPerStride[i] = rate.calculateRatio(itemsPerStride[ssIdx]);
    675681                    isDerived[i] = true;
    676682                    continue;
     
    687693        if (rate.isExact() || rate.isMaxRatio()) {
    688694            if (refSet.empty()) {
    689                 itemsPerPrincipalBlock[i] = rate.calculateRatio(bitBlockWidth);
     695                itemsPerStride[i] = rate.calculateRatio(bitBlockWidth);
    690696                isDerived[i] = rate.isExact();
    691697                continue;
     
    696702                if (port == Port::Output) ssIdx += inputSetCount;
    697703                if (isDerived[ssIdx]) {
    698                     itemsPerPrincipalBlock[i] = rate.calculateRatio(itemsPerPrincipalBlock[ssIdx]);
     704                    itemsPerStride[i] = rate.calculateRatio(itemsPerStride[ssIdx]);
    699705                    isDerived[i] = rate.isExact();
    700706                    continue;
     
    709715    for (unsigned i = 0; i < totalSetCount; i++) {
    710716        if (isDerived[i]) {
    711             if (itemsPerPrincipalBlock[i] == bitBlockWidth) {
    712                 maxBlocksToCopy[i] = 1;
     717            if (itemsPerStride[i] % bitBlockWidth == 0) {
     718                maxBlocksToCopy[i] = itemsPerStride[i] / bitBlockWidth;
    713719            }
    714720            else {
    715721                // May not be block aligned, can overlap partial blocks at both ends.
    716                 maxBlocksToCopy[i] = itemsPerPrincipalBlock[i]/bitBlockWidth + 2;
     722                maxBlocksToCopy[i] = itemsPerStride[i]/bitBlockWidth + 2;
    717723            }
    718724        }
    719725        else {
    720             // For variable input stream sets, we make a single block of items
    721             // available, if possible, but this block could be nonaligned.
    722             maxBlocksToCopy[i] = 2;
     726            // For variable input stream sets, we make a single stride of items
     727            // available, if possible, but this stride could be nonaligned.
     728            maxBlocksToCopy[i] = mStride / bitBlockWidth + 2;
    723729        }
    724730    }
     
    805811
    806812    ConstantInt * blockSize = kb->getSize(kb->getBitBlockWidth());
    807 
     813    ConstantInt * strideSize = kb->getSize(mStride);
     814   
    808815    Value * availablePos = mAvailableItemCount[0];
    809816    Value * itemsAvail = availablePos;
     
    822829    Value * processed = kb->getProcessedItemCount(mStreamSetInputs[0].name);
    823830    Value * itemsToDo = kb->CreateSub(itemsAvail, processed);
    824     Value * fullBlocksToDo = kb->CreateUDiv(itemsToDo, blockSize);
    825     Value * excessItems = kb->CreateURem(itemsToDo, blockSize);
     831    Value * fullStridesToDo = kb->CreateUDiv(itemsToDo, strideSize);
     832    Value * excessItems = kb->CreateURem(itemsToDo, strideSize);
    826833
    827834    //  Now we iteratively process these blocks using the doMultiBlock method.
     
    835842    kb->CreateBr(doSegmentOuterLoop);
    836843    kb->SetInsertPoint(doSegmentOuterLoop);
    837     PHINode * const blocksRemaining = kb->CreatePHI(kb->getSizeTy(), 2, "blocksRemaining");
    838     blocksRemaining->addIncoming(fullBlocksToDo, entry);
     844    PHINode * const stridesRemaining = kb->CreatePHI(kb->getSizeTy(), 2, "stridesRemaining");
     845    stridesRemaining->addIncoming(fullStridesToDo, entry);
    839846
    840847    // For each input buffer, determine the processedItemCount, the block pointer for the
     
    849856    //  by limitations of linearly available input buffer space.
    850857
    851     Value * linearlyAvailBlocks = blocksRemaining;
     858    Value * linearlyAvailStrides = stridesRemaining;
    852859    for (unsigned i = 0; i < inputSetCount; i++) {
    853860        Value * p = kb->getProcessedItemCount(mStreamSetInputs[i].name);
     
    858865        if (isDerived[i]) {
    859866            auto & rate = mStreamSetInputs[i].rate;
    860             Value * blocks = nullptr;
     867            Value * maxReferenceItems = nullptr;
    861868            if ((rate.isFixedRatio()) && (rate.getRatioNumerator() == rate.getRatioDenominator()) && (rate.referenceStreamSet() == "")) {
    862                 blocks = mStreamSetInputBuffers[i]->getLinearlyAccessibleBlocks(kb.get(), blkNo);
     869                maxReferenceItems = kb->CreateMul(mStreamSetInputBuffers[i]->getLinearlyAccessibleBlocks(kb.get(), blkNo), blockSize);
    863870            } else {
    864871                Value * linearlyAvailItems = mStreamSetInputBuffers[i]->getLinearlyAccessibleItems(kb.get(), p);
    865                 Value * items = rate.CreateMaxReferenceItemsCalculation(kb.get(), linearlyAvailItems);
    866                 blocks = kb->CreateUDiv(items, blockSize);
     872                maxReferenceItems = rate.CreateMaxReferenceItemsCalculation(kb.get(), linearlyAvailItems);
    867873            }
    868             linearlyAvailBlocks = kb->CreateSelect(kb->CreateICmpULT(blocks, linearlyAvailBlocks), blocks, linearlyAvailBlocks);
     874            Value * maxStrides = kb->CreateUDiv(maxReferenceItems, strideSize);
     875            linearlyAvailStrides = kb->CreateSelect(kb->CreateICmpULT(maxStrides, linearlyAvailStrides), maxStrides, linearlyAvailStrides);
    869876        }
    870877    }
    871878    //  Now determine the linearly writeable blocks, based on available blocks reduced
    872879    //  by limitations of output buffer space.
    873     Value * linearlyWritableBlocks = linearlyAvailBlocks;
     880    Value * linearlyWritableStrides = linearlyAvailStrides;
    874881
    875882    for (unsigned i = 0; i < outputSetCount; i++) {
     
    881888        if (isDerived[inputSetCount + i]) {
    882889            auto & rate = mStreamSetOutputs[i].rate;
    883             Value * blocks = nullptr;
     890            Value * maxReferenceItems = nullptr;
    884891            if ((rate.isFixedRatio()) && (rate.getRatioNumerator() == rate.getRatioDenominator())) {
    885                 blocks = mStreamSetOutputBuffers[0]->getLinearlyWritableBlocks(kb.get(), blkNo);
     892                maxReferenceItems = kb->CreateMul(mStreamSetOutputBuffers[0]->getLinearlyWritableBlocks(kb.get(), blkNo), blockSize);
    886893            } else {
    887894                Value * writableItems = mStreamSetOutputBuffers[0]->getLinearlyWritableItems(kb.get(), p);
    888                 blocks = kb->CreateUDiv(writableItems, blockSize);
     895                maxReferenceItems = rate.CreateMaxReferenceItemsCalculation(kb.get(), writableItems);
    889896            }
    890             linearlyWritableBlocks = kb->CreateSelect(kb->CreateICmpULT(blocks, linearlyWritableBlocks), blocks, linearlyWritableBlocks);
    891         }
    892     }
    893 
    894     Value * haveBlocks = kb->CreateICmpUGT(linearlyWritableBlocks, kb->getSize(0));
    895     kb->CreateCondBr(haveBlocks, doMultiBlockCall, tempBlockCheck);
     897            Value * maxStrides = kb->CreateUDiv(maxReferenceItems, strideSize);
     898            linearlyWritableStrides = kb->CreateSelect(kb->CreateICmpULT(maxStrides, linearlyWritableStrides), maxStrides, linearlyWritableStrides);
     899        }
     900    }
     901
     902    Value * haveStrides = kb->CreateICmpUGT(linearlyWritableStrides, kb->getSize(0));
     903    kb->CreateCondBr(haveStrides, doMultiBlockCall, tempBlockCheck);
    896904
    897905    //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
     
    899907    kb->SetInsertPoint(doMultiBlockCall);
    900908
    901     Value * linearlyAvailItems = kb->CreateMul(linearlyWritableBlocks, blockSize);
     909    Value * linearlyAvailItems = kb->CreateMul(linearlyWritableStrides, strideSize);
    902910
    903911    std::vector<Value *> doMultiBlockArgs;
     
    964972    Value * nowProcessed = kb->CreateAdd(processedItemCount[0], linearlyAvailItems);
    965973    kb->setProcessedItemCount(mStreamSetInputs[0].name, nowProcessed);
    966     Value * reducedBlocksToDo = kb->CreateSub(blocksRemaining, linearlyWritableBlocks);
     974    Value * reducedStridesToDo = kb->CreateSub(stridesRemaining, linearlyWritableStrides);
    967975    BasicBlock * multiBlockFinal = kb->GetInsertBlock();
    968     blocksRemaining->addIncoming(reducedBlocksToDo, multiBlockFinal);
     976    stridesRemaining->addIncoming(reducedStridesToDo, multiBlockFinal);
    969977    kb->CreateBr(doSegmentOuterLoop);
    970978    //
     
    978986
    979987    kb->SetInsertPoint(tempBlockCheck);
    980     haveBlocks = kb->CreateICmpUGT(blocksRemaining, kb->getSize(0));
    981     kb->CreateCondBr(kb->CreateOr(mIsFinal, haveBlocks), doTempBufferBlock, segmentDone);
     988    haveStrides = kb->CreateICmpUGT(stridesRemaining, kb->getSize(0));
     989    kb->CreateCondBr(kb->CreateOr(mIsFinal, haveStrides), doTempBufferBlock, segmentDone);
    982990
    983991    kb->SetInsertPoint(doTempBufferBlock);
    984     Value * tempBlockItems = kb->CreateSelect(haveBlocks, blockSize, excessItems);
    985     Value * doFinal = kb->CreateNot(haveBlocks);
     992    Value * tempBlockItems = kb->CreateSelect(haveStrides, strideSize, excessItems);
     993    Value * doFinal = kb->CreateNot(haveStrides);
    986994
    987995    // Begin constructing the doMultiBlock args.
     
    9941002        if (!isDerived[i]) {
    9951003            Value * avail = kb->CreateSub(mAvailableItemCount[i], processedItemCount[i]);
    996             doMultiBlockArgs.push_back(kb->CreateSelect(kb->CreateICmpULT(avail, blockSize), avail, blockSize));
     1004            doMultiBlockArgs.push_back(kb->CreateSelect(kb->CreateICmpULT(avail, strideSize), avail, strideSize));
    9971005        }
    9981006    }
     
    10981106    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
    10991107    //
    1100     blocksRemaining->addIncoming(kb->CreateSub(blocksRemaining, kb->CreateZExt(haveBlocks, kb->getSizeTy())), kb->GetInsertBlock());
    1101     kb->CreateCondBr(haveBlocks, doSegmentOuterLoop, segmentDone);
     1108    stridesRemaining->addIncoming(kb->CreateSub(stridesRemaining, kb->CreateZExt(haveStrides, kb->getSizeTy())), kb->GetInsertBlock());
     1109    kb->CreateCondBr(haveStrides, doSegmentOuterLoop, segmentDone);
    11021110    kb->SetInsertPoint(segmentDone);
    11031111}
     
    11681176                                   std::vector<Binding> && scalar_outputs,
    11691177                                   std::vector<Binding> && internal_scalars)
    1170 : Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
    1171    
     1178: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
     1179, mStride(0) {
    11721180}
    11731181
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r5479 r5497  
    121121        return mStreamSetOutputBuffers[i];
    122122    }
    123 
     123   
    124124    virtual ~Kernel() = 0;
    125125
     
    335335#.  The Multi-Block Kernel Builder will arrange that these input parameters may be
    336336    processed under the following simplifying assumptions.
    337     * the number of itemsToDo will either be an exact multiple of the BlockSize,
    338       or, for processing the final block, a value less than BlockSize
     337    * the number of itemsToDo will either be an exact multiple of the kernel stride,
     338      or, for processing the final block, a value less than the kernel stride
    339339    * the input buffer of the principal stream set and all input buffers of stream sets
    340340      with derived processing rates will be safe to access and have data available in
     
    345345    * all output buffers will be safe to access and have space available
    346346      for the given maximum output generation rates based on the given number
    347       of blocksToDo of the principal input stream set; no further bounds checking
     347      of itemsToDo of the principal input stream set; no further bounds checking
    348348      is needed.
    349349    * for final block processing, all input buffers will be extended to be safely
     
    357357    * for any input pointer p, a GEP instruction with a single int32 index i
    358358      will produce a pointer to the buffer position corresponding to the ith block of the
    359       principal input stream set.
     359      input stream set.
    360360    * for any output stream set declared with a Fixed or Add1 processing rate with respect
    361361      to the principal input stream set, a GEP instruction with a single int32 index i
    362362      will produce a pointer to the buffer position corresponding to the ith block of the
    363       principal input stream set.
     363      stream set.
    364364
    365365#.  Upon completion of multi-block processing, the Multi-Block Kernel Builder will arrange that
     
    393393    //
    394394    virtual void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb) = 0;
     395   
     396    // Kernels typically perform block-at-a-time processing, but some kernels may require
     397    // a different stride.   In the case of multiblock kernels, the stride attribute
     398    // determines the minimum number of items that will be provided to the kernel
     399    // on each doMultiBlock call.
     400    //
     401   
     402    unsigned getKernelStride() const { return mStride;}
     403       
     404    void setKernelStride(unsigned stride) {mStride = stride;}
     405       
     406       
    395407
    396408private:
     409    size_t                            mStride;
     410
    397411
    398412    // Given a kernel subtype with an appropriate interface, the generateDoSegment
  • icGREP/icgrep-devel/icgrep/kernels/source_kernel.cpp

    r5493 r5497  
    241241    // Otherwise, allocate a buffer with twice the capacity and copy the unconsumed data back into it
    242242    iBuilder->SetInsertPoint(expandAndCopyBack);
     243
    243244    Value * const expandedCapacity = iBuilder->CreateShl(capacity, 1);
    244245    Value * const expandedBuffer = iBuilder->CreatePointerCast(iBuilder->CreateCacheAlignedMalloc(expandedCapacity), codeUnitPtrTy);
     
    255256    baseAddress->addIncoming(expandedBuffer, expandAndCopyBack);
    256257    Value * const modifiedPtr = iBuilder->CreateGEP(baseAddress, remaining);
    257     Value * const logicalAddress = iBuilder->CreateGEP(modifiedPtr, iBuilder->CreateNeg(iBuilder->CreateAnd(produced, alignmentMask)));
     258    Value * const logicalAddress = iBuilder->CreateGEP(baseAddress, iBuilder->CreateNeg(consumed));
    258259    iBuilder->setBaseAddress("sourceBuffer", logicalAddress);
    259260    iBuilder->CreateBr(readData);
  • icGREP/icgrep-devel/icgrep/kernels/streamset.cpp

    r5493 r5497  
    121121    } else {
    122122        Constant * bufSize = iBuilder->getSize(mBufferBlocks * iBuilder->getStride());
    123         return iBuilder->CreateSub(bufSize, iBuilder->CreateURem(fromPosition, bufSize));
     123        return iBuilder->CreateSub(bufSize, iBuilder->CreateURem(fromPosition, bufSize, "linearItems"));
    124124    }
    125125}
     
    127127Value * StreamSetBuffer::getLinearlyAccessibleBlocks(IDISA::IDISA_Builder * const iBuilder, Value * fromBlock) const {
    128128    Constant * bufBlocks = iBuilder->getSize(mBufferBlocks);
    129     return iBuilder->CreateSub(bufBlocks, iBuilder->CreateURem(fromBlock, bufBlocks));
     129    return iBuilder->CreateSub(bufBlocks, iBuilder->CreateURem(fromBlock, bufBlocks), "linearBlocks");
    130130}
    131131
Note: See TracChangeset for help on using the changeset viewer.