Changeset 5830


Ignore:
Timestamp:
Jan 14, 2018, 1:57:43 AM (5 months ago)
Author:
nmedfort
Message:

UntilN kernel rewritten to use the new MultiBlock system

Location:
icGREP/icgrep-devel/icgrep
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_builder.cpp

    r5828 r5830  
    538538
    539539Value * IDISA_Builder::bitblock_mask_from(Value * pos) {
    540     Type * bitBlockInt = getIntNTy(getBitBlockWidth());
    541     return bitCast(CreateShl(ConstantInt::getAllOnesValue(bitBlockInt), CreateZExt(pos, bitBlockInt)));
    542    
    543 }
     540    Type * const ty = getIntNTy(getBitBlockWidth());
     541    Constant * const ONES = ConstantInt::getAllOnesValue(ty);
     542    Constant * const ZEROES = ConstantInt::getNullValue(ty);
     543    Constant * const BIT_BLOCK_WIDTH = ConstantInt::get(pos->getType(), getBitBlockWidth());
     544    Value * const mask = CreateSelect(CreateICmpULT(pos, BIT_BLOCK_WIDTH), CreateShl(ONES, CreateZExt(pos, ty)), ZEROES);
     545    return bitCast(mask);
     546}
     547
    544548Value * IDISA_Builder::bitblock_set_bit(Value * pos) {
    545     Type * bitBlockInt = getIntNTy(getBitBlockWidth());
    546     return bitCast(CreateShl(ConstantInt::get(bitBlockInt, 1), CreateZExt(pos, bitBlockInt)));
     549    Type * const ty = getIntNTy(getBitBlockWidth());
     550    return bitCast(CreateShl(ConstantInt::get(ty, 1), CreateZExt(pos, ty)));
    547551}
    548552
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r5827 r5830  
    401401}
    402402
    403 Value * KernelBuilder::getInputStreamBlockPtr(const std::string & name, Value * streamIndex) {
    404     Value * const addr = mKernel->getStreamSetInputAddress(name);
    405     if (addr) {
    406         return CreateGEP(addr, {getInt32(0), streamIndex});
    407     } else {
    408         const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    409         Value * const blockIndex = CreateLShr(getProcessedItemCount(name), std::log2(getBitBlockWidth()));
    410         return buf->getStreamBlockPtr(this, getStreamHandle(name), getBaseAddress(name), streamIndex, blockIndex, true);
    411     }
    412 }
    413 
    414403Value * KernelBuilder::loadInputStreamBlock(const std::string & name, Value * streamIndex) {
    415404    return CreateBlockAlignedLoad(getInputStreamBlockPtr(name, streamIndex));
     
    448437}
    449438
    450 Value * KernelBuilder::getOutputStreamBlockPtr(const std::string & name, Value * streamIndex) {
     439Value * KernelBuilder::getOutputStreamBlockPtr(const std::string & name, Value * streamIndex, Value * const blockOffset) {
    451440    Value * const addr = mKernel->getStreamSetOutputAddress(name);
    452441    if (addr) {
    453         return CreateGEP(addr, {getInt32(0), streamIndex});
     442        return CreateGEP(addr, {blockOffset, streamIndex});
    454443    } else {
    455444        const StreamSetBuffer * const buf = mKernel->getOutputStreamSetBuffer(name);
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.h

    r5793 r5830  
    7171    // use in implementing kernels.
    7272
    73     llvm::Value * getInputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex);
     73    llvm::Value * getInputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex) {
     74        return getInputStreamBlockPtr(name, streamIndex, getInt32(0));
     75    }
     76
     77    llvm::Value * getInputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex, llvm::Value * blockOffset);
    7478
    7579    llvm::Value * loadInputStreamBlock(const std::string & name, llvm::Value * streamIndex);
     
    8185    llvm::Value * getInputStreamSetCount(const std::string & name);
    8286
    83     llvm::Value * getOutputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex);
     87    llvm::Value * getOutputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex) {
     88        return getOutputStreamBlockPtr(name, streamIndex, getInt32(0));
     89    }
     90
     91    llvm::Value * getOutputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex, llvm::Value * blockOffset);
    8492
    8593    llvm::StoreInst * storeOutputStreamBlock(const std::string & name, llvm::Value * streamIndex, llvm::Value * toStore);
     
    9098
    9199    llvm::Value * getOutputStreamSetCount(const std::string & name);
    92 
    93     llvm::Value * getInputStreamBlockPtr(const std::string & name, llvm::Value * streamIndex, llvm::Value * blockOffset);
    94100
    95101    llvm::Value * getRawInputPointer(const std::string & name, llvm::Value * absolutePosition);
  • icGREP/icgrep-devel/icgrep/kernels/until_n.cpp

    r5755 r5830  
    88#include <kernels/kernel_builder.h>
    99#include <kernels/streamset.h>
     10#include <toolchain/toolchain.h>
    1011
    1112namespace llvm { class Type; }
     
    1819const unsigned packSize = 64;
    1920   
    20 Value * UntilNkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
     21llvm::Value * UntilNkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) {
     22
    2123/* 
    2224   Strategy:  first form an index consisting of one bit per packsize input positions,
     
    3638   input stream.
    3739*/
    38    
    39     BasicBlock * entry = kb->GetInsertBlock();
    40    
    41     BasicBlock * processGroups = kb->CreateBasicBlock("processGroups");
    42     BasicBlock * processBlockGroup = kb->CreateBasicBlock("processBlockGroup");
    43     BasicBlock * doScan = kb->CreateBasicBlock("doScan");
    44     BasicBlock * scanLoop = kb->CreateBasicBlock("scanLoop");
    45     BasicBlock * continueScanLoop = kb->CreateBasicBlock("continueScanLoop");
    46     BasicBlock * scanDone = kb->CreateBasicBlock("scanDone");
    47     BasicBlock * notFoundYet = kb->CreateBasicBlock("notFoundYet");
    48     BasicBlock * findNth = kb->CreateBasicBlock("findNth");
    49     BasicBlock * getPosnAfterNth = kb->CreateBasicBlock("getPosnAfterNth");
    50     BasicBlock * nthPosFound = kb->CreateBasicBlock("nthPosFound");
    51     BasicBlock * doSegmentReturn = kb->CreateBasicBlock("doSegmentReturn");
    52     Constant * blockSize = kb->getSize(kb->getBitBlockWidth());
    53     Constant * blockSizeLess1 = kb->getSize(kb->getBitBlockWidth() - 1);
    54     Constant * packsPerBlock = kb->getSize(kb->getBitBlockWidth()/packSize);
    55    
    56     Value * N = kb->getScalarField("N");
    57    
    58     // Set up the types for processing by pack.
    59     Type * iPackTy = kb->getIntNTy(packSize);
    60     Type * iPackPtrTy = iPackTy->getPointerTo();
    61    
    62 //    Function::arg_iterator args = mCurrentMethod->arg_begin();
    63 //    /* self = */ args++;
    64 //    Value * itemsToDo = &*(args++);
    65 //    Value * sourceBitstream = &*(args++);
    66 //    Value * uptoN_bitstream = &*(args);
    67    
    68     Value * itemsToDo = mAvailableItemCount[0];
    69     Value * sourceBitstream = kb->getInputStreamBlockPtr("bits", kb->getInt32(0)); // mStreamBufferPtr[0];
    70     Value * uptoN_bitstream = kb->getInputStreamBlockPtr("uptoN", kb->getInt32(0)); // mStreamBufferPtr[1];
    7140
    72     // Compute the ceiling of the number of blocks to do.  If we have a final
    73     // partial block, it is treated as a full block initially.   
    74     Value * blocksToDo = kb->CreateUDiv(kb->CreateAdd(itemsToDo, blockSizeLess1), blockSize);
    75    
    76     // We will create a bitmask of size packSize with one bit for every packSize positions.
    77     // The index can accommodate blocksPerGroup blocks.
    78     Constant * blocksPerGroup = kb->getSize(packSize/((kb->getBitBlockWidth()/packSize)));
    79     kb->CreateCondBr(kb->CreateICmpUGT(blocksToDo, kb->getSize(0)), processGroups, notFoundYet);
    80    
    81     // Each iteration of the outerloop processes one blockGroup of at most blocksPerGroup.
    82     kb->SetInsertPoint(processGroups);
    83     PHINode * blockGroupBase = kb->CreatePHI(kb->getSizeTy(), 2);
    84     blockGroupBase->addIncoming(kb->getSize(0), entry);
    85     Value * groupPackPtr = kb->CreatePointerCast(kb->CreateGEP(sourceBitstream, blockGroupBase), iPackPtrTy);
    86     Value * blockGroupLimit = kb->CreateAdd(blockGroupBase, blocksPerGroup);
    87     blockGroupLimit = kb->CreateSelect(kb->CreateICmpULT(blockGroupLimit, blocksToDo), blockGroupLimit, blocksToDo);
    88     kb->CreateBr(processBlockGroup);
     41    Constant * const ZERO = b->getSize(0);
     42    Constant * const ONE = b->getSize(1);
     43    const auto packsPerBlock = b->getBitBlockWidth() / packSize;
     44    Constant * const PACK_SIZE = b->getSize(packSize);
     45    Constant * const PACKS_PER_BLOCK = b->getSize(packsPerBlock);
     46    Value * const ZEROES = b->allZeroes();
     47    Type * packTy = b->getIntNTy(packSize);
    8948
    90     // Outer loop processes the blocksToDo in groups of up to blocksPerGroup at a time.
    91     // The bitmask for this group is assembled.
    92     kb->SetInsertPoint(processBlockGroup);
    93     PHINode * blockNo = kb->CreatePHI(kb->getSizeTy(), 2);
    94     PHINode * groupMask = kb->CreatePHI(iPackTy, 2);
    95     blockNo->addIncoming(blockGroupBase, processGroups);
    96     groupMask->addIncoming(ConstantInt::getNullValue(iPackTy), processGroups);
     49    BasicBlock * const entry = b->GetInsertBlock();
     50    BasicBlock * const strideLoop = b->CreateBasicBlock("strideLoop");
    9751
    98     Value * blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(sourceBitstream, {blockNo, kb->getInt32(0)}));
    99     kb->CreateBlockAlignedStore(blk, kb->CreateGEP(uptoN_bitstream, {blockNo, kb->getInt32(0)}));
    100     Value * hasbit = kb->simd_ugt(packSize, blk, kb->allZeroes());
    101     Value * blockMask = kb->CreateZExtOrTrunc(kb->hsimd_signmask(packSize, hasbit), iPackTy);
    102     Value * nextBlockNo = kb->CreateAdd(blockNo, kb->getSize(1));
    103     Value * blockMaskPosition = kb->CreateMul(kb->CreateSub(blockNo, blockGroupBase), packsPerBlock);
    104     Value * nextgroupMask = kb->CreateOr(groupMask, kb->CreateShl(blockMask, blockMaskPosition));
    105     blockNo->addIncoming(nextBlockNo, processBlockGroup);
    106     groupMask->addIncoming(nextgroupMask, processBlockGroup);
    107     kb->CreateCondBr(kb->CreateICmpULT(nextBlockNo, blockGroupLimit), processBlockGroup, doScan);
     52    b->CreateBr(strideLoop);
     53    b->SetInsertPoint(strideLoop);
     54    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
     55    strideIndex->addIncoming(ZERO, entry);
    10856
    109     // The index pack has been assembled - process the corresponding blocks.
    110     kb->SetInsertPoint(doScan);
    111     Value * seenSoFar = kb->getScalarField("seenSoFar");
    112     kb->CreateCondBr(kb->CreateICmpUGT(nextgroupMask, ConstantInt::getNullValue(iPackTy)), scanLoop, scanDone);
    113    
    114     kb->SetInsertPoint(scanLoop);
    115     PHINode * groupMaskPhi = kb->CreatePHI(iPackTy, 2);
    116     groupMaskPhi->addIncoming(nextgroupMask, doScan);
    117     PHINode * seenSoFarPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    118     seenSoFarPhi->addIncoming(seenSoFar, doScan);
    119     Value * nonZeroPack = kb->CreateZExtOrTrunc(kb->CreateCountForwardZeroes(groupMaskPhi), kb->getSizeTy());
    120     Value * scanMask = kb->CreateLoad(kb->CreateGEP(groupPackPtr, nonZeroPack));
    121     Value * packCount = kb->CreateZExtOrTrunc(kb->CreatePopcount(scanMask), kb->getSizeTy());
    122     Value * newTotalSeen = kb->CreateAdd(packCount, seenSoFarPhi);
    123     Value * seenLessThanN = kb->CreateICmpULT(newTotalSeen, N);
    124     kb->CreateCondBr(seenLessThanN, continueScanLoop, findNth);
     57    const auto n = (packSize * packSize) / b->getBitBlockWidth();
     58    Value * groupMask = nullptr;
     59    Value * const baseOffset = b->CreateMul(strideIndex, b->getSize(n));
     60    for (unsigned i = 0; i < n; ++i) {
     61        Value * offset = b->CreateNUWAdd(baseOffset, b->getSize(i));
     62        Value * inputPtr = b->getInputStreamBlockPtr("bits", ZERO, offset);
     63        Value * inputValue = b->CreateBlockAlignedLoad(inputPtr);
     64        Value * outputPtr = b->getOutputStreamBlockPtr("uptoN", ZERO, offset);
     65        b->CreateBlockAlignedStore(inputValue, outputPtr);
     66        Value * markers = b->CreateNot(b->simd_eq(packSize, inputValue, ZEROES));
     67        Value * blockMask = b->CreateZExtOrTrunc(b->hsimd_signmask(packSize, markers), packTy);
     68        if (i) {
     69            blockMask = b->CreateShl(blockMask, i * packsPerBlock);
     70            groupMask = b->CreateOr(groupMask, blockMask);
     71        } else {
     72            groupMask = blockMask;
     73        }
     74    }
    12575
    126     kb->SetInsertPoint(continueScanLoop);
    127     Value * reducedGroupMask = kb->CreateResetLowestBit(groupMaskPhi);
    128     groupMaskPhi->addIncoming(reducedGroupMask, continueScanLoop);
    129     seenSoFarPhi->addIncoming(newTotalSeen, continueScanLoop);
    130     kb->CreateCondBr(kb->CreateICmpUGT(reducedGroupMask, ConstantInt::getNullValue(iPackTy)), scanLoop, scanDone);
     76    BasicBlock * const processGroups = b->CreateBasicBlock("processGroups");
     77    BasicBlock * const nextStride = b->CreateBasicBlock("nextStride");
    13178
    132     // Now we have processed the group of blocks and updated the number of positions
    133     // seenSoFar without finding the Nth bit. 
    134     kb->SetInsertPoint(scanDone);
    135     PHINode * newTotalSeenPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    136     newTotalSeenPhi->addIncoming(seenSoFar, doScan);
    137     newTotalSeenPhi->addIncoming(newTotalSeen, continueScanLoop);
    138     kb->setScalarField("seenSoFar", newTotalSeenPhi);
    139     blockGroupBase->addIncoming(nextBlockNo, scanDone);
    140     kb->CreateCondBr(kb->CreateICmpULT(nextBlockNo, blocksToDo), processGroups, notFoundYet);
     79    b->CreateLikelyCondBr(b->CreateIsNull(groupMask), nextStride, processGroups);
    14180
    142     kb->SetInsertPoint(notFoundYet);
    143     // Now we have determined that the Nth bit has not been found in the entire
    144     // set of itemsToDo.
    145    
    146     Value * finalCount = kb->CreateAdd(kb->getProducedItemCount("uptoN"), itemsToDo);
    147     kb->setProducedItemCount("uptoN", finalCount);
    148     kb->CreateBr(doSegmentReturn);
     81    b->SetInsertPoint(processGroups);
     82    Value * const N = b->getScalarField("N");
     83    Value * const initiallyObserved = b->getScalarField("observed");
     84    BasicBlock * const processGroup = b->CreateBasicBlock("processGroup");
     85    b->CreateBr(processGroup);
    14986
    150     //
    151     // With the last input scanMask loaded, the count of one bits seen reaches or
    152     // exceeds N.  Determine the position immediately after the Nth one bit.
    153     //
    154     kb->SetInsertPoint(findNth);
    155     PHINode * seen1 = kb->CreatePHI(kb->getSizeTy(), 2);
    156     seen1->addIncoming(seenSoFarPhi, scanLoop);
    157     PHINode * remainingBits = kb->CreatePHI(iPackTy, 2);
    158     remainingBits->addIncoming(scanMask, scanLoop);
    159     Value * clearLowest = kb->CreateResetLowestBit(remainingBits);
    160     Value * oneMoreSeen = kb->CreateAdd(seen1, kb->getSize(1));
    161     seen1->addIncoming(oneMoreSeen, findNth);
    162     remainingBits->addIncoming(clearLowest, findNth);
    163     kb->CreateCondBr(kb->CreateICmpULT(oneMoreSeen, N), findNth, getPosnAfterNth);
     87    b->SetInsertPoint(processGroup);
     88    PHINode * const observed = b->CreatePHI(initiallyObserved->getType(), 2);
     89    observed->addIncoming(initiallyObserved, processGroups);
     90    PHINode * const groupMarkers = b->CreatePHI(groupMask->getType(), 2);
     91    groupMarkers->addIncoming(groupMask, processGroups);
    16492
    165     //
    166     // We have cleared the low bits of scanMask up to and including the Nth in the stream.
    167     kb->SetInsertPoint(getPosnAfterNth);
    168     Value * scanMaskUpToN = kb->CreateXor(scanMask, clearLowest);
    169     Value * posnInPack = kb->CreateSub(ConstantInt::get(iPackTy, packSize-1), kb->CreateCountReverseZeroes(scanMaskUpToN));
    170     Value * posnInGroup = kb->CreateAdd(kb->CreateMul(nonZeroPack, kb->getSize(packSize)), posnInPack);
    171     Value * posnInItemsToDo = kb->CreateAdd(kb->CreateMul(blockGroupBase, blockSize), posnInGroup);
    172     // It is conceivable that we found a bit at a position beyond the given itemsToDo,
    173     // when we have a partial pack at the end of input.  In this case, the Nth bit does
    174     // not exist in the valid range of itemsToDo.
    175     kb->CreateCondBr(kb->CreateICmpUGE(posnInItemsToDo, itemsToDo), notFoundYet, nthPosFound);
    176    
    177     kb->SetInsertPoint(nthPosFound);
    178     Value * itemsToKeep = kb->CreateAdd(posnInItemsToDo, kb->getSize(1));
    179     finalCount = kb->CreateAdd(kb->getProcessedItemCount("bits"), itemsToKeep);
    180     Value * finalBlock = kb->CreateUDiv(itemsToKeep, blockSize);
    181     blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(sourceBitstream, {finalBlock, kb->getInt32(0)}));
    182     blk = kb->CreateAnd(blk, kb->CreateNot(kb->bitblock_mask_from(kb->CreateURem(itemsToKeep, blockSize))));
    183     Value * outputPtr = kb->CreateGEP(uptoN_bitstream, {finalBlock, kb->getInt32(0)});
    184     kb->CreateBlockAlignedStore(blk, outputPtr);
    185     kb->setProcessedItemCount("bits", finalCount);
    186     kb->setProducedItemCount("uptoN", finalCount);
    187     kb->setTerminationSignal();
    188     kb->CreateBr(doSegmentReturn);
    189    
    190     kb->SetInsertPoint(doSegmentReturn);
     93    Value * const groupIndex = b->CreateZExtOrTrunc(b->CreateCountForwardZeroes(groupMarkers), b->getSizeTy());
     94    Value * const blockIndex = b->CreateNUWAdd(baseOffset, b->CreateUDiv(groupIndex, PACKS_PER_BLOCK));
     95    Value * const packOffset = b->CreateURem(groupIndex, PACKS_PER_BLOCK);
     96    Value * const groupPtr = b->getInputStreamBlockPtr("bits", ZERO, blockIndex);
     97    Value * const groupValue = b->CreateBlockAlignedLoad(groupPtr);
     98    Value * const packBits = b->CreateExtractElement(groupValue, packOffset);
     99
     100    //Type * packPtrTy = packTy->getPointerTo();
     101    //Value * const packPtr = b->CreateGEP(b->CreatePointerCast(groupPtr, packPtrTy), packOffset);
     102    //Value * const packBits = b->CreateLoad(packPtr);
     103    Value * const packCount = b->CreatePopcount(packBits);
     104    Value * const observedUpTo = b->CreateNUWAdd(observed, packCount);
     105
     106    BasicBlock * const haveNotSeenEnough = b->CreateBasicBlock("haveNotSeenEnough");
     107    BasicBlock * const seenNOrMore = b->CreateBasicBlock("seenNOrMore");
     108    b->CreateLikelyCondBr(b->CreateICmpULT(observedUpTo, N), haveNotSeenEnough, seenNOrMore);
     109
     110    // update our kernel state and check whether we have any other groups to process
     111    b->SetInsertPoint(haveNotSeenEnough);
     112    observed->addIncoming(observedUpTo, haveNotSeenEnough);
     113    b->setScalarField("observed", observedUpTo);
     114    Value * const remainingGroupMarkers = b->CreateResetLowestBit(groupMarkers);
     115    groupMarkers->addIncoming(remainingGroupMarkers, haveNotSeenEnough);
     116    b->CreateLikelyCondBr(b->CreateIsNull(remainingGroupMarkers), nextStride, processGroup);
     117
     118    // we've seen N non-zero items; determine the position of our items and clear any subsequent markers
     119    b->SetInsertPoint(seenNOrMore);
     120    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     121        b->CreateAssert(b->CreateICmpUGT(N, observed), "N must be greater than observed count!");
     122    }
     123    Value * const bitsToFind = b->CreateNUWSub(N, observed);
     124    BasicBlock * const findNthBit = b->CreateBasicBlock("findNthBit");
     125    BasicBlock * const foundNthBit = b->CreateBasicBlock("foundNthBit");
     126    b->CreateBr(findNthBit);
     127
     128    b->SetInsertPoint(findNthBit);
     129    PHINode * const remainingPositions = b->CreatePHI(bitsToFind->getType(), 2);
     130    remainingPositions->addIncoming(bitsToFind, seenNOrMore);
     131    PHINode * const remainingBits = b->CreatePHI(packBits->getType(), 2);
     132    remainingBits->addIncoming(packBits, seenNOrMore);
     133    Value * const nextRemainingPositions = b->CreateNUWSub(remainingPositions, ONE);
     134    remainingPositions->addIncoming(nextRemainingPositions, findNthBit);
     135    Value * const nextRemainingBits = b->CreateResetLowestBit(remainingBits);
     136    remainingBits->addIncoming(nextRemainingBits, findNthBit);
     137
     138    b->CreateLikelyCondBr(b->CreateIsNull(nextRemainingPositions), foundNthBit, findNthBit);
     139
     140    // If we've found the n-th bit, end the segment after clearing the markers
     141    b->SetInsertPoint(foundNthBit);
     142    Value * const inputPtr = b->getInputStreamBlockPtr("bits", ZERO, blockIndex);
     143    Value * const inputValue = b->CreateBlockAlignedLoad(inputPtr);
     144    Value * const packPosition = b->CreateZExtOrTrunc(b->CreateCountForwardZeroes(remainingBits), b->getSizeTy());
     145    Value * const basePosition = b->CreateNUWMul(packOffset, PACK_SIZE);
     146    Value * const blockOffset = b->CreateNUWAdd(b->CreateOr(basePosition, packPosition), ONE);
     147    Value * const mask = b->CreateNot(b->bitblock_mask_from(blockOffset));
     148    Value * const maskedInputValue = b->CreateAnd(inputValue, mask);
     149    Value * const outputPtr = b->getOutputStreamBlockPtr("uptoN", ZERO, blockIndex);
     150    b->CreateBlockAlignedStore(maskedInputValue, outputPtr);
     151    Value * const positionOfNthItem = b->CreateNUWAdd(b->CreateMul(blockIndex, b->getSize(b->getBitBlockWidth())), blockOffset);
     152    b->setTerminationSignal();
     153    BasicBlock * const segmentDone = b->CreateBasicBlock("segmentDone");
     154    b->CreateBr(segmentDone);
     155
     156    nextStride->moveAfter(foundNthBit);
     157
     158    b->SetInsertPoint(nextStride);
     159    Value * const nextStrideIndex = b->CreateNUWAdd(strideIndex, ONE);
     160    strideIndex->addIncoming(nextStrideIndex, nextStride);
     161    b->CreateLikelyCondBr(b->CreateICmpEQ(nextStrideIndex, numOfStrides), segmentDone, strideLoop);
     162
     163    Constant * const FULL_STRIDE = b->getSize(packSize * packSize);
     164
     165    b->SetInsertPoint(segmentDone);
     166    PHINode * const produced = b->CreatePHI(b->getSizeTy(), 2);
     167    produced->addIncoming(positionOfNthItem, foundNthBit);
     168    produced->addIncoming(FULL_STRIDE, nextStride);
     169
     170    Value * producedCount = b->getProducedItemCount("uptoN");
     171    producedCount = b->CreateNUWAdd(producedCount, b->CreateNUWMul(FULL_STRIDE, strideIndex));
     172    producedCount = b->CreateNUWAdd(producedCount, produced);
     173    b->setProducedItemCount("uptoN", producedCount);
     174
    191175    return numOfStrides;
    192176}
    193177
    194 UntilNkernel::UntilNkernel(const std::unique_ptr<kernel::KernelBuilder> & kb)
    195 : MultiBlockKernel("UntilN", {Binding{kb->getStreamSetTy(1, 1), "bits"}},
    196                              {Binding{kb->getStreamSetTy(1, 1), "uptoN", BoundedRate(0, 1)}},
    197                              {Binding{kb->getSizeTy(), "N"}}, {},
    198                              {Binding{kb->getSizeTy(), "seenSoFar"}}) {
     178UntilNkernel::UntilNkernel(const std::unique_ptr<kernel::KernelBuilder> & b)
     179: MultiBlockKernel("UntilN",
     180// inputs
     181{Binding{b->getStreamSetTy(), "bits", FixedRate((packSize * packSize) / b->getBitBlockWidth())}},
     182// outputs
     183{Binding{b->getStreamSetTy(), "uptoN", BoundedRate(0, (packSize * packSize) / b->getBitBlockWidth())}},
     184// input scalar
     185{Binding{b->getSizeTy(), "N"}}, {},
     186// internal state
     187{Binding{b->getSizeTy(), "observed"}}) {
     188
    199189}
    200190
  • icGREP/icgrep-devel/icgrep/kernels/until_n.h

    r5755 r5830  
    66#define UNTIL_N_H
    77
    8 #include "kernel.h"  // for KernelBuilder
    9 namespace IDISA { class IDISA_Builder; }
     8#include "kernel.h"
    109
    1110namespace kernel {
    1211
    13 class UntilNkernel : public MultiBlockKernel {
     12class UntilNkernel final : public MultiBlockKernel {
    1413public:
    1514    UntilNkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder);
    1615private:
    17     llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;
     16    llvm::Value * generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
    1817
    1918};
Note: See TracChangeset for help on using the changeset viewer.