Ignore:
Timestamp:
Sep 7, 2017, 4:56:56 PM (21 months ago)
Author:
nmedfort
Message:

Partial check-in: avoid compiling Pablo/LLVM code merely to determine the Kernel struct type when a cached object is used. Also checks in the RE alternation minimization code, which is currently inactive.

Location:
icGREP/icgrep-devel/icgrep/kernels
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5623 r5630  
    111111}
    112112
    113 Module * Kernel::makeModule(const std::unique_ptr<KernelBuilder> & idb) {
    114     assert (mModule == nullptr);
    115     std::stringstream cacheName;   
     113std::string Kernel::getCacheName(const std::unique_ptr<KernelBuilder> & idb) const {
     114    std::stringstream cacheName;
    116115    cacheName << getName() << '_' << idb->getBuilderUniqueName();
    117116    for (const StreamSetBuffer * b: mStreamSetInputBuffers) {
     
    121120        cacheName <<  ':' <<  b->getUniqueID();
    122121    }
    123     mModule = new Module(cacheName.str(), idb->getContext());
    124     prepareKernel(idb);
     122    return cacheName.str();
     123}
     124
     // Attaches the given LLVM module to this kernel and returns it.
     // The assertions require a non-null module, and that the kernel either has no
     // module yet or already holds this very module (so repeating the call with the
     // same module is accepted).
     125Module * Kernel::setModule(Module * const module) {
     126    assert (mModule == nullptr || mModule == module);
     127    assert (module != nullptr);
     128    mModule = module;
    125129    return mModule;
    126130}
    127131
    128 Module * Kernel::setModule(const std::unique_ptr<KernelBuilder> & idb, llvm::Module * const module) {
    129     assert (mModule == nullptr);
    130     mModule = module;
    131     prepareKernel(idb);
    132     return mModule;
     132Module * Kernel::makeModule(const std::unique_ptr<kernel::KernelBuilder> & idb) {
     133    return setModule(new Module(getCacheName(idb), idb->getContext()));
    133134}
    134135
     
    145146    const auto requiredBlocks = codegen::SegmentSize + ((blockSize + mLookAheadPositions - 1) / blockSize);
    146147
     148    IntegerType * const sizeTy = idb->getSizeTy();
     149
    147150    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
    148151        if ((mStreamSetInputBuffers[i]->getBufferBlocks() != 0) && (mStreamSetInputBuffers[i]->getBufferBlocks() < requiredBlocks)) {
     
    151154        mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetInputs[i].name + BUFFER_PTR_SUFFIX);
    152155        if ((i == 0) || !mStreamSetInputs[i].rate.isExact()) {
    153             addScalar(idb->getSizeTy(), mStreamSetInputs[i].name + PROCESSED_ITEM_COUNT_SUFFIX);
    154         }
    155     }
    156 
    157     IntegerType * const sizeTy = idb->getSizeTy();
     156            addScalar(sizeTy, mStreamSetInputs[i].name + PROCESSED_ITEM_COUNT_SUFFIX);
     157        }
     158    }
     159
    158160    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    159161        mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetOutputs[i].name + BUFFER_PTR_SUFFIX);
     
    191193    // will be able to add instrumentation to cached modules without recompilation.
    192194    addScalar(idb->getInt64Ty(), CYCLECOUNT_SCALAR);
     195    addInternalKernelProperties(idb);
    193196    // NOTE: StructType::create always creates a new type even if an identical one exists.
    194     mKernelStateType = getModule()->getTypeByName(getName());
     197    if (LLVM_UNLIKELY(mModule == nullptr)) {
     198        setModule(new Module(getCacheName(idb), idb->getContext()));
     199    }
     200    mKernelStateType = mModule->getTypeByName(getName());
    195201    if (LLVM_LIKELY(mKernelStateType == nullptr)) {
    196202        mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
     
    198204    processingRateAnalysis();
    199205}
    200    
     206
     // Prepares a kernel whose module is already set (see the getModule() assertion below).
     // It registers the kernel's scalar fields and then looks the kernel state struct type up
     // by name in that module; it does not create the type itself.
     // NOTE(review): the order in which scalars are added presumably has to match the order
     // used when the cached module was originally built -- confirm against prepareKernel.
     207void Kernel::prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb) {
     208
     209    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
            // Preparing is only allowed while the state type has not been resolved yet.
     210    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
     211        report_fatal_error("Cannot prepare kernel after kernel state finalized");
     212    }
     213    assert (getModule());
     214    const auto blockSize = idb->getBitBlockWidth();
     215    if (mStride == 0) {
     216        // Set the default kernel stride.
     217        mStride = blockSize;
     218    }
     219    const auto requiredBlocks = codegen::SegmentSize + ((blockSize + mLookAheadPositions - 1) / blockSize);
     220
     221    IntegerType * const sizeTy = idb->getSizeTy();
     222    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
                // The diagnostic below is commented out, so an undersized input buffer
                // currently has no effect here.
     223        if ((mStreamSetInputBuffers[i]->getBufferBlocks() != 0) && (mStreamSetInputBuffers[i]->getBufferBlocks() < requiredBlocks)) {
     224            //report_fatal_error(getName() + ": " + mStreamSetInputs[i].name + " requires buffer size " + std::to_string(requiredBlocks));
     225        }
                // Each input stream set contributes a buffer-pointer scalar input.
     226        mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetInputs[i].name + BUFFER_PTR_SUFFIX);
                // A processed-item counter is added for the first input and for every input
                // whose rate is not exact.
     227        if ((i == 0) || !mStreamSetInputs[i].rate.isExact()) {
     228            addScalar(sizeTy, mStreamSetInputs[i].name + PROCESSED_ITEM_COUNT_SUFFIX);
     229        }
     230    }
     231
     232    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
                // Each output stream set likewise contributes a buffer-pointer scalar input.
     233        mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetOutputs[i].name + BUFFER_PTR_SUFFIX);
                // A produced-item counter is added for the first output when the kernel has no
                // inputs, and for every output whose rate is not exact.
     234        if ((mStreamSetInputs.empty() && (i == 0)) || !mStreamSetOutputs[i].rate.isExact()) {
     235            addScalar(sizeTy, mStreamSetOutputs[i].name + PRODUCED_ITEM_COUNT_SUFFIX);
     236        }
     237    }
            // Register the scalar inputs (including the buffer pointers appended above),
            // then the scalar outputs.
     238    for (const auto & binding : mScalarInputs) {
     239        addScalar(binding.type, binding.name);
     240    }
     241    for (const auto & binding : mScalarOutputs) {
     242        addScalar(binding.type, binding.name);
     243    }
     244    if (mStreamMap.empty()) {
     245        prepareStreamSetNameMap();
     246    }
     247    for (const auto & binding : mInternalScalars) {
     248        addScalar(binding.type, binding.name);
     249    }
     250
            // One consumer-set pointer scalar is added per output stream set.
     251    Type * const consumerSetTy = StructType::get(sizeTy, sizeTy->getPointerTo()->getPointerTo(), nullptr)->getPointerTo();
     252    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
     253        addScalar(consumerSetTy, mStreamSetOutputs[i].name + CONSUMER_SUFFIX);
     254    }
     255
     256    addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
     257    addScalar(idb->getInt1Ty(), TERMINATION_SIGNAL);
     258
     259    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
     260        addScalar(sizeTy, mStreamSetOutputs[i].name + CONSUMED_ITEM_COUNT_SUFFIX);
     261    }
     262
     263    // We compile in a 64-bit CPU cycle counter into every kernel.   It will remain unused
     264    // in normal execution, but when codegen::EnableCycleCounter is specified, pipelines
     265    // will be able to add instrumentation to cached modules without recompilation.
     266    addScalar(idb->getInt64Ty(), CYCLECOUNT_SCALAR);
            // The state type is only looked up by kernel name in the module; the assertion
            // fires if no type of that name is found.
     267    mKernelStateType = getModule()->getTypeByName(getName());
     268    assert (mKernelStateType);
     269    processingRateAnalysis();
     270}
    201271   
    202272void Kernel::processingRateAnalysis() {
     
    290360        const auto m = idb->getModule();
    291361        const auto ip = idb->saveIP();
    292         const auto saveInstance = getInstance();
     362        // const auto saveInstance = getInstance();
    293363        idb->setModule(mModule);
    294364        addKernelDeclarations(idb);
     
    296366        callGenerateDoSegmentMethod(idb);
    297367        callGenerateFinalizeMethod(idb);
    298         setInstance(saveInstance);
     368        // setInstance(saveInstance);
    299369        idb->setModule(m);
    300370        idb->restoreIP(ip);
     
    812882
    813883    Value * blockBaseMask = kb->CreateNot(kb->getSize(kb->getBitBlockWidth() - 1));
    814     //
    815     // Define and allocate the temporary buffer area.
    816     //
    817     Type * tempBuffers[totalSetCount];
    818     for (unsigned i = 0; i < totalSetCount; i++) {
    819         unsigned blocks = maxBlocksToCopy[i];
    820         Type * bufType = i < inputSetCount ? mStreamSetInputBuffers[i]->getStreamSetBlockType() : mStreamSetOutputBuffers[i -inputSetCount]->getStreamSetBlockType();
    821         if (blocks > 1) {
    822             tempBuffers[i] = ArrayType::get(bufType, blocks);
    823         }
    824         else {
    825             tempBuffers[i] = bufType;
    826         }
    827     }
    828     Type * tempParameterStructType = StructType::create(kb->getContext(), ArrayRef<Type *>(tempBuffers, totalSetCount), "tempBuf");
    829     Value * tempParameterArea = kb->CreateCacheAlignedAlloca(tempParameterStructType);
    830884    ConstantInt * blockSize = kb->getSize(kb->getBitBlockWidth());
    831885    ConstantInt * strideSize = kb->getSize(mStride);
     
    866920    // buffer block containing the next item, and the number of linearly available items.
    867921
    868     std::vector<Value *> processedItemCount;
    869     std::vector<Value *> inputBlockPtr;
     922    Value * processedItemCount[inputSetCount];
     923    Value * inputBlockPtr[inputSetCount];
    870924    std::vector<Value *> producedItemCount;
    871925    std::vector<Value *> outputBlockPtr;
     
    873927    //  Now determine the linearly available blocks, based on blocks remaining reduced
    874928    //  by limitations of linearly available input buffer space.
    875 
    876929    Value * linearlyAvailStrides = stridesRemaining;
    877930    for (unsigned i = 0; i < inputSetCount; i++) {
     
    879932        Value * blkNo = kb->CreateUDiv(p, blockSize);
    880933        Value * b = kb->getInputStreamBlockPtr(mStreamSetInputs[i].name, kb->getInt32(0));
    881         processedItemCount.push_back(p);
    882         inputBlockPtr.push_back(b);
     934        // processedItemCount.push_back(p);
     935        processedItemCount[i] = p;
     936        // inputBlockPtr.push_back(b);
     937        inputBlockPtr[i] = b;
    883938        auto & rate = mStreamSetInputs[i].rate;
    884939        if (rate.isUnknownRate()) continue;  // No calculation possible for unknown rates.
     
    894949        linearlyAvailStrides = kb->CreateSelect(kb->CreateICmpULT(maxStrides, linearlyAvailStrides), maxStrides, linearlyAvailStrides);
    895950    }
     951
    896952    //  Now determine the linearly writeable blocks, based on available blocks reduced
    897953    //  by limitations of output buffer space.
     
    915971        linearlyWritableStrides = kb->CreateSelect(kb->CreateICmpULT(maxStrides, linearlyWritableStrides), maxStrides, linearlyWritableStrides);
    916972    }
    917     Value * haveStrides = kb->CreateICmpUGT(linearlyWritableStrides, kb->getSize(0));
    918     kb->CreateCondBr(haveStrides, doMultiBlockCall, tempBlockCheck);
     973    Value * const haveFullStrides = kb->CreateICmpUGT(linearlyWritableStrides, kb->getSize(0));
     974    kb->CreateCondBr(haveFullStrides, doMultiBlockCall, tempBlockCheck);
    919975
    920976    //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
     
    9441000
    9451001    kb->CreateCall(multiBlockFunction, doMultiBlockArgs);
     1002
    9461003    // Do copybacks if necessary.
    9471004    unsigned priorIdx = 0;
    948     for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
    949         Value * log2BlockSize = kb->getSize(std::log2(kb->getBitBlockWidth()));
     1005    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {       
    9501006        if (auto cb = dyn_cast<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
     1007            Value * log2BlockSize = kb->getSize(std::log2(kb->getBitBlockWidth()));
    9511008            BasicBlock * copyBack = kb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
    9521009            BasicBlock * done = kb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
     
    9861043    kb->setProcessedItemCount(mStreamSetInputs[0].name, nowProcessed);
    9871044    Value * reducedStridesToDo = kb->CreateSub(stridesRemaining, linearlyWritableStrides);
    988     BasicBlock * multiBlockFinal = kb->GetInsertBlock();
    989     stridesRemaining->addIncoming(reducedStridesToDo, multiBlockFinal);
     1045    stridesRemaining->addIncoming(reducedStridesToDo, kb->GetInsertBlock());
    9901046    kb->CreateBr(doSegmentOuterLoop);
     1047
     1048
    9911049    //
    9921050    // We use temporary buffers in 3 different cases that preclude full block processing.
     
    9991057
    10001058    kb->SetInsertPoint(tempBlockCheck);
    1001     haveStrides = kb->CreateICmpUGT(stridesRemaining, kb->getSize(0));
     1059    Value * const haveStrides = kb->CreateICmpUGT(stridesRemaining, kb->getSize(0));
    10021060    kb->CreateCondBr(kb->CreateOr(mIsFinal, haveStrides), doTempBufferBlock, segmentDone);
    10031061
     
    10171075        }
    10181076    }
     1077    //
     1078    // Define and allocate the temporary buffer area.
     1079    //
     1080    Type * tempBuffers[totalSetCount];
     1081    for (unsigned i = 0; i < inputSetCount; ++i) {
     1082        Type * bufType = mStreamSetInputBuffers[i]->getStreamSetBlockType();
     1083        tempBuffers[i] = ArrayType::get(bufType, maxBlocksToCopy[i]);
     1084    }
     1085    for (unsigned i = 0; i < outputSetCount; i++) {
     1086        Type * bufType = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
     1087        tempBuffers[i + inputSetCount] = ArrayType::get(bufType, maxBlocksToCopy[i + inputSetCount]);
     1088    }
     1089    Type * tempParameterStructType = StructType::create(kb->getContext(), ArrayRef<Type *>(tempBuffers, totalSetCount), "tempBuf");
    10191090    // Prepare the temporary buffer area.
    1020     //
    1021     // First zero it out.
    1022     Constant * const tempAreaSize = ConstantExpr::getIntegerCast(ConstantExpr::getSizeOf(tempParameterStructType), kb->getSizeTy(), false);
    1023     kb->CreateMemZero(tempParameterArea, tempAreaSize);
    1024     // For each input and output buffer, copy over necessary data starting from the last
    1025     // block boundary.
     1091    Value * tempParameterArea = kb->CreateCacheAlignedAlloca(tempParameterStructType);
     1092    kb->CreateMemZero(tempParameterArea, ConstantExpr::getSizeOf(tempParameterStructType));
     1093    // For each input and output buffer, copy over necessary data starting from the last block boundary.
    10261094    Value * itemCountNeeded[inputSetCount];
    10271095    itemCountNeeded[0] = tempBlockItems;
    10281096    Value * finalItemCountNeeded[inputSetCount];
    10291097
    1030     for (unsigned i = 0; i < mStreamSetInputBuffers.size(); i++) {
     1098    for (unsigned i = 0; i < inputSetCount; i++) {
    10311099        Type * bufPtrType = mStreamSetInputBuffers[i]->getPointerType();
    10321100        if (mItemsPerStride[i] != 0) {
     
    10351103            ConstantInt * strideItems = kb->getSize(mItemsPerStride[i]);
    10361104            Value * strideBasePos = kb->CreateSub(processedItemCount[i], kb->CreateURem(processedItemCount[i], strideItems));
    1037             Value * blockBasePos = (mItemsPerStride[i] % bitBlockWidth == 0) ? strideBasePos : kb->CreateAnd(strideBasePos, blockBaseMask);
     1105            Value * blockBasePos = strideBasePos;
     1106            if (mItemsPerStride[i] & (bitBlockWidth - 1)) {
     1107                blockBasePos = kb->CreateAnd(strideBasePos, blockBaseMask);
     1108            }
    10381109
    10391110            // The number of items to copy is determined by the processing rate requirements.
     
    10771148            }
    10781149            tempArgs.push_back(tempBufPtr);
    1079         }
    1080         else {
     1150        } else {
    10811151            Value * bufPtr = kb->getInputStreamBlockPtr(mStreamSetInputs[i].name, kb->getInt32(0));
    10821152            bufPtr = kb->CreatePointerCast(bufPtr, mStreamSetInputBuffers[i]->getPointerType());
     
    10851155    }
    10861156    Value * outputBasePos[outputSetCount];
    1087     for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
    1088         Value * tempBufPtr = kb->CreateGEP(tempParameterArea,  {kb->getInt32(0), kb->getInt32(mStreamSetInputs.size() + i)});
     1157    for (unsigned i = 0; i < outputSetCount; i++) {
     1158        Value * tempBufPtr = kb->CreateGEP(tempParameterArea,  {kb->getInt32(0), kb->getInt32(inputSetCount + i)});
    10891159        Type * bufPtrType = mStreamSetOutputBuffers[i]->getPointerType();
    10901160        tempBufPtr = kb->CreatePointerCast(tempBufPtr, bufPtrType);
     
    11341204    }
    11351205
    1136 
    11371206    //  We've dealt with the partial block processing and copied information back into the
    11381207    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r5615 r5630  
    8888
    8989    StreamPort getStreamPort(const std::string & name) const;
    90    
    91     llvm::Module * makeModule(const std::unique_ptr<KernelBuilder> & idb);
    92 
    93     llvm::Module * setModule(const std::unique_ptr<KernelBuilder> & idb, llvm::Module * const module);
     90
     91    llvm::Module * setModule(llvm::Module * const module);
     92
     93    llvm::Module * makeModule(const std::unique_ptr<kernel::KernelBuilder> & idb);
    9494
    9595    llvm::Module * getModule() const {
     
    137137    virtual ~Kernel() = 0;
    138138
    139 protected:
     139    void prepareKernel(const std::unique_ptr<KernelBuilder> & idb);
     140
     141    void prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb);
     142
     143    std::string getCacheName(const std::unique_ptr<KernelBuilder> & idb) const;
     144
     145protected:
     146
     147    virtual void addInternalKernelProperties(const std::unique_ptr<KernelBuilder> & idb) { }
    140148
    141149    // Constructor
     
    168176
    169177    void linkExternalMethods(const std::unique_ptr<kernel::KernelBuilder> &) override { }
    170 
    171     virtual void prepareKernel(const std::unique_ptr<KernelBuilder> & idb);
    172178
    173179    virtual void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) { }
  • icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.cpp

    r5440 r5630  
    7878            outputBufferBasePtr,
    7979            iBuilder->CreateGEP(inputBufferBasePtr, iBuilder->CreateAdd(literalStart, copyLength1)),
    80             iBuilder->CreateSub(literalLength, copyLength1), 8);        // Buffer start is aligned.
     80            iBuilder->CreateSub(literalLength, copyLength1), 1); // Buffer start is aligned.
     81    // NOTE: Test case reported non-8-byte alignment
    8182    outputItems = iBuilder->CreateAdd(outputItems, literalLength);
    8283
  • icGREP/icgrep-devel/icgrep/kernels/scanmatchgen.cpp

    r5526 r5630  
    5555    /* self = */ args++;
    5656    Value * itemsToDo = &*(args++);
    57     Value * inputStreamAvail = &*(args++);
     57    /* inputStreamAvail = */ args++;
    5858    Value * match_result = &*(args++);
    5959    Value * line_break = &*(args++);
    60     Value * input_stream = &*(args);
     60    /* input_stream = */ args++;
    6161
    6262    Value * blocksToDo = iBuilder->CreateUDiv(iBuilder->CreateAdd(itemsToDo, blockSizeLess1), blockSize);
Note: See TracChangeset for help on using the changeset viewer.