Changeset 6237 for icGREP


Ignore:
Timestamp:
Dec 15, 2018, 3:43:59 PM (5 months ago)
Author:
nmedfort
Message:

Re-enabled segment pipeline parallelism; moved logical segment number into pipeline kernel.

Location:
icGREP/icgrep-devel/icgrep
Files:
10 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/CBuilder.cpp

    r6233 r6237  
    351351    }
    352352    size = CreateZExtOrTrunc(size, sizeTy);
     353    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     354        CreateAssert(size, "CreateMalloc: 0-byte malloc is implementation defined");
     355    }
    353356    CallInst * const ptr = CreateCall(f, size);
    354357    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     
    377380    size = CreateZExtOrTrunc(size, sizeTy);
    378381    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     382        CreateAssert(size, "CreateAlignedMalloc: 0-byte malloc is implementation defined");
    379383        CreateAssertZero(CreateURem(size, align), "CreateAlignedMalloc: size must be an integral multiple of alignment.");
    380384    }
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r6233 r6237  
    5353    Value * const ptr = getScalarFieldPtr(fieldName);
    5454    CreateStore(value, ptr);
    55 }
    56 
    57 LoadInst * KernelBuilder::acquireLogicalSegmentNo() {
    58     return CreateAtomicLoadAcquire(getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR));
    59 }
    60 
    61 void KernelBuilder::releaseLogicalSegmentNo(Value * const nextSegNo) {
    62     CreateAtomicStoreRelease(nextSegNo, getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR));
    6355}
    6456
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.h

    r6233 r6237  
    2424    // Set the value of a scalar field for the current instance.
    2525    void setScalarField(const std::string & fieldName, llvm::Value * value);
    26 
    27     // Synchronization actions for executing a kernel for a particular logical segment.
    28     //
    29     // Before the segment is processed, acquireLogicalSegmentNo must be used to load
    30     // the segment number of the kernel state to ensure that the previous segment is
    31     // complete (by checking that the acquired segment number is equal to the desired segment
    32     // number).
    33     // After all segment processing actions for the kernel are complete, and any necessary
    34     // data has been extracted from the kernel for further pipeline processing, the
    35     // segment number must be incremented and stored using releaseLogicalSegmentNo.
    36     llvm::LoadInst * acquireLogicalSegmentNo();
    37 
    38     void releaseLogicalSegmentNo(llvm::Value * const nextSegNo);
    3926
    4027    llvm::Value * getAvailableItemCount(const std::string & name);
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/core_logic.hpp

    r6233 r6237  
    66
    77/** ------------------------------------------------------------------------------------------------------------- *
    8  * @brief addInternalKernelProperties
    9  ** ------------------------------------------------------------------------------------------------------------- */
    10 inline void PipelineCompiler::addInternalKernelProperties(BuilderRef b) {
     8 * @brief addPipelineKernelProperties
     9 ** ------------------------------------------------------------------------------------------------------------- */
     10inline void PipelineCompiler::addPipelineKernelProperties(BuilderRef b) {
    1111    initializePopCounts();
    1212    const auto numOfKernels = mPipeline.size();
    1313    b->setKernel(mPipelineKernel);
    14     IntegerType * const boolTy = b->getInt1Ty();
    1514    for (unsigned i = 0; i < numOfKernels; ++i) {
    16         // TODO: prove two termination signals can be fused into a single counter?
    17         const auto prefix = makeKernelName(i);
    18         mPipelineKernel->addInternalScalar(boolTy, prefix + TERMINATION_SIGNAL);
     15        addInternalKernelProperties(b, i);
    1916        addBufferHandlesToPipelineKernel(b, i);
    2017        addPopCountScalarsToPipelineKernel(b, i);
    2118    }
    2219    b->setKernel(mPipelineKernel);
     20}
     21
     22//const static std::string PROCESSED_ITEM_COUNT_SUFFIX = "_processedItemCount";
     23//const static std::string PRODUCED_ITEM_COUNT_SUFFIX = "_producedItemCount";
     24// const static std::string NON_DEFERRED_ITEM_COUNT_SUFFIX = "_nonDeferredItemCount";
     25// const static std::string LOGICAL_SEGMENT_NO_SCALAR = "segmentNo";
     26
     27/** ------------------------------------------------------------------------------------------------------------- *
     28 * @brief addInternalKernelProperties
     29 ** ------------------------------------------------------------------------------------------------------------- */
     30inline void PipelineCompiler::addInternalKernelProperties(BuilderRef b, const unsigned kernelIndex) {
     31//    Kernel * const kernel = mPipeline[kernelIndex];
     32    IntegerType * const sizeTy = b->getSizeTy();
     33
     34    const auto name = makeKernelName(kernelIndex);
     35    // TODO: prove two termination signals can be fused into a single counter?
     36    mPipelineKernel->addInternalScalar(b->getInt1Ty(), name + TERMINATION_SIGNAL);
     37    // TODO: non deferred item count for fixed rates could be calculated from seg no.
     38    mPipelineKernel->addInternalScalar(sizeTy, name + LOGICAL_SEGMENT_NO_SCALAR);
     39
     40//    const auto numOfInputs = kernel->getNumOfStreamInputs();
     41//    for (unsigned i = 0; i < numOfInputs; i++) {
     42//        const Binding & input = kernel->getInputStreamSetBinding(i);
     43//        const auto prefix = makeBufferName(kernelIndex, input);
     44//        mPipelineKernel->addInternalScalar(sizeTy, prefix + PROCESSED_ITEM_COUNT_SUFFIX);
     45//        if (input.isDeferred()) {
     46//            mPipelineKernel->addInternalScalar(sizeTy, prefix + NON_DEFERRED_ITEM_COUNT_SUFFIX);
     47//        }
     48//    }
     49
     50//    const auto numOfOutputs = kernel->getNumOfStreamOutputs();
     51//    for (unsigned i = 0; i < numOfOutputs; i++) {
     52//        const Binding & output = kernel->getOutputStreamSetBinding(i);
     53//        const auto prefix = makeBufferName(kernelIndex, output);
     54//        mPipelineKernel->addInternalScalar(sizeTy, prefix + PRODUCED_ITEM_COUNT_SUFFIX);
     55//        if (output.isDeferred()) {
     56//            mPipelineKernel->addInternalScalar(sizeTy, prefix + NON_DEFERRED_ITEM_COUNT_SUFFIX);
     57//        }
     58//    }
     59
    2360}
    2461
     
    299336
    300337    readFinalProducedItemCounts(b);
    301     releaseCurrentSegment(b);
    302338    updateOptionalCycleCounter(b);
    303339
     
    305341}
    306342
    307 /** ------------------------------------------------------------------------------------------------------------- *
    308  * @brief wait
     343// Synchronization actions for executing a kernel for a particular logical segment.
     344
     345// Before the segment is processed, CreateAtomicLoadAcquire must be used to load
     346// the segment number of the kernel state to ensure that the previous segment is
     347// complete (by checking that the acquired segment number is equal to the desired segment
     348// number).
     349
     350// After all segment processing actions for the kernel are complete, and any necessary
     351// data has been extracted from the kernel for further pipeline processing, the
     352// segment number must be incremented and stored using CreateAtomicStoreRelease.
     353
     354/** ------------------------------------------------------------------------------------------------------------- *
     355 * @brief synchronize
    309356 ** ------------------------------------------------------------------------------------------------------------- */
    310357void PipelineCompiler::synchronize(BuilderRef b) {
    311358
    312     const auto kernelName = makeKernelName(mKernelIndex);
    313 
    314     BasicBlock * const kernelWait = b->CreateBasicBlock(kernelName + "Wait", mPipelineEnd);
     359    const auto prefix = makeKernelName(mKernelIndex);
     360    b->setKernel(mPipelineKernel);
     361    BasicBlock * const kernelWait = b->CreateBasicBlock(prefix + "Wait", mPipelineEnd);
    315362    b->CreateBr(kernelWait);
    316363
    317364    b->SetInsertPoint(kernelWait);
    318     const Kernel * waitingOn = mKernel;
    319     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::SerializeThreads))) {
    320         waitingOn = mPipeline.back();
    321     }
    322     b->setKernel(waitingOn);
    323     Value * const processedSegmentCount = b->acquireLogicalSegmentNo();
     365    const auto serialize = codegen::DebugOptionIsSet(codegen::SerializeThreads);
     366    const unsigned waitingOnIdx = serialize ? mPipeline.size() - 1 : mKernelIndex;
     367    const auto waitingOn = makeKernelName(waitingOnIdx);
     368    Value * const waitingOnPtr = b->getScalarFieldPtr(waitingOn + LOGICAL_SEGMENT_NO_SCALAR);
     369    Value * const processedSegmentCount = b->CreateAtomicLoadAcquire(waitingOnPtr);
    324370    assert (processedSegmentCount->getType() == mSegNo->getType());
    325371    Value * const ready = b->CreateICmpEQ(mSegNo, processedSegmentCount);
    326372
    327     BasicBlock * const kernelCheck = b->CreateBasicBlock(kernelName + "Check", mPipelineEnd);
     373    BasicBlock * const kernelCheck = b->CreateBasicBlock(prefix + "Check", mPipelineEnd);
    328374    b->CreateCondBr(ready, kernelCheck, kernelWait);
    329375
     
    332378}
    333379
     380/** ------------------------------------------------------------------------------------------------------------- *
     381 * @brief releaseCurrentSegment
     382 ** ------------------------------------------------------------------------------------------------------------- */
     383inline void PipelineCompiler::releaseCurrentSegment(BuilderRef b) {
     384    b->setKernel(mPipelineKernel);
     385    Value * const nextSegNo = b->CreateAdd(mSegNo, b->getSize(1));
     386    const auto prefix = makeKernelName(mKernelIndex);
     387    Value * const waitingOnPtr = b->getScalarFieldPtr(prefix + LOGICAL_SEGMENT_NO_SCALAR);
     388    b->CreateAtomicStoreRelease(nextSegNo, waitingOnPtr);
     389}
    334390
    335391/** ------------------------------------------------------------------------------------------------------------- *
     
    525581}
    526582
    527 /** ------------------------------------------------------------------------------------------------------------- *
    528  * @brief releaseCurrentSegment
    529  ** ------------------------------------------------------------------------------------------------------------- */
    530 inline void PipelineCompiler::releaseCurrentSegment(BuilderRef b) {
    531     Value * const nextSegNo = b->CreateAdd(mSegNo, b->getSize(1));
    532     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableMProtect))) {
    533         b->CreateMProtect(mKernel->getHandle(), CBuilder::Protect::WRITE);
    534     }
    535     b->releaseLogicalSegmentNo(nextSegNo);
    536     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableMProtect))) {
    537         b->CreateMProtect(mKernel->getHandle(), CBuilder::Protect::READ);
    538     }
    539 }
    540 
    541 }
     583}
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/pipeline_builder.cpp

    r6228 r6237  
    360360PipelineBuilder::PipelineBuilder(BaseDriver & driver,
    361361                                 Bindings && stream_inputs, Bindings && stream_outputs,
    362                                  Bindings && scalar_inputs, Bindings && scalar_outputs)
     362                                 Bindings && scalar_inputs, Bindings && scalar_outputs,
     363                                 const unsigned numOfThreads)
    363364: mDriver(driver)
    364 , mNumOfThreads(1)
     365, mNumOfThreads(numOfThreads)
    365366, mInputStreamSets(stream_inputs)
    366367, mOutputStreamSets(stream_outputs)
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/pipeline_compiler.hpp

    r6233 r6237  
    166166    PipelineCompiler(BuilderRef b, PipelineKernel * const pipelineKernel);
    167167
    168     void addInternalKernelProperties(BuilderRef b);
     168    void addPipelineKernelProperties(BuilderRef b);
    169169    void generateInitializeMethod(BuilderRef b);
    170170    void generateSingleThreadKernelMethod(BuilderRef b);
     
    177177// main pipeline functions
    178178
     179    void addInternalKernelProperties(BuilderRef b, const unsigned kernelIndex);
     180
    179181    void start(BuilderRef b, Value * const initialSegNo);
    180182    void setActiveKernel(BuilderRef b, const unsigned index);
     
    183185    void end(BuilderRef b, const unsigned step);
    184186
    185     Value * allocateThreadLocalSpace(BuilderRef b);
     187    StructType * getLocalStateType(BuilderRef b);
     188    Value * allocateThreadLocalSpace(BuilderRef b, StructType * localStateType);
    186189    void setThreadLocalSpace(BuilderRef b, Value * const localState);
    187190    void deallocateThreadLocalSpace(BuilderRef b, Value * const localState);
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/pipeline_kernel.cpp

    r6228 r6237  
    2828    } else { // add handles for each of unique streams
    2929        mCompiler = llvm::make_unique<PipelineCompiler>(b, this);
    30         mCompiler->addInternalKernelProperties(b);
     30        mCompiler->addPipelineKernelProperties(b);
    3131    }
    3232}
     
    9191        } else {
    9292            mCompiler->generateMultiThreadKernelMethod(b, mNumOfThreads);
    93         }       
     93        }
    9494    }
    9595}
  • icGREP/icgrep-devel/icgrep/kernels/pipeline/pipeline_logic.hpp

    r6233 r6237  
    1010 ** ------------------------------------------------------------------------------------------------------------- */
    1111void PipelineCompiler::generateSingleThreadKernelMethod(BuilderRef b) {
    12     Value * const localState = allocateThreadLocalSpace(b);
     12    StructType * const localStateType = getLocalStateType(b);
     13    Value * const localState = allocateThreadLocalSpace(b, localStateType);
    1314    setThreadLocalSpace(b, localState);
    1415    start(b, b->getSize(0));
     
    2021    deallocateThreadLocalSpace(b, localState);
    2122}
     23
     24
    2225
    2326/** ------------------------------------------------------------------------------------------------------------- *
     
    3437    assert (numOfThreads > 1);
    3538
     39
    3640    Module * const m = b->getModule();
    3741    IntegerType * const sizeTy = b->getSizeTy();
     
    4246    ConstantInt * const TWO = b->getInt32(2);
    4347
    44     // store where we'll resume compiling the DoSegment method
    45     const auto resumePoint = b->saveIP();
    4648    Value * const handle = mPipelineKernel->getHandle(); assert (handle);
    47     StructType * const threadStructType = StructType::get(m->getContext(), {handle->getType(), sizeTy, voidPtrTy});
    48     FunctionType * const threadFuncType = FunctionType::get(voidPtrTy, {voidPtrTy}, false);
     49    StructType * const localStateTy = getLocalStateType(b);
     50    PointerType * const localStatePtrTy = localStateTy->getPointerTo();
     51    StructType * const threadStructType = StructType::get(m->getContext(), {handle->getType(), sizeTy, localStatePtrTy});
     52    FunctionType * const threadFuncType = FunctionType::get(b->getVoidTy(), {voidPtrTy}, false);
    4953
    5054    const auto threadName = mPipelineKernel->getName() + "_DoSegmentThread";
     
    5357    auto args = threadFunc->arg_begin();
    5458    args->setName("kernelStateObject");
     59
     60    // store where we'll resume compiling the DoSegment method
     61    const auto resumePoint = b->saveIP();
    5562
    5663    // -------------------------------------------------------------------------------------------------------------------------
     
    6875        synchronize(b);
    6976        executeKernel(b);
     77        releaseCurrentSegment(b);
    7078    }
    7179    mKernel = nullptr;
     
    8189    b->CreateBr(exitFunction);
    8290    b->SetInsertPoint(exitFunction);
    83     b->CreateRet(nullVoidPtrVal);
     91    b->CreateRetVoid();
    8492
    8593    // -------------------------------------------------------------------------------------------------------------------------
    86     // MAKE PIPELINE DRIVER
     94    // MAKE PIPELINE DRIVER CONTINUED
    8795    // -------------------------------------------------------------------------------------------------------------------------
    8896    b->restoreIP(resumePoint);
     
    95103        threadIdPtr[i] = b->CreateGEP(pthreads, {ZERO, b->getInt32(i)});
    96104    }
     105
    97106    // use the process thread to handle the initial segment function after spawning
    98107    // (n - 1) threads to handle the subsequent offsets
    99     Value * localState[threads];
     108    std::vector<Value *> localState(numOfThreads);
    100109    for (unsigned i = 0; i < threads; ++i) {
    101110        AllocaInst * const threadState = b->CreateAlloca(threadStructType);
    102111        b->CreateStore(handle, b->CreateGEP(threadState, {ZERO, ZERO}));
    103112        b->CreateStore(b->getSize(i + 1), b->CreateGEP(threadState, {ZERO, ONE}));
    104         localState[i] = allocateThreadLocalSpace(b);
     113        localState[i] = allocateThreadLocalSpace(b, localStateTy);
    105114        b->CreateStore(localState[i], b->CreateGEP(threadState, {ZERO, TWO}));
    106115        b->CreatePThreadCreateCall(threadIdPtr[i], nullVoidPtrVal, threadFunc, threadState);
     
    119128    }
    120129
    121 
    122130}
    123131
     
    126134};
    127135
     136/** ------------------------------------------------------------------------------------------------------------- *
     137 * @brief getLocalStateType
     138 ** ------------------------------------------------------------------------------------------------------------- */
     139inline StructType * PipelineCompiler::getLocalStateType(BuilderRef b) {
     140    StructType * const popCountTy = getPopCountThreadLocalStateType(b);
     141    return StructType::get(popCountTy, nullptr);
     142}
    128143
    129144/** ------------------------------------------------------------------------------------------------------------- *
    130145 * @brief allocateThreadLocalSpace
    131146 ** ------------------------------------------------------------------------------------------------------------- */
    132 inline Value * PipelineCompiler::allocateThreadLocalSpace(BuilderRef b) {
    133     // malloc the local state object
    134     StructType * const popCountTy = getPopCountThreadLocalStateType(b);
    135     StructType * const localStateTy = StructType::get(popCountTy, nullptr);
    136     Value * const localState = b->CreateCacheAlignedAlloca(localStateTy);
    137     // and any pop count refs
     147inline Value * PipelineCompiler::allocateThreadLocalSpace(BuilderRef b, StructType * localStateType) {
     148    Value * const localState = b->CreateCacheAlignedAlloca(localStateType);
    138149    Constant * const ZERO = b->getInt32(0);
    139150    Constant * const POP_COUNT_STRUCT = b->getInt32(POP_COUNT_STRUCT_INDEX);
  • icGREP/icgrep-devel/icgrep/kernels/pipeline_builder.h

    r6184 r6237  
    6161    PipelineBuilder(BaseDriver & driver,
    6262                    Bindings && stream_inputs, Bindings && stream_outputs,
    63                     Bindings && scalar_inputs, Bindings && scalar_outputs);
     63                    Bindings && scalar_inputs, Bindings && scalar_outputs,
     64                    const unsigned numOfThreads = 1);
    6465
    6566protected:
  • icGREP/icgrep-devel/icgrep/toolchain/driver.cpp

    r6209 r6237  
    1515 ** ------------------------------------------------------------------------------------------------------------- */
    1616std::unique_ptr<PipelineBuilder> BaseDriver::makePipelineWithIO(Bindings stream_inputs, Bindings stream_outputs, Bindings scalar_inputs, Bindings scalar_outputs) {
    17     return llvm::make_unique<PipelineBuilder>(*this, std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_inputs), std::move(scalar_outputs));
     17    return llvm::make_unique<PipelineBuilder>(*this, std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_inputs), std::move(scalar_outputs), codegen::ThreadNum);
    1818}
    1919
     
    2222 ** ------------------------------------------------------------------------------------------------------------- */
    2323std::unique_ptr<kernel::PipelineBuilder> BaseDriver::makePipeline(Bindings scalar_inputs, Bindings scalar_outputs) {
    24     return llvm::make_unique<PipelineBuilder>(*this, Bindings{}, Bindings{}, std::move(scalar_inputs), std::move(scalar_outputs));
     24    return llvm::make_unique<PipelineBuilder>(*this, Bindings{}, Bindings{}, std::move(scalar_inputs), std::move(scalar_outputs), codegen::ThreadNum);
    2525}
    2626
Note: See TracChangeset for help on using the changeset viewer.