Ignore:
Timestamp:
Dec 15, 2017, 12:44:01 PM (18 months ago)
Author:
nmedfort
Message:

Initial check-in of LookAhead support; modified LineBreakKernel to compute CR+LF using LookAhead(1) + misc. fixes.

Location:
icGREP/icgrep-devel/icgrep/toolchain
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/toolchain/grep_pipeline.cpp

    r5769 r5782  
    6262    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
    6363   
    64     StreamSetBuffer * BasisBits = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize);
    65    
     64    StreamSetBuffer * BasisBits = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize + 1);
    6665    kernel::Kernel * s2pk = pxDriver.addKernelInstance<kernel::S2PKernel>(idb);
    6766    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
    6867   
     68    kernel::Kernel * linefeedK = pxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, 8);
     69    StreamSetBuffer * LineFeedStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize + 1);
     70    pxDriver.makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
     71
    6972    kernel::Kernel * linebreakK = pxDriver.addKernelInstance<kernel::LineBreakKernelBuilder>(idb, 8);
    7073    StreamSetBuffer * LineBreakStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
    71     pxDriver.makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
     74    StreamSetBuffer * CRLFStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
     75    pxDriver.makeKernelCall(linebreakK, {BasisBits, LineFeedStream}, {LineBreakStream, CRLFStream});
    7276   
    7377    kernel::Kernel * requiredStreamsK = pxDriver.addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
    74     StreamSetBuffer * RequiredStreams = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(4, 1), segmentSize);
     78    StreamSetBuffer * RequiredStreams = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(3, 1), segmentSize);
    7579    pxDriver.makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
    7680   
    7781    StreamSetBuffer * MatchResults = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
    7882    kernel::Kernel * icgrepK = pxDriver.addKernelInstance<kernel::ICGrepKernel>(idb, pattern);
    79     pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream, RequiredStreams}, {MatchResults});
     83    pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
    8084   
    8185    StreamSetBuffer * MatchedLines = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
  • icGREP/icgrep-devel/icgrep/toolchain/pipeline.cpp

    r5761 r5782  
    3333
    3434void applyOutputBufferExpansions(const std::unique_ptr<KernelBuilder> & b, const Kernel * kernel);
     35
     36void handleInsufficientData(const std::unique_ptr<KernelBuilder> & b, Value * const produced, Value * const final, BasicBlock * const entry, const Kernel * const consumer,  const Binding & input, const StreamSetBuffer * const buffer);
    3537
    3638/** ------------------------------------------------------------------------------------------------------------- *
     
    4345 * fashion such that processing of segment S_i by the full pipeline is carried out by thread i mod T.
    4446 ** ------------------------------------------------------------------------------------------------------------- */
    45 void generateSegmentParallelPipeline(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Kernel *> & kernels) {
     47void generateSegmentParallelPipeline(const std::unique_ptr<KernelBuilder> & b, const std::vector<Kernel *> & kernels) {
    4648
    4749    const unsigned n = kernels.size();
    48     Module * const m = iBuilder->getModule();
    49     IntegerType * const sizeTy = iBuilder->getSizeTy();
    50     PointerType * const voidPtrTy = iBuilder->getVoidPtrTy();
     50    Module * const m = b->getModule();
     51    IntegerType * const sizeTy = b->getSizeTy();
     52    PointerType * const voidPtrTy = b->getVoidPtrTy();
    5153    Constant * nullVoidPtrVal = ConstantPointerNull::getNullValue(voidPtrTy);
    5254    std::vector<Type *> structTypes;
     
    6163    StructType * const threadStructType = StructType::get(m->getContext(), {sharedStructType->getPointerTo(), sizeTy});
    6264
    63     const auto ip = iBuilder->saveIP();
    64 
    65     Function * const threadFunc = makeThreadFunction(iBuilder, "segment");
     65    const auto ip = b->saveIP();
     66
     67    Function * const threadFunc = makeThreadFunction(b, "segment");
    6668    auto args = threadFunc->arg_begin();
    6769
     
    7173
    7274     // Create the basic blocks for the thread function.
    73     BasicBlock * entryBlock = BasicBlock::Create(iBuilder->getContext(), "entry", threadFunc);
    74     iBuilder->SetInsertPoint(entryBlock);
    75 
    76     Value * const threadStruct = iBuilder->CreateBitCast(&*(args), threadStructType->getPointerTo());
    77 
    78     Value * const sharedStatePtr = iBuilder->CreateLoad(iBuilder->CreateGEP(threadStruct, {iBuilder->getInt32(0), iBuilder->getInt32(0)}));
     75    BasicBlock * entryBlock = BasicBlock::Create(b->getContext(), "entry", threadFunc);
     76    b->SetInsertPoint(entryBlock);
     77
     78    Value * const threadStruct = b->CreateBitCast(&*(args), threadStructType->getPointerTo());
     79
     80    Value * const sharedStatePtr = b->CreateLoad(b->CreateGEP(threadStruct, {b->getInt32(0), b->getInt32(0)}));
    7981    for (unsigned k = 0; k < n; ++k) {
    80         Value * ptr = iBuilder->CreateLoad(iBuilder->CreateGEP(sharedStatePtr, {iBuilder->getInt32(0), iBuilder->getInt32(k)}));
     82        Value * ptr = b->CreateLoad(b->CreateGEP(sharedStatePtr, {b->getInt32(0), b->getInt32(k)}));
    8183        kernels[k]->setInstance(ptr);
    8284    }
    83     Value * const segOffset = iBuilder->CreateLoad(iBuilder->CreateGEP(threadStruct, {iBuilder->getInt32(0), iBuilder->getInt32(1)}));
    84 
    85     BasicBlock * segmentLoop = BasicBlock::Create(iBuilder->getContext(), "segmentLoop", threadFunc);
    86     iBuilder->CreateBr(segmentLoop);
    87 
    88     iBuilder->SetInsertPoint(segmentLoop);
    89     PHINode * const segNo = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2, "segNo");
     85    Value * const segOffset = b->CreateLoad(b->CreateGEP(threadStruct, {b->getInt32(0), b->getInt32(1)}));
     86
     87    BasicBlock * segmentLoop = BasicBlock::Create(b->getContext(), "segmentLoop", threadFunc);
     88    b->CreateBr(segmentLoop);
     89
     90    b->SetInsertPoint(segmentLoop);
     91    PHINode * const segNo = b->CreatePHI(b->getSizeTy(), 2, "segNo");
    9092    segNo->addIncoming(segOffset, entryBlock);
    9193
    92     Value * terminated = iBuilder->getFalse();
    93     Value * const nextSegNo = iBuilder->CreateAdd(segNo, iBuilder->getSize(1));
    94 
    95     BasicBlock * segmentLoopBody = nullptr;
    96     BasicBlock * const exitThreadBlock = BasicBlock::Create(iBuilder->getContext(), "exitThread", threadFunc);
    97 
    98     StreamSetBufferMap<Value *> producedPos;
    99     StreamSetBufferMap<Value *> consumedPos;
     94    Value * terminated = b->getFalse();
     95    Value * const nextSegNo = b->CreateAdd(segNo, b->getSize(1));
     96
     97    BasicBlock * const exitThreadBlock = BasicBlock::Create(b->getContext(), "exitThread", threadFunc);
     98
     99    StreamSetBufferMap<Value *> producedItemCount;
     100    StreamSetBufferMap<Value *> consumedItemCount;
    100101
    101102    Value * cycleCountStart = nullptr;
    102103    Value * cycleCountEnd = nullptr;
    103104    if (DebugOptionIsSet(codegen::EnableCycleCounter)) {
    104         cycleCountStart = iBuilder->CreateReadCycleCounter();
     105        cycleCountStart = b->CreateReadCycleCounter();
    105106    }
    106107
     
    109110        const auto & kernel = kernels[k];
    110111
    111         BasicBlock * const segmentWait = BasicBlock::Create(iBuilder->getContext(), kernel->getName() + "Wait", threadFunc);
    112 
    113         BasicBlock * segmentYield = segmentWait;
    114         iBuilder->CreateBr(segmentWait);
    115 
    116         segmentLoopBody = BasicBlock::Create(iBuilder->getContext(), kernel->getName() + "Do", threadFunc);
    117 
    118         iBuilder->SetInsertPoint(segmentWait);
     112        BasicBlock * const kernelWait = BasicBlock::Create(b->getContext(), kernel->getName() + "Wait", threadFunc);
     113
     114        b->CreateBr(kernelWait);
     115
     116        BasicBlock * const kernelBody = BasicBlock::Create(b->getContext(), kernel->getName() + "Do", threadFunc);
     117
     118        b->SetInsertPoint(kernelWait);
    119119        const unsigned waitIdx = codegen::DebugOptionIsSet(codegen::SerializeThreads) ? (n - 1) : k;
    120120
    121         iBuilder->setKernel(kernels[waitIdx]);
    122         Value * const processedSegmentCount = iBuilder->acquireLogicalSegmentNo();
    123         iBuilder->setKernel(kernel);
     121        b->setKernel(kernels[waitIdx]);
     122        Value * const processedSegmentCount = b->acquireLogicalSegmentNo();
     123        b->setKernel(kernel);
    124124
    125125        assert (processedSegmentCount->getType() == segNo->getType());
    126         Value * const ready = iBuilder->CreateICmpEQ(segNo, processedSegmentCount);
     126        Value * const ready = b->CreateICmpEQ(segNo, processedSegmentCount);
    127127
    128128        if (kernel->hasNoTerminateAttribute()) {
    129             iBuilder->CreateCondBr(ready, segmentLoopBody, segmentYield);
     129            b->CreateCondBr(ready, kernelBody, kernelWait);
    130130        } else { // If the kernel was terminated in a previous segment then the pipeline is done.
    131             BasicBlock * completionTest = BasicBlock::Create(iBuilder->getContext(), kernel->getName() + "Completed", threadFunc, 0);
    132             BasicBlock * exitBlock = BasicBlock::Create(iBuilder->getContext(), kernel->getName() + "Exit", threadFunc, 0);
    133             iBuilder->CreateCondBr(ready, completionTest, segmentYield);
    134 
    135             iBuilder->SetInsertPoint(completionTest);
    136             Value * terminationSignal = iBuilder->getTerminationSignal();
    137             iBuilder->CreateCondBr(terminationSignal, exitBlock, segmentLoopBody);
    138             iBuilder->SetInsertPoint(exitBlock);
    139             // Ensure that the next thread will also exit.
    140             iBuilder->releaseLogicalSegmentNo(nextSegNo);
    141             iBuilder->CreateBr(exitThreadBlock);
    142         }
     131            BasicBlock * kernelTerminated = BasicBlock::Create(b->getContext(), kernel->getName() + "Terminated", threadFunc, 0);
     132            BasicBlock * exitBlock = BasicBlock::Create(b->getContext(), kernel->getName() + "Exit", threadFunc, 0);
     133            b->CreateCondBr(ready, kernelTerminated, kernelWait);
     134
     135            b->SetInsertPoint(kernelTerminated);
     136            Value * terminationSignal = b->getTerminationSignal();
     137            b->CreateCondBr(terminationSignal, exitBlock, kernelBody);
     138            b->SetInsertPoint(exitBlock);
     139            b->releaseLogicalSegmentNo(nextSegNo); // Ensure that the next thread will also exit.
     140            b->CreateBr(exitThreadBlock);
     141        }
     142
     143        BasicBlock * const kernelEnd = BasicBlock::Create(b->getContext(), kernel->getName() + "End", threadFunc);
    143144
    144145        // Execute the kernel segment
    145         iBuilder->SetInsertPoint(segmentLoopBody);
     146        b->SetInsertPoint(kernelBody);
    146147        const auto & inputs = kernel->getStreamInputs();
    147148        std::vector<Value *> args = {kernel->getInstance(), terminated};
    148149        for (unsigned i = 0; i < inputs.size(); ++i) {
    149             const auto f = producedPos.find(kernel->getStreamSetInputBuffer(i));
    150             assert (f != producedPos.end());
    151             args.push_back(f->second);
    152         }
    153 
    154         iBuilder->setKernel(kernel);
    155         iBuilder->createDoSegmentCall(args);
     150            const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
     151            const auto f = producedItemCount.find(buffer);
     152            assert (f != producedItemCount.end());
     153            Value * const produced = f->second;
     154            args.push_back(produced);
     155            handleInsufficientData(b, produced, terminated, kernelEnd, kernel, inputs[i], buffer);
     156        }
     157
     158        b->setKernel(kernel);
     159        b->createDoSegmentCall(args);
     160        b->CreateBr(kernelEnd);
     161
     162        b->SetInsertPoint(kernelEnd);
     163
    156164        if (!kernel->hasNoTerminateAttribute()) {
    157             terminated = iBuilder->CreateOr(terminated, iBuilder->getTerminationSignal());
     165            terminated = b->CreateOr(terminated, b->getTerminationSignal());
    158166        }
    159167
    160168        const auto & outputs = kernel->getStreamOutputs();
    161169        for (unsigned i = 0; i < outputs.size(); ++i) {           
    162             Value * const produced = iBuilder->getProducedItemCount(outputs[i].getName()); // terminated
     170            Value * const produced = b->getProducedItemCount(outputs[i].getName());
    163171            const StreamSetBuffer * const buf = kernel->getStreamSetOutputBuffer(i);
    164             assert (producedPos.count(buf) == 0);
    165             producedPos.emplace(buf, produced);
     172            assert (producedItemCount.count(buf) == 0);
     173            producedItemCount.emplace(buf, produced);
    166174        }
    167175        for (unsigned i = 0; i < inputs.size(); ++i) {
    168             Value * const processedItemCount = iBuilder->getProcessedItemCount(inputs[i].getName());
     176            Value * const processedItemCount = b->getProcessedItemCount(inputs[i].getName());
    169177            const StreamSetBuffer * const buf = kernel->getStreamSetInputBuffer(i);           
    170             auto f = consumedPos.find(buf);
    171             if (f == consumedPos.end()) {
    172                 consumedPos.emplace(buf, processedItemCount);
     178            auto f = consumedItemCount.find(buf);
     179            if (f == consumedItemCount.end()) {
     180                consumedItemCount.emplace(buf, processedItemCount);
    173181            } else {
    174                 Value * lesser = iBuilder->CreateICmpULT(processedItemCount, f->second);
    175                 f->second = iBuilder->CreateSelect(lesser, processedItemCount, f->second);
    176             }
    177         }
     182                assert (f->second);
     183                f->second = b->CreateUMin(processedItemCount, f->second);
     184            }
     185        }
     186
    178187        if (DebugOptionIsSet(codegen::EnableCycleCounter)) {
    179             cycleCountEnd = iBuilder->CreateReadCycleCounter();
    180             Value * counterPtr = iBuilder->getCycleCountPtr();
    181             iBuilder->CreateStore(iBuilder->CreateAdd(iBuilder->CreateLoad(counterPtr), iBuilder->CreateSub(cycleCountEnd, cycleCountStart)), counterPtr);
     188            cycleCountEnd = b->CreateReadCycleCounter();
     189            Value * counterPtr = b->getCycleCountPtr();
     190            b->CreateStore(b->CreateAdd(b->CreateLoad(counterPtr), b->CreateSub(cycleCountEnd, cycleCountStart)), counterPtr);
    182191            cycleCountStart = cycleCountEnd;
    183         }
    184        
    185         iBuilder->releaseLogicalSegmentNo(nextSegNo);
    186     }
    187 
    188     assert (segmentLoopBody);
    189     exitThreadBlock->moveAfter(segmentLoopBody);
    190 
    191     for (const auto consumed : consumedPos) {
     192        }       
     193        b->releaseLogicalSegmentNo(nextSegNo);
     194    }
     195
     196    exitThreadBlock->moveAfter(b->GetInsertBlock());
     197    for (const auto consumed : consumedItemCount) {
    192198        const StreamSetBuffer * const buf = consumed.first;
    193199        Kernel * const k = buf->getProducer();
     
    199205                    continue;
    200206                }
    201                 iBuilder->setKernel(k);
    202                 iBuilder->setConsumedItemCount(binding.getName(), consumed.second);
     207                b->setKernel(k);
     208                b->setConsumedItemCount(binding.getName(), consumed.second);
    203209                break;
    204210            }
     
    206212    }
    207213
    208     segNo->addIncoming(iBuilder->CreateAdd(segNo, iBuilder->getSize(codegen::ThreadNum)), segmentLoopBody);
    209     iBuilder->CreateCondBr(terminated, exitThreadBlock, segmentLoop);
    210 
    211     iBuilder->SetInsertPoint(exitThreadBlock);
     214    segNo->addIncoming(b->CreateAdd(segNo, b->getSize(codegen::ThreadNum)), b->GetInsertBlock());
     215    b->CreateUnlikelyCondBr(terminated, exitThreadBlock, segmentLoop);
     216
     217    b->SetInsertPoint(exitThreadBlock);
    212218
    213219    // only call pthread_exit() within spawned threads; otherwise it'll be equivalent to calling exit() within the process
    214     BasicBlock * const exitThread = BasicBlock::Create(iBuilder->getContext(), "ExitThread", threadFunc);
    215     BasicBlock * const exitFunction = BasicBlock::Create(iBuilder->getContext(), "ExitProcessFunction", threadFunc);
    216 
    217     Value * const exitCond = iBuilder->CreateICmpEQ(segOffset, ConstantInt::getNullValue(segOffset->getType()));
    218     iBuilder->CreateCondBr(exitCond, exitFunction, exitThread);
    219     iBuilder->SetInsertPoint(exitThread);
    220     iBuilder->CreatePThreadExitCall(nullVoidPtrVal);
    221     iBuilder->CreateBr(exitFunction);
    222     iBuilder->SetInsertPoint(exitFunction);
    223     iBuilder->CreateRetVoid();
     220    BasicBlock * const exitThread = BasicBlock::Create(b->getContext(), "ExitThread", threadFunc);
     221    BasicBlock * const exitFunction = BasicBlock::Create(b->getContext(), "ExitProcessFunction", threadFunc);
     222
     223    Value * const exitCond = b->CreateICmpEQ(segOffset, ConstantInt::getNullValue(segOffset->getType()));
     224    b->CreateCondBr(exitCond, exitFunction, exitThread);
     225    b->SetInsertPoint(exitThread);
     226    b->CreatePThreadExitCall(nullVoidPtrVal);
     227    b->CreateBr(exitFunction);
     228    b->SetInsertPoint(exitFunction);
     229    b->CreateRetVoid();
    224230
    225231    // -------------------------------------------------------------------------------------------------------------------------
    226     iBuilder->restoreIP(ip);
     232    b->restoreIP(ip);
    227233
    228234    for (unsigned i = 0; i < n; ++i) {
     
    236242    assert (codegen::ThreadNum > 1);
    237243    Type * const pthreadsTy = ArrayType::get(sizeTy, threads);
    238     AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
     244    AllocaInst * const pthreads = b->CreateAlloca(pthreadsTy);
    239245    Value * threadIdPtr[threads];
    240246
    241247    for (unsigned i = 0; i < threads; ++i) {
    242         threadIdPtr[i] = iBuilder->CreateGEP(pthreads, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
    243     }
    244 
    245     for (unsigned i = 0; i < n; ++i) {
    246         iBuilder->setKernel(kernels[i]);
    247         iBuilder->releaseLogicalSegmentNo(iBuilder->getSize(0));
    248     }
    249 
    250     AllocaInst * const sharedStruct = iBuilder->CreateCacheAlignedAlloca(sharedStructType);
    251     for (unsigned i = 0; i < n; ++i) {
    252         Value * ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
    253         iBuilder->CreateStore(kernels[i]->getInstance(), ptr);
     248        threadIdPtr[i] = b->CreateGEP(pthreads, {b->getInt32(0), b->getInt32(i)});
     249    }
     250
     251    for (unsigned i = 0; i < n; ++i) {
     252        b->setKernel(kernels[i]);
     253        b->releaseLogicalSegmentNo(b->getSize(0));
     254    }
     255
     256    AllocaInst * const sharedStruct = b->CreateCacheAlignedAlloca(sharedStructType);
     257    for (unsigned i = 0; i < n; ++i) {
     258        Value * ptr = b->CreateGEP(sharedStruct, {b->getInt32(0), b->getInt32(i)});
     259        b->CreateStore(kernels[i]->getInstance(), ptr);
    254260    }
    255261
    256262    // use the process thread to handle the initial segment function after spawning (n - 1) threads to handle the subsequent offsets
    257263    for (unsigned i = 0; i < threads; ++i) {
    258         AllocaInst * const threadState = iBuilder->CreateAlloca(threadStructType);
    259         iBuilder->CreateStore(sharedStruct, iBuilder->CreateGEP(threadState, {iBuilder->getInt32(0), iBuilder->getInt32(0)}));
    260         iBuilder->CreateStore(iBuilder->getSize(i + 1), iBuilder->CreateGEP(threadState, {iBuilder->getInt32(0), iBuilder->getInt32(1)}));
    261         iBuilder->CreatePThreadCreateCall(threadIdPtr[i], nullVoidPtrVal, threadFunc, threadState);
    262     }
    263 
    264     AllocaInst * const threadState = iBuilder->CreateAlloca(threadStructType);
    265     iBuilder->CreateStore(sharedStruct, iBuilder->CreateGEP(threadState, {iBuilder->getInt32(0), iBuilder->getInt32(0)}));
    266     iBuilder->CreateStore(iBuilder->getSize(0), iBuilder->CreateGEP(threadState, {iBuilder->getInt32(0), iBuilder->getInt32(1)}));
    267     iBuilder->CreateCall(threadFunc, iBuilder->CreatePointerCast(threadState, voidPtrTy));
    268 
    269     AllocaInst * const status = iBuilder->CreateAlloca(voidPtrTy);
     264        AllocaInst * const threadState = b->CreateAlloca(threadStructType);
     265        b->CreateStore(sharedStruct, b->CreateGEP(threadState, {b->getInt32(0), b->getInt32(0)}));
     266        b->CreateStore(b->getSize(i + 1), b->CreateGEP(threadState, {b->getInt32(0), b->getInt32(1)}));
     267        b->CreatePThreadCreateCall(threadIdPtr[i], nullVoidPtrVal, threadFunc, threadState);
     268    }
     269
     270    AllocaInst * const threadState = b->CreateAlloca(threadStructType);
     271    b->CreateStore(sharedStruct, b->CreateGEP(threadState, {b->getInt32(0), b->getInt32(0)}));
     272    b->CreateStore(b->getSize(0), b->CreateGEP(threadState, {b->getInt32(0), b->getInt32(1)}));
     273    b->CreateCall(threadFunc, b->CreatePointerCast(threadState, voidPtrTy));
     274
     275    AllocaInst * const status = b->CreateAlloca(voidPtrTy);
    270276    for (unsigned i = 0; i < threads; ++i) {
    271         Value * threadId = iBuilder->CreateLoad(threadIdPtr[i]);
    272         iBuilder->CreatePThreadJoinCall(threadId, status);
     277        Value * threadId = b->CreateLoad(threadIdPtr[i]);
     278        b->CreatePThreadJoinCall(threadId, status);
    273279    }
    274280   
     
    276282        for (unsigned k = 0; k < kernels.size(); k++) {
    277283            auto & kernel = kernels[k];
    278             iBuilder->setKernel(kernel);
     284            b->setKernel(kernel);
    279285            const auto & inputs = kernel->getStreamInputs();
    280286            const auto & outputs = kernel->getStreamOutputs();
    281287            Value * items = nullptr;
    282288            if (inputs.empty()) {
    283                 items = iBuilder->getProducedItemCount(outputs[0].getName());
     289                items = b->getProducedItemCount(outputs[0].getName());
    284290            } else {
    285                 items = iBuilder->getProcessedItemCount(inputs[0].getName());
    286             }
    287             Value * fItems = iBuilder->CreateUIToFP(items, iBuilder->getDoubleTy());
    288             Value * cycles = iBuilder->CreateLoad(iBuilder->getCycleCountPtr());
    289             Value * fCycles = iBuilder->CreateUIToFP(cycles, iBuilder->getDoubleTy());
     291                items = b->getProcessedItemCount(inputs[0].getName());
     292            }
     293            Value * fItems = b->CreateUIToFP(items, b->getDoubleTy());
     294            Value * cycles = b->CreateLoad(b->getCycleCountPtr());
     295            Value * fCycles = b->CreateUIToFP(cycles, b->getDoubleTy());
    290296            const auto formatString = kernel->getName() + ": %7.2e items processed; %7.2e CPU cycles,  %6.2f cycles per item.\n";
    291             Value * stringPtr = iBuilder->CreatePointerCast(iBuilder->GetString(formatString), iBuilder->getInt8PtrTy());
    292             iBuilder->CreateCall(iBuilder->GetDprintf(), {iBuilder->getInt32(2), stringPtr, fItems, fCycles, iBuilder->CreateFDiv(fCycles, fItems)});
     297            Value * stringPtr = b->CreatePointerCast(b->GetString(formatString), b->getInt8PtrTy());
     298            b->CreateCall(b->GetDprintf(), {b->getInt32(2), stringPtr, fItems, fCycles, b->CreateFDiv(fCycles, fItems)});
    293299        }
    294300    }
     
    502508 * @brief generatePipelineLoop
    503509 ** ------------------------------------------------------------------------------------------------------------- */
    504 void generatePipelineLoop(const std::unique_ptr<KernelBuilder> & iBuilder, const std::vector<Kernel *> & kernels) {
    505 
    506     BasicBlock * entryBlock = iBuilder->GetInsertBlock();
     510void generatePipelineLoop(const std::unique_ptr<KernelBuilder> & b, const std::vector<Kernel *> & kernels) {
     511
     512    BasicBlock * entryBlock = b->GetInsertBlock();
    507513    Function * main = entryBlock->getParent();
    508514
    509515    // Create the basic blocks for the loop.
    510     BasicBlock * pipelineLoop = BasicBlock::Create(iBuilder->getContext(), "pipelineLoop", main);
    511     BasicBlock * pipelineExit = BasicBlock::Create(iBuilder->getContext(), "pipelineExit", main);
    512 
    513     StreamSetBufferMap<Value *> producedPos;
    514     StreamSetBufferMap<Value *> consumedPos;
    515 
    516     iBuilder->CreateBr(pipelineLoop);
    517     iBuilder->SetInsertPoint(pipelineLoop);
     516    BasicBlock * pipelineLoop = BasicBlock::Create(b->getContext(), "pipelineLoop", main);
     517    BasicBlock * pipelineExit = BasicBlock::Create(b->getContext(), "pipelineExit", main);
     518
     519    StreamSetBufferMap<Value *> producedItemCount;
     520    StreamSetBufferMap<Value *> consumedItemCount;
     521
     522    b->CreateBr(pipelineLoop);
     523    b->SetInsertPoint(pipelineLoop);
    518524   
    519525    Value * cycleCountStart = nullptr;
    520526    Value * cycleCountEnd = nullptr;
    521527    if (DebugOptionIsSet(codegen::EnableCycleCounter)) {
    522         cycleCountStart = iBuilder->CreateReadCycleCounter();
    523     }
    524     Value * terminated = iBuilder->getFalse();
     528        cycleCountStart = b->CreateReadCycleCounter();
     529    }
     530    Value * terminated = b->getFalse();
    525531
    526532    for (Kernel * const kernel : kernels) {
    527533
    528         iBuilder->setKernel(kernel);
     534        b->setKernel(kernel);
    529535        const auto & inputs = kernel->getStreamInputs();
    530536        const auto & outputs = kernel->getStreamOutputs();
     
    533539
    534540        for (unsigned i = 0; i < inputs.size(); ++i) {
    535             const auto f = producedPos.find(kernel->getStreamSetInputBuffer(i));
    536             if (LLVM_UNLIKELY(f == producedPos.end())) {
     541            const StreamSetBuffer * const buffer = kernel->getStreamSetInputBuffer(i);
     542            const auto f = producedItemCount.find(buffer);
     543            if (LLVM_UNLIKELY(f == producedItemCount.end())) {
    537544                report_fatal_error(kernel->getName() + " uses stream set " + inputs[i].getName() + " prior to its definition");
    538545            }
    539             args.push_back(f->second);
    540         }
    541 
    542         applyOutputBufferExpansions(iBuilder, kernel);
    543 
    544         iBuilder->createDoSegmentCall(args);
     546            Value * const produced = f->second;
     547            args.push_back(produced);
     548            handleInsufficientData(b, produced, terminated, pipelineLoop, kernel, inputs[i], buffer);
     549        }
     550
     551        applyOutputBufferExpansions(b, kernel);
     552
     553        b->createDoSegmentCall(args);
    545554
    546555        if (!kernel->hasNoTerminateAttribute()) {
    547             Value * terminatedSignal = iBuilder->getTerminationSignal();
    548             terminated = iBuilder->CreateOr(terminated, terminatedSignal);
     556            Value * terminatedSignal = b->getTerminationSignal();
     557            terminated = b->CreateOr(terminated, terminatedSignal);
    549558        }
    550559        for (unsigned i = 0; i < outputs.size(); ++i) {
    551             Value * const produced = iBuilder->getProducedItemCount(outputs[i].getName()); // , terminated
     560            Value * const produced = b->getProducedItemCount(outputs[i].getName());
    552561            const StreamSetBuffer * const buf = kernel->getStreamSetOutputBuffer(i);
    553             assert (producedPos.count(buf) == 0);
    554             producedPos.emplace(buf, produced);
     562            assert (producedItemCount.count(buf) == 0);
     563            producedItemCount.emplace(buf, produced);
    555564        }
    556565
    557566        for (unsigned i = 0; i < inputs.size(); ++i) {
    558             Value * const processed = iBuilder->getProcessedItemCount(inputs[i].getName());
     567            Value * const processed = b->getProcessedItemCount(inputs[i].getName());
    559568            const StreamSetBuffer * const buf = kernel->getStreamSetInputBuffer(i);
    560             auto f = consumedPos.find(buf);
    561             if (f == consumedPos.end()) {
    562                 consumedPos.emplace(buf, processed);
     569            auto f = consumedItemCount.find(buf);
     570            if (f == consumedItemCount.end()) {
     571                consumedItemCount.emplace(buf, processed);
    563572            } else {
    564                 Value * lesser = iBuilder->CreateICmpULT(processed, f->second);
    565                 f->second = iBuilder->CreateSelect(lesser, processed, f->second);
    566             }
    567         }
     573                f->second = b->CreateUMin(processed, f->second);
     574            }
     575        }
     576
    568577        if (DebugOptionIsSet(codegen::EnableCycleCounter)) {
    569             cycleCountEnd = iBuilder->CreateReadCycleCounter();
    570             Value * counterPtr = iBuilder->getCycleCountPtr();
    571             iBuilder->CreateStore(iBuilder->CreateAdd(iBuilder->CreateLoad(counterPtr), iBuilder->CreateSub(cycleCountEnd, cycleCountStart)), counterPtr);
     578            cycleCountEnd = b->CreateReadCycleCounter();
     579            Value * counterPtr = b->getCycleCountPtr();
     580            b->CreateStore(b->CreateAdd(b->CreateLoad(counterPtr), b->CreateSub(cycleCountEnd, cycleCountStart)), counterPtr);
    572581            cycleCountStart = cycleCountEnd;
    573582        }
    574 
    575         Value * const segNo = iBuilder->acquireLogicalSegmentNo();
    576         Value * nextSegNo = iBuilder->CreateAdd(segNo, iBuilder->getSize(1));
    577         iBuilder->releaseLogicalSegmentNo(nextSegNo);
    578     }
    579 
    580     for (const auto consumed : consumedPos) {
    581         const StreamSetBuffer * const buf = consumed.first;
    582         Kernel * const k = buf->getProducer();
    583         const auto & outputs = k->getStreamSetOutputBuffers();
    584         for (unsigned i = 0; i < outputs.size(); ++i) {
    585             if (outputs[i] == buf) {
    586                 const auto & binding = k->getStreamOutput(i);
    587                 if (LLVM_UNLIKELY(binding.getRate().isDerived())) {
    588                     continue;
    589                 }
    590                 iBuilder->setKernel(k);
    591                 iBuilder->setConsumedItemCount(binding.getName(), consumed.second);
    592                 break;
    593             }
    594         }
    595     }
    596 
    597     iBuilder->CreateCondBr(terminated, pipelineExit, pipelineLoop);
    598 
    599     iBuilder->SetInsertPoint(pipelineExit);
     583//        Value * const segNo = b->acquireLogicalSegmentNo();
     584//        Value * nextSegNo = b->CreateAdd(segNo, b->getSize(1));
     585//        b->releaseLogicalSegmentNo(nextSegNo);
     586    }
     587
     588    for (const auto consumed : consumedItemCount) {
     589        const StreamSetBuffer * const buffer = consumed.first;
     590        Kernel * const kernel = buffer->getProducer();
     591        const auto & binding = kernel->getStreamOutput(buffer);
     592        if (LLVM_UNLIKELY(binding.getRate().isDerived())) {
     593            continue;
     594        }
     595        b->setKernel(kernel);
     596        b->setConsumedItemCount(binding.getName(), consumed.second);
     597    }
     598
     599    b->CreateCondBr(terminated, pipelineExit, pipelineLoop);
     600
     601    b->SetInsertPoint(pipelineExit);
    600602
    601603    if (DebugOptionIsSet(codegen::EnableCycleCounter)) {
    602604        for (unsigned k = 0; k < kernels.size(); k++) {
    603605            auto & kernel = kernels[k];
    604             iBuilder->setKernel(kernel);
     606            b->setKernel(kernel);
    605607            const auto & inputs = kernel->getStreamInputs();
    606608            const auto & outputs = kernel->getStreamOutputs();
    607609            Value * items = nullptr;
    608610            if (inputs.empty()) {
    609                 items = iBuilder->getProducedItemCount(outputs[0].getName());
     611                items = b->getProducedItemCount(outputs[0].getName());
    610612            } else {
    611                 items = iBuilder->getProcessedItemCount(inputs[0].getName());
    612             }
    613             Value * fItems = iBuilder->CreateUIToFP(items, iBuilder->getDoubleTy());
    614             Value * cycles = iBuilder->CreateLoad(iBuilder->getCycleCountPtr());
    615             Value * fCycles = iBuilder->CreateUIToFP(cycles, iBuilder->getDoubleTy());
     613                items = b->getProcessedItemCount(inputs[0].getName());
     614            }
     615            Value * fItems = b->CreateUIToFP(items, b->getDoubleTy());
     616            Value * cycles = b->CreateLoad(b->getCycleCountPtr());
     617            Value * fCycles = b->CreateUIToFP(cycles, b->getDoubleTy());
    616618            const auto formatString = kernel->getName() + ": %7.2e items processed; %7.2e CPU cycles,  %6.2f cycles per item.\n";
    617             Value * stringPtr = iBuilder->CreatePointerCast(iBuilder->GetString(formatString), iBuilder->getInt8PtrTy());
    618             iBuilder->CreateCall(iBuilder->GetDprintf(), {iBuilder->getInt32(2), stringPtr, fItems, fCycles, iBuilder->CreateFDiv(fCycles, fItems)});
     619            Value * stringPtr = b->CreatePointerCast(b->GetString(formatString), b->getInt8PtrTy());
     620            b->CreateCall(b->GetDprintf(), {b->getInt32(2), stringPtr, fItems, fCycles, b->CreateFDiv(fCycles, fItems)});
    619621        }
    620622    }
    621623}
    622624
     625/** ------------------------------------------------------------------------------------------------------------- *
     626 * @brief applyOutputBufferExpansions
     627 ** ------------------------------------------------------------------------------------------------------------- */
    623628void applyOutputBufferExpansions(const std::unique_ptr<KernelBuilder> & b, const std::string & name, DynamicBuffer * const db, const uint64_t baseSize) {
    624 
    625     BasicBlock * const doExpand = b->CreateBasicBlock(name + "Expand");
     629    BasicBlock * const doExpand = BasicBlock::Create(b->getContext(), name + "Expand", b->GetInsertBlock()->getParent());
    626630    BasicBlock * const nextBlock = b->GetInsertBlock()->getNextNode();
    627631    doExpand->moveAfter(b->GetInsertBlock());
     
    659663    }
    660664}
     665
     666/** ------------------------------------------------------------------------------------------------------------- *
     667 * @brief handleInsufficientData
         *
         * Conditionally emits a run-time guard at the builder's current insert point. A guard is generated only when
         * the rate computed for the producer of `buffer` (lower bound of its output rate times its stride, minus the
         * input's lookahead amount if it has one) is smaller than the rate computed for the consumer (upper bound of
         * the input rate times the consumer's stride). When no guard is needed, nothing is emitted and the insert
         * point is left untouched.
         *
         * The guard branches to a new "<input name>IsSufficient" block when (produced - processed) >= ceiling(consumer
         * rate) or when `final` is set, and to `insufficient` otherwise. On return the insert point is the new block.
         *
         * @param b             builder used to emit the instructions and to read the processed item count of the input
         * @param produced      item count from which the input's processed item count is subtracted
         * @param final         value OR'ed into the branch condition; when set, the sufficient path is always taken
         * @param insufficient  block branched to when the condition does not hold
         * @param consumer      kernel whose stride and upper-bound input rate define the required amount
         * @param input         the consumer's input binding (supplies the name, rate and lookahead)
         * @param buffer        stream set buffer whose producer and output binding are looked up
         *
         * NOTE(review): getProcessedItemCount(name) is called on `b` without setting a kernel here; presumably the
         * caller has already made `consumer` the builder's current kernel -- confirm.
     668 ** ------------------------------------------------------------------------------------------------------------- */
     669inline void handleInsufficientData(const std::unique_ptr<KernelBuilder> & b, Value * const produced, Value * const final, BasicBlock * const insufficient,
     670                                   const Kernel * const consumer,  const Binding & input, const StreamSetBuffer * const buffer) {
     671    const Kernel * const producer = buffer->getProducer();
     672    const Binding & output = producer->getStreamOutput(buffer);
     673    auto producedRate = producer->getLowerBound(output.getRate()) * producer->getStride(); // producer side: lower-bound rate scaled by stride
     674    const auto consumedRate = consumer->getUpperBound(input.getRate()) * consumer->getStride(); // consumer side: upper-bound rate scaled by stride
     675    if (LLVM_UNLIKELY(input.hasLookahead())) {
     676        producedRate -= input.getLookahead(); // discount the lookahead amount from the producer-side figure
     677//        const auto amount = input.getLookahead();
     678//        const auto strides = ((amount + consumer->getStride() - 1) / consumer->getStride());
     679//        consumedRate += strides * consumer->getStride();
     680    }
     681    if (LLVM_UNLIKELY(producedRate < consumedRate)) { // decided while generating code; otherwise no check is emitted
     682        const auto name = input.getName();
     683        BasicBlock * const sufficient = BasicBlock::Create(b->getContext(), name + "IsSufficient", b->GetInsertBlock()->getParent());
     684        Value * const processed = b->getProcessedItemCount(name);
     685        Value * const unread = b->CreateSub(produced, processed); // items produced but not yet processed by this input
     686        Constant * const amount = ConstantInt::get(unread->getType(), ceiling(consumedRate));
     687        Value * const cond = b->CreateOr(b->CreateICmpUGE(unread, amount), final); // enough unread items, or `final` is set
     688        b->CreateLikelyCondBr(cond, sufficient, insufficient);
     689        b->SetInsertPoint(sufficient); // code emitted after this call lands in the sufficient block
     690    }
     691}
     692
Note: See TracChangeset for help on using the changeset viewer.