Ignore:
Timestamp:
Jan 17, 2017, 12:00:43 PM (3 years ago)
Author:
cameron
Message:

New doSegment partial progress

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/pipeline.cpp

    r5260 r5263  
    99#include <kernels/interface.h>
    1010#include <kernels/kernel.h>
    11 #include <kernels/s2p_kernel.h>
    1211#include <iostream>
    1312#include <unordered_map>
     
    1716using namespace llvm;
    1817
    19 #if 0
    20 
    21 using BufferMap = std::unordered_map<StreamSetBuffer *, std::pair<KernelBuilder *, unsigned>>;
    22 
    23 static void createStreamBufferMap(BufferMap & bufferMap, const std::vector<KernelBuilder *> & kernels) {
    24     for (auto k: kernels) {
    25         auto outputSets = k->getStreamSetOutputBuffers();
    26         for (unsigned i = 0; i < outputSets.size(); i++) {
    27             bufferMap.insert(std::make_pair(outputSets[i], std::make_pair(k, i)));
    28         }
    29     }
    30     for (auto k: kernels) {
    31         auto inputSets = k->getStreamSetInputBuffers();
     18using ProducerTable = std::vector<std::vector<std::pair<unsigned, unsigned>>>;
     19
     // createProducerTable: for every kernel k, records one (producer kernel index,
     // producer output index) pair per stream-set input buffer of k, looked up through
     // a map keyed by the output buffers of all kernels.
     //
     // Returns the table by value.  Reports a fatal error through
     // llvm::report_fatal_error when
     //   * an input buffer has no matching output buffer,
     //   * the producing kernel is not strictly earlier in `kernels` than the consumer,
     //   * an output buffer is never consumed by any kernel.
     //
     // NOTE(review): in this changeset rendering, lines whose gutter carries only the
     // left-hand (old) number appear to be the text removed from r5260; they are
     // interleaved with the new function body and are not part of it.
     20ProducerTable createProducerTable(const std::vector<KernelBuilder *> & kernels) {
     21    ProducerTable producerTable;
           // NOTE(review): reserve() only raises capacity; size() of producerTable and
           // userTable stays 0.  Nothing in this function adds outer elements, yet
           // userTable[k] and producerTable[k] are indexed below, which is out-of-range
           // access.  This looks like it should be resize(kernels.size()) (or one
           // push_back of an empty row per kernel) -- confirm against later revisions.
     22    producerTable.reserve(kernels.size());
     23   
     24    std::vector<std::vector<bool>> userTable;
     25    userTable.reserve(kernels.size());
     26   
     27    // First prepare a map from streamSet output buffers to their producing kernel and output index.
     28    std::unordered_map<const StreamSetBuffer *, std::pair<unsigned, unsigned>> bufferMap;
     29   
     30    for (unsigned k = 0; k < kernels.size(); k++) {
     31        auto outputSets = kernels[k]->getStreamSetOutputBuffers();
     32        for (unsigned j = 0; j < outputSets.size(); j++) {
               // One "has a consumer" flag per output of kernel k, initially false.
     33            userTable[k].push_back(false);
     34            bufferMap.insert(std::make_pair(outputSets[j], std::make_pair(k, j)));
     35        }
     36    }
           // Second pass: resolve every input buffer of every kernel to its producer.
     37    for (unsigned k = 0; k < kernels.size(); k++) {
     38        auto inputSets = kernels[k]->getStreamSetInputBuffers();
    3239        for (unsigned i = 0; i < inputSets.size(); i++) {
    33             if (bufferMap.find(inputSets[i]) == bufferMap.end()) {
    34                 llvm::report_fatal_error("Pipeline error: input buffer #" + std::to_string(i) + " of " + k->getName() + ": no corresponding output buffer. ");
     40            auto f = bufferMap.find(inputSets[i]);
     41            if (f == bufferMap.end()) {
     42                llvm::report_fatal_error("Pipeline error: input buffer #" + std::to_string(i) + " of " + kernels[k]->getName() + ": no corresponding output buffer. ");
    3543            }
    36         }
    37     }
    38 }
    39 
    40 static Value * getSegmentBlocks(BufferMap & bufferMap, KernelBuilder * kernel) {
    41     IDISA::IDISA_Builder * iBuilder = kernel->getBuilder();
    42     std::cerr << "getSegmentBlocks\n";
    43 
    44     KernelBuilder * sourceKernel;
    45 
    46     unsigned outputIndex;
    47     auto inputs = kernel->getStreamSetInputBuffers();
    48     if (inputs.empty()) return iBuilder->getSize(codegen::SegmentSize * iBuilder->getStride() / iBuilder->getBitBlockWidth());
    49     std::string inputSetName = kernel->getStreamInputs()[0].name;
    50     std::cerr << "inputSetName = " << inputSetName << "\n";
    51     auto f = bufferMap.find(inputs[0]);
    52     assert(f != bufferMap.end()  && "bufferMap failure");
    53     std::tie(sourceKernel, outputIndex) = f->second;
    54     std::cerr << "outputIndex = " << outputIndex << "\n";
    55     Value * produced = sourceKernel->getProducedItemCount(sourceKernel->getInstance(), sourceKernel->getStreamOutputs()[outputIndex].name);
    56     iBuilder->CallPrintInt("produced", produced);
    57     Value * processed = kernel->getProcessedItemCount(kernel->getInstance(), inputSetName);
    58     iBuilder->CallPrintInt("processed", processed);
    59     Value * itemsToDo = iBuilder->CreateSub(produced, processed);
    60     return iBuilder->CreateUDiv(itemsToDo, iBuilder->getSize(iBuilder->getStride()));
    61 }
    62 
    63 #endif
    64 
    65 Function * generateSegmentParallelPipelineThreadFunction(std::string name, IDISA::IDISA_Builder * iBuilder, const std::vector<KernelBuilder *> & kernels, Type * sharedStructType, int id) {
    66 
               // Row k of the result gets the (producer kernel, output index) pair for
               // input i, in input order.
     44            producerTable[k].push_back(f->second);
     45            unsigned sourceKernel, outputIndex;
     46            std::tie(sourceKernel, outputIndex) = f->second;
               // A producer must come strictly before its consumer in `kernels`; a kernel
               // consuming its own output (sourceKernel == k) is rejected as well.
     47            if (sourceKernel >= k) {
     48                llvm::report_fatal_error("Pipeline error: input buffer #" + std::to_string(i) + " of " + kernels[k]->getName() + ": not defined before use. ");
     49            }
     50            //errs() << "sourceKernel: " + std::to_string(sourceKernel) + ", outputIndex: " + std::to_string(outputIndex) + ", user: " + std::to_string(k) + "\n";
     51            userTable[sourceKernel][outputIndex]= true;
     52           
     53        }
     54    }
           // Final pass: every output buffer must have been marked as used by some input.
     55    for (unsigned k = 0; k < kernels.size(); k++) {
     56        auto outputSets = kernels[k]->getStreamSetOutputBuffers();
     57        //errs() << "kernel: " + kernels[k]->getName() + "\n";
     58        for (unsigned j = 0; j < outputSets.size(); j++) {
     59            if (userTable[k][j] == false) {
     60                llvm::report_fatal_error("Pipeline error: output buffer #" + std::to_string(j) + " of " + kernels[k]->getName() + ": no users. ");
     61            }
     62        }
     63    }
     64    return producerTable;
     65}
     66
     67
     68Function * generateSegmentParallelPipelineThreadFunction(std::string name, IDISA::IDISA_Builder * iBuilder, const std::vector<KernelBuilder *> & kernels, Type * sharedStructType, ProducerTable & producerTable, int id) {
     69   
     70    // ProducerPos[k][i] will hold the producedItemCount of the i^th output stream
     71    // set of the k^th kernel.  These values will be loaded immediately after the
     72    // doSegment and finalSegment calls for kernel k and later used as the
     73    // producer position arguments for later doSegment/finalSegment calls.
     74   
     75    std::vector<std::vector<Value *>> ProducerPos;
     76   
     77   
     78    const auto ip = iBuilder->saveIP();
     79   
    6780    Module * m = iBuilder->getModule();
    6881    Type * const size_ty = iBuilder->getSizeTy();
     
    87100    std::vector<BasicBlock *> segmentWait;
    88101    std::vector<BasicBlock *> segmentLoopBody;
    89     std::vector<BasicBlock *> partialSegmentWait;
    90     std::vector<BasicBlock *> partialSegmentLoopBody;
    91     bool terminationSignalEncountered = false;
    92102    for (unsigned i = 0; i < kernels.size(); i++) {
    93103        std::string kname = kernels[i]->getName();
    94104        segmentWait.push_back(BasicBlock::Create(iBuilder->getContext(), kname + "Wait", threadFunc, 0));
    95105        segmentLoopBody.push_back(BasicBlock::Create(iBuilder->getContext(), "do_" + kname, threadFunc, 0));
    96         if (terminationSignalEncountered) {
    97             partialSegmentWait.push_back(BasicBlock::Create(iBuilder->getContext(), kname + "WaitFinal", threadFunc, 0));
    98             partialSegmentLoopBody.push_back(BasicBlock::Create(iBuilder->getContext(), "finish_" + kname, threadFunc, 0));
    99         }
    100         else {
    101             partialSegmentWait.push_back(nullptr);
    102             partialSegmentLoopBody.push_back(nullptr);
    103             terminationSignalEncountered = kernels[i]->hasNoTerminateAttribute() == false;
    104         }
    105     }
    106     segmentWait.push_back(segmentLoop); // If the last kernel does not terminate, loop back.
    107     partialSegmentWait.push_back(exitThreadBlock); // After the last kernel terminates, we're done.
     106    }
    108107
    109108    iBuilder->SetInsertPoint(entryBlock);
     
    116115    }
    117116   
    118     // Some important constant values.
    119     int segmentSize = codegen::SegmentSize;
    120     Constant * segmentBlocks = ConstantInt::get(size_ty, segmentSize);
    121117    iBuilder->CreateBr(segmentLoop);
    122118
     
    128124    Value * alreadyDone = kernels[last_kernel]->getTerminationSignal(instancePtrs[last_kernel]);
    129125    iBuilder->CreateCondBr(alreadyDone, exitThreadBlock, segmentWait[0]);
    130 
    131    
    132    
    133     for (unsigned i = 0; i < kernels.size(); i++) {
    134         iBuilder->SetInsertPoint(segmentWait[i]);
    135         Value * processedSegmentCount = kernels[i]->acquireLogicalSegmentNo(instancePtrs[i]);
     126   
     127    Value * doFinal = ConstantInt::getNullValue(iBuilder->getInt1Ty());
     128
     129    for (unsigned k = 0; k < kernels.size(); k++) {
     130        iBuilder->SetInsertPoint(segmentWait[k]);
     131        Value * processedSegmentCount = kernels[k]->acquireLogicalSegmentNo(instancePtrs[k]);
    136132        Value * cond = iBuilder->CreateICmpEQ(segNo, processedSegmentCount);
    137         iBuilder->CreateCondBr(cond, segmentLoopBody[i], segmentWait[i]);
    138 
    139         iBuilder->SetInsertPoint(segmentLoopBody[i]);
    140         if (i == last_kernel) {
     133        iBuilder->CreateCondBr(cond, segmentLoopBody[k], segmentWait[k]);
     134       
     135        iBuilder->SetInsertPoint(segmentLoopBody[k]);
     136        if (k == last_kernel) {
    141137            segNo->addIncoming(iBuilder->CreateAdd(segNo, ConstantInt::get(size_ty, threadNum)), segmentLoopBody[last_kernel]);
    142138        }
    143         kernels[i]->createDoSegmentCall(instancePtrs[i], segmentBlocks);
    144         if (kernels[i]->hasNoTerminateAttribute()) {
    145             kernels[i]->releaseLogicalSegmentNo(instancePtrs[i], nextSegNo);
    146             iBuilder->CreateBr(segmentWait[i+1]);
     139       
     140       
     141       
     142       
     143        std::vector<Value *> doSegmentArgs = {instancePtrs[k], doFinal};
     144        for (unsigned j = 0; j < kernels[k]->getStreamInputs().size(); j++) {
     145            unsigned producerKernel, outputIndex;
     146            std::tie(producerKernel, outputIndex) = producerTable[k][j];
     147            doSegmentArgs.push_back(ProducerPos[producerKernel][outputIndex]);
     148        }
     149        kernels[k]->createDoSegmentCall(doSegmentArgs);
     150        std::vector<Value *> produced;
     151        for (unsigned i = 0; i < kernels[k]->getStreamOutputs().size(); i++) {
     152            produced.push_back(kernels[k]->getProducedItemCount(instancePtrs[k], kernels[k]->getStreamOutputs()[i].name));
     153        }
     154        ProducerPos.push_back(produced);
     155        if (! (kernels[k]->hasNoTerminateAttribute())) {
     156            Value * terminated = kernels[k]->getTerminationSignal(instancePtrs[k]);
     157            doFinal = iBuilder->CreateOr(doFinal, terminated);
     158        }
     159        kernels[k]->releaseLogicalSegmentNo(instancePtrs[k], nextSegNo);
     160        if (k == last_kernel) {
     161            iBuilder->CreateCondBr(doFinal, exitThreadBlock, segmentLoop);
    147162        }
    148163        else {
    149             Value * terminated = kernels[i]->getTerminationSignal(instancePtrs[i]);
    150             kernels[i]->releaseLogicalSegmentNo(instancePtrs[i], nextSegNo);
    151             iBuilder->CreateCondBr(terminated, partialSegmentWait[i+1], segmentWait[i+1]);
    152         }
    153         if (partialSegmentWait[i] != nullptr) {
    154             iBuilder->SetInsertPoint(partialSegmentWait[i]);
    155             Value * processedSegmentCount = kernels[i]->acquireLogicalSegmentNo(instancePtrs[i]);
    156             Value * cond = iBuilder->CreateICmpEQ(segNo, processedSegmentCount);
    157             iBuilder->CreateCondBr(cond, partialSegmentLoopBody[i], partialSegmentWait[i]);
    158            
    159             iBuilder->SetInsertPoint(partialSegmentLoopBody[i]);
    160             kernels[i]->createFinalSegmentCall(instancePtrs[i], segmentBlocks);
    161             kernels[i]->releaseLogicalSegmentNo(instancePtrs[i], nextSegNo);
    162             iBuilder->CreateBr(partialSegmentWait[i+1]);
     164            iBuilder->CreateBr(segmentWait[k+1]);
    163165        }
    164166    }
     
    168170    iBuilder->CreatePThreadExitCall(nullVal);
    169171    iBuilder->CreateRetVoid();
     172    iBuilder->restoreIP(ip);
    170173
    171174    return threadFunc;
     
    182185   
    183186    unsigned threadNum = codegen::ThreadNum;
    184 
     187   
    185188    Module * m = iBuilder->getModule();
    186 
     189   
    187190    Type * const size_ty = iBuilder->getSizeTy();
    188191    Type * const voidPtrTy = iBuilder->getVoidPtrTy();
    189192    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
    190 
     193   
    191194    for (auto k : kernels) k->createInstance();
    192 
     195   
     196    ProducerTable producerTable = createProducerTable(kernels);
     197   
    193198    Type * const pthreadsTy = ArrayType::get(size_ty, threadNum);
    194199    AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
     
    199204    Value * nullVal = Constant::getNullValue(voidPtrTy);
    200205    AllocaInst * const status = iBuilder->CreateAlloca(int8PtrTy);
    201 
     206   
    202207    std::vector<Type *> structTypes;
    203208    for (unsigned i = 0; i < kernels.size(); i++) {
     
    205210    }
    206211    Type * sharedStructType = StructType::get(m->getContext(), structTypes);
    207 
     212   
    208213    AllocaInst * sharedStruct = iBuilder->CreateAlloca(sharedStructType);
    209214    for (unsigned i = 0; i < kernels.size(); i++) {
     
    211216        iBuilder->CreateStore(kernels[i]->getInstance(), ptr);
    212217    }
    213 
     218   
    214219    std::vector<Function *> thread_functions;
    215220    const auto ip = iBuilder->saveIP();
    216221    for (unsigned i = 0; i < threadNum; i++) {
    217         thread_functions.push_back(generateSegmentParallelPipelineThreadFunction("thread"+std::to_string(i), iBuilder, kernels, sharedStructType, i));
     222        thread_functions.push_back(generateSegmentParallelPipelineThreadFunction("thread"+std::to_string(i), iBuilder, kernels, sharedStructType, producerTable, i));
    218223    }
    219224    iBuilder->restoreIP(ip);
    220 
     225   
    221226    for (unsigned i = 0; i < threadNum; i++) {
    222227        iBuilder->CreatePThreadCreateCall(pthreadsPtrs[i], nullVal, thread_functions[i], iBuilder->CreateBitCast(sharedStruct, int8PtrTy));
    223228    }
    224 
     229   
    225230    std::vector<Value *> threadIDs;
    226231    for (unsigned i = 0; i < threadNum; i++) {
     
    231236        iBuilder->CreatePThreadJoinCall(threadIDs[i], status);
    232237    }
    233 
     238   
    234239}
    235240
    236241void generatePipelineParallel(IDISA::IDISA_Builder * iBuilder, const std::vector<KernelBuilder *> & kernels) {
    237  
     242    
    238243    Type * pthreadTy = iBuilder->getSizeTy();
    239244    Type * const voidPtrTy = iBuilder->getVoidPtrTy();
    240245    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
    241 
     246   
    242247    Type * const pthreadsTy = ArrayType::get(pthreadTy, kernels.size());
    243 
     248   
    244249    for (auto k : kernels) k->createInstance();
    245 
     250   
    246251    AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
    247252    std::vector<Value *> pthreadsPtrs;
     
    251256    Value * nullVal = Constant::getNullValue(voidPtrTy);
    252257    AllocaInst * const status = iBuilder->CreateAlloca(int8PtrTy);
    253 
     258   
    254259    std::vector<Function *> kernel_functions;
    255260    const auto ip = iBuilder->saveIP();
     
    258263    }
    259264    iBuilder->restoreIP(ip);
    260 
     265   
    261266    for (unsigned i = 0; i < kernels.size(); i++) {
    262267        iBuilder->CreatePThreadCreateCall(pthreadsPtrs[i], nullVal, kernel_functions[i], iBuilder->CreateBitCast(kernels[i]->getInstance(), int8PtrTy));
    263268    }
    264 
     269   
    265270    std::vector<Value *> threadIDs;
    266271    for (unsigned i = 0; i < kernels.size(); i++) {
     
    // generatePipelineLoop: emits, at iBuilder's current insertion block, a loop that
    // calls every kernel's doSegment once per iteration, in the order given by
    // `kernels`.  Every kernel gets createInstance() called on it first.  On return the
    // builder's insertion point is the newly created "exitBlock".
    //
    // NOTE(review): this is a changeset rendering.  Lines whose gutter carries only the
    // left-hand (old) number appear to be the removed r5260 body (per-kernel "do_" /
    // "finish_" blocks and createFinalSegmentCall); lines with only the right-hand
    // number are the r5263 replacement (one "segmentLoop" block, producer positions
    // passed as doSegment arguments).
    276281void generatePipelineLoop(IDISA::IDISA_Builder * iBuilder, const std::vector<KernelBuilder *> & kernels) {
    277282    for (auto k : kernels) k->createInstance();
    278     //BufferMap bufferMap;
    279     //createStreamBufferMap(bufferMap, kernels);
    280283   
    281284    BasicBlock * entryBlock = iBuilder->GetInsertBlock();
    282285    Function * main = entryBlock->getParent();
    283 
    284     // Create the basic blocks
     286   
     287    // Create the basic blocks for the loop.
    285288    BasicBlock * segmentLoop = BasicBlock::Create(iBuilder->getContext(), "segmentLoop", main, 0);
    286289    BasicBlock * exitBlock = BasicBlock::Create(iBuilder->getContext(), "exitBlock", main, 0);
    287     // We create vectors of loop body and final segment blocks indexed by kernel.
    288     std::vector<BasicBlock *> loopBodyBlocks;
    289     std::vector<BasicBlock *> finalSegmentBlocks;
    290 
    291     loopBodyBlocks.push_back(segmentLoop);
    292     finalSegmentBlocks.push_back(nullptr); 
    293    
    294     for (unsigned i = 1; i < kernels.size(); i++) {
    295         if (kernels[i-1]->hasNoTerminateAttribute()) {
    296             // Previous kernel cannot terminate.   Continue with the previous blocks;
    297             loopBodyBlocks.push_back(loopBodyBlocks.back());
    298             finalSegmentBlocks.push_back(finalSegmentBlocks.back());
    299         }
    300         else {
    301             loopBodyBlocks.push_back(BasicBlock::Create(iBuilder->getContext(), "do_" + kernels[i]->getName(), main, 0));
    302             finalSegmentBlocks.push_back(BasicBlock::Create(iBuilder->getContext(), "finish_" + kernels[i]->getName(), main, 0));
    303         }
    304     }
    305     loopBodyBlocks.push_back(segmentLoop); // If the last kernel does not terminate, loop back.
    306     finalSegmentBlocks.push_back(exitBlock); // If the last kernel does terminate, we're done.
     290   
           // producerTable[k][j] names the (kernel, output index) that feeds input j of
           // kernel k.
     291    ProducerTable producerTable = createProducerTable(kernels);
     292   
     293    // ProducerPos[k][i] will hold the producedItemCount of the i^th output stream
     294    // set of the k^th kernel.  These values will be loaded immediately after the
     295    // doSegment and finalSegment calls for kernel k and later used as the
     296    // producer position arguments for later doSegment/finalSegment calls.
     297   
     298    std::vector<std::vector<Value *>> ProducerPos;
    307299   
    308300    iBuilder->CreateBr(segmentLoop);
    309     Constant * segBlocks = iBuilder->getSize(codegen::SegmentSize * iBuilder->getStride() / iBuilder->getBitBlockWidth());
    310     for (unsigned i = 0; i < kernels.size(); i++) {
    311         iBuilder->SetInsertPoint(loopBodyBlocks[i]);
    312         //Value * segBlocks = getSegmentBlocks(bufferMap, kernels[i]);
    313         Value * segNo = kernels[i]->acquireLogicalSegmentNo(kernels[i]->getInstance());
    314         kernels[i]->createDoSegmentCall(kernels[i]->getInstance(), segBlocks);
    315         if (kernels[i]->hasNoTerminateAttribute()) {
    316             kernels[i]->releaseLogicalSegmentNo(kernels[i]->getInstance(), iBuilder->CreateAdd(segNo, iBuilder->getSize(1)));
    317             if (i == kernels.size() - 1) {
    318                 iBuilder->CreateBr(segmentLoop);
    319             }
    320         }
    321         else {
    322             Value * terminated = kernels[i]->getTerminationSignal(kernels[i]->getInstance());
    323             kernels[i]->releaseLogicalSegmentNo(kernels[i]->getInstance(), iBuilder->CreateAdd(segNo, iBuilder->getSize(1)));
    324             iBuilder->CreateCondBr(terminated, finalSegmentBlocks[i+1], loopBodyBlocks[i+1]);
    325         }
    326         if (finalSegmentBlocks[i] != nullptr) {
    327             iBuilder->SetInsertPoint(finalSegmentBlocks[i]);
    328             Value * segNo = kernels[i]->acquireLogicalSegmentNo(kernels[i]->getInstance());
    329             kernels[i]->createFinalSegmentCall(kernels[i]->getInstance(), segBlocks);
    330             kernels[i]->releaseLogicalSegmentNo(kernels[i]->getInstance(), iBuilder->CreateAdd(segNo, iBuilder->getSize(1)));
    331             if (finalSegmentBlocks[i] != finalSegmentBlocks[i+1]) {
    332                 iBuilder->CreateBr(finalSegmentBlocks[i+1]);
    333             }
    334         }
    335     }
     301    iBuilder->SetInsertPoint(segmentLoop);
     302
           // Starts as the constant false (i1 null value) and is OR-ed with each
           // terminating kernel's signal, so kernel k is handed the combined signal of
           // kernels 0..k-1 from this same iteration.
     303    Value * terminationFound = ConstantInt::getNullValue(iBuilder->getInt1Ty());
     304    for (unsigned k = 0; k < kernels.size(); k++) {
     305        Value * instance = kernels[k]->getInstance();
               // Argument order: kernel instance, termination flag, then one producer
               // position per stream input.
     306        std::vector<Value *> doSegmentArgs = {instance, terminationFound};
     307        for (unsigned j = 0; j < kernels[k]->getStreamInputs().size(); j++) {
     308            unsigned producerKernel, outputIndex;
     309            std::tie(producerKernel, outputIndex) = producerTable[k][j];
                   // ProducerPos only has rows for kernels already emitted in this loop;
                   // createProducerTable reports a fatal error unless producerKernel < k,
                   // which is what makes this row lookup valid.
     310            doSegmentArgs.push_back(ProducerPos[producerKernel][outputIndex]);
     311        }
     312        kernels[k]->createDoSegmentCall(doSegmentArgs);
               // Only kernels without the no-terminate attribute contribute a signal.
     313        if (! (kernels[k]->hasNoTerminateAttribute())) {
     314            Value * terminated = kernels[k]->getTerminationSignal(instance);
     315            terminationFound = iBuilder->CreateOr(terminationFound, terminated);
     316        }
               // Read the produced item counts after the call and store them as row k.
     317        std::vector<Value *> produced;
     318        for (unsigned i = 0; i < kernels[k]->getStreamOutputs().size(); i++) {
     319            produced.push_back(kernels[k]->getProducedItemCount(instance, kernels[k]->getStreamOutputs()[i].name));
     320        }
     321        ProducerPos.push_back(produced);
               // Release the kernel's logical segment number incremented by one.
     322        Value * segNo = kernels[k]->acquireLogicalSegmentNo(instance);
     323        kernels[k]->releaseLogicalSegmentNo(instance, iBuilder->CreateAdd(segNo, iBuilder->getSize(1)));
     324    }
           // Leave the loop once any kernel has signalled termination; otherwise repeat.
     325    iBuilder->CreateCondBr(terminationFound, exitBlock, segmentLoop);
    336326    iBuilder->SetInsertPoint(exitBlock);
    337327}
     328
     329   
Note: See TracChangeset for help on using the changeset viewer.