Ignore:
Timestamp:
Sep 26, 2016, 12:05:51 PM (3 years ago)
Author:
lindanl
Message:

Add segment pipeline parallel strategy. Move ballot function to IDISA NVPTX.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/pipeline.cpp

    r5141 r5165  
    1515
    1616#include <llvm/IR/TypeBuilder.h>
     17#include <iostream>
    1718
    1819using namespace kernel;
     20
     // Emits the LLVM function `name` that is the body of one segment-parallel
     // pipeline worker thread, and returns it.
     //
     // The emitted function returns void and takes a single i8* argument, which is
     // bitcast to a pointer to `sharedStructType`. Field 0 of that struct is loaded
     // as the file size; field i+1 is loaded as the instance for kernels[i].
     //
     // Thread `id` starts at block number id * SegmentSize and advances by
     // SegmentSize * ThreadNum blocks per loop iteration. Before running kernel i
     // on a segment, the emitted code loops until kernels[i]->getBlockNo(instance)
     // equals this thread's current block number.
     //
     // Parameters:
     //   name             - name of the function to create in iBuilder's module.
     //   iBuilder         - builder used to emit IR; its insert point is left inside
     //                      the new function when this returns.
     //   kernels          - pipeline kernels, run in vector order for each segment.
     //   sharedStructType - struct type the thread argument points to.
     //   id               - index of this thread, baked in as a constant.
     //
     // NOTE(review): `kernels` must be non-empty; segmentWait[0],
     // partialSegmentWait[0] and kernels.size()-1 are used without a guard.
     // NOTE(review): "pthread_exit" is looked up with m->getFunction and used
     // without a null check; presumably it is declared in the module beforehand -
     // confirm.
     21Function * generateSegmentParallelPipelineThreadFunction(std::string name, IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels, Type * sharedStructType, int id) {
     22
     23    Module * m = iBuilder->getModule();
     24    Type * const size_ty = iBuilder->getSizeTy();
     25    Type * const voidTy = Type::getVoidTy(m->getContext());
     26    Type * const voidPtrTy = TypeBuilder<void *, false>::get(m->getContext());
     27    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
     28
           // Declare (or fetch) `void name(i8*)` with the C calling convention.
     29    Function * const threadFunc = cast<Function>(m->getOrInsertFunction(name, voidTy, int8PtrTy, nullptr));
     30    threadFunc->setCallingConv(CallingConv::C);
     31    Function::arg_iterator args = threadFunc->arg_begin();
     32
     33    Value * const input = &*(args++);
     34    input->setName("input");
     35
     36    int threadNum = codegen::ThreadNum;
     37
     38     // Create the basic blocks for the thread function.
     39    BasicBlock * entryBlock = BasicBlock::Create(iBuilder->getContext(), "entry", threadFunc, 0);
     40    BasicBlock * segmentLoop = BasicBlock::Create(iBuilder->getContext(), "segmentCond", threadFunc, 0);
     41    BasicBlock * finalSegmentLoopExit = BasicBlock::Create(iBuilder->getContext(), "partialSegmentCond", threadFunc, 0);
     42    BasicBlock * exitThreadBlock = BasicBlock::Create(iBuilder->getContext(), "exitThread", threadFunc, 0);
           // One wait block and one body block per kernel, for both the full-segment
           // loop and the final partial segment.
           // NOTE(review): the segmentLoopBody blocks below are created with the name
           // "segmentWait"+i, the same as the wait blocks; this looks like a
           // copy-paste slip for "segmentLoopBody"+i.
     43    std::vector<BasicBlock *> segmentWait;
     44    std::vector<BasicBlock *> segmentLoopBody;
     45    std::vector<BasicBlock *> partialSegmentWait;
     46    std::vector<BasicBlock *> partialSegmentLoopBody;
     47    for (unsigned i = 0; i < kernels.size(); i++) {
     48        segmentWait.push_back(BasicBlock::Create(iBuilder->getContext(), "segmentWait"+std::to_string(i), threadFunc, 0));
     49        segmentLoopBody.push_back(BasicBlock::Create(iBuilder->getContext(), "segmentWait"+std::to_string(i), threadFunc, 0));
     50        partialSegmentWait.push_back(BasicBlock::Create(iBuilder->getContext(), "partialSegmentWait"+std::to_string(i), threadFunc, 0));
     51        partialSegmentLoopBody.push_back(BasicBlock::Create(iBuilder->getContext(), "partialSegmentLoopBody"+std::to_string(i), threadFunc, 0));
     52    }
     53
     54    iBuilder->SetInsertPoint(entryBlock);
     55
           // Unpack the shared struct: field 0 is the file size, field i+1 is the
           // instance for kernels[i].
     56    Value * sharedStruct = iBuilder->CreateBitCast(input, PointerType::get(sharedStructType, 0));
     57    Value * myThreadId = ConstantInt::get(size_ty, id);
     58    Value * fileSize = iBuilder->CreateLoad(iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(0)}));
     59    std::vector<Value *> instancePtrs;
     60    for (unsigned i = 0; i < kernels.size(); i++) {
     61        Value * ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(i + 1)});
     62        instancePtrs.push_back(iBuilder->CreateLoad(ptr));
     63    }
     64   
     65    // Some important constant values.
           // A "hypersegment" here is one segment per thread, i.e. threadNum segments.
     66    int segmentSize = codegen::SegmentSize;
     67    Constant * segmentBlocks = ConstantInt::get(size_ty, segmentSize);
     68    Constant * hypersegmentBlocks = ConstantInt::get(size_ty, segmentSize * threadNum);
     69    Constant * segmentBytes = ConstantInt::get(size_ty, iBuilder->getStride() * segmentSize);
     70    Constant * hypersegmentBytes = ConstantInt::get(size_ty, iBuilder->getStride() * segmentSize * threadNum);
     71    Constant * const blockSize = ConstantInt::get(size_ty, iBuilder->getStride());
     72
     73    // The offset of my starting segment within the thread group hypersegment.
     74    Value * myBlockNo = iBuilder->CreateMul(segmentBlocks, myThreadId);
     75    Value * myOffset = iBuilder->CreateMul(segmentBytes, myThreadId);
           // A full segment for this thread exists only if at least
           // myOffset + segmentBytes remain.
     76    Value * fullSegLimit = iBuilder->CreateAdd(myOffset, segmentBytes);
     77
     78    iBuilder->CreateBr(segmentLoop);
     79
           // Loop header: the PHIs carry the remaining size and this thread's current
           // block number across iterations.
     80    iBuilder->SetInsertPoint(segmentLoop);
     81    PHINode * remainingBytes = iBuilder->CreatePHI(size_ty, 2, "remainingBytes");
     82    remainingBytes->addIncoming(fileSize, entryBlock);
     83    PHINode * blockNo = iBuilder->CreatePHI(size_ty, 2, "blockNo");
     84    blockNo->addIncoming(myBlockNo, entryBlock);
     85
           // Signed compare: the loop only guarantees remainingBytes >= fullSegLimit
           // before subtracting a whole hypersegment, so the value may drop below
           // zero on the back edge.
     86    Value * LT_fullSegment = iBuilder->CreateICmpSLT(remainingBytes, fullSegLimit);
     87    iBuilder->CreateCondBr(LT_fullSegment, finalSegmentLoopExit, segmentWait[0]);
     88
           // Full-segment pipeline: for each kernel, a wait block that branches back
           // to itself until the kernel's block number equals blockNo, then a body
           // block that runs the kernel on segmentBlocks blocks.
     89    for (unsigned i = 0; i < kernels.size(); i++) {
     90        iBuilder->SetInsertPoint(segmentWait[i]);
     91        Value * curBlockNo = kernels[i]->getBlockNo(instancePtrs[i]);
     92        Value * cond = iBuilder->CreateICmpEQ(curBlockNo, blockNo);
     93        iBuilder->CreateCondBr(cond, segmentLoopBody[i], segmentWait[i]);
     94
     95        iBuilder->SetInsertPoint(segmentLoopBody[i]);
     96        kernels[i]->createDoSegmentCall(instancePtrs[i], segmentBlocks);
               // Break without a branch on the last kernel: the insert point stays in
               // the last body block so the PHI updates below are emitted there.
     97        if (i == kernels.size() - 1) break;
     98        iBuilder->CreateBr(segmentWait[i+1]);
     99    }
     100   
            // Back edge: advance by one hypersegment and return to the loop header.
     101    remainingBytes->addIncoming(iBuilder->CreateSub(remainingBytes, hypersegmentBytes), segmentLoopBody[kernels.size()-1]);
     102    blockNo->addIncoming(iBuilder->CreateAdd(blockNo, hypersegmentBlocks), segmentLoopBody[kernels.size()-1]);
     103    iBuilder->CreateBr(segmentLoop);
     104
     105    // Now we may have a partial segment, or we may be completely done
     106    // because the last segment was handled by a previous thread in the group.
     107    iBuilder->SetInsertPoint(finalSegmentLoopExit);
     108    Value * alreadyDone = iBuilder->CreateICmpSLT(remainingBytes, myOffset);
     109    Value * remainingForMe = iBuilder->CreateSub(remainingBytes, myOffset);
     110    Value * blocksToDo = iBuilder->CreateUDiv(remainingForMe, blockSize);
     111    iBuilder->CreateCondBr(alreadyDone, exitThreadBlock, partialSegmentWait[0]);
     112
     113    // Final partial segment: same wait-then-run chain as the full-segment loop.
            // Each kernel processes blocksToDo whole blocks, then gets a final-block
            // call with the leftover remainingForMe % blockSize.
     114    for (unsigned i = 0; i < kernels.size(); i++) {
     115        iBuilder->SetInsertPoint(partialSegmentWait[i]);
     116        Value * curBlockNo = kernels[i]->getBlockNo(instancePtrs[i]);
     117        Value * cond = iBuilder->CreateICmpEQ(curBlockNo, blockNo);
     118        iBuilder->CreateCondBr(cond, partialSegmentLoopBody[i], partialSegmentWait[i]);
     119
     120        iBuilder->SetInsertPoint(partialSegmentLoopBody[i]);
     121        kernels[i]->createDoSegmentCall(instancePtrs[i], blocksToDo);
     122        kernels[i]->createFinalBlockCall(instancePtrs[i], iBuilder->CreateURem(remainingForMe, blockSize));
                // As above, the last body block is left open for the branch that follows.
     123        if (i == kernels.size() - 1) break;
     124        iBuilder->CreateBr(partialSegmentWait[i+1]);
     125    }
     126    iBuilder->CreateBr(exitThreadBlock);
     127
            // Thread exit: call pthread_exit(null), marked as not returning; the
            // ret void after it terminates the basic block.
     128    iBuilder->SetInsertPoint(exitThreadBlock);
     129    Value * nullVal = Constant::getNullValue(voidPtrTy);
     130    Function * pthreadExitFunc = m->getFunction("pthread_exit");
     131    CallInst * exitThread = iBuilder->CreateCall(pthreadExitFunc, {nullVal});
     132    exitThread->setDoesNotReturn();
     133    iBuilder->CreateRetVoid();
     134
     135    return threadFunc;
     136}
     137
     // Emits, at iBuilder's current insert point, the driver code for the
     // segment-parallel pipeline.
     //
     // The emitted code:
     //   1. stack-allocates an array of ThreadNum size-typed slots for the thread
     //      handles and a slot for the join status;
     //   2. stack-allocates a shared struct { fileSize, instances[0], ... } and
     //      stores the values into it;
     //   3. calls pthread_create once per thread, passing the matching generated
     //      thread function ("thread0", "thread1", ...) and the shared struct
     //      cast to i8*;
     //   4. loads each thread handle and calls pthread_join on it.
     //
     // Parameters:
     //   iBuilder  - builder used to emit IR.
     //   kernels   - pipeline kernels, forwarded to each generated thread function.
     //   instances - one kernel instance value per kernel; stored in struct
     //               fields 1..N in vector order.
     //   fileSize  - input size value stored in struct field 0.
     //
     // NOTE(review): "pthread_create" and "pthread_join" are looked up with
     // m->getFunction and used without a null check; presumably they are declared
     // in the module beforehand - confirm.
     // NOTE(review): the results of the pthread_create and pthread_join calls are
     // not checked.
     // NOTE(review): thread handles are stored as size_ty values; this assumes
     // pthread_t fits the size type on the target - confirm.
     138void generateSegmentParallelPipeline(IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels, std::vector<Value *> instances, Value * fileSize) {
     139   
     140    int threadNum = codegen::ThreadNum;
     141
     142    Module * m = iBuilder->getModule();
     143
     144    Type * const size_ty = iBuilder->getSizeTy();
     145    Type * const voidPtrTy = TypeBuilder<void *, false>::get(m->getContext());
     146    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
            // Storage for the thread handles, plus a pointer to each slot.
            // NOTE(review): the loops below compare `unsigned i` against the signed
            // `int threadNum`.
     147    Type * const pthreadsTy = ArrayType::get(size_ty, threadNum);
     148    AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
     149    std::vector<Value *> pthreadsPtrs;
     150    for (unsigned i = 0; i < threadNum; i++) {
     151        pthreadsPtrs.push_back(iBuilder->CreateGEP(pthreads, {iBuilder->getInt32(0), iBuilder->getInt32(i)}));
     152    }
            // nullVal is passed as the second argument of pthread_create; status is
            // the i8* slot passed as the second argument of pthread_join.
     153    Value * nullVal = Constant::getNullValue(voidPtrTy);
     154    AllocaInst * const status = iBuilder->CreateAlloca(int8PtrTy);
     155
            // Shared struct layout: field 0 is the input size, field i+1 has the type
            // of instances[i].
     156    std::vector<Type *> structTypes;
     157    structTypes.push_back(size_ty);//input size
     158    for (unsigned i = 0; i < instances.size(); i++) {
     159        structTypes.push_back(instances[i]->getType());
     160    }
     161    Type * sharedStructType = StructType::get(m->getContext(), structTypes);
     162
            // Allocate the shared struct and fill in every field.
     163    AllocaInst * sharedStruct;
     164    sharedStruct = iBuilder->CreateAlloca(sharedStructType);
     165    Value * sizePtr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(0)});
     166    iBuilder->CreateStore(fileSize, sizePtr);
     167    for (unsigned i = 0; i < instances.size(); i++) {
     168        Value * ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(i+1)});
     169        iBuilder->CreateStore(instances[i], ptr);
     170    }
     171
            // Generating a thread function moves the builder's insert point into that
            // function, so save it before and restore it afterwards.
     172    std::vector<Function *> thread_functions;
     173    const auto ip = iBuilder->saveIP();
     174    for (unsigned i = 0; i < threadNum; i++) {
     175        thread_functions.push_back(generateSegmentParallelPipelineThreadFunction("thread"+std::to_string(i), iBuilder, kernels, sharedStructType, i));
     176    }
     177    iBuilder->restoreIP(ip);
     178
     179    Function * pthreadCreateFunc = m->getFunction("pthread_create");
     180    Function * pthreadJoinFunc = m->getFunction("pthread_join");
     181
            // Start all threads; each receives the same shared struct pointer.
     182    for (unsigned i = 0; i < threadNum; i++) {
     183        iBuilder->CreateCall(pthreadCreateFunc, std::vector<Value *>({pthreadsPtrs[i], nullVal, thread_functions[i], iBuilder->CreateBitCast(sharedStruct, int8PtrTy)}));
     184    }
     185
            // Load the handles written by pthread_create, then join every thread.
     186    std::vector<Value *> threadIDs;
     187    for (unsigned i = 0; i < threadNum; i++) {
     188        threadIDs.push_back(iBuilder->CreateLoad(pthreadsPtrs[i]));
     189    }
     190   
     191    for (unsigned i = 0; i < threadNum; i++) {
     192        iBuilder->CreateCall(pthreadJoinFunc, std::vector<Value *>({threadIDs[i], status}));
     193    }
     194
     195}
    19196
    20197void generatePipelineParallel(IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels, std::vector<Value *> instances) {
     
    22199    Module * m = iBuilder->getModule();
    23200
    24     Type * pthreadTy = iBuilder->getSizeTy(); //Pthread Type for 64-bit machine.     
     201    Type * pthreadTy = iBuilder->getSizeTy();    
    25202    Type * const voidPtrTy = TypeBuilder<void *, false>::get(m->getContext());
    26203    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
Note: See TracChangeset for help on using the changeset viewer.