source: icGREP/icgrep-devel/icgrep/kernels/pipeline.cpp @ 5251

Last change on this file since 5251 was 5251, checked in by cameron, 3 years ago

Stream set buffer maps

File size: 10.4 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "pipeline.h"
7#include <toolchain.h>
8#include <IR_Gen/idisa_builder.h>
9#include <kernels/interface.h>
10#include <kernels/kernel.h>
11#include <kernels/s2p_kernel.h>
12#include <iostream>
13#include <unordered_map>
14
15using namespace kernel;
16
17using BufferMap = std::unordered_map<StreamSetBuffer *, std::pair<KernelBuilder *, unsigned>>;
18
19
20static void createStreamBufferMap(BufferMap bufferMap, std::vector<KernelBuilder *> kernels) {
21    for (auto k: kernels) {
22        auto outputSets = k->getStreamSetOutputBuffers();
23        for (unsigned i = 0; i < outputSets.size(); i++) {
24            bufferMap.insert(std::make_pair(outputSets[i], std::make_pair(k, i)));
25        }
26    }
27    for (auto k: kernels) {
28        auto inputSets = k->getStreamSetInputBuffers();
29        for (unsigned i = 0; i < inputSets.size(); i++) {
30            if (bufferMap.find(inputSets[i]) == bufferMap.end()) {
31                llvm::report_fatal_error("Pipeline error: input buffer #" + std::to_string(i) + " of " + k->getName() + ": no corresponding output buffer. ");
32            }
33        }
34    }
35}
36
37
38Function * generateSegmentParallelPipelineThreadFunction(std::string name, IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels, Type * sharedStructType, int id) {
39
40    Module * m = iBuilder->getModule();
41    Type * const size_ty = iBuilder->getSizeTy();
42    Type * const voidTy = iBuilder->getVoidTy();
43    Type * const voidPtrTy = iBuilder->getVoidPtrTy();
44    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
45
46    Function * const threadFunc = cast<Function>(m->getOrInsertFunction(name, voidTy, int8PtrTy, nullptr));
47    threadFunc->setCallingConv(CallingConv::C);
48    Function::arg_iterator args = threadFunc->arg_begin();
49
50    Value * const input = &*(args++);
51    input->setName("input");
52
53    unsigned threadNum = codegen::ThreadNum;
54
55     // Create the basic blocks for the thread function.
56    BasicBlock * entryBlock = BasicBlock::Create(iBuilder->getContext(), "entry", threadFunc, 0);
57    BasicBlock * segmentLoop = BasicBlock::Create(iBuilder->getContext(), "segmentLoop", threadFunc, 0);
58    BasicBlock * exitThreadBlock = BasicBlock::Create(iBuilder->getContext(), "exitThread", threadFunc, 0);
59    std::vector<BasicBlock *> segmentWait;
60    std::vector<BasicBlock *> segmentLoopBody;
61    for (unsigned i = 0; i < kernels.size(); i++) {
62        segmentWait.push_back(BasicBlock::Create(iBuilder->getContext(), "segmentWait"+std::to_string(i), threadFunc, 0));
63        segmentLoopBody.push_back(BasicBlock::Create(iBuilder->getContext(), "segmentWait"+std::to_string(i), threadFunc, 0));
64    }
65
66    iBuilder->SetInsertPoint(entryBlock);
67    Value * sharedStruct = iBuilder->CreateBitCast(input, PointerType::get(sharedStructType, 0));
68    Constant * myThreadId = ConstantInt::get(size_ty, id);
69    std::vector<Value *> instancePtrs;
70    for (unsigned i = 0; i < kernels.size(); i++) {
71        Value * ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
72        instancePtrs.push_back(iBuilder->CreateLoad(ptr));
73    }
74   
75    // Some important constant values.
76    int segmentSize = codegen::SegmentSize;
77    Constant * segmentBlocks = ConstantInt::get(size_ty, segmentSize);
78    iBuilder->CreateBr(segmentLoop);
79
80    iBuilder->SetInsertPoint(segmentLoop);
81    PHINode * segNo = iBuilder->CreatePHI(size_ty, 2, "segNo");
82    segNo->addIncoming(myThreadId, entryBlock);
83    unsigned last_kernel = kernels.size() - 1;
84    Value * alreadyDone = kernels[last_kernel]->getTerminationSignal(instancePtrs[last_kernel]);
85    iBuilder->CreateCondBr(alreadyDone, exitThreadBlock, segmentWait[0]);
86
87    for (unsigned i = 0; i < kernels.size(); i++) {
88        iBuilder->SetInsertPoint(segmentWait[i]);
89        Value * processedSegmentCount = kernels[i]->acquireLogicalSegmentNo(instancePtrs[i]);
90        Value * cond = iBuilder->CreateICmpEQ(segNo, processedSegmentCount);
91        iBuilder->CreateCondBr(cond, segmentLoopBody[i], segmentWait[i]);
92
93        iBuilder->SetInsertPoint(segmentLoopBody[i]);
94        kernels[i]->createDoSegmentCall(instancePtrs[i], segmentBlocks);
95        // Must be the last action, for synchronization.
96        kernels[i]->releaseLogicalSegmentNo(instancePtrs[i], iBuilder->CreateAdd(processedSegmentCount, iBuilder->getSize(1)));
97        if (i == last_kernel) break;
98        iBuilder->CreateBr(segmentWait[i+1]);
99    }
100   
101    segNo->addIncoming(iBuilder->CreateAdd(segNo, ConstantInt::get(size_ty, threadNum)), segmentLoopBody[last_kernel]);
102    Value * endSignal = kernels[last_kernel]->getTerminationSignal(instancePtrs[last_kernel]);
103    iBuilder->CreateCondBr(endSignal, exitThreadBlock, segmentLoop);
104   
105    iBuilder->SetInsertPoint(exitThreadBlock);
106    Value * nullVal = Constant::getNullValue(voidPtrTy);
107    iBuilder->CreatePThreadExitCall(nullVal);
108    iBuilder->CreateRetVoid();
109
110    return threadFunc;
111}
112
113// Given a computation expressed as a logical pipeline of K kernels k0, k_1, ...k_(K-1)
114// operating over an input stream set S, a segment-parallel implementation divides the input
115// into segments and coordinates a set of T <= K threads to each process one segment at a time.   
116// Let S_0, S_1, ... S_N be the segments of S.   Segments are assigned to threads in a round-robin
117// fashion such that processing of segment S_i by the full pipeline is carried out by thread i mod T.
118
119
120void generateSegmentParallelPipeline(IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels) {
121   
122    unsigned threadNum = codegen::ThreadNum;
123
124    Module * m = iBuilder->getModule();
125
126    Type * const size_ty = iBuilder->getSizeTy();
127    Type * const voidPtrTy = iBuilder->getVoidPtrTy();
128    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
129
130    for (auto k : kernels) k->createInstance();
131
132    Type * const pthreadsTy = ArrayType::get(size_ty, threadNum);
133    AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
134    std::vector<Value *> pthreadsPtrs;
135    for (unsigned i = 0; i < threadNum; i++) {
136        pthreadsPtrs.push_back(iBuilder->CreateGEP(pthreads, {iBuilder->getInt32(0), iBuilder->getInt32(i)}));
137    }
138    Value * nullVal = Constant::getNullValue(voidPtrTy);
139    AllocaInst * const status = iBuilder->CreateAlloca(int8PtrTy);
140
141    std::vector<Type *> structTypes;
142    for (unsigned i = 0; i < kernels.size(); i++) {
143        structTypes.push_back(kernels[i]->getInstance()->getType());
144    }
145    Type * sharedStructType = StructType::get(m->getContext(), structTypes);
146
147    AllocaInst * sharedStruct = iBuilder->CreateAlloca(sharedStructType);
148    for (unsigned i = 0; i < kernels.size(); i++) {
149        Value * ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
150        iBuilder->CreateStore(kernels[i]->getInstance(), ptr);
151    }
152
153    std::vector<Function *> thread_functions;
154    const auto ip = iBuilder->saveIP();
155    for (unsigned i = 0; i < threadNum; i++) {
156        thread_functions.push_back(generateSegmentParallelPipelineThreadFunction("thread"+std::to_string(i), iBuilder, kernels, sharedStructType, i));
157    }
158    iBuilder->restoreIP(ip);
159
160    for (unsigned i = 0; i < threadNum; i++) {
161        iBuilder->CreatePThreadCreateCall(pthreadsPtrs[i], nullVal, thread_functions[i], iBuilder->CreateBitCast(sharedStruct, int8PtrTy));
162    }
163
164    std::vector<Value *> threadIDs;
165    for (unsigned i = 0; i < threadNum; i++) { 
166        threadIDs.push_back(iBuilder->CreateLoad(pthreadsPtrs[i]));
167    }
168   
169    for (unsigned i = 0; i < threadNum; i++) { 
170        iBuilder->CreatePThreadJoinCall(threadIDs[i], status);
171    }
172
173}
174
175void generatePipelineParallel(IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels) {
176 
177    Type * pthreadTy = iBuilder->getSizeTy();
178    Type * const voidPtrTy = iBuilder->getVoidPtrTy();
179    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
180
181    Type * const pthreadsTy = ArrayType::get(pthreadTy, kernels.size());
182
183    for (auto k : kernels) k->createInstance();
184
185    AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
186    std::vector<Value *> pthreadsPtrs;
187    for (unsigned i = 0; i < kernels.size(); i++) {
188        pthreadsPtrs.push_back(iBuilder->CreateGEP(pthreads, {iBuilder->getInt32(0), iBuilder->getInt32(i)}));
189    }
190    Value * nullVal = Constant::getNullValue(voidPtrTy);
191    AllocaInst * const status = iBuilder->CreateAlloca(int8PtrTy);
192
193    std::vector<Function *> kernel_functions;
194    const auto ip = iBuilder->saveIP();
195    for (unsigned i = 0; i < kernels.size(); i++) {
196        kernel_functions.push_back(kernels[i]->generateThreadFunction("k_"+std::to_string(i)));
197    }
198    iBuilder->restoreIP(ip);
199
200    for (unsigned i = 0; i < kernels.size(); i++) {
201        iBuilder->CreatePThreadCreateCall(pthreadsPtrs[i], nullVal, kernel_functions[i], iBuilder->CreateBitCast(kernels[i]->getInstance(), int8PtrTy));
202    }
203
204    std::vector<Value *> threadIDs;
205    for (unsigned i = 0; i < kernels.size(); i++) { 
206        threadIDs.push_back(iBuilder->CreateLoad(pthreadsPtrs[i]));
207    }
208   
209    for (unsigned i = 0; i < kernels.size(); i++) { 
210        iBuilder->CreatePThreadJoinCall(threadIDs[i], status);
211    }
212}
213
214
215void generatePipelineLoop(IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels) {
216   
217    BasicBlock * entryBlock = iBuilder->GetInsertBlock();
218    Function * main = entryBlock->getParent();
219       
220    const unsigned segmentSize = codegen::SegmentSize;
221    Type * const size_ty = iBuilder->getSizeTy();
222
223    // Create the basic blocks for the loop.
224    BasicBlock * segmentBlock = BasicBlock::Create(iBuilder->getContext(), "segmentLoop", main, 0);
225    BasicBlock * exitBlock = BasicBlock::Create(iBuilder->getContext(), "exitBlock", main, 0);
226    for (auto k : kernels) k->createInstance();
227    iBuilder->CreateBr(segmentBlock);
228    iBuilder->SetInsertPoint(segmentBlock);
229    Constant * segBlocks = ConstantInt::get(size_ty, segmentSize * iBuilder->getStride() / iBuilder->getBitBlockWidth());
230    for (unsigned i = 0; i < kernels.size(); i++) {
231        kernels[i]->createDoSegmentCall(kernels[i]->getInstance(), segBlocks);
232        Value * segNo = kernels[i]->acquireLogicalSegmentNo(kernels[i]->getInstance());
233        kernels[i]->releaseLogicalSegmentNo(kernels[i]->getInstance(), iBuilder->CreateAdd(segNo, iBuilder->getSize(1)));
234    }
235    Value * endSignal = kernels.back()->getTerminationSignal(kernels.back()->getInstance());
236    iBuilder->CreateCondBr(endSignal, exitBlock, segmentBlock);
237    iBuilder->SetInsertPoint(exitBlock);
238
239}
Note: See TracBrowser for help on using the repository browser.