source: icGREP/icgrep-devel/icgrep/kernels/pipeline.cpp @ 5260

Last change on this file since 5260 was 5260, checked in by nmedfort, 9 months ago

Changes working towards simplifying access to stream elements, plus some modifications to simplify includes / forward declarations within the CodeGen library.

File size: 15.4 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "pipeline.h"
7#include <toolchain.h>
8#include <IR_Gen/idisa_builder.h>
9#include <kernels/interface.h>
10#include <kernels/kernel.h>
11#include <kernels/s2p_kernel.h>
12#include <iostream>
13#include <unordered_map>
14
15using namespace kernel;
16using namespace parabix;
17using namespace llvm;
18
19#if 0
20
21using BufferMap = std::unordered_map<StreamSetBuffer *, std::pair<KernelBuilder *, unsigned>>;
22
23static void createStreamBufferMap(BufferMap & bufferMap, const std::vector<KernelBuilder *> & kernels) {
24    for (auto k: kernels) {
25        auto outputSets = k->getStreamSetOutputBuffers();
26        for (unsigned i = 0; i < outputSets.size(); i++) {
27            bufferMap.insert(std::make_pair(outputSets[i], std::make_pair(k, i)));
28        }
29    }
30    for (auto k: kernels) {
31        auto inputSets = k->getStreamSetInputBuffers();
32        for (unsigned i = 0; i < inputSets.size(); i++) {
33            if (bufferMap.find(inputSets[i]) == bufferMap.end()) {
34                llvm::report_fatal_error("Pipeline error: input buffer #" + std::to_string(i) + " of " + k->getName() + ": no corresponding output buffer. ");
35            }
36        }
37    }
38}
39
40static Value * getSegmentBlocks(BufferMap & bufferMap, KernelBuilder * kernel) {
41    IDISA::IDISA_Builder * iBuilder = kernel->getBuilder();
42    std::cerr << "getSegmentBlocks\n";
43
44    KernelBuilder * sourceKernel;
45
46    unsigned outputIndex;
47    auto inputs = kernel->getStreamSetInputBuffers();
48    if (inputs.empty()) return iBuilder->getSize(codegen::SegmentSize * iBuilder->getStride() / iBuilder->getBitBlockWidth());
49    std::string inputSetName = kernel->getStreamInputs()[0].name;
50    std::cerr << "inputSetName = " << inputSetName << "\n";
51    auto f = bufferMap.find(inputs[0]);
52    assert(f != bufferMap.end()  && "bufferMap failure");
53    std::tie(sourceKernel, outputIndex) = f->second;
54    std::cerr << "outputIndex = " << outputIndex << "\n";
55    Value * produced = sourceKernel->getProducedItemCount(sourceKernel->getInstance(), sourceKernel->getStreamOutputs()[outputIndex].name);
56    iBuilder->CallPrintInt("produced", produced);
57    Value * processed = kernel->getProcessedItemCount(kernel->getInstance(), inputSetName);
58    iBuilder->CallPrintInt("processed", processed);
59    Value * itemsToDo = iBuilder->CreateSub(produced, processed);
60    return iBuilder->CreateUDiv(itemsToDo, iBuilder->getSize(iBuilder->getStride()));
61}
62
63#endif
64
65Function * generateSegmentParallelPipelineThreadFunction(std::string name, IDISA::IDISA_Builder * iBuilder, const std::vector<KernelBuilder *> & kernels, Type * sharedStructType, int id) {
66
67    Module * m = iBuilder->getModule();
68    Type * const size_ty = iBuilder->getSizeTy();
69    Type * const voidTy = iBuilder->getVoidTy();
70    Type * const voidPtrTy = iBuilder->getVoidPtrTy();
71    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
72
73    Function * const threadFunc = cast<Function>(m->getOrInsertFunction(name, voidTy, int8PtrTy, nullptr));
74    threadFunc->setCallingConv(CallingConv::C);
75    Function::arg_iterator args = threadFunc->arg_begin();
76
77    Value * const input = &*(args++);
78    input->setName("input");
79
80    unsigned threadNum = codegen::ThreadNum;
81
82     // Create the basic blocks for the thread function.
83    BasicBlock * entryBlock = BasicBlock::Create(iBuilder->getContext(), "entry", threadFunc, 0);
84    BasicBlock * segmentLoop = BasicBlock::Create(iBuilder->getContext(), "segmentLoop", threadFunc, 0);
85    BasicBlock * exitThreadBlock = BasicBlock::Create(iBuilder->getContext(), "exitThread", threadFunc, 0);
86   
87    std::vector<BasicBlock *> segmentWait;
88    std::vector<BasicBlock *> segmentLoopBody;
89    std::vector<BasicBlock *> partialSegmentWait;
90    std::vector<BasicBlock *> partialSegmentLoopBody;
91    bool terminationSignalEncountered = false;
92    for (unsigned i = 0; i < kernels.size(); i++) {
93        std::string kname = kernels[i]->getName();
94        segmentWait.push_back(BasicBlock::Create(iBuilder->getContext(), kname + "Wait", threadFunc, 0));
95        segmentLoopBody.push_back(BasicBlock::Create(iBuilder->getContext(), "do_" + kname, threadFunc, 0));
96        if (terminationSignalEncountered) {
97            partialSegmentWait.push_back(BasicBlock::Create(iBuilder->getContext(), kname + "WaitFinal", threadFunc, 0));
98            partialSegmentLoopBody.push_back(BasicBlock::Create(iBuilder->getContext(), "finish_" + kname, threadFunc, 0));
99        }
100        else {
101            partialSegmentWait.push_back(nullptr);
102            partialSegmentLoopBody.push_back(nullptr);
103            terminationSignalEncountered = kernels[i]->hasNoTerminateAttribute() == false;
104        }
105    }
106    segmentWait.push_back(segmentLoop); // If the last kernel does not terminate, loop back.
107    partialSegmentWait.push_back(exitThreadBlock); // After the last kernel terminates, we're done.
108
109    iBuilder->SetInsertPoint(entryBlock);
110    Value * sharedStruct = iBuilder->CreateBitCast(input, PointerType::get(sharedStructType, 0));
111    Constant * myThreadId = ConstantInt::get(size_ty, id);
112    std::vector<Value *> instancePtrs;
113    for (unsigned i = 0; i < kernels.size(); i++) {
114        Value * ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
115        instancePtrs.push_back(iBuilder->CreateLoad(ptr));
116    }
117   
118    // Some important constant values.
119    int segmentSize = codegen::SegmentSize;
120    Constant * segmentBlocks = ConstantInt::get(size_ty, segmentSize);
121    iBuilder->CreateBr(segmentLoop);
122
123    iBuilder->SetInsertPoint(segmentLoop);
124    PHINode * segNo = iBuilder->CreatePHI(size_ty, 2, "segNo");
125    segNo->addIncoming(myThreadId, entryBlock);
126    Value * nextSegNo = iBuilder->CreateAdd(segNo, iBuilder->getSize(1));
127    unsigned last_kernel = kernels.size() - 1;
128    Value * alreadyDone = kernels[last_kernel]->getTerminationSignal(instancePtrs[last_kernel]);
129    iBuilder->CreateCondBr(alreadyDone, exitThreadBlock, segmentWait[0]);
130
131   
132   
133    for (unsigned i = 0; i < kernels.size(); i++) {
134        iBuilder->SetInsertPoint(segmentWait[i]);
135        Value * processedSegmentCount = kernels[i]->acquireLogicalSegmentNo(instancePtrs[i]);
136        Value * cond = iBuilder->CreateICmpEQ(segNo, processedSegmentCount);
137        iBuilder->CreateCondBr(cond, segmentLoopBody[i], segmentWait[i]);
138
139        iBuilder->SetInsertPoint(segmentLoopBody[i]);
140        if (i == last_kernel) {
141            segNo->addIncoming(iBuilder->CreateAdd(segNo, ConstantInt::get(size_ty, threadNum)), segmentLoopBody[last_kernel]);
142        }
143        kernels[i]->createDoSegmentCall(instancePtrs[i], segmentBlocks);
144        if (kernels[i]->hasNoTerminateAttribute()) {
145            kernels[i]->releaseLogicalSegmentNo(instancePtrs[i], nextSegNo);
146            iBuilder->CreateBr(segmentWait[i+1]);
147        }
148        else {
149            Value * terminated = kernels[i]->getTerminationSignal(instancePtrs[i]);
150            kernels[i]->releaseLogicalSegmentNo(instancePtrs[i], nextSegNo);
151            iBuilder->CreateCondBr(terminated, partialSegmentWait[i+1], segmentWait[i+1]);
152        }
153        if (partialSegmentWait[i] != nullptr) {
154            iBuilder->SetInsertPoint(partialSegmentWait[i]);
155            Value * processedSegmentCount = kernels[i]->acquireLogicalSegmentNo(instancePtrs[i]);
156            Value * cond = iBuilder->CreateICmpEQ(segNo, processedSegmentCount);
157            iBuilder->CreateCondBr(cond, partialSegmentLoopBody[i], partialSegmentWait[i]);
158           
159            iBuilder->SetInsertPoint(partialSegmentLoopBody[i]);
160            kernels[i]->createFinalSegmentCall(instancePtrs[i], segmentBlocks);
161            kernels[i]->releaseLogicalSegmentNo(instancePtrs[i], nextSegNo);
162            iBuilder->CreateBr(partialSegmentWait[i+1]);
163        }
164    }
165   
166    iBuilder->SetInsertPoint(exitThreadBlock);
167    Value * nullVal = Constant::getNullValue(voidPtrTy);
168    iBuilder->CreatePThreadExitCall(nullVal);
169    iBuilder->CreateRetVoid();
170
171    return threadFunc;
172}
173
// Given a computation expressed as a logical pipeline of K kernels k_0, k_1, ..., k_(K-1)
// operating over an input stream set S, a segment-parallel implementation divides the input
// into segments and coordinates a set of T <= K threads, each processing one segment at a time.
// Let S_0, S_1, ..., S_N be the segments of S.  Segments are assigned to threads in a round-robin
// fashion such that processing of segment S_i by the full pipeline is carried out by thread i mod T.
179
180
181void generateSegmentParallelPipeline(IDISA::IDISA_Builder * iBuilder, const std::vector<KernelBuilder *> & kernels) {
182   
183    unsigned threadNum = codegen::ThreadNum;
184
185    Module * m = iBuilder->getModule();
186
187    Type * const size_ty = iBuilder->getSizeTy();
188    Type * const voidPtrTy = iBuilder->getVoidPtrTy();
189    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
190
191    for (auto k : kernels) k->createInstance();
192
193    Type * const pthreadsTy = ArrayType::get(size_ty, threadNum);
194    AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
195    std::vector<Value *> pthreadsPtrs;
196    for (unsigned i = 0; i < threadNum; i++) {
197        pthreadsPtrs.push_back(iBuilder->CreateGEP(pthreads, {iBuilder->getInt32(0), iBuilder->getInt32(i)}));
198    }
199    Value * nullVal = Constant::getNullValue(voidPtrTy);
200    AllocaInst * const status = iBuilder->CreateAlloca(int8PtrTy);
201
202    std::vector<Type *> structTypes;
203    for (unsigned i = 0; i < kernels.size(); i++) {
204        structTypes.push_back(kernels[i]->getInstance()->getType());
205    }
206    Type * sharedStructType = StructType::get(m->getContext(), structTypes);
207
208    AllocaInst * sharedStruct = iBuilder->CreateAlloca(sharedStructType);
209    for (unsigned i = 0; i < kernels.size(); i++) {
210        Value * ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
211        iBuilder->CreateStore(kernels[i]->getInstance(), ptr);
212    }
213
214    std::vector<Function *> thread_functions;
215    const auto ip = iBuilder->saveIP();
216    for (unsigned i = 0; i < threadNum; i++) {
217        thread_functions.push_back(generateSegmentParallelPipelineThreadFunction("thread"+std::to_string(i), iBuilder, kernels, sharedStructType, i));
218    }
219    iBuilder->restoreIP(ip);
220
221    for (unsigned i = 0; i < threadNum; i++) {
222        iBuilder->CreatePThreadCreateCall(pthreadsPtrs[i], nullVal, thread_functions[i], iBuilder->CreateBitCast(sharedStruct, int8PtrTy));
223    }
224
225    std::vector<Value *> threadIDs;
226    for (unsigned i = 0; i < threadNum; i++) { 
227        threadIDs.push_back(iBuilder->CreateLoad(pthreadsPtrs[i]));
228    }
229   
230    for (unsigned i = 0; i < threadNum; i++) { 
231        iBuilder->CreatePThreadJoinCall(threadIDs[i], status);
232    }
233
234}
235
236void generatePipelineParallel(IDISA::IDISA_Builder * iBuilder, const std::vector<KernelBuilder *> & kernels) {
237 
238    Type * pthreadTy = iBuilder->getSizeTy();
239    Type * const voidPtrTy = iBuilder->getVoidPtrTy();
240    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
241
242    Type * const pthreadsTy = ArrayType::get(pthreadTy, kernels.size());
243
244    for (auto k : kernels) k->createInstance();
245
246    AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
247    std::vector<Value *> pthreadsPtrs;
248    for (unsigned i = 0; i < kernels.size(); i++) {
249        pthreadsPtrs.push_back(iBuilder->CreateGEP(pthreads, {iBuilder->getInt32(0), iBuilder->getInt32(i)}));
250    }
251    Value * nullVal = Constant::getNullValue(voidPtrTy);
252    AllocaInst * const status = iBuilder->CreateAlloca(int8PtrTy);
253
254    std::vector<Function *> kernel_functions;
255    const auto ip = iBuilder->saveIP();
256    for (unsigned i = 0; i < kernels.size(); i++) {
257        kernel_functions.push_back(kernels[i]->generateThreadFunction("k_"+std::to_string(i)));
258    }
259    iBuilder->restoreIP(ip);
260
261    for (unsigned i = 0; i < kernels.size(); i++) {
262        iBuilder->CreatePThreadCreateCall(pthreadsPtrs[i], nullVal, kernel_functions[i], iBuilder->CreateBitCast(kernels[i]->getInstance(), int8PtrTy));
263    }
264
265    std::vector<Value *> threadIDs;
266    for (unsigned i = 0; i < kernels.size(); i++) { 
267        threadIDs.push_back(iBuilder->CreateLoad(pthreadsPtrs[i]));
268    }
269   
270    for (unsigned i = 0; i < kernels.size(); i++) { 
271        iBuilder->CreatePThreadJoinCall(threadIDs[i], status);
272    }
273}
274
275
276void generatePipelineLoop(IDISA::IDISA_Builder * iBuilder, const std::vector<KernelBuilder *> & kernels) {
277    for (auto k : kernels) k->createInstance();
278    //BufferMap bufferMap;
279    //createStreamBufferMap(bufferMap, kernels);
280   
281    BasicBlock * entryBlock = iBuilder->GetInsertBlock();
282    Function * main = entryBlock->getParent();
283
284    // Create the basic blocks. 
285    BasicBlock * segmentLoop = BasicBlock::Create(iBuilder->getContext(), "segmentLoop", main, 0);
286    BasicBlock * exitBlock = BasicBlock::Create(iBuilder->getContext(), "exitBlock", main, 0);
287    // We create vectors of loop body and final segment blocks indexed by kernel.
288    std::vector<BasicBlock *> loopBodyBlocks;
289    std::vector<BasicBlock *> finalSegmentBlocks;
290
291    loopBodyBlocks.push_back(segmentLoop); 
292    finalSegmentBlocks.push_back(nullptr); 
293   
294    for (unsigned i = 1; i < kernels.size(); i++) {
295        if (kernels[i-1]->hasNoTerminateAttribute()) {
296            // Previous kernel cannot terminate.   Continue with the previous blocks;
297            loopBodyBlocks.push_back(loopBodyBlocks.back());
298            finalSegmentBlocks.push_back(finalSegmentBlocks.back());
299        }
300        else {
301            loopBodyBlocks.push_back(BasicBlock::Create(iBuilder->getContext(), "do_" + kernels[i]->getName(), main, 0));
302            finalSegmentBlocks.push_back(BasicBlock::Create(iBuilder->getContext(), "finish_" + kernels[i]->getName(), main, 0));
303        }
304    }
305    loopBodyBlocks.push_back(segmentLoop); // If the last kernel does not terminate, loop back.
306    finalSegmentBlocks.push_back(exitBlock); // If the last kernel does terminate, we're done.
307   
308    iBuilder->CreateBr(segmentLoop);
309    Constant * segBlocks = iBuilder->getSize(codegen::SegmentSize * iBuilder->getStride() / iBuilder->getBitBlockWidth());
310    for (unsigned i = 0; i < kernels.size(); i++) {
311        iBuilder->SetInsertPoint(loopBodyBlocks[i]);
312        //Value * segBlocks = getSegmentBlocks(bufferMap, kernels[i]);
313        Value * segNo = kernels[i]->acquireLogicalSegmentNo(kernels[i]->getInstance());
314        kernels[i]->createDoSegmentCall(kernels[i]->getInstance(), segBlocks);
315        if (kernels[i]->hasNoTerminateAttribute()) {
316            kernels[i]->releaseLogicalSegmentNo(kernels[i]->getInstance(), iBuilder->CreateAdd(segNo, iBuilder->getSize(1)));
317            if (i == kernels.size() - 1) {
318                iBuilder->CreateBr(segmentLoop);
319            }
320        }
321        else {
322            Value * terminated = kernels[i]->getTerminationSignal(kernels[i]->getInstance());
323            kernels[i]->releaseLogicalSegmentNo(kernels[i]->getInstance(), iBuilder->CreateAdd(segNo, iBuilder->getSize(1)));
324            iBuilder->CreateCondBr(terminated, finalSegmentBlocks[i+1], loopBodyBlocks[i+1]);
325        }
326        if (finalSegmentBlocks[i] != nullptr) {
327            iBuilder->SetInsertPoint(finalSegmentBlocks[i]);
328            Value * segNo = kernels[i]->acquireLogicalSegmentNo(kernels[i]->getInstance());
329            kernels[i]->createFinalSegmentCall(kernels[i]->getInstance(), segBlocks);
330            kernels[i]->releaseLogicalSegmentNo(kernels[i]->getInstance(), iBuilder->CreateAdd(segNo, iBuilder->getSize(1)));
331            if (finalSegmentBlocks[i] != finalSegmentBlocks[i+1]) {
332                iBuilder->CreateBr(finalSegmentBlocks[i+1]);
333            }
334        }
335    }
336    iBuilder->SetInsertPoint(exitBlock);
337}
Note: See TracBrowser for help on using the repository browser.