source: icGREP/icgrep-devel/icgrep/kernels/pipeline.cpp @ 5165

Last change on this file since 5165 was 5165, checked in by lindanl, 3 years ago

Add segment pipeline parallel strategy. Move ballot function to IDISA NVPTX.

File size: 14.7 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6
7#include <toolchain.h>
8#include "pipeline.h"
9
10#include <IDISA/idisa_builder.h>
11
12#include <kernels/interface.h>
13#include <kernels/kernel.h>
14#include <kernels/s2p_kernel.h>
15
16#include <llvm/IR/TypeBuilder.h>
17#include <iostream>
18
19using namespace kernel;
20
21Function * generateSegmentParallelPipelineThreadFunction(std::string name, IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels, Type * sharedStructType, int id) {
22
23    Module * m = iBuilder->getModule();
24    Type * const size_ty = iBuilder->getSizeTy();
25    Type * const voidTy = Type::getVoidTy(m->getContext());
26    Type * const voidPtrTy = TypeBuilder<void *, false>::get(m->getContext());
27    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
28
29    Function * const threadFunc = cast<Function>(m->getOrInsertFunction(name, voidTy, int8PtrTy, nullptr));
30    threadFunc->setCallingConv(CallingConv::C);
31    Function::arg_iterator args = threadFunc->arg_begin();
32
33    Value * const input = &*(args++);
34    input->setName("input");
35
36    int threadNum = codegen::ThreadNum;
37
38     // Create the basic blocks for the thread function.
39    BasicBlock * entryBlock = BasicBlock::Create(iBuilder->getContext(), "entry", threadFunc, 0);
40    BasicBlock * segmentLoop = BasicBlock::Create(iBuilder->getContext(), "segmentCond", threadFunc, 0);
41    BasicBlock * finalSegmentLoopExit = BasicBlock::Create(iBuilder->getContext(), "partialSegmentCond", threadFunc, 0);
42    BasicBlock * exitThreadBlock = BasicBlock::Create(iBuilder->getContext(), "exitThread", threadFunc, 0);
43    std::vector<BasicBlock *> segmentWait;
44    std::vector<BasicBlock *> segmentLoopBody;
45    std::vector<BasicBlock *> partialSegmentWait;
46    std::vector<BasicBlock *> partialSegmentLoopBody;
47    for (unsigned i = 0; i < kernels.size(); i++) {
48        segmentWait.push_back(BasicBlock::Create(iBuilder->getContext(), "segmentWait"+std::to_string(i), threadFunc, 0));
49        segmentLoopBody.push_back(BasicBlock::Create(iBuilder->getContext(), "segmentWait"+std::to_string(i), threadFunc, 0));
50        partialSegmentWait.push_back(BasicBlock::Create(iBuilder->getContext(), "partialSegmentWait"+std::to_string(i), threadFunc, 0));
51        partialSegmentLoopBody.push_back(BasicBlock::Create(iBuilder->getContext(), "partialSegmentLoopBody"+std::to_string(i), threadFunc, 0));
52    }
53
54    iBuilder->SetInsertPoint(entryBlock);
55
56    Value * sharedStruct = iBuilder->CreateBitCast(input, PointerType::get(sharedStructType, 0));
57    Value * myThreadId = ConstantInt::get(size_ty, id);
58    Value * fileSize = iBuilder->CreateLoad(iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(0)}));
59    std::vector<Value *> instancePtrs;
60    for (unsigned i = 0; i < kernels.size(); i++) {
61        Value * ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(i + 1)});
62        instancePtrs.push_back(iBuilder->CreateLoad(ptr));
63    }
64   
65    // Some important constant values.
66    int segmentSize = codegen::SegmentSize;
67    Constant * segmentBlocks = ConstantInt::get(size_ty, segmentSize);
68    Constant * hypersegmentBlocks = ConstantInt::get(size_ty, segmentSize * threadNum);
69    Constant * segmentBytes = ConstantInt::get(size_ty, iBuilder->getStride() * segmentSize);
70    Constant * hypersegmentBytes = ConstantInt::get(size_ty, iBuilder->getStride() * segmentSize * threadNum);
71    Constant * const blockSize = ConstantInt::get(size_ty, iBuilder->getStride());
72
73    // The offset of my starting segment within the thread group hypersegment.
74    Value * myBlockNo = iBuilder->CreateMul(segmentBlocks, myThreadId);
75    Value * myOffset = iBuilder->CreateMul(segmentBytes, myThreadId);
76    Value * fullSegLimit = iBuilder->CreateAdd(myOffset, segmentBytes);
77
78    iBuilder->CreateBr(segmentLoop);
79
80    iBuilder->SetInsertPoint(segmentLoop);
81    PHINode * remainingBytes = iBuilder->CreatePHI(size_ty, 2, "remainingBytes");
82    remainingBytes->addIncoming(fileSize, entryBlock);
83    PHINode * blockNo = iBuilder->CreatePHI(size_ty, 2, "blockNo");
84    blockNo->addIncoming(myBlockNo, entryBlock);
85
86    Value * LT_fullSegment = iBuilder->CreateICmpSLT(remainingBytes, fullSegLimit);
87    iBuilder->CreateCondBr(LT_fullSegment, finalSegmentLoopExit, segmentWait[0]);
88
89    for (unsigned i = 0; i < kernels.size(); i++) {
90        iBuilder->SetInsertPoint(segmentWait[i]);
91        Value * curBlockNo = kernels[i]->getBlockNo(instancePtrs[i]);
92        Value * cond = iBuilder->CreateICmpEQ(curBlockNo, blockNo);
93        iBuilder->CreateCondBr(cond, segmentLoopBody[i], segmentWait[i]);
94
95        iBuilder->SetInsertPoint(segmentLoopBody[i]);
96        kernels[i]->createDoSegmentCall(instancePtrs[i], segmentBlocks);
97        if (i == kernels.size() - 1) break;
98        iBuilder->CreateBr(segmentWait[i+1]);
99    }
100   
101    remainingBytes->addIncoming(iBuilder->CreateSub(remainingBytes, hypersegmentBytes), segmentLoopBody[kernels.size()-1]);
102    blockNo->addIncoming(iBuilder->CreateAdd(blockNo, hypersegmentBlocks), segmentLoopBody[kernels.size()-1]);
103    iBuilder->CreateBr(segmentLoop);
104
105    // Now we may have a partial segment, or we may be completely done
106    // because the last segment was handled by a previous thread in the group.
107    iBuilder->SetInsertPoint(finalSegmentLoopExit);
108    Value * alreadyDone = iBuilder->CreateICmpSLT(remainingBytes, myOffset);
109    Value * remainingForMe = iBuilder->CreateSub(remainingBytes, myOffset);
110    Value * blocksToDo = iBuilder->CreateUDiv(remainingForMe, blockSize);
111    iBuilder->CreateCondBr(alreadyDone, exitThreadBlock, partialSegmentWait[0]);
112
113    // Full Block Pipeline loop
114    for (unsigned i = 0; i < kernels.size(); i++) {
115        iBuilder->SetInsertPoint(partialSegmentWait[i]);
116        Value * curBlockNo = kernels[i]->getBlockNo(instancePtrs[i]);
117        Value * cond = iBuilder->CreateICmpEQ(curBlockNo, blockNo);
118        iBuilder->CreateCondBr(cond, partialSegmentLoopBody[i], partialSegmentWait[i]);
119
120        iBuilder->SetInsertPoint(partialSegmentLoopBody[i]);
121        kernels[i]->createDoSegmentCall(instancePtrs[i], blocksToDo);
122        kernels[i]->createFinalBlockCall(instancePtrs[i], iBuilder->CreateURem(remainingForMe, blockSize));
123        if (i == kernels.size() - 1) break;
124        iBuilder->CreateBr(partialSegmentWait[i+1]);
125    }
126    iBuilder->CreateBr(exitThreadBlock);
127
128    iBuilder->SetInsertPoint(exitThreadBlock);
129    Value * nullVal = Constant::getNullValue(voidPtrTy);
130    Function * pthreadExitFunc = m->getFunction("pthread_exit");
131    CallInst * exitThread = iBuilder->CreateCall(pthreadExitFunc, {nullVal});
132    exitThread->setDoesNotReturn();
133    iBuilder->CreateRetVoid();
134
135    return threadFunc;
136}
137
138void generateSegmentParallelPipeline(IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels, std::vector<Value *> instances, Value * fileSize) {
139   
140    int threadNum = codegen::ThreadNum;
141
142    Module * m = iBuilder->getModule();
143
144    Type * const size_ty = iBuilder->getSizeTy();
145    Type * const voidPtrTy = TypeBuilder<void *, false>::get(m->getContext());
146    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
147    Type * const pthreadsTy = ArrayType::get(size_ty, threadNum);
148    AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
149    std::vector<Value *> pthreadsPtrs;
150    for (unsigned i = 0; i < threadNum; i++) {
151        pthreadsPtrs.push_back(iBuilder->CreateGEP(pthreads, {iBuilder->getInt32(0), iBuilder->getInt32(i)}));
152    }
153    Value * nullVal = Constant::getNullValue(voidPtrTy);
154    AllocaInst * const status = iBuilder->CreateAlloca(int8PtrTy);
155
156    std::vector<Type *> structTypes;
157    structTypes.push_back(size_ty);//input size
158    for (unsigned i = 0; i < instances.size(); i++) {
159        structTypes.push_back(instances[i]->getType());
160    }
161    Type * sharedStructType = StructType::get(m->getContext(), structTypes);
162
163    AllocaInst * sharedStruct;
164    sharedStruct = iBuilder->CreateAlloca(sharedStructType);
165    Value * sizePtr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(0)});
166    iBuilder->CreateStore(fileSize, sizePtr);
167    for (unsigned i = 0; i < instances.size(); i++) {
168        Value * ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(i+1)});
169        iBuilder->CreateStore(instances[i], ptr);
170    }
171
172    std::vector<Function *> thread_functions;
173    const auto ip = iBuilder->saveIP();
174    for (unsigned i = 0; i < threadNum; i++) {
175        thread_functions.push_back(generateSegmentParallelPipelineThreadFunction("thread"+std::to_string(i), iBuilder, kernels, sharedStructType, i));
176    }
177    iBuilder->restoreIP(ip);
178
179    Function * pthreadCreateFunc = m->getFunction("pthread_create");
180    Function * pthreadJoinFunc = m->getFunction("pthread_join");
181
182    for (unsigned i = 0; i < threadNum; i++) {
183        iBuilder->CreateCall(pthreadCreateFunc, std::vector<Value *>({pthreadsPtrs[i], nullVal, thread_functions[i], iBuilder->CreateBitCast(sharedStruct, int8PtrTy)}));
184    }
185
186    std::vector<Value *> threadIDs;
187    for (unsigned i = 0; i < threadNum; i++) { 
188        threadIDs.push_back(iBuilder->CreateLoad(pthreadsPtrs[i]));
189    }
190   
191    for (unsigned i = 0; i < threadNum; i++) { 
192        iBuilder->CreateCall(pthreadJoinFunc, std::vector<Value *>({threadIDs[i], status}));
193    }
194
195}
196
197void generatePipelineParallel(IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels, std::vector<Value *> instances) {
198 
199    Module * m = iBuilder->getModule();
200
201    Type * pthreadTy = iBuilder->getSizeTy();     
202    Type * const voidPtrTy = TypeBuilder<void *, false>::get(m->getContext());
203    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
204
205    Type * const pthreadsTy = ArrayType::get(pthreadTy, kernels.size());
206    AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
207    std::vector<Value *> pthreadsPtrs;
208    for (unsigned i = 0; i < kernels.size(); i++) {
209        pthreadsPtrs.push_back(iBuilder->CreateGEP(pthreads, {iBuilder->getInt32(0), iBuilder->getInt32(i)}));
210    }
211    Value * nullVal = Constant::getNullValue(voidPtrTy);
212    AllocaInst * const status = iBuilder->CreateAlloca(int8PtrTy);
213
214    std::vector<Function *> kernel_functions;
215    const auto ip = iBuilder->saveIP();
216    for (unsigned i = 0; i < kernels.size(); i++) {
217        kernel_functions.push_back(kernels[i]->generateThreadFunction("k_"+std::to_string(i)));
218    }
219    iBuilder->restoreIP(ip);
220
221    Function * pthreadCreateFunc = m->getFunction("pthread_create");
222    Function * pthreadJoinFunc = m->getFunction("pthread_join");
223
224    for (unsigned i = 0; i < kernels.size(); i++) {
225        iBuilder->CreateCall(pthreadCreateFunc, std::vector<Value *>({pthreadsPtrs[i], nullVal, kernel_functions[i], iBuilder->CreateBitCast(instances[i], int8PtrTy)}));
226    }
227
228    std::vector<Value *> threadIDs;
229    for (unsigned i = 0; i < kernels.size(); i++) { 
230        threadIDs.push_back(iBuilder->CreateLoad(pthreadsPtrs[i]));
231    }
232   
233    for (unsigned i = 0; i < kernels.size(); i++) { 
234        iBuilder->CreateCall(pthreadJoinFunc, std::vector<Value *>({threadIDs[i], status}));
235    }
236}
237
238
239void generatePipelineLoop(IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels, std::vector<Value *> instances, Value * fileSize) {
240   
241    BasicBlock * entryBlock = iBuilder->GetInsertBlock();
242    Function * main = entryBlock->getParent();
243       
244    const unsigned segmentSize = codegen::SegmentSize;
245    Type * const size_ty = iBuilder->getSizeTy();
246
247    // Create the basic blocks for the loop.
248    BasicBlock * segmentCondBlock = nullptr;
249    BasicBlock * segmentBodyBlock = nullptr;
250    if (segmentSize > 1) {
251        segmentCondBlock = BasicBlock::Create(iBuilder->getContext(), "segmentCond", main, 0);
252        segmentBodyBlock = BasicBlock::Create(iBuilder->getContext(), "segmentBody", main, 0);
253    }
254    BasicBlock * fullCondBlock = BasicBlock::Create(iBuilder->getContext(), "fullCond", main, 0);
255    BasicBlock * fullBodyBlock = BasicBlock::Create(iBuilder->getContext(), "fullBody", main, 0);
256    BasicBlock * finalBlock = BasicBlock::Create(iBuilder->getContext(), "final", main, 0);
257   
258   
259    Value * initialBufferSize = nullptr;
260    Value * initialBlockNo = nullptr;
261    BasicBlock * initialBlock = nullptr;
262   
263    if (segmentSize > 1) {
264        iBuilder->CreateBr(segmentCondBlock);
265        iBuilder->SetInsertPoint(segmentCondBlock);
266        PHINode * remainingBytes = iBuilder->CreatePHI(size_ty, 2, "remainingBytes");
267        remainingBytes->addIncoming(fileSize, entryBlock);
268        PHINode * blockNo = iBuilder->CreatePHI(size_ty, 2, "blockNo");
269        blockNo->addIncoming(ConstantInt::get(size_ty, 0), entryBlock);
270       
271        Constant * const step = ConstantInt::get(size_ty, iBuilder->getStride() * segmentSize);
272        Value * segmentCondTest = iBuilder->CreateICmpULT(remainingBytes, step);
273        iBuilder->CreateCondBr(segmentCondTest, fullCondBlock, segmentBodyBlock);
274       
275        iBuilder->SetInsertPoint(segmentBodyBlock);
276        Value * segBlocks = ConstantInt::get(size_ty, segmentSize);
277        for (unsigned i = 0; i < kernels.size(); i++) {
278            kernels[i]->createDoSegmentCall(instances[i], segBlocks);
279        }
280        remainingBytes->addIncoming(iBuilder->CreateSub(remainingBytes, step), segmentBodyBlock);
281        blockNo->addIncoming(iBuilder->CreateAdd(blockNo, segBlocks), segmentBodyBlock);
282       
283        iBuilder->CreateBr(segmentCondBlock);
284        initialBufferSize = remainingBytes;
285        initialBlockNo = blockNo;
286        initialBlock = segmentCondBlock;
287    } else {
288        initialBufferSize = fileSize;
289        initialBlockNo = ConstantInt::get(size_ty, 0);
290        initialBlock = entryBlock;
291        iBuilder->CreateBr(fullCondBlock);
292    }
293   
294    iBuilder->SetInsertPoint(fullCondBlock);
295    PHINode * remainingBytes = iBuilder->CreatePHI(size_ty, 2, "remainingBytes");
296    remainingBytes->addIncoming(initialBufferSize, initialBlock);
297    PHINode * blockNo = iBuilder->CreatePHI(size_ty, 2, "blockNo");
298    blockNo->addIncoming(initialBlockNo, initialBlock);
299   
300    Constant * const step = ConstantInt::get(size_ty, iBuilder->getStride());
301    Value * fullCondTest = iBuilder->CreateICmpULT(remainingBytes, step);
302    iBuilder->CreateCondBr(fullCondTest, finalBlock, fullBodyBlock);
303   
304    // Full Block Pipeline loop
305    iBuilder->SetInsertPoint(fullBodyBlock);
306    for (unsigned i = 0; i < kernels.size(); i++) {
307        kernels[i]->createDoSegmentCall(instances[i], ConstantInt::get(size_ty, 1));
308    }
309   
310    remainingBytes->addIncoming(iBuilder->CreateSub(remainingBytes, step), fullBodyBlock);
311    blockNo->addIncoming(iBuilder->CreateAdd(blockNo, ConstantInt::get(size_ty, 1)), fullBodyBlock);
312    iBuilder->CreateBr(fullCondBlock);
313   
314    iBuilder->SetInsertPoint(finalBlock);
315    for (unsigned i = 0; i < kernels.size(); i++) {
316        kernels[i]->createFinalBlockCall(instances[i], remainingBytes);
317    }
318}
Note: See TracBrowser for help on using the repository browser.