source: icGREP/icgrep-devel/icgrep/kernels/pipeline.cpp @ 5175

Last change on this file since 5175 was 5175, checked in by cameron, 3 years ago

Some tidy ups and changes to prepare for LLVM 3.9

File size: 14.8 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6
7#include <toolchain.h>
8#include "pipeline.h"
9
10#include <IDISA/idisa_builder.h>
11
12#include <kernels/interface.h>
13#include <kernels/kernel.h>
14#include <kernels/s2p_kernel.h>
15
16#include <llvm/IR/TypeBuilder.h>
17#include <iostream>
18
19using namespace kernel;
20
21Function * generateSegmentParallelPipelineThreadFunction(std::string name, IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels, Type * sharedStructType, int id) {
22
23    Module * m = iBuilder->getModule();
24    Type * const size_ty = iBuilder->getSizeTy();
25    Type * const voidTy = Type::getVoidTy(m->getContext());
26    Type * const voidPtrTy = TypeBuilder<void *, false>::get(m->getContext());
27    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
28
29    Function * const threadFunc = cast<Function>(m->getOrInsertFunction(name, voidTy, int8PtrTy, nullptr));
30    threadFunc->setCallingConv(CallingConv::C);
31    Function::arg_iterator args = threadFunc->arg_begin();
32
33    Value * const input = &*(args++);
34    input->setName("input");
35
36    unsigned threadNum = codegen::ThreadNum;
37
38     // Create the basic blocks for the thread function.
39    BasicBlock * entryBlock = BasicBlock::Create(iBuilder->getContext(), "entry", threadFunc, 0);
40    BasicBlock * segmentLoop = BasicBlock::Create(iBuilder->getContext(), "segmentCond", threadFunc, 0);
41    BasicBlock * finalSegmentLoopExit = BasicBlock::Create(iBuilder->getContext(), "partialSegmentCond", threadFunc, 0);
42    BasicBlock * exitThreadBlock = BasicBlock::Create(iBuilder->getContext(), "exitThread", threadFunc, 0);
43    std::vector<BasicBlock *> segmentWait;
44    std::vector<BasicBlock *> segmentLoopBody;
45    std::vector<BasicBlock *> partialSegmentWait;
46    std::vector<BasicBlock *> partialSegmentLoopBody;
47    for (unsigned i = 0; i < kernels.size(); i++) {
48        segmentWait.push_back(BasicBlock::Create(iBuilder->getContext(), "segmentWait"+std::to_string(i), threadFunc, 0));
49        segmentLoopBody.push_back(BasicBlock::Create(iBuilder->getContext(), "segmentWait"+std::to_string(i), threadFunc, 0));
50        partialSegmentWait.push_back(BasicBlock::Create(iBuilder->getContext(), "partialSegmentWait"+std::to_string(i), threadFunc, 0));
51        partialSegmentLoopBody.push_back(BasicBlock::Create(iBuilder->getContext(), "partialSegmentLoopBody"+std::to_string(i), threadFunc, 0));
52    }
53
54    iBuilder->SetInsertPoint(entryBlock);
55    Value * sharedStruct = iBuilder->CreateBitCast(input, PointerType::get(sharedStructType, 0));
56    Value * myThreadId = ConstantInt::get(size_ty, id);
57    Value * fileSize = iBuilder->CreateLoad(iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(0)}));
58    std::vector<Value *> instancePtrs;
59    for (unsigned i = 0; i < kernels.size(); i++) {
60        Value * ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(i + 1)});
61        instancePtrs.push_back(iBuilder->CreateLoad(ptr));
62    }
63   
64    // Some important constant values.
65    int segmentSize = codegen::SegmentSize;
66    Constant * segmentBlocks = ConstantInt::get(size_ty, segmentSize);
67    Constant * segmentBytes = ConstantInt::get(size_ty, iBuilder->getStride() * segmentSize);
68    Constant * hypersegmentBytes = ConstantInt::get(size_ty, iBuilder->getStride() * segmentSize * threadNum);
69    Constant * const blockSize = ConstantInt::get(size_ty, iBuilder->getStride());
70
71    Value * myFirstSegNo = myThreadId;  //
72    // The offset of my starting segment within the thread group hypersegment.
73    Value * myOffset = iBuilder->CreateMul(segmentBytes, myThreadId);
74    Value * fullSegLimit = iBuilder->CreateAdd(myOffset, segmentBytes);
75
76    iBuilder->CreateBr(segmentLoop);
77
78    iBuilder->SetInsertPoint(segmentLoop);
79    PHINode * remainingBytes = iBuilder->CreatePHI(size_ty, 2, "remainingBytes");
80    remainingBytes->addIncoming(fileSize, entryBlock);
81    PHINode * segNo = iBuilder->CreatePHI(size_ty, 2, "segNo");
82    segNo->addIncoming(myFirstSegNo, entryBlock);
83
84    Value * LT_fullSegment = iBuilder->CreateICmpSLT(remainingBytes, fullSegLimit);
85    iBuilder->CreateCondBr(LT_fullSegment, finalSegmentLoopExit, segmentWait[0]);
86
87    for (unsigned i = 0; i < kernels.size(); i++) {
88        iBuilder->SetInsertPoint(segmentWait[i]);
89        Value * processedSegmentCount = kernels[i]->getLogicalSegmentNo(instancePtrs[i]);
90        Value * cond = iBuilder->CreateICmpEQ(segNo, processedSegmentCount);
91        iBuilder->CreateCondBr(cond, segmentLoopBody[i], segmentWait[i]);
92
93        iBuilder->SetInsertPoint(segmentLoopBody[i]);
94        kernels[i]->createDoSegmentCall(instancePtrs[i], segmentBlocks);
95        if (i == kernels.size() - 1) break;
96        iBuilder->CreateBr(segmentWait[i+1]);
97    }
98   
99    remainingBytes->addIncoming(iBuilder->CreateSub(remainingBytes, hypersegmentBytes), segmentLoopBody[kernels.size()-1]);
100    segNo->addIncoming(iBuilder->CreateAdd(segNo, ConstantInt::get(size_ty, threadNum)), segmentLoopBody[kernels.size()-1]);
101    iBuilder->CreateBr(segmentLoop);
102
103    // Now we may have a partial segment, or we may be completely done
104    // because the last segment was handled by a previous thread in the group.
105    iBuilder->SetInsertPoint(finalSegmentLoopExit);
106    Value * alreadyDone = iBuilder->CreateICmpSLT(remainingBytes, myOffset);
107    Value * remainingForMe = iBuilder->CreateSub(remainingBytes, myOffset);
108    Value * blocksToDo = iBuilder->CreateUDiv(remainingForMe, blockSize);
109    iBuilder->CreateCondBr(alreadyDone, exitThreadBlock, partialSegmentWait[0]);
110
111    // Full Block Pipeline loop
112    for (unsigned i = 0; i < kernels.size(); i++) {
113        iBuilder->SetInsertPoint(partialSegmentWait[i]);
114        Value * processedSegmentCount = kernels[i]->getLogicalSegmentNo(instancePtrs[i]);
115        Value * cond = iBuilder->CreateICmpEQ(segNo, processedSegmentCount);
116        iBuilder->CreateCondBr(cond, partialSegmentLoopBody[i], partialSegmentWait[i]);
117
118        iBuilder->SetInsertPoint(partialSegmentLoopBody[i]);
119        kernels[i]->createDoSegmentCall(instancePtrs[i], blocksToDo);
120        kernels[i]->createFinalBlockCall(instancePtrs[i], iBuilder->CreateURem(remainingForMe, blockSize));
121        if (i == kernels.size() - 1) break;
122        iBuilder->CreateBr(partialSegmentWait[i+1]);
123    }
124    iBuilder->CreateBr(exitThreadBlock);
125
126    iBuilder->SetInsertPoint(exitThreadBlock);
127    Value * nullVal = Constant::getNullValue(voidPtrTy);
128    Function * pthreadExitFunc = m->getFunction("pthread_exit");
129    CallInst * exitThread = iBuilder->CreateCall(pthreadExitFunc, {nullVal});
130    exitThread->setDoesNotReturn();
131    iBuilder->CreateRetVoid();
132
133    return threadFunc;
134}
135
136void generateSegmentParallelPipeline(IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels, std::vector<Value *> instances, Value * fileSize) {
137   
138    unsigned threadNum = codegen::ThreadNum;
139
140    Module * m = iBuilder->getModule();
141
142    Type * const size_ty = iBuilder->getSizeTy();
143    Type * const voidPtrTy = TypeBuilder<void *, false>::get(m->getContext());
144    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
145    Type * const pthreadsTy = ArrayType::get(size_ty, threadNum);
146    AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
147    std::vector<Value *> pthreadsPtrs;
148    for (unsigned i = 0; i < threadNum; i++) {
149        pthreadsPtrs.push_back(iBuilder->CreateGEP(pthreads, {iBuilder->getInt32(0), iBuilder->getInt32(i)}));
150    }
151    Value * nullVal = Constant::getNullValue(voidPtrTy);
152    AllocaInst * const status = iBuilder->CreateAlloca(int8PtrTy);
153
154    std::vector<Type *> structTypes;
155    structTypes.push_back(size_ty);//input size
156    for (unsigned i = 0; i < instances.size(); i++) {
157        structTypes.push_back(instances[i]->getType());
158    }
159    Type * sharedStructType = StructType::get(m->getContext(), structTypes);
160
161    AllocaInst * sharedStruct;
162    sharedStruct = iBuilder->CreateAlloca(sharedStructType);
163    Value * sizePtr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(0)});
164    iBuilder->CreateStore(fileSize, sizePtr);
165    for (unsigned i = 0; i < instances.size(); i++) {
166        Value * ptr = iBuilder->CreateGEP(sharedStruct, {iBuilder->getInt32(0), iBuilder->getInt32(i+1)});
167        iBuilder->CreateStore(instances[i], ptr);
168    }
169
170    std::vector<Function *> thread_functions;
171    const auto ip = iBuilder->saveIP();
172    for (unsigned i = 0; i < threadNum; i++) {
173        thread_functions.push_back(generateSegmentParallelPipelineThreadFunction("thread"+std::to_string(i), iBuilder, kernels, sharedStructType, i));
174    }
175    iBuilder->restoreIP(ip);
176
177    Function * pthreadCreateFunc = m->getFunction("pthread_create");
178    Function * pthreadJoinFunc = m->getFunction("pthread_join");
179
180    for (unsigned i = 0; i < threadNum; i++) {
181        iBuilder->CreateCall(pthreadCreateFunc, std::vector<Value *>({pthreadsPtrs[i], nullVal, thread_functions[i], iBuilder->CreateBitCast(sharedStruct, int8PtrTy)}));
182    }
183
184    std::vector<Value *> threadIDs;
185    for (unsigned i = 0; i < threadNum; i++) { 
186        threadIDs.push_back(iBuilder->CreateLoad(pthreadsPtrs[i]));
187    }
188   
189    for (unsigned i = 0; i < threadNum; i++) { 
190        iBuilder->CreateCall(pthreadJoinFunc, std::vector<Value *>({threadIDs[i], status}));
191    }
192
193}
194
195void generatePipelineParallel(IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels, std::vector<Value *> instances) {
196 
197    Module * m = iBuilder->getModule();
198
199    Type * pthreadTy = iBuilder->getSizeTy();     
200    Type * const voidPtrTy = TypeBuilder<void *, false>::get(m->getContext());
201    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
202
203    Type * const pthreadsTy = ArrayType::get(pthreadTy, kernels.size());
204    AllocaInst * const pthreads = iBuilder->CreateAlloca(pthreadsTy);
205    std::vector<Value *> pthreadsPtrs;
206    for (unsigned i = 0; i < kernels.size(); i++) {
207        pthreadsPtrs.push_back(iBuilder->CreateGEP(pthreads, {iBuilder->getInt32(0), iBuilder->getInt32(i)}));
208    }
209    Value * nullVal = Constant::getNullValue(voidPtrTy);
210    AllocaInst * const status = iBuilder->CreateAlloca(int8PtrTy);
211
212    std::vector<Function *> kernel_functions;
213    const auto ip = iBuilder->saveIP();
214    for (unsigned i = 0; i < kernels.size(); i++) {
215        kernel_functions.push_back(kernels[i]->generateThreadFunction("k_"+std::to_string(i)));
216    }
217    iBuilder->restoreIP(ip);
218
219    Function * pthreadCreateFunc = m->getFunction("pthread_create");
220    Function * pthreadJoinFunc = m->getFunction("pthread_join");
221
222    for (unsigned i = 0; i < kernels.size(); i++) {
223        iBuilder->CreateCall(pthreadCreateFunc, std::vector<Value *>({pthreadsPtrs[i], nullVal, kernel_functions[i], iBuilder->CreateBitCast(instances[i], int8PtrTy)}));
224    }
225
226    std::vector<Value *> threadIDs;
227    for (unsigned i = 0; i < kernels.size(); i++) { 
228        threadIDs.push_back(iBuilder->CreateLoad(pthreadsPtrs[i]));
229    }
230   
231    for (unsigned i = 0; i < kernels.size(); i++) { 
232        iBuilder->CreateCall(pthreadJoinFunc, std::vector<Value *>({threadIDs[i], status}));
233    }
234}
235
236
237void generatePipelineLoop(IDISA::IDISA_Builder * iBuilder, std::vector<KernelBuilder *> kernels, std::vector<Value *> instances, Value * fileSize) {
238   
239    BasicBlock * entryBlock = iBuilder->GetInsertBlock();
240    Function * main = entryBlock->getParent();
241       
242    const unsigned segmentSize = codegen::SegmentSize;
243    Type * const size_ty = iBuilder->getSizeTy();
244
245    // Create the basic blocks for the loop.
246    BasicBlock * segmentCondBlock = nullptr;
247    BasicBlock * segmentBodyBlock = nullptr;
248    if (segmentSize > 1) {
249        segmentCondBlock = BasicBlock::Create(iBuilder->getContext(), "segmentCond", main, 0);
250        segmentBodyBlock = BasicBlock::Create(iBuilder->getContext(), "segmentBody", main, 0);
251    }
252    BasicBlock * fullCondBlock = BasicBlock::Create(iBuilder->getContext(), "fullCond", main, 0);
253    BasicBlock * fullBodyBlock = BasicBlock::Create(iBuilder->getContext(), "fullBody", main, 0);
254    BasicBlock * finalBlock = BasicBlock::Create(iBuilder->getContext(), "final", main, 0);
255    BasicBlock * exitBlock = BasicBlock::Create(iBuilder->getContext(), "exit", main, 0);
256   
257   
258    Value * initialBufferSize = nullptr;
259    Value * initialBlockNo = nullptr;
260    BasicBlock * initialBlock = nullptr;
261   
262    if (segmentSize > 1) {
263        iBuilder->CreateBr(segmentCondBlock);
264        iBuilder->SetInsertPoint(segmentCondBlock);
265        PHINode * remainingBytes = iBuilder->CreatePHI(size_ty, 2, "remainingBytes");
266        remainingBytes->addIncoming(fileSize, entryBlock);
267        PHINode * blockNo = iBuilder->CreatePHI(size_ty, 2, "blockNo");
268        blockNo->addIncoming(ConstantInt::get(size_ty, 0), entryBlock);
269       
270        Constant * const step = ConstantInt::get(size_ty, iBuilder->getStride() * segmentSize);
271        Value * segmentCondTest = iBuilder->CreateICmpULT(remainingBytes, step);
272        iBuilder->CreateCondBr(segmentCondTest, fullCondBlock, segmentBodyBlock);
273       
274        iBuilder->SetInsertPoint(segmentBodyBlock);
275        Value * segBlocks = ConstantInt::get(size_ty, segmentSize);
276        for (unsigned i = 0; i < kernels.size(); i++) {
277            kernels[i]->createDoSegmentCall(instances[i], segBlocks);
278        }
279        remainingBytes->addIncoming(iBuilder->CreateSub(remainingBytes, step), segmentBodyBlock);
280        blockNo->addIncoming(iBuilder->CreateAdd(blockNo, segBlocks), segmentBodyBlock);
281       
282        iBuilder->CreateBr(segmentCondBlock);
283        initialBufferSize = remainingBytes;
284        initialBlockNo = blockNo;
285        initialBlock = segmentCondBlock;
286    } else {
287        initialBufferSize = fileSize;
288        initialBlockNo = ConstantInt::get(size_ty, 0);
289        initialBlock = entryBlock;
290        iBuilder->CreateBr(fullCondBlock);
291    }
292   
293    iBuilder->SetInsertPoint(fullCondBlock);
294    PHINode * remainingBytes = iBuilder->CreatePHI(size_ty, 2, "remainingBytes");
295    remainingBytes->addIncoming(initialBufferSize, initialBlock);
296    PHINode * blockNo = iBuilder->CreatePHI(size_ty, 2, "blockNo");
297    blockNo->addIncoming(initialBlockNo, initialBlock);
298   
299    Constant * const step = ConstantInt::get(size_ty, iBuilder->getStride());
300    Value * fullCondTest = iBuilder->CreateICmpULT(remainingBytes, step);
301    iBuilder->CreateCondBr(fullCondTest, finalBlock, fullBodyBlock);
302   
303    // Full Block Pipeline loop
304    iBuilder->SetInsertPoint(fullBodyBlock);
305    for (unsigned i = 0; i < kernels.size(); i++) {
306        kernels[i]->createDoSegmentCall(instances[i], ConstantInt::get(size_ty, 1));
307    }
308   
309    remainingBytes->addIncoming(iBuilder->CreateSub(remainingBytes, step), fullBodyBlock);
310    blockNo->addIncoming(iBuilder->CreateAdd(blockNo, ConstantInt::get(size_ty, 1)), fullBodyBlock);
311    iBuilder->CreateBr(fullCondBlock);
312   
313    iBuilder->SetInsertPoint(finalBlock);
314    for (unsigned i = 0; i < kernels.size(); i++) {
315        kernels[i]->createFinalBlockCall(instances[i], remainingBytes);
316    }
317    iBuilder->CreateBr(exitBlock);
318    iBuilder->SetInsertPoint(exitBlock);
319
320}
Note: See TracBrowser for help on using the repository browser.