1 | /* |
---|
2 | * Copyright (c) 2016-7 International Characters. |
---|
3 | * This software is licensed to the public under the Open Software License 3.0. |
---|
4 | */ |
---|
5 | |
---|
6 | #include "kernel.h" |
---|
7 | #include <toolchain/toolchain.h> |
---|
8 | #include <kernels/streamset.h> |
---|
9 | #include <llvm/IR/Constants.h> |
---|
10 | #include <llvm/IR/Function.h> |
---|
11 | #include <llvm/IR/Instructions.h> |
---|
12 | #include <llvm/IR/MDBuilder.h> |
---|
13 | #include <llvm/IR/Module.h> |
---|
14 | #include <llvm/Support/raw_ostream.h> |
---|
15 | #if LLVM_VERSION_INTEGER < LLVM_4_0_0 |
---|
16 | #include <llvm/Bitcode/ReaderWriter.h> |
---|
17 | #else |
---|
18 | #include <llvm/Bitcode/BitcodeWriter.h> |
---|
19 | #endif |
---|
20 | #include <llvm/Transforms/Utils/Local.h> |
---|
21 | #include <kernels/streamset.h> |
---|
22 | #include <sstream> |
---|
23 | #include <kernels/kernel_builder.h> |
---|
24 | #include <boost/math/common_factor_rt.hpp> |
---|
25 | #include <llvm/Support/Debug.h> |
---|
26 | |
---|
27 | using namespace llvm; |
---|
28 | using namespace parabix; |
---|
29 | using namespace boost::math; |
---|
30 | |
---|
31 | namespace kernel { |
---|
32 | |
---|
33 | const std::string Kernel::DO_BLOCK_SUFFIX = "_DoBlock"; |
---|
34 | const std::string Kernel::FINAL_BLOCK_SUFFIX = "_FinalBlock"; |
---|
35 | const std::string Kernel::MULTI_BLOCK_SUFFIX = "_MultiBlock"; |
---|
36 | const std::string Kernel::LOGICAL_SEGMENT_NO_SCALAR = "logicalSegNo"; |
---|
37 | const std::string Kernel::PROCESSED_ITEM_COUNT_SUFFIX = "_processedItemCount"; |
---|
38 | const std::string Kernel::CONSUMED_ITEM_COUNT_SUFFIX = "_consumedItemCount"; |
---|
39 | const std::string Kernel::PRODUCED_ITEM_COUNT_SUFFIX = "_producedItemCount"; |
---|
40 | const std::string Kernel::TERMINATION_SIGNAL = "terminationSignal"; |
---|
41 | const std::string Kernel::BUFFER_PTR_SUFFIX = "_bufferPtr"; |
---|
42 | const std::string Kernel::CONSUMER_SUFFIX = "_consumerLocks"; |
---|
43 | const std::string Kernel::CYCLECOUNT_SCALAR = "CPUcycles"; |
---|
44 | |
---|
45 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
46 | * @brief addScalar |
---|
47 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
48 | unsigned Kernel::addScalar(Type * const type, const std::string & name) { |
---|
49 | if (LLVM_UNLIKELY(mKernelStateType != nullptr)) { |
---|
50 | report_fatal_error("Cannot add field " + name + " to " + getName() + " after kernel state finalized"); |
---|
51 | } |
---|
52 | if (LLVM_UNLIKELY(mKernelMap.count(name))) { |
---|
53 | report_fatal_error(getName() + " already contains scalar field " + name); |
---|
54 | } |
---|
55 | const auto index = mKernelFields.size(); |
---|
56 | mKernelMap.emplace(name, index); |
---|
57 | mKernelFields.push_back(type); |
---|
58 | return index; |
---|
59 | } |
---|
60 | |
---|
61 | |
---|
62 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
63 | * @brief addUnnamedScalar |
---|
64 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
65 | unsigned Kernel::addUnnamedScalar(Type * const type) { |
---|
66 | if (LLVM_UNLIKELY(mKernelStateType != nullptr)) { |
---|
67 | report_fatal_error("Cannot add unnamed field to " + getName() + " after kernel state finalized"); |
---|
68 | } |
---|
69 | const auto index = mKernelFields.size(); |
---|
70 | mKernelFields.push_back(type); |
---|
71 | return index; |
---|
72 | } |
---|
73 | |
---|
74 | |
---|
75 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
76 | * @brief prepareStreamSetNameMap |
---|
77 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
78 | void Kernel::prepareStreamSetNameMap() { |
---|
79 | for (unsigned i = 0; i < mStreamSetInputs.size(); i++) { |
---|
80 | mStreamMap.emplace(mStreamSetInputs[i].getName(), std::make_pair(Port::Input, i)); |
---|
81 | } |
---|
82 | for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) { |
---|
83 | mStreamMap.emplace(mStreamSetOutputs[i].getName(), std::make_pair(Port::Output, i)); |
---|
84 | } |
---|
85 | } |
---|
86 | |
---|
87 | |
---|
88 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
89 | * @brief bindPorts |
---|
90 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
91 | void Kernel::bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs) { |
---|
92 | assert (mModule == nullptr); |
---|
93 | assert (mStreamSetInputBuffers.empty()); |
---|
94 | assert (mStreamSetOutputBuffers.empty()); |
---|
95 | |
---|
96 | if (LLVM_UNLIKELY(mStreamSetInputs.size() != inputs.size())) { |
---|
97 | report_fatal_error(getName() + ": expected " + std::to_string(mStreamSetInputs.size()) + |
---|
98 | " input stream sets but was given " |
---|
99 | + std::to_string(inputs.size())); |
---|
100 | } |
---|
101 | |
---|
102 | for (unsigned i = 0; i < inputs.size(); ++i) { |
---|
103 | StreamSetBuffer * const buf = inputs[i]; |
---|
104 | if (LLVM_UNLIKELY(buf == nullptr)) { |
---|
105 | report_fatal_error(getName() + ": input stream set " + std::to_string(i) |
---|
106 | + " cannot be null"); |
---|
107 | } |
---|
108 | buf->addConsumer(this); |
---|
109 | } |
---|
110 | |
---|
111 | if (LLVM_UNLIKELY(mStreamSetOutputs.size() != outputs.size())) { |
---|
112 | report_fatal_error(getName() + ": expected " + std::to_string(mStreamSetOutputs.size()) |
---|
113 | + " output stream sets but was given " |
---|
114 | + std::to_string(outputs.size())); |
---|
115 | } |
---|
116 | |
---|
117 | for (unsigned i = 0; i < outputs.size(); ++i) { |
---|
118 | StreamSetBuffer * const buf = outputs[i]; |
---|
119 | if (LLVM_UNLIKELY(buf == nullptr)) { |
---|
120 | report_fatal_error(getName() + ": output stream set " + std::to_string(i) + " cannot be null"); |
---|
121 | } |
---|
122 | if (LLVM_LIKELY(buf->getProducer() == nullptr)) { |
---|
123 | buf->setProducer(this); |
---|
124 | } else { |
---|
125 | report_fatal_error(getName() + ": output stream set " + std::to_string(i) |
---|
126 | + " is already produced by kernel " + buf->getProducer()->getName()); |
---|
127 | } |
---|
128 | } |
---|
129 | |
---|
130 | mStreamSetInputBuffers.assign(inputs.begin(), inputs.end()); |
---|
131 | mStreamSetOutputBuffers.assign(outputs.begin(), outputs.end()); |
---|
132 | } |
---|
133 | |
---|
134 | |
---|
135 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
136 | * @brief getCacheName |
---|
137 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
138 | std::string Kernel::getCacheName(const std::unique_ptr<KernelBuilder> & idb) const { |
---|
139 | std::stringstream cacheName; |
---|
140 | cacheName << getName() << '_' << idb->getBuilderUniqueName(); |
---|
141 | for (const StreamSetBuffer * b: mStreamSetInputBuffers) { |
---|
142 | cacheName << ':' << b->getUniqueID(); |
---|
143 | } |
---|
144 | for (const StreamSetBuffer * b: mStreamSetOutputBuffers) { |
---|
145 | cacheName << ':' << b->getUniqueID(); |
---|
146 | } |
---|
147 | return cacheName.str(); |
---|
148 | } |
---|
149 | |
---|
150 | |
---|
151 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
152 | * @brief setModule |
---|
153 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
154 | Module * Kernel::setModule(Module * const module) { |
---|
155 | assert (mModule == nullptr || mModule == module); |
---|
156 | assert (module != nullptr); |
---|
157 | mModule = module; |
---|
158 | return mModule; |
---|
159 | } |
---|
160 | |
---|
161 | |
---|
162 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
163 | * @brief makeModule |
---|
164 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
165 | Module * Kernel::makeModule(const std::unique_ptr<kernel::KernelBuilder> & idb) { |
---|
166 | return setModule(new Module(getCacheName(idb), idb->getContext())); |
---|
167 | } |
---|
168 | |
---|
169 | |
---|
170 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
171 | * @brief prepareKernel |
---|
172 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
173 | void Kernel::prepareKernel(const std::unique_ptr<KernelBuilder> & idb) { |
---|
174 | assert ("KernelBuilder does not have a valid IDISA Builder" && idb); |
---|
175 | if (LLVM_UNLIKELY(mKernelStateType != nullptr)) { |
---|
176 | report_fatal_error(getName() + ": cannot prepare kernel after kernel state finalized"); |
---|
177 | } |
---|
178 | addBaseKernelProperties(idb); |
---|
179 | addInternalKernelProperties(idb); |
---|
180 | // NOTE: StructType::create always creates a new type even if an identical one exists. |
---|
181 | if (LLVM_UNLIKELY(mModule == nullptr)) { |
---|
182 | setModule(new Module(getCacheName(idb), idb->getContext())); |
---|
183 | } |
---|
184 | mKernelStateType = mModule->getTypeByName(getName()); |
---|
185 | if (LLVM_LIKELY(mKernelStateType == nullptr)) { |
---|
186 | mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName()); |
---|
187 | assert (mKernelStateType); |
---|
188 | } |
---|
189 | } |
---|
190 | |
---|
191 | |
---|
192 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
193 | * @brief prepareCachedKernel |
---|
194 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
195 | void Kernel::prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb) { |
---|
196 | assert ("KernelBuilder does not have a valid IDISA Builder" && idb); |
---|
197 | if (LLVM_UNLIKELY(mKernelStateType != nullptr)) { |
---|
198 | report_fatal_error(getName() + ": cannot prepare kernel after kernel state finalized"); |
---|
199 | } |
---|
200 | assert (getModule()); |
---|
201 | addBaseKernelProperties(idb); |
---|
202 | mKernelStateType = getModule()->getTypeByName(getName()); |
---|
203 | if (LLVM_UNLIKELY(mKernelStateType == nullptr)) { |
---|
204 | report_fatal_error("Kernel definition for " + getName() + " could not be found in the cache object"); |
---|
205 | } |
---|
206 | } |
---|
207 | |
---|
208 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
209 | * @brief getItemsPerStride |
---|
210 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
211 | std::pair<unsigned, unsigned> Kernel::getStreamRate(const Port p, const unsigned i) const { |
---|
212 | const ProcessingRate & rate = (p == Port::Input) ? mStreamSetInputs[i].getRate() : mStreamSetOutputs[i].getRate(); |
---|
213 | unsigned min = 0, max = 0; |
---|
214 | if (rate.isFixed()) { |
---|
215 | min = max = rate.getRate(); |
---|
216 | } else if (rate.isBounded()) { |
---|
217 | min = rate.getLowerBound(); |
---|
218 | max = rate.getUpperBound(); |
---|
219 | } else if (rate.isUnknown()) { |
---|
220 | min = rate.getLowerBound(); |
---|
221 | max = 0; |
---|
222 | } else if (rate.isExactlyRelative()) { |
---|
223 | for (unsigned j = 0; j < mStreamSetInputs.size(); ++j) { |
---|
224 | if (mStreamSetInputs[j].getName() == rate.getReference()) { |
---|
225 | std::tie(min, max) = getStreamRate(Port::Input, j); |
---|
226 | min = (min * rate.getNumerator()) / rate.getDenominator(); |
---|
227 | assert (max == 0 || (max * rate.getNumerator()) % rate.getDenominator() == 0); |
---|
228 | max = (max * rate.getNumerator()) / rate.getDenominator(); |
---|
229 | return std::make_pair(min, max); |
---|
230 | } |
---|
231 | } |
---|
232 | for (unsigned j = 0; j < mStreamSetOutputs.size(); ++j) { |
---|
233 | if (mStreamSetOutputs[j].getName() == rate.getReference()) { |
---|
234 | assert (p == Port::Output); |
---|
235 | std::tie(min, max) = getStreamRate(Port::Output, j); |
---|
236 | min = (min * rate.getNumerator()) / rate.getDenominator(); |
---|
237 | assert (max == 0 || (max * rate.getNumerator()) % rate.getDenominator() == 0); |
---|
238 | max = (max * rate.getNumerator()) / rate.getDenominator(); |
---|
239 | return std::make_pair(min, max); |
---|
240 | } |
---|
241 | } |
---|
242 | llvm_unreachable("Reference rate must be associated with an input or output!"); |
---|
243 | } |
---|
244 | return std::make_pair(min, max); |
---|
245 | } |
---|
246 | |
---|
247 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
248 | * @brief addBaseKernelProperties |
---|
249 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
250 | void Kernel::addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb) { |
---|
251 | |
---|
252 | const unsigned inputSetCount = mStreamSetInputs.size(); |
---|
253 | const unsigned outputSetCount = mStreamSetOutputs.size(); |
---|
254 | |
---|
255 | assert (inputSetCount == mStreamSetInputBuffers.size()); |
---|
256 | assert (outputSetCount == mStreamSetOutputBuffers.size()); |
---|
257 | |
---|
258 | if (mStride == 0) { |
---|
259 | // Set the default kernel stride. |
---|
260 | mStride = idb->getBitBlockWidth(); |
---|
261 | } |
---|
262 | |
---|
263 | IntegerType * const sizeTy = idb->getSizeTy(); |
---|
264 | |
---|
265 | for (unsigned i = 0; i < inputSetCount; i++) { |
---|
266 | const Binding & b = mStreamSetInputs[i]; |
---|
267 | //const ProcessingRate & rate = b.getRate(); |
---|
268 | //if (rate.isBounded() || rate.isUnknown()) { |
---|
269 | addScalar(sizeTy, b.getName() + PROCESSED_ITEM_COUNT_SUFFIX); |
---|
270 | //} |
---|
271 | } |
---|
272 | |
---|
273 | for (unsigned i = 0; i < outputSetCount; i++) { |
---|
274 | const Binding & b = mStreamSetOutputs[i]; |
---|
275 | //const ProcessingRate & rate = b.getRate(); |
---|
276 | //if (rate.isBounded() || rate.isUnknown()) { |
---|
277 | addScalar(sizeTy, b.getName() + PRODUCED_ITEM_COUNT_SUFFIX); |
---|
278 | //} |
---|
279 | } |
---|
280 | |
---|
281 | for (unsigned i = 0; i < inputSetCount; i++) { |
---|
282 | mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetInputs[i].getName() + BUFFER_PTR_SUFFIX); |
---|
283 | } |
---|
284 | for (unsigned i = 0; i < outputSetCount; i++) { |
---|
285 | mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getStreamSetHandle()->getType(), mStreamSetOutputs[i].getName() + BUFFER_PTR_SUFFIX); |
---|
286 | } |
---|
287 | for (const auto & binding : mScalarInputs) { |
---|
288 | addScalar(binding.getType(), binding.getName()); |
---|
289 | } |
---|
290 | for (const auto & binding : mScalarOutputs) { |
---|
291 | addScalar(binding.getType(), binding.getName()); |
---|
292 | } |
---|
293 | if (mStreamMap.empty()) { |
---|
294 | prepareStreamSetNameMap(); |
---|
295 | } |
---|
296 | for (const auto & binding : mInternalScalars) { |
---|
297 | addScalar(binding.getType(), binding.getName()); |
---|
298 | } |
---|
299 | Type * const consumerSetTy = StructType::get(sizeTy, sizeTy->getPointerTo()->getPointerTo(), nullptr)->getPointerTo(); |
---|
300 | for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) { |
---|
301 | addScalar(consumerSetTy, mStreamSetOutputs[i].getName() + CONSUMER_SUFFIX); |
---|
302 | } |
---|
303 | addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR); |
---|
304 | addScalar(idb->getInt1Ty(), TERMINATION_SIGNAL); |
---|
305 | for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) { |
---|
306 | addScalar(sizeTy, mStreamSetOutputs[i].getName() + CONSUMED_ITEM_COUNT_SUFFIX); |
---|
307 | } |
---|
308 | // We compile in a 64-bit CPU cycle counter into every kernel. It will remain unused |
---|
309 | // in normal execution, but when codegen::EnableCycleCounter is specified, pipelines |
---|
310 | // will be able to add instrumentation to cached modules without recompilation. |
---|
311 | addScalar(idb->getInt64Ty(), CYCLECOUNT_SCALAR); |
---|
312 | |
---|
313 | } |
---|
314 | |
---|
315 | |
---|
316 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
317 | * @brief makeSignature |
---|
318 | * |
---|
319 | * Default kernel signature: generate the IR and emit as byte code. |
---|
320 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
321 | std::string Kernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> & idb) { |
---|
322 | assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get()); |
---|
323 | if (LLVM_UNLIKELY(hasSignature())) { |
---|
324 | generateKernel(idb); |
---|
325 | std::string signature; |
---|
326 | raw_string_ostream OS(signature); |
---|
327 | WriteBitcodeToFile(getModule(), OS); |
---|
328 | return signature; |
---|
329 | } else { |
---|
330 | return getModule()->getModuleIdentifier(); |
---|
331 | } |
---|
332 | } |
---|
333 | |
---|
334 | |
---|
335 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
336 | * @brief generateKernel |
---|
337 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
338 | void Kernel::generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb) { |
---|
339 | assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get()); |
---|
340 | // If the module id cannot uniquely identify this kernel, "generateKernelSignature()" will have already |
---|
341 | // generated the unoptimized IR. |
---|
342 | if (!mIsGenerated) { |
---|
343 | const auto m = idb->getModule(); |
---|
344 | const auto ip = idb->saveIP(); |
---|
345 | // const auto saveInstance = getInstance(); |
---|
346 | idb->setModule(mModule); |
---|
347 | addKernelDeclarations(idb); |
---|
348 | callGenerateInitializeMethod(idb); |
---|
349 | callGenerateDoSegmentMethod(idb); |
---|
350 | callGenerateFinalizeMethod(idb); |
---|
351 | // setInstance(saveInstance); |
---|
352 | idb->setModule(m); |
---|
353 | idb->restoreIP(ip); |
---|
354 | mIsGenerated = true; |
---|
355 | } |
---|
356 | } |
---|
357 | |
---|
358 | |
---|
359 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
360 | * @brief callGenerateInitializeMethod |
---|
361 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
362 | inline void Kernel::callGenerateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) { |
---|
363 | mCurrentMethod = getInitFunction(idb->getModule()); |
---|
364 | idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod)); |
---|
365 | Function::arg_iterator args = mCurrentMethod->arg_begin(); |
---|
366 | setInstance(&*(args++)); |
---|
367 | idb->CreateStore(ConstantAggregateZero::get(mKernelStateType), getInstance()); |
---|
368 | for (const auto & binding : mScalarInputs) { |
---|
369 | idb->setScalarField(binding.getName(), &*(args++)); |
---|
370 | } |
---|
371 | for (const auto & binding : mStreamSetOutputs) { |
---|
372 | idb->setConsumerLock(binding.getName(), &*(args++)); |
---|
373 | } |
---|
374 | generateInitializeMethod(idb); |
---|
375 | idb->CreateRetVoid(); |
---|
376 | } |
---|
377 | |
---|
378 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
379 | * @brief callGenerateDoSegmentMethod |
---|
380 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
381 | inline void Kernel::callGenerateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) { |
---|
382 | mCurrentMethod = getDoSegmentFunction(idb->getModule()); |
---|
383 | idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod)); |
---|
384 | auto args = mCurrentMethod->arg_begin(); |
---|
385 | setInstance(&*(args++)); |
---|
386 | mIsFinal = &*(args++); |
---|
387 | mAvailablePrincipleItemCount = nullptr; |
---|
388 | // if (mHasPrincipleItemCount) { |
---|
389 | // mAvailablePrincipleItemCount = &*(args++); |
---|
390 | // } |
---|
391 | const auto n = mStreamSetInputs.size(); |
---|
392 | mAvailableItemCount.resize(n, nullptr); |
---|
393 | for (unsigned i = 0; i < n; i++) { |
---|
394 | // const ProcessingRate & rate = mStreamSetInputs[i].getRate(); |
---|
395 | // Value * itemCount = nullptr; |
---|
396 | // if (rate.isFixed()) { |
---|
397 | // itemCount = mAvailablePrincipleItemCount; |
---|
398 | // if (rate.getRate() != 1) { |
---|
399 | // itemCount = idb->CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getRate())); |
---|
400 | // } |
---|
401 | // } else if (rate.isBounded() || rate.isUnknown()) { |
---|
402 | // itemCount = &*(args++); |
---|
403 | // } else if (rate.isRelative()) { |
---|
404 | // for (unsigned j = 0; j < i; ++j) { |
---|
405 | // if (mStreamSetInputs[j].getName() == rate.getReference()) { |
---|
406 | // itemCount = mAvailableItemCount[j]; |
---|
407 | // break; |
---|
408 | // } |
---|
409 | // } |
---|
410 | // if (LLVM_UNLIKELY(itemCount == nullptr)) { |
---|
411 | // report_fatal_error(mStreamSetInputs[i].getName() + " is declared before " + rate.getReference()); |
---|
412 | // } |
---|
413 | // if (rate.getNumerator() != 1) { |
---|
414 | // itemCount = idb->CreateMul(itemCount, ConstantInt::get(itemCount->getType(), rate.getNumerator())); |
---|
415 | // } |
---|
416 | // if (rate.getDenominator() != 1) { |
---|
417 | // itemCount = idb->CreateUDiv(itemCount, ConstantInt::get(itemCount->getType(), rate.getDenominator())); |
---|
418 | // } |
---|
419 | // } |
---|
420 | // assert (itemCount); |
---|
421 | // mAvailableItemCount[i] = itemCount; |
---|
422 | |
---|
423 | assert (args != mCurrentMethod->arg_end()); |
---|
424 | mAvailableItemCount[i] = &*(args++); |
---|
425 | } |
---|
426 | assert (args == mCurrentMethod->arg_end()); |
---|
427 | |
---|
428 | generateKernelMethod(idb); // must be overridden by the Kernel subtype |
---|
429 | mIsFinal = nullptr; |
---|
430 | mAvailableItemCount.clear(); |
---|
431 | idb->CreateRetVoid(); |
---|
432 | } |
---|
433 | |
---|
434 | |
---|
435 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
436 | * @brief callGenerateFinalizeMethod |
---|
437 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
438 | inline void Kernel::callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb) { |
---|
439 | mCurrentMethod = getTerminateFunction(idb->getModule()); |
---|
440 | idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod)); |
---|
441 | auto args = mCurrentMethod->arg_begin(); |
---|
442 | setInstance(&*(args++)); |
---|
443 | generateFinalizeMethod(idb); // may be overridden by the Kernel subtype |
---|
444 | const auto n = mScalarOutputs.size(); |
---|
445 | if (n == 0) { |
---|
446 | idb->CreateRetVoid(); |
---|
447 | } else { |
---|
448 | Value * outputs[n]; |
---|
449 | for (unsigned i = 0; i < n; ++i) { |
---|
450 | outputs[i] = idb->getScalarField(mScalarOutputs[i].getName()); |
---|
451 | } |
---|
452 | if (n == 1) { |
---|
453 | idb->CreateRet(outputs[0]); |
---|
454 | } else { |
---|
455 | idb->CreateAggregateRet(outputs, n); |
---|
456 | } |
---|
457 | } |
---|
458 | } |
---|
459 | |
---|
460 | |
---|
461 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
462 | * @brief getScalarIndex |
---|
463 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
464 | unsigned Kernel::getScalarIndex(const std::string & name) const { |
---|
465 | const auto f = mKernelMap.find(name); |
---|
466 | if (LLVM_UNLIKELY(f == mKernelMap.end())) { |
---|
467 | assert (false); |
---|
468 | report_fatal_error(getName() + " does not contain scalar: " + name); |
---|
469 | } |
---|
470 | return f->second; |
---|
471 | } |
---|
472 | |
---|
473 | |
---|
474 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
475 | * @brief createInstance |
---|
476 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
477 | Value * Kernel::createInstance(const std::unique_ptr<KernelBuilder> & idb) { |
---|
478 | assert ("KernelBuilder does not have a valid IDISA Builder" && idb); |
---|
479 | if (LLVM_UNLIKELY(mKernelStateType == nullptr)) { |
---|
480 | report_fatal_error("Cannot instantiate " + getName() + " before calling prepareKernel()"); |
---|
481 | } |
---|
482 | setInstance(idb->CreateCacheAlignedAlloca(mKernelStateType)); |
---|
483 | return getInstance(); |
---|
484 | } |
---|
485 | |
---|
486 | |
---|
487 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
488 | * @brief initializeInstance |
---|
489 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
490 | void Kernel::initializeInstance(const std::unique_ptr<KernelBuilder> & idb) { |
---|
491 | assert ("KernelBuilder does not have a valid IDISA Builder" && idb); |
---|
492 | if (LLVM_UNLIKELY(getInstance() == nullptr)) { |
---|
493 | report_fatal_error("Cannot initialize " + getName() + " before calling createInstance()"); |
---|
494 | } |
---|
495 | std::vector<Value *> args; |
---|
496 | args.reserve(1 + mInitialArguments.size() + mStreamSetInputBuffers.size() + (mStreamSetOutputBuffers.size() * 2)); |
---|
497 | args.push_back(getInstance()); |
---|
498 | for (unsigned i = 0; i < mInitialArguments.size(); ++i) { |
---|
499 | Value * arg = mInitialArguments[i]; |
---|
500 | if (LLVM_UNLIKELY(arg == nullptr)) { |
---|
501 | report_fatal_error(getName() + ": initial argument " + std::to_string(i) |
---|
502 | + " cannot be null when calling createInstance()"); |
---|
503 | } |
---|
504 | args.push_back(arg); |
---|
505 | } |
---|
506 | for (unsigned i = 0; i < mStreamSetInputBuffers.size(); ++i) { |
---|
507 | assert (mStreamSetInputBuffers[i]); |
---|
508 | Value * arg = mStreamSetInputBuffers[i]->getStreamSetHandle(); |
---|
509 | if (LLVM_UNLIKELY(arg == nullptr)) { |
---|
510 | report_fatal_error(getName() + ": input stream set " + std::to_string(i) |
---|
511 | + " was not allocated prior to calling createInstance()"); |
---|
512 | } |
---|
513 | args.push_back(arg); |
---|
514 | } |
---|
515 | assert (mStreamSetInputs.size() == mStreamSetInputBuffers.size()); |
---|
516 | for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) { |
---|
517 | assert (mStreamSetOutputBuffers[i]); |
---|
518 | Value * arg = mStreamSetOutputBuffers[i]->getStreamSetHandle(); |
---|
519 | if (LLVM_UNLIKELY(arg == nullptr)) { |
---|
520 | report_fatal_error(getName() + ": output stream set " + std::to_string(i) |
---|
521 | + " was not allocated prior to calling createInstance()"); |
---|
522 | } |
---|
523 | args.push_back(arg); |
---|
524 | } |
---|
525 | assert (mStreamSetOutputs.size() == mStreamSetOutputBuffers.size()); |
---|
526 | IntegerType * const sizeTy = idb->getSizeTy(); |
---|
527 | PointerType * const sizePtrTy = sizeTy->getPointerTo(); |
---|
528 | PointerType * const sizePtrPtrTy = sizePtrTy->getPointerTo(); |
---|
529 | StructType * const consumerTy = StructType::get(sizeTy, sizePtrPtrTy, nullptr); |
---|
530 | for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) { |
---|
531 | const auto output = mStreamSetOutputBuffers[i]; |
---|
532 | const auto & consumers = output->getConsumers(); |
---|
533 | const auto n = consumers.size(); |
---|
534 | AllocaInst * const outputConsumers = idb->CreateAlloca(consumerTy); |
---|
535 | Value * const consumerSegNoArray = idb->CreateAlloca(ArrayType::get(sizePtrTy, n)); |
---|
536 | for (unsigned i = 0; i < n; ++i) { |
---|
537 | Kernel * const consumer = consumers[i]; |
---|
538 | assert ("all instances must be created prior to initialization of any instance" && consumer->getInstance()); |
---|
539 | idb->setKernel(consumer); |
---|
540 | Value * const segmentNoPtr = idb->getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR); |
---|
541 | idb->CreateStore(segmentNoPtr, idb->CreateGEP(consumerSegNoArray, { idb->getInt32(0), idb->getInt32(i) })); |
---|
542 | } |
---|
543 | idb->setKernel(this); |
---|
544 | Value * const consumerCountPtr = idb->CreateGEP(outputConsumers, {idb->getInt32(0), idb->getInt32(0)}); |
---|
545 | idb->CreateStore(idb->getSize(n), consumerCountPtr); |
---|
546 | Value * const consumerSegNoArrayPtr = idb->CreateGEP(outputConsumers, {idb->getInt32(0), idb->getInt32(1)}); |
---|
547 | idb->CreateStore(idb->CreatePointerCast(consumerSegNoArray, sizePtrPtrTy), consumerSegNoArrayPtr); |
---|
548 | args.push_back(outputConsumers); |
---|
549 | } |
---|
550 | idb->CreateCall(getInitFunction(idb->getModule()), args); |
---|
551 | } |
---|
552 | |
---|
553 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
554 | * @brief finalizeInstance |
---|
555 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
556 | void Kernel::finalizeInstance(const std::unique_ptr<KernelBuilder> & idb) { |
---|
557 | assert ("KernelBuilder does not have a valid IDISA Builder" && idb); |
---|
558 | mOutputScalarResult = idb->CreateCall(getTerminateFunction(idb->getModule()), { getInstance() }); |
---|
559 | } |
---|
560 | |
---|
561 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
562 | * @brief getStreamPort |
---|
563 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
564 | Kernel::StreamPort Kernel::getStreamPort(const std::string & name) const { |
---|
565 | const auto f = mStreamMap.find(name); |
---|
566 | if (LLVM_UNLIKELY(f == mStreamMap.end())) { |
---|
567 | report_fatal_error(getName() + " does not contain stream set " + name); |
---|
568 | } |
---|
569 | return f->second; |
---|
570 | } |
---|
571 | |
---|
572 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
573 | * @brief generateKernelMethod |
---|
574 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
575 | void SegmentOrientedKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) { |
---|
576 | |
---|
577 | Constant * const log2BlockWidth = b->getSize(std::log2(b->getBitBlockWidth())); |
---|
578 | |
---|
579 | const auto inputSetCount = mStreamSetInputs.size(); |
---|
580 | mStreamSetInputBufferPtr.resize(inputSetCount); |
---|
581 | for (unsigned i = 0; i < inputSetCount; ++i) { |
---|
582 | const auto & name = mStreamSetInputs[i].getName(); |
---|
583 | Value * ic = b->getProcessedItemCount(name); |
---|
584 | Value * const blockIndex = b->CreateLShr(ic, log2BlockWidth); |
---|
585 | mStreamSetInputBufferPtr[i] = b->getInputStreamPtr(name, blockIndex); |
---|
586 | } |
---|
587 | |
---|
588 | const auto outputSetCount = mStreamSetOutputs.size(); |
---|
589 | mStreamSetOutputBufferPtr.resize(outputSetCount); |
---|
590 | for (unsigned i = 0; i < outputSetCount; ++i) { |
---|
591 | const auto & name = mStreamSetOutputs[i].getName(); |
---|
592 | Value * ic = b->getProducedItemCount(name); |
---|
593 | Value * const blockIndex = b->CreateLShr(ic, log2BlockWidth); |
---|
594 | mStreamSetOutputBufferPtr[i] = b->getOutputStreamPtr(name, blockIndex); |
---|
595 | } |
---|
596 | |
---|
597 | generateDoSegmentMethod(b); |
---|
598 | |
---|
599 | } |
---|
600 | |
---|
601 | /** ------------------------------------------------------------------------------------------------------------- * |
---|
602 | * @brief generateKernelMethod |
---|
603 | ** ------------------------------------------------------------------------------------------------------------- */ |
---|
604 | void MultiBlockKernel::generateKernelMethod(const std::unique_ptr<KernelBuilder> & kb) { |
---|
605 | |
---|
606 | const auto inputSetCount = mStreamSetInputs.size(); |
---|
607 | const auto outputSetCount = mStreamSetOutputs.size(); |
---|
608 | const auto totalSetCount = inputSetCount + outputSetCount; |
---|
609 | |
---|
610 | // Scan through and see if any of our input streams is marked as the principle |
---|
611 | |
---|
612 | bool hasPrinciple = false; |
---|
613 | unsigned principleInput = 0; |
---|
614 | |
---|
615 | for (unsigned i = 0; i < inputSetCount; i++) { |
---|
616 | for (const auto attr : mStreamSetInputs[i].getAttributes()) { |
---|
617 | if (attr.isPrinciple()) { |
---|
618 | hasPrinciple = true; |
---|
619 | principleInput = i; |
---|
620 | break; |
---|
621 | } |
---|
622 | } |
---|
623 | } |
---|
624 | |
---|
625 | // Now we iteratively process these blocks using the doMultiBlock method. |
---|
626 | // In each iteration, we check how many linearly accessible / writable |
---|
627 | // items can be processed with our current input / output buffers. If we |
---|
628 | // cannot support an full stride, we check whether (a) there is enough |
---|
629 | // input data to process but it is not linearly accessible, in which case |
---|
630 | // we move the data into temporary buffers or (b) there is not enough data |
---|
631 | // to process, in which case we abort unless IsFinal was set. |
---|
632 | |
---|
633 | // Now proceed with creation of the doSegment method. |
---|
634 | BasicBlock * const doSegmentLoop = kb->CreateBasicBlock("DoSegmentLoop"); |
---|
635 | kb->CreateBr(doSegmentLoop); |
---|
636 | |
---|
637 | /// DO SEGMENT LOOP |
---|
638 | |
---|
639 | kb->SetInsertPoint(doSegmentLoop); |
---|
640 | |
---|
641 | // For each input buffer, determine the processedItemCount, the block pointer for the |
---|
642 | // buffer block containing the next item, and the number of linearly available items. |
---|
643 | |
---|
644 | Value * processedItemCount[inputSetCount]; |
---|
645 | Value * baseInputBuffer[inputSetCount]; |
---|
646 | Value * unprocessed[inputSetCount]; |
---|
647 | Value * linearlyAvailable[inputSetCount]; |
---|
648 | Value * readableStrides[inputSetCount]; |
---|
649 | |
---|
650 | Constant * const log2BlockWidth = kb->getSize(std::log2(kb->getBitBlockWidth())); |
---|
651 | |
---|
652 | Value * numOfStrides = nullptr; |
---|
653 | |
---|
654 | for (unsigned i = 0; i < inputSetCount; i++) { |
---|
655 | const auto name = mStreamSetInputs[i].getName(); |
---|
656 | const ProcessingRate & rate = mStreamSetInputs[i].getRate(); |
---|
657 | |
---|
658 | processedItemCount[i] = kb->getProcessedItemCount(name); |
---|
659 | |
---|
660 | assert (processedItemCount[i]->getType() == mAvailableItemCount[i]->getType()); |
---|
661 | |
---|
662 | Value * const blockIndex = kb->CreateLShr(processedItemCount[i], log2BlockWidth); |
---|
663 | baseInputBuffer[i] = kb->getInputStreamPtr(name, blockIndex); |
---|
664 | |
---|
665 | if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) { |
---|
666 | kb->CreateAssert(kb->CreateICmpUGE(mAvailableItemCount[i], processedItemCount[i]), |
---|
667 | "Processed item count cannot exceed the available item count"); |
---|
668 | } |
---|
669 | |
---|
670 | unprocessed[i] = kb->CreateSub(mAvailableItemCount[i], processedItemCount[i]); |
---|
671 | |
---|
672 | //kb->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed[i]); |
---|
673 | |
---|
674 | // INVESTIGATE: If the input rate of this stream is constant and known a priori, we could |
---|
675 | // avoid checking whether it is linearly accessible. Should we have an attribute for this? |
---|
676 | |
---|
677 | linearlyAvailable[i] = kb->getLinearlyAccessibleItems(name, processedItemCount[i], unprocessed[i]); |
---|
678 | |
---|
679 | //kb->CallPrintInt(getName() + "_" + name + "_linearlyAvailable", linearlyAvailable[i]); |
---|
680 | |
---|
681 | readableStrides[i] = nullptr; |
---|
682 | |
---|
683 | if (rate.isFixed() || rate.isBounded()) { |
---|
684 | Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride); |
---|
685 | readableStrides[i] = kb->CreateUDiv(linearlyAvailable[i], maxStrideSize); |
---|
686 | if (numOfStrides) { |
---|
687 | numOfStrides = kb->CreateUMin(numOfStrides, readableStrides[i]); |
---|
688 | } else { |
---|
689 | numOfStrides = readableStrides[i]; |
---|
690 | } |
---|
691 | } |
---|
692 | } |
---|
693 | |
---|
694 | //kb->CallPrintInt(getName() + "_numOfStrides", numOfStrides); |
---|
695 | |
---|
696 | // Now determine the linearly writeable blocks, based on available blocks reduced |
---|
697 | // by limitations of output buffer space. |
---|
698 | |
---|
699 | Value * producedItemCount[outputSetCount]; |
---|
700 | Value * baseOutputBuffer[outputSetCount]; |
---|
701 | Value * writableStrides[outputSetCount]; |
---|
702 | Value * linearlyWritable[outputSetCount]; |
---|
703 | |
---|
704 | for (unsigned i = 0; i < outputSetCount; i++) { |
---|
705 | const auto & name = mStreamSetOutputs[i].getName(); |
---|
706 | const ProcessingRate & rate = mStreamSetOutputs[i].getRate(); |
---|
707 | producedItemCount[i] = kb->getProducedItemCount(name); |
---|
708 | |
---|
709 | //kb->CallPrintInt(getName() + "_" + name + "_producedItemCount", producedItemCount[i]); |
---|
710 | |
---|
711 | Value * const blockIndex = kb->CreateLShr(producedItemCount[i], log2BlockWidth); |
---|
712 | baseOutputBuffer[i] = kb->getOutputStreamPtr(name, blockIndex); |
---|
713 | linearlyWritable[i] = nullptr; |
---|
714 | writableStrides[i] = nullptr; |
---|
715 | if (rate.isFixed() || rate.isBounded()) { |
---|
716 | linearlyWritable[i] = kb->getLinearlyWritableItems(name, producedItemCount[i]); |
---|
717 | |
---|
718 | //kb->CallPrintInt(getName() + "_" + name + "_linearlyWritable", linearlyWritable[i]); |
---|
719 | |
---|
720 | Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride); |
---|
721 | writableStrides[i] = kb->CreateUDiv(linearlyWritable[i], maxStrideSize); |
---|
722 | if (numOfStrides) { |
---|
723 | numOfStrides = kb->CreateUMin(numOfStrides, writableStrides[i]); |
---|
724 | } else { |
---|
725 | numOfStrides = writableStrides[i]; |
---|
726 | } |
---|
727 | } |
---|
728 | } |
---|
729 | |
---|
730 | //kb->CallPrintInt(getName() + "_numOfStrides'", numOfStrides); |
---|
731 | |
---|
732 | for (unsigned i = 0; i < inputSetCount; i++) { |
---|
733 | const ProcessingRate & rate = mStreamSetInputs[i].getRate(); |
---|
734 | if (rate.isFixed()) { |
---|
735 | mAvailableItemCount[i] = kb->CreateMul(numOfStrides, kb->getSize(rate.getRate() * mStride)); |
---|
736 | } else { |
---|
737 | mAvailableItemCount[i] = linearlyAvailable[i]; |
---|
738 | } |
---|
739 | |
---|
740 | //kb->CallPrintInt(getName() + "_" + mStreamSetInputs[i].getName() + "_avail", mAvailableItemCount[i]); |
---|
741 | } |
---|
742 | |
---|
743 | // Define and allocate the temporary buffer area. |
---|
744 | Type * tempBuffers[totalSetCount]; |
---|
745 | for (unsigned i = 0; i < inputSetCount; ++i) { |
---|
746 | Type * bufType = baseInputBuffer[i]->getType()->getPointerElementType(); |
---|
747 | assert (baseInputBuffer[i]->getType()->getPointerAddressSpace() == 0); |
---|
748 | const ProcessingRate & rate = mStreamSetInputs[i].getRate(); |
---|
749 | unsigned count = 0; |
---|
750 | if (rate.isFixed()) { |
---|
751 | count = rate.getRate(); |
---|
752 | } else if (rate.isBounded()) { |
---|
753 | count = rate.getUpperBound() + 2; |
---|
754 | } |
---|
755 | tempBuffers[i] = ArrayType::get(bufType, count); |
---|
756 | } |
---|
757 | for (unsigned i = 0; i < outputSetCount; i++) { |
---|
758 | Type * const bufType = baseOutputBuffer[i]->getType()->getPointerElementType(); |
---|
759 | assert (baseOutputBuffer[i]->getType()->getPointerAddressSpace() == 0); |
---|
760 | const ProcessingRate & rate = mStreamSetOutputs[i].getRate(); |
---|
761 | unsigned count = 0; |
---|
762 | if (rate.isFixed()) { |
---|
763 | count = rate.getRate(); |
---|
764 | } else if (rate.isBounded()) { |
---|
765 | count = rate.getUpperBound() + 2; |
---|
766 | } |
---|
767 | tempBuffers[i + inputSetCount] = ArrayType::get(bufType, count); |
---|
768 | } |
---|
769 | |
---|
770 | Type * const tempParameterStructType = StructType::create(kb->getContext(), ArrayRef<Type *>(tempBuffers, totalSetCount)); |
---|
771 | |
---|
772 | Value * const tempBufferArea = kb->CreateCacheAlignedAlloca(tempParameterStructType); |
---|
773 | |
---|
774 | BasicBlock * const temporaryBufferCheck = kb->CreateBasicBlock("temporaryBufferCheck"); |
---|
775 | BasicBlock * const doMultiBlock = kb->CreateBasicBlock("doMultiBlock"); |
---|
776 | BasicBlock * const copyToTemporaryBuffers = kb->CreateBasicBlock("copyToTemporaryBuffers"); |
---|
777 | BasicBlock * const segmentDone = kb->CreateBasicBlock("segmentDone"); |
---|
778 | |
---|
779 | Value * const hasFullStride = numOfStrides ? kb->CreateICmpNE(numOfStrides, kb->getSize(0)) : kb->getTrue(); |
---|
780 | kb->CreateCondBr(hasFullStride, doMultiBlock, temporaryBufferCheck); |
---|
781 | |
---|
782 | // We use temporary buffers in 3 different cases that preclude full stride processing. |
---|
783 | |
---|
784 | // (a) One or more input buffers does not have a sufficient number of input items linearly available. |
---|
785 | // (b) One or more output buffers does not have sufficient linearly available buffer space. |
---|
786 | // (c) We have processed all the full strides of input and only the final block remains. |
---|
787 | |
---|
788 | kb->SetInsertPoint(temporaryBufferCheck); |
---|
789 | |
---|
790 | // Even if we copy the input data into a linear arrays, is there enough data to perform this stride? |
---|
791 | // If not, proceed only if this is our final block. |
---|
792 | Value * hasFullFragmentedStride = nullptr; |
---|
793 | for (unsigned i = 0; i < inputSetCount; i++) { |
---|
794 | const ProcessingRate & r = mStreamSetInputs[i].getRate(); |
---|
795 | if (r.isBounded() || (r.isUnknown() && r.getLowerBound() > 0)) { |
---|
796 | const auto l = r.isBounded() ? r.getUpperBound() : r.getLowerBound(); |
---|
797 | Constant * const strideSize = kb->getSize(l * mStride); |
---|
798 | Value * enoughAvail = kb->CreateICmpUGE(unprocessed[i], strideSize); |
---|
799 | if (hasFullFragmentedStride) { |
---|
800 | hasFullFragmentedStride = kb->CreateAnd(hasFullFragmentedStride, enoughAvail); |
---|
801 | } else { |
---|
802 | hasFullFragmentedStride = enoughAvail; |
---|
803 | } |
---|
804 | } |
---|
805 | } |
---|
806 | |
---|
807 | Value * hasFragmentedOrFinalStride = nullptr; |
---|
808 | if (hasFullFragmentedStride) { |
---|
809 | hasFragmentedOrFinalStride = kb->CreateOr(hasFullFragmentedStride, mIsFinal); |
---|
810 | // Although this might be the final segment, we may have a full fragmented stride to process prior |
---|
811 | // to the actual final stride. |
---|
812 | mIsFinal = kb->CreateAnd(mIsFinal, kb->CreateNot(hasFullFragmentedStride)); |
---|
813 | } else { |
---|
814 | hasFragmentedOrFinalStride = mIsFinal; |
---|
815 | } |
---|
816 | kb->CreateCondBr(hasFragmentedOrFinalStride, copyToTemporaryBuffers, segmentDone); |
---|
817 | |
---|
818 | /// COPY TO TEMPORARY BUFFERS |
---|
819 | kb->SetInsertPoint(copyToTemporaryBuffers); |
---|
820 | |
---|
821 | kb->CreateAlignedStore(Constant::getNullValue(tempParameterStructType), tempBufferArea, kb->getCacheAlignment()); |
---|
822 | |
---|
823 | // For each input and output buffer, copy over necessary data starting from the last block boundary. |
---|
824 | |
---|
825 | Value * temporaryInputBuffer[inputSetCount]; |
---|
826 | Value * temporaryAvailable[inputSetCount]; |
---|
827 | |
---|
828 | for (unsigned i = 0; i < inputSetCount; i++) { |
---|
829 | temporaryInputBuffer[i] = baseInputBuffer[i]; |
---|
830 | if (readableStrides[i]) { |
---|
831 | const auto name = mStreamSetInputs[i].getName(); |
---|
832 | const ProcessingRate & rate = mStreamSetInputs[i].getRate(); |
---|
833 | assert (rate.getUpperBound() > 0); |
---|
834 | Constant * const maxStrideSize = kb->getSize(rate.getUpperBound() * mStride); |
---|
835 | temporaryAvailable[i] = kb->CreateUMin(unprocessed[i], maxStrideSize); |
---|
836 | |
---|
837 | BasicBlock * entry = kb->GetInsertBlock(); |
---|
838 | BasicBlock * copy = kb->CreateBasicBlock(name + "Copy"); |
---|
839 | BasicBlock * resume = kb->CreateBasicBlock(name + "ResumeCopy"); |
---|
840 | Value * const test = kb->CreateOr(kb->CreateICmpNE(readableStrides[i], kb->getSize(0)), mIsFinal); |
---|
841 | kb->CreateCondBr(test, resume, copy); |
---|
842 | |
---|
843 | kb->SetInsertPoint(copy); |
---|
844 | Value * const tempBufferPtr = kb->CreateGEP(tempBufferArea, {kb->getInt32(0), kb->getInt32(i), kb->getInt32(0)}); |
---|
845 | assert (tempBufferPtr->getType() == baseInputBuffer[i]->getType()); |
---|
846 | Value * const neededItems = linearlyAvailable[i]; |
---|
847 | Value * const bytesCopied = kb->copy(name, tempBufferPtr, baseInputBuffer[i], neededItems); |
---|
848 | Value * const nextInputPtr = kb->getRawInputPointer(name, kb->getSize(0)); |
---|
849 | Value * const remaining = kb->CreateSub(temporaryAvailable[i], neededItems); |
---|
850 | Value * nextBufPtr = kb->CreatePointerCast(tempBufferPtr, kb->getInt8PtrTy()); |
---|
851 | nextBufPtr = kb->CreateGEP(nextBufPtr, bytesCopied); |
---|
852 | kb->copy(name, nextBufPtr, nextInputPtr, remaining); |
---|
853 | |
---|
854 | kb->CreateBr(resume); |
---|
855 | |
---|
856 | kb->SetInsertPoint(resume); |
---|
857 | PHINode * bufferPtr = kb->CreatePHI(baseInputBuffer[i]->getType(), 2); |
---|
858 | bufferPtr->addIncoming(baseInputBuffer[i], entry); |
---|
859 | bufferPtr->addIncoming(tempBufferPtr, copy); |
---|
860 | temporaryInputBuffer[i] = bufferPtr; |
---|
861 | } |
---|
862 | } |
---|
863 | |
---|
864 | Value * temporaryOutputBuffer[outputSetCount]; |
---|
865 | for (unsigned i = 0; i < outputSetCount; i++) { |
---|
866 | temporaryOutputBuffer[i] = baseOutputBuffer[i]; |
---|
867 | if (writableStrides[i]) { |
---|
868 | const auto name = mStreamSetOutputs[i].getName(); |
---|
869 | |
---|
870 | BasicBlock * const entry = kb->GetInsertBlock(); |
---|
871 | BasicBlock * const copy = kb->CreateBasicBlock(name + "Copy"); |
---|
872 | BasicBlock * const resume = kb->CreateBasicBlock(name + "ResumeCopy"); |
---|
873 | |
---|
874 | Value * const test = kb->CreateOr(kb->CreateICmpNE(writableStrides[i], kb->getSize(0)), mIsFinal); |
---|
875 | kb->CreateCondBr(test, resume, copy); |
---|
876 | |
---|
877 | kb->SetInsertPoint(copy); |
---|
878 | Value * const tempBufferPtr = kb->CreateGEP(tempBufferArea, {kb->getInt32(0), kb->getInt32(inputSetCount + i), kb->getInt32(0)}); |
---|
879 | assert (tempBufferPtr->getType() == baseOutputBuffer[i]->getType()); |
---|
880 | Value * const itemsToCopy = kb->CreateAnd(producedItemCount[i], kb->getSize(kb->getBitBlockWidth() - 1)); |
---|
881 | kb->copy(name, tempBufferPtr, baseOutputBuffer[i], itemsToCopy); |
---|
882 | kb->CreateBr(resume); |
---|
883 | |
---|
884 | kb->SetInsertPoint(resume); |
---|
885 | PHINode * bufferPtr = kb->CreatePHI(tempBufferPtr->getType(), 2); |
---|
886 | bufferPtr->addIncoming(baseOutputBuffer[i], entry); |
---|
887 | bufferPtr->addIncoming(tempBufferPtr, copy); |
---|
888 | temporaryOutputBuffer[i] = bufferPtr; |
---|
889 | } |
---|
890 | } |
---|
891 | |
---|
892 | kb->CreateBr(doMultiBlock); |
---|
893 | BasicBlock * const usingTemporaryBuffers = kb->GetInsertBlock(); |
---|
894 | doMultiBlock->moveAfter(usingTemporaryBuffers); |
---|
895 | |
---|
896 | /// DO MULTI BLOCK |
---|
897 | |
---|
898 | // At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets. |
---|
899 | // Now prepare the doMultiBlock call. |
---|
900 | kb->SetInsertPoint(doMultiBlock); |
---|
901 | |
---|
902 | PHINode * const isFinal = kb->CreatePHI(mIsFinal->getType(), 2); |
---|
903 | isFinal->addIncoming(kb->getFalse(), doSegmentLoop); |
---|
904 | isFinal->addIncoming(mIsFinal, usingTemporaryBuffers); |
---|
905 | mIsFinal = isFinal; |
---|
906 | |
---|
907 | mStreamSetInputBufferPtr.resize(inputSetCount); |
---|
908 | for (unsigned i = 0; i < inputSetCount; ++i) { |
---|
909 | assert (baseInputBuffer[i] && temporaryInputBuffer[i]); |
---|
910 | if (baseInputBuffer[i] != temporaryInputBuffer[i]) { |
---|
911 | PHINode * const avail = kb->CreatePHI(kb->getSizeTy(), 2); |
---|
912 | avail->addIncoming(mAvailableItemCount[i], doSegmentLoop); |
---|
913 | avail->addIncoming(temporaryAvailable[i], usingTemporaryBuffers); |
---|
914 | mAvailableItemCount[i] = avail; |
---|
915 | PHINode * const bufferPtr = kb->CreatePHI(baseInputBuffer[i]->getType(), 2); |
---|
916 | bufferPtr->addIncoming(baseInputBuffer[i], doSegmentLoop); |
---|
917 | assert (baseInputBuffer[i]->getType() == temporaryInputBuffer[i]->getType()); |
---|
918 | bufferPtr->addIncoming(temporaryInputBuffer[i], usingTemporaryBuffers); |
---|
919 | temporaryInputBuffer[i] = bufferPtr; |
---|
920 | } |
---|
921 | mStreamSetInputBufferPtr[i] = temporaryInputBuffer[i]; |
---|
922 | } |
---|
923 | |
---|
924 | mStreamSetOutputBufferPtr.resize(outputSetCount); |
---|
925 | for (unsigned i = 0; i < outputSetCount; ++i) { |
---|
926 | assert (baseOutputBuffer[i] && temporaryOutputBuffer[i]); |
---|
927 | if (baseOutputBuffer[i] != temporaryOutputBuffer[i]) { |
---|
928 | PHINode * const bufferPtr = kb->CreatePHI(baseOutputBuffer[i]->getType(), 2); |
---|
929 | bufferPtr->addIncoming(baseOutputBuffer[i], doSegmentLoop); |
---|
930 | assert (baseOutputBuffer[i]->getType() == temporaryOutputBuffer[i]->getType()); |
---|
931 | bufferPtr->addIncoming(temporaryOutputBuffer[i], usingTemporaryBuffers); |
---|
932 | temporaryOutputBuffer[i] = bufferPtr; |
---|
933 | } |
---|
934 | mStreamSetOutputBufferPtr[i] = temporaryOutputBuffer[i]; |
---|
935 | } |
---|
936 | |
---|
937 | // Now use the generateMultiBlockLogic method of the MultiBlockKernelBuilder subtype to |
---|
938 | // provide the required multi-block kernel logic. |
---|
939 | generateMultiBlockLogic(kb, numOfStrides); |
---|
940 | |
---|
941 | // If we have no fixed rate inputs, we won't know when we're done parsing until we test |
---|
942 | // whether any input data was processed. |
---|
943 | bool mayMakeNoProgress = true; |
---|
944 | |
---|
945 | // Update the processed item count of any Fixed input or output stream. While doing so, also |
---|
946 | // calculate the LCM of their rates. The LCM is used to calculate the final item counts. |
---|
947 | |
---|
948 | unsigned rateLCM = 1; |
---|
949 | |
---|
950 | for (unsigned i = 0; i < inputSetCount; ++i) { |
---|
951 | const ProcessingRate & rate = mStreamSetInputs[i].getRate(); |
---|
952 | if (rate.isFixed()) { |
---|
953 | mayMakeNoProgress = false; |
---|
954 | rateLCM = lcm(rateLCM, rate.getRate()); |
---|
955 | Value * const processed = mAvailableItemCount[i]; // kb->CreateMul(numOfStrides, kb->getSize(mStride * rate.getRate())); |
---|
956 | Value * const ic = kb->CreateAdd(processedItemCount[i], processed); |
---|
957 | kb->setProcessedItemCount(mStreamSetInputs[i].getName(), ic); |
---|
958 | } |
---|
959 | } |
---|
960 | |
---|
961 | for (unsigned i = 0; i < outputSetCount; ++i) { |
---|
962 | const ProcessingRate & rate = mStreamSetOutputs[i].getRate(); |
---|
963 | if (rate.isFixed()) { |
---|
964 | rateLCM = lcm(rateLCM, rate.getRate()); |
---|
965 | Value * const produced = kb->CreateMul(numOfStrides, kb->getSize(mStride * rate.getRate())); |
---|
966 | Value * const ic = kb->CreateAdd(producedItemCount[i], produced); |
---|
967 | kb->setProducedItemCount(mStreamSetOutputs[i].getName(), ic); |
---|
968 | } |
---|
969 | } |
---|
970 | |
---|
971 | BasicBlock * const finalStrideCheck = kb->CreateBasicBlock("finalStrideCheck"); |
---|
972 | BasicBlock * const finalStrideAdjustment = kb->CreateBasicBlock("finalStrideAdjustment"); |
---|
973 | BasicBlock * const standardCopyBack = kb->CreateBasicBlock("standardCopyBack"); |
---|
974 | BasicBlock * const temporaryBufferCopyBack = kb->CreateBasicBlock("temporaryBufferCopyBack"); |
---|
975 | |
---|
976 | kb->CreateLikelyCondBr(hasFullStride, standardCopyBack, finalStrideCheck); |
---|
977 | |
---|
978 | |
---|
979 | /// FINAL STRIDE CHECK |
---|
980 | kb->SetInsertPoint(finalStrideCheck); |
---|
981 | kb->CreateUnlikelyCondBr(mIsFinal, finalStrideAdjustment, temporaryBufferCopyBack); |
---|
982 | |
---|
983 | /// FINAL STRIDE ADJUSTMENT |
---|
984 | kb->SetInsertPoint(finalStrideAdjustment); |
---|
985 | |
---|
986 | // If this is our final stride, adjust the Fixed output item counts. The main loop assumes that |
---|
987 | // the ITEM COUNT % FIXED RATE = 0 for all Fixed Input and Output streams. We correct that here |
---|
988 | // to calculate them based on the actual input item counts. |
---|
989 | |
---|
990 | // NOTE: This appears overly complex to avoid an integer overflow without reducing the maximum |
---|
991 | // integer size. For each Fixed output stream, this calculates: |
---|
992 | |
---|
993 | // CEILING(MIN(Total Available Item Count / Fixed Input Rate) * Fixed Output Rate) |
---|
994 | |
---|
995 | Value * basePreviouslyProcessedItemCount = nullptr; |
---|
996 | Value * scaledInverseOfStrideItemCount = nullptr; |
---|
997 | |
---|
998 | for (unsigned i = 0; i < inputSetCount; ++i) { |
---|
999 | const ProcessingRate & r = mStreamSetInputs[i].getRate(); |
---|
1000 | if (r.isFixed()) { |
---|
1001 | assert (rateLCM % r.getRate() == 0); |
---|
1002 | Value * const a = kb->CreateMul(mAvailableItemCount[i], kb->getSize(rateLCM / r.getRate())); // unprocessed |
---|
1003 | Value * const p = kb->CreateUDiv(processedItemCount[i], kb->getSize(r.getRate())); |
---|
1004 | if (scaledInverseOfStrideItemCount) { |
---|
1005 | scaledInverseOfStrideItemCount = kb->CreateUMin(scaledInverseOfStrideItemCount, a); |
---|
1006 | basePreviouslyProcessedItemCount = kb->CreateUMin(basePreviouslyProcessedItemCount, p); |
---|
1007 | } else { |
---|
1008 | scaledInverseOfStrideItemCount = a; |
---|
1009 | basePreviouslyProcessedItemCount = p; |
---|
1010 | } |
---|
1011 | } |
---|
1012 | // const auto name = mStreamSetInputs[i].getName(); |
---|
1013 | // Value * const processed = kb->CreateAdd(processedItemCount[i], unprocessed[i]); |
---|
1014 | // kb->setProcessedItemCount(name, processed); |
---|
1015 | } |
---|
1016 | |
---|
1017 | for (unsigned i = 0; i < outputSetCount; ++i) { |
---|
1018 | const auto name = mStreamSetOutputs[i].getName(); |
---|
1019 | const ProcessingRate & r = mStreamSetOutputs[i].getRate(); |
---|
1020 | Value * produced = nullptr; |
---|
1021 | if (r.isFixed()) { |
---|
1022 | assert (rateLCM % r.getRate() == 0); |
---|
1023 | assert (basePreviouslyProcessedItemCount && scaledInverseOfStrideItemCount); |
---|
1024 | Value * const p = kb->CreateMul(basePreviouslyProcessedItemCount, kb->getSize(r.getRate())); |
---|
1025 | Value * const ic = kb->CreateUDivCeil(scaledInverseOfStrideItemCount, kb->getSize(rateLCM / r.getRate())); |
---|
1026 | produced = kb->CreateAdd(p, ic); |
---|
1027 | } else { // check if we have an attribute; if so, get the current produced count and adjust it |
---|
1028 | bool noAttributes = true; |
---|
1029 | for (const Attribute & attr : mStreamSetOutputs[i].getAttributes()) { |
---|
1030 | if (attr.isAdd() || attr.isRoundUpTo()) { |
---|
1031 | noAttributes = false; |
---|
1032 | break; |
---|
1033 | } |
---|
1034 | } |
---|
1035 | if (noAttributes) { |
---|
1036 | continue; |
---|
1037 | } |
---|
1038 | produced = kb->getProducedItemCount(name); |
---|
1039 | } |
---|
1040 | for (const Attribute & attr : mStreamSetOutputs[i].getAttributes()) { |
---|
1041 | if (attr.isAdd()) { |
---|
1042 | produced = kb->CreateAdd(produced, kb->getSize(attr.getAmount())); |
---|
1043 | } else if (attr.isRoundUpTo()) { |
---|
1044 | produced = kb->CreateRoundUp(produced, kb->getSize(attr.getAmount())); |
---|
1045 | } |
---|
1046 | } |
---|
1047 | kb->setProducedItemCount(name, produced); |
---|
1048 | } |
---|
1049 | |
---|
1050 | kb->CreateBr(temporaryBufferCopyBack); |
---|
1051 | |
---|
1052 | /// TEMPORARY BUFFER COPY BACK |
---|
1053 | kb->SetInsertPoint(temporaryBufferCopyBack); |
---|
1054 | |
---|
1055 | // Copy back data to the actual output buffers. |
---|
1056 | for (unsigned i = 0; i < outputSetCount; i++) { |
---|
1057 | |
---|
1058 | if (baseOutputBuffer[i] != temporaryOutputBuffer[i]) { |
---|
1059 | |
---|
1060 | const auto name = mStreamSetOutputs[i].getName(); |
---|
1061 | |
---|
1062 | BasicBlock * const copy = kb->CreateBasicBlock(name + "CopyBack"); |
---|
1063 | BasicBlock * const resume = kb->CreateBasicBlock(name + "ResumeCopyBack"); |
---|
1064 | Value * const usedTemporary = kb->CreateICmpNE(temporaryOutputBuffer[i], baseOutputBuffer[i]); |
---|
1065 | |
---|
1066 | // If we used a temporary buffer ... |
---|
1067 | kb->CreateCondBr(usedTemporary, copy, resume); |
---|
1068 | |
---|
1069 | kb->SetInsertPoint(copy); |
---|
1070 | Value * bytesCopied = kb->copy(name, baseOutputBuffer[i], temporaryOutputBuffer[i], linearlyWritable[i]); |
---|
1071 | Value * nextOutputPtr = kb->getRawOutputPointer(name, kb->getSize(0)); |
---|
1072 | Value * producedCount = kb->getProducedItemCount(name); |
---|
1073 | |
---|
1074 | Value * remaining = kb->CreateSub(producedCount, linearlyWritable[i]); |
---|
1075 | Value * nextBufPtr = kb->CreatePointerCast(temporaryOutputBuffer[i], kb->getInt8PtrTy()); |
---|
1076 | nextBufPtr = kb->CreateGEP(nextBufPtr, bytesCopied); |
---|
1077 | |
---|
1078 | kb->copy(name, nextOutputPtr, nextBufPtr, remaining); |
---|
1079 | kb->CreateBr(resume); |
---|
1080 | |
---|
1081 | kb->SetInsertPoint(resume); |
---|
1082 | } |
---|
1083 | } |
---|
1084 | |
---|
1085 | // We've dealt with the partial block processing and copied information back into the |
---|
1086 | // actual buffers. If this isn't the final block, loop back for more multiblock processing. |
---|
1087 | BasicBlock * setTermination = nullptr; |
---|
1088 | if (hasNoTerminateAttribute()) { |
---|
1089 | kb->CreateCondBr(mIsFinal, segmentDone, standardCopyBack); |
---|
1090 | } else { |
---|
1091 | setTermination = kb->CreateBasicBlock("setTermination"); |
---|
1092 | kb->CreateCondBr(mIsFinal, setTermination, standardCopyBack); |
---|
1093 | } |
---|
1094 | |
---|
1095 | /// STANDARD COPY BACK |
---|
1096 | kb->SetInsertPoint(standardCopyBack); |
---|
1097 | |
---|
1098 | // Do copybacks if necessary. |
---|
1099 | for (unsigned i = 0; i < outputSetCount; i++) { |
---|
1100 | if (mStreamSetOutputBuffers[i]->supportsCopyBack()) { |
---|
1101 | const auto name = mStreamSetOutputs[i].getName(); |
---|
1102 | Value * newProduced = kb->getProducedItemCount(name); |
---|
1103 | kb->CreateCopyBack(name, producedItemCount[i], newProduced); |
---|
1104 | } |
---|
1105 | } |
---|
1106 | |
---|
1107 | // If it is possible to make no progress, verify we processed some of the input. If we haven't, |
---|
1108 | // we're finished this segment. |
---|
1109 | if (mayMakeNoProgress) { |
---|
1110 | Value * madeProgress = nullptr; |
---|
1111 | for (unsigned i = 0; i < inputSetCount; ++i) { |
---|
1112 | Value * const processed = kb->getProcessedItemCount(mStreamSetInputs[i].getName()); |
---|
1113 | Value * const progress = kb->CreateICmpNE(processed, processedItemCount[i]); |
---|
1114 | if (madeProgress) { |
---|
1115 | madeProgress = kb->CreateOr(madeProgress, progress); |
---|
1116 | } else { |
---|
1117 | madeProgress = progress; |
---|
1118 | } |
---|
1119 | } |
---|
1120 | assert (madeProgress); |
---|
1121 | kb->CreateCondBr(madeProgress, doSegmentLoop, segmentDone); |
---|
1122 | } else { |
---|
1123 | kb->CreateBr(doSegmentLoop); |
---|
1124 | } |
---|
1125 | |
---|
1126 | if (hasNoTerminateAttribute()) { |
---|
1127 | segmentDone->moveAfter(kb->GetInsertBlock()); |
---|
1128 | } else { |
---|
1129 | /// SET TERMINATION |
---|
1130 | setTermination->moveAfter(kb->GetInsertBlock()); |
---|
1131 | kb->SetInsertPoint(setTermination); |
---|
1132 | kb->setTerminationSignal(); |
---|
1133 | kb->CreateBr(segmentDone); |
---|
1134 | segmentDone->moveAfter(setTermination); |
---|
1135 | } |
---|
1136 | |
---|
1137 | kb->SetInsertPoint(segmentDone); |
---|
1138 | |
---|
1139 | } |
---|
1140 | |
---|
1141 | //bool MultiBlockKernel::requiresCopyBack(const ProcessingRate & rate) const { |
---|
1142 | // if (rate.isBounded() || rate.isUnknown()) { |
---|
1143 | // return true; |
---|
1144 | // } else if (rate.isDirectlyRelative()) { |
---|
1145 | // Port port; unsigned i; |
---|
1146 | // std::tie(port, i) = getStreamPort(rate.getReference()); |
---|
1147 | // const auto & binding = (port == Port::Input) ? mStreamSetInputs[i] : mStreamSetOutputs[i]; |
---|
1148 | // return requiresCopyBack(binding.getRate()); |
---|
1149 | // } |
---|
1150 | // return false; |
---|
1151 | //} |
---|
1152 | |
---|
1153 | // The default doSegment method dispatches to the doBlock routine for |
---|
1154 | // each block of the given number of blocksToDo, and then updates counts. |
---|
1155 | |
---|
1156 | void BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & idb, llvm::Value * const numOfStrides) { |
---|
1157 | |
---|
1158 | BasicBlock * const entryBlock = idb->GetInsertBlock(); |
---|
1159 | BasicBlock * const strideLoopCond = idb->CreateBasicBlock(getName() + "_strideLoopCond"); |
---|
1160 | mStrideLoopBody = idb->CreateBasicBlock(getName() + "_strideLoopBody"); |
---|
1161 | BasicBlock * const stridesDone = idb->CreateBasicBlock(getName() + "_stridesDone"); |
---|
1162 | BasicBlock * const doFinalBlock = idb->CreateBasicBlock(getName() + "_doFinalBlock"); |
---|
1163 | BasicBlock * const segmentDone = idb->CreateBasicBlock(getName() + "_segmentDone"); |
---|
1164 | |
---|
1165 | Value * baseTarget = nullptr; |
---|
1166 | if (idb->supportsIndirectBr()) { |
---|
1167 | baseTarget = idb->CreateSelect(mIsFinal, BlockAddress::get(doFinalBlock), BlockAddress::get(segmentDone)); |
---|
1168 | } |
---|
1169 | |
---|
1170 | Constant * const log2BlockSize = idb->getSize(std::log2(idb->getBitBlockWidth())); |
---|
1171 | |
---|
1172 | const auto inputSetCount = mStreamSetInputs.size(); |
---|
1173 | Value * baseProcessedIndex[inputSetCount]; |
---|
1174 | for (unsigned i = 0; i < inputSetCount; ++i) { |
---|
1175 | const ProcessingRate & rate = mStreamSetInputs[i].getRate(); |
---|
1176 | if (rate.isFixed()) { |
---|
1177 | baseProcessedIndex[i] = nullptr; |
---|
1178 | } else { |
---|
1179 | Value * ic = idb->getProcessedItemCount(mStreamSetInputs[i].getName()); |
---|
1180 | ic = idb->CreateLShr(ic, log2BlockSize); |
---|
1181 | baseProcessedIndex[i] = ic; |
---|
1182 | } |
---|
1183 | } |
---|
1184 | |
---|
1185 | const auto outputSetCount = mStreamSetOutputs.size(); |
---|
1186 | Value * baseProducedIndex[outputSetCount]; |
---|
1187 | for (unsigned i = 0; i < outputSetCount; ++i) { |
---|
1188 | const ProcessingRate & rate = mStreamSetOutputs[i].getRate(); |
---|
1189 | if (rate.isFixed()) { |
---|
1190 | baseProducedIndex[i] = nullptr; |
---|
1191 | } else { |
---|
1192 | Value * ic = idb->getProducedItemCount(mStreamSetOutputs[i].getName()); |
---|
1193 | ic = idb->CreateLShr(ic, log2BlockSize); |
---|
1194 | baseProducedIndex[i] = ic; |
---|
1195 | } |
---|
1196 | } |
---|
1197 | |
---|
1198 | Value * const numOfBlocksToProcess = idb->CreateMul(numOfStrides, idb->getSize(mStride / idb->getBitBlockWidth())); |
---|
1199 | |
---|
1200 | idb->CreateBr(strideLoopCond); |
---|
1201 | |
---|
1202 | /// BLOCK COND |
---|
1203 | |
---|
1204 | idb->SetInsertPoint(strideLoopCond); |
---|
1205 | |
---|
1206 | PHINode * branchTarget = nullptr; |
---|
1207 | if (baseTarget) { |
---|
1208 | branchTarget = idb->CreatePHI(baseTarget->getType(), 2, "branchTarget"); |
---|
1209 | branchTarget->addIncoming(baseTarget, entryBlock); |
---|
1210 | } |
---|
1211 | |
---|
1212 | PHINode * const blockIndex = idb->CreatePHI(idb->getSizeTy(), 2, "index"); |
---|
1213 | blockIndex->addIncoming(idb->getSize(0), entryBlock); |
---|
1214 | |
---|
1215 | for (unsigned i = 0; i < inputSetCount; ++i) { |
---|
1216 | Value * offset = blockIndex; |
---|
1217 | if (baseProcessedIndex[i]) { |
---|
1218 | offset = idb->getProcessedItemCount(mStreamSetInputs[i].getName()); |
---|
1219 | offset = idb->CreateLShr(offset, log2BlockSize); |
---|
1220 | offset = idb->CreateSub(offset, baseProcessedIndex[i]); |
---|
1221 | } |
---|
1222 | mStreamSetInputBufferPtr[i] = idb->CreateGEP(mStreamSetInputBufferPtr[i], offset); |
---|
1223 | } |
---|
1224 | |
---|
1225 | for (unsigned i = 0; i < outputSetCount; ++i) { |
---|
1226 | Value * offset = blockIndex; |
---|
1227 | if (baseProducedIndex[i]) { |
---|
1228 | offset = idb->getProducedItemCount(mStreamSetOutputs[i].getName()); |
---|
1229 | offset = idb->CreateLShr(offset, log2BlockSize); |
---|
1230 | offset = idb->CreateSub(offset, baseProducedIndex[i]); |
---|
1231 | } |
---|
1232 | mStreamSetOutputBufferPtr[i] = idb->CreateGEP(mStreamSetOutputBufferPtr[i], offset); |
---|
1233 | } |
---|
1234 | |
---|
1235 | Value * const notDone = idb->CreateICmpULT(blockIndex, numOfBlocksToProcess); |
---|
1236 | idb->CreateLikelyCondBr(notDone, mStrideLoopBody, stridesDone); |
---|
1237 | |
---|
1238 | /// BLOCK BODY |
---|
1239 | |
---|
1240 | idb->SetInsertPoint(mStrideLoopBody); |
---|
1241 | |
---|
1242 | if (idb->supportsIndirectBr()) { |
---|
1243 | mStrideLoopTarget = idb->CreatePHI(baseTarget->getType(), 2, "strideTarget"); |
---|
1244 | mStrideLoopTarget->addIncoming(branchTarget, strideLoopCond); |
---|
1245 | } |
---|
1246 | |
---|
1247 | /// GENERATE DO BLOCK METHOD |
---|
1248 | |
---|
1249 | writeDoBlockMethod(idb); |
---|
1250 | |
---|
1251 | BasicBlock * const bodyEnd = idb->GetInsertBlock(); |
---|
1252 | blockIndex->addIncoming(idb->CreateAdd(blockIndex, idb->getSize(1)), bodyEnd); |
---|
1253 | if (branchTarget) { |
---|
1254 | branchTarget->addIncoming(mStrideLoopTarget, bodyEnd); |
---|
1255 | } |
---|
1256 | idb->CreateBr(strideLoopCond); |
---|
1257 | |
---|
1258 | stridesDone->moveAfter(bodyEnd); |
---|
1259 | |
---|
1260 | /// STRIDE DONE |
---|
1261 | |
---|
1262 | idb->SetInsertPoint(stridesDone); |
---|
1263 | |
---|
1264 | // Now conditionally perform the final block processing depending on the doFinal parameter. |
---|
1265 | if (branchTarget) { |
---|
1266 | mStrideLoopBranch = idb->CreateIndirectBr(branchTarget, 3); |
---|
1267 | mStrideLoopBranch->addDestination(doFinalBlock); |
---|
1268 | mStrideLoopBranch->addDestination(segmentDone); |
---|
1269 | } else { |
---|
1270 | idb->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone); |
---|
1271 | } |
---|
1272 | |
---|
1273 | doFinalBlock->moveAfter(stridesDone); |
---|
1274 | |
---|
1275 | idb->SetInsertPoint(doFinalBlock); |
---|
1276 | |
---|
1277 | Value * remainingItems = nullptr; |
---|
1278 | for (unsigned i = 0; i < inputSetCount; ++i) { |
---|
1279 | const ProcessingRate & r = mStreamSetInputs[i].getRate(); |
---|
1280 | if (r.isFixed()) { |
---|
1281 | Value * ic = idb->CreateUDiv(mAvailableItemCount[i], idb->getSize(r.getRate())); |
---|
1282 | if (remainingItems) { |
---|
1283 | remainingItems = idb->CreateUMax(remainingItems, ic); |
---|
1284 | } else { |
---|
1285 | remainingItems = ic; |
---|
1286 | } |
---|
1287 | } |
---|
1288 | } |
---|
1289 | |
---|
1290 | writeFinalBlockMethod(idb, remainingItems); |
---|
1291 | |
---|
1292 | idb->CreateBr(segmentDone); |
---|
1293 | |
---|
1294 | segmentDone->moveAfter(idb->GetInsertBlock()); |
---|
1295 | |
---|
1296 | idb->SetInsertPoint(segmentDone); |
---|
1297 | |
---|
1298 | // Update the branch prediction metadata to indicate that the likely target will be segmentDone |
---|
1299 | if (branchTarget) { |
---|
1300 | MDBuilder mdb(idb->getContext()); |
---|
1301 | const auto destinations = mStrideLoopBranch->getNumDestinations(); |
---|
1302 | uint32_t weights[destinations]; |
---|
1303 | for (unsigned i = 0; i < destinations; ++i) { |
---|
1304 | weights[i] = (mStrideLoopBranch->getDestination(i) == segmentDone) ? 100 : 1; |
---|
1305 | } |
---|
1306 | ArrayRef<uint32_t> bw(weights, destinations); |
---|
1307 | mStrideLoopBranch->setMetadata(LLVMContext::MD_prof, mdb.createBranchWeights(bw)); |
---|
1308 | } |
---|
1309 | |
---|
1310 | } |
---|
1311 | |
---|
1312 | inline void BlockOrientedKernel::writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb) { |
---|
1313 | |
---|
1314 | Value * const self = getInstance(); |
---|
1315 | Function * const cp = mCurrentMethod; |
---|
1316 | auto ip = idb->saveIP(); |
---|
1317 | std::vector<Value *> availableItemCount(0); |
---|
1318 | |
---|
1319 | /// Check if the do block method is called and create the function if necessary |
---|
1320 | if (!idb->supportsIndirectBr()) { |
---|
1321 | |
---|
1322 | std::vector<Type *> params; |
---|
1323 | params.reserve(1 + mAvailableItemCount.size()); |
---|
1324 | params.push_back(self->getType()); |
---|
1325 | for (Value * avail : mAvailableItemCount) { |
---|
1326 | params.push_back(avail->getType()); |
---|
1327 | } |
---|
1328 | |
---|
1329 | FunctionType * const type = FunctionType::get(idb->getVoidTy(), params, false); |
---|
1330 | mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, idb->getModule()); |
---|
1331 | mCurrentMethod->setCallingConv(CallingConv::C); |
---|
1332 | mCurrentMethod->setDoesNotThrow(); |
---|
1333 | mCurrentMethod->setDoesNotCapture(1); |
---|
1334 | auto args = mCurrentMethod->arg_begin(); |
---|
1335 | args->setName("self"); |
---|
1336 | setInstance(&*args); |
---|
1337 | availableItemCount.reserve(mAvailableItemCount.size()); |
---|
1338 | while (++args != mCurrentMethod->arg_end()) { |
---|
1339 | availableItemCount.push_back(&*args); |
---|
1340 | } |
---|
1341 | assert (availableItemCount.size() == mAvailableItemCount.size()); |
---|
1342 | mAvailableItemCount.swap(availableItemCount); |
---|
1343 | idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod)); |
---|
1344 | } |
---|
1345 | |
---|
1346 | generateDoBlockMethod(idb); // must be implemented by the BlockOrientedKernelBuilder subtype |
---|
1347 | |
---|
1348 | if (!idb->supportsIndirectBr()) { |
---|
1349 | // Restore the DoSegment function state then call the DoBlock method |
---|
1350 | idb->CreateRetVoid(); |
---|
1351 | mDoBlockMethod = mCurrentMethod; |
---|
1352 | idb->restoreIP(ip); |
---|
1353 | setInstance(self); |
---|
1354 | mCurrentMethod = cp; |
---|
1355 | mAvailableItemCount.swap(availableItemCount); |
---|
1356 | CreateDoBlockMethodCall(idb); |
---|
1357 | } |
---|
1358 | |
---|
1359 | } |
---|
1360 | |
---|
1361 | inline void BlockOrientedKernel::writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * remainingItems) { |
---|
1362 | |
---|
1363 | Value * const self = getInstance(); |
---|
1364 | Function * const cp = mCurrentMethod; |
---|
1365 | Value * const remainingItemCount = remainingItems; |
---|
1366 | auto ip = idb->saveIP(); |
---|
1367 | std::vector<Value *> availableItemCount(0); |
---|
1368 | |
---|
1369 | if (!idb->supportsIndirectBr()) { |
---|
1370 | std::vector<Type *> params; |
---|
1371 | params.reserve(2 + mAvailableItemCount.size()); |
---|
1372 | params.push_back(self->getType()); |
---|
1373 | params.push_back(idb->getSizeTy()); |
---|
1374 | for (Value * avail : mAvailableItemCount) { |
---|
1375 | params.push_back(avail->getType()); |
---|
1376 | } |
---|
1377 | FunctionType * const type = FunctionType::get(idb->getVoidTy(), params, false); |
---|
1378 | mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, idb->getModule()); |
---|
1379 | mCurrentMethod->setCallingConv(CallingConv::C); |
---|
1380 | mCurrentMethod->setDoesNotThrow(); |
---|
1381 | mCurrentMethod->setDoesNotCapture(1); |
---|
1382 | auto args = mCurrentMethod->arg_begin(); |
---|
1383 | args->setName("self"); |
---|
1384 | setInstance(&*args); |
---|
1385 | remainingItems = &*(++args); |
---|
1386 | remainingItems->setName("remainingItems"); |
---|
1387 | availableItemCount.reserve(mAvailableItemCount.size()); |
---|
1388 | while (++args != mCurrentMethod->arg_end()) { |
---|
1389 | availableItemCount.push_back(&*args); |
---|
1390 | } |
---|
1391 | assert (availableItemCount.size() == mAvailableItemCount.size()); |
---|
1392 | mAvailableItemCount.swap(availableItemCount); |
---|
1393 | idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod)); |
---|
1394 | } |
---|
1395 | |
---|
1396 | generateFinalBlockMethod(idb, remainingItems); // may be implemented by the BlockOrientedKernel subtype |
---|
1397 | |
---|
1398 | if (!idb->supportsIndirectBr()) { |
---|
1399 | idb->CreateRetVoid(); |
---|
1400 | idb->restoreIP(ip); |
---|
1401 | setInstance(self); |
---|
1402 | mAvailableItemCount.swap(availableItemCount); |
---|
1403 | // Restore the DoSegment function state then call the DoFinal method |
---|
1404 | std::vector<Value *> args; |
---|
1405 | args.reserve(2 + mAvailableItemCount.size()); |
---|
1406 | args.push_back(self); |
---|
1407 | args.push_back(remainingItemCount); |
---|
1408 | for (Value * avail : mAvailableItemCount) { |
---|
1409 | args.push_back(avail); |
---|
1410 | } |
---|
1411 | idb->CreateCall(mCurrentMethod, args); |
---|
1412 | mCurrentMethod = cp; |
---|
1413 | } |
---|
1414 | |
---|
1415 | } |
---|
1416 | |
---|
1417 | // The default finalBlock method simply dispatches to the doBlock routine. |
---|
1418 | void BlockOrientedKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * /* remainingItems */) { |
---|
1419 | CreateDoBlockMethodCall(idb); |
---|
1420 | } |
---|
1421 | |
---|
1422 | void BlockOrientedKernel::CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & idb) { |
---|
1423 | if (idb->supportsIndirectBr()) { |
---|
1424 | BasicBlock * bb = idb->CreateBasicBlock("resume"); |
---|
1425 | mStrideLoopBranch->addDestination(bb); |
---|
1426 | mStrideLoopTarget->addIncoming(BlockAddress::get(bb), idb->GetInsertBlock()); |
---|
1427 | idb->CreateBr(mStrideLoopBody); |
---|
1428 | bb->moveAfter(idb->GetInsertBlock()); |
---|
1429 | idb->SetInsertPoint(bb); |
---|
1430 | } else { |
---|
1431 | std::vector<Value *> args; |
---|
1432 | args.reserve(1 + mAvailableItemCount.size()); |
---|
1433 | args.push_back(getInstance()); |
---|
1434 | for (Value * avail : mAvailableItemCount) { |
---|
1435 | args.push_back(avail); |
---|
1436 | } |
---|
1437 | idb->CreateCall(mDoBlockMethod, args); |
---|
1438 | } |
---|
1439 | } |
---|
1440 | |
---|
1441 | static inline std::string annotateKernelNameWithDebugFlags(std::string && name) { |
---|
1442 | if (codegen::DebugOptionIsSet(codegen::EnableAsserts)) { |
---|
1443 | name += "_EA"; |
---|
1444 | } |
---|
1445 | name += "_O" + std::to_string((int)codegen::OptLevel); |
---|
1446 | return name; |
---|
1447 | } |
---|
1448 | |
---|
1449 | // CONSTRUCTOR |
---|
1450 | Kernel::Kernel(std::string && kernelName, |
---|
1451 | std::vector<Binding> && stream_inputs, |
---|
1452 | std::vector<Binding> && stream_outputs, |
---|
1453 | std::vector<Binding> && scalar_parameters, |
---|
1454 | std::vector<Binding> && scalar_outputs, |
---|
1455 | std::vector<Binding> && internal_scalars) |
---|
1456 | : KernelInterface(annotateKernelNameWithDebugFlags(std::move(kernelName)) |
---|
1457 | , std::move(stream_inputs), std::move(stream_outputs) |
---|
1458 | , std::move(scalar_parameters), std::move(scalar_outputs) |
---|
1459 | , std::move(internal_scalars)) |
---|
1460 | , mCurrentMethod(nullptr) |
---|
1461 | , mAvailablePrincipleItemCount(nullptr) |
---|
1462 | , mNoTerminateAttribute(false) |
---|
1463 | , mIsGenerated(false) |
---|
1464 | , mStride(0) |
---|
1465 | , mIsFinal(nullptr) |
---|
1466 | , mOutputScalarResult(nullptr) { |
---|
1467 | |
---|
1468 | } |
---|
1469 | |
---|
1470 | Kernel::~Kernel() { |
---|
1471 | |
---|
1472 | } |
---|
1473 | |
---|
1474 | // CONSTRUCTOR |
---|
1475 | BlockOrientedKernel::BlockOrientedKernel(std::string && kernelName, |
---|
1476 | std::vector<Binding> && stream_inputs, |
---|
1477 | std::vector<Binding> && stream_outputs, |
---|
1478 | std::vector<Binding> && scalar_parameters, |
---|
1479 | std::vector<Binding> && scalar_outputs, |
---|
1480 | std::vector<Binding> && internal_scalars) |
---|
1481 | : MultiBlockKernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) |
---|
1482 | , mDoBlockMethod(nullptr) |
---|
1483 | , mStrideLoopBody(nullptr) |
---|
1484 | , mStrideLoopBranch(nullptr) |
---|
1485 | , mStrideLoopTarget(nullptr) { |
---|
1486 | |
---|
1487 | } |
---|
1488 | |
---|
1489 | // MULTI-BLOCK KERNEL CONSTRUCTOR |
---|
1490 | MultiBlockKernel::MultiBlockKernel(std::string && kernelName, |
---|
1491 | std::vector<Binding> && stream_inputs, |
---|
1492 | std::vector<Binding> && stream_outputs, |
---|
1493 | std::vector<Binding> && scalar_parameters, |
---|
1494 | std::vector<Binding> && scalar_outputs, |
---|
1495 | std::vector<Binding> && internal_scalars) |
---|
1496 | : Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) { |
---|
1497 | |
---|
1498 | } |
---|
1499 | |
---|
1500 | // CONSTRUCTOR |
---|
1501 | SegmentOrientedKernel::SegmentOrientedKernel(std::string && kernelName, |
---|
1502 | std::vector<Binding> && stream_inputs, |
---|
1503 | std::vector<Binding> && stream_outputs, |
---|
1504 | std::vector<Binding> && scalar_parameters, |
---|
1505 | std::vector<Binding> && scalar_outputs, |
---|
1506 | std::vector<Binding> && internal_scalars) |
---|
1507 | : Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) { |
---|
1508 | |
---|
1509 | } |
---|
1510 | |
---|
1511 | |
---|
1512 | } |
---|