source: icGREP/icgrep-devel/icgrep/kernels/kernel.cpp @ 5439

Last change on this file since 5439 was 5439, checked in by cameron, 2 years ago

Multiblock Kernels: initial check-in

File size: 64.2 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "kernel.h"
7#include <toolchain/toolchain.h>
8#include <kernels/streamset.h>
9#include <llvm/IR/Constants.h>
10#include <llvm/IR/Function.h>
11#include <llvm/IR/Instructions.h>
12#include <llvm/IR/MDBuilder.h>
13#include <llvm/IR/Module.h>
14#include <llvm/Support/raw_ostream.h>
15#include <llvm/Bitcode/ReaderWriter.h>
16#include <llvm/Transforms/Utils/Local.h>
17#include <kernels/streamset.h>
18#include <sstream>
19#include <kernels/kernel_builder.h>
20
21using namespace llvm;
22using namespace parabix;
23
24namespace kernel {
25
26const std::string Kernel::DO_BLOCK_SUFFIX = "_DoBlock";
27const std::string Kernel::FINAL_BLOCK_SUFFIX = "_FinalBlock";
28const std::string Kernel::MULTI_BLOCK_SUFFIX = "_MultiBlock";
29const std::string Kernel::LOGICAL_SEGMENT_NO_SCALAR = "logicalSegNo";
30const std::string Kernel::PROCESSED_ITEM_COUNT_SUFFIX = "_processedItemCount";
31const std::string Kernel::CONSUMED_ITEM_COUNT_SUFFIX = "_consumedItemCount";
32const std::string Kernel::PRODUCED_ITEM_COUNT_SUFFIX = "_producedItemCount";
33const std::string Kernel::TERMINATION_SIGNAL = "terminationSignal";
34const std::string Kernel::BUFFER_PTR_SUFFIX = "_bufferPtr";
35const std::string Kernel::CONSUMER_SUFFIX = "_consumerLocks";
36
37unsigned Kernel::addScalar(Type * const type, const std::string & name) {
38    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
39        report_fatal_error("Cannot add field " + name + " to " + getName() + " after kernel state finalized");
40    }
41    if (LLVM_UNLIKELY(mKernelMap.count(name))) {
42        report_fatal_error(getName() + " already contains scalar field " + name);
43    }
44    const auto index = mKernelFields.size();
45    mKernelMap.emplace(name, index);
46    mKernelFields.push_back(type);
47    return index;
48}
49
50unsigned Kernel::addUnnamedScalar(Type * const type) {
51    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
52        report_fatal_error("Cannot add unnamed field  to " + getName() + " after kernel state finalized");
53    }
54    const auto index = mKernelFields.size();
55    mKernelFields.push_back(type);
56    return index;
57}
58
59// Get the value of a scalar field for the current instance.
60llvm::Value * Kernel::getScalarFieldPtr(llvm::Value * index) const {
61    return iBuilder->CreateGEP(getInstance(), {iBuilder->getInt32(0), index});
62}
63
64llvm::Value * Kernel::getScalarFieldPtr(const std::string & fieldName) const {
65    return getScalarFieldPtr(iBuilder->getInt32(getScalarIndex(fieldName)));
66}
67
68llvm::Value * Kernel::getScalarField(const std::string & fieldName) const {
69    return iBuilder->CreateLoad(getScalarFieldPtr(fieldName), fieldName);
70}
71
72// Set the value of a scalar field for the current instance.
73void Kernel::setScalarField(const std::string & fieldName, llvm::Value * value) const {
74    iBuilder->CreateStore(value, getScalarFieldPtr(fieldName));
75}
76
77void Kernel::prepareStreamSetNameMap() {
78    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
79        mStreamMap.emplace(mStreamSetInputs[i].name, std::make_pair(Port::Input, i));
80    }
81    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
82        mStreamMap.emplace(mStreamSetOutputs[i].name, std::make_pair(Port::Output, i));
83    }
84}
85   
86void Kernel::prepareKernel() {
87    assert ("KernelBuilder does not have a valid IDISA Builder" && iBuilder);
88    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
89        report_fatal_error("Cannot prepare kernel after kernel state finalized");
90    }
91    if (mStreamSetInputs.size() != mStreamSetInputBuffers.size()) {
92        std::string tmp;
93        raw_string_ostream out(tmp);
94        out << "kernel contains " << mStreamSetInputBuffers.size() << " input buffers for "
95            << mStreamSetInputs.size() << " input stream sets.";
96        report_fatal_error(out.str());
97    }
98    if (mStreamSetOutputs.size() != mStreamSetOutputBuffers.size()) {
99        std::string tmp;
100        raw_string_ostream out(tmp);
101        out << "kernel contains " << mStreamSetOutputBuffers.size() << " output buffers for "
102            << mStreamSetOutputs.size() << " output stream sets.";
103        report_fatal_error(out.str());
104    }
105    const auto blockSize = iBuilder->getBitBlockWidth();
106    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
107        if ((mStreamSetInputBuffers[i]->getBufferBlocks() > 0) && (mStreamSetInputBuffers[i]->getBufferBlocks() < codegen::SegmentSize + (blockSize + mLookAheadPositions - 1)/blockSize)) {
108            report_fatal_error("Kernel preparation: Buffer size too small " + mStreamSetInputs[i].name);
109        }
110        mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getPointerType(), mStreamSetInputs[i].name + BUFFER_PTR_SUFFIX);
111        if ((i == 0) || !mStreamSetInputs[i].rate.isExact()) {
112            addScalar(iBuilder->getSizeTy(), mStreamSetInputs[i].name + PROCESSED_ITEM_COUNT_SUFFIX);
113        }       
114    }
115
116    IntegerType * const sizeTy = iBuilder->getSizeTy();
117    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
118        mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getPointerType(), mStreamSetOutputs[i].name + BUFFER_PTR_SUFFIX);
119        if ((mStreamSetInputs.empty() && (i == 0)) || !mStreamSetOutputs[i].rate.isExact()) {
120            addScalar(sizeTy, mStreamSetOutputs[i].name + PRODUCED_ITEM_COUNT_SUFFIX);
121        }
122    }
123    for (const auto binding : mScalarInputs) {
124        addScalar(binding.type, binding.name);
125    }
126    for (const auto binding : mScalarOutputs) {
127        addScalar(binding.type, binding.name);
128    }
129    if (mStreamMap.empty()) {
130        prepareStreamSetNameMap();
131    }
132    for (auto binding : mInternalScalars) {
133        addScalar(binding.type, binding.name);
134    }
135
136    Type * const consumerSetTy = StructType::get(sizeTy, sizeTy->getPointerTo()->getPointerTo(), nullptr)->getPointerTo();
137    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
138        addScalar(consumerSetTy, mStreamSetOutputs[i].name + CONSUMER_SUFFIX);
139    }
140
141    addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
142    addScalar(iBuilder->getInt1Ty(), TERMINATION_SIGNAL);
143
144    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
145        addScalar(sizeTy, mStreamSetOutputs[i].name + CONSUMED_ITEM_COUNT_SUFFIX);
146    }
147
148    mKernelStateType = StructType::create(iBuilder->getContext(), mKernelFields, getName());
149}
150
151void Kernel::createKernelStub(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs) {
152    assert ("KernelBuilder does not have a valid IDISA Builder" && iBuilder);
153    assert ("IDISA Builder does not have a valid Module" && iBuilder->getModule());
154    std::stringstream cacheName;   
155    cacheName << getName() << '_' << iBuilder->getBuilderUniqueName();
156    for (const StreamSetBuffer * b: inputs) {
157        cacheName <<  ':' <<  b->getUniqueID();
158    }
159    for (const StreamSetBuffer * b: outputs) {
160        cacheName <<  ':' <<  b->getUniqueID();
161    }
162    Module * const kernelModule = new Module(cacheName.str(), iBuilder->getContext());
163    kernelModule->setTargetTriple(iBuilder->getModule()->getTargetTriple());
164    createKernelStub(inputs, outputs, kernelModule);
165}
166
167void Kernel::createKernelStub(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs, Module * const kernelModule) {
168    assert (mModule == nullptr);
169    assert ("KernelBuilder does not have a valid IDISA Builder" && iBuilder);
170    assert (mStreamSetInputBuffers.empty());
171    assert (mStreamSetOutputBuffers.empty());
172
173    if (LLVM_UNLIKELY(mStreamSetInputs.size() != inputs.size())) {
174        report_fatal_error(getName() + ": expected " + std::to_string(mStreamSetInputs.size()) +
175                           " input stream sets but was given "
176                           + std::to_string(inputs.size()));
177    }
178
179    for (unsigned i = 0; i < inputs.size(); ++i) {
180        StreamSetBuffer * const buf = inputs[i];
181        if (LLVM_UNLIKELY(buf == nullptr)) {
182            report_fatal_error(getName() + ": input stream set " + std::to_string(i)
183                               + " cannot be null");
184        }
185        buf->addConsumer(this);
186    }
187
188    if (LLVM_UNLIKELY(mStreamSetOutputs.size() != outputs.size())) {
189        report_fatal_error(getName() + ": expected " + std::to_string(mStreamSetOutputs.size())
190                           + " output stream sets but was given "
191                           + std::to_string(outputs.size()));
192    }
193
194    for (unsigned i = 0; i < outputs.size(); ++i) {
195        StreamSetBuffer * const buf = outputs[i];
196        if (LLVM_UNLIKELY(buf == nullptr)) {
197            report_fatal_error(getName() + ": output stream set " + std::to_string(i) + " cannot be null");
198        }
199        if (LLVM_LIKELY(buf->getProducer() == nullptr)) {
200            buf->setProducer(this);
201        } else {
202            report_fatal_error(getName() + ": output stream set " + std::to_string(i)
203                               + " is already produced by kernel " + buf->getProducer()->getName());
204        }
205    }
206
207    mModule = kernelModule;
208
209    mStreamSetInputBuffers.assign(inputs.begin(), inputs.end());
210    mStreamSetOutputBuffers.assign(outputs.begin(), outputs.end());
211
212    prepareKernel();
213}
214
215
216// Default kernel signature: generate the IR and emit as byte code.
217std::string Kernel::makeSignature() {
218    assert ("KernelBuilder does not have a valid IDISA Builder" && iBuilder);
219    if (LLVM_LIKELY(moduleIDisSignature())) {
220        return getModule()->getModuleIdentifier();
221    } else {
222        generateKernel();
223        std::string signature;
224        raw_string_ostream OS(signature);
225        WriteBitcodeToFile(getModule(), OS);
226        return signature;
227    }
228}
229
230void Kernel::generateKernel() {
231    assert ("KernelBuilder does not have a valid IDISA Builder" && iBuilder);
232    // If the module id cannot uniquely identify this kernel, "generateKernelSignature()" will have already
233    // generated the unoptimized IR.
234    if (!mIsGenerated) {
235        auto ip = iBuilder->saveIP();
236        auto saveInstance = getInstance();
237        addKernelDeclarations();
238        callGenerateInitializeMethod();
239        callGenerateDoSegmentMethod();       
240        callGenerateFinalizeMethod();
241        setInstance(saveInstance);
242        iBuilder->restoreIP(ip);
243        mIsGenerated = true;
244    }
245}
246
247inline void Kernel::callGenerateInitializeMethod() {
248    mCurrentMethod = getInitFunction(iBuilder->getModule());
249    iBuilder->SetInsertPoint(CreateBasicBlock("entry"));
250    Function::arg_iterator args = mCurrentMethod->arg_begin();
251    setInstance(&*(args++));
252    iBuilder->CreateStore(ConstantAggregateZero::get(mKernelStateType), getInstance());
253    for (const auto & binding : mScalarInputs) {
254        setScalarField(binding.name, &*(args++));
255    }
256    for (const auto & binding : mStreamSetOutputs) {
257        setConsumerLock(binding.name, &*(args++));
258    }
259    generateInitializeMethod();
260    iBuilder->CreateRetVoid();
261}
262
263inline void Kernel::callGenerateDoSegmentMethod() {
264    mCurrentMethod = getDoSegmentFunction(iBuilder->getModule());
265    BasicBlock * const entry = CreateBasicBlock(getName() + "_entry");
266    iBuilder->SetInsertPoint(entry);
267    auto args = mCurrentMethod->arg_begin();
268    setInstance(&*(args++));
269    mIsFinal = &*(args++);
270    const auto n = mStreamSetInputs.size();
271    mAvailableItemCount.resize(n, nullptr);
272    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
273        mAvailableItemCount[i] = &*(args++);
274    }
275    generateDoSegmentMethod(); // must be overridden by the KernelBuilder subtype
276    mIsFinal = nullptr;
277    mAvailableItemCount.clear();
278    iBuilder->CreateRetVoid();
279}
280
281inline void Kernel::callGenerateFinalizeMethod() {
282    mCurrentMethod = getTerminateFunction(iBuilder->getModule());
283    iBuilder->SetInsertPoint(CreateBasicBlock("entry"));
284    auto args = mCurrentMethod->arg_begin();
285    setInstance(&*(args++));
286    generateFinalizeMethod(); // may be overridden by the KernelBuilder subtype
287    const auto n = mScalarOutputs.size();
288    if (n == 0) {
289        iBuilder->CreateRetVoid();
290    } else {
291        Value * outputs[n];
292        for (unsigned i = 0; i < n; ++i) {
293            outputs[i] = getScalarField(mScalarOutputs[i].name);
294        }
295        if (n == 1) {
296            iBuilder->CreateRet(outputs[0]);
297        } else {
298            iBuilder->CreateAggregateRet(outputs, n);
299        }
300    }
301}
302
303unsigned Kernel::getScalarIndex(const std::string & name) const {
304    assert ("getScalarIndex was given a null IDISA Builder" && iBuilder);
305    const auto f = mKernelMap.find(name);
306    if (LLVM_UNLIKELY(f == mKernelMap.end())) {
307        report_fatal_error(getName() + " does not contain scalar: " + name);
308    }
309    return f->second;
310}
311
312Value * Kernel::getProducedItemCount(const std::string & name, Value * doFinal) const {
313    Port port; unsigned ssIdx;
314    std::tie(port, ssIdx) = getStreamPort(name);
315    assert (port == Port::Output);
316    if (mStreamSetOutputs[ssIdx].rate.isExact()) {
317        std::string refSet = mStreamSetOutputs[ssIdx].rate.referenceStreamSet();
318        std::string principalField;
319        if (refSet.empty()) {
320            if (mStreamSetInputs.empty()) {
321                principalField = mStreamSetOutputs[0].name + PRODUCED_ITEM_COUNT_SUFFIX;
322            } else {
323                principalField = mStreamSetInputs[0].name + PROCESSED_ITEM_COUNT_SUFFIX;
324            }
325        } else {
326            Port port; unsigned pfIndex;
327            std::tie(port, pfIndex) = getStreamPort(refSet);
328            if (port == Port::Input) {
329               principalField = refSet + PROCESSED_ITEM_COUNT_SUFFIX;
330            } else {
331               principalField = refSet + PRODUCED_ITEM_COUNT_SUFFIX;
332            }
333        }
334        Value * principalItemsProcessed = getScalarField(principalField);
335        return mStreamSetOutputs[ssIdx].rate.CreateRatioCalculation(iBuilder, principalItemsProcessed, doFinal);
336    }
337    return getScalarField(name + PRODUCED_ITEM_COUNT_SUFFIX);
338}
339
340llvm::Value * Kernel::getAvailableItemCount(const std::string & name) const {
341    for (unsigned i = 0; i < mStreamSetInputs.size(); ++i) {
342        if (mStreamSetInputs[i].name == name) {
343            return mAvailableItemCount[i];
344        }
345    }
346    return nullptr;
347}
348
349Value * Kernel::getProcessedItemCount(const std::string & name) const {
350    Port port; unsigned ssIdx;
351    std::tie(port, ssIdx) = getStreamPort(name);
352    assert (port == Port::Input);
353    if (mStreamSetInputs[ssIdx].rate.isExact()) {
354        std::string refSet = mStreamSetInputs[ssIdx].rate.referenceStreamSet();
355        if (refSet.empty()) {
356            refSet = mStreamSetInputs[0].name;
357        }
358        Value * principalItemsProcessed = getScalarField(refSet + PROCESSED_ITEM_COUNT_SUFFIX);
359        return mStreamSetInputs[ssIdx].rate.CreateRatioCalculation(iBuilder, principalItemsProcessed);
360    }
361    return getScalarField(name + PROCESSED_ITEM_COUNT_SUFFIX);
362}
363
364Value * Kernel::getConsumedItemCount(const std::string & name) const {
365    return getScalarField(name + CONSUMED_ITEM_COUNT_SUFFIX);
366}
367
368void Kernel::setProducedItemCount(const std::string & name, Value * value) const {
369    setScalarField(name + PRODUCED_ITEM_COUNT_SUFFIX, value);
370}
371
372void Kernel::setProcessedItemCount(const std::string & name, Value * value) const {
373    setScalarField(name + PROCESSED_ITEM_COUNT_SUFFIX, value);
374}
375
376void Kernel::setConsumedItemCount(const std::string & name, Value * value) const {
377    setScalarField(name + CONSUMED_ITEM_COUNT_SUFFIX, value);
378}
379
380Value * Kernel::getTerminationSignal() const {
381    return getScalarField(TERMINATION_SIGNAL);
382}
383
384void Kernel::setTerminationSignal() const {
385    setScalarField(TERMINATION_SIGNAL, iBuilder->getTrue());
386}
387
388LoadInst * Kernel::acquireLogicalSegmentNo() const {
389    assert (iBuilder);
390    return iBuilder->CreateAtomicLoadAcquire(getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR));
391}
392
393void Kernel::releaseLogicalSegmentNo(Value * nextSegNo) const {
394    iBuilder->CreateAtomicStoreRelease(nextSegNo, getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR));
395}
396
397llvm::Value * Kernel::getLinearlyAccessibleItems(const std::string & name, llvm::Value * fromPosition) const {
398    const StreamSetBuffer * const buf = getInputStreamSetBuffer(name);
399    return buf->getLinearlyAccessibleItems(iBuilder, fromPosition);
400}
401
402llvm::Value * Kernel::getConsumerLock(const std::string & name) const {
403    return getScalarField(name + CONSUMER_SUFFIX);
404}
405
406void Kernel::setConsumerLock(const std::string & name, llvm::Value * value) const {
407    setScalarField(name + CONSUMER_SUFFIX, value);
408}
409
410inline Value * Kernel::computeBlockIndex(const std::vector<Binding> & bindings, const std::string & name, Value * itemCount) const {
411    for (const Binding & b : bindings) {
412        if (b.name == name) {
413            const auto divisor = iBuilder->getBitBlockWidth();
414            if (LLVM_LIKELY((divisor & (divisor - 1)) == 0)) {
415                return iBuilder->CreateLShr(itemCount, std::log2(divisor));
416            } else {
417                return iBuilder->CreateUDiv(itemCount, iBuilder->getSize(divisor));
418            }
419        }
420    }
421    report_fatal_error("Error: no binding in " + getName() + " for " + name);
422}
423
424Value * Kernel::getInputStreamBlockPtr(const std::string & name, Value * streamIndex) const {
425    Value * const blockIndex = computeBlockIndex(mStreamSetInputs, name, getProcessedItemCount(name));
426    const StreamSetBuffer * const buf = getInputStreamSetBuffer(name);
427    return buf->getStreamBlockPtr(iBuilder, getStreamSetBufferPtr(name), streamIndex, blockIndex, true);
428}
429
430Value * Kernel::loadInputStreamBlock(const std::string & name, Value * streamIndex) const {
431    return iBuilder->CreateBlockAlignedLoad(getInputStreamBlockPtr(name, streamIndex));
432}
433
434Value * Kernel::getInputStreamPackPtr(const std::string & name, Value * streamIndex, Value * packIndex) const {
435    Value * const blockIndex = computeBlockIndex(mStreamSetInputs, name, getProcessedItemCount(name));
436    const StreamSetBuffer * const buf = getInputStreamSetBuffer(name);
437    return buf->getStreamPackPtr(iBuilder, getStreamSetBufferPtr(name), streamIndex, blockIndex, packIndex, true);
438}
439
440Value * Kernel::loadInputStreamPack(const std::string & name, Value * streamIndex, Value * packIndex) const {
441    return iBuilder->CreateBlockAlignedLoad(getInputStreamPackPtr(name, streamIndex, packIndex));
442}
443
444llvm::Value * Kernel::getInputStreamSetCount(const std::string & name) const {
445    return getInputStreamSetBuffer(name)->getStreamSetCount(iBuilder, getStreamSetBufferPtr(name));
446}
447
448llvm::Value * Kernel::getAdjustedInputStreamBlockPtr(Value * blockAdjustment, const std::string & name, llvm::Value * streamIndex) const {
449    Value * blockIndex = computeBlockIndex(mStreamSetInputs, name, getProcessedItemCount(name));
450    blockIndex = iBuilder->CreateAdd(blockIndex, blockAdjustment);
451    const StreamSetBuffer * const buf = getInputStreamSetBuffer(name);
452    return buf->getStreamBlockPtr(iBuilder, getStreamSetBufferPtr(name), streamIndex, blockIndex, true);
453}
454
455Value * Kernel::getOutputStreamBlockPtr(const std::string & name, Value * streamIndex) const {
456    Value * const blockIndex = computeBlockIndex(mStreamSetOutputs, name, getProducedItemCount(name));
457    const StreamSetBuffer * const buf = getOutputStreamSetBuffer(name);
458    return buf->getStreamBlockPtr(iBuilder, getStreamSetBufferPtr(name), streamIndex, blockIndex, false);
459}
460
461void Kernel::storeOutputStreamBlock(const std::string & name, Value * streamIndex, Value * toStore) const {
462    return iBuilder->CreateBlockAlignedStore(toStore, getOutputStreamBlockPtr(name, streamIndex));
463}
464
465Value * Kernel::getOutputStreamPackPtr(const std::string & name, Value * streamIndex, Value * packIndex) const {
466    Value * const blockIndex = computeBlockIndex(mStreamSetOutputs, name, getProducedItemCount(name));
467    const StreamSetBuffer * const buf = getOutputStreamSetBuffer(name);
468    return buf->getStreamPackPtr(iBuilder, getStreamSetBufferPtr(name), streamIndex, blockIndex, packIndex, false);
469}
470
471void Kernel::storeOutputStreamPack(const std::string & name, Value * streamIndex, Value * packIndex, Value * toStore) const {
472    return iBuilder->CreateBlockAlignedStore(toStore, getOutputStreamPackPtr(name, streamIndex, packIndex));
473}
474
475llvm::Value * Kernel::getOutputStreamSetCount(const std::string & name) const {
476    return getOutputStreamSetBuffer(name)->getStreamSetCount(iBuilder, getStreamSetBufferPtr(name));
477}
478
479Value * Kernel::getRawInputPointer(const std::string & name, Value * streamIndex, Value * absolutePosition) const {
480    return getInputStreamSetBuffer(name)->getRawItemPointer(iBuilder, getStreamSetBufferPtr(name), streamIndex, absolutePosition);
481}
482
483Value * Kernel::getRawOutputPointer(const std::string & name, Value * streamIndex, Value * absolutePosition) const {
484    return getOutputStreamSetBuffer(name)->getRawItemPointer(iBuilder, getStreamSetBufferPtr(name), streamIndex, absolutePosition);
485}
486
487Value * Kernel::getBaseAddress(const std::string & name) const {
488    return getAnyStreamSetBuffer(name)->getBaseAddress(iBuilder, getStreamSetBufferPtr(name));
489}
490
491void Kernel::setBaseAddress(const std::string & name, Value * const addr) const {
492    return getAnyStreamSetBuffer(name)->setBaseAddress(iBuilder, getStreamSetBufferPtr(name), addr);
493}
494
495Value * Kernel::getBufferedSize(const std::string & name) const {
496    return getAnyStreamSetBuffer(name)->getBufferedSize(iBuilder, getStreamSetBufferPtr(name));
497}
498
499void Kernel::setBufferedSize(const std::string & name, Value * size) const {
500    unsigned index; Port port;
501    std::tie(port, index) = getStreamPort(name);
502    const StreamSetBuffer * buf = nullptr;
503    if (port == Port::Input) {
504        assert (index < mStreamSetInputBuffers.size());
505        buf = mStreamSetInputBuffers[index];
506    } else {
507        assert (index < mStreamSetOutputBuffers.size());
508        buf = mStreamSetOutputBuffers[index];
509    }
510    buf->setBufferedSize(iBuilder, getStreamSetBufferPtr(name), size);
511}
512
513BasicBlock * Kernel::CreateWaitForConsumers() const {
514
515    const auto consumers = getStreamOutputs();
516    BasicBlock * const entry = iBuilder->GetInsertBlock();
517    if (consumers.empty()) {
518        return entry;
519    } else {
520        Function * const parent = entry->getParent();
521        IntegerType * const sizeTy = iBuilder->getSizeTy();
522        ConstantInt * const zero = iBuilder->getInt32(0);
523        ConstantInt * const one = iBuilder->getInt32(1);
524        ConstantInt * const size0 = iBuilder->getSize(0);
525
526        Value * const segNo = acquireLogicalSegmentNo();
527        const auto n = consumers.size();
528        BasicBlock * load[n + 1];
529        BasicBlock * wait[n];
530        for (unsigned i = 0; i < n; ++i) {
531            load[i] = BasicBlock::Create(iBuilder->getContext(), consumers[i].name + "Load", parent);
532            wait[i] = BasicBlock::Create(iBuilder->getContext(), consumers[i].name + "Wait", parent);
533        }
534        load[n] = BasicBlock::Create(iBuilder->getContext(), "Resume", parent);
535        iBuilder->CreateBr(load[0]);
536        for (unsigned i = 0; i < n; ++i) {
537
538            iBuilder->SetInsertPoint(load[i]);
539            Value * const outputConsumers = getConsumerLock(consumers[i].name);
540
541            Value * const consumerCount = iBuilder->CreateLoad(iBuilder->CreateGEP(outputConsumers, {zero, zero}));
542            Value * const consumerPtr = iBuilder->CreateLoad(iBuilder->CreateGEP(outputConsumers, {zero, one}));
543            Value * const noConsumers = iBuilder->CreateICmpEQ(consumerCount, size0);
544            iBuilder->CreateUnlikelyCondBr(noConsumers, load[i + 1], wait[i]);
545
546            iBuilder->SetInsertPoint(wait[i]);
547            PHINode * const consumerPhi = iBuilder->CreatePHI(sizeTy, 2);
548            consumerPhi->addIncoming(size0, load[i]);
549
550            Value * const conSegPtr = iBuilder->CreateLoad(iBuilder->CreateGEP(consumerPtr, consumerPhi));
551            Value * const processedSegmentCount = iBuilder->CreateAtomicLoadAcquire(conSegPtr);
552            Value * const ready = iBuilder->CreateICmpEQ(segNo, processedSegmentCount);
553            assert (ready->getType() == iBuilder->getInt1Ty());
554            Value * const nextConsumerIdx = iBuilder->CreateAdd(consumerPhi, iBuilder->CreateZExt(ready, sizeTy));
555            consumerPhi->addIncoming(nextConsumerIdx, wait[i]);
556            Value * const next = iBuilder->CreateICmpEQ(nextConsumerIdx, consumerCount);
557            iBuilder->CreateCondBr(next, load[i + 1], wait[i]);
558        }
559
560        BasicBlock * const exit = load[n];
561        iBuilder->SetInsertPoint(exit);
562        return exit;
563    }
564
565}
566
567Value * Kernel::getStreamSetBufferPtr(const std::string & name) const {
568    return getScalarField(name + BUFFER_PTR_SUFFIX);
569}
570
571//Argument * Kernel::getParameter(Function * const f, const std::string & name) const {
572//    for (auto & arg : f->getArgumentList()) {
573//        if (arg.getName().equals(name)) {
574//            return &arg;
575//        }
576//    }
577//    report_fatal_error(getName() + " does not have parameter " + name);
578//}
579
580CallInst * Kernel::createDoSegmentCall(const std::vector<Value *> & args) const {
581    Function * const doSegment = getDoSegmentFunction(iBuilder->getModule());
582    assert (doSegment->getArgumentList().size() == args.size());
583    return iBuilder->CreateCall(doSegment, args);
584}
585
586Value * Kernel::getAccumulator(const std::string & accumName) const {
587    assert ("KernelBuilder does not have a valid IDISA Builder" && iBuilder);
588    if (LLVM_UNLIKELY(mOutputScalarResult == nullptr)) {
589        report_fatal_error("Cannot get accumulator " + accumName + " until " + getName() + " has terminated.");
590    }
591    const auto n = mScalarOutputs.size();
592    if (LLVM_UNLIKELY(n == 0)) {
593        report_fatal_error(getName() + " has no output scalars.");
594    } else {
595        for (unsigned i = 0; i < n; ++i) {
596            const Binding & b = mScalarOutputs[i];
597            if (b.name == accumName) {
598                if (n == 1) {
599                    return mOutputScalarResult;
600                } else {
601                    return iBuilder->CreateExtractValue(mOutputScalarResult, {i});
602                }
603            }
604        }
605        report_fatal_error(getName() + " has no output scalar named " + accumName);
606    }
607}
608
609BasicBlock * Kernel::CreateBasicBlock(std::string && name) const {
610    return BasicBlock::Create(iBuilder->getContext(), name, mCurrentMethod);
611}
612
613Value * Kernel::createInstance() {
614    assert ("KernelBuilder does not have a valid IDISA Builder" && iBuilder);
615    if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
616        report_fatal_error("Cannot instantiate " + getName() + " before calling prepareKernel()");
617    }
618    setInstance(iBuilder->CreateCacheAlignedAlloca(mKernelStateType));
619    return getInstance();
620}
621
622void Kernel::initializeInstance() {
623    assert ("KernelBuilder does not have a valid IDISA Builder" && iBuilder);
624    if (LLVM_UNLIKELY(getInstance() == nullptr)) {
625        report_fatal_error("Cannot initialize " + getName() + " before calling createInstance()");
626    }
627    std::vector<Value *> args;
628    args.reserve(1 + mInitialArguments.size() + mStreamSetInputBuffers.size() + (mStreamSetOutputBuffers.size() * 2));
629    args.push_back(getInstance());
630    for (unsigned i = 0; i < mInitialArguments.size(); ++i) {
631        Value * arg = mInitialArguments[i];
632        if (LLVM_UNLIKELY(arg == nullptr)) {
633            report_fatal_error(getName() + ": initial argument " + std::to_string(i)
634                               + " cannot be null when calling createInstance()");
635        }
636        args.push_back(arg);
637    }
638    for (unsigned i = 0; i < mStreamSetInputBuffers.size(); ++i) {
639        assert (mStreamSetInputBuffers[i]);
640        Value * arg = mStreamSetInputBuffers[i]->getStreamSetBasePtr();
641        if (LLVM_UNLIKELY(arg == nullptr)) {
642            report_fatal_error(getName() + ": input stream set " + std::to_string(i)
643                               + " was not allocated prior to calling createInstance()");
644        }
645        args.push_back(arg);
646    }
647    assert (mStreamSetInputs.size() == mStreamSetInputBuffers.size());
648    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
649        assert (mStreamSetOutputBuffers[i]);
650        Value * arg = mStreamSetOutputBuffers[i]->getStreamSetBasePtr();
651        if (LLVM_UNLIKELY(arg == nullptr)) {
652            report_fatal_error(getName() + ": output stream set " + std::to_string(i)
653                               + " was not allocated prior to calling createInstance()");
654        }
655        args.push_back(arg);
656    }
657    assert (mStreamSetOutputs.size() == mStreamSetOutputBuffers.size());
658    IntegerType * const sizeTy = iBuilder->getSizeTy();
659    PointerType * const sizePtrTy = sizeTy->getPointerTo();
660    PointerType * const sizePtrPtrTy = sizePtrTy->getPointerTo();
661    StructType * const consumerTy = StructType::get(sizeTy, sizePtrPtrTy, nullptr);
662    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
663        const auto output = mStreamSetOutputBuffers[i];
664        const auto & consumers = output->getConsumers();
665        const auto n = consumers.size();
666        AllocaInst * const outputConsumers = iBuilder->CreateAlloca(consumerTy);
667        Value * const consumerSegNoArray = iBuilder->CreateAlloca(ArrayType::get(sizePtrTy, n));
668        for (unsigned i = 0; i < n; ++i) {
669            Kernel * const consumer = consumers[i];
670            assert ("all instances must be created prior to initialization of any instance" && consumer->getInstance());
671            Value * const segmentNoPtr = consumer->getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR);
672            iBuilder->CreateStore(segmentNoPtr, iBuilder->CreateGEP(consumerSegNoArray, { iBuilder->getInt32(0), iBuilder->getInt32(i) }));
673        }
674        Value * const consumerCountPtr = iBuilder->CreateGEP(outputConsumers, {iBuilder->getInt32(0), iBuilder->getInt32(0)});
675        iBuilder->CreateStore(iBuilder->getSize(n), consumerCountPtr);
676        Value * const consumerSegNoArrayPtr = iBuilder->CreateGEP(outputConsumers, {iBuilder->getInt32(0), iBuilder->getInt32(1)});
677        iBuilder->CreateStore(iBuilder->CreatePointerCast(consumerSegNoArray, sizePtrPtrTy), consumerSegNoArrayPtr);
678        args.push_back(outputConsumers);
679    }
680
681    iBuilder->CreateCall(getInitFunction(iBuilder->getModule()), args);
682}
683
684//  The default doSegment method dispatches to the doBlock routine for
685//  each block of the given number of blocksToDo, and then updates counts.
686
687void BlockOrientedKernel::generateDoSegmentMethod() {   
688    BasicBlock * const entryBlock = iBuilder->GetInsertBlock();
689    BasicBlock * const strideLoopCond = CreateBasicBlock(getName() + "_strideLoopCond");
690    mStrideLoopBody = CreateBasicBlock(getName() + "_strideLoopBody");
691    BasicBlock * const stridesDone = CreateBasicBlock(getName() + "_stridesDone");
692    BasicBlock * const doFinalBlock = CreateBasicBlock(getName() + "_doFinalBlock");
693    BasicBlock * const segmentDone = CreateBasicBlock(getName() + "_segmentDone");
694
695    Value * baseTarget = nullptr;
696    if (useIndirectBr()) {
697        baseTarget = iBuilder->CreateSelect(mIsFinal, BlockAddress::get(doFinalBlock), BlockAddress::get(segmentDone));
698    }
699
700    ConstantInt * stride = iBuilder->getSize(iBuilder->getStride());
701    Value * availablePos = mAvailableItemCount[0];
702    Value * processed = getProcessedItemCount(mStreamSetInputs[0].name);
703    Value * itemsAvail = iBuilder->CreateSub(availablePos, processed);
704    Value * stridesToDo = iBuilder->CreateUDiv(itemsAvail, stride);
705
706    iBuilder->CreateBr(strideLoopCond);
707
708    iBuilder->SetInsertPoint(strideLoopCond);
709
710    PHINode * branchTarget = nullptr;
711    if (useIndirectBr()) {
712        branchTarget = iBuilder->CreatePHI(baseTarget->getType(), 2, "branchTarget");
713        branchTarget->addIncoming(baseTarget, entryBlock);
714    }
715
716    PHINode * const stridesRemaining = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2, "stridesRemaining");
717    stridesRemaining->addIncoming(stridesToDo, entryBlock);
718    // NOTE: stridesRemaining may go to a negative number in the final block if the generateFinalBlockMethod(...)
719    // calls CreateDoBlockMethodCall(). Do *not* replace the comparator with an unsigned one!
720    Value * notDone = iBuilder->CreateICmpSGT(stridesRemaining, iBuilder->getSize(0));
721    iBuilder->CreateLikelyCondBr(notDone, mStrideLoopBody, stridesDone);
722
723    iBuilder->SetInsertPoint(mStrideLoopBody);
724
725    if (useIndirectBr()) {
726        mStrideLoopTarget = iBuilder->CreatePHI(baseTarget->getType(), 2, "strideTarget");
727        mStrideLoopTarget->addIncoming(branchTarget, strideLoopCond);
728    }
729
730    /// GENERATE DO BLOCK METHOD
731
732    writeDoBlockMethod();
733
734    /// UPDATE PROCESSED COUNTS
735
736    processed = getProcessedItemCount(mStreamSetInputs[0].name);
737    Value * itemsDone = iBuilder->CreateAdd(processed, stride);
738    setProcessedItemCount(mStreamSetInputs[0].name, itemsDone);
739
740    stridesRemaining->addIncoming(iBuilder->CreateSub(stridesRemaining, iBuilder->getSize(1)), iBuilder->GetInsertBlock());
741
742    BasicBlock * bodyEnd = iBuilder->GetInsertBlock();
743    if (useIndirectBr()) {
744        branchTarget->addIncoming(mStrideLoopTarget, bodyEnd);
745    }
746    iBuilder->CreateBr(strideLoopCond);
747
748    stridesDone->moveAfter(bodyEnd);
749
750    iBuilder->SetInsertPoint(stridesDone);
751
752    // Now conditionally perform the final block processing depending on the doFinal parameter.
753    if (useIndirectBr()) {
754        mStrideLoopBranch = iBuilder->CreateIndirectBr(branchTarget, 3);
755        mStrideLoopBranch->addDestination(doFinalBlock);
756        mStrideLoopBranch->addDestination(segmentDone);
757    } else {
758        iBuilder->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone);
759    }
760
761    doFinalBlock->moveAfter(stridesDone);
762
763    iBuilder->SetInsertPoint(doFinalBlock);
764
765    Value * remainingItems = iBuilder->CreateSub(mAvailableItemCount[0], getProcessedItemCount(mStreamSetInputs[0].name));
766    writeFinalBlockMethod(remainingItems);
767
768    itemsDone = mAvailableItemCount[0];
769    setProcessedItemCount(mStreamSetInputs[0].name, itemsDone);
770    setTerminationSignal();
771    iBuilder->CreateBr(segmentDone);
772
773    segmentDone->moveAfter(iBuilder->GetInsertBlock());
774
775    iBuilder->SetInsertPoint(segmentDone);
776
777    // Update the branch prediction metadata to indicate that the likely target will be segmentDone
778    if (useIndirectBr()) {
779        MDBuilder mdb(iBuilder->getContext());
780        const auto destinations = mStrideLoopBranch->getNumDestinations();
781        uint32_t weights[destinations];
782        for (unsigned i = 0; i < destinations; ++i) {
783            weights[i] = (mStrideLoopBranch->getDestination(i) == segmentDone) ? 100 : 1;
784        }
785        ArrayRef<uint32_t> bw(weights, destinations);
786        mStrideLoopBranch->setMetadata(LLVMContext::MD_prof, mdb.createBranchWeights(bw));
787    }
788
789}
790
791inline void BlockOrientedKernel::writeDoBlockMethod() {
792
793    Value * const self = getInstance();
794    Function * const cp = mCurrentMethod;
795    auto ip = iBuilder->saveIP();
796
797    /// Check if the do block method is called and create the function if necessary   
798    if (!useIndirectBr()) {
799        FunctionType * const type = FunctionType::get(iBuilder->getVoidTy(), {self->getType()}, false);
800        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, iBuilder->getModule());
801        mCurrentMethod->setCallingConv(CallingConv::C);
802        mCurrentMethod->setDoesNotThrow();
803        mCurrentMethod->setDoesNotCapture(1);
804        auto args = mCurrentMethod->arg_begin();
805        args->setName("self");
806        setInstance(&*args);
807        iBuilder->SetInsertPoint(CreateBasicBlock("entry"));
808    }
809
810    std::vector<Value *> priorProduced;
811    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
812        if (isa<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]) || isa<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
813            priorProduced.push_back(getProducedItemCount(mStreamSetOutputs[i].name));
814        }
815    }
816
817    generateDoBlockMethod(); // must be implemented by the BlockOrientedKernelBuilder subtype
818
819    unsigned priorIdx = 0;
820    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
821        Value * log2BlockSize = iBuilder->getSize(std::log2(iBuilder->getBitBlockWidth()));
822        if (SwizzledCopybackBuffer * const cb = dyn_cast<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
823            BasicBlock * copyBack = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
824            BasicBlock * done = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
825            Value * newlyProduced = iBuilder->CreateSub(getProducedItemCount(mStreamSetOutputs[i].name), priorProduced[priorIdx]);
826            Value * priorBlock = iBuilder->CreateLShr(priorProduced[priorIdx], log2BlockSize);
827            Value * priorOffset = iBuilder->CreateAnd(priorProduced[priorIdx], iBuilder->getSize(iBuilder->getBitBlockWidth() - 1));
828            Value * instance = getStreamSetBufferPtr(mStreamSetOutputs[i].name);
829            Value * accessibleBlocks = cb->getLinearlyAccessibleBlocks(iBuilder, priorBlock);
830            Value * accessible = iBuilder->CreateSub(iBuilder->CreateShl(accessibleBlocks, log2BlockSize), priorOffset);
831            Value * wraparound = iBuilder->CreateICmpULT(accessible, newlyProduced);
832            iBuilder->CreateCondBr(wraparound, copyBack, done);
833            iBuilder->SetInsertPoint(copyBack);
834            Value * copyItems = iBuilder->CreateSub(newlyProduced, accessible);
835            cb->createCopyBack(iBuilder, instance, copyItems);
836            iBuilder->CreateBr(done);
837            iBuilder->SetInsertPoint(done);
838            priorIdx++;
839        }
840        if (CircularCopybackBuffer * const cb = dyn_cast<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
841            BasicBlock * copyBack = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
842            BasicBlock * done = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
843            Value * instance = getStreamSetBufferPtr(mStreamSetOutputs[i].name);
844            Value * newlyProduced = iBuilder->CreateSub(getProducedItemCount(mStreamSetOutputs[i].name), priorProduced[priorIdx]);
845            Value * accessible = cb->getLinearlyAccessibleItems(iBuilder, priorProduced[priorIdx]);
846            Value * wraparound = iBuilder->CreateICmpULT(accessible, newlyProduced);
847            iBuilder->CreateCondBr(wraparound, copyBack, done);
848            iBuilder->SetInsertPoint(copyBack);
849            Value * copyItems = iBuilder->CreateSub(newlyProduced, accessible);
850            cb->createCopyBack(iBuilder, instance, copyItems);
851            iBuilder->CreateBr(done);
852            iBuilder->SetInsertPoint(done);
853            priorIdx++;
854        }
855    }
856
857
858    /// Call the do block method if necessary then restore the current function state to the do segement method
859    if (!useIndirectBr()) {
860        iBuilder->CreateRetVoid();
861        mDoBlockMethod = mCurrentMethod;
862        iBuilder->restoreIP(ip);
863        iBuilder->CreateCall(mCurrentMethod, self);
864        setInstance(self);
865        mCurrentMethod = cp;
866    }
867
868}
869
870inline void BlockOrientedKernel::writeFinalBlockMethod(Value * remainingItems) {
871
872    Value * const self = getInstance();
873    Function * const cp = mCurrentMethod;
874    Value * const remainingItemCount = remainingItems;
875    auto ip = iBuilder->saveIP();
876
877    if (!useIndirectBr()) {
878        FunctionType * const type = FunctionType::get(iBuilder->getVoidTy(), {self->getType(), iBuilder->getSizeTy()}, false);
879        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, iBuilder->getModule());
880        mCurrentMethod->setCallingConv(CallingConv::C);
881        mCurrentMethod->setDoesNotThrow();
882        mCurrentMethod->setDoesNotCapture(1);
883        auto args = mCurrentMethod->arg_begin();
884        args->setName("self");
885        setInstance(&*args);
886        remainingItems = &*(++args);
887        remainingItems->setName("remainingItems");
888        iBuilder->SetInsertPoint(CreateBasicBlock("entry"));
889    }
890
891    generateFinalBlockMethod(remainingItems); // may be implemented by the BlockOrientedKernel subtype
892
893    RecursivelyDeleteTriviallyDeadInstructions(remainingItems); // if remainingItems was not used, this will eliminate it.
894
895    if (!useIndirectBr()) {
896        iBuilder->CreateRetVoid();       
897        iBuilder->restoreIP(ip);
898        iBuilder->CreateCall(mCurrentMethod, {self, remainingItemCount});
899        mCurrentMethod = cp;
900        setInstance(self);
901    }
902
903}
904
905//  The default finalBlock method simply dispatches to the doBlock routine.
906void BlockOrientedKernel::generateFinalBlockMethod(Value * /* remainingItems */) {
907    CreateDoBlockMethodCall();
908}
909
910bool BlockOrientedKernel::useIndirectBr() const {
911    return iBuilder->supportsIndirectBr();
912}
913
914void BlockOrientedKernel::CreateDoBlockMethodCall() {
915    if (useIndirectBr()) {
916        BasicBlock * bb = CreateBasicBlock("resume");
917        mStrideLoopBranch->addDestination(bb);
918        mStrideLoopTarget->addIncoming(BlockAddress::get(bb), iBuilder->GetInsertBlock());
919        iBuilder->CreateBr(mStrideLoopBody);
920        bb->moveAfter(iBuilder->GetInsertBlock());
921        iBuilder->SetInsertPoint(bb);
922    } else {
923        iBuilder->CreateCall(mDoBlockMethod, getInstance());
924    }
925}
926
927void Kernel::finalizeInstance() {
928    assert ("KernelBuilder does not have a valid IDISA Builder" && iBuilder);
929    mOutputScalarResult = iBuilder->CreateCall(getTerminateFunction(iBuilder->getModule()), { getInstance() });
930}
931
932Kernel::StreamPort Kernel::getStreamPort(const std::string & name) const {
933    const auto f = mStreamMap.find(name);
934    if (LLVM_UNLIKELY(f == mStreamMap.end())) {
935        report_fatal_error(getName() + " does not contain stream set " + name);
936    }
937    return f->second;
938}
939
940   
941void MultiBlockKernel::generateDoSegmentMethod() {
942   
943    // First prepare the multi-block method that will be used.
944   
945    std::vector<Type *> multiBlockParmTypes;
946    multiBlockParmTypes.push_back(mKernelStateType->getPointerTo());
947    for (auto buffer : mStreamSetInputBuffers) {
948        multiBlockParmTypes.push_back(buffer->getPointerType());
949    }
950    for (auto buffer : mStreamSetOutputBuffers) {
951        multiBlockParmTypes.push_back(buffer->getPointerType());
952    }
953    FunctionType * const type = FunctionType::get(iBuilder->getVoidTy(), multiBlockParmTypes, false);
954    Function * multiBlockFunction = Function::Create(type, GlobalValue::InternalLinkage, getName() + MULTI_BLOCK_SUFFIX, iBuilder->getModule());
955    multiBlockFunction->setCallingConv(CallingConv::C);
956    multiBlockFunction->setDoesNotThrow();
957    auto args = multiBlockFunction->arg_begin();
958    args->setName("self");
959    for (auto binding : mStreamSetInputs) {
960        (++args)->setName(binding.name + "BufPtr");
961    }
962    for (auto binding : mStreamSetOutputs) {
963        (args++)->setName(binding.name + "BufPtr");
964    }
965   
966    // Now use the generateMultiBlockLogic method of the MultiBlockKernelBuilder subtype to
967    // provide the required multi-block kernel logic.
968    auto ip = iBuilder->saveIP();
969    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "multiBlockEntry", multiBlockFunction, 0));
970    generateMultiBlockLogic(); 
971    iBuilder->CreateRetVoid();
972    iBuilder->restoreIP(ip);
973   
974    // Now proceed with creation of the doSegment method.
975   
976    BasicBlock * const entry = iBuilder->GetInsertBlock();
977    BasicBlock * const doSegmentOuterLoop = CreateBasicBlock(getName() + "_doSegmentOuterLoop");
978    BasicBlock * const doMultiBlockCall = CreateBasicBlock(getName() + "_doMultiBlockCall");
979    BasicBlock * const finalBlockCheck = CreateBasicBlock(getName() + "_finalBlockCheck");
980    BasicBlock * const doTempBufferBlock = CreateBasicBlock(getName() + "_doTempBufferBlock");
981    BasicBlock * const segmentDone = CreateBasicBlock(getName() + "_segmentDone");
982   
983    Value * blockBaseMask = iBuilder->CreateNot(iBuilder->getSize(iBuilder->getBitBlockWidth() - 1));
984   
985    //
986    //  A. Temporary Buffer Area Determination
987    //
988    // For final block processing and for processing near the end of physical buffer
989    // boundaries, we need to allocate temporary space for processing a full block of input.
990    // Compute the size requirements to store stream set data at the declared processing
991    // rates in reference to one block of the principal input stream. 
992    //
993
994    unsigned bitBlockWidth = iBuilder->getBitBlockWidth();
995    std::vector<Type *> tempBuffers;
996    std::vector<unsigned> itemsPerPrincipalBlock;
997    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
998        auto & rate = mStreamSetInputs[i].rate;
999        std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
1000        if (refSet.empty()) {
1001            itemsPerPrincipalBlock.push_back(rate.calculateRatio(bitBlockWidth));
1002        }
1003        else {
1004            Port port; unsigned ssIdx;
1005            std::tie(port, ssIdx) = getStreamPort(mStreamSetInputs[i].name);
1006            assert (port == Port::Input && ssIdx < i);
1007            itemsPerPrincipalBlock.push_back(rate.calculateRatio(itemsPerPrincipalBlock[ssIdx]));
1008        }
1009        unsigned blocks = (itemsPerPrincipalBlock.back() + bitBlockWidth - 1)/bitBlockWidth;
1010        if (blocks > 1) {
1011            tempBuffers.push_back(ArrayType::get(mStreamSetInputBuffers[i]->getType(), blocks));
1012        }
1013        else {
1014            tempBuffers.push_back(mStreamSetInputBuffers[i]->getType());
1015        }
1016    }
1017    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
1018        auto & rate = mStreamSetOutputs[i].rate;
1019        std::string refSet = mStreamSetOutputs[i].rate.referenceStreamSet();
1020        if (refSet.empty()) {
1021            itemsPerPrincipalBlock.push_back(rate.calculateRatio(bitBlockWidth));
1022        }
1023        else {
1024            Port port; unsigned ssIdx;
1025            std::tie(port, ssIdx) = getStreamPort(mStreamSetOutputs[i].name);
1026            if (port == Port::Output) ssIdx += mStreamSetInputs.size();
1027            itemsPerPrincipalBlock.push_back(rate.calculateRatio(itemsPerPrincipalBlock[ssIdx]));
1028        }
1029        unsigned blocks = (itemsPerPrincipalBlock.back() + bitBlockWidth - 1)/bitBlockWidth;
1030        if (blocks > 1) {
1031            tempBuffers.push_back(ArrayType::get(mStreamSetOutputBuffers[i]->getType(), blocks));
1032        }
1033        else {
1034            tempBuffers.push_back(mStreamSetOutputBuffers[i]->getType());
1035        }
1036    }
1037    Type * tempParameterStructType = StructType::create(iBuilder->getContext(), tempBuffers);
1038    Value * tempParameterArea = iBuilder->CreateCacheAlignedAlloca(tempParameterStructType);
1039   
1040    ConstantInt * blockSize = iBuilder->getSize(iBuilder->getBitBlockWidth());
1041    Value * availablePos = mAvailableItemCount[0];
1042    Value * itemsAvail = availablePos;
1043    //  Make sure that corresponding data is available depending on processing rate
1044    //  for all input stream sets.
1045    for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
1046        Value * a = mAvailableItemCount[i];
1047        auto & rate = mStreamSetInputs[i].rate;
1048        assert (((rate.referenceStreamSet() == "") || (rate.referenceStreamSet() == mStreamSetInputs[0].name)) && "Multiblock kernel input rate not with respect to principal stream.");
1049        Value * maxItems = rate.CreateMaxReferenceItemsCalculation(iBuilder, a);
1050        itemsAvail = iBuilder->CreateSelect(iBuilder->CreateICmpULT(itemsAvail, maxItems), itemsAvail, maxItems);
1051    }
1052   
1053    Value * processed = getProcessedItemCount(mStreamSetInputs[0].name);
1054    Value * itemsToDo = iBuilder->CreateSub(itemsAvail, processed);
1055    Value * fullBlocksToDo = iBuilder->CreateUDiv(itemsToDo, blockSize);
1056    Value * excessItems = iBuilder->CreateURem(itemsToDo, blockSize);
1057   
1058    //  Now we iteratively process these blocks using the doMultiBlock method. 
1059    //  In each iteration, we process the maximum number of linearly accessible
1060    //  blocks on the principal input, reduced to ensure that the corresponding
1061    //  data is linearly available at the specified processing rates for the other inputs,
1062    //  and that each of the output buffers has sufficient linearly available space
1063    //  (using overflow areas, if necessary) for the maximum output that can be
1064    //  produced.
1065   
1066    //iBuilder->CreateCondBr(iBuilder->CreateICmpUGT(fullBlocksToDo, iBuilder->getSize(0)), doSegmentOuterLoop, finalBlockCheck);
1067    iBuilder->CreateBr(doSegmentOuterLoop);
1068   
1069    iBuilder->SetInsertPoint(doSegmentOuterLoop);
1070    PHINode * const blocksRemaining = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2, "blocksRemaining");
1071    blocksRemaining->addIncoming(fullBlocksToDo, entry);
1072   
1073   
1074    // For each input buffer, determine the processedItemCount, the block pointer for the
1075    // buffer block containing the next item, and the number of linearly available items.
1076    //
1077    std::vector<Value *> processedItemCount;
1078    std::vector<Value *> inputBlockPtr;
1079    std::vector<Value *> producedItemCount;
1080    std::vector<Value *> outputBlockPtr;
1081   
1082    //  Calculate linearly available blocks for all input stream sets.
1083    Value * linearlyAvailBlocks = nullptr;
1084    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
1085        Value * p = getProcessedItemCount(mStreamSetInputs[i].name);
1086        Value * blkNo = iBuilder->CreateUDiv(p, blockSize);
1087        Value * b = getInputStreamBlockPtr(mStreamSetInputs[i].name, iBuilder->getInt32(0));
1088        processedItemCount.push_back(p);
1089        inputBlockPtr.push_back(b);
1090        auto & rate = mStreamSetInputs[i].rate;
1091        Value * blocks = nullptr;
1092        if ((rate.isFixedRatio()) && (rate.getRatioNumerator() == rate.getRatioDenominator()) && (rate.referenceStreamSet() == "")) {
1093            blocks = mStreamSetInputBuffers[i]->getLinearlyAccessibleBlocks(iBuilder, blkNo);
1094        }
1095        else {
1096            Value * linearlyAvailItems = mStreamSetInputBuffers[i]->getLinearlyAccessibleItems(iBuilder, p);
1097            Value * items = rate.CreateMaxReferenceItemsCalculation(iBuilder, linearlyAvailItems);
1098            blocks = iBuilder->CreateUDiv(items, blockSize);
1099        }
1100        if (i == 0) {
1101            linearlyAvailBlocks = blocks;
1102        }
1103        else {
1104            linearlyAvailBlocks = iBuilder->CreateSelect(iBuilder->CreateICmpULT(blocks, linearlyAvailBlocks), blocks, linearlyAvailBlocks);
1105        }
1106    }
1107   
1108    //  Now determine the linearly writeable blocks, based on available blocks reduced
1109    //  by limitations of output buffer space.
1110    Value * linearlyWritableBlocks = linearlyAvailBlocks;
1111   
1112    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
1113        Value * p = getProducedItemCount(mStreamSetOutputs[i].name);
1114        Value * blkNo = iBuilder->CreateUDiv(p, blockSize);
1115        Value * b = getOutputStreamBlockPtr(mStreamSetOutputs[i].name, iBuilder->getInt32(0));
1116        producedItemCount.push_back(p);
1117        outputBlockPtr.push_back(b);
1118        auto & rate = mStreamSetOutputs[i].rate;
1119        Value * blocks = nullptr;
1120        if ((rate.isFixedRatio()) && (rate.getRatioNumerator() == rate.getRatioDenominator())) {
1121            blocks = mStreamSetOutputBuffers[0]->getLinearlyWritableBlocks(iBuilder, blkNo);
1122        }
1123        else {
1124            Value * writableItems = mStreamSetOutputBuffers[0]->getLinearlyWritableItems(iBuilder, p);
1125            blocks = iBuilder->CreateUDiv(writableItems, blockSize);
1126        }
1127        linearlyWritableBlocks = iBuilder->CreateSelect(iBuilder->CreateICmpULT(blocks, linearlyWritableBlocks), blocks, linearlyWritableBlocks);
1128    }
1129    Value * haveBlocks = iBuilder->CreateICmpUGT(linearlyWritableBlocks, iBuilder->getSize(0));
1130   
1131    iBuilder->CreateCondBr(haveBlocks, doMultiBlockCall, doTempBufferBlock);
1132   
1133    //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
1134    //  Now prepare the doMultiBlock call.
1135    iBuilder->SetInsertPoint(doMultiBlockCall);
1136   
1137    Value * linearlyAvailItems = iBuilder->CreateMul(linearlyWritableBlocks, blockSize);
1138   
1139    std::vector<Value *> doMultiBlockArgs;
1140    doMultiBlockArgs.push_back(linearlyAvailItems);
1141    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
1142        doMultiBlockArgs.push_back(getRawInputPointer(mStreamSetInputs[i].name, iBuilder->getInt32(0), processedItemCount[i]));
1143    }
1144    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
1145        doMultiBlockArgs.push_back(getRawOutputPointer(mStreamSetOutputs[i].name, iBuilder->getInt32(0), producedItemCount[i]));
1146    }
1147       
1148    iBuilder->CreateCall(multiBlockFunction, doMultiBlockArgs);
1149   
1150    // Do copybacks if necessary.
1151    unsigned priorIdx = 0;
1152    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
1153        Value * log2BlockSize = iBuilder->getSize(std::log2(iBuilder->getBitBlockWidth()));
1154        if (auto cb = dyn_cast<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
1155            BasicBlock * copyBack = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
1156            BasicBlock * done = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
1157            Value * newlyProduced = iBuilder->CreateSub(getProducedItemCount(mStreamSetOutputs[i].name), producedItemCount[i]);
1158            Value * priorBlock = iBuilder->CreateLShr(producedItemCount[i], log2BlockSize);
1159            Value * priorOffset = iBuilder->CreateAnd(producedItemCount[i], iBuilder->getSize(iBuilder->getBitBlockWidth() - 1));
1160            Value * instance = getStreamSetBufferPtr(mStreamSetOutputs[i].name);
1161            Value * accessibleBlocks = cb->getLinearlyAccessibleBlocks(iBuilder, priorBlock);
1162            Value * accessible = iBuilder->CreateSub(iBuilder->CreateShl(accessibleBlocks, log2BlockSize), priorOffset);
1163            Value * wraparound = iBuilder->CreateICmpULT(accessible, newlyProduced);
1164            iBuilder->CreateCondBr(wraparound, copyBack, done);
1165            iBuilder->SetInsertPoint(copyBack);
1166            Value * copyItems = iBuilder->CreateSub(newlyProduced, accessible);
1167            cb->createCopyBack(iBuilder, instance, copyItems);
1168            iBuilder->CreateBr(done);
1169            iBuilder->SetInsertPoint(done);
1170            priorIdx++;
1171        }
1172        if (auto cb = dyn_cast<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
1173            BasicBlock * copyBack = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
1174            BasicBlock * done = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
1175            Value * instance = getStreamSetBufferPtr(mStreamSetOutputs[i].name);
1176            Value * newlyProduced = iBuilder->CreateSub(getProducedItemCount(mStreamSetOutputs[i].name), producedItemCount[i]);
1177            Value * accessible = cb->getLinearlyAccessibleItems(iBuilder, producedItemCount[i]);
1178            Value * wraparound = iBuilder->CreateICmpULT(accessible, newlyProduced);
1179            iBuilder->CreateCondBr(wraparound, copyBack, done);
1180            iBuilder->SetInsertPoint(copyBack);
1181            Value * copyItems = iBuilder->CreateSub(newlyProduced, accessible);
1182            cb->createCopyBack(iBuilder, instance, copyItems);
1183            iBuilder->CreateBr(done);
1184            iBuilder->SetInsertPoint(done);
1185            priorIdx++;
1186        }
1187    }
1188    setProcessedItemCount(mStreamSetInputs[0].name, iBuilder->CreateAdd(processed, linearlyAvailItems));
1189    Value * reducedBlocksToDo = iBuilder->CreateSub(blocksRemaining, linearlyWritableBlocks);
1190    Value * fullBlocksRemain = iBuilder->CreateICmpUGT(reducedBlocksToDo, iBuilder->getSize(0));
1191    BasicBlock * multiBlockFinal = iBuilder->GetInsertBlock();
1192    blocksRemaining->addIncoming(reducedBlocksToDo, multiBlockFinal);
1193    iBuilder->CreateCondBr(fullBlocksRemain, doSegmentOuterLoop, finalBlockCheck);
1194   
1195    // All the full blocks of input have been processed.  If mIsFinal is true,
1196    // we should process the remaining partial block (i.e., excessItems as determined at entry).
1197    iBuilder->SetInsertPoint(finalBlockCheck);
1198    iBuilder->CreateCondBr(mIsFinal, doTempBufferBlock, segmentDone);
1199   
1200    // 
1201    // We use temporary buffers in 3 different cases that preclude full block processing.
1202    // (a) One or more input buffers does not have a sufficient number of input items linearly available.
1203    // (b) One or more output buffers does not have sufficient linearly available buffer space.
1204    // (c) We have processed all the full blocks of input and only the excessItems remain.
1205    // In each case we set up temporary buffers for input and output and then
1206    // call the Multiblock routine.
1207    //
1208    iBuilder->SetInsertPoint(doTempBufferBlock);
1209    PHINode * const tempBlockItems = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2, "tempBlockItems");
1210    tempBlockItems->addIncoming(blockSize, doSegmentOuterLoop);
1211    tempBlockItems->addIncoming(excessItems, finalBlockCheck);
1212   
1213    // Will this be the final block processing?
1214    Value * doFinal = iBuilder->CreateICmpULT(tempBlockItems, blockSize);
1215   
1216    // Begin constructing the doMultiBlock args.
1217    std::vector<Value *> tempArgs;
1218    tempArgs.push_back(tempBlockItems);
1219   
1220    // Prepare the temporary buffer area.
1221    //
1222    // First zero it out.
1223    Constant * const tempAreaSize = ConstantExpr::getIntegerCast(ConstantExpr::getSizeOf(tempParameterStructType), iBuilder->getSizeTy(), false);
1224    iBuilder->CreateMemZero(tempParameterArea, tempAreaSize);
1225   
1226    // For each input and output buffer, copy over necessary data starting from the last
1227    // block boundary.
1228    std::vector<Value *> finalItemPos;
1229    finalItemPos.push_back(iBuilder->CreateAdd(processedItemCount[0], tempBlockItems));
1230
1231    for (unsigned i = 0; i < mStreamSetInputBuffers.size(); i++) {
1232        Value * tempBufPtr = iBuilder->CreateGEP(tempParameterArea, iBuilder->getInt32(i));
1233        tempBufPtr = iBuilder->CreatePointerCast(tempBufPtr, mStreamSetInputBuffers[i]->getPointerType());
1234       
1235        auto & rate = mStreamSetInputs[i].rate;
1236        Value * blockItemPos = iBuilder->CreateAnd(processedItemCount[i], blockBaseMask);
1237       
1238        // The number of items to copy is determined by the processing rate requirements.
1239        if (i > 1) {
1240            std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
1241            if (refSet.empty()) {
1242                finalItemPos.push_back(rate.CreateRatioCalculation(iBuilder, finalItemPos[0], doFinal));
1243            }
1244            else {
1245                Port port; unsigned ssIdx;
1246                std::tie(port, ssIdx) = getStreamPort(mStreamSetInputs[i].name);
1247                assert (port == Port::Input && ssIdx < i);
1248                finalItemPos.push_back(rate.CreateRatioCalculation(iBuilder, finalItemPos[ssIdx], doFinal));
1249            }
1250        }
1251        Value * neededItems = iBuilder->CreateSub(finalItemPos[i], blockItemPos);
1252        Value * availFromBase = mStreamSetInputBuffers[i]->getLinearlyAccessibleItems(iBuilder, blockItemPos);
1253        Value * copyItems1 = iBuilder->CreateSelect(iBuilder->CreateICmpULT(neededItems, availFromBase), neededItems, availFromBase);
1254        Value * copyItems2 = iBuilder->CreateSub(neededItems, copyItems1);
1255        mStreamSetInputBuffers[i]->createBlockAlignedCopy(iBuilder, tempBufPtr, inputBlockPtr[i], copyItems1);
1256        Value * nextBufPtr = iBuilder->CreateGEP(tempBufPtr, iBuilder->CreateUDiv(availFromBase, blockSize));
1257        mStreamSetInputBuffers[i]->createBlockAlignedCopy(iBuilder, nextBufPtr, getStreamSetBufferPtr(mStreamSetInputs[i].name), copyItems2);
1258        Value * itemAddress = iBuilder->CreatePtrToInt(getRawOutputPointer(mStreamSetInputs[i].name, iBuilder->getInt32(0), processedItemCount[i]), iBuilder->getSizeTy());
1259        Value * baseAddress = iBuilder->CreatePtrToInt(inputBlockPtr[i], iBuilder->getSizeTy());
1260        Value * tempAddress = iBuilder->CreateAdd(iBuilder->CreatePtrToInt(tempBufPtr, iBuilder->getSizeTy()), iBuilder->CreateSub(itemAddress, baseAddress));
1261        tempArgs.push_back(iBuilder->CreateBitCast(tempAddress, mStreamSetInputBuffers[i]->getPointerType()));
1262    }
1263
1264    std::vector<Value *> blockItemPos;
1265    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
1266        Value * tempBufPtr = iBuilder->CreateGEP(tempParameterArea, iBuilder->getInt32(mStreamSetInputs.size() + i));
1267        tempBufPtr = iBuilder->CreatePointerCast(tempBufPtr, mStreamSetOutputBuffers[i]->getPointerType());
1268        blockItemPos.push_back(iBuilder->CreateAnd(producedItemCount[i], blockBaseMask));
1269        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(iBuilder, tempBufPtr, outputBlockPtr[i], iBuilder->CreateSub(producedItemCount[i], blockItemPos[i]));
1270        Value * itemAddress = iBuilder->CreatePtrToInt(getRawOutputPointer(mStreamSetInputs[i].name, iBuilder->getInt32(0), producedItemCount[i]), iBuilder->getSizeTy());
1271        Value * baseAddress = iBuilder->CreatePtrToInt(outputBlockPtr[i], iBuilder->getSizeTy());
1272        Value * tempAddress = iBuilder->CreateAdd(iBuilder->CreatePtrToInt(tempBufPtr, iBuilder->getSizeTy()), iBuilder->CreateSub(itemAddress, baseAddress));
1273        tempArgs.push_back(iBuilder->CreateBitCast(tempAddress, mStreamSetOutputBuffers[i]->getPointerType()));
1274    }
1275
1276    iBuilder->CreateCall(multiBlockFunction, tempArgs);
1277
1278    // Copy back data to the actual output buffers.
1279   
1280    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
1281        Value * tempBufPtr = iBuilder->CreateGEP(tempParameterArea, iBuilder->getInt32(mStreamSetInputs.size() + i));
1282        tempBufPtr = iBuilder->CreatePointerCast(tempBufPtr, mStreamSetOutputBuffers[i]->getPointerType());
1283        Value * final_items = getProducedItemCount(mStreamSetOutputs[i].name);
1284        Value * copyItems = iBuilder->CreateSub(final_items, blockItemPos[i]);
1285        Value * copyItems1 = mStreamSetOutputBuffers[i]->getLinearlyWritableItems(iBuilder, blockItemPos[i]); // must be a whole number of blocks.
1286        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(iBuilder, outputBlockPtr[i], tempBufPtr, copyItems1);
1287        Value * copyItems2 = iBuilder->CreateSelect(iBuilder->CreateICmpULT(copyItems, copyItems), iBuilder->getSize(0), iBuilder->CreateSub(copyItems, copyItems1));
1288        tempBufPtr = iBuilder->CreateGEP(tempBufPtr, iBuilder->CreateUDiv(copyItems1, blockSize));
1289        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(iBuilder, getStreamSetBufferPtr(mStreamSetOutputs[i].name), tempBufPtr, copyItems2);
1290    }
1291
1292    setProcessedItemCount(mStreamSetInputs[0].name, finalItemPos[0]);
1293
1294    //  We've dealt with the partial block processing and copied information back into the
1295    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
1296    //
1297    iBuilder->CreateCondBr(doFinal, segmentDone, doSegmentOuterLoop);
1298    iBuilder->SetInsertPoint(segmentDone);
1299}
1300                                                           
1301// CONSTRUCTOR
1302Kernel::Kernel(std::string && kernelName,
1303                             std::vector<Binding> && stream_inputs,
1304                             std::vector<Binding> && stream_outputs,
1305                             std::vector<Binding> && scalar_parameters,
1306                             std::vector<Binding> && scalar_outputs,
1307                             std::vector<Binding> && internal_scalars)
1308: KernelInterface(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
1309, mCurrentMethod(nullptr)
1310, mNoTerminateAttribute(false)
1311, mIsGenerated(false)
1312, mIsFinal(nullptr)
1313, mOutputScalarResult(nullptr) {
1314
1315}
1316
1317Kernel::~Kernel() {
1318
1319}
1320
1321// CONSTRUCTOR
1322BlockOrientedKernel::BlockOrientedKernel(std::string && kernelName,
1323                                         std::vector<Binding> && stream_inputs,
1324                                         std::vector<Binding> && stream_outputs,
1325                                         std::vector<Binding> && scalar_parameters,
1326                                         std::vector<Binding> && scalar_outputs,
1327                                         std::vector<Binding> && internal_scalars)
1328: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
1329, mDoBlockMethod(nullptr)
1330, mStrideLoopBody(nullptr)
1331, mStrideLoopBranch(nullptr)
1332, mStrideLoopTarget(nullptr) {
1333
1334}
1335
1336// CONSTRUCTOR
1337SegmentOrientedKernel::SegmentOrientedKernel(std::string && kernelName,
1338                                             std::vector<Binding> && stream_inputs,
1339                                             std::vector<Binding> && stream_outputs,
1340                                             std::vector<Binding> && scalar_parameters,
1341                                             std::vector<Binding> && scalar_outputs,
1342                                             std::vector<Binding> && internal_scalars)
1343: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
1344
1345}
1346
1347// CONSTRUCTOR
1348MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
1349                                   std::vector<Binding> && stream_inputs,
1350                                   std::vector<Binding> && stream_outputs,
1351                                   std::vector<Binding> && scalar_parameters,
1352                                   std::vector<Binding> && scalar_outputs,
1353                                   std::vector<Binding> && internal_scalars)
1354: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
1355   
1356}
1357}
Note: See TracBrowser for help on using the repository browser.