source: icGREP/icgrep-devel/icgrep/kernels/kernel.cpp @ 5446

Last change on this file since 5446 was 5446, checked in by nmedfort, 2 years ago

Refactoring work + correction for getRawItemPointer

File size: 50.5 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "kernel.h"
7#include <toolchain/toolchain.h>
8#include <kernels/streamset.h>
9#include <llvm/IR/Constants.h>
10#include <llvm/IR/Function.h>
11#include <llvm/IR/Instructions.h>
12#include <llvm/IR/MDBuilder.h>
13#include <llvm/IR/Module.h>
14#include <llvm/Support/raw_ostream.h>
15#include <llvm/Bitcode/ReaderWriter.h>
16#include <llvm/Transforms/Utils/Local.h>
17#include <kernels/streamset.h>
18#include <sstream>
19#include <kernels/kernel_builder.h>
20
21using namespace llvm;
22using namespace parabix;
23
24namespace kernel {
25
26const std::string Kernel::DO_BLOCK_SUFFIX = "_DoBlock";
27const std::string Kernel::FINAL_BLOCK_SUFFIX = "_FinalBlock";
28const std::string Kernel::MULTI_BLOCK_SUFFIX = "_MultiBlock";
29const std::string Kernel::LOGICAL_SEGMENT_NO_SCALAR = "logicalSegNo";
30const std::string Kernel::PROCESSED_ITEM_COUNT_SUFFIX = "_processedItemCount";
31const std::string Kernel::CONSUMED_ITEM_COUNT_SUFFIX = "_consumedItemCount";
32const std::string Kernel::PRODUCED_ITEM_COUNT_SUFFIX = "_producedItemCount";
33const std::string Kernel::TERMINATION_SIGNAL = "terminationSignal";
34const std::string Kernel::BUFFER_PTR_SUFFIX = "_bufferPtr";
35const std::string Kernel::CONSUMER_SUFFIX = "_consumerLocks";
36
37unsigned Kernel::addScalar(Type * const type, const std::string & name) {
38    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
39        report_fatal_error("Cannot add field " + name + " to " + getName() + " after kernel state finalized");
40    }
41    if (LLVM_UNLIKELY(mKernelMap.count(name))) {
42        report_fatal_error(getName() + " already contains scalar field " + name);
43    }
44    const auto index = mKernelFields.size();
45    mKernelMap.emplace(name, index);
46    mKernelFields.push_back(type);
47    return index;
48}
49
50unsigned Kernel::addUnnamedScalar(Type * const type) {
51    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
52        report_fatal_error("Cannot add unnamed field  to " + getName() + " after kernel state finalized");
53    }
54    const auto index = mKernelFields.size();
55    mKernelFields.push_back(type);
56    return index;
57}
58
59void Kernel::prepareStreamSetNameMap() {
60    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
61        mStreamMap.emplace(mStreamSetInputs[i].name, std::make_pair(Port::Input, i));
62    }
63    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
64        mStreamMap.emplace(mStreamSetOutputs[i].name, std::make_pair(Port::Output, i));
65    }
66}
67
68void Kernel::bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs) {
69    assert (mModule == nullptr);
70    assert (mStreamSetInputBuffers.empty());
71    assert (mStreamSetOutputBuffers.empty());
72
73    if (LLVM_UNLIKELY(mStreamSetInputs.size() != inputs.size())) {
74        report_fatal_error(getName() + ": expected " + std::to_string(mStreamSetInputs.size()) +
75                           " input stream sets but was given "
76                           + std::to_string(inputs.size()));
77    }
78
79    for (unsigned i = 0; i < inputs.size(); ++i) {
80        StreamSetBuffer * const buf = inputs[i];
81        if (LLVM_UNLIKELY(buf == nullptr)) {
82            report_fatal_error(getName() + ": input stream set " + std::to_string(i)
83                               + " cannot be null");
84        }
85        buf->addConsumer(this);
86    }
87
88    if (LLVM_UNLIKELY(mStreamSetOutputs.size() != outputs.size())) {
89        report_fatal_error(getName() + ": expected " + std::to_string(mStreamSetOutputs.size())
90                           + " output stream sets but was given "
91                           + std::to_string(outputs.size()));
92    }
93
94    for (unsigned i = 0; i < outputs.size(); ++i) {
95        StreamSetBuffer * const buf = outputs[i];
96        if (LLVM_UNLIKELY(buf == nullptr)) {
97            report_fatal_error(getName() + ": output stream set " + std::to_string(i) + " cannot be null");
98        }
99        if (LLVM_LIKELY(buf->getProducer() == nullptr)) {
100            buf->setProducer(this);
101        } else {
102            report_fatal_error(getName() + ": output stream set " + std::to_string(i)
103                               + " is already produced by kernel " + buf->getProducer()->getName());
104        }
105    }
106
107    mStreamSetInputBuffers.assign(inputs.begin(), inputs.end());
108    mStreamSetOutputBuffers.assign(outputs.begin(), outputs.end());
109}
110
111Module * Kernel::makeModule(const std::unique_ptr<KernelBuilder> & idb) {
112    assert (mModule == nullptr);
113    std::stringstream cacheName;   
114    cacheName << getName() << '_' << idb->getBuilderUniqueName();
115    for (const StreamSetBuffer * b: mStreamSetInputBuffers) {
116        cacheName <<  ':' <<  b->getUniqueID();
117    }
118    for (const StreamSetBuffer * b: mStreamSetOutputBuffers) {
119        cacheName <<  ':' <<  b->getUniqueID();
120    }
121    mModule = new Module(cacheName.str(), idb->getContext());
122    prepareKernel(idb);
123    return mModule;
124}
125
126Module * Kernel::setModule(const std::unique_ptr<KernelBuilder> & idb, llvm::Module * const module) {
127    assert (mModule == nullptr);
128    mModule = module;
129    prepareKernel(idb);
130    return mModule;
131}
132
133void Kernel::prepareKernel(const std::unique_ptr<KernelBuilder> & idb) {
134    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
135    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
136        report_fatal_error("Cannot prepare kernel after kernel state finalized");
137    }
138    const auto blockSize = idb->getBitBlockWidth();
139    const auto requiredBlocks = codegen::SegmentSize + ((blockSize + mLookAheadPositions - 1) / blockSize);
140
141    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
142        if ((mStreamSetInputBuffers[i]->getBufferBlocks() > 0) && (mStreamSetInputBuffers[i]->getBufferBlocks() < requiredBlocks)) {
143            report_fatal_error(getName() + ": " + mStreamSetInputs[i].name + " requires buffer size " + std::to_string(requiredBlocks));
144        }
145        mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getPointerType(), mStreamSetInputs[i].name + BUFFER_PTR_SUFFIX);
146        if ((i == 0) || !mStreamSetInputs[i].rate.isExact()) {
147            addScalar(idb->getSizeTy(), mStreamSetInputs[i].name + PROCESSED_ITEM_COUNT_SUFFIX);
148        }
149    }
150
151    IntegerType * const sizeTy = idb->getSizeTy();
152    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
153        mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getPointerType(), mStreamSetOutputs[i].name + BUFFER_PTR_SUFFIX);
154        if ((mStreamSetInputs.empty() && (i == 0)) || !mStreamSetOutputs[i].rate.isExact()) {
155            addScalar(sizeTy, mStreamSetOutputs[i].name + PRODUCED_ITEM_COUNT_SUFFIX);
156        }
157    }
158    for (const auto & binding : mScalarInputs) {
159        addScalar(binding.type, binding.name);
160    }
161    for (const auto & binding : mScalarOutputs) {
162        addScalar(binding.type, binding.name);
163    }
164    if (mStreamMap.empty()) {
165        prepareStreamSetNameMap();
166    }
167    for (const auto & binding : mInternalScalars) {
168        addScalar(binding.type, binding.name);
169    }
170
171    Type * const consumerSetTy = StructType::get(sizeTy, sizeTy->getPointerTo()->getPointerTo(), nullptr)->getPointerTo();
172    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
173        addScalar(consumerSetTy, mStreamSetOutputs[i].name + CONSUMER_SUFFIX);
174    }
175
176    addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
177    addScalar(idb->getInt1Ty(), TERMINATION_SIGNAL);
178
179    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
180        addScalar(sizeTy, mStreamSetOutputs[i].name + CONSUMED_ITEM_COUNT_SUFFIX);
181    }
182
183    mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
184}
185
186// Default kernel signature: generate the IR and emit as byte code.
187std::string Kernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> & idb) {
188    assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
189    if (LLVM_LIKELY(moduleIDisSignature())) {
190        return getModule()->getModuleIdentifier();
191    } else {
192        generateKernel(idb);
193        std::string signature;
194        raw_string_ostream OS(signature);
195        WriteBitcodeToFile(getModule(), OS);
196        return signature;
197    }
198}
199
200void Kernel::generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb) {
201    assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
202    // If the module id cannot uniquely identify this kernel, "generateKernelSignature()" will have already
203    // generated the unoptimized IR.
204    if (!mIsGenerated) {
205        const auto m = idb->getModule();
206        const auto ip = idb->saveIP();
207        const auto saveInstance = getInstance();
208        idb->setModule(mModule);
209        addKernelDeclarations(idb);
210        callGenerateInitializeMethod(idb);
211        callGenerateDoSegmentMethod(idb);
212        callGenerateFinalizeMethod(idb);
213        setInstance(saveInstance);
214        idb->setModule(m);
215        idb->restoreIP(ip);
216        mIsGenerated = true;
217    }
218}
219
220inline void Kernel::callGenerateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
221    mCurrentMethod = getInitFunction(idb->getModule());
222    idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
223    Function::arg_iterator args = mCurrentMethod->arg_begin();
224    setInstance(&*(args++));
225    idb->CreateStore(ConstantAggregateZero::get(mKernelStateType), getInstance());
226    for (const auto & binding : mScalarInputs) {
227        idb->setScalarField(binding.name, &*(args++));
228    }
229    for (const auto & binding : mStreamSetOutputs) {
230        idb->setConsumerLock(binding.name, &*(args++));
231    }
232    generateInitializeMethod(idb);
233    idb->CreateRetVoid();
234}
235
236inline void Kernel::callGenerateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
237    mCurrentMethod = getDoSegmentFunction(idb->getModule());
238    idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
239    auto args = mCurrentMethod->arg_begin();
240    setInstance(&*(args++));
241    mIsFinal = &*(args++);
242    const auto n = mStreamSetInputs.size();
243    mAvailableItemCount.resize(n, nullptr);
244    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
245        mAvailableItemCount[i] = &*(args++);
246    }
247    generateDoSegmentMethod(idb); // must be overridden by the KernelBuilder subtype
248    mIsFinal = nullptr;
249    mAvailableItemCount.clear();
250    idb->CreateRetVoid();
251}
252
253inline void Kernel::callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb) {
254    mCurrentMethod = getTerminateFunction(idb->getModule());
255    idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
256    auto args = mCurrentMethod->arg_begin();
257    setInstance(&*(args++));
258    generateFinalizeMethod(idb); // may be overridden by the KernelBuilder subtype
259    const auto n = mScalarOutputs.size();
260    if (n == 0) {
261        idb->CreateRetVoid();
262    } else {
263        Value * outputs[n];
264        for (unsigned i = 0; i < n; ++i) {
265            outputs[i] = idb->getScalarField(mScalarOutputs[i].name);
266        }
267        if (n == 1) {
268            idb->CreateRet(outputs[0]);
269        } else {
270            idb->CreateAggregateRet(outputs, n);
271        }
272    }
273}
274
275unsigned Kernel::getScalarIndex(const std::string & name) const {
276    const auto f = mKernelMap.find(name);
277    if (LLVM_UNLIKELY(f == mKernelMap.end())) {
278        assert (false);
279        report_fatal_error(getName() + " does not contain scalar: " + name);
280    }
281    return f->second;
282}
283
284Value * Kernel::createInstance(const std::unique_ptr<KernelBuilder> & idb) {
285    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
286    if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
287        report_fatal_error("Cannot instantiate " + getName() + " before calling prepareKernel()");
288    }
289    setInstance(idb->CreateCacheAlignedAlloca(mKernelStateType));
290    return getInstance();
291}
292
293void Kernel::initializeInstance(const std::unique_ptr<KernelBuilder> & idb) {
294    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
295    if (LLVM_UNLIKELY(getInstance() == nullptr)) {
296        report_fatal_error("Cannot initialize " + getName() + " before calling createInstance()");
297    }
298    std::vector<Value *> args;
299    args.reserve(1 + mInitialArguments.size() + mStreamSetInputBuffers.size() + (mStreamSetOutputBuffers.size() * 2));
300    args.push_back(getInstance());
301    for (unsigned i = 0; i < mInitialArguments.size(); ++i) {
302        Value * arg = mInitialArguments[i];
303        if (LLVM_UNLIKELY(arg == nullptr)) {
304            report_fatal_error(getName() + ": initial argument " + std::to_string(i)
305                               + " cannot be null when calling createInstance()");
306        }
307        args.push_back(arg);
308    }
309    for (unsigned i = 0; i < mStreamSetInputBuffers.size(); ++i) {
310        assert (mStreamSetInputBuffers[i]);
311        Value * arg = mStreamSetInputBuffers[i]->getStreamSetBasePtr();
312        if (LLVM_UNLIKELY(arg == nullptr)) {
313            report_fatal_error(getName() + ": input stream set " + std::to_string(i)
314                               + " was not allocated prior to calling createInstance()");
315        }
316        args.push_back(arg);
317    }
318    assert (mStreamSetInputs.size() == mStreamSetInputBuffers.size());
319    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
320        assert (mStreamSetOutputBuffers[i]);
321        Value * arg = mStreamSetOutputBuffers[i]->getStreamSetBasePtr();
322        if (LLVM_UNLIKELY(arg == nullptr)) {
323            report_fatal_error(getName() + ": output stream set " + std::to_string(i)
324                               + " was not allocated prior to calling createInstance()");
325        }
326        args.push_back(arg);
327    }
328    assert (mStreamSetOutputs.size() == mStreamSetOutputBuffers.size());
329    IntegerType * const sizeTy = idb->getSizeTy();
330    PointerType * const sizePtrTy = sizeTy->getPointerTo();
331    PointerType * const sizePtrPtrTy = sizePtrTy->getPointerTo();
332    StructType * const consumerTy = StructType::get(sizeTy, sizePtrPtrTy, nullptr);
333    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
334        const auto output = mStreamSetOutputBuffers[i];
335        const auto & consumers = output->getConsumers();
336        const auto n = consumers.size();
337        AllocaInst * const outputConsumers = idb->CreateAlloca(consumerTy);
338        Value * const consumerSegNoArray = idb->CreateAlloca(ArrayType::get(sizePtrTy, n));
339        for (unsigned i = 0; i < n; ++i) {
340            Kernel * const consumer = consumers[i];
341            assert ("all instances must be created prior to initialization of any instance" && consumer->getInstance());
342            idb->setKernel(consumer);
343            Value * const segmentNoPtr = idb->getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR);
344            idb->CreateStore(segmentNoPtr, idb->CreateGEP(consumerSegNoArray, { idb->getInt32(0), idb->getInt32(i) }));
345        }
346        idb->setKernel(this);
347        Value * const consumerCountPtr = idb->CreateGEP(outputConsumers, {idb->getInt32(0), idb->getInt32(0)});
348        idb->CreateStore(idb->getSize(n), consumerCountPtr);
349        Value * const consumerSegNoArrayPtr = idb->CreateGEP(outputConsumers, {idb->getInt32(0), idb->getInt32(1)});
350        idb->CreateStore(idb->CreatePointerCast(consumerSegNoArray, sizePtrPtrTy), consumerSegNoArrayPtr);
351        args.push_back(outputConsumers);
352    }
353    idb->CreateCall(getInitFunction(idb->getModule()), args);
354}
355
356//  The default doSegment method dispatches to the doBlock routine for
357//  each block of the given number of blocksToDo, and then updates counts.
358
359void BlockOrientedKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & idb) {
360    BasicBlock * const entryBlock = idb->GetInsertBlock();
361    BasicBlock * const strideLoopCond = idb->CreateBasicBlock(getName() + "_strideLoopCond");
362    mStrideLoopBody = idb->CreateBasicBlock(getName() + "_strideLoopBody");
363    BasicBlock * const stridesDone = idb->CreateBasicBlock(getName() + "_stridesDone");
364    BasicBlock * const doFinalBlock = idb->CreateBasicBlock(getName() + "_doFinalBlock");
365    BasicBlock * const segmentDone = idb->CreateBasicBlock(getName() + "_segmentDone");
366
367    Value * baseTarget = nullptr;
368    if (idb->supportsIndirectBr()) {
369        baseTarget = idb->CreateSelect(mIsFinal, BlockAddress::get(doFinalBlock), BlockAddress::get(segmentDone));
370    }
371
372    ConstantInt * stride = idb->getSize(idb->getStride());
373    Value * availablePos = mAvailableItemCount[0];
374    Value * processed = idb->getProcessedItemCount(mStreamSetInputs[0].name);
375    Value * itemsAvail = idb->CreateSub(availablePos, processed);
376    Value * stridesToDo = idb->CreateUDiv(itemsAvail, stride);
377
378    idb->CreateBr(strideLoopCond);
379
380    idb->SetInsertPoint(strideLoopCond);
381
382    PHINode * branchTarget = nullptr;
383    if (idb->supportsIndirectBr()) {
384        branchTarget = idb->CreatePHI(baseTarget->getType(), 2, "branchTarget");
385        branchTarget->addIncoming(baseTarget, entryBlock);
386    }
387
388    PHINode * const stridesRemaining = idb->CreatePHI(idb->getSizeTy(), 2, "stridesRemaining");
389    stridesRemaining->addIncoming(stridesToDo, entryBlock);
390    // NOTE: stridesRemaining may go to a negative number in the final block if the generateFinalBlockMethod(...)
391    // calls CreateDoBlockMethodCall(). Do *not* replace the comparator with an unsigned one!
392    Value * notDone = idb->CreateICmpSGT(stridesRemaining, idb->getSize(0));
393    idb->CreateLikelyCondBr(notDone, mStrideLoopBody, stridesDone);
394
395    idb->SetInsertPoint(mStrideLoopBody);
396
397    if (idb->supportsIndirectBr()) {
398        mStrideLoopTarget = idb->CreatePHI(baseTarget->getType(), 2, "strideTarget");
399        mStrideLoopTarget->addIncoming(branchTarget, strideLoopCond);
400    }
401
402    /// GENERATE DO BLOCK METHOD
403
404    writeDoBlockMethod(idb);
405
406    /// UPDATE PROCESSED COUNTS
407
408    processed = idb->getProcessedItemCount(mStreamSetInputs[0].name);
409    Value * itemsDone = idb->CreateAdd(processed, stride);
410    idb->setProcessedItemCount(mStreamSetInputs[0].name, itemsDone);
411
412    stridesRemaining->addIncoming(idb->CreateSub(stridesRemaining, idb->getSize(1)), idb->GetInsertBlock());
413
414    BasicBlock * bodyEnd = idb->GetInsertBlock();
415    if (idb->supportsIndirectBr()) {
416        branchTarget->addIncoming(mStrideLoopTarget, bodyEnd);
417    }
418    idb->CreateBr(strideLoopCond);
419
420    stridesDone->moveAfter(bodyEnd);
421
422    idb->SetInsertPoint(stridesDone);
423
424    // Now conditionally perform the final block processing depending on the doFinal parameter.
425    if (idb->supportsIndirectBr()) {
426        mStrideLoopBranch = idb->CreateIndirectBr(branchTarget, 3);
427        mStrideLoopBranch->addDestination(doFinalBlock);
428        mStrideLoopBranch->addDestination(segmentDone);
429    } else {
430        idb->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone);
431    }
432
433    doFinalBlock->moveAfter(stridesDone);
434
435    idb->SetInsertPoint(doFinalBlock);
436
437    Value * remainingItems = idb->CreateSub(mAvailableItemCount[0], idb->getProcessedItemCount(mStreamSetInputs[0].name));
438
439    writeFinalBlockMethod(idb, remainingItems);
440
441    itemsDone = mAvailableItemCount[0];
442    idb->setProcessedItemCount(mStreamSetInputs[0].name, itemsDone);
443    idb->setTerminationSignal();
444    idb->CreateBr(segmentDone);
445
446    segmentDone->moveAfter(idb->GetInsertBlock());
447
448    idb->SetInsertPoint(segmentDone);
449
450    // Update the branch prediction metadata to indicate that the likely target will be segmentDone
451    if (idb->supportsIndirectBr()) {
452        MDBuilder mdb(idb->getContext());
453        const auto destinations = mStrideLoopBranch->getNumDestinations();
454        uint32_t weights[destinations];
455        for (unsigned i = 0; i < destinations; ++i) {
456            weights[i] = (mStrideLoopBranch->getDestination(i) == segmentDone) ? 100 : 1;
457        }
458        ArrayRef<uint32_t> bw(weights, destinations);
459        mStrideLoopBranch->setMetadata(LLVMContext::MD_prof, mdb.createBranchWeights(bw));
460    }
461
462}
463
464inline void BlockOrientedKernel::writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb) {
465
466    Value * const self = getInstance();
467    Function * const cp = mCurrentMethod;
468    auto ip = idb->saveIP();
469
470    /// Check if the do block method is called and create the function if necessary   
471    if (!idb->supportsIndirectBr()) {
472        FunctionType * const type = FunctionType::get(idb->getVoidTy(), {self->getType()}, false);
473        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, idb->getModule());
474        mCurrentMethod->setCallingConv(CallingConv::C);
475        mCurrentMethod->setDoesNotThrow();
476        mCurrentMethod->setDoesNotCapture(1);
477        auto args = mCurrentMethod->arg_begin();
478        args->setName("self");
479        setInstance(&*args);
480        idb->SetInsertPoint(idb->CreateBasicBlock("entry"));
481    }
482
483    std::vector<Value *> priorProduced;
484    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
485        if (isa<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]) || isa<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
486            priorProduced.push_back(idb->getProducedItemCount(mStreamSetOutputs[i].name));
487        }
488    }
489
490    generateDoBlockMethod(idb); // must be implemented by the BlockOrientedKernelBuilder subtype
491
492    unsigned priorIdx = 0;
493    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
494        Value * log2BlockSize = idb->getSize(std::log2(idb->getBitBlockWidth()));
495        if (SwizzledCopybackBuffer * const cb = dyn_cast<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
496            BasicBlock * copyBack = idb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
497            BasicBlock * done = idb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
498            Value * newlyProduced = idb->CreateSub(idb->getProducedItemCount(mStreamSetOutputs[i].name), priorProduced[priorIdx]);
499            Value * priorBlock = idb->CreateLShr(priorProduced[priorIdx], log2BlockSize);
500            Value * priorOffset = idb->CreateAnd(priorProduced[priorIdx], idb->getSize(idb->getBitBlockWidth() - 1));
501            Value * instance = idb->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
502            Value * accessibleBlocks = cb->getLinearlyAccessibleBlocks(idb.get(), priorBlock);
503            Value * accessible = idb->CreateSub(idb->CreateShl(accessibleBlocks, log2BlockSize), priorOffset);
504            Value * wraparound = idb->CreateICmpULT(accessible, newlyProduced);
505            idb->CreateCondBr(wraparound, copyBack, done);
506            idb->SetInsertPoint(copyBack);
507            Value * copyItems = idb->CreateSub(newlyProduced, accessible);
508            cb->createCopyBack(idb.get(), instance, copyItems);
509            idb->CreateBr(done);
510            idb->SetInsertPoint(done);
511            priorIdx++;
512        }
513        if (CircularCopybackBuffer * const cb = dyn_cast<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
514            BasicBlock * copyBack = idb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
515            BasicBlock * done = idb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
516            Value * instance = idb->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
517            Value * newlyProduced = idb->CreateSub(idb->getProducedItemCount(mStreamSetOutputs[i].name), priorProduced[priorIdx]);
518            Value * accessible = cb->getLinearlyAccessibleItems(idb.get(), priorProduced[priorIdx]);
519            Value * wraparound = idb->CreateICmpULT(accessible, newlyProduced);
520            idb->CreateCondBr(wraparound, copyBack, done);
521            idb->SetInsertPoint(copyBack);
522            Value * copyItems = idb->CreateSub(newlyProduced, accessible);
523            cb->createCopyBack(idb.get(), instance, copyItems);
524            idb->CreateBr(done);
525            idb->SetInsertPoint(done);
526            priorIdx++;
527        }
528    }
529
530
531    /// Call the do block method if necessary then restore the current function state to the do segement method
532    if (!idb->supportsIndirectBr()) {
533        idb->CreateRetVoid();
534        mDoBlockMethod = mCurrentMethod;
535        idb->restoreIP(ip);
536        idb->CreateCall(mCurrentMethod, self);
537        setInstance(self);
538        mCurrentMethod = cp;
539    }
540
541}
542
543inline void BlockOrientedKernel::writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * remainingItems) {
544
545    Value * const self = getInstance();
546    Function * const cp = mCurrentMethod;
547    Value * const remainingItemCount = remainingItems;
548    auto ip = idb->saveIP();
549
550    if (!idb->supportsIndirectBr()) {
551        FunctionType * const type = FunctionType::get(idb->getVoidTy(), {self->getType(), idb->getSizeTy()}, false);
552        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, idb->getModule());
553        mCurrentMethod->setCallingConv(CallingConv::C);
554        mCurrentMethod->setDoesNotThrow();
555        mCurrentMethod->setDoesNotCapture(1);
556        auto args = mCurrentMethod->arg_begin();
557        args->setName("self");
558        setInstance(&*args);
559        remainingItems = &*(++args);
560        remainingItems->setName("remainingItems");
561        idb->SetInsertPoint(idb->CreateBasicBlock("entry"));
562    }
563
564    generateFinalBlockMethod(idb, remainingItems); // may be implemented by the BlockOrientedKernel subtype
565
566    RecursivelyDeleteTriviallyDeadInstructions(remainingItems); // if remainingItems was not used, this will eliminate it.
567
568    if (!idb->supportsIndirectBr()) {
569        idb->CreateRetVoid();
570        idb->restoreIP(ip);
571        idb->CreateCall(mCurrentMethod, {self, remainingItemCount});
572        mCurrentMethod = cp;
573        setInstance(self);
574    }
575
576}
577
578//  The default finalBlock method simply dispatches to the doBlock routine.
579void BlockOrientedKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * /* remainingItems */) {
580    CreateDoBlockMethodCall(idb);
581}
582
583void BlockOrientedKernel::CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & idb) {
584    if (idb->supportsIndirectBr()) {
585        BasicBlock * bb = idb->CreateBasicBlock("resume");
586        mStrideLoopBranch->addDestination(bb);
587        mStrideLoopTarget->addIncoming(BlockAddress::get(bb), idb->GetInsertBlock());
588        idb->CreateBr(mStrideLoopBody);
589        bb->moveAfter(idb->GetInsertBlock());
590        idb->SetInsertPoint(bb);
591    } else {
592        idb->CreateCall(mDoBlockMethod, getInstance());
593    }
594}
595
596void MultiBlockKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) {
597
598    auto ip = kb->saveIP();
599    Function * const cp = mCurrentMethod;
600    const auto saveInstance = getInstance();
601
602    // First prepare the multi-block method that will be used.
603
604    DataLayout DL(kb->getModule());
605    IntegerType * const intAddressTy = DL.getIntPtrType(kb->getContext());
606
607    std::vector<Type *> multiBlockParmTypes;
608    multiBlockParmTypes.push_back(mKernelStateType->getPointerTo());
609    multiBlockParmTypes.push_back(kb->getSizeTy());
610    for (auto buffer : mStreamSetInputBuffers) {
611        multiBlockParmTypes.push_back(buffer->getPointerType());
612    }
613    for (auto buffer : mStreamSetOutputBuffers) {
614        multiBlockParmTypes.push_back(buffer->getPointerType());
615    }
616
617    FunctionType * const type = FunctionType::get(kb->getVoidTy(), multiBlockParmTypes, false);
618    Function * multiBlockFunction = Function::Create(type, GlobalValue::InternalLinkage, getName() + MULTI_BLOCK_SUFFIX, kb->getModule());
619    multiBlockFunction->setCallingConv(CallingConv::C);
620    multiBlockFunction->setDoesNotThrow();
621    auto args = multiBlockFunction->arg_begin();
622    args->setName("self");
623    setInstance(&*args);
624    (++args)->setName("itemsToDo");
625    for (auto binding : mStreamSetInputs) {
626        (++args)->setName(binding.name + "BufPtr");
627    }
628    for (auto binding : mStreamSetOutputs) {
629        (++args)->setName(binding.name + "BufPtr");
630    }
631
632    // Now use the generateMultiBlockLogic method of the MultiBlockKernelBuilder subtype to
633    // provide the required multi-block kernel logic.
634    mCurrentMethod = multiBlockFunction;
635    kb->SetInsertPoint(BasicBlock::Create(kb->getContext(), "multiBlockEntry", multiBlockFunction, 0));
636    generateMultiBlockLogic(kb);
637
638    kb->CreateRetVoid();
639
640    kb->restoreIP(ip);
641    mCurrentMethod = cp;
642    setInstance(saveInstance);
643
644    // Now proceed with creation of the doSegment method.
645
646    BasicBlock * const entry = kb->GetInsertBlock();
647    BasicBlock * const doSegmentOuterLoop = kb->CreateBasicBlock(getName() + "_doSegmentOuterLoop");
648    BasicBlock * const doMultiBlockCall = kb->CreateBasicBlock(getName() + "_doMultiBlockCall");
649    BasicBlock * const tempBlockCheck = kb->CreateBasicBlock(getName() + "_tempBlockCheck");
650    BasicBlock * const doTempBufferBlock = kb->CreateBasicBlock(getName() + "_doTempBufferBlock");
651    BasicBlock * const segmentDone = kb->CreateBasicBlock(getName() + "_segmentDone");
652
653    Value * blockBaseMask = kb->CreateNot(kb->getSize(kb->getBitBlockWidth() - 1));
654
655    //
656    //  A. Temporary Buffer Area Determination
657    //
658    // For final block processing and for processing near the end of physical buffer
659    // boundaries, we need to allocate temporary space for processing a full block of input.
660    // Compute the size requirements to store stream set data at the declared processing
661    // rates in reference to one block of the principal input stream.
662    //
663
664    unsigned bitBlockWidth = kb->getBitBlockWidth();
665    std::vector<Type *> tempBuffers;
666    std::vector<unsigned> itemsPerPrincipalBlock;
667    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
668        auto & rate = mStreamSetInputs[i].rate;
669        std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
670        if (refSet.empty()) {
671            itemsPerPrincipalBlock.push_back(rate.calculateRatio(bitBlockWidth));
672        }
673        else {
674            Port port; unsigned ssIdx;
675            std::tie(port, ssIdx) = getStreamPort(mStreamSetInputs[i].name);
676            assert (port == Port::Input && ssIdx < i);
677            itemsPerPrincipalBlock.push_back(rate.calculateRatio(itemsPerPrincipalBlock[ssIdx]));
678        }
679        unsigned blocks = (itemsPerPrincipalBlock.back() + bitBlockWidth - 1)/bitBlockWidth +2;
680        if (blocks > 1) {
681            tempBuffers.push_back(ArrayType::get(mStreamSetInputBuffers[i]->getType(), blocks));
682        }
683        else {
684            tempBuffers.push_back(mStreamSetInputBuffers[i]->getType());
685        }
686    }
687
688    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
689        auto & rate = mStreamSetOutputs[i].rate;
690        std::string refSet = mStreamSetOutputs[i].rate.referenceStreamSet();
691        if (refSet.empty()) {
692            itemsPerPrincipalBlock.push_back(rate.calculateRatio(bitBlockWidth));
693        }
694        else {
695            Port port; unsigned ssIdx;
696            std::tie(port, ssIdx) = getStreamPort(mStreamSetOutputs[i].name);
697            if (port == Port::Output) ssIdx += mStreamSetInputs.size();
698            itemsPerPrincipalBlock.push_back(rate.calculateRatio(itemsPerPrincipalBlock[ssIdx]));
699        }
700        unsigned blocks = (itemsPerPrincipalBlock.back() + bitBlockWidth - 1)/bitBlockWidth +2;
701        if (blocks > 1) {
702            tempBuffers.push_back(ArrayType::get(mStreamSetOutputBuffers[i]->getType(), blocks));
703        }
704        else {
705            tempBuffers.push_back(mStreamSetOutputBuffers[i]->getType());
706        }
707    }
708
709    Type * tempParameterStructType = StructType::create(kb->getContext(), tempBuffers);
710    Value * tempParameterArea = kb->CreateCacheAlignedAlloca(tempParameterStructType);
711
712    ConstantInt * blockSize = kb->getSize(kb->getBitBlockWidth());
713
714    Value * availablePos = mAvailableItemCount[0];
715    Value * itemsAvail = availablePos;
716
717    //  Make sure that corresponding data is available depending on processing rate
718    //  for all input stream sets.
719
720    for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
721        Value * a = mAvailableItemCount[i];
722        auto & rate = mStreamSetInputs[i].rate;
723        assert (((rate.referenceStreamSet().empty()) || (rate.referenceStreamSet() == mStreamSetInputs[0].name)) && "Multiblock kernel input rate not with respect to principal stream.");
724        Value * maxItems = rate.CreateMaxReferenceItemsCalculation(kb.get(), a);
725        itemsAvail = kb->CreateSelect(kb->CreateICmpULT(itemsAvail, maxItems), itemsAvail, maxItems);
726    }
727
728    Value * processed = kb->getProcessedItemCount(mStreamSetInputs[0].name);
729    Value * itemsToDo = kb->CreateSub(itemsAvail, processed);
730    Value * fullBlocksToDo = kb->CreateUDiv(itemsToDo, blockSize);
731    Value * excessItems = kb->CreateURem(itemsToDo, blockSize);
732
733    //  Now we iteratively process these blocks using the doMultiBlock method.
734    //  In each iteration, we process the maximum number of linearly accessible
735    //  blocks on the principal input, reduced to ensure that the corresponding
736    //  data is linearly available at the specified processing rates for the other inputs,
737    //  and that each of the output buffers has sufficient linearly available space
738    //  (using overflow areas, if necessary) for the maximum output that can be
739    //  produced.
740
741    kb->CreateBr(doSegmentOuterLoop);
742    kb->SetInsertPoint(doSegmentOuterLoop);
743    PHINode * const blocksRemaining = kb->CreatePHI(kb->getSizeTy(), 2, "blocksRemaining");
744    blocksRemaining->addIncoming(fullBlocksToDo, entry);
745
746    // For each input buffer, determine the processedItemCount, the block pointer for the
747    // buffer block containing the next item, and the number of linearly available items.
748
749    std::vector<Value *> processedItemCount;
750    std::vector<Value *> inputBlockPtr;
751    std::vector<Value *> producedItemCount;
752    std::vector<Value *> outputBlockPtr;
753
754    //  Now determine the linearly available blocks, based on blocks remaining reduced
755    //  by limitations of linearly available input buffer space.
756
757    Value * linearlyAvailBlocks = blocksRemaining;
758    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
759        Value * p = kb->getProcessedItemCount(mStreamSetInputs[i].name);
760        Value * blkNo = kb->CreateUDiv(p, blockSize);
761        Value * b = kb->getInputStreamBlockPtr(mStreamSetInputs[i].name, kb->getInt32(0));
762        processedItemCount.push_back(p);
763        inputBlockPtr.push_back(b);
764        auto & rate = mStreamSetInputs[i].rate;
765        Value * blocks = nullptr;
766        if ((rate.isFixedRatio()) && (rate.getRatioNumerator() == rate.getRatioDenominator()) && (rate.referenceStreamSet() == "")) {
767            blocks = mStreamSetInputBuffers[i]->getLinearlyAccessibleBlocks(kb.get(), blkNo);
768        } else {
769            Value * linearlyAvailItems = mStreamSetInputBuffers[i]->getLinearlyAccessibleItems(kb.get(), p);
770            Value * items = rate.CreateMaxReferenceItemsCalculation(kb.get(), linearlyAvailItems);
771            blocks = kb->CreateUDiv(items, blockSize);
772        }
773        linearlyAvailBlocks = kb->CreateSelect(kb->CreateICmpULT(blocks, linearlyAvailBlocks), blocks, linearlyAvailBlocks);
774    }
775    //  Now determine the linearly writeable blocks, based on available blocks reduced
776    //  by limitations of output buffer space.
777    Value * linearlyWritableBlocks = linearlyAvailBlocks;
778
779    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
780        Value * p = kb->getProducedItemCount(mStreamSetOutputs[i].name);
781        Value * blkNo = kb->CreateUDiv(p, blockSize);
782        Value * b = kb->getOutputStreamBlockPtr(mStreamSetOutputs[i].name, kb->getInt32(0));
783        producedItemCount.push_back(p);
784        outputBlockPtr.push_back(b);
785        auto & rate = mStreamSetOutputs[i].rate;
786        Value * blocks = nullptr;
787        if ((rate.isFixedRatio()) && (rate.getRatioNumerator() == rate.getRatioDenominator())) {
788            blocks = mStreamSetOutputBuffers[0]->getLinearlyWritableBlocks(kb.get(), blkNo);
789        } else {
790            Value * writableItems = mStreamSetOutputBuffers[0]->getLinearlyWritableItems(kb.get(), p);
791            blocks = kb->CreateUDiv(writableItems, blockSize);
792        }
793        linearlyWritableBlocks = kb->CreateSelect(kb->CreateICmpULT(blocks, linearlyWritableBlocks), blocks, linearlyWritableBlocks);
794    }
795    Value * haveBlocks = kb->CreateICmpUGT(linearlyWritableBlocks, kb->getSize(0));
796    kb->CreateCondBr(haveBlocks, doMultiBlockCall, tempBlockCheck);
797
798    //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
799    //  Now prepare the doMultiBlock call.
800    kb->SetInsertPoint(doMultiBlockCall);
801
802    Value * linearlyAvailItems = kb->CreateMul(linearlyWritableBlocks, blockSize);
803
804    std::vector<Value *> doMultiBlockArgs;
805    doMultiBlockArgs.push_back(getInstance());
806    doMultiBlockArgs.push_back(linearlyAvailItems);
807    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
808        Value * bufPtr = kb->getRawInputPointer(mStreamSetInputs[i].name, kb->getInt32(0), processedItemCount[i]);
809        bufPtr = kb->CreatePointerCast(bufPtr, mStreamSetInputBuffers[i]->getPointerType());
810        doMultiBlockArgs.push_back(bufPtr);
811    }
812    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
813        Value * bufPtr = kb->getRawOutputPointer(mStreamSetOutputs[i].name, kb->getInt32(0), producedItemCount[i]);
814        bufPtr = kb->CreatePointerCast(bufPtr, mStreamSetOutputBuffers[i]->getPointerType());
815        doMultiBlockArgs.push_back(bufPtr);
816    }
817
818    kb->CreateCall(multiBlockFunction, doMultiBlockArgs);
819    // Do copybacks if necessary.
820    unsigned priorIdx = 0;
821    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
822        Value * log2BlockSize = kb->getSize(std::log2(kb->getBitBlockWidth()));
823        if (auto cb = dyn_cast<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
824            BasicBlock * copyBack = kb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
825            BasicBlock * done = kb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
826            Value * newlyProduced = kb->CreateSub(kb->getProducedItemCount(mStreamSetOutputs[i].name), producedItemCount[i]);
827            Value * priorBlock = kb->CreateLShr(producedItemCount[i], log2BlockSize);
828            Value * priorOffset = kb->CreateAnd(producedItemCount[i], kb->getSize(kb->getBitBlockWidth() - 1));
829            Value * instance = kb->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
830            Value * accessibleBlocks = cb->getLinearlyAccessibleBlocks(kb.get(), priorBlock);
831            Value * accessible = kb->CreateSub(kb->CreateShl(accessibleBlocks, log2BlockSize), priorOffset);
832            Value * wraparound = kb->CreateICmpULT(accessible, newlyProduced);
833            kb->CreateCondBr(wraparound, copyBack, done);
834            kb->SetInsertPoint(copyBack);
835            Value * copyItems = kb->CreateSub(newlyProduced, accessible);
836            cb->createCopyBack(kb.get(), instance, copyItems);
837            kb->CreateBr(done);
838            kb->SetInsertPoint(done);
839            priorIdx++;
840        }
841        if (auto cb = dyn_cast<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
842            BasicBlock * copyBack = kb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
843            BasicBlock * done = kb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
844            Value * instance = kb->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
845            Value * newlyProduced = kb->CreateSub(kb->getProducedItemCount(mStreamSetOutputs[i].name), producedItemCount[i]);
846            Value * accessible = cb->getLinearlyAccessibleItems(kb.get(), producedItemCount[i]);
847            Value * wraparound = kb->CreateICmpULT(accessible, newlyProduced);
848            kb->CreateCondBr(wraparound, copyBack, done);
849            kb->SetInsertPoint(copyBack);
850            Value * copyItems = kb->CreateSub(newlyProduced, accessible);
851            cb->createCopyBack(kb.get(), instance, copyItems);
852            kb->CreateBr(done);
853            kb->SetInsertPoint(done);
854            priorIdx++;
855        }
856    }
857    kb->setProcessedItemCount(mStreamSetInputs[0].name, kb->CreateAdd(processed, linearlyAvailItems));
858    Value * reducedBlocksToDo = kb->CreateSub(blocksRemaining, linearlyWritableBlocks);
859    Value * fullBlocksRemain = kb->CreateICmpUGT(reducedBlocksToDo, kb->getSize(0));
860    BasicBlock * multiBlockFinal = kb->GetInsertBlock();
861    blocksRemaining->addIncoming(reducedBlocksToDo, multiBlockFinal);
862    kb->CreateCondBr(fullBlocksRemain, doSegmentOuterLoop, tempBlockCheck);
863    //iBuilder->CreateBr(doSegmentOuterLoop);
864    //
865    // We use temporary buffers in 3 different cases that preclude full block processing.
866    // (a) One or more input buffers does not have a sufficient number of input items linearly available.
867    // (b) One or more output buffers does not have sufficient linearly available buffer space.
868    // (c) We have processed all the full blocks of input and only the excessItems remain.
869    // In each case we set up temporary buffers for input and output and then
870    // call the Multiblock routine.
871    //
872
873    kb->SetInsertPoint(tempBlockCheck);
874    PHINode * const tempBlocksRemain = kb->CreatePHI(kb->getSizeTy(), 2, "tempBlocksRemain");
875    tempBlocksRemain->addIncoming(blocksRemaining, doSegmentOuterLoop);
876    tempBlocksRemain->addIncoming(reducedBlocksToDo, multiBlockFinal);
877
878    haveBlocks = kb->CreateICmpUGT(tempBlocksRemain, kb->getSize(0));
879    kb->CreateCondBr(kb->CreateOr(mIsFinal, haveBlocks), doTempBufferBlock, segmentDone);
880
881    //
882    // We use temporary buffers in 3 different cases that preclude full block processing.
883    // (a) One or more input buffers does not have a sufficient number of input items linearly available.
884    // (b) One or more output buffers does not have sufficient linearly available buffer space.
885    // (c) We have processed all the full blocks of input and only the excessItems remain.
886    // In each case we set up temporary buffers for input and output and then
887    // call the Multiblock routine.
888    //
889    kb->SetInsertPoint(doTempBufferBlock);
890    Value * tempBlockItems = kb->CreateSelect(haveBlocks, blockSize, excessItems);
891
892    // Begin constructing the doMultiBlock args.
893    std::vector<Value *> tempArgs;
894    tempArgs.push_back(getInstance());
895    tempArgs.push_back(tempBlockItems);
896
897    // Prepare the temporary buffer area.
898    //
899    // First zero it out.
900    Constant * const tempAreaSize = ConstantExpr::getIntegerCast(ConstantExpr::getSizeOf(tempParameterStructType), kb->getSizeTy(), false);
901    kb->CreateMemZero(tempParameterArea, tempAreaSize);
902
903    // For each input and output buffer, copy over necessary data starting from the last
904    // block boundary.
905    std::vector<Value *> finalItemPos;
906    finalItemPos.push_back(kb->CreateAdd(processedItemCount[0], tempBlockItems));
907
908    for (unsigned i = 0; i < mStreamSetInputBuffers.size(); i++) {
909        Value * tempBufPtr = kb->CreateGEP(tempParameterArea, kb->getInt32(i));
910        tempBufPtr = kb->CreatePointerCast(tempBufPtr, mStreamSetInputBuffers[i]->getPointerType());
911
912        Value * blockItemPos = kb->CreateAnd(processedItemCount[i], blockBaseMask);
913
914        // The number of items to copy is determined by the processing rate requirements.
915        if (i > 1) {
916            auto & rate = mStreamSetInputs[i].rate;
917            std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
918            if (refSet.empty()) {
919                finalItemPos.push_back(rate.CreateRatioCalculation(kb.get(), finalItemPos[0], kb->CreateNot(haveBlocks)));
920            }
921            else {
922                Port port; unsigned ssIdx;
923                std::tie(port, ssIdx) = getStreamPort(mStreamSetInputs[i].name);
924                assert (port == Port::Input && ssIdx < i);
925                finalItemPos.push_back(rate.CreateRatioCalculation(kb.get(), finalItemPos[ssIdx], kb->CreateNot(haveBlocks)));
926            }
927        }
928        Value * neededItems = kb->CreateSub(finalItemPos[i], blockItemPos);
929        Value * availFromBase = mStreamSetInputBuffers[i]->getLinearlyAccessibleItems(kb.get(), blockItemPos);
930        Value * copyItems1 = kb->CreateSelect(kb->CreateICmpULT(neededItems, availFromBase), neededItems, availFromBase);
931        Value * copyItems2 = kb->CreateSub(neededItems, copyItems1);
932        Value * inputPtr = kb->getInputStreamBlockPtr(mStreamSetInputs[i].name, kb->getInt32(0));
933        mStreamSetInputBuffers[i]->createBlockAlignedCopy(kb.get(), tempBufPtr, inputPtr, copyItems1);
934        Value * nextBufPtr = kb->CreateGEP(tempBufPtr, kb->CreateUDiv(availFromBase, blockSize));
935        mStreamSetInputBuffers[i]->createBlockAlignedCopy(kb.get(), nextBufPtr, kb->getStreamSetBufferPtr(mStreamSetInputs[i].name), copyItems2);
936
937        Value * itemAddress = kb->CreatePtrToInt(kb->getRawInputPointer(mStreamSetInputs[i].name, kb->getInt32(0), processedItemCount[i]), intAddressTy);
938        Value * baseAddress = kb->CreatePtrToInt(inputBlockPtr[i], intAddressTy);
939        Value * tempAddress = kb->CreateAdd(kb->CreatePtrToInt(tempBufPtr, kb->getSizeTy()), kb->CreateSub(itemAddress, baseAddress));
940        tempArgs.push_back(kb->CreateIntToPtr(tempAddress, mStreamSetInputBuffers[i]->getPointerType()));
941    }
942
943    std::vector<Value *> blockItemPos;
944    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
945        Value * tempBufPtr = kb->CreateGEP(tempParameterArea, kb->getInt32(mStreamSetInputs.size() + i));
946        tempBufPtr = kb->CreatePointerCast(tempBufPtr, mStreamSetOutputBuffers[i]->getPointerType());
947        blockItemPos.push_back(kb->CreateAnd(producedItemCount[i], blockBaseMask));
948        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(kb.get(), tempBufPtr, outputBlockPtr[i], kb->CreateSub(producedItemCount[i], blockItemPos[i]));
949        Value * itemAddress = kb->CreatePtrToInt(kb->getRawOutputPointer(mStreamSetInputs[i].name, kb->getInt32(0), producedItemCount[i]), kb->getSizeTy());
950        Value * outputPtr = kb->getOutputStreamBlockPtr(mStreamSetOutputs[i].name, kb->getInt32(0));
951        Value * baseAddress = kb->CreatePtrToInt(outputPtr, intAddressTy);
952        Value * tempAddress = kb->CreateAdd(kb->CreatePtrToInt(tempBufPtr, intAddressTy), kb->CreateSub(itemAddress, baseAddress));
953        tempArgs.push_back(kb->CreateIntToPtr(tempAddress, mStreamSetOutputBuffers[i]->getPointerType()));
954    }
955
956
957    kb->CreateCall(multiBlockFunction, tempArgs);
958
959    // Copy back data to the actual output buffers.
960
961    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
962        Value * tempBufPtr = kb->CreateGEP(tempParameterArea, kb->getInt32(mStreamSetInputs.size() + i));
963        tempBufPtr = kb->CreatePointerCast(tempBufPtr, mStreamSetOutputBuffers[i]->getPointerType());
964        Value * final_items = kb->getProducedItemCount(mStreamSetOutputs[i].name);
965        Value * copyItems = kb->CreateSub(final_items, blockItemPos[i]);
966        Value * copyItems1 = mStreamSetOutputBuffers[i]->getLinearlyWritableItems(kb.get(), blockItemPos[i]); // must be a whole number of blocks.
967        Value * outputPtr = kb->getOutputStreamBlockPtr(mStreamSetOutputs[i].name, kb->getInt32(0));
968        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(kb.get(), outputPtr, tempBufPtr, copyItems1);
969        Value * copyItems2 = kb->CreateSelect(kb->CreateICmpULT(copyItems, copyItems), kb->getSize(0), kb->CreateSub(copyItems, copyItems1));
970        tempBufPtr = kb->CreateGEP(tempBufPtr, kb->CreateUDiv(copyItems1, blockSize));
971        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(kb.get(), kb->getStreamSetBufferPtr(mStreamSetOutputs[i].name), tempBufPtr, copyItems2);
972    }
973
974    kb->setProcessedItemCount(mStreamSetInputs[0].name, finalItemPos[0]);
975
976    //  We've dealt with the partial block processing and copied information back into the
977    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
978    //
979    blocksRemaining->addIncoming(kb->CreateSub(tempBlocksRemain, kb->CreateZExt(haveBlocks, kb->getSizeTy())), kb->GetInsertBlock());
980    kb->CreateCondBr(haveBlocks, doSegmentOuterLoop, segmentDone);
981    kb->SetInsertPoint(segmentDone);
982}
983
984void Kernel::finalizeInstance(const std::unique_ptr<KernelBuilder> & idb) {
985    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
986    mOutputScalarResult = idb->CreateCall(getTerminateFunction(idb->getModule()), { getInstance() });
987}
988
989Kernel::StreamPort Kernel::getStreamPort(const std::string & name) const {
990    const auto f = mStreamMap.find(name);
991    if (LLVM_UNLIKELY(f == mStreamMap.end())) {
992        report_fatal_error(getName() + " does not contain stream set " + name);
993    }
994    return f->second;
995}
996
997// CONSTRUCTOR
998Kernel::Kernel(std::string && kernelName,
999                             std::vector<Binding> && stream_inputs,
1000                             std::vector<Binding> && stream_outputs,
1001                             std::vector<Binding> && scalar_parameters,
1002                             std::vector<Binding> && scalar_outputs,
1003                             std::vector<Binding> && internal_scalars)
1004: KernelInterface(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
1005, mCurrentMethod(nullptr)
1006, mNoTerminateAttribute(false)
1007, mIsGenerated(false)
1008, mIsFinal(nullptr)
1009, mOutputScalarResult(nullptr) {
1010
1011}
1012
1013Kernel::~Kernel() {
1014
1015}
1016
1017// CONSTRUCTOR
1018BlockOrientedKernel::BlockOrientedKernel(std::string && kernelName,
1019                                         std::vector<Binding> && stream_inputs,
1020                                         std::vector<Binding> && stream_outputs,
1021                                         std::vector<Binding> && scalar_parameters,
1022                                         std::vector<Binding> && scalar_outputs,
1023                                         std::vector<Binding> && internal_scalars)
1024: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
1025, mDoBlockMethod(nullptr)
1026, mStrideLoopBody(nullptr)
1027, mStrideLoopBranch(nullptr)
1028, mStrideLoopTarget(nullptr) {
1029
1030}
1031
1032// CONSTRUCTOR
1033MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
1034                                     std::vector<Binding> && stream_inputs,
1035                                     std::vector<Binding> && stream_outputs,
1036                                     std::vector<Binding> && scalar_parameters,
1037                                     std::vector<Binding> && scalar_outputs,
1038                                             std::vector<Binding> && internal_scalars)
1039: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
1040   
1041}
1042
1043// CONSTRUCTOR
1044SegmentOrientedKernel::SegmentOrientedKernel(std::string && kernelName,
1045                                             std::vector<Binding> && stream_inputs,
1046                                             std::vector<Binding> && stream_outputs,
1047                                             std::vector<Binding> && scalar_parameters,
1048                                             std::vector<Binding> && scalar_outputs,
1049                                             std::vector<Binding> && internal_scalars)
1050: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
1051   
1052}
1053   
1054}
Note: See TracBrowser for help on using the repository browser.