source: icGREP/icgrep-devel/icgrep/kernels/kernel.cpp @ 5448

Last change on this file since 5448 was 5448, checked in by cameron, 2 years ago

Hack for source/external buffers with mBufferBlocks=1; u8u16 test with segment-pipeline-parallel; simplified copying

File size: 50.5 KB
RevLine 
[4924]1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "kernel.h"
[5425]7#include <toolchain/toolchain.h>
[5297]8#include <kernels/streamset.h>
9#include <llvm/IR/Constants.h>
10#include <llvm/IR/Function.h>
11#include <llvm/IR/Instructions.h>
[5350]12#include <llvm/IR/MDBuilder.h>
[5267]13#include <llvm/IR/Module.h>
14#include <llvm/Support/raw_ostream.h>
[5392]15#include <llvm/Bitcode/ReaderWriter.h>
[5350]16#include <llvm/Transforms/Utils/Local.h>
[5408]17#include <kernels/streamset.h>
18#include <sstream>
[5436]19#include <kernels/kernel_builder.h>
[4924]20
[5435]21using namespace llvm;
22using namespace parabix;
[5287]23
[5435]24namespace kernel {
[5287]25
[5435]26const std::string Kernel::DO_BLOCK_SUFFIX = "_DoBlock";
27const std::string Kernel::FINAL_BLOCK_SUFFIX = "_FinalBlock";
[5439]28const std::string Kernel::MULTI_BLOCK_SUFFIX = "_MultiBlock";
[5435]29const std::string Kernel::LOGICAL_SEGMENT_NO_SCALAR = "logicalSegNo";
30const std::string Kernel::PROCESSED_ITEM_COUNT_SUFFIX = "_processedItemCount";
31const std::string Kernel::CONSUMED_ITEM_COUNT_SUFFIX = "_consumedItemCount";
32const std::string Kernel::PRODUCED_ITEM_COUNT_SUFFIX = "_producedItemCount";
33const std::string Kernel::TERMINATION_SIGNAL = "terminationSignal";
34const std::string Kernel::BUFFER_PTR_SUFFIX = "_bufferPtr";
35const std::string Kernel::CONSUMER_SUFFIX = "_consumerLocks";
[5292]36
[5435]37unsigned Kernel::addScalar(Type * const type, const std::string & name) {
[5063]38    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
[5320]39        report_fatal_error("Cannot add field " + name + " to " + getName() + " after kernel state finalized");
[4924]40    }
[5283]41    if (LLVM_UNLIKELY(mKernelMap.count(name))) {
[5320]42        report_fatal_error(getName() + " already contains scalar field " + name);
[5283]43    }
[5227]44    const auto index = mKernelFields.size();
45    mKernelMap.emplace(name, index);
46    mKernelFields.push_back(type);
47    return index;
[4924]48}
[4968]49
[5435]50unsigned Kernel::addUnnamedScalar(Type * const type) {
[5283]51    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
[5418]52        report_fatal_error("Cannot add unnamed field  to " + getName() + " after kernel state finalized");
[5283]53    }
54    const auto index = mKernelFields.size();
55    mKernelFields.push_back(type);
56    return index;
57}
58
[5435]59void Kernel::prepareStreamSetNameMap() {
[5299]60    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
[5398]61        mStreamMap.emplace(mStreamSetInputs[i].name, std::make_pair(Port::Input, i));
[5299]62    }
63    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
[5398]64        mStreamMap.emplace(mStreamSetOutputs[i].name, std::make_pair(Port::Output, i));
[5299]65    }
66}
[5440]67
[5446]68void Kernel::bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs) {
[5440]69    assert (mModule == nullptr);
70    assert (mStreamSetInputBuffers.empty());
71    assert (mStreamSetOutputBuffers.empty());
72
73    if (LLVM_UNLIKELY(mStreamSetInputs.size() != inputs.size())) {
74        report_fatal_error(getName() + ": expected " + std::to_string(mStreamSetInputs.size()) +
75                           " input stream sets but was given "
76                           + std::to_string(inputs.size()));
77    }
78
79    for (unsigned i = 0; i < inputs.size(); ++i) {
80        StreamSetBuffer * const buf = inputs[i];
81        if (LLVM_UNLIKELY(buf == nullptr)) {
82            report_fatal_error(getName() + ": input stream set " + std::to_string(i)
83                               + " cannot be null");
84        }
85        buf->addConsumer(this);
86    }
87
88    if (LLVM_UNLIKELY(mStreamSetOutputs.size() != outputs.size())) {
89        report_fatal_error(getName() + ": expected " + std::to_string(mStreamSetOutputs.size())
90                           + " output stream sets but was given "
91                           + std::to_string(outputs.size()));
92    }
93
94    for (unsigned i = 0; i < outputs.size(); ++i) {
95        StreamSetBuffer * const buf = outputs[i];
96        if (LLVM_UNLIKELY(buf == nullptr)) {
97            report_fatal_error(getName() + ": output stream set " + std::to_string(i) + " cannot be null");
98        }
99        if (LLVM_LIKELY(buf->getProducer() == nullptr)) {
100            buf->setProducer(this);
101        } else {
102            report_fatal_error(getName() + ": output stream set " + std::to_string(i)
103                               + " is already produced by kernel " + buf->getProducer()->getName());
104        }
105    }
106
107    mStreamSetInputBuffers.assign(inputs.begin(), inputs.end());
108    mStreamSetOutputBuffers.assign(outputs.begin(), outputs.end());
[5446]109}
110
111Module * Kernel::makeModule(const std::unique_ptr<KernelBuilder> & idb) {
112    assert (mModule == nullptr);
113    std::stringstream cacheName;   
114    cacheName << getName() << '_' << idb->getBuilderUniqueName();
115    for (const StreamSetBuffer * b: mStreamSetInputBuffers) {
116        cacheName <<  ':' <<  b->getUniqueID();
117    }
118    for (const StreamSetBuffer * b: mStreamSetOutputBuffers) {
119        cacheName <<  ':' <<  b->getUniqueID();
120    }
121    mModule = new Module(cacheName.str(), idb->getContext());
[5440]122    prepareKernel(idb);
[5446]123    return mModule;
[5440]124}
125
[5446]126Module * Kernel::setModule(const std::unique_ptr<KernelBuilder> & idb, llvm::Module * const module) {
127    assert (mModule == nullptr);
128    mModule = module;
129    prepareKernel(idb);
130    return mModule;
131}
132
[5440]133void Kernel::prepareKernel(const std::unique_ptr<KernelBuilder> & idb) {
134    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
[5246]135    if (LLVM_UNLIKELY(mKernelStateType != nullptr)) {
[5307]136        report_fatal_error("Cannot prepare kernel after kernel state finalized");
[5246]137    }
[5440]138    const auto blockSize = idb->getBitBlockWidth();
[5446]139    const auto requiredBlocks = codegen::SegmentSize + ((blockSize + mLookAheadPositions - 1) / blockSize);
140
[5133]141    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
[5448]142        if ((mStreamSetInputBuffers[i]->getBufferBlocks() > 1) && (mStreamSetInputBuffers[i]->getBufferBlocks() < requiredBlocks)) {
[5446]143            report_fatal_error(getName() + ": " + mStreamSetInputs[i].name + " requires buffer size " + std::to_string(requiredBlocks));
[5142]144        }
[5307]145        mScalarInputs.emplace_back(mStreamSetInputBuffers[i]->getPointerType(), mStreamSetInputs[i].name + BUFFER_PTR_SUFFIX);
[5434]146        if ((i == 0) || !mStreamSetInputs[i].rate.isExact()) {
[5440]147            addScalar(idb->getSizeTy(), mStreamSetInputs[i].name + PROCESSED_ITEM_COUNT_SUFFIX);
148        }
[5086]149    }
[5408]150
[5440]151    IntegerType * const sizeTy = idb->getSizeTy();
[5133]152    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
[5307]153        mScalarInputs.emplace_back(mStreamSetOutputBuffers[i]->getPointerType(), mStreamSetOutputs[i].name + BUFFER_PTR_SUFFIX);
[5328]154        if ((mStreamSetInputs.empty() && (i == 0)) || !mStreamSetOutputs[i].rate.isExact()) {
[5408]155            addScalar(sizeTy, mStreamSetOutputs[i].name + PRODUCED_ITEM_COUNT_SUFFIX);
[5327]156        }
[5086]157    }
[5446]158    for (const auto & binding : mScalarInputs) {
[5202]159        addScalar(binding.type, binding.name);
[5076]160    }
[5446]161    for (const auto & binding : mScalarOutputs) {
[5202]162        addScalar(binding.type, binding.name);
[5076]163    }
[5398]164    if (mStreamMap.empty()) {
[5392]165        prepareStreamSetNameMap();
[5307]166    }
[5446]167    for (const auto & binding : mInternalScalars) {
[5202]168        addScalar(binding.type, binding.name);
[5076]169    }
[5408]170
171    Type * const consumerSetTy = StructType::get(sizeTy, sizeTy->getPointerTo()->getPointerTo(), nullptr)->getPointerTo();
172    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
[5418]173        addScalar(consumerSetTy, mStreamSetOutputs[i].name + CONSUMER_SUFFIX);
[5408]174    }
175
176    addScalar(sizeTy, LOGICAL_SEGMENT_NO_SCALAR);
[5440]177    addScalar(idb->getInt1Ty(), TERMINATION_SIGNAL);
[5408]178
[5418]179    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
180        addScalar(sizeTy, mStreamSetOutputs[i].name + CONSUMED_ITEM_COUNT_SUFFIX);
181    }
182
[5440]183    mKernelStateType = StructType::create(idb->getContext(), mKernelFields, getName());
[4970]184}
185
[5392]186// Default kernel signature: generate the IR and emit as byte code.
[5440]187std::string Kernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> & idb) {
188    assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
[5431]189    if (LLVM_LIKELY(moduleIDisSignature())) {
190        return getModule()->getModuleIdentifier();
[5401]191    } else {
[5440]192        generateKernel(idb);
[5401]193        std::string signature;
194        raw_string_ostream OS(signature);
[5431]195        WriteBitcodeToFile(getModule(), OS);
[5401]196        return signature;
197    }
[5392]198}
199
[5440]200void Kernel::generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb) {
201    assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
[5411]202    // If the module id cannot uniquely identify this kernel, "generateKernelSignature()" will have already
[5401]203    // generated the unoptimized IR.
204    if (!mIsGenerated) {
[5440]205        const auto m = idb->getModule();
206        const auto ip = idb->saveIP();
207        const auto saveInstance = getInstance();
208        idb->setModule(mModule);
209        addKernelDeclarations(idb);
210        callGenerateInitializeMethod(idb);
211        callGenerateDoSegmentMethod(idb);
212        callGenerateFinalizeMethod(idb);
[5411]213        setInstance(saveInstance);
[5440]214        idb->setModule(m);
215        idb->restoreIP(ip);
[5431]216        mIsGenerated = true;
[4995]217    }
[5250]218}
[5246]219
[5440]220inline void Kernel::callGenerateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
221    mCurrentMethod = getInitFunction(idb->getModule());
222    idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
[5347]223    Function::arg_iterator args = mCurrentMethod->arg_begin();
[5408]224    setInstance(&*(args++));
[5440]225    idb->CreateStore(ConstantAggregateZero::get(mKernelStateType), getInstance());
[5436]226    for (const auto & binding : mScalarInputs) {
[5440]227        idb->setScalarField(binding.name, &*(args++));
[5051]228    }
[5436]229    for (const auto & binding : mStreamSetOutputs) {
[5440]230        idb->setConsumerLock(binding.name, &*(args++));
[5408]231    }
[5440]232    generateInitializeMethod(idb);
233    idb->CreateRetVoid();
[5051]234}
235
[5440]236inline void Kernel::callGenerateDoSegmentMethod(const std::unique_ptr<kernel::KernelBuilder> & idb) {
237    mCurrentMethod = getDoSegmentFunction(idb->getModule());
238    idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
[5411]239    auto args = mCurrentMethod->arg_begin();
240    setInstance(&*(args++));
[5418]241    mIsFinal = &*(args++);
242    const auto n = mStreamSetInputs.size();
243    mAvailableItemCount.resize(n, nullptr);
244    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
245        mAvailableItemCount[i] = &*(args++);
246    }
[5440]247    generateDoSegmentMethod(idb); // must be overridden by the KernelBuilder subtype
[5418]248    mIsFinal = nullptr;
249    mAvailableItemCount.clear();
[5440]250    idb->CreateRetVoid();
[5411]251}
252
[5440]253inline void Kernel::callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb) {
254    mCurrentMethod = getTerminateFunction(idb->getModule());
255    idb->SetInsertPoint(BasicBlock::Create(idb->getContext(), "entry", mCurrentMethod));
[5418]256    auto args = mCurrentMethod->arg_begin();
257    setInstance(&*(args++));
[5440]258    generateFinalizeMethod(idb); // may be overridden by the KernelBuilder subtype
[5418]259    const auto n = mScalarOutputs.size();
260    if (n == 0) {
[5440]261        idb->CreateRetVoid();
[5418]262    } else {
263        Value * outputs[n];
264        for (unsigned i = 0; i < n; ++i) {
[5440]265            outputs[i] = idb->getScalarField(mScalarOutputs[i].name);
[5418]266        }
267        if (n == 1) {
[5440]268            idb->CreateRet(outputs[0]);
[5418]269        } else {
[5440]270            idb->CreateAggregateRet(outputs, n);
[5418]271        }
272    }
273}
274
[5435]275unsigned Kernel::getScalarIndex(const std::string & name) const {
[5227]276    const auto f = mKernelMap.find(name);
277    if (LLVM_UNLIKELY(f == mKernelMap.end())) {
[5320]278        report_fatal_error(getName() + " does not contain scalar: " + name);
[5000]279    }
[5435]280    return f->second;
[4959]281}
[4924]282
[5440]283Value * Kernel::createInstance(const std::unique_ptr<KernelBuilder> & idb) {
284    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
[5246]285    if (LLVM_UNLIKELY(mKernelStateType == nullptr)) {
[5320]286        report_fatal_error("Cannot instantiate " + getName() + " before calling prepareKernel()");
[5246]287    }
[5440]288    setInstance(idb->CreateCacheAlignedAlloca(mKernelStateType));
[5408]289    return getInstance();
290}
[5320]291
[5440]292void Kernel::initializeInstance(const std::unique_ptr<KernelBuilder> & idb) {
293    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
[5408]294    if (LLVM_UNLIKELY(getInstance() == nullptr)) {
295        report_fatal_error("Cannot initialize " + getName() + " before calling createInstance()");
296    }
[5320]297    std::vector<Value *> args;
[5408]298    args.reserve(1 + mInitialArguments.size() + mStreamSetInputBuffers.size() + (mStreamSetOutputBuffers.size() * 2));
299    args.push_back(getInstance());
[5320]300    for (unsigned i = 0; i < mInitialArguments.size(); ++i) {
301        Value * arg = mInitialArguments[i];
302        if (LLVM_UNLIKELY(arg == nullptr)) {
303            report_fatal_error(getName() + ": initial argument " + std::to_string(i)
304                               + " cannot be null when calling createInstance()");
305        }
306        args.push_back(arg);
[5133]307    }
[5320]308    for (unsigned i = 0; i < mStreamSetInputBuffers.size(); ++i) {
309        assert (mStreamSetInputBuffers[i]);
310        Value * arg = mStreamSetInputBuffers[i]->getStreamSetBasePtr();
311        if (LLVM_UNLIKELY(arg == nullptr)) {
312            report_fatal_error(getName() + ": input stream set " + std::to_string(i)
313                               + " was not allocated prior to calling createInstance()");
314        }
315        args.push_back(arg);
[5133]316    }
[5320]317    assert (mStreamSetInputs.size() == mStreamSetInputBuffers.size());
318    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
319        assert (mStreamSetOutputBuffers[i]);
320        Value * arg = mStreamSetOutputBuffers[i]->getStreamSetBasePtr();
321        if (LLVM_UNLIKELY(arg == nullptr)) {
322            report_fatal_error(getName() + ": output stream set " + std::to_string(i)
323                               + " was not allocated prior to calling createInstance()");
324        }
325        args.push_back(arg);
[5133]326    }
[5320]327    assert (mStreamSetOutputs.size() == mStreamSetOutputBuffers.size());
[5440]328    IntegerType * const sizeTy = idb->getSizeTy();
[5408]329    PointerType * const sizePtrTy = sizeTy->getPointerTo();
330    PointerType * const sizePtrPtrTy = sizePtrTy->getPointerTo();
331    StructType * const consumerTy = StructType::get(sizeTy, sizePtrPtrTy, nullptr);
332    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
[5418]333        const auto output = mStreamSetOutputBuffers[i];
334        const auto & consumers = output->getConsumers();
[5411]335        const auto n = consumers.size();
[5440]336        AllocaInst * const outputConsumers = idb->CreateAlloca(consumerTy);
337        Value * const consumerSegNoArray = idb->CreateAlloca(ArrayType::get(sizePtrTy, n));
[5408]338        for (unsigned i = 0; i < n; ++i) {
[5435]339            Kernel * const consumer = consumers[i];
[5418]340            assert ("all instances must be created prior to initialization of any instance" && consumer->getInstance());
[5440]341            idb->setKernel(consumer);
342            Value * const segmentNoPtr = idb->getScalarFieldPtr(LOGICAL_SEGMENT_NO_SCALAR);
343            idb->CreateStore(segmentNoPtr, idb->CreateGEP(consumerSegNoArray, { idb->getInt32(0), idb->getInt32(i) }));
[5408]344        }
[5440]345        idb->setKernel(this);
346        Value * const consumerCountPtr = idb->CreateGEP(outputConsumers, {idb->getInt32(0), idb->getInt32(0)});
347        idb->CreateStore(idb->getSize(n), consumerCountPtr);
348        Value * const consumerSegNoArrayPtr = idb->CreateGEP(outputConsumers, {idb->getInt32(0), idb->getInt32(1)});
349        idb->CreateStore(idb->CreatePointerCast(consumerSegNoArray, sizePtrPtrTy), consumerSegNoArrayPtr);
[5408]350        args.push_back(outputConsumers);
351    }
[5440]352    idb->CreateCall(getInitFunction(idb->getModule()), args);
[5133]353}
[5104]354
[5285]355//  The default doSegment method dispatches to the doBlock routine for
356//  each block of the given number of blocksToDo, and then updates counts.
[5347]357
[5440]358void BlockOrientedKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & idb) {
359    BasicBlock * const entryBlock = idb->GetInsertBlock();
360    BasicBlock * const strideLoopCond = idb->CreateBasicBlock(getName() + "_strideLoopCond");
361    mStrideLoopBody = idb->CreateBasicBlock(getName() + "_strideLoopBody");
362    BasicBlock * const stridesDone = idb->CreateBasicBlock(getName() + "_stridesDone");
363    BasicBlock * const doFinalBlock = idb->CreateBasicBlock(getName() + "_doFinalBlock");
364    BasicBlock * const segmentDone = idb->CreateBasicBlock(getName() + "_segmentDone");
[5285]365
[5351]366    Value * baseTarget = nullptr;
[5440]367    if (idb->supportsIndirectBr()) {
368        baseTarget = idb->CreateSelect(mIsFinal, BlockAddress::get(doFinalBlock), BlockAddress::get(segmentDone));
[5351]369    }
370
[5440]371    ConstantInt * stride = idb->getSize(idb->getStride());
[5418]372    Value * availablePos = mAvailableItemCount[0];
[5440]373    Value * processed = idb->getProcessedItemCount(mStreamSetInputs[0].name);
374    Value * itemsAvail = idb->CreateSub(availablePos, processed);
375    Value * stridesToDo = idb->CreateUDiv(itemsAvail, stride);
[5350]376
[5440]377    idb->CreateBr(strideLoopCond);
[5285]378
[5440]379    idb->SetInsertPoint(strideLoopCond);
[5351]380
381    PHINode * branchTarget = nullptr;
[5440]382    if (idb->supportsIndirectBr()) {
383        branchTarget = idb->CreatePHI(baseTarget->getType(), 2, "branchTarget");
[5351]384        branchTarget->addIncoming(baseTarget, entryBlock);
385    }
386
[5440]387    PHINode * const stridesRemaining = idb->CreatePHI(idb->getSizeTy(), 2, "stridesRemaining");
[5285]388    stridesRemaining->addIncoming(stridesToDo, entryBlock);
[5353]389    // NOTE: stridesRemaining may go to a negative number in the final block if the generateFinalBlockMethod(...)
390    // calls CreateDoBlockMethodCall(). Do *not* replace the comparator with an unsigned one!
[5440]391    Value * notDone = idb->CreateICmpSGT(stridesRemaining, idb->getSize(0));
392    idb->CreateLikelyCondBr(notDone, mStrideLoopBody, stridesDone);
[5285]393
[5440]394    idb->SetInsertPoint(mStrideLoopBody);
[5285]395
[5440]396    if (idb->supportsIndirectBr()) {
397        mStrideLoopTarget = idb->CreatePHI(baseTarget->getType(), 2, "strideTarget");
[5351]398        mStrideLoopTarget->addIncoming(branchTarget, strideLoopCond);
[5350]399    }
400
[5347]401    /// GENERATE DO BLOCK METHOD
[5285]402
[5440]403    writeDoBlockMethod(idb);
[5347]404
405    /// UPDATE PROCESSED COUNTS
406
[5440]407    processed = idb->getProcessedItemCount(mStreamSetInputs[0].name);
408    Value * itemsDone = idb->CreateAdd(processed, stride);
409    idb->setProcessedItemCount(mStreamSetInputs[0].name, itemsDone);
[5347]410
[5440]411    stridesRemaining->addIncoming(idb->CreateSub(stridesRemaining, idb->getSize(1)), idb->GetInsertBlock());
[5297]412
[5440]413    BasicBlock * bodyEnd = idb->GetInsertBlock();
414    if (idb->supportsIndirectBr()) {
[5351]415        branchTarget->addIncoming(mStrideLoopTarget, bodyEnd);
[5350]416    }
[5440]417    idb->CreateBr(strideLoopCond);
[5350]418
[5351]419    stridesDone->moveAfter(bodyEnd);
420
[5440]421    idb->SetInsertPoint(stridesDone);
[5297]422
[5285]423    // Now conditionally perform the final block processing depending on the doFinal parameter.
[5440]424    if (idb->supportsIndirectBr()) {
425        mStrideLoopBranch = idb->CreateIndirectBr(branchTarget, 3);
[5351]426        mStrideLoopBranch->addDestination(doFinalBlock);
427        mStrideLoopBranch->addDestination(segmentDone);
428    } else {
[5440]429        idb->CreateUnlikelyCondBr(mIsFinal, doFinalBlock, segmentDone);
[5351]430    }
431
432    doFinalBlock->moveAfter(stridesDone);
433
[5440]434    idb->SetInsertPoint(doFinalBlock);
[5285]435
[5440]436    Value * remainingItems = idb->CreateSub(mAvailableItemCount[0], idb->getProcessedItemCount(mStreamSetInputs[0].name));
[5285]437
[5440]438    writeFinalBlockMethod(idb, remainingItems);
439
[5418]440    itemsDone = mAvailableItemCount[0];
[5440]441    idb->setProcessedItemCount(mStreamSetInputs[0].name, itemsDone);
442    idb->setTerminationSignal();
443    idb->CreateBr(segmentDone);
[5285]444
[5440]445    segmentDone->moveAfter(idb->GetInsertBlock());
[5351]446
[5440]447    idb->SetInsertPoint(segmentDone);
[5351]448
449    // Update the branch prediction metadata to indicate that the likely target will be segmentDone
[5440]450    if (idb->supportsIndirectBr()) {
451        MDBuilder mdb(idb->getContext());
[5350]452        const auto destinations = mStrideLoopBranch->getNumDestinations();
[5351]453        uint32_t weights[destinations];
454        for (unsigned i = 0; i < destinations; ++i) {
455            weights[i] = (mStrideLoopBranch->getDestination(i) == segmentDone) ? 100 : 1;
[5350]456        }
[5351]457        ArrayRef<uint32_t> bw(weights, destinations);
458        mStrideLoopBranch->setMetadata(LLVMContext::MD_prof, mdb.createBranchWeights(bw));
[5350]459    }
460
[5285]461}
462
[5440]463inline void BlockOrientedKernel::writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & idb) {
[5292]464
[5408]465    Value * const self = getInstance();
[5347]466    Function * const cp = mCurrentMethod;
[5440]467    auto ip = idb->saveIP();
[5292]468
[5347]469    /// Check if the do block method is called and create the function if necessary   
[5440]470    if (!idb->supportsIndirectBr()) {
471        FunctionType * const type = FunctionType::get(idb->getVoidTy(), {self->getType()}, false);
472        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + DO_BLOCK_SUFFIX, idb->getModule());
[5347]473        mCurrentMethod->setCallingConv(CallingConv::C);
474        mCurrentMethod->setDoesNotThrow();
475        mCurrentMethod->setDoesNotCapture(1);
476        auto args = mCurrentMethod->arg_begin();
[5408]477        args->setName("self");
478        setInstance(&*args);
[5440]479        idb->SetInsertPoint(idb->CreateBasicBlock("entry"));
[5347]480    }
481
[5330]482    std::vector<Value *> priorProduced;
483    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
[5355]484        if (isa<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]) || isa<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
[5440]485            priorProduced.push_back(idb->getProducedItemCount(mStreamSetOutputs[i].name));
[5330]486        }
487    }
[5347]488
[5440]489    generateDoBlockMethod(idb); // must be implemented by the BlockOrientedKernelBuilder subtype
[5347]490
[5361]491    unsigned priorIdx = 0;
[5330]492    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
[5440]493        Value * log2BlockSize = idb->getSize(std::log2(idb->getBitBlockWidth()));
[5431]494        if (SwizzledCopybackBuffer * const cb = dyn_cast<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
[5440]495            BasicBlock * copyBack = idb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
496            BasicBlock * done = idb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
497            Value * newlyProduced = idb->CreateSub(idb->getProducedItemCount(mStreamSetOutputs[i].name), priorProduced[priorIdx]);
498            Value * priorBlock = idb->CreateLShr(priorProduced[priorIdx], log2BlockSize);
499            Value * priorOffset = idb->CreateAnd(priorProduced[priorIdx], idb->getSize(idb->getBitBlockWidth() - 1));
500            Value * instance = idb->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
501            Value * accessibleBlocks = cb->getLinearlyAccessibleBlocks(idb.get(), priorBlock);
502            Value * accessible = idb->CreateSub(idb->CreateShl(accessibleBlocks, log2BlockSize), priorOffset);
503            Value * wraparound = idb->CreateICmpULT(accessible, newlyProduced);
504            idb->CreateCondBr(wraparound, copyBack, done);
505            idb->SetInsertPoint(copyBack);
506            Value * copyItems = idb->CreateSub(newlyProduced, accessible);
507            cb->createCopyBack(idb.get(), instance, copyItems);
508            idb->CreateBr(done);
509            idb->SetInsertPoint(done);
[5355]510            priorIdx++;
511        }
[5431]512        if (CircularCopybackBuffer * const cb = dyn_cast<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
[5440]513            BasicBlock * copyBack = idb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
514            BasicBlock * done = idb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
515            Value * instance = idb->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
516            Value * newlyProduced = idb->CreateSub(idb->getProducedItemCount(mStreamSetOutputs[i].name), priorProduced[priorIdx]);
517            Value * accessible = cb->getLinearlyAccessibleItems(idb.get(), priorProduced[priorIdx]);
518            Value * wraparound = idb->CreateICmpULT(accessible, newlyProduced);
519            idb->CreateCondBr(wraparound, copyBack, done);
520            idb->SetInsertPoint(copyBack);
521            Value * copyItems = idb->CreateSub(newlyProduced, accessible);
522            cb->createCopyBack(idb.get(), instance, copyItems);
523            idb->CreateBr(done);
524            idb->SetInsertPoint(done);
[5330]525            priorIdx++;
526        }
[5329]527    }
[5347]528
[5431]529
[5350]530    /// Call the do block method if necessary then restore the current function state to the do segement method
[5440]531    if (!idb->supportsIndirectBr()) {
532        idb->CreateRetVoid();
[5350]533        mDoBlockMethod = mCurrentMethod;
[5440]534        idb->restoreIP(ip);
535        idb->CreateCall(mCurrentMethod, self);
[5408]536        setInstance(self);
[5350]537        mCurrentMethod = cp;
538    }
539
[5285]540}
541
[5440]542inline void BlockOrientedKernel::writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * remainingItems) {
[5292]543
[5408]544    Value * const self = getInstance();
[5347]545    Function * const cp = mCurrentMethod;
546    Value * const remainingItemCount = remainingItems;
[5440]547    auto ip = idb->saveIP();
[5285]548
[5440]549    if (!idb->supportsIndirectBr()) {
550        FunctionType * const type = FunctionType::get(idb->getVoidTy(), {self->getType(), idb->getSizeTy()}, false);
551        mCurrentMethod = Function::Create(type, GlobalValue::InternalLinkage, getName() + FINAL_BLOCK_SUFFIX, idb->getModule());
[5347]552        mCurrentMethod->setCallingConv(CallingConv::C);
553        mCurrentMethod->setDoesNotThrow();
554        mCurrentMethod->setDoesNotCapture(1);
555        auto args = mCurrentMethod->arg_begin();
[5408]556        args->setName("self");
557        setInstance(&*args);
[5347]558        remainingItems = &*(++args);
559        remainingItems->setName("remainingItems");
[5440]560        idb->SetInsertPoint(idb->CreateBasicBlock("entry"));
[5292]561    }
[5347]562
[5440]563    generateFinalBlockMethod(idb, remainingItems); // may be implemented by the BlockOrientedKernel subtype
[5347]564
[5351]565    RecursivelyDeleteTriviallyDeadInstructions(remainingItems); // if remainingItems was not used, this will eliminate it.
566
[5440]567    if (!idb->supportsIndirectBr()) {
568        idb->CreateRetVoid();
569        idb->restoreIP(ip);
570        idb->CreateCall(mCurrentMethod, {self, remainingItemCount});
[5347]571        mCurrentMethod = cp;
[5408]572        setInstance(self);
[5347]573    }
574
[5285]575}
576
[5347]577//  The default finalBlock method simply dispatches to the doBlock routine.
[5440]578void BlockOrientedKernel::generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & idb, Value * /* remainingItems */) {
579    CreateDoBlockMethodCall(idb);
[5292]580}
581
[5440]582void BlockOrientedKernel::CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & idb) {
583    if (idb->supportsIndirectBr()) {
584        BasicBlock * bb = idb->CreateBasicBlock("resume");
[5350]585        mStrideLoopBranch->addDestination(bb);
[5440]586        mStrideLoopTarget->addIncoming(BlockAddress::get(bb), idb->GetInsertBlock());
587        idb->CreateBr(mStrideLoopBody);
588        bb->moveAfter(idb->GetInsertBlock());
589        idb->SetInsertPoint(bb);
[5350]590    } else {
[5440]591        idb->CreateCall(mDoBlockMethod, getInstance());
[5292]592    }
[5285]593}
594
[5440]595void MultiBlockKernel::generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & kb) {
[5292]596
[5446]597    auto ip = kb->saveIP();
[5442]598    Function * const cp = mCurrentMethod;
[5443]599    const auto saveInstance = getInstance();
[5446]600
[5440]601    // First prepare the multi-block method that will be used.
[5418]602
[5446]603    DataLayout DL(kb->getModule());
604    IntegerType * const intAddressTy = DL.getIntPtrType(kb->getContext());
605
[5439]606    std::vector<Type *> multiBlockParmTypes;
607    multiBlockParmTypes.push_back(mKernelStateType->getPointerTo());
[5446]608    multiBlockParmTypes.push_back(kb->getSizeTy());
[5439]609    for (auto buffer : mStreamSetInputBuffers) {
610        multiBlockParmTypes.push_back(buffer->getPointerType());
611    }
612    for (auto buffer : mStreamSetOutputBuffers) {
613        multiBlockParmTypes.push_back(buffer->getPointerType());
614    }
[5446]615
616    FunctionType * const type = FunctionType::get(kb->getVoidTy(), multiBlockParmTypes, false);
617    Function * multiBlockFunction = Function::Create(type, GlobalValue::InternalLinkage, getName() + MULTI_BLOCK_SUFFIX, kb->getModule());
[5439]618    multiBlockFunction->setCallingConv(CallingConv::C);
619    multiBlockFunction->setDoesNotThrow();
620    auto args = multiBlockFunction->arg_begin();
621    args->setName("self");
[5443]622    setInstance(&*args);
[5441]623    (++args)->setName("itemsToDo");
[5439]624    for (auto binding : mStreamSetInputs) {
625        (++args)->setName(binding.name + "BufPtr");
626    }
627    for (auto binding : mStreamSetOutputs) {
[5441]628        (++args)->setName(binding.name + "BufPtr");
[5439]629    }
[5440]630
631    // Now use the generateMultiBlockLogic method of the MultiBlockKernelBuilder subtype to
[5439]632    // provide the required multi-block kernel logic.
[5442]633    mCurrentMethod = multiBlockFunction;
[5446]634    kb->SetInsertPoint(BasicBlock::Create(kb->getContext(), "multiBlockEntry", multiBlockFunction, 0));
[5441]635    generateMultiBlockLogic(kb);
[5440]636
[5446]637    kb->CreateRetVoid();
638
639    kb->restoreIP(ip);
[5442]640    mCurrentMethod = cp;
[5443]641    setInstance(saveInstance);
[5446]642
[5439]643    // Now proceed with creation of the doSegment method.
[5440]644
[5446]645    BasicBlock * const entry = kb->GetInsertBlock();
646    BasicBlock * const doSegmentOuterLoop = kb->CreateBasicBlock(getName() + "_doSegmentOuterLoop");
647    BasicBlock * const doMultiBlockCall = kb->CreateBasicBlock(getName() + "_doMultiBlockCall");
648    BasicBlock * const tempBlockCheck = kb->CreateBasicBlock(getName() + "_tempBlockCheck");
649    BasicBlock * const doTempBufferBlock = kb->CreateBasicBlock(getName() + "_doTempBufferBlock");
650    BasicBlock * const segmentDone = kb->CreateBasicBlock(getName() + "_segmentDone");
[5440]651
[5446]652    Value * blockBaseMask = kb->CreateNot(kb->getSize(kb->getBitBlockWidth() - 1));
[5440]653
[5439]654    //
655    //  A. Temporary Buffer Area Determination
656    //
657    // For final block processing and for processing near the end of physical buffer
658    // boundaries, we need to allocate temporary space for processing a full block of input.
659    // Compute the size requirements to store stream set data at the declared processing
[5440]660    // rates in reference to one block of the principal input stream.
661    //
[5439]662
[5446]663    unsigned bitBlockWidth = kb->getBitBlockWidth();
[5439]664    std::vector<Type *> tempBuffers;
665    std::vector<unsigned> itemsPerPrincipalBlock;
666    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
667        auto & rate = mStreamSetInputs[i].rate;
668        std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
669        if (refSet.empty()) {
670            itemsPerPrincipalBlock.push_back(rate.calculateRatio(bitBlockWidth));
671        }
672        else {
673            Port port; unsigned ssIdx;
674            std::tie(port, ssIdx) = getStreamPort(mStreamSetInputs[i].name);
675            assert (port == Port::Input && ssIdx < i);
676            itemsPerPrincipalBlock.push_back(rate.calculateRatio(itemsPerPrincipalBlock[ssIdx]));
677        }
[5442]678        unsigned blocks = (itemsPerPrincipalBlock.back() + bitBlockWidth - 1)/bitBlockWidth +2;
[5439]679        if (blocks > 1) {
680            tempBuffers.push_back(ArrayType::get(mStreamSetInputBuffers[i]->getType(), blocks));
681        }
682        else {
683            tempBuffers.push_back(mStreamSetInputBuffers[i]->getType());
684        }
685    }
[5446]686
[5439]687    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
688        auto & rate = mStreamSetOutputs[i].rate;
689        std::string refSet = mStreamSetOutputs[i].rate.referenceStreamSet();
690        if (refSet.empty()) {
691            itemsPerPrincipalBlock.push_back(rate.calculateRatio(bitBlockWidth));
692        }
693        else {
694            Port port; unsigned ssIdx;
695            std::tie(port, ssIdx) = getStreamPort(mStreamSetOutputs[i].name);
696            if (port == Port::Output) ssIdx += mStreamSetInputs.size();
697            itemsPerPrincipalBlock.push_back(rate.calculateRatio(itemsPerPrincipalBlock[ssIdx]));
698        }
[5442]699        unsigned blocks = (itemsPerPrincipalBlock.back() + bitBlockWidth - 1)/bitBlockWidth +2;
[5439]700        if (blocks > 1) {
701            tempBuffers.push_back(ArrayType::get(mStreamSetOutputBuffers[i]->getType(), blocks));
702        }
703        else {
704            tempBuffers.push_back(mStreamSetOutputBuffers[i]->getType());
705        }
706    }
[5440]707
[5446]708    Type * tempParameterStructType = StructType::create(kb->getContext(), tempBuffers);
709    Value * tempParameterArea = kb->CreateCacheAlignedAlloca(tempParameterStructType);
[5442]710
[5446]711    ConstantInt * blockSize = kb->getSize(kb->getBitBlockWidth());
712
[5439]713    Value * availablePos = mAvailableItemCount[0];
714    Value * itemsAvail = availablePos;
[5446]715
[5439]716    //  Make sure that corresponding data is available depending on processing rate
717    //  for all input stream sets.
[5446]718
[5439]719    for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
720        Value * a = mAvailableItemCount[i];
721        auto & rate = mStreamSetInputs[i].rate;
[5440]722        assert (((rate.referenceStreamSet().empty()) || (rate.referenceStreamSet() == mStreamSetInputs[0].name)) && "Multiblock kernel input rate not with respect to principal stream.");
[5446]723        Value * maxItems = rate.CreateMaxReferenceItemsCalculation(kb.get(), a);
724        itemsAvail = kb->CreateSelect(kb->CreateICmpULT(itemsAvail, maxItems), itemsAvail, maxItems);
[5439]725    }
[5440]726
[5446]727    Value * processed = kb->getProcessedItemCount(mStreamSetInputs[0].name);
728    Value * itemsToDo = kb->CreateSub(itemsAvail, processed);
729    Value * fullBlocksToDo = kb->CreateUDiv(itemsToDo, blockSize);
730    Value * excessItems = kb->CreateURem(itemsToDo, blockSize);
[5440]731
732    //  Now we iteratively process these blocks using the doMultiBlock method.
[5439]733    //  In each iteration, we process the maximum number of linearly accessible
734    //  blocks on the principal input, reduced to ensure that the corresponding
735    //  data is linearly available at the specified processing rates for the other inputs,
736    //  and that each of the output buffers has sufficient linearly available space
737    //  (using overflow areas, if necessary) for the maximum output that can be
738    //  produced.
[5440]739
[5446]740    kb->CreateBr(doSegmentOuterLoop);
741    kb->SetInsertPoint(doSegmentOuterLoop);
742    PHINode * const blocksRemaining = kb->CreatePHI(kb->getSizeTy(), 2, "blocksRemaining");
[5439]743    blocksRemaining->addIncoming(fullBlocksToDo, entry);
[5446]744
[5439]745    // For each input buffer, determine the processedItemCount, the block pointer for the
746    // buffer block containing the next item, and the number of linearly available items.
[5446]747
[5439]748    std::vector<Value *> processedItemCount;
749    std::vector<Value *> inputBlockPtr;
750    std::vector<Value *> producedItemCount;
751    std::vector<Value *> outputBlockPtr;
[5440]752
[5442]753    //  Now determine the linearly available blocks, based on blocks remaining reduced
754    //  by limitations of linearly available input buffer space.
[5446]755
[5442]756    Value * linearlyAvailBlocks = blocksRemaining;
[5439]757    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
[5446]758        Value * p = kb->getProcessedItemCount(mStreamSetInputs[i].name);
759        Value * blkNo = kb->CreateUDiv(p, blockSize);
760        Value * b = kb->getInputStreamBlockPtr(mStreamSetInputs[i].name, kb->getInt32(0));
[5439]761        processedItemCount.push_back(p);
762        inputBlockPtr.push_back(b);
763        auto & rate = mStreamSetInputs[i].rate;
764        Value * blocks = nullptr;
765        if ((rate.isFixedRatio()) && (rate.getRatioNumerator() == rate.getRatioDenominator()) && (rate.referenceStreamSet() == "")) {
[5446]766            blocks = mStreamSetInputBuffers[i]->getLinearlyAccessibleBlocks(kb.get(), blkNo);
[5440]767        } else {
[5446]768            Value * linearlyAvailItems = mStreamSetInputBuffers[i]->getLinearlyAccessibleItems(kb.get(), p);
769            Value * items = rate.CreateMaxReferenceItemsCalculation(kb.get(), linearlyAvailItems);
770            blocks = kb->CreateUDiv(items, blockSize);
[5439]771        }
[5446]772        linearlyAvailBlocks = kb->CreateSelect(kb->CreateICmpULT(blocks, linearlyAvailBlocks), blocks, linearlyAvailBlocks);
[5439]773    }
774    //  Now determine the linearly writeable blocks, based on available blocks reduced
775    //  by limitations of output buffer space.
776    Value * linearlyWritableBlocks = linearlyAvailBlocks;
[5440]777
[5439]778    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
[5446]779        Value * p = kb->getProducedItemCount(mStreamSetOutputs[i].name);
780        Value * blkNo = kb->CreateUDiv(p, blockSize);
781        Value * b = kb->getOutputStreamBlockPtr(mStreamSetOutputs[i].name, kb->getInt32(0));
[5439]782        producedItemCount.push_back(p);
783        outputBlockPtr.push_back(b);
784        auto & rate = mStreamSetOutputs[i].rate;
785        Value * blocks = nullptr;
786        if ((rate.isFixedRatio()) && (rate.getRatioNumerator() == rate.getRatioDenominator())) {
[5446]787            blocks = mStreamSetOutputBuffers[0]->getLinearlyWritableBlocks(kb.get(), blkNo);
[5440]788        } else {
[5446]789            Value * writableItems = mStreamSetOutputBuffers[0]->getLinearlyWritableItems(kb.get(), p);
790            blocks = kb->CreateUDiv(writableItems, blockSize);
[5439]791        }
[5446]792        linearlyWritableBlocks = kb->CreateSelect(kb->CreateICmpULT(blocks, linearlyWritableBlocks), blocks, linearlyWritableBlocks);
[5439]793    }
[5446]794    Value * haveBlocks = kb->CreateICmpUGT(linearlyWritableBlocks, kb->getSize(0));
795    kb->CreateCondBr(haveBlocks, doMultiBlockCall, tempBlockCheck);
[5440]796
[5439]797    //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
798    //  Now prepare the doMultiBlock call.
[5446]799    kb->SetInsertPoint(doMultiBlockCall);
[5440]800
[5446]801    Value * linearlyAvailItems = kb->CreateMul(linearlyWritableBlocks, blockSize);
[5440]802
[5439]803    std::vector<Value *> doMultiBlockArgs;
[5441]804    doMultiBlockArgs.push_back(getInstance());
[5439]805    doMultiBlockArgs.push_back(linearlyAvailItems);
806    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
[5446]807        Value * bufPtr = kb->getRawInputPointer(mStreamSetInputs[i].name, kb->getInt32(0), processedItemCount[i]);
808        bufPtr = kb->CreatePointerCast(bufPtr, mStreamSetInputBuffers[i]->getPointerType());
[5442]809        doMultiBlockArgs.push_back(bufPtr);
[5439]810    }
811    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
[5446]812        Value * bufPtr = kb->getRawOutputPointer(mStreamSetOutputs[i].name, kb->getInt32(0), producedItemCount[i]);
813        bufPtr = kb->CreatePointerCast(bufPtr, mStreamSetOutputBuffers[i]->getPointerType());
[5442]814        doMultiBlockArgs.push_back(bufPtr);
[5439]815    }
[5440]816
[5446]817    kb->CreateCall(multiBlockFunction, doMultiBlockArgs);
[5439]818    // Do copybacks if necessary.
819    unsigned priorIdx = 0;
820    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
[5446]821        Value * log2BlockSize = kb->getSize(std::log2(kb->getBitBlockWidth()));
[5439]822        if (auto cb = dyn_cast<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
[5446]823            BasicBlock * copyBack = kb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
824            BasicBlock * done = kb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
825            Value * newlyProduced = kb->CreateSub(kb->getProducedItemCount(mStreamSetOutputs[i].name), producedItemCount[i]);
826            Value * priorBlock = kb->CreateLShr(producedItemCount[i], log2BlockSize);
827            Value * priorOffset = kb->CreateAnd(producedItemCount[i], kb->getSize(kb->getBitBlockWidth() - 1));
828            Value * instance = kb->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
829            Value * accessibleBlocks = cb->getLinearlyAccessibleBlocks(kb.get(), priorBlock);
830            Value * accessible = kb->CreateSub(kb->CreateShl(accessibleBlocks, log2BlockSize), priorOffset);
831            Value * wraparound = kb->CreateICmpULT(accessible, newlyProduced);
832            kb->CreateCondBr(wraparound, copyBack, done);
833            kb->SetInsertPoint(copyBack);
834            Value * copyItems = kb->CreateSub(newlyProduced, accessible);
835            cb->createCopyBack(kb.get(), instance, copyItems);
836            kb->CreateBr(done);
837            kb->SetInsertPoint(done);
[5439]838            priorIdx++;
839        }
840        if (auto cb = dyn_cast<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
[5446]841            BasicBlock * copyBack = kb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
842            BasicBlock * done = kb->CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
843            Value * instance = kb->getStreamSetBufferPtr(mStreamSetOutputs[i].name);
844            Value * newlyProduced = kb->CreateSub(kb->getProducedItemCount(mStreamSetOutputs[i].name), producedItemCount[i]);
845            Value * accessible = cb->getLinearlyAccessibleItems(kb.get(), producedItemCount[i]);
846            Value * wraparound = kb->CreateICmpULT(accessible, newlyProduced);
847            kb->CreateCondBr(wraparound, copyBack, done);
848            kb->SetInsertPoint(copyBack);
849            Value * copyItems = kb->CreateSub(newlyProduced, accessible);
850            cb->createCopyBack(kb.get(), instance, copyItems);
851            kb->CreateBr(done);
852            kb->SetInsertPoint(done);
[5439]853            priorIdx++;
854        }
855    }
[5446]856    kb->setProcessedItemCount(mStreamSetInputs[0].name, kb->CreateAdd(processed, linearlyAvailItems));
857    Value * reducedBlocksToDo = kb->CreateSub(blocksRemaining, linearlyWritableBlocks);
858    Value * fullBlocksRemain = kb->CreateICmpUGT(reducedBlocksToDo, kb->getSize(0));
859    BasicBlock * multiBlockFinal = kb->GetInsertBlock();
[5439]860    blocksRemaining->addIncoming(reducedBlocksToDo, multiBlockFinal);
[5446]861    kb->CreateCondBr(fullBlocksRemain, doSegmentOuterLoop, tempBlockCheck);
[5442]862    //iBuilder->CreateBr(doSegmentOuterLoop);
863    //
864    // We use temporary buffers in 3 different cases that preclude full block processing.
865    // (a) One or more input buffers does not have a sufficient number of input items linearly available.
866    // (b) One or more output buffers does not have sufficient linearly available buffer space.
867    // (c) We have processed all the full blocks of input and only the excessItems remain.
868    // In each case we set up temporary buffers for input and output and then
869    // call the Multiblock routine.
870    //
[5440]871
[5446]872    kb->SetInsertPoint(tempBlockCheck);
873    PHINode * const tempBlocksRemain = kb->CreatePHI(kb->getSizeTy(), 2, "tempBlocksRemain");
[5442]874    tempBlocksRemain->addIncoming(blocksRemaining, doSegmentOuterLoop);
875    tempBlocksRemain->addIncoming(reducedBlocksToDo, multiBlockFinal);
[5440]876
[5446]877    haveBlocks = kb->CreateICmpUGT(tempBlocksRemain, kb->getSize(0));
878    kb->CreateCondBr(kb->CreateOr(mIsFinal, haveBlocks), doTempBufferBlock, segmentDone);
879
[5440]880    //
[5439]881    // We use temporary buffers in 3 different cases that preclude full block processing.
882    // (a) One or more input buffers does not have a sufficient number of input items linearly available.
883    // (b) One or more output buffers does not have sufficient linearly available buffer space.
884    // (c) We have processed all the full blocks of input and only the excessItems remain.
885    // In each case we set up temporary buffers for input and output and then
886    // call the Multiblock routine.
887    //
[5446]888    kb->SetInsertPoint(doTempBufferBlock);
889    Value * tempBlockItems = kb->CreateSelect(haveBlocks, blockSize, excessItems);
[5440]890
[5439]891    // Begin constructing the doMultiBlock args.
892    std::vector<Value *> tempArgs;
[5441]893    tempArgs.push_back(getInstance());
[5439]894    tempArgs.push_back(tempBlockItems);
[5440]895
[5439]896    // Prepare the temporary buffer area.
897    //
898    // First zero it out.
[5446]899    Constant * const tempAreaSize = ConstantExpr::getIntegerCast(ConstantExpr::getSizeOf(tempParameterStructType), kb->getSizeTy(), false);
900    kb->CreateMemZero(tempParameterArea, tempAreaSize);
901
[5439]902    // For each input and output buffer, copy over necessary data starting from the last
903    // block boundary.
904    std::vector<Value *> finalItemPos;
[5446]905    finalItemPos.push_back(kb->CreateAdd(processedItemCount[0], tempBlockItems));
[5439]906
907    for (unsigned i = 0; i < mStreamSetInputBuffers.size(); i++) {
[5446]908        Value * tempBufPtr = kb->CreateGEP(tempParameterArea, kb->getInt32(i));
909        tempBufPtr = kb->CreatePointerCast(tempBufPtr, mStreamSetInputBuffers[i]->getPointerType());
[5440]910
[5446]911        Value * blockItemPos = kb->CreateAnd(processedItemCount[i], blockBaseMask);
[5440]912
[5439]913        // The number of items to copy is determined by the processing rate requirements.
914        if (i > 1) {
[5442]915            auto & rate = mStreamSetInputs[i].rate;
[5439]916            std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
917            if (refSet.empty()) {
[5446]918                finalItemPos.push_back(rate.CreateRatioCalculation(kb.get(), finalItemPos[0], kb->CreateNot(haveBlocks)));
[5439]919            }
920            else {
921                Port port; unsigned ssIdx;
922                std::tie(port, ssIdx) = getStreamPort(mStreamSetInputs[i].name);
923                assert (port == Port::Input && ssIdx < i);
[5446]924                finalItemPos.push_back(rate.CreateRatioCalculation(kb.get(), finalItemPos[ssIdx], kb->CreateNot(haveBlocks)));
[5439]925            }
926        }
[5446]927        Value * neededItems = kb->CreateSub(finalItemPos[i], blockItemPos);
928        Value * availFromBase = mStreamSetInputBuffers[i]->getLinearlyAccessibleItems(kb.get(), blockItemPos);
929        Value * copyItems1 = kb->CreateSelect(kb->CreateICmpULT(neededItems, availFromBase), neededItems, availFromBase);
930        Value * copyItems2 = kb->CreateSub(neededItems, copyItems1);
931        Value * inputPtr = kb->getInputStreamBlockPtr(mStreamSetInputs[i].name, kb->getInt32(0));
932        mStreamSetInputBuffers[i]->createBlockAlignedCopy(kb.get(), tempBufPtr, inputPtr, copyItems1);
933        Value * nextBufPtr = kb->CreateGEP(tempBufPtr, kb->CreateUDiv(availFromBase, blockSize));
934        mStreamSetInputBuffers[i]->createBlockAlignedCopy(kb.get(), nextBufPtr, kb->getStreamSetBufferPtr(mStreamSetInputs[i].name), copyItems2);
935
[5447]936        Value * itemAddress = kb->getRawInputPointer(mStreamSetInputs[i].name, kb->getInt32(0), processedItemCount[i]);
937        itemAddress = kb->CreatePtrToInt(itemAddress, intAddressTy);
938
939        Value * baseAddress = inputBlockPtr[i];
940        baseAddress = kb->CreatePtrToInt(baseAddress, intAddressTy);
941
942
943        Value * tempAddress = kb->CreateAdd(kb->CreatePtrToInt(tempBufPtr, intAddressTy), kb->CreateSub(itemAddress, baseAddress));
[5446]944        tempArgs.push_back(kb->CreateIntToPtr(tempAddress, mStreamSetInputBuffers[i]->getPointerType()));
[5439]945    }
946
947    std::vector<Value *> blockItemPos;
948    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
[5446]949        Value * tempBufPtr = kb->CreateGEP(tempParameterArea, kb->getInt32(mStreamSetInputs.size() + i));
950        tempBufPtr = kb->CreatePointerCast(tempBufPtr, mStreamSetOutputBuffers[i]->getPointerType());
951        blockItemPos.push_back(kb->CreateAnd(producedItemCount[i], blockBaseMask));
952        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(kb.get(), tempBufPtr, outputBlockPtr[i], kb->CreateSub(producedItemCount[i], blockItemPos[i]));
[5447]953        Value * itemAddress = kb->CreatePtrToInt(kb->getRawOutputPointer(mStreamSetInputs[i].name, kb->getInt32(0), producedItemCount[i]), intAddressTy);
[5446]954        Value * outputPtr = kb->getOutputStreamBlockPtr(mStreamSetOutputs[i].name, kb->getInt32(0));
955        Value * baseAddress = kb->CreatePtrToInt(outputPtr, intAddressTy);
956        Value * tempAddress = kb->CreateAdd(kb->CreatePtrToInt(tempBufPtr, intAddressTy), kb->CreateSub(itemAddress, baseAddress));
957        tempArgs.push_back(kb->CreateIntToPtr(tempAddress, mStreamSetOutputBuffers[i]->getPointerType()));
[5439]958    }
959
960
[5446]961    kb->CreateCall(multiBlockFunction, tempArgs);
962
[5439]963    // Copy back data to the actual output buffers.
[5440]964
[5439]965    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
[5446]966        Value * tempBufPtr = kb->CreateGEP(tempParameterArea, kb->getInt32(mStreamSetInputs.size() + i));
967        tempBufPtr = kb->CreatePointerCast(tempBufPtr, mStreamSetOutputBuffers[i]->getPointerType());
968        Value * final_items = kb->getProducedItemCount(mStreamSetOutputs[i].name);
969        Value * copyItems = kb->CreateSub(final_items, blockItemPos[i]);
970        Value * copyItems1 = mStreamSetOutputBuffers[i]->getLinearlyWritableItems(kb.get(), blockItemPos[i]); // must be a whole number of blocks.
971        Value * outputPtr = kb->getOutputStreamBlockPtr(mStreamSetOutputs[i].name, kb->getInt32(0));
972        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(kb.get(), outputPtr, tempBufPtr, copyItems1);
973        Value * copyItems2 = kb->CreateSelect(kb->CreateICmpULT(copyItems, copyItems), kb->getSize(0), kb->CreateSub(copyItems, copyItems1));
974        tempBufPtr = kb->CreateGEP(tempBufPtr, kb->CreateUDiv(copyItems1, blockSize));
975        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(kb.get(), kb->getStreamSetBufferPtr(mStreamSetOutputs[i].name), tempBufPtr, copyItems2);
[5439]976    }
977
[5446]978    kb->setProcessedItemCount(mStreamSetInputs[0].name, finalItemPos[0]);
[5439]979
980    //  We've dealt with the partial block processing and copied information back into the
981    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
982    //
[5446]983    blocksRemaining->addIncoming(kb->CreateSub(tempBlocksRemain, kb->CreateZExt(haveBlocks, kb->getSizeTy())), kb->GetInsertBlock());
984    kb->CreateCondBr(haveBlocks, doSegmentOuterLoop, segmentDone);
985    kb->SetInsertPoint(segmentDone);
[5439]986}
[5440]987
988void Kernel::finalizeInstance(const std::unique_ptr<KernelBuilder> & idb) {
989    assert ("KernelBuilder does not have a valid IDISA Builder" && idb);
990    mOutputScalarResult = idb->CreateCall(getTerminateFunction(idb->getModule()), { getInstance() });
991}
992
993Kernel::StreamPort Kernel::getStreamPort(const std::string & name) const {
994    const auto f = mStreamMap.find(name);
995    if (LLVM_UNLIKELY(f == mStreamMap.end())) {
996        report_fatal_error(getName() + " does not contain stream set " + name);
997    }
998    return f->second;
999}
1000
[5285]1001// CONSTRUCTOR
[5435]1002Kernel::Kernel(std::string && kernelName,
[5283]1003                             std::vector<Binding> && stream_inputs,
1004                             std::vector<Binding> && stream_outputs,
1005                             std::vector<Binding> && scalar_parameters,
1006                             std::vector<Binding> && scalar_outputs,
1007                             std::vector<Binding> && internal_scalars)
[5435]1008: KernelInterface(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
[5350]1009, mCurrentMethod(nullptr)
[5408]1010, mNoTerminateAttribute(false)
[5418]1011, mIsGenerated(false)
1012, mIsFinal(nullptr)
1013, mOutputScalarResult(nullptr) {
[5283]1014
1015}
1016
[5435]1017Kernel::~Kernel() {
[5283]1018
[5408]1019}
1020
[5285]1021// CONSTRUCTOR
[5435]1022BlockOrientedKernel::BlockOrientedKernel(std::string && kernelName,
[5408]1023                                         std::vector<Binding> && stream_inputs,
1024                                         std::vector<Binding> && stream_outputs,
1025                                         std::vector<Binding> && scalar_parameters,
1026                                         std::vector<Binding> && scalar_outputs,
1027                                         std::vector<Binding> && internal_scalars)
[5435]1028: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars))
[5408]1029, mDoBlockMethod(nullptr)
1030, mStrideLoopBody(nullptr)
1031, mStrideLoopBranch(nullptr)
1032, mStrideLoopTarget(nullptr) {
1033
1034}
1035
1036// CONSTRUCTOR
[5441]1037MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
1038                                     std::vector<Binding> && stream_inputs,
1039                                     std::vector<Binding> && stream_outputs,
1040                                     std::vector<Binding> && scalar_parameters,
1041                                     std::vector<Binding> && scalar_outputs,
1042                                             std::vector<Binding> && internal_scalars)
1043: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
1044   
1045}
1046
1047// CONSTRUCTOR
[5435]1048SegmentOrientedKernel::SegmentOrientedKernel(std::string && kernelName,
[5283]1049                                             std::vector<Binding> && stream_inputs,
1050                                             std::vector<Binding> && stream_outputs,
1051                                             std::vector<Binding> && scalar_parameters,
1052                                             std::vector<Binding> && scalar_outputs,
1053                                             std::vector<Binding> && internal_scalars)
[5435]1054: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
[5441]1055   
[5283]1056}
[5441]1057   
[5435]1058}
Note: See TracBrowser for help on using the repository browser.