Changeset 5439 for icGREP/icgrep-devel


Ignore:
Timestamp:
May 10, 2017, 1:47:23 PM (2 years ago)
Author:
cameron
Message:

Multiblock Kernels: initial check-in

Location:
icGREP/icgrep-devel/icgrep/kernels
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5436 r5439  
    2626const std::string Kernel::DO_BLOCK_SUFFIX = "_DoBlock";
    2727const std::string Kernel::FINAL_BLOCK_SUFFIX = "_FinalBlock";
     28const std::string Kernel::MULTI_BLOCK_SUFFIX = "_MultiBlock";
    2829const std::string Kernel::LOGICAL_SEGMENT_NO_SCALAR = "logicalSegNo";
    2930const std::string Kernel::PROCESSED_ITEM_COUNT_SUFFIX = "_processedItemCount";
     
    395396
    396397llvm::Value * Kernel::getLinearlyAccessibleItems(const std::string & name, llvm::Value * fromPosition) const {
    397     llvm::Value * instance = getStreamSetBufferPtr(name);
    398398    const StreamSetBuffer * const buf = getInputStreamSetBuffer(name);
    399     return buf->getLinearlyAccessibleItems(iBuilder, instance, fromPosition);
     399    return buf->getLinearlyAccessibleItems(iBuilder, fromPosition);
    400400}
    401401
     
    827827            Value * priorOffset = iBuilder->CreateAnd(priorProduced[priorIdx], iBuilder->getSize(iBuilder->getBitBlockWidth() - 1));
    828828            Value * instance = getStreamSetBufferPtr(mStreamSetOutputs[i].name);
    829             Value * accessibleBlocks = cb->getLinearlyAccessibleBlocks(iBuilder, instance, priorBlock);
     829            Value * accessibleBlocks = cb->getLinearlyAccessibleBlocks(iBuilder, priorBlock);
    830830            Value * accessible = iBuilder->CreateSub(iBuilder->CreateShl(accessibleBlocks, log2BlockSize), priorOffset);
    831831            Value * wraparound = iBuilder->CreateICmpULT(accessible, newlyProduced);
     
    843843            Value * instance = getStreamSetBufferPtr(mStreamSetOutputs[i].name);
    844844            Value * newlyProduced = iBuilder->CreateSub(getProducedItemCount(mStreamSetOutputs[i].name), priorProduced[priorIdx]);
    845             Value * accessible = cb->getLinearlyAccessibleItems(iBuilder, instance, priorProduced[priorIdx]);
     845            Value * accessible = cb->getLinearlyAccessibleItems(iBuilder, priorProduced[priorIdx]);
    846846            Value * wraparound = iBuilder->CreateICmpULT(accessible, newlyProduced);
    847847            iBuilder->CreateCondBr(wraparound, copyBack, done);
     
    938938}
    939939
     940   
     941void MultiBlockKernel::generateDoSegmentMethod() {
     942   
     943    // First prepare the multi-block method that will be used.
     944   
     945    std::vector<Type *> multiBlockParmTypes;
     946    multiBlockParmTypes.push_back(mKernelStateType->getPointerTo());
     947    for (auto buffer : mStreamSetInputBuffers) {
     948        multiBlockParmTypes.push_back(buffer->getPointerType());
     949    }
     950    for (auto buffer : mStreamSetOutputBuffers) {
     951        multiBlockParmTypes.push_back(buffer->getPointerType());
     952    }
     953    FunctionType * const type = FunctionType::get(iBuilder->getVoidTy(), multiBlockParmTypes, false);
     954    Function * multiBlockFunction = Function::Create(type, GlobalValue::InternalLinkage, getName() + MULTI_BLOCK_SUFFIX, iBuilder->getModule());
     955    multiBlockFunction->setCallingConv(CallingConv::C);
     956    multiBlockFunction->setDoesNotThrow();
     957    auto args = multiBlockFunction->arg_begin();
     958    args->setName("self");
     959    for (auto binding : mStreamSetInputs) {
     960        (++args)->setName(binding.name + "BufPtr");
     961    }
     962    for (auto binding : mStreamSetOutputs) {
     963        (args++)->setName(binding.name + "BufPtr");
     964    }
     965   
     966    // Now use the generateMultiBlockLogic method of the MultiBlockKernel subtype to
     967    // provide the required multi-block kernel logic.
     968    auto ip = iBuilder->saveIP();
     969    iBuilder->SetInsertPoint(BasicBlock::Create(iBuilder->getContext(), "multiBlockEntry", multiBlockFunction, 0));
     970    generateMultiBlockLogic();
     971    iBuilder->CreateRetVoid();
     972    iBuilder->restoreIP(ip);
     973   
     974    // Now proceed with creation of the doSegment method.
     975   
     976    BasicBlock * const entry = iBuilder->GetInsertBlock();
     977    BasicBlock * const doSegmentOuterLoop = CreateBasicBlock(getName() + "_doSegmentOuterLoop");
     978    BasicBlock * const doMultiBlockCall = CreateBasicBlock(getName() + "_doMultiBlockCall");
     979    BasicBlock * const finalBlockCheck = CreateBasicBlock(getName() + "_finalBlockCheck");
     980    BasicBlock * const doTempBufferBlock = CreateBasicBlock(getName() + "_doTempBufferBlock");
     981    BasicBlock * const segmentDone = CreateBasicBlock(getName() + "_segmentDone");
     982   
     983    Value * blockBaseMask = iBuilder->CreateNot(iBuilder->getSize(iBuilder->getBitBlockWidth() - 1));
     984   
     985    //
     986    //  A. Temporary Buffer Area Determination
     987    //
     988    // For final block processing and for processing near the end of physical buffer
     989    // boundaries, we need to allocate temporary space for processing a full block of input.
     990    // Compute the size requirements to store stream set data at the declared processing
     991    // rates in reference to one block of the principal input stream. 
     992    //
     993
     994    unsigned bitBlockWidth = iBuilder->getBitBlockWidth();
     995    std::vector<Type *> tempBuffers;
     996    std::vector<unsigned> itemsPerPrincipalBlock;
     997    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
     998        auto & rate = mStreamSetInputs[i].rate;
     999        std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
     1000        if (refSet.empty()) {
     1001            itemsPerPrincipalBlock.push_back(rate.calculateRatio(bitBlockWidth));
     1002        }
     1003        else {
     1004            Port port; unsigned ssIdx;
     1005            std::tie(port, ssIdx) = getStreamPort(mStreamSetInputs[i].name);
     1006            assert (port == Port::Input && ssIdx < i);
     1007            itemsPerPrincipalBlock.push_back(rate.calculateRatio(itemsPerPrincipalBlock[ssIdx]));
     1008        }
     1009        unsigned blocks = (itemsPerPrincipalBlock.back() + bitBlockWidth - 1)/bitBlockWidth;
     1010        if (blocks > 1) {
     1011            tempBuffers.push_back(ArrayType::get(mStreamSetInputBuffers[i]->getType(), blocks));
     1012        }
     1013        else {
     1014            tempBuffers.push_back(mStreamSetInputBuffers[i]->getType());
     1015        }
     1016    }
     1017    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
     1018        auto & rate = mStreamSetOutputs[i].rate;
     1019        std::string refSet = mStreamSetOutputs[i].rate.referenceStreamSet();
     1020        if (refSet.empty()) {
     1021            itemsPerPrincipalBlock.push_back(rate.calculateRatio(bitBlockWidth));
     1022        }
     1023        else {
     1024            Port port; unsigned ssIdx;
     1025            std::tie(port, ssIdx) = getStreamPort(mStreamSetOutputs[i].name);
     1026            if (port == Port::Output) ssIdx += mStreamSetInputs.size();
     1027            itemsPerPrincipalBlock.push_back(rate.calculateRatio(itemsPerPrincipalBlock[ssIdx]));
     1028        }
     1029        unsigned blocks = (itemsPerPrincipalBlock.back() + bitBlockWidth - 1)/bitBlockWidth;
     1030        if (blocks > 1) {
     1031            tempBuffers.push_back(ArrayType::get(mStreamSetOutputBuffers[i]->getType(), blocks));
     1032        }
     1033        else {
     1034            tempBuffers.push_back(mStreamSetOutputBuffers[i]->getType());
     1035        }
     1036    }
     1037    Type * tempParameterStructType = StructType::create(iBuilder->getContext(), tempBuffers);
     1038    Value * tempParameterArea = iBuilder->CreateCacheAlignedAlloca(tempParameterStructType);
     1039   
     1040    ConstantInt * blockSize = iBuilder->getSize(iBuilder->getBitBlockWidth());
     1041    Value * availablePos = mAvailableItemCount[0];
     1042    Value * itemsAvail = availablePos;
     1043    //  Make sure that corresponding data is available depending on processing rate
     1044    //  for all input stream sets.
     1045    for (unsigned i = 1; i < mStreamSetInputs.size(); i++) {
     1046        Value * a = mAvailableItemCount[i];
     1047        auto & rate = mStreamSetInputs[i].rate;
     1048        assert (((rate.referenceStreamSet() == "") || (rate.referenceStreamSet() == mStreamSetInputs[0].name)) && "Multiblock kernel input rate not with respect to principal stream.");
     1049        Value * maxItems = rate.CreateMaxReferenceItemsCalculation(iBuilder, a);
     1050        itemsAvail = iBuilder->CreateSelect(iBuilder->CreateICmpULT(itemsAvail, maxItems), itemsAvail, maxItems);
     1051    }
     1052   
     1053    Value * processed = getProcessedItemCount(mStreamSetInputs[0].name);
     1054    Value * itemsToDo = iBuilder->CreateSub(itemsAvail, processed);
     1055    Value * fullBlocksToDo = iBuilder->CreateUDiv(itemsToDo, blockSize);
     1056    Value * excessItems = iBuilder->CreateURem(itemsToDo, blockSize);
     1057   
     1058    //  Now we iteratively process these blocks using the doMultiBlock method. 
     1059    //  In each iteration, we process the maximum number of linearly accessible
     1060    //  blocks on the principal input, reduced to ensure that the corresponding
     1061    //  data is linearly available at the specified processing rates for the other inputs,
     1062    //  and that each of the output buffers has sufficient linearly available space
     1063    //  (using overflow areas, if necessary) for the maximum output that can be
     1064    //  produced.
     1065   
     1066    //iBuilder->CreateCondBr(iBuilder->CreateICmpUGT(fullBlocksToDo, iBuilder->getSize(0)), doSegmentOuterLoop, finalBlockCheck);
     1067    iBuilder->CreateBr(doSegmentOuterLoop);
     1068   
     1069    iBuilder->SetInsertPoint(doSegmentOuterLoop);
     1070    PHINode * const blocksRemaining = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2, "blocksRemaining");
     1071    blocksRemaining->addIncoming(fullBlocksToDo, entry);
     1072   
     1073   
     1074    // For each input buffer, determine the processedItemCount, the block pointer for the
     1075    // buffer block containing the next item, and the number of linearly available items.
     1076    //
     1077    std::vector<Value *> processedItemCount;
     1078    std::vector<Value *> inputBlockPtr;
     1079    std::vector<Value *> producedItemCount;
     1080    std::vector<Value *> outputBlockPtr;
     1081   
     1082    //  Calculate linearly available blocks for all input stream sets.
     1083    Value * linearlyAvailBlocks = nullptr;
     1084    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
     1085        Value * p = getProcessedItemCount(mStreamSetInputs[i].name);
     1086        Value * blkNo = iBuilder->CreateUDiv(p, blockSize);
     1087        Value * b = getInputStreamBlockPtr(mStreamSetInputs[i].name, iBuilder->getInt32(0));
     1088        processedItemCount.push_back(p);
     1089        inputBlockPtr.push_back(b);
     1090        auto & rate = mStreamSetInputs[i].rate;
     1091        Value * blocks = nullptr;
     1092        if ((rate.isFixedRatio()) && (rate.getRatioNumerator() == rate.getRatioDenominator()) && (rate.referenceStreamSet() == "")) {
     1093            blocks = mStreamSetInputBuffers[i]->getLinearlyAccessibleBlocks(iBuilder, blkNo);
     1094        }
     1095        else {
     1096            Value * linearlyAvailItems = mStreamSetInputBuffers[i]->getLinearlyAccessibleItems(iBuilder, p);
     1097            Value * items = rate.CreateMaxReferenceItemsCalculation(iBuilder, linearlyAvailItems);
     1098            blocks = iBuilder->CreateUDiv(items, blockSize);
     1099        }
     1100        if (i == 0) {
     1101            linearlyAvailBlocks = blocks;
     1102        }
     1103        else {
     1104            linearlyAvailBlocks = iBuilder->CreateSelect(iBuilder->CreateICmpULT(blocks, linearlyAvailBlocks), blocks, linearlyAvailBlocks);
     1105        }
     1106    }
     1107   
     1108    //  Now determine the linearly writeable blocks, based on available blocks reduced
     1109    //  by limitations of output buffer space.
     1110    Value * linearlyWritableBlocks = linearlyAvailBlocks;
     1111   
     1112    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
     1113        Value * p = getProducedItemCount(mStreamSetOutputs[i].name);
     1114        Value * blkNo = iBuilder->CreateUDiv(p, blockSize);
     1115        Value * b = getOutputStreamBlockPtr(mStreamSetOutputs[i].name, iBuilder->getInt32(0));
     1116        producedItemCount.push_back(p);
     1117        outputBlockPtr.push_back(b);
     1118        auto & rate = mStreamSetOutputs[i].rate;
     1119        Value * blocks = nullptr;
     1120        if ((rate.isFixedRatio()) && (rate.getRatioNumerator() == rate.getRatioDenominator())) {
     1121            blocks = mStreamSetOutputBuffers[0]->getLinearlyWritableBlocks(iBuilder, blkNo);
     1122        }
     1123        else {
     1124            Value * writableItems = mStreamSetOutputBuffers[0]->getLinearlyWritableItems(iBuilder, p);
     1125            blocks = iBuilder->CreateUDiv(writableItems, blockSize);
     1126        }
     1127        linearlyWritableBlocks = iBuilder->CreateSelect(iBuilder->CreateICmpULT(blocks, linearlyWritableBlocks), blocks, linearlyWritableBlocks);
     1128    }
     1129    Value * haveBlocks = iBuilder->CreateICmpUGT(linearlyWritableBlocks, iBuilder->getSize(0));
     1130   
     1131    iBuilder->CreateCondBr(haveBlocks, doMultiBlockCall, doTempBufferBlock);
     1132   
     1133    //  At this point we have verified the availability of one or more blocks of input data and output buffer space for all stream sets.
     1134    //  Now prepare the doMultiBlock call.
     1135    iBuilder->SetInsertPoint(doMultiBlockCall);
     1136   
     1137    Value * linearlyAvailItems = iBuilder->CreateMul(linearlyWritableBlocks, blockSize);
     1138   
     1139    std::vector<Value *> doMultiBlockArgs;
     1140    doMultiBlockArgs.push_back(linearlyAvailItems);
     1141    for (unsigned i = 0; i < mStreamSetInputs.size(); i++) {
     1142        doMultiBlockArgs.push_back(getRawInputPointer(mStreamSetInputs[i].name, iBuilder->getInt32(0), processedItemCount[i]));
     1143    }
     1144    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
     1145        doMultiBlockArgs.push_back(getRawOutputPointer(mStreamSetOutputs[i].name, iBuilder->getInt32(0), producedItemCount[i]));
     1146    }
     1147       
     1148    iBuilder->CreateCall(multiBlockFunction, doMultiBlockArgs);
     1149   
     1150    // Do copybacks if necessary.
     1151    unsigned priorIdx = 0;
     1152    for (unsigned i = 0; i < mStreamSetOutputs.size(); i++) {
     1153        Value * log2BlockSize = iBuilder->getSize(std::log2(iBuilder->getBitBlockWidth()));
     1154        if (auto cb = dyn_cast<SwizzledCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
     1155            BasicBlock * copyBack = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
     1156            BasicBlock * done = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
     1157            Value * newlyProduced = iBuilder->CreateSub(getProducedItemCount(mStreamSetOutputs[i].name), producedItemCount[i]);
     1158            Value * priorBlock = iBuilder->CreateLShr(producedItemCount[i], log2BlockSize);
     1159            Value * priorOffset = iBuilder->CreateAnd(producedItemCount[i], iBuilder->getSize(iBuilder->getBitBlockWidth() - 1));
     1160            Value * instance = getStreamSetBufferPtr(mStreamSetOutputs[i].name);
     1161            Value * accessibleBlocks = cb->getLinearlyAccessibleBlocks(iBuilder, priorBlock);
     1162            Value * accessible = iBuilder->CreateSub(iBuilder->CreateShl(accessibleBlocks, log2BlockSize), priorOffset);
     1163            Value * wraparound = iBuilder->CreateICmpULT(accessible, newlyProduced);
     1164            iBuilder->CreateCondBr(wraparound, copyBack, done);
     1165            iBuilder->SetInsertPoint(copyBack);
     1166            Value * copyItems = iBuilder->CreateSub(newlyProduced, accessible);
     1167            cb->createCopyBack(iBuilder, instance, copyItems);
     1168            iBuilder->CreateBr(done);
     1169            iBuilder->SetInsertPoint(done);
     1170            priorIdx++;
     1171        }
     1172        if (auto cb = dyn_cast<CircularCopybackBuffer>(mStreamSetOutputBuffers[i]))  {
     1173            BasicBlock * copyBack = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBack");
     1174            BasicBlock * done = CreateBasicBlock(mStreamSetOutputs[i].name + "_copyBackDone");
     1175            Value * instance = getStreamSetBufferPtr(mStreamSetOutputs[i].name);
     1176            Value * newlyProduced = iBuilder->CreateSub(getProducedItemCount(mStreamSetOutputs[i].name), producedItemCount[i]);
     1177            Value * accessible = cb->getLinearlyAccessibleItems(iBuilder, producedItemCount[i]);
     1178            Value * wraparound = iBuilder->CreateICmpULT(accessible, newlyProduced);
     1179            iBuilder->CreateCondBr(wraparound, copyBack, done);
     1180            iBuilder->SetInsertPoint(copyBack);
     1181            Value * copyItems = iBuilder->CreateSub(newlyProduced, accessible);
     1182            cb->createCopyBack(iBuilder, instance, copyItems);
     1183            iBuilder->CreateBr(done);
     1184            iBuilder->SetInsertPoint(done);
     1185            priorIdx++;
     1186        }
     1187    }
     1188    setProcessedItemCount(mStreamSetInputs[0].name, iBuilder->CreateAdd(processed, linearlyAvailItems));
     1189    Value * reducedBlocksToDo = iBuilder->CreateSub(blocksRemaining, linearlyWritableBlocks);
     1190    Value * fullBlocksRemain = iBuilder->CreateICmpUGT(reducedBlocksToDo, iBuilder->getSize(0));
     1191    BasicBlock * multiBlockFinal = iBuilder->GetInsertBlock();
     1192    blocksRemaining->addIncoming(reducedBlocksToDo, multiBlockFinal);
     1193    iBuilder->CreateCondBr(fullBlocksRemain, doSegmentOuterLoop, finalBlockCheck);
     1194   
     1195    // All the full blocks of input have been processed.  If mIsFinal is true,
     1196    // we should process the remaining partial block (i.e., excessItems as determined at entry).
     1197    iBuilder->SetInsertPoint(finalBlockCheck);
     1198    iBuilder->CreateCondBr(mIsFinal, doTempBufferBlock, segmentDone);
     1199   
     1200    // 
     1201    // We use temporary buffers in 3 different cases that preclude full block processing.
     1202    // (a) One or more input buffers does not have a sufficient number of input items linearly available.
     1203    // (b) One or more output buffers does not have sufficient linearly available buffer space.
     1204    // (c) We have processed all the full blocks of input and only the excessItems remain.
     1205    // In each case we set up temporary buffers for input and output and then
     1206    // call the Multiblock routine.
     1207    //
     1208    iBuilder->SetInsertPoint(doTempBufferBlock);
     1209    PHINode * const tempBlockItems = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2, "tempBlockItems");
     1210    tempBlockItems->addIncoming(blockSize, doSegmentOuterLoop);
     1211    tempBlockItems->addIncoming(excessItems, finalBlockCheck);
     1212   
     1213    // Will this be the final block processing?
     1214    Value * doFinal = iBuilder->CreateICmpULT(tempBlockItems, blockSize);
     1215   
     1216    // Begin constructing the doMultiBlock args.
     1217    std::vector<Value *> tempArgs;
     1218    tempArgs.push_back(tempBlockItems);
     1219   
     1220    // Prepare the temporary buffer area.
     1221    //
     1222    // First zero it out.
     1223    Constant * const tempAreaSize = ConstantExpr::getIntegerCast(ConstantExpr::getSizeOf(tempParameterStructType), iBuilder->getSizeTy(), false);
     1224    iBuilder->CreateMemZero(tempParameterArea, tempAreaSize);
     1225   
     1226    // For each input and output buffer, copy over necessary data starting from the last
     1227    // block boundary.
     1228    std::vector<Value *> finalItemPos;
     1229    finalItemPos.push_back(iBuilder->CreateAdd(processedItemCount[0], tempBlockItems));
     1230
     1231    for (unsigned i = 0; i < mStreamSetInputBuffers.size(); i++) {
     1232        Value * tempBufPtr = iBuilder->CreateGEP(tempParameterArea, iBuilder->getInt32(i));
     1233        tempBufPtr = iBuilder->CreatePointerCast(tempBufPtr, mStreamSetInputBuffers[i]->getPointerType());
     1234       
     1235        auto & rate = mStreamSetInputs[i].rate;
     1236        Value * blockItemPos = iBuilder->CreateAnd(processedItemCount[i], blockBaseMask);
     1237       
     1238        // The number of items to copy is determined by the processing rate requirements.
     1239        if (i > 1) {
     1240            std::string refSet = mStreamSetInputs[i].rate.referenceStreamSet();
     1241            if (refSet.empty()) {
     1242                finalItemPos.push_back(rate.CreateRatioCalculation(iBuilder, finalItemPos[0], doFinal));
     1243            }
     1244            else {
     1245                Port port; unsigned ssIdx;
     1246                std::tie(port, ssIdx) = getStreamPort(mStreamSetInputs[i].name);
     1247                assert (port == Port::Input && ssIdx < i);
     1248                finalItemPos.push_back(rate.CreateRatioCalculation(iBuilder, finalItemPos[ssIdx], doFinal));
     1249            }
     1250        }
     1251        Value * neededItems = iBuilder->CreateSub(finalItemPos[i], blockItemPos);
     1252        Value * availFromBase = mStreamSetInputBuffers[i]->getLinearlyAccessibleItems(iBuilder, blockItemPos);
     1253        Value * copyItems1 = iBuilder->CreateSelect(iBuilder->CreateICmpULT(neededItems, availFromBase), neededItems, availFromBase);
     1254        Value * copyItems2 = iBuilder->CreateSub(neededItems, copyItems1);
     1255        mStreamSetInputBuffers[i]->createBlockAlignedCopy(iBuilder, tempBufPtr, inputBlockPtr[i], copyItems1);
     1256        Value * nextBufPtr = iBuilder->CreateGEP(tempBufPtr, iBuilder->CreateUDiv(availFromBase, blockSize));
     1257        mStreamSetInputBuffers[i]->createBlockAlignedCopy(iBuilder, nextBufPtr, getStreamSetBufferPtr(mStreamSetInputs[i].name), copyItems2);
     1258        Value * itemAddress = iBuilder->CreatePtrToInt(getRawOutputPointer(mStreamSetInputs[i].name, iBuilder->getInt32(0), processedItemCount[i]), iBuilder->getSizeTy());
     1259        Value * baseAddress = iBuilder->CreatePtrToInt(inputBlockPtr[i], iBuilder->getSizeTy());
     1260        Value * tempAddress = iBuilder->CreateAdd(iBuilder->CreatePtrToInt(tempBufPtr, iBuilder->getSizeTy()), iBuilder->CreateSub(itemAddress, baseAddress));
     1261        tempArgs.push_back(iBuilder->CreateBitCast(tempAddress, mStreamSetInputBuffers[i]->getPointerType()));
     1262    }
     1263
     1264    std::vector<Value *> blockItemPos;
     1265    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
     1266        Value * tempBufPtr = iBuilder->CreateGEP(tempParameterArea, iBuilder->getInt32(mStreamSetInputs.size() + i));
     1267        tempBufPtr = iBuilder->CreatePointerCast(tempBufPtr, mStreamSetOutputBuffers[i]->getPointerType());
     1268        blockItemPos.push_back(iBuilder->CreateAnd(producedItemCount[i], blockBaseMask));
     1269        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(iBuilder, tempBufPtr, outputBlockPtr[i], iBuilder->CreateSub(producedItemCount[i], blockItemPos[i]));
     1270        Value * itemAddress = iBuilder->CreatePtrToInt(getRawOutputPointer(mStreamSetInputs[i].name, iBuilder->getInt32(0), producedItemCount[i]), iBuilder->getSizeTy());
     1271        Value * baseAddress = iBuilder->CreatePtrToInt(outputBlockPtr[i], iBuilder->getSizeTy());
     1272        Value * tempAddress = iBuilder->CreateAdd(iBuilder->CreatePtrToInt(tempBufPtr, iBuilder->getSizeTy()), iBuilder->CreateSub(itemAddress, baseAddress));
     1273        tempArgs.push_back(iBuilder->CreateBitCast(tempAddress, mStreamSetOutputBuffers[i]->getPointerType()));
     1274    }
     1275
     1276    iBuilder->CreateCall(multiBlockFunction, tempArgs);
     1277
     1278    // Copy back data to the actual output buffers.
     1279   
     1280    for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); i++) {
     1281        Value * tempBufPtr = iBuilder->CreateGEP(tempParameterArea, iBuilder->getInt32(mStreamSetInputs.size() + i));
     1282        tempBufPtr = iBuilder->CreatePointerCast(tempBufPtr, mStreamSetOutputBuffers[i]->getPointerType());
     1283        Value * final_items = getProducedItemCount(mStreamSetOutputs[i].name);
     1284        Value * copyItems = iBuilder->CreateSub(final_items, blockItemPos[i]);
     1285        Value * copyItems1 = mStreamSetOutputBuffers[i]->getLinearlyWritableItems(iBuilder, blockItemPos[i]); // must be a whole number of blocks.
     1286        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(iBuilder, outputBlockPtr[i], tempBufPtr, copyItems1);
     1287        Value * copyItems2 = iBuilder->CreateSelect(iBuilder->CreateICmpULT(copyItems, copyItems), iBuilder->getSize(0), iBuilder->CreateSub(copyItems, copyItems1));
     1288        tempBufPtr = iBuilder->CreateGEP(tempBufPtr, iBuilder->CreateUDiv(copyItems1, blockSize));
     1289        mStreamSetOutputBuffers[i]->createBlockAlignedCopy(iBuilder, getStreamSetBufferPtr(mStreamSetOutputs[i].name), tempBufPtr, copyItems2);
     1290    }
     1291
     1292    setProcessedItemCount(mStreamSetInputs[0].name, finalItemPos[0]);
     1293
     1294    //  We've dealt with the partial block processing and copied information back into the
     1295    //  actual buffers.  If this isn't the final block, loop back for more multiblock processing.
     1296    //
     1297    iBuilder->CreateCondBr(doFinal, segmentDone, doSegmentOuterLoop);
     1298    iBuilder->SetInsertPoint(segmentDone);
     1299}
     1300                                                           
    9401301// CONSTRUCTOR
    9411302Kernel::Kernel(std::string && kernelName,
     
    9841345}
    9851346
    986 }
     1347// CONSTRUCTOR
     1348MultiBlockKernel::MultiBlockKernel(std::string && kernelName,
     1349                                   std::vector<Binding> && stream_inputs,
     1350                                   std::vector<Binding> && stream_outputs,
     1351                                   std::vector<Binding> && scalar_parameters,
     1352                                   std::vector<Binding> && scalar_outputs,
     1353                                   std::vector<Binding> && internal_scalars)
     1354: Kernel(std::move(kernelName), std::move(stream_inputs), std::move(stream_outputs), std::move(scalar_parameters), std::move(scalar_outputs), std::move(internal_scalars)) {
     1355   
     1356}
     1357}
  • icGREP/icgrep-devel/icgrep/kernels/kernel.h

    r5436 r5439  
    4040    static const std::string DO_BLOCK_SUFFIX;
    4141    static const std::string FINAL_BLOCK_SUFFIX;
     42    static const std::string MULTI_BLOCK_SUFFIX;
    4243    static const std::string LOGICAL_SEGMENT_NO_SCALAR;
    4344    static const std::string PROCESSED_ITEM_COUNT_SUFFIX;
     
    383384};
    384385
    385 
     386/*   
     387The Multi-Block Kernel Builder
     388------------------------------
     389
     390The Multi-Block Kernel Builder is designed to simplify the programming of
     391efficient kernels with possibly variable and/or nonaligned output, subject to
     392exact or MaxRatio processing constraints.   The following restrictions apply.
     393   
     394#.  The input consists of one or more stream sets, the first of which is
     395    known as the principal input stream set. 
     396   
     397#.  If there is more than one input stream set, the additional stream sets must
     398    have a processing rate defined with respect to the principal input stream set, of one
     399    of the following types:  FixedRate, Add1 or RoundUp.    Note that stream sets
     400    declared without a processing rate attribute have the FixedRate(1) attribute
     401    by default and therefore satisfy this constraint.
     402   
     403#.  All output stream sets must be declared with processing rate attributes
     404    of one of the following types:
     405    *  FixedRate, Add1, RoundUp, or MaxRatio with respect to the principal input stream set.
     406    *  FixedRate with respect to some other output stream set.
     407   
     408    When using the Multi-Block Kernel Builder to program a new type of kernel,
     409    the programmer must implement the generateMultiBlockLogic method for normal
     410    multi-block processing according to the requirements below, as well as
     411    providing for special final block processing, if necessary.
     412           
     413#.  The doMultiBlockMethod will be called with the following parameters:
     414    * the number of items of the principal input stream to process (itemsToDo),
     415    * pointers to linear contiguous buffer areas for each of the input stream sets, and
     416    * pointers to linear contiguous output buffer areas for each of the output stream sets.
     417    * pointers are to the address of the first item of the first stream of the stream set.
     418
     419#.  The Multi-Block Kernel Builder will arrange that these input parameters may be
     420    processed under the following simplifying assumptions.
     421    * the number of itemsToDo will either be an exact multiple of the BlockSize,
     422      or, for processing the final block, a value less than BlockSize
     423    * all input buffers will be safe to access and have data available in
     424      accord with their processing rates based on the given number of itemsToDo
     425      of the principal input stream set; no further bounds checking is needed.
     426    * all output buffers will be safe to access and have space available
     427      for the given maximum output generation rates based on the given number
     428      of itemsToDo of the principal input stream set; no further bounds checking
     429      is needed.
     430    * for final block processing, all input buffers will be extended to be safely
     431      treated as containing data corresponding to a full block of the principal
     432      input stream set, with the actual data in each buffer padded with null values
     433      beyond the end of input.  Similarly, all output buffers will contain space
     434      sufficient for the maximum output that can be generated for a full block of
     435      input processing.
     436    * input and output pointers will be typed to allow convenient and logical access
     437      to corresponding streams based on their declared stream set type and processing rate.
     438    * for any input pointer p, a GEP instruction with a single int32 index i
     439      will produce a pointer to the buffer position corresponding to the ith block of the
     440      principal input stream set. 
     441    * for any output stream set declared with a Fixed or Add1 processing rate with respect
     442      to the principal input stream set, a GEP instruction with a single int32 index i
     443      will produce a pointer to the buffer position corresponding to the ith block of the
     444      principal input stream set.
     445                   
     446#.  Upon completion of multi-block processing, the Multi-Block Kernel Builder will arrange that
     447    processed and produced item counts are updated for all stream sets that have exact
     448    processing rate attributes.   Programmers are responsible for updating the producedItemCount
     449    of any stream set declared with a variable attribute (MaxRatio).
     450                           
     451#.  An important caveat is that buffer areas may change arbitrarily between
     452    calls to the doMultiBlockMethod.   In no case should a kernel store a
     453    buffer pointer in its internal state.   Furthermore a kernel must not make
     454    any assumptions about the accessibility of stream set data outside of the
     455    processing range, that is, outside of the block boundaries associated with the given itemsToDo.
     456*/
     457
     458class MultiBlockKernel : public Kernel {
     459protected:
     460
     461    MultiBlockKernel(std::string && kernelName,
     462                     std::vector<Binding> && stream_inputs,
     463                     std::vector<Binding> && stream_outputs,
     464                     std::vector<Binding> && scalar_parameters,
     465                     std::vector<Binding> && scalar_outputs,
     466                     std::vector<Binding> && internal_scalars);
     467
     468    // Each multi-block kernel subtype must provide its own logic for handling
     469    // doMultiBlock calls, subject to the requirements laid out above.
     470    // The generateMultiBlockLogic must be written to generate this logic, given
     471    // a created but empty function.  Upon entry to generateMultiBlockLogic,
     472    // the builder insertion point will be set to the entry block; upon
     473    // exit the RetVoid instruction will be added to complete the method.
     474    //
     475    virtual void generateMultiBlockLogic () = 0;
     476
     477    // Given a kernel subtype with an appropriate interface, the generateDoSegment
     478    // method of the multi-block kernel builder makes all the necessary arrangements
     479    // to translate doSegment calls into a minimal sequence of doMultiBlock calls.
     480    void generateDoSegmentMethod() override final;
     481};
     482   
     483   
    386484}
    387485#endif
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r5436 r5439  
    117117
    118118Value * KernelBuilder::getLinearlyAccessibleItems(const std::string & name, Value * fromPosition) {
    119     Value * instance = getStreamSetBufferPtr(name);
    120     const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
    121     return buf->getLinearlyAccessibleItems(this, instance, fromPosition);
     119    const StreamSetBuffer * const buf = mKernel->getInputStreamSetBuffer(name);
     120    return buf->getLinearlyAccessibleItems(this, fromPosition);
    122121}
    123122
  • icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.cpp

    r5436 r5439  
    3030    Value * wraparound = nullptr;
    3131    if (isa<CircularBuffer>(b) || isa<CircularCopybackBuffer>(b)) {
    32         Value * instance = getStreamSetBufferPtr("codeUnitBuffer");
    3332
    3433
    3534
    36         Value * accessible = b->getLinearlyAccessibleItems(iBuilder, instance, processed);
     35        Value * accessible = b->getLinearlyAccessibleItems(iBuilder, processed);
    3736        wraparound = iBuilder->CreateICmpULT(accessible, itemsToDo);
    3837        itemsToDo = iBuilder->CreateSelect(wraparound, accessible, itemsToDo);
     
    120119    Value * wraparound = nullptr;
    121120    if (isa<CircularBuffer>(b) || isa<CircularCopybackBuffer>(b)) {
    122         Value * instance = getStreamSetBufferPtr("codeUnitBuffer");
    123         Value * accessible = b->getLinearlyAccessibleItems(iBuilder, instance, processed);
     121        Value * accessible = b->getLinearlyAccessibleItems(iBuilder, processed);
    124122        wraparound = iBuilder->CreateICmpULT(accessible, itemsToDo);
    125123        itemsToDo = iBuilder->CreateSelect(wraparound, accessible, itemsToDo);
  • icGREP/icgrep-devel/icgrep/kernels/streamset.cpp

    r5436 r5439  
    110110}
    111111
    112 Value * StreamSetBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, Value * /* self */, Value * fromPosition) const {
     112Value * StreamSetBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, Value * fromPosition) const {
    113113    if (isa<ArrayType>(mType) && dyn_cast<ArrayType>(mType)->getNumElements() > 1) {
    114114        Constant * stride = iBuilder->getSize(iBuilder->getStride());
     
    120120}
    121121
    122 Value * StreamSetBuffer::getLinearlyAccessibleBlocks(IDISA::IDISA_Builder * const iBuilder, Value * /* self */, Value * fromBlock) const {
     122Value * StreamSetBuffer::getLinearlyAccessibleBlocks(IDISA::IDISA_Builder * const iBuilder, Value * fromBlock) const {
    123123    Constant * bufBlocks = iBuilder->getSize(mBufferBlocks);
    124124    return iBuilder->CreateSub(bufBlocks, iBuilder->CreateURem(fromBlock, bufBlocks));
    125125}
    126126
    127 Value * StreamSetBuffer::getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * fromPosition) const {
    128     return getLinearlyAccessibleItems(iBuilder, self, fromPosition);
    129 }
    130 
    131 Value * StreamSetBuffer::getLinearlyWritableBlocks(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * fromBlock) const {
    132     return getLinearlyAccessibleBlocks(iBuilder, self, fromBlock);
     127Value * StreamSetBuffer::getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, Value * fromPosition) const {
     128    return getLinearlyAccessibleItems(iBuilder, fromPosition);
     129}
     130
     131Value * StreamSetBuffer::getLinearlyWritableBlocks(IDISA::IDISA_Builder * const iBuilder, Value * fromBlock) const {
     132    return getLinearlyAccessibleBlocks(iBuilder, fromBlock);
    133133}
    134134
     
    209209}
    210210
    211 Value * SourceBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, Value * self, Value *) const {
     211Value * SourceBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, Value *) const {
    212212    report_fatal_error("External buffers: getLinearlyAccessibleItems is not supported.");
    213213}
     
    222222}
    223223
    224 Value * ExternalBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, Value *, Value *) const {
     224Value * ExternalBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, Value *) const {
    225225    report_fatal_error("External buffers: getLinearlyAccessibleItems is not supported.");
    226226}
     
    245245}
    246246
    247 Value * CircularCopybackBuffer::getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * fromPosition) const {
    248     return iBuilder->CreateAdd(getLinearlyAccessibleItems(iBuilder, self, fromPosition), iBuilder->getSize(mOverflowBlocks * iBuilder->getBitBlockWidth()));
    249 }
    250 
    251 Value * CircularCopybackBuffer::getLinearlyWritableBlocks(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * fromBlock) const {
    252     return iBuilder->CreateAdd(getLinearlyAccessibleBlocks(iBuilder, self, fromBlock), iBuilder->getSize(mOverflowBlocks));
     247Value * CircularCopybackBuffer::getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, Value * fromPosition) const {
     248    return iBuilder->CreateAdd(getLinearlyAccessibleItems(iBuilder, fromPosition), iBuilder->getSize(mOverflowBlocks * iBuilder->getBitBlockWidth()));
     249}
     250
     251Value * CircularCopybackBuffer::getLinearlyWritableBlocks(IDISA::IDISA_Builder * const iBuilder, Value * fromBlock) const {
     252    return iBuilder->CreateAdd(getLinearlyAccessibleBlocks(iBuilder, fromBlock), iBuilder->getSize(mOverflowBlocks));
    253253}
    254254
     
    302302}
    303303
    304 Value * SwizzledCopybackBuffer::getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * fromPosition) const {
    305     return iBuilder->CreateAdd(getLinearlyAccessibleItems(iBuilder, self, fromPosition), iBuilder->getSize(mOverflowBlocks * iBuilder->getBitBlockWidth()));
    306 }
    307 
    308 Value * SwizzledCopybackBuffer::getLinearlyWritableBlocks(IDISA::IDISA_Builder * const iBuilder, Value * self, Value * fromBlock) const {
    309     return iBuilder->CreateAdd(getLinearlyAccessibleBlocks(iBuilder, self, fromBlock), iBuilder->getSize(mOverflowBlocks));
     304Value * SwizzledCopybackBuffer::getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, Value * fromPosition) const {
     305    return iBuilder->CreateAdd(getLinearlyAccessibleItems(iBuilder, fromPosition), iBuilder->getSize(mOverflowBlocks * iBuilder->getBitBlockWidth()));
     306}
     307
     308Value * SwizzledCopybackBuffer::getLinearlyWritableBlocks(IDISA::IDISA_Builder * const iBuilder, Value * fromBlock) const {
     309    return iBuilder->CreateAdd(getLinearlyAccessibleBlocks(iBuilder, fromBlock), iBuilder->getSize(mOverflowBlocks));
    310310}
    311311
     
    458458}
    459459
    460 Value * ExpandableBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, Value * self, Value *) const {
     460Value * ExpandableBuffer::getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, Value *) const {
    461461    report_fatal_error("Expandable buffers: getLinearlyAccessibleItems is not supported.");
    462462}
  • icGREP/icgrep-devel/icgrep/kernels/streamset.h

    r5436 r5439  
    7777
    7878    // The number of items that can be linearly accessed from a given logical stream position.
    79     virtual llvm::Value * getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromPosition) const;
    80    
    81     virtual llvm::Value * getLinearlyAccessibleBlocks(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromBlock) const;
     79    virtual llvm::Value * getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * fromPosition) const;
     80   
     81    virtual llvm::Value * getLinearlyAccessibleBlocks(IDISA::IDISA_Builder * const iBuilder, llvm::Value * fromBlock) const;
    8282   
    8383    virtual void createBlockAlignedCopy(IDISA::IDISA_Builder * const iBuilder, llvm::Value * targetBlockPtr, llvm::Value * sourceBlockPtr, llvm::Value * itemsToCopy) const;
    8484
    85     virtual llvm::Value * getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromPosition) const;
    86    
    87     virtual llvm::Value * getLinearlyWritableBlocks(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromBlock) const;
     85    virtual llvm::Value * getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * fromPosition) const;
     86   
     87    virtual llvm::Value * getLinearlyWritableBlocks(IDISA::IDISA_Builder * const iBuilder, llvm::Value * fromBlock) const;
    8888   
    8989    virtual ~StreamSetBuffer() = 0;
     
    159159    llvm::Value * getBufferedSize(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self) const override;
    160160
    161     llvm::Value * getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromPosition) const override;
     161    llvm::Value * getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * fromPosition) const override;
    162162
    163163protected:
     
    180180    void allocateBuffer(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
    181181
    182     llvm::Value * getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * fromPosition, llvm::Value *) const override;
     182    llvm::Value * getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * fromPosition) const override;
    183183
    184184protected:
     
    217217    void createCopyBack(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * overflowItems) const;
    218218       
    219     llvm::Value * getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromPosition) const override;
    220    
    221     llvm::Value * getLinearlyWritableBlocks(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromBlock) const override;
     219    llvm::Value * getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * fromPosition) const override;
     220   
     221    llvm::Value * getLinearlyWritableBlocks(IDISA::IDISA_Builder * const iBuilder, llvm::Value * fromBlock) const override;
    222222   
    223223protected:
     
    241241    void createCopyBack(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * overflowItems) const;
    242242   
    243     llvm::Value * getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromPosition) const override;
    244    
    245     llvm::Value * getLinearlyWritableBlocks(IDISA::IDISA_Builder * constiBuilder, llvm::Value * self, llvm::Value * fromBlock) const override;
     243    llvm::Value * getLinearlyWritableItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * fromPosition) const override;
     244   
     245    llvm::Value * getLinearlyWritableBlocks(IDISA::IDISA_Builder * constiBuilder, llvm::Value * fromBlock) const override;
    246246   
    247247protected:
     
    266266    llvm::Value * getStreamPackPtr(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * streamIndex, llvm::Value * blockIndex, llvm::Value * packIndex, const bool readOnly) const override;
    267267
    268     llvm::Value * getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * self, llvm::Value * fromPosition) const override;
     268    llvm::Value * getLinearlyAccessibleItems(IDISA::IDISA_Builder * const iBuilder, llvm::Value * fromPosition) const override;
    269269
    270270    void allocateBuffer(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
Note: See TracChangeset for help on using the changeset viewer.