Ignore:
Timestamp: Feb 6, 2018, 4:57:35 PM (20 months ago)
Author: nmedfort
Message: More work on the pipeline I/O rate handling

Location: icGREP/icgrep-devel/icgrep/kernels
Files: 6 edited

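Legend:

Unmodified (line shown with both the old and the new line number)
Added (line shown with the new line number only)
Removed (line shown with the old line number only)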
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r5856 r5865  
    214214void Kernel::addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb) {
    215215
    216     if (mStreamMap.empty()) {
    217         prepareStreamSetNameMap();
    218     }
     216    assert (mStreamMap.empty());
     217
     218    prepareStreamSetNameMap();
    219219
    220220    normalizeStreamProcessingRates();
     
    290290    if (LLVM_UNLIKELY(hasSignature())) {
    291291        generateKernel(idb);
    292         std::string signature;
    293         raw_string_ostream OS(signature);
    294         WriteBitcodeToFile(getModule(), OS);
    295         return signature;
     292        std::string tmp;
     293        raw_string_ostream signature(tmp);
     294        WriteBitcodeToFile(getModule(), signature);
     295        return signature.str();
    296296    } else {
    297297        return getModule()->getModuleIdentifier();
     
    304304 ** ------------------------------------------------------------------------------------------------------------- */
    305305void Kernel::generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb) {
    306     assert ("KernelBuilder does not have a valid IDISA Builder" && idb.get());
    307     // If the module id cannot uniquely identify this kernel, "generateKernelSignature()" will have already
    308     // generated the unoptimized IR.
    309     if (!mIsGenerated) {
    310         const auto m = idb->getModule();
    311         const auto ip = idb->saveIP();
    312         // const auto saveInstance = getInstance();
    313         idb->setModule(mModule);
    314         addKernelDeclarations(idb);
    315         callGenerateInitializeMethod(idb);
    316         callGenerateDoSegmentMethod(idb);
    317         callGenerateFinalizeMethod(idb);
    318         // setInstance(saveInstance);
    319         idb->setModule(m);
    320         idb->restoreIP(ip);
    321         mIsGenerated = true;
    322     }
     306    assert ("Kernel does not have a valid IDISA Builder" && idb.get());
     307    if (LLVM_UNLIKELY(mIsGenerated)) return;
     308    idb->setModule(mModule);
     309    addKernelDeclarations(idb);
     310    callGenerateInitializeMethod(idb);
     311    callGenerateDoSegmentMethod(idb);
     312    callGenerateFinalizeMethod(idb);
     313    mIsGenerated = true;
    323314}
    324315
     
    685676}
    686677
     678// #define DEBUG_LOG
     679
    687680/** ------------------------------------------------------------------------------------------------------------- *
    688681 * @brief generateKernelMethod
     
    710703            Type * const ty = mStreamSetInputBuffers[i]->getStreamSetBlockType();
    711704            auto ub = getUpperBound(rate);
     705            assert (ub != 0);
    712706            if (LLVM_UNLIKELY(input.hasLookahead())) {
    713707                ub += RateValue(input.getLookahead(), mStride);
     
    727721        if (requiresTemporaryOutputBuffer(output, rate)) {
    728722            auto ub = getUpperBound(rate);
    729             if (LLVM_UNLIKELY(mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate))) {
    730                 ub += mStreamSetOutputBuffers[i]->overflowSize();
    731             }
    732             Type * const ty = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
    733             Constant * const arraySize = b->getInt64(ceiling(ub));
    734             AllocaInst * const ptr = b->CreateAlignedAlloca(ty, blockAlignment, arraySize);
    735             assert (ptr->isStaticAlloca());
    736             temporaryOutputBuffer[i] = ptr;
     723            if (ub > 0) {
     724                if (LLVM_UNLIKELY(mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate))) {
     725                    ub += mStreamSetOutputBuffers[i]->overflowSize();
     726                }
     727                Type * const ty = mStreamSetOutputBuffers[i]->getStreamSetBlockType();
     728                Constant * const arraySize = b->getInt64(ceiling(ub));
     729                AllocaInst * const ptr = b->CreateAlignedAlloca(ty, blockAlignment, arraySize);
     730                assert (ptr->isStaticAlloca());
     731                temporaryOutputBuffer[i] = ptr;
     732            }
    737733        }
    738734    }
     
    762758
    763759    Value * const initiallyFinal = mIsFinal;
    764 
    765 //    b->CallPrintInt(getName() + "_initiallyFinal", initiallyFinal);
    766 
     760    #ifdef DEBUG_LOG
     761    b->CallPrintInt(getName() + "_initiallyFinal", initiallyFinal);
     762    #endif
    767763    // Now proceed with creation of the doSegment method.
    768764    BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");
     
    791787        const auto & name = input.getName();
    792788        Value * const processed = b->getProcessedItemCount(name);
    793 
    794 //        b->CallPrintInt(getName() + "_" + name + "_avail", mAvailableItemCount[i]);
    795 //        b->CallPrintInt(getName() + "_" + name + "_processed", processed);
    796 
     789        #ifdef DEBUG_LOG
     790        b->CallPrintInt(getName() + "_" + name + "_avail", mAvailableItemCount[i]);
     791        b->CallPrintInt(getName() + "_" + name + "_processed", processed);
     792        #endif
    797793        mInitialProcessedItemCount[i] = processed;
    798794        mStreamSetInputBaseAddress[i] = b->getBlockAddress(name, b->CreateLShr(processed, LOG_2_BLOCK_WIDTH));
     
    802798        }
    803799
    804         Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], processed);       
    805 //        b->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed);
    806 
     800        Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], processed);
     801        #ifdef DEBUG_LOG
     802        b->CallPrintInt(getName() + "_" + name + "_unprocessed", unprocessed);
     803        #endif
    807804        Value * const accessible = b->getLinearlyAccessibleItems(name, processed, unprocessed);
    808 //        b->CallPrintInt(getName() + "_" + name + "_accessible", accessible);
    809 
     805        #ifdef DEBUG_LOG
     806        b->CallPrintInt(getName() + "_" + name + "_accessible", accessible);
     807        #endif
    810808        mAvailableItemCount[i] = unprocessed;
    811 
    812809        linearlyAccessible[i] = accessible;
    813810        inputStrideSize[i] = getStrideSize(b, input.getRate());
     
    926923        const auto & name = output.getName();
    927924        Value * const produced = b->getProducedItemCount(name);
    928 //        b->CallPrintInt(getName() + "_" + name + "_produced", produced);
    929 
     925        #ifdef DEBUG_LOG
     926        b->CallPrintInt(getName() + "_" + name + "_produced", produced);
     927        #endif
    930928        Value * baseBuffer = b->getBlockAddress(name, b->CreateLShr(produced, LOG_2_BLOCK_WIDTH));
    931929        mInitialProducedItemCount[i] = produced;
    932930        mStreamSetOutputBaseAddress[i] = baseBuffer;
    933 
     931        linearlyWritable[i] = nullptr;
    934932        // Is the number of linearly writable items sufficient for a stride?
    935933        outputStrideSize[i] = getStrideSize(b, output.getRate());
     
    986984        const ProcessingRate & rate = input.getRate();
    987985        if (rate.isFixed() && input.nonDeferred()) {
    988 //            b->CallPrintInt(getName() + "_" + input.getName() + "_processed (+)", mAvailableItemCount[i]);
    989986            Value * const ic = b->CreateAdd(mInitialProcessedItemCount[i], mAvailableItemCount[i]);
    990987            b->setProcessedItemCount(input.getName(), ic);
     
    998995            Value * const produced = b->CreateMul(numOfStrides, outputStrideSize[i]);
    999996            Value * const ic = b->CreateAdd(mInitialProducedItemCount[i], produced);
    1000 //            b->CallPrintInt(getName() + "_" + output.getName() + "_produced (+)", produced);
    1001997            b->setProducedItemCount(output.getName(), ic);
    1002998        }
     
    10261022    // Copy back data to the actual output buffers.
    10271023    for (unsigned i = 0; i < outputSetCount; i++) {
    1028 
    10291024        AllocaInst * const tempBuffer = temporaryOutputBuffer[i];
    10301025        if (LLVM_UNLIKELY(tempBuffer == nullptr)) {
    10311026            continue;
    10321027        }
    1033 
    10341028        const auto & name = mStreamSetOutputs[i].getName();
    10351029        Value * const produced = b->getProducedItemCount(name);
     
    10481042        //Value * const newProducedItemCount = b->getProducedItemCount(name);
    10491043        Value * const newlyProduced = b->CreateSub(produced, mInitialProducedItemCount[i]);
     1044
     1045
    10501046        Value * const toWrite = b->CreateUMin(newlyProduced, linearlyWritable[i]);
    10511047        const auto alignment = getItemAlignment(mStreamSetOutputs[i]);
     
    11041100        const auto & name = mStreamSetOutputs[i].getName();
    11051101        Value * const produced = b->getProducedItemCount(name);
     1102
    11061103        // If this output has a Fixed/Bounded rate, determine whether we have room for another stride.
    11071104        if (LLVM_LIKELY(outputStrideSize[i] != nullptr)) {
     
    11121109            }
    11131110            Value * const unconsumed = b->CreateSub(produced, consumed);
     1111
     1112//            b->CallPrintInt(getName() + "_" + name + "_unconsumed", unconsumed);
     1113
    11141114            Value * const capacity = b->getBufferedSize(name);
     1115
     1116//            b->CallPrintInt(getName() + "_" + name + "_capacity", capacity);
     1117
    11151118            if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    11161119                b->CreateAssert(b->CreateICmpULE(unconsumed, capacity),
    1117                                 getName() + ": " + name + " unconsumed data exceeds capacity");
    1118             }
     1120                                getName() + ": " + name + " more data was written than its capacity allows");
     1121            }
     1122
     1123
     1124
    11191125            Value * const remaining = b->CreateSub(capacity, unconsumed);
    11201126            Value * const hasRemainingStrides = b->CreateICmpUGE(remaining, outputStrideSize[i]);
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r5856 r5865  
    164164}
    165165
    166 /** ------------------------------------------------------------------------------------------------------------- *
    167  * @brief getFieldWidth
    168  ** ------------------------------------------------------------------------------------------------------------- */
    169 inline unsigned getFieldWidth(const unsigned bitWidth, const unsigned blockWidth) {
    170     for (unsigned k = 16; k <= blockWidth; k *= 2) {
    171         if ((bitWidth & (k - 1)) != 0) {
    172             return k / 2;
    173         }
    174     }
    175     return blockWidth;
     166inline static unsigned ceil_log2(const unsigned v) {
     167    assert ("log2(0) is undefined!" && v != 0);
     168    return (sizeof(unsigned) * CHAR_BIT) - __builtin_clz(v - 1U);
    176169}
    177170
     
    186179    assert (target->getType()->isPointerTy());
    187180    assert (isConstantZero(targetOffset) || isConstantZero(sourceOffset));
    188 
    189     const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
    190 
    191     const auto itemWidth = getItemWidth(buf->getBaseType());
     181    const StreamSetBuffer * const buffer = mKernel->getAnyStreamSetBuffer(name);
     182    const auto itemWidth = getItemWidth(buffer->getBaseType());
    192183    assert ("invalid item width" && is_power_2(itemWidth));
    193184    const auto blockWidth = getBitBlockWidth();
    194185    // Although our item width may be n bits, if we know we're always processing m items per block, our field width
    195186    // (w.r.t the stream copy) would be n*m. By taking this into account we can optimize and simplify the copy code.
    196     const auto fieldWidth = getFieldWidth(itemWidth * itemAlignment, blockWidth);
    197     const auto alignment = (fieldWidth + 7) / 8;
     187    const auto fieldWidth = std::min(1U << ceil_log2(itemWidth * itemAlignment), blockWidth);
     188    assert ((blockWidth % fieldWidth) == 0);
    198189
    199190    if (LLVM_LIKELY(itemWidth < fieldWidth)) {
     
    201192        Constant * const FACTOR = getSize(factor);
    202193        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    203             ConstantInt * const ALIGNMENT = getSize(alignment);
    204194            const auto kernelName = mKernel->getName()+ ": " + name;
    205             CreateAssertZero(CreateURem(CreatePtrToInt(target, getSizeTy()), ALIGNMENT), kernelName + " target is misaligned (" + std::to_string(alignment) + ")");
     195            if (fieldWidth > 8) {
     196                const auto alignment = (fieldWidth + 7) / 8;
     197                ConstantInt * const ALIGNMENT = getSize(alignment);
     198                CreateAssertZero(CreateURem(CreatePtrToInt(target, getSizeTy()), ALIGNMENT), kernelName + " target is misaligned (" + std::to_string(alignment) + ")");
     199                CreateAssertZero(CreateURem(CreatePtrToInt(source, getSizeTy()), ALIGNMENT), kernelName + " source is misaligned (" + std::to_string(alignment) + ")");
     200            }
    206201            CreateAssertZero(CreateURem(targetOffset, FACTOR), kernelName + " target offset is misaligned (" + std::to_string(factor) + ")");
    207             CreateAssertZero(CreateURem(CreatePtrToInt(source, getSizeTy()), ALIGNMENT), kernelName + " source is misaligned (" + std::to_string(alignment) + ")");
    208202            CreateAssertZero(CreateURem(sourceOffset, FACTOR), kernelName + " source offset is misaligned (" + std::to_string(factor) + ")");
    209203        }
     
    236230    */
    237231
    238     Type * const fieldWidthTy = getIntNTy(fieldWidth);
    239 
    240     Value * n = buf->getStreamSetCount(this, getStreamHandle(name));
    241 
    242     if (isConstantOne(n) || fieldWidth == blockWidth || (isConstantZero(targetOffset) && isConstantZero(sourceOffset))) {
     232    Value * const n = buffer->getStreamSetCount(this, getStreamHandle(name));
     233    if (((isConstantOne(n) && fieldWidth >= 8) || fieldWidth == blockWidth || (isConstantZero(targetOffset) && isConstantZero(sourceOffset)))) {
    243234        if (LLVM_LIKELY(itemWidth < 8)) {
    244235            itemsToCopy = CreateUDivCeil(itemsToCopy, getSize(8 / itemWidth));
     
    249240            itemsToCopy = CreateMul(itemsToCopy, n);
    250241        }
    251         PointerType * const ptrTy = fieldWidthTy->getPointerTo();
     242        PointerType * const ptrTy = getIntNTy(fieldWidth)->getPointerTo();
    252243        target = CreateGEP(CreatePointerCast(target, ptrTy), targetOffset);
    253244        source = CreateGEP(CreatePointerCast(source, ptrTy), sourceOffset);
     245        const auto alignment = (fieldWidth + 7) / 8;
    254246        CreateMemCpy(target, source, itemsToCopy, alignment);
    255247
    256248    } else { // either the target offset or source offset is non-zero but not both
    257 
    258249        VectorType * const blockTy = getBitBlockType();
    259250        PointerType * const blockPtrTy = blockTy->getPointerTo();
    260 
    261         target = CreatePointerCast(target, blockPtrTy, "target");
    262         source = CreatePointerCast(source, blockPtrTy, "source");
    263 
    264         assert ((blockWidth % fieldWidth) == 0);
    265 
    266         VectorType * const shiftTy = VectorType::get(fieldWidthTy, blockWidth / fieldWidth);
    267         Constant * const width = getSize(blockWidth / itemWidth);
     251        Constant * const BLOCK_WIDTH = getSize(blockWidth);
     252        target = CreatePointerCast(target, blockPtrTy);
     253        target = CreateGEP(target, CreateUDiv(targetOffset, BLOCK_WIDTH));
     254        source = CreatePointerCast(source, blockPtrTy);
     255        source = CreateGEP(source, CreateUDiv(sourceOffset, BLOCK_WIDTH));
     256        const auto alignment = blockWidth / 8;
    268257        Constant * const ZERO = getSize(0);
    269258        Constant * const ONE = getSize(1);
     259
    270260        BasicBlock * const entry = GetInsertBlock();
     261
     262        // TODO: this code isn't correct. I was hoping to shift by fieldwidth units to give LLVM
     263        // the ability to better select
    271264
    272265        if (isConstantZero(targetOffset)) {
     
    285278                                    2  |FFeee|GGfff|HHggg|    h|
    286279                                    3  |JJiii|KKjjj|LLkkk|    l|
    287              */
    288 
    289             Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, width), n);
    290             Value * const offset = CreateURem(sourceOffset, width);
    291             Value * const offsetVector = simd_fill(fieldWidth, CreateTrunc(offset, fieldWidthTy));
    292             Value * const remaining = CreateSub(width, offset);
    293             Value * const remainingVector = simd_fill(fieldWidth, CreateTrunc(remaining, fieldWidthTy));
    294 
    295             BasicBlock * const streamCopy = CreateBasicBlock(name + "PullCopy");
    296             BasicBlock * const streamCopyRemaining = CreateBasicBlock(name + "PullCopyRemaining");
    297             BasicBlock * const streamCopyEnd = CreateBasicBlock(name + "PullCopyEnd");
    298 
    299             CreateCondBr(CreateICmpNE(blocksToCopy, ZERO), streamCopy, streamCopyRemaining);
     280            */
     281
     282            sourceOffset = CreateURem(sourceOffset, BLOCK_WIDTH);
     283
     284            Value * const borrowOffset = CreateSub(BLOCK_WIDTH, sourceOffset);
     285            BasicBlock * const streamCopy = CreateBasicBlock();
     286            BasicBlock * const streamCopyRemainingCond = CreateBasicBlock();
     287            BasicBlock * const streamCopyRemaining = CreateBasicBlock();
     288            BasicBlock * const streamCopyEnd = CreateBasicBlock();
     289
     290            Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, BLOCK_WIDTH), n);
     291            CreateCondBr(CreateICmpNE(blocksToCopy, ZERO), streamCopy, streamCopyRemainingCond);
    300292
    301293            SetInsertPoint(streamCopy);
    302294            PHINode * const i = CreatePHI(getSizeTy(), 2);
    303295            i->addIncoming(n, entry);
    304             Value * prior = CreateAlignedLoad(CreateGEP(source, CreateSub(i, n)), alignment);
    305             prior = CreateBitCast(CreateLShr(CreateBitCast(prior, shiftTy), offsetVector), blockTy);
    306             Value * value = CreateAlignedLoad(CreateGEP(source, i), alignment);
    307             value = CreateBitCast(CreateShl(CreateBitCast(value, shiftTy), remainingVector), blockTy);
    308             CreateAlignedStore(CreateOr(value, prior), CreateGEP(target, i), alignment);
     296            Value * Ai = CreateAlignedLoad(CreateGEP(source, CreateSub(i, n)), alignment);
     297            Ai = mvmd_srl(fieldWidth, Ai, borrowOffset);
     298            Value * Bi = CreateAlignedLoad(CreateGEP(source, i), alignment);
     299            Bi = mvmd_sll(fieldWidth, Bi, sourceOffset);
     300            CreateAlignedStore(CreateOr(Bi, Ai), CreateGEP(target, i), alignment);
    309301            Value * const next_i = CreateAdd(i, ONE);
    310302            i->addIncoming(next_i, streamCopy);
    311             CreateCondBr(CreateICmpNE(next_i, blocksToCopy), streamCopy, streamCopyRemaining);
     303            CreateCondBr(CreateICmpNE(next_i, blocksToCopy), streamCopy, streamCopyRemainingCond);
     304
     305            SetInsertPoint(streamCopyRemainingCond);
     306            Value * const partialBlocksToCopy = CreateAdd(blocksToCopy, n);
     307            Value * const remainingItemsToCopy = CreateURem(itemsToCopy, BLOCK_WIDTH);
     308            CreateLikelyCondBr(CreateIsNotNull(remainingItemsToCopy), streamCopyRemaining, streamCopyEnd);
    312309
    313310            SetInsertPoint(streamCopyRemaining);
    314311            PHINode * const j = CreatePHI(getSizeTy(), 2);
    315             j->addIncoming(blocksToCopy, entry);
    316             j->addIncoming(blocksToCopy, streamCopy);
    317             Value * final = CreateAlignedLoad(CreateGEP(source, j), alignment);
    318             final = CreateBitCast(CreateLShr(CreateBitCast(final, shiftTy), offsetVector), blockTy);
    319             CreateAlignedStore(final, CreateGEP(target, j), alignment);
     312            j->addIncoming(blocksToCopy, streamCopyRemainingCond);
     313            Value * Aj = CreateAlignedLoad(CreateGEP(source, j), alignment);
     314            Aj = mvmd_srl(fieldWidth, Aj, borrowOffset);
     315            CreateAlignedStore(Aj, CreateGEP(target, j), alignment);
    320316            Value * const next_j = CreateAdd(j, ONE);
    321317            j->addIncoming(next_j, streamCopyRemaining);
    322             CreateCondBr(CreateICmpNE(next_j, CreateAdd(blocksToCopy, n)), streamCopyRemaining, streamCopyEnd);
     318            CreateCondBr(CreateICmpNE(next_j, partialBlocksToCopy), streamCopyRemaining, streamCopyEnd);
    323319
    324320            SetInsertPoint(streamCopyEnd);
     
    336332
    337333                                          A     B     C     D
    338                TARGET STREAM        1  |aa---|bbAAA|ccBBB| dCCC|
    339                                     2  |ee---|ffEEE|ggFFF| hGGG|
    340                                     3  |ii---|jjIII|kkJJJ| lKKK|
     334               TARGET STREAM        1  |--XXX|-----|-----|-----|
     335                                    2  |--YYY|-----|-----|-----|
     336                                    3  |--ZZZ|-----|-----|-----|
     337
     338                                          A     B     C     D
     339               OUTPUT STREAM        1  |aaXXX|bbAAA|ccBBB| dCCC|
     340                                    2  |eeYYY|ffEEE|ggFFF| hGGG|
     341                                    3  |iiZZZ|jjIII|kkJJJ| lKKK|
    341342
    342343            */
    343344
    344             BasicBlock * const streamCopy = CreateBasicBlock(name + "PushCopy");
    345             BasicBlock * const streamCopyRemainingCond = CreateBasicBlock(name + "PushCopyRemainingCond");
    346             BasicBlock * const streamCopyRemaining = CreateBasicBlock(name + "PushCopyRemaining");
    347             BasicBlock * const streamCopyEnd = CreateBasicBlock(name + "PushCopyEnd");
    348 
    349             Value * const pos = CreateURem(targetOffset, width);
    350             Value * const copied = CreateSub(width, pos);
    351             Value * const copiedVector = simd_fill(fieldWidth, CreateTrunc(copied, fieldWidthTy));
    352             Value * const mask = CreateLShr(Constant::getAllOnesValue(shiftTy), copiedVector);
    353             Value * const offsetVector = simd_fill(fieldWidth, CreateTrunc(pos, fieldWidthTy));
    354 
     345            BasicBlock * const streamCopy = CreateBasicBlock();
     346            BasicBlock * const streamCopyRemainingCond = CreateBasicBlock();
     347            BasicBlock * const streamCopyRemaining = CreateBasicBlock();
     348            BasicBlock * const streamCopyEnd = CreateBasicBlock();
     349
     350            targetOffset = CreateURem(targetOffset, BLOCK_WIDTH);
     351
     352            Value * const carryOffset = CreateSub(BLOCK_WIDTH, targetOffset);
     353            Value * const mask = mvmd_srl(fieldWidth, Constant::getAllOnesValue(blockTy), carryOffset);
    355354            CreateBr(streamCopy);
    356355
     
    358357            PHINode * const i = CreatePHI(getSizeTy(), 2);
    359358            i->addIncoming(ZERO, entry);
    360             Value * priorTargetValue = CreateAlignedLoad(CreateGEP(target, i), alignment);
    361             priorTargetValue = CreateBitCast(CreateAnd(CreateBitCast(priorTargetValue, shiftTy), mask), blockTy);
    362             Value * sourceValue = CreateAlignedLoad(CreateGEP(source, i), alignment);
    363             sourceValue = CreateBitCast(CreateShl(CreateBitCast(sourceValue, shiftTy), offsetVector), blockTy);
    364             CreateAlignedStore(CreateOr(sourceValue, priorTargetValue), CreateGEP(target, i), alignment);
     359            Value * A0 = CreateAlignedLoad(CreateGEP(target, i), alignment);
     360            A0 = CreateAnd(A0, mask);
     361            Value * Ai = CreateAlignedLoad(CreateGEP(source, i), alignment);
     362            Ai = mvmd_sll(fieldWidth, Ai, targetOffset);
     363            CreateAlignedStore(CreateOr(Ai, A0), CreateGEP(target, i), alignment);
    365364            Value * const next_i = CreateAdd(i, ONE);
    366365            i->addIncoming(next_i, streamCopy);
     
    368367
    369368            SetInsertPoint(streamCopyRemainingCond);
    370             Value * const blocksToCopy = CreateMul(CreateUDiv(CreateSub(itemsToCopy, copied), width), n);
    371             CreateCondBr(CreateICmpULT(copied, itemsToCopy), streamCopyRemaining, streamCopyEnd);
     369            Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, BLOCK_WIDTH), n);
     370            CreateCondBr(CreateICmpUGT(blocksToCopy, n), streamCopyRemaining, streamCopyEnd);
    372371
    373372            SetInsertPoint(streamCopyRemaining);
    374373            PHINode * const j = CreatePHI(getSizeTy(), 2);
    375374            j->addIncoming(n, streamCopyRemainingCond);
    376             Value * prior = CreateAlignedLoad(CreateGEP(source, CreateSub(j, n)), alignment);
    377             prior = CreateBitCast(CreateShl(CreateBitCast(prior, shiftTy), offsetVector), blockTy);
    378             Value * value = CreateAlignedLoad(CreateGEP(source, j), alignment);
    379             value = CreateBitCast(CreateLShr(CreateBitCast(value, shiftTy), copiedVector), blockTy);
    380             CreateAlignedStore(CreateOr(value, prior), CreateGEP(target, j), alignment);
     375            Value * Aj = CreateAlignedLoad(CreateGEP(source, CreateSub(j, n)), alignment);
     376            Aj = mvmd_srl(fieldWidth, Aj, carryOffset);
     377            Value * Bj = CreateAlignedLoad(CreateGEP(source, j), alignment);
     378            Bj = mvmd_sll(fieldWidth, Bj, targetOffset);
     379            CreateAlignedStore(CreateOr(Bj, Aj), CreateGEP(target, j), alignment);
    381380            Value * const next_j = CreateAdd(j, ONE);
    382381            j->addIncoming(next_j, streamCopyRemaining);
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.h

    r5856 r5865  
    123123    llvm::Value * getLinearlyWritableItems(const std::string & name, llvm::Value * fromPos, bool reverse = false);
    124124   
    125     void CreateStreamCpy(const std::string & name, llvm::Value * const target, llvm::Value * const targetOffset, llvm::Value * const source, llvm::Value * const sourceOffset, llvm::Value * const itemsToCopy, const unsigned itemAlignment);   
     125    void CreateStreamCpy(const std::string & name, llvm::Value * const target, llvm::Value * const targetOffset, llvm::Value * const source, llvm::Value * const sourceOffset, llvm::Value * const itemsToCopyFromOffset, const unsigned itemAlignment);
    126126
    127127    llvm::BasicBlock * CreateConsumerWait();
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_block_decoder.cpp

    r5864 r5865  
    4343
    4444void LZ4BlockDecoderKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {
    45     BasicBlock * entry_block = iBuilder->GetInsertBlock();
     45//    BasicBlock * entry_block = iBuilder->GetInsertBlock();
    4646//    iBuilder->CallPrintInt("block_available", iBuilder->getAvailableItemCount("byteStream"));
    4747    BasicBlock * exit_block = iBuilder->CreateBasicBlock("exit");
    4848
    49     BasicBlock * assert_fail_block = iBuilder->CreateBasicBlock("assert_fail_block");
    50     BasicBlock * real_entry_block = iBuilder->CreateBasicBlock("real_entry_block");
     49//    BasicBlock * assert_fail_block = iBuilder->CreateBasicBlock("assert_fail_block");
     50//    BasicBlock * real_entry_block = iBuilder->CreateBasicBlock("real_entry_block");
    5151
    5252    Value* hasSkipHeader = iBuilder->getScalarField("hasSkipHeader");
     
    152152    }
    153153
    154     Value* LZ4BlockDecoderKernel::appendOutput(const std::unique_ptr<KernelBuilder> & iBuilder, Value* isCompressed, Value* blockStart, Value* blockEnd) {
     154    void LZ4BlockDecoderKernel::appendOutput(const std::unique_ptr<KernelBuilder> & iBuilder, Value* isCompressed, Value* blockStart, Value* blockEnd) {
    155155        // TODO adjust output storing
    156156        this->generateStoreCircularOutput(iBuilder, "isCompressed", iBuilder->getInt1Ty()->getPointerTo(), isCompressed);
     
    176176
    177177    size_t LZ4BlockDecoderKernel::getOutputBufferSize(const unique_ptr<KernelBuilder> &iBuilder, const string& bufferName) {
    178         size_t s = this->getOutputStreamSetBuffer(bufferName)->getBufferBlocks();
     178//        size_t s = this->getOutputStreamSetBuffer(bufferName)->getBufferBlocks();
    179179        return this->getOutputStreamSetBuffer(bufferName)->getBufferBlocks() * iBuilder->getStride();
    180180    }
  • icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_block_decoder.h

    r5864 r5865  
    3434    llvm::Value *generateLoadInput(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value *offset);
    3535
    36     llvm::Value *appendOutput(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value *isCompressed, llvm::Value *blockStart, llvm::Value *blockEnd);
     36    void appendOutput(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value *isCompressed, llvm::Value *blockStart, llvm::Value *blockEnd);
    3737
    3838    void generateStoreCircularOutput(const std::unique_ptr<KernelBuilder> &iBuilder, const std::string& outputBufferName,
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r5857 r5865  
    1414PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width, std::string name)
    1515: MultiBlockKernel(name + "",
    16                   {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", BoundedRate(0, 1)},
     16                  {Binding{kb->getStreamSetTy(), "PDEPmarkerStream"},
    1717                   Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", BoundedRate(0, 1)}},
    18                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet", RateEqualTo("PDEPmarkerStream")}},
     18                  {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
    1919                  {}, {}, {})
    2020, mSwizzleFactor(swizzleFactor)
     
    154154
    155155    kb->SetInsertPoint(terminate);
    156     Value * itemsDone = kb->CreateMul(blockOffsetPhi, blockWidth);
    157     itemsDone = kb->CreateSelect(kb->CreateICmpULT(itemsToDo, itemsDone), itemsToDo, itemsDone);
    158     kb->setProcessedItemCount("PDEPmarkerStream", kb->CreateAdd(itemsDone, kb->getProcessedItemCount("PDEPmarkerStream")));
     156//    Value * itemsDone = kb->CreateMul(blockOffsetPhi, blockWidth);
     157//    itemsDone = kb->CreateSelect(kb->CreateICmpULT(itemsToDo, itemsDone), itemsToDo, itemsDone);
     158//    kb->setProcessedItemCount("PDEPmarkerStream", kb->CreateAdd(itemsDone, kb->getProcessedItemCount("PDEPmarkerStream")));
    159159    kb->setProcessedItemCount("sourceStreamSet", updatedProcessedSourceBitsPhi);
    160160
Note: See TracChangeset for help on using the changeset viewer.