Changeset 5586


Ignore:
Timestamp:
Jul 31, 2017, 12:46:10 PM (3 months ago)
Author:
nmedfort
Message:

Long Advance bug fix.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/pablo/carry_manager.cpp

    r5493 r5586  
    1010#include <llvm/IR/BasicBlock.h>
    1111#include <llvm/IR/DerivedTypes.h>
     12#include <llvm/Transforms/Utils/Local.h>
    1213#include <pablo/branch.h>
    1314#include <pablo/pe_advance.h>
     
    2324namespace pablo {
    2425
     26inline static bool is_power_2(const unsigned n) {
     27    return (n && ((n & (n - 1)) == 0));
     28}
     29
    2530inline static unsigned ceil_log2(const unsigned v) {
    2631    assert ("log2(0) is undefined!" && v != 0);
    27     return (sizeof(unsigned) * CHAR_BIT) - __builtin_clz(v - 1);
     32    return (sizeof(unsigned) * CHAR_BIT) - __builtin_clz(v - 1U);
    2833}
    2934
    3035inline static unsigned floor_log2(const unsigned v) {
    3136    assert ("log2(0) is undefined!" && v != 0);
    32     return ((sizeof(unsigned) * CHAR_BIT) - 1) - __builtin_clz(v);
     37    return ((sizeof(unsigned) * CHAR_BIT) - 1U) - __builtin_clz(v);
    3338}
    3439
    3540inline static unsigned nearest_pow2(const unsigned v) {
    36     assert(v > 0 && v < (UINT32_MAX / 2));
    37     return (v < 2) ? 1 : (1 << ceil_log2(v));
     41    assert (v > 0 && v < (UINT32_MAX / 2));
     42    return (v < 2) ? 1 : (1U << ceil_log2(v));
     43}
     44
     45inline static unsigned nearest_multiple(const unsigned n, const unsigned m) {
     46    assert (is_power_2(m));
     47    const unsigned r = (n + m - 1U) & -m;
     48    assert (r >= n);
     49    return r;
     50}
     51
     52inline static bool is_multiple_of(const unsigned n, const unsigned m) {
     53    return nearest_multiple(n, m) == n;
     54}
     55
     56inline static unsigned udiv(const unsigned x, const unsigned y) {
     57    assert (is_power_2(y));
     58    const unsigned z = x >> floor_log2(y);
     59    assert (z == (x / y));
     60    return z;
    3861}
    3962
     
    381404        Value * const nextCarryIn = iBuilder->CreateXor(carryIn, borrow);
    382405        Value * const nextSummary = iBuilder->CreateOr(carryInSummary, nextCarryIn);
     406
    383407        iBuilder->CreateBlockAlignedStore(nextCarryIn, carryInPtr);
    384408        carryInSummary->addIncoming(nextSummary, update);
     
    391415        Value * const carryOut = iBuilder->CreateBlockAlignedLoad(carryOutPtr);
    392416        Value * const nextCarryOut = iBuilder->CreateXor(carryOut, carry);
     417
    393418        iBuilder->CreateBlockAlignedStore(nextCarryOut, carryOutPtr);
    394419        Value * finalCarry = iBuilder->CreateAnd(carryOut, carry);
     
    593618    assert (shiftAmount >= LONG_ADVANCE_BREAKPOINT);
    594619
    595     Type * const streamTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
     620    const auto blockWidth = iBuilder->getBitBlockWidth();
     621    Type * const streamTy = iBuilder->getIntNTy(blockWidth);
     622
     623    Value * indices[3];
     624
     625    indices[0] = iBuilder->getInt32(0);
    596626
    597627    if (mIfDepth > 0) {
    598         if (shiftAmount > iBuilder->getBitBlockWidth()) {
    599             const auto frameIndex = mCurrentFrameIndex++;
     628        if (shiftAmount > blockWidth) {
     629
    600630            Value * carry = iBuilder->CreateZExt(iBuilder->bitblock_any(value), streamTy);
    601             const unsigned summarySize = ceil_udiv(shiftAmount, iBuilder->getBitBlockWidth() * iBuilder->getBitBlockWidth());
    602             for (unsigned i = 0;;++i) {
    603                 Value * const ptr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(frameIndex), iBuilder->getInt32(i)});
     631            const auto summaryBlocks = ceil_udiv(shiftAmount, blockWidth);
     632            const auto summarySize = ceil_udiv(summaryBlocks, blockWidth);
     633            VectorType * const bitBlockTy = iBuilder->getBitBlockType();
     634            IntegerType * const laneTy = cast<IntegerType>(bitBlockTy->getVectorElementType());
     635            const auto laneWidth = laneTy->getIntegerBitWidth();
     636
     637            assert (summarySize > 0);
     638            assert (is_power_2(laneWidth));
     639
     640            indices[1] = iBuilder->getInt32(mCurrentFrameIndex++);
     641
     642            for (unsigned i = 1;;++i) {
     643
     644                assert (i <= summarySize);
     645
     646                indices[2] = iBuilder->getInt32(i - 1);
     647
     648                Value * const ptr = iBuilder->CreateGEP(mCurrentFrame, indices);
    604649                Value * const prior = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(ptr), streamTy);
    605                 Value * const stream = iBuilder->CreateBitCast(iBuilder->CreateOr(iBuilder->CreateShl(prior, 1), carry), iBuilder->getBitBlockType());
     650
     651                Value * advanced = nullptr;
     652                if (LLVM_LIKELY(summaryBlocks < laneWidth)) {
     653                    advanced = iBuilder->CreateOr(iBuilder->CreateShl(prior, 1), carry);
     654                    carry = iBuilder->CreateLShr(prior, summaryBlocks - 1);
     655                } else {
     656                    std::tie(advanced, carry) = iBuilder->bitblock_advance(prior, carry, 1);
     657                }
     658                Value * stream = iBuilder->CreateBitCast(advanced, bitBlockTy);
    606659                if (LLVM_LIKELY(i == summarySize)) {
    607                     Value * const maskedStream = iBuilder->CreateAnd(stream, iBuilder->bitblock_mask_from(iBuilder->getInt32(summarySize % iBuilder->getBitBlockWidth())));
    608                     addToCarryOutSummary(iBuilder, maskedStream);
    609                     iBuilder->CreateBlockAlignedStore(maskedStream, ptr);
     660                    const auto n = bitBlockTy->getVectorNumElements();
     661                    Constant * mask[n];                                       
     662                    const auto m = udiv(summaryBlocks, laneWidth);
     663                    if (m) {
     664                        std::fill_n(mask, m, ConstantInt::getAllOnesValue(laneTy));
     665                    }
     666                    mask[m] = ConstantInt::get(laneTy, (1UL << (summaryBlocks & (laneWidth - 1))) - 1UL);
     667                    if (n > m) {
     668                        std::fill_n(mask + m + 1, n - m, UndefValue::get(laneTy));
     669                    }
     670                    stream = iBuilder->CreateAnd(stream, ConstantVector::get(ArrayRef<Constant *>(mask, n)));
     671
     672                    addToCarryOutSummary(iBuilder, stream);
     673                    iBuilder->CreateBlockAlignedStore(stream, ptr);
     674                    RecursivelyDeleteTriviallyDeadInstructions(carry);
    610675                    break;
    611676                }
    612677                addToCarryOutSummary(iBuilder, stream);
    613678                iBuilder->CreateBlockAlignedStore(stream, ptr);
    614                 carry = iBuilder->CreateLShr(prior, iBuilder->getBitBlockWidth() - 1);
    615679            }
     680
    616681        } else if (LLVM_LIKELY(mCarryInfo->hasExplicitSummary())) {
    617682            addToCarryOutSummary(iBuilder, value);
    618683        }
    619684    }
    620     const auto frameIndex = mCurrentFrameIndex++;
     685
     686    indices[1] = iBuilder->getInt32(mCurrentFrameIndex++);
     687
    621688    // special case using a single buffer entry and the carry_out value.
    622     if (LLVM_LIKELY((shiftAmount < iBuilder->getBitBlockWidth()) && (mLoopDepth == 0))) {
    623         Value * const buffer = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(frameIndex), iBuilder->getInt32(0)});
     689    if (LLVM_LIKELY((shiftAmount < blockWidth) && (mLoopDepth == 0))) {
     690
     691        indices[2] = indices[0]; // iBuilder->getInt32(0)
     692        assert (cast<ConstantInt>(indices[2])->isNullValue());
     693
     694        Value * const buffer = iBuilder->CreateGEP(mCurrentFrame, indices);
    624695        assert (buffer->getType()->getPointerElementType() == iBuilder->getBitBlockType());
    625696        Value * carryIn = iBuilder->CreateBlockAlignedLoad(buffer);
     697
    626698        iBuilder->CreateBlockAlignedStore(value, buffer);
    627699        /* Very special case - no combine */
    628         if (LLVM_UNLIKELY(shiftAmount == iBuilder->getBitBlockWidth())) {
     700        if (LLVM_UNLIKELY(shiftAmount == blockWidth)) {
    629701            return iBuilder->CreateBitCast(carryIn, iBuilder->getBitBlockType());
    630702        }
    631         Value* block0_shr = iBuilder->CreateLShr(iBuilder->CreateBitCast(carryIn, streamTy), iBuilder->getBitBlockWidth() - shiftAmount);
     703        Value* block0_shr = iBuilder->CreateLShr(iBuilder->CreateBitCast(carryIn, streamTy), blockWidth - shiftAmount);
    632704        Value* block1_shl = iBuilder->CreateShl(iBuilder->CreateBitCast(value, streamTy), shiftAmount);
    633705        return iBuilder->CreateBitCast(iBuilder->CreateOr(block1_shl, block0_shr), iBuilder->getBitBlockType());
    634706    } else { //
    635         const unsigned blockShift = shiftAmount % iBuilder->getBitBlockWidth();
    636         const unsigned blocks = ceil_udiv(shiftAmount, iBuilder->getBitBlockWidth());
     707        const unsigned blockShift = shiftAmount & (blockWidth - 1);
     708        const unsigned summaryBlocks = ceil_udiv(shiftAmount, blockWidth);
     709
    637710        // Create a mask to implement circular buffer indexing
    638         Value * indexMask = iBuilder->getSize(nearest_pow2(blocks + ((mLoopDepth != 0) ? 1 : 0)) - 1);
     711        Value * indexMask = iBuilder->getSize(nearest_pow2(summaryBlocks) - 1);
    639712        Value * blockIndex = iBuilder->getScalarField("CarryBlockIndex");
    640         Value * carryIndex0 = iBuilder->CreateSub(blockIndex, iBuilder->getSize(blocks));
    641         Value * loadIndex0 = iBuilder->CreateAnd(carryIndex0, indexMask);
    642         Value * const carryInPtr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(frameIndex), loadIndex0});
     713
     714        Value * carryIndex0 = iBuilder->CreateSub(blockIndex, iBuilder->getSize(summaryBlocks));
     715        indices[2] = iBuilder->CreateAnd(carryIndex0, indexMask);
     716        Value * const carryInPtr = iBuilder->CreateGEP(mCurrentFrame, indices);
    643717        Value * carryIn = iBuilder->CreateBlockAlignedLoad(carryInPtr);
    644718
    645         Value * storeIndex = iBuilder->CreateAnd(blockIndex, indexMask);
    646         Value * const carryOutPtr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(frameIndex), storeIndex});
     719        indices[2] = iBuilder->CreateAnd(blockIndex, indexMask);
     720        Value * const carryOutPtr = iBuilder->CreateGEP(mCurrentFrame, indices);
    647721        assert (carryIn->getType() == iBuilder->getBitBlockType());
    648722
     
    653727            return carryIn;
    654728        } else { // Otherwise we need to combine data from the two oldest blocks.
    655             Value * carryIndex1 = iBuilder->CreateSub(blockIndex, iBuilder->getSize(blocks - 1));
    656             Value * loadIndex1 = iBuilder->CreateAnd(carryIndex1, indexMask);
    657             Value * const carryInPtr2 = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(frameIndex), loadIndex1});
    658             Value * carry_block1 = iBuilder->CreateBlockAlignedLoad(carryInPtr2);
    659             Value * block0_shr = iBuilder->CreateLShr(iBuilder->CreateBitCast(carryIn, streamTy), iBuilder->getBitBlockWidth() - blockShift);
    660             Value * block1_shl = iBuilder->CreateShl(iBuilder->CreateBitCast(carry_block1, streamTy), blockShift);
     729            Value * const carryIndex1 = iBuilder->CreateSub(blockIndex, iBuilder->getSize(summaryBlocks - 1));
     730            indices[2] = iBuilder->CreateAnd(carryIndex1, indexMask);
     731
     732            Value * const carryInPtr2 = iBuilder->CreateGEP(mCurrentFrame, indices);
     733            Value * const carryIn2 = iBuilder->CreateBlockAlignedLoad(carryInPtr2);
     734
    661735            iBuilder->CreateBlockAlignedStore(value, carryOutPtr);
    662             return iBuilder->CreateBitCast(iBuilder->CreateOr(block1_shl, block0_shr), iBuilder->getBitBlockType());
     736
     737            Value * const b0 = iBuilder->CreateLShr(iBuilder->CreateBitCast(carryIn, streamTy), blockWidth - blockShift);
     738            Value * const b1 = iBuilder->CreateShl(iBuilder->CreateBitCast(carryIn2, streamTy), blockShift);
     739            return iBuilder->CreateBitCast(iBuilder->CreateOr(b1, b0), iBuilder->getBitBlockType());
    663740        }
    664741    }
     
    717794        while (frameTy->isStructTy()) {
    718795            ++count;
     796            assert (frameTy->getStructNumElements() > 0);
    719797            frameTy = frameTy->getStructElementType(0);
    720798        }
     
    807885}
    808886
     887static bool hasNonEmptyCarryStruct(const Type * const frameTy) {
     888    if (frameTy->isStructTy()) {
     889        for (unsigned i = 0; i < frameTy->getStructNumElements(); ++i) {
     890            if (hasNonEmptyCarryStruct(frameTy->getStructElementType(i))) {
     891                return true;
     892            }
     893        }
     894        return false;
     895    }
     896    return true;
     897}
     898
     899static bool hasNonEmptyCarryStruct(const std::vector<Type *> & state) {
     900    for (const Type * const frameTy : state) {
     901        if (hasNonEmptyCarryStruct(frameTy)) {
     902            return true;
     903        }
     904    }
     905    return false;
     906}
     907
    809908/** ------------------------------------------------------------------------------------------------------------- *
    810909 * @brief analyse
     
    829928            Type * type = carryPackType;
    830929            if (LLVM_UNLIKELY(amount >= LONG_ADVANCE_BREAKPOINT)) {
    831                 const unsigned blocks = ceil_udiv(amount, iBuilder->getBitBlockWidth());
     930                const auto blockWidth = iBuilder->getBitBlockWidth();
     931                const auto blocks = ceil_udiv(amount, blockWidth);
    832932                type = ArrayType::get(blockTy, nearest_pow2(blocks + ((loopDepth != 0) ? 1 : 0)));
    833                 if (LLVM_UNLIKELY(ifDepth > 0 && amount > iBuilder->getBitBlockWidth())) {
     933                if (LLVM_UNLIKELY(ifDepth > 0 && blocks != 1)) {
     934                    const auto summarySize = ceil_udiv(blocks, blockWidth);
    834935                    // 1 bit will mark the presense of any bit in each block.
    835                     Type * carryType = ArrayType::get(blockTy, ceil_udiv(amount, std::pow(iBuilder->getBitBlockWidth(), 2)));
    836                     state.push_back(carryType);
     936                    state.push_back(ArrayType::get(blockTy, summarySize));
    837937                }
    838938                mHasLongAdvance = true;               
     
    855955        carryState = StructType::get(iBuilder->getContext());
    856956    } else {
    857         if (dyn_cast_or_null<If>(scope->getBranch()) || nonCarryCollapsingMode || isNestedWithinNonCarryCollapsingLoop) {
    858             if (LLVM_LIKELY(state.size() > 1)) {
    859                 summaryType = CarryData::ExplicitSummary;
    860                 // NOTE: summaries are stored differently depending whether we're entering an If or While branch. With an If branch, they
    861                 // preceed the carry state data and with a While loop they succeed it. This is to help cache prefectching performance.
    862                 state.insert(isa<If>(scope->getBranch()) ? state.begin() : state.end(), carryPackType);
    863             } else {
    864                 summaryType = CarryData::ImplicitSummary;
    865                 if (state[0]->isStructTy()) {
    866                     summaryType = CarryData::BorrowedSummary;
     957        // do we have a summary or a sequence of nested empty structs?
     958        if (hasNonEmptyCarryStruct(state)) {
     959            if (dyn_cast_or_null<If>(scope->getBranch()) || nonCarryCollapsingMode || isNestedWithinNonCarryCollapsingLoop) {
     960                if (LLVM_LIKELY(state.size() > 1)) {
     961                    summaryType = CarryData::ExplicitSummary;
     962                    // NOTE: summaries are stored differently depending whether we're entering an If or While branch. With an If branch, they
     963                    // preceed the carry state data and with a While loop they succeed it. This is to help cache prefectching performance.
     964                    state.insert(isa<If>(scope->getBranch()) ? state.begin() : state.end(), carryPackType);
     965                } else {
     966                    summaryType = CarryData::ImplicitSummary;
     967                    if (hasNonEmptyCarryStruct(state[0])) {
     968                        summaryType = CarryData::BorrowedSummary;
     969                    }
    867970                }
    868             }           
     971            }
    869972        }
    870973        carryState = StructType::get(iBuilder->getContext(), state);
Note: See TracChangeset for help on using the changeset viewer.