Ignore:
Timestamp:
Feb 6, 2018, 4:57:35 PM (16 months ago)
Author:
nmedfort
Message:

More work on the pipeline I/O rate handling

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/kernel_builder.cpp

    r5856 r5865  
    164164}
    165165
    166 /** ------------------------------------------------------------------------------------------------------------- *
    167  * @brief getFieldWidth
    168  ** ------------------------------------------------------------------------------------------------------------- */
    169 inline unsigned getFieldWidth(const unsigned bitWidth, const unsigned blockWidth) {
    170     for (unsigned k = 16; k <= blockWidth; k *= 2) {
    171         if ((bitWidth & (k - 1)) != 0) {
    172             return k / 2;
    173         }
    174     }
    175     return blockWidth;
     166inline static unsigned ceil_log2(const unsigned v) {
     167    assert ("log2(0) is undefined!" && v != 0);
     168    return (sizeof(unsigned) * CHAR_BIT) - __builtin_clz(v - 1U);
    176169}
    177170
     
    186179    assert (target->getType()->isPointerTy());
    187180    assert (isConstantZero(targetOffset) || isConstantZero(sourceOffset));
    188 
    189     const StreamSetBuffer * const buf = mKernel->getAnyStreamSetBuffer(name);
    190 
    191     const auto itemWidth = getItemWidth(buf->getBaseType());
     181    const StreamSetBuffer * const buffer = mKernel->getAnyStreamSetBuffer(name);
     182    const auto itemWidth = getItemWidth(buffer->getBaseType());
    192183    assert ("invalid item width" && is_power_2(itemWidth));
    193184    const auto blockWidth = getBitBlockWidth();
    194185    // Although our item width may be n bits, if we know we're always processing m items per block, our field width
    195186    // (w.r.t the stream copy) would be n*m. By taking this into account we can optimize and simplify the copy code.
    196     const auto fieldWidth = getFieldWidth(itemWidth * itemAlignment, blockWidth);
    197     const auto alignment = (fieldWidth + 7) / 8;
     187    const auto fieldWidth = std::min(1U << ceil_log2(itemWidth * itemAlignment), blockWidth);
     188    assert ((blockWidth % fieldWidth) == 0);
    198189
    199190    if (LLVM_LIKELY(itemWidth < fieldWidth)) {
     
    201192        Constant * const FACTOR = getSize(factor);
    202193        if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    203             ConstantInt * const ALIGNMENT = getSize(alignment);
    204194            const auto kernelName = mKernel->getName()+ ": " + name;
    205             CreateAssertZero(CreateURem(CreatePtrToInt(target, getSizeTy()), ALIGNMENT), kernelName + " target is misaligned (" + std::to_string(alignment) + ")");
     195            if (fieldWidth > 8) {
     196                const auto alignment = (fieldWidth + 7) / 8;
     197                ConstantInt * const ALIGNMENT = getSize(alignment);
     198                CreateAssertZero(CreateURem(CreatePtrToInt(target, getSizeTy()), ALIGNMENT), kernelName + " target is misaligned (" + std::to_string(alignment) + ")");
     199                CreateAssertZero(CreateURem(CreatePtrToInt(source, getSizeTy()), ALIGNMENT), kernelName + " source is misaligned (" + std::to_string(alignment) + ")");
     200            }
    206201            CreateAssertZero(CreateURem(targetOffset, FACTOR), kernelName + " target offset is misaligned (" + std::to_string(factor) + ")");
    207             CreateAssertZero(CreateURem(CreatePtrToInt(source, getSizeTy()), ALIGNMENT), kernelName + " source is misaligned (" + std::to_string(alignment) + ")");
    208202            CreateAssertZero(CreateURem(sourceOffset, FACTOR), kernelName + " source offset is misaligned (" + std::to_string(factor) + ")");
    209203        }
     
    236230    */
    237231
    238     Type * const fieldWidthTy = getIntNTy(fieldWidth);
    239 
    240     Value * n = buf->getStreamSetCount(this, getStreamHandle(name));
    241 
    242     if (isConstantOne(n) || fieldWidth == blockWidth || (isConstantZero(targetOffset) && isConstantZero(sourceOffset))) {
     232    Value * const n = buffer->getStreamSetCount(this, getStreamHandle(name));
     233    if (((isConstantOne(n) && fieldWidth >= 8) || fieldWidth == blockWidth || (isConstantZero(targetOffset) && isConstantZero(sourceOffset)))) {
    243234        if (LLVM_LIKELY(itemWidth < 8)) {
    244235            itemsToCopy = CreateUDivCeil(itemsToCopy, getSize(8 / itemWidth));
     
    249240            itemsToCopy = CreateMul(itemsToCopy, n);
    250241        }
    251         PointerType * const ptrTy = fieldWidthTy->getPointerTo();
     242        PointerType * const ptrTy = getIntNTy(fieldWidth)->getPointerTo();
    252243        target = CreateGEP(CreatePointerCast(target, ptrTy), targetOffset);
    253244        source = CreateGEP(CreatePointerCast(source, ptrTy), sourceOffset);
     245        const auto alignment = (fieldWidth + 7) / 8;
    254246        CreateMemCpy(target, source, itemsToCopy, alignment);
    255247
    256248    } else { // either the target offset or source offset is non-zero but not both
    257 
    258249        VectorType * const blockTy = getBitBlockType();
    259250        PointerType * const blockPtrTy = blockTy->getPointerTo();
    260 
    261         target = CreatePointerCast(target, blockPtrTy, "target");
    262         source = CreatePointerCast(source, blockPtrTy, "source");
    263 
    264         assert ((blockWidth % fieldWidth) == 0);
    265 
    266         VectorType * const shiftTy = VectorType::get(fieldWidthTy, blockWidth / fieldWidth);
    267         Constant * const width = getSize(blockWidth / itemWidth);
     251        Constant * const BLOCK_WIDTH = getSize(blockWidth);
     252        target = CreatePointerCast(target, blockPtrTy);
     253        target = CreateGEP(target, CreateUDiv(targetOffset, BLOCK_WIDTH));
     254        source = CreatePointerCast(source, blockPtrTy);
     255        source = CreateGEP(source, CreateUDiv(sourceOffset, BLOCK_WIDTH));
     256        const auto alignment = blockWidth / 8;
    268257        Constant * const ZERO = getSize(0);
    269258        Constant * const ONE = getSize(1);
     259
    270260        BasicBlock * const entry = GetInsertBlock();
     261
     262        // TODO: this code isn't correct. I was hoping to shift by fieldwidth units to give LLVM
     263        // the ability to better select
    271264
    272265        if (isConstantZero(targetOffset)) {
     
    285278                                    2  |FFeee|GGfff|HHggg|    h|
    286279                                    3  |JJiii|KKjjj|LLkkk|    l|
    287              */
    288 
    289             Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, width), n);
    290             Value * const offset = CreateURem(sourceOffset, width);
    291             Value * const offsetVector = simd_fill(fieldWidth, CreateTrunc(offset, fieldWidthTy));
    292             Value * const remaining = CreateSub(width, offset);
    293             Value * const remainingVector = simd_fill(fieldWidth, CreateTrunc(remaining, fieldWidthTy));
    294 
    295             BasicBlock * const streamCopy = CreateBasicBlock(name + "PullCopy");
    296             BasicBlock * const streamCopyRemaining = CreateBasicBlock(name + "PullCopyRemaining");
    297             BasicBlock * const streamCopyEnd = CreateBasicBlock(name + "PullCopyEnd");
    298 
    299             CreateCondBr(CreateICmpNE(blocksToCopy, ZERO), streamCopy, streamCopyRemaining);
     280            */
     281
     282            sourceOffset = CreateURem(sourceOffset, BLOCK_WIDTH);
     283
     284            Value * const borrowOffset = CreateSub(BLOCK_WIDTH, sourceOffset);
     285            BasicBlock * const streamCopy = CreateBasicBlock();
     286            BasicBlock * const streamCopyRemainingCond = CreateBasicBlock();
     287            BasicBlock * const streamCopyRemaining = CreateBasicBlock();
     288            BasicBlock * const streamCopyEnd = CreateBasicBlock();
     289
     290            Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, BLOCK_WIDTH), n);
     291            CreateCondBr(CreateICmpNE(blocksToCopy, ZERO), streamCopy, streamCopyRemainingCond);
    300292
    301293            SetInsertPoint(streamCopy);
    302294            PHINode * const i = CreatePHI(getSizeTy(), 2);
    303295            i->addIncoming(n, entry);
    304             Value * prior = CreateAlignedLoad(CreateGEP(source, CreateSub(i, n)), alignment);
    305             prior = CreateBitCast(CreateLShr(CreateBitCast(prior, shiftTy), offsetVector), blockTy);
    306             Value * value = CreateAlignedLoad(CreateGEP(source, i), alignment);
    307             value = CreateBitCast(CreateShl(CreateBitCast(value, shiftTy), remainingVector), blockTy);
    308             CreateAlignedStore(CreateOr(value, prior), CreateGEP(target, i), alignment);
     296            Value * Ai = CreateAlignedLoad(CreateGEP(source, CreateSub(i, n)), alignment);
     297            Ai = mvmd_srl(fieldWidth, Ai, borrowOffset);
     298            Value * Bi = CreateAlignedLoad(CreateGEP(source, i), alignment);
     299            Bi = mvmd_sll(fieldWidth, Bi, sourceOffset);
     300            CreateAlignedStore(CreateOr(Bi, Ai), CreateGEP(target, i), alignment);
    309301            Value * const next_i = CreateAdd(i, ONE);
    310302            i->addIncoming(next_i, streamCopy);
    311             CreateCondBr(CreateICmpNE(next_i, blocksToCopy), streamCopy, streamCopyRemaining);
     303            CreateCondBr(CreateICmpNE(next_i, blocksToCopy), streamCopy, streamCopyRemainingCond);
     304
     305            SetInsertPoint(streamCopyRemainingCond);
     306            Value * const partialBlocksToCopy = CreateAdd(blocksToCopy, n);
     307            Value * const remainingItemsToCopy = CreateURem(itemsToCopy, BLOCK_WIDTH);
     308            CreateLikelyCondBr(CreateIsNotNull(remainingItemsToCopy), streamCopyRemaining, streamCopyEnd);
    312309
    313310            SetInsertPoint(streamCopyRemaining);
    314311            PHINode * const j = CreatePHI(getSizeTy(), 2);
    315             j->addIncoming(blocksToCopy, entry);
    316             j->addIncoming(blocksToCopy, streamCopy);
    317             Value * final = CreateAlignedLoad(CreateGEP(source, j), alignment);
    318             final = CreateBitCast(CreateLShr(CreateBitCast(final, shiftTy), offsetVector), blockTy);
    319             CreateAlignedStore(final, CreateGEP(target, j), alignment);
     312            j->addIncoming(blocksToCopy, streamCopyRemainingCond);
     313            Value * Aj = CreateAlignedLoad(CreateGEP(source, j), alignment);
     314            Aj = mvmd_srl(fieldWidth, Aj, borrowOffset);
     315            CreateAlignedStore(Aj, CreateGEP(target, j), alignment);
    320316            Value * const next_j = CreateAdd(j, ONE);
    321317            j->addIncoming(next_j, streamCopyRemaining);
    322             CreateCondBr(CreateICmpNE(next_j, CreateAdd(blocksToCopy, n)), streamCopyRemaining, streamCopyEnd);
     318            CreateCondBr(CreateICmpNE(next_j, partialBlocksToCopy), streamCopyRemaining, streamCopyEnd);
    323319
    324320            SetInsertPoint(streamCopyEnd);
     
    336332
    337333                                          A     B     C     D
    338                TARGET STREAM        1  |aa---|bbAAA|ccBBB| dCCC|
    339                                     2  |ee---|ffEEE|ggFFF| hGGG|
    340                                     3  |ii---|jjIII|kkJJJ| lKKK|
     334               TARGET STREAM        1  |--XXX|-----|-----|-----|
     335                                    2  |--YYY|-----|-----|-----|
     336                                    3  |--ZZZ|-----|-----|-----|
     337
     338                                          A     B     C     D
     339               OUTPUT STREAM        1  |aaXXX|bbAAA|ccBBB| dCCC|
     340                                    2  |eeYYY|ffEEE|ggFFF| hGGG|
     341                                    3  |iiZZZ|jjIII|kkJJJ| lKKK|
    341342
    342343            */
    343344
    344             BasicBlock * const streamCopy = CreateBasicBlock(name + "PushCopy");
    345             BasicBlock * const streamCopyRemainingCond = CreateBasicBlock(name + "PushCopyRemainingCond");
    346             BasicBlock * const streamCopyRemaining = CreateBasicBlock(name + "PushCopyRemaining");
    347             BasicBlock * const streamCopyEnd = CreateBasicBlock(name + "PushCopyEnd");
    348 
    349             Value * const pos = CreateURem(targetOffset, width);
    350             Value * const copied = CreateSub(width, pos);
    351             Value * const copiedVector = simd_fill(fieldWidth, CreateTrunc(copied, fieldWidthTy));
    352             Value * const mask = CreateLShr(Constant::getAllOnesValue(shiftTy), copiedVector);
    353             Value * const offsetVector = simd_fill(fieldWidth, CreateTrunc(pos, fieldWidthTy));
    354 
     345            BasicBlock * const streamCopy = CreateBasicBlock();
     346            BasicBlock * const streamCopyRemainingCond = CreateBasicBlock();
     347            BasicBlock * const streamCopyRemaining = CreateBasicBlock();
     348            BasicBlock * const streamCopyEnd = CreateBasicBlock();
     349
     350            targetOffset = CreateURem(targetOffset, BLOCK_WIDTH);
     351
     352            Value * const carryOffset = CreateSub(BLOCK_WIDTH, targetOffset);
     353            Value * const mask = mvmd_srl(fieldWidth, Constant::getAllOnesValue(blockTy), carryOffset);
    355354            CreateBr(streamCopy);
    356355
     
    358357            PHINode * const i = CreatePHI(getSizeTy(), 2);
    359358            i->addIncoming(ZERO, entry);
    360             Value * priorTargetValue = CreateAlignedLoad(CreateGEP(target, i), alignment);
    361             priorTargetValue = CreateBitCast(CreateAnd(CreateBitCast(priorTargetValue, shiftTy), mask), blockTy);
    362             Value * sourceValue = CreateAlignedLoad(CreateGEP(source, i), alignment);
    363             sourceValue = CreateBitCast(CreateShl(CreateBitCast(sourceValue, shiftTy), offsetVector), blockTy);
    364             CreateAlignedStore(CreateOr(sourceValue, priorTargetValue), CreateGEP(target, i), alignment);
     359            Value * A0 = CreateAlignedLoad(CreateGEP(target, i), alignment);
     360            A0 = CreateAnd(A0, mask);
     361            Value * Ai = CreateAlignedLoad(CreateGEP(source, i), alignment);
     362            Ai = mvmd_sll(fieldWidth, Ai, targetOffset);
     363            CreateAlignedStore(CreateOr(Ai, A0), CreateGEP(target, i), alignment);
    365364            Value * const next_i = CreateAdd(i, ONE);
    366365            i->addIncoming(next_i, streamCopy);
     
    368367
    369368            SetInsertPoint(streamCopyRemainingCond);
    370             Value * const blocksToCopy = CreateMul(CreateUDiv(CreateSub(itemsToCopy, copied), width), n);
    371             CreateCondBr(CreateICmpULT(copied, itemsToCopy), streamCopyRemaining, streamCopyEnd);
     369            Value * const blocksToCopy = CreateMul(CreateUDiv(itemsToCopy, BLOCK_WIDTH), n);
     370            CreateCondBr(CreateICmpUGT(blocksToCopy, n), streamCopyRemaining, streamCopyEnd);
    372371
    373372            SetInsertPoint(streamCopyRemaining);
    374373            PHINode * const j = CreatePHI(getSizeTy(), 2);
    375374            j->addIncoming(n, streamCopyRemainingCond);
    376             Value * prior = CreateAlignedLoad(CreateGEP(source, CreateSub(j, n)), alignment);
    377             prior = CreateBitCast(CreateShl(CreateBitCast(prior, shiftTy), offsetVector), blockTy);
    378             Value * value = CreateAlignedLoad(CreateGEP(source, j), alignment);
    379             value = CreateBitCast(CreateLShr(CreateBitCast(value, shiftTy), copiedVector), blockTy);
    380             CreateAlignedStore(CreateOr(value, prior), CreateGEP(target, j), alignment);
     375            Value * Aj = CreateAlignedLoad(CreateGEP(source, CreateSub(j, n)), alignment);
     376            Aj = mvmd_srl(fieldWidth, Aj, carryOffset);
     377            Value * Bj = CreateAlignedLoad(CreateGEP(source, j), alignment);
     378            Bj = mvmd_sll(fieldWidth, Bj, targetOffset);
     379            CreateAlignedStore(CreateOr(Bj, Aj), CreateGEP(target, j), alignment);
    381380            Value * const next_j = CreateAdd(j, ONE);
    382381            j->addIncoming(next_j, streamCopyRemaining);
Note: See TracChangeset for help on using the changeset viewer.