Changeset 5508


Ignore:
Timestamp:
Jun 14, 2017, 6:58:46 AM (2 years ago)
Author:
cameron
Message:

Simplified expand3_4 kernel taking advantage of multiblock builder

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/radix64.cpp

    r5507 r5508  
    3030//          input_pack2 to produce output_pack3.
    3131
    32 // The doSegment method processes input in terms of tripleBlocks, 3 blocks of input,
    33 // producing 4 blocks of output.   Unless less than one tripleBlock remains, the
    34 // doSegment method always processes an integral number of tripleBlocks as a logical
    35 // segment.  Both input and output buffers are hence maintained at block boundaries,
    36 // with the input data completely processed for each tripleBlock.
     32// The MultiBlockLogic is based on a natural stride taking 3 packs at a time.
     33// In this case, the output produced is exactly 4 packs or 4 blocks, with no pending
     34// data maintained in the kernel state.
    3735//
    38 // The pipeline must guarantee that the doSegment method is called with
    39 // a continuous buffer for the full segment (number of blocks).
     36// When processing the final partial stride of data, the kernel performs full
     37// triple-pack processing for each full or partial triple-pack remaining,
     38// relying on the MultiBlockKernel builder to only copy the correct number
     39// of bytes to the actual output stream.
    4040
    4141void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder) {
     
    4343    BasicBlock * expand2_3entry = iBuilder->GetInsertBlock();
    4444    BasicBlock * expand_3_4_loop = iBuilder->CreateBasicBlock("expand_3_4_loop");
    45     BasicBlock * expand3_4_loop_exit = iBuilder->CreateBasicBlock("expand3_4_loop_exit");
    46     BasicBlock * finalStep1 = iBuilder->CreateBasicBlock("finalStep1");
    47     BasicBlock * finalStep2 = iBuilder->CreateBasicBlock("finalStep2");
    48     BasicBlock * step2load = iBuilder->CreateBasicBlock("step2load");
    49     BasicBlock * step2store = iBuilder->CreateBasicBlock("step2store");
    50     BasicBlock * finalStep3 = iBuilder->CreateBasicBlock("finalStep3");
    51     BasicBlock * step3load = iBuilder->CreateBasicBlock("step3load");
    52     BasicBlock * step3store = iBuilder->CreateBasicBlock("step3store");
    53     BasicBlock * step3store2 = iBuilder->CreateBasicBlock("step3store2");
    54     BasicBlock * expand3_4_final = iBuilder->CreateBasicBlock("expand3_4_final");
    5545    BasicBlock * expand3_4_exit = iBuilder->CreateBasicBlock("expand3_4_exit");
    5646   
    5747    // Determine the required shufflevector constants.
    58     const unsigned PACK_SIZE = iBuilder->getStride()/8;
     48    const unsigned PACK_SIZE = iBuilder->getBitBlockWidth()/8;
    5949   
    6050    // Construct a list of indexes in  the form
     
    7666    }
    7767
    78     Constant * tripleBlockSize = iBuilder->getSize(getKernelStride());
    79     Constant * packSize = iBuilder->getSize(PACK_SIZE);
    8068    Constant * triplePackSize = iBuilder->getSize(3 * PACK_SIZE); // 3 packs per loop.
    8169    UndefValue * undefPack = UndefValue::get(iBuilder->fwVectorType(8));
     
    8977    Value * sourceStream = &*(args++);
    9078    Value * expandedStream = &*(args);
    91     Value * isFinal = iBuilder->CreateICmpULT(itemsToDo, tripleBlockSize);
    92 
    93     // The main loop processes 3 packs of data at a time.  For doFinal
    94     // processing, process all the remaining sets of 3 packs, otherwise
    95     // process in multiples of 3 full blocks of data.
    96     //
    97     Value * excessItems = iBuilder->CreateURem(itemsToDo, triplePackSize);
    98     Value * loopItemsToDo = iBuilder->CreateSub(itemsToDo, excessItems);
    99 
     79
     80    // The main loop processes 3 packs of data at a time.
     81   
    10082    Value * sourcePackPtr = iBuilder->CreateBitCast(sourceStream, iBuilder->getBitBlockType()->getPointerTo());
    10183    Value * outputPackPtr = iBuilder->CreateBitCast(expandedStream, iBuilder->getBitBlockType()->getPointerTo());
    10284
    103     Value * hasFullLoop = iBuilder->CreateICmpUGE(loopItemsToDo, triplePackSize);
    104 
    105     iBuilder->CreateCondBr(hasFullLoop, expand_3_4_loop, expand3_4_loop_exit);
     85    iBuilder->CreateCondBr(iBuilder->CreateICmpSGT(itemsToDo, iBuilder->getSize(0)), expand_3_4_loop, expand3_4_exit);
     86   
    10687    iBuilder->SetInsertPoint(expand_3_4_loop);
    10788    PHINode * loopInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
     
    11192    loopInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
    11293    loopOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
    113     loopItemsRemain->addIncoming(loopItemsToDo, expand2_3entry);
     94    loopItemsRemain->addIncoming(itemsToDo, expand2_3entry);
    11495
    11596
     
    144125    loopItemsRemain->addIncoming(remainingItems, expand_3_4_loop);
    145126
    146     Value * continueLoop = iBuilder->CreateICmpUGE(remainingItems, triplePackSize);
    147     iBuilder->CreateCondBr(continueLoop, expand_3_4_loop, expand3_4_loop_exit);
    148    
    149     iBuilder->SetInsertPoint(expand3_4_loop_exit);
    150     PHINode * loopExitInput_ptr = iBuilder->CreatePHI(sourcePackPtr->getType(), 2);
    151     PHINode * loopExitOutput_ptr = iBuilder->CreatePHI(outputPackPtr->getType(), 2);
    152     loopExitInput_ptr->addIncoming(sourcePackPtr, expand2_3entry);
    153     loopExitOutput_ptr->addIncoming(outputPackPtr, expand2_3entry);
    154     loopExitInput_ptr->addIncoming(loopNextInputPack, expand_3_4_loop);
    155     loopExitOutput_ptr->addIncoming(loopNextOutputPack, expand_3_4_loop);
    156 
    157     // Except for final segment processing, we are done.
    158     iBuilder->CreateCondBr(isFinal, expand3_4_final, expand3_4_exit);
    159 
    160     // Final segment processing.   Less than a triplePack remains.
    161     iBuilder->SetInsertPoint(expand3_4_final);
    162    
    163     // There may be one or two remaining full packs and/or a partial pack.
    164     //
    165     // We have several cases depending on the number of remaining items.  Let N = packSize
    166     // (a) 0 remaining items: all done
    167     // (b) 1..3N/4 remaining items:  do Step1 only, no items or pending data will remain
    168     // (c) 3N/4+1 .. N remaining items:  do Step 1, do Step 2 for pending data from Step 1 only, there is no more input.
    169     // (d) N+1 .. 6N/4 remaining items:  do Step 1 and Step 2, no items or pending data will remain.
    170     // (e) 6N/4+1 .. 2N remaining items: do Steps 1 and 2, do Step 3 for pending data only, there is no more input.
    171     // (f) 2N+1 .. 9N/4 remaining items: do Steps 1 and 2, do Step 3 up to the first write only.
    172     // (g) 9N/4+1 .. 3N - 1 remaining items: do Steps 1, 2 and 3.
    173     Value * condition_a = iBuilder->CreateICmpEQ(excessItems, ConstantInt::getNullValue(iBuilder->getSizeTy()));
    174     iBuilder->CreateCondBr(condition_a, expand3_4_exit, finalStep1);
    175     // Final Step1 processing
    176     iBuilder->SetInsertPoint(finalStep1);
    177     pack0 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(loopExitInput_ptr, packAlign));
    178     expand0 = iBuilder->bitCast(iBuilder->CreateShuffleVector(undefPack, pack0, expand_3_4_shuffle[0]));
    179     iBuilder->CreateAlignedStore(expand0, loopExitOutput_ptr, packAlign);
    180     Value * condition_b = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(3 * PACK_SIZE/4));
    181     iBuilder->CreateCondBr(condition_b, expand3_4_exit, finalStep2);
    182     // Final Step 2 processing
    183     iBuilder->SetInsertPoint(finalStep2);
    184     Value * condition_c = iBuilder->CreateICmpULE(excessItems, packSize);
    185     iBuilder->CreateCondBr(condition_c, step2store, step2load);
    186     iBuilder->SetInsertPoint(step2load);
    187     inPack1_ptr = iBuilder->CreateGEP(loopExitInput_ptr, iBuilder->getInt32(1));
    188     pack1 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack1_ptr, packAlign));
    189     iBuilder->CreateBr(step2store);
    190     iBuilder->SetInsertPoint(step2store);
    191     PHINode * pack1phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
    192     pack1phi->addIncoming(undefPack, finalStep2);
    193     pack1phi->addIncoming(pack1, step2load);
    194     outPack1_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(1));
    195     expand1 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack0, pack1phi, expand_3_4_shuffle[1]));
    196     iBuilder->CreateAlignedStore(expand1, outPack1_ptr, packAlign);
    197     Value * condition_d = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(6 * PACK_SIZE/4));
    198     iBuilder->CreateCondBr(condition_d, expand3_4_exit, finalStep3);
    199     // Final Step 3
    200     iBuilder->SetInsertPoint(finalStep3);
    201     Value * condition_e = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(2 * PACK_SIZE));
    202     iBuilder->CreateCondBr(condition_e, step3store, step3load);
    203     iBuilder->SetInsertPoint(step3load);
    204     inPack2_ptr = iBuilder->CreateGEP(loopExitInput_ptr, iBuilder->getInt32(2));
    205     pack2 = iBuilder->fwCast(8, iBuilder->CreateAlignedLoad(inPack2_ptr, packAlign));
    206     iBuilder->CreateBr(step3store);
    207     iBuilder->SetInsertPoint(step3store);
    208     PHINode * pack2phi = iBuilder->CreatePHI(iBuilder->fwVectorType(8), 2);
    209     pack2phi->addIncoming(undefPack, finalStep3);
    210     pack2phi->addIncoming(pack2, step3load);
    211     outPack2_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(2));
    212     expand2 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack1phi, pack2phi, expand_3_4_shuffle[2]));
    213     iBuilder->CreateAlignedStore(expand2, outPack2_ptr, packAlign);
    214     Value * condition_f = iBuilder->CreateICmpULE(excessItems, iBuilder->getSize(9 * PACK_SIZE/4));
    215     iBuilder->CreateCondBr(condition_f, expand3_4_exit, step3store2);
    216     iBuilder->SetInsertPoint(step3store2);
    217     outPack3_ptr = iBuilder->CreateGEP(loopExitOutput_ptr, iBuilder->getInt32(3));
    218     expand3 = iBuilder->bitCast(iBuilder->CreateShuffleVector(pack2phi, undefPack, expand_3_4_shuffle[3]));
    219     iBuilder->CreateAlignedStore(expand3, outPack3_ptr, packAlign);
    220     iBuilder->CreateBr(expand3_4_exit);
    221     //
     127    Value * continueLoop = iBuilder->CreateICmpSGT(remainingItems, iBuilder->getSize(0));
     128    iBuilder->CreateCondBr(continueLoop, expand_3_4_loop, expand3_4_exit);
     129   
    222130    iBuilder->SetInsertPoint(expand3_4_exit);
    223131    }
     
    386294            {Binding{iBuilder->getStreamSetTy(1, 8), "expand34Stream", FixedRatio(4,3)}},
    387295            {}, {}, {}) {
    388     setKernelStride(3 * iBuilder->getBitBlockWidth());
     296    setKernelStride(3 * iBuilder->getBitBlockWidth()/8);
    389297}
    390298
Note: See TracChangeset for help on using the changeset viewer.