Ignore:
Timestamp:
Feb 13, 2018, 5:05:51 PM (14 months ago)
Author:
nmedfort
Message:

Modified PDEP kernel

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r5865 r5870  
    66#include <kernels/kernel_builder.h>
    77#include <llvm/Support/raw_ostream.h>
    8 #include <iostream>
     8#include <toolchain/toolchain.h>
    99
    1010using namespace llvm;
     
    1212namespace kernel {
    1313
    14 PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width, std::string name)
    15 : MultiBlockKernel(name + "",
    16                   {Binding{kb->getStreamSetTy(), "PDEPmarkerStream"},
    17                    Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", BoundedRate(0, 1)}},
    18                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
    19                   {}, {}, {})
    20 , mSwizzleFactor(swizzleFactor)
    21 , mPDEPWidth(PDEP_width)
    22 {
    23     assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
    24     assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
     14PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned swizzleFactor, std::string name)
     15: MultiBlockKernel(std::move(name),
     16// input stream sets
     17{Binding{b->getStreamSetTy(), "marker", FixedRate(), Principal()},
     18Binding{b->getStreamSetTy(swizzleFactor), "source", FixedRate(), Deferred()}},
     19// output stream set
     20{Binding{b->getStreamSetTy(swizzleFactor), "output"}},
     21{}, {},
     22// internal scalars
     23{Binding{b->getBitBlockType(), "buffer"},
     24Binding{b->getSizeTy(), "buffered"},
     25Binding{b->getSizeTy(), "sourceOffset"}})
     26, mSwizzleFactor(swizzleFactor) {
     27
    2528}
    2629
    27 void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
    28     BasicBlock * entry = kb->GetInsertBlock();
    29 //    kb->CallPrintInt("--------------" + this->getName() + " doMultiBlock Start:", kb->getSize(0));
    30     BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
    31     BasicBlock * checkSourceCount = kb->CreateBasicBlock("checkSourceCount");
    32     BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
    33     BasicBlock * terminate = kb->CreateBasicBlock("terminate");
     30void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
     31    BasicBlock * const entry = b->GetInsertBlock();
     32    BasicBlock * const processBlock = b->CreateBasicBlock("processBlock");
     33    BasicBlock * const finishedStrides = b->CreateBasicBlock("finishedStrides");
     34    const auto pdepWidth = b->getBitBlockWidth() / mSwizzleFactor;
     35    ConstantInt * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
     36    ConstantInt * const PDEP_WIDTH = b->getSize(pdepWidth);
    3437
    35     Value * itemsToDo = mAvailableItemCount[0];
     38    Function * pdep = nullptr;
     39    if (pdepWidth == 64) {
     40        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_64);
     41    } else if (pdepWidth == 32) {
     42        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_32);
     43    } else {
     44        report_fatal_error(getName() + ": PDEP width must be 32 or 64");
     45    }
    3646
    37     Value * sourceItemsAvail = mAvailableItemCount[1];
    38 //    kb->CallPrintInt("itemsToDo:", itemsToDo);
    39 //    kb->CallPrintInt("sourceItemsAvail:", sourceItemsAvail);
     47    // We store an internal source offset here because this kernel processes items in an unusual way.
     48    // The pipeline and multiblock assume that if we report we're on the i-th bit of a stream we have
     49    // fully processed all of the bits up to the i-th position.
    4050
     51    //                                             v
     52    //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
     53    //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
     54    //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
     55    //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
    4156
    42     Value * PDEPStrmPtr = kb->getInputStreamBlockPtr("PDEPmarkerStream", kb->getInt32(0)); // mStreamBufferPtr[0];
    43     Value * inputSwizzlesPtr = kb->getInputStreamBlockPtr("sourceStreamSet", kb->getInt32(0)); // mStreamBufferPtr[1];
    44     // Get pointer to start of the output StreamSetBlock we're currently writing to
    45     Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(0)); // mStreamBufferPtr[2];
     57    // However, this kernel divides the stream into K elements and fully consumes a single stream of
     58    // the stream set before consuming the next one. So the same i-th position above is actually:
    4659
    47 //    kb->CallPrintInt("aaa", outputStreamPtr)
     60    //                |XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|
     61    //                |XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|
     62    //                |XX          |XX          |XX          |XX          |
     63    //                |            |            |            |            |
    4864
    49     Constant * blockWidth = kb->getSize(kb->getBitBlockWidth()); // 256
    50     Value * blocksToDo = kb->CreateUDivCeil(itemsToDo, blockWidth); // 1 if this is the final block TODO the assumption is incorrect here
    51     Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet");
    52     Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
    53        
    54     Value * pdepWidth = kb->getSize(mPDEPWidth);
    55     Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
    56     Value * PDEP_func = nullptr;
    57     if (mPDEPWidth == 64) {
    58         PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
    59     } else if (mPDEPWidth == 32) {
    60         PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
     65    // In the future, we may want the pipeline and multiblock to understand this style of processing
     66    // but for now, we hide it by delaying writing the actual processed offset until we've fully
     67    // processed the entire block.
     68
     69    Value * const initialBuffer = b->getScalarField("buffer");
     70    Value * const initialBufferSize = b->getScalarField("buffered");
     71    Value * const initialSourceOffset = b->getScalarField("sourceOffset");
     72    b->CreateBr(processBlock);
     73
     74    b->SetInsertPoint(processBlock);
     75    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
     76    strideIndex->addIncoming(b->getSize(0), entry);
     77    PHINode * const bufferPhi = b->CreatePHI(initialBuffer->getType(), 2);
     78    bufferPhi->addIncoming(initialBuffer, entry);
     79    PHINode * const sourceOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
     80    sourceOffsetPhi->addIncoming(initialSourceOffset, entry);
     81    PHINode * const bufferSizePhi = b->CreatePHI(b->getSizeTy(), 2);
     82    bufferSizePhi->addIncoming(initialBufferSize, entry);
     83
     84    // Extract the values we will use in the main processing loop
     85    Value * const markerStream = b->getInputStreamBlockPtr("marker", b->getInt32(0), strideIndex);
     86    Value * const selectors = b->fwCast(pdepWidth, b->CreateBlockAlignedLoad(markerStream));
     87    Value * const numOfSelectors = b->simd_popcount(pdepWidth, selectors);
     88
     89    // If we run out of source items here, it is a failure of the MultiBlockKernel and/or PipelineGenerator
     90    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     91        Value * requiredSourceItems = b->CreateExtractElement(numOfSelectors, b->getInt64(0));
     92        for (unsigned i = 1; i < mSwizzleFactor; i++) {
     93            requiredSourceItems = b->CreateAdd(requiredSourceItems, b->CreateExtractElement(numOfSelectors, b->getInt64(i)));
     94        }
     95        Value * const availableSourceItems = b->getAvailableItemCount("source");
     96        Value * const unreadSourceItems = b->CreateSub(availableSourceItems, sourceOffsetPhi);
     97        Value * const hasSufficientSourceItems = b->CreateICmpULE(requiredSourceItems, unreadSourceItems);
     98        b->CreateAssert(hasSufficientSourceItems, getName() + " has insufficient source items for the given marker stream");
    6199    }
    62     kb->CreateBr(checkLoopCond);
    63100
    64     kb->SetInsertPoint(checkLoopCond);
    65     // The following PHINodes' values can come from entry or processBlock
    66     PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    67     PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
    68     PHINode * updatedProcessedSourceBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    69     PHINode * sourceItemsRemaining = kb->CreatePHI(kb->getSizeTy(), 2);
    70     blocksToDoPhi->addIncoming(blocksToDo, entry);
    71     blockOffsetPhi->addIncoming(kb->getSize(0), entry);
    72     updatedProcessedSourceBitsPhi->addIncoming(processedSourceBits, entry);
    73     sourceItemsRemaining->addIncoming(sourceItemsAvail, entry);
     101    // For each element of the marker block
     102    Value * bufferSize = bufferSizePhi;
     103    Value * sourceOffset = sourceOffsetPhi;
     104    Value * buffer = bufferPhi;
     105    for (unsigned i = 0; i < mSwizzleFactor; i++) {
    74106
    75     Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
    76     kb->CreateCondBr(haveRemBlocks, checkSourceCount, terminate);
     107        // How many bits will we deposit?
     108        Value * const required = b->CreateExtractElement(numOfSelectors, b->getInt32(i));
    77109
    78     kb->SetInsertPoint(checkSourceCount);
    79     // Extract the values we will use in the main processing loop
    80     Value * updatedProcessedSourceBits = updatedProcessedSourceBitsPhi;
    81     Value * updatedSourceItems = sourceItemsRemaining;
    82     Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(PDEPStrmPtr, blockOffsetPhi));
     110        // Aggressively enqueue any additional bits
     111        BasicBlock * const entry = b->GetInsertBlock();
     112        BasicBlock * const enqueueBits = b->CreateBasicBlock();
     113        b->CreateBr(enqueueBits);
    83114
    84     const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);   
    85     const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
    86    
    87     Value * total_count = mask_popcounts[0];
    88     for (unsigned j = 1; j < mask_popcounts.size(); j++) {
    89         total_count = kb->CreateAdd(total_count, mask_popcounts[j]);
    90     }
    91 //    kb->CallPrintInt("total_count", total_count);
    92 //    kb->CallPrintInt("sourceItemsRemaining", sourceItemsRemaining);
    93     kb->CreateCondBr(kb->CreateICmpULE(total_count, sourceItemsRemaining), processBlock, terminate);
    94     kb->SetInsertPoint(processBlock);
     115        b->SetInsertPoint(enqueueBits);
     116        PHINode * const bufferSize2 = b->CreatePHI(bufferSize->getType(), 2);
     117        bufferSize2->addIncoming(bufferSize, entry);
     118        PHINode * const sourceOffset2 = b->CreatePHI(sourceOffset->getType(), 2);
     119        sourceOffset2->addIncoming(sourceOffset, entry);
     120        PHINode * const buffer2 = b->CreatePHI(buffer->getType(), 2);
     121        buffer2->addIncoming(buffer, entry);
    95122
    96     // For each mask extracted from the PDEP marker block
    97     for (unsigned i = 0; i < mSwizzleFactor; i++) {
    98         // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
     123        // Calculate the block and swizzle index of the "current" swizzle
     124        Value * const block_index = b->CreateUDiv(sourceOffset2, BLOCK_WIDTH);
     125        Value * const stream_index = b->CreateUDiv(b->CreateURem(sourceOffset2, BLOCK_WIDTH), PDEP_WIDTH);
     126        Value * const ptr = b->getInputStreamBlockPtr("source", stream_index, block_index);
     127        Value * const swizzle = b->CreateBlockAlignedLoad(ptr);
     128        Value * const swizzleOffset = b->CreateURem(sourceOffset2, PDEP_WIDTH);
    99129
    100         Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedSourceBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
    101         Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedSourceBits, blockWidth), pdepWidth);
    102         Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedSourceBits);
     130        // Shift the swizzle to the right to clear off any used bits ...
     131        Value * const swizzleShift = b->simd_fill(pdepWidth, swizzleOffset);
     132        Value * const unreadBits = b->CreateLShr(swizzle, swizzleShift);
    103133
    104         Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
    105         Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
     134        // ... then to the left to align the bits with the buffer and combine them.
     135        Value * const bufferShift = b->simd_fill(pdepWidth, bufferSize2);
     136        Value * const pendingBits = b->CreateShl(unreadBits, bufferShift);
     137        buffer = b->CreateOr(buffer, pendingBits);
     138        buffer2->addIncoming(buffer, enqueueBits);
    106139
    107 //        kb->CallPrintInt("current_blk_idx", current_blk_idx);
    108 //        kb->CallPrintInt("current_swizzle_idx", current_swizzle_idx);
     140        // Update the buffer size by the number of bits we have actually enqueued
     141        Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), bufferSize2);
     142        bufferSize = b->CreateUMin(maxBufferSize, PDEP_WIDTH);
     143        // ... and increment the source offset by the number we actually inserted
     144        sourceOffset = b->CreateAdd(sourceOffset2, b->CreateSub(bufferSize, bufferSize2));
     145        bufferSize2->addIncoming(bufferSize, enqueueBits);
     146        sourceOffset2->addIncoming(sourceOffset, enqueueBits);
     147        BasicBlock * const depositBits = b->CreateBasicBlock();
     148        b->CreateCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
    109149
    110         // Load current and next BitBlocks/swizzles
    111         // TODO can not guarantee the two GEP is correct, need to check later
    112         Value * current_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, kb->CreateAdd(kb->CreateMul(current_blk_idx, kb->getSize(mSwizzleFactor)), current_swizzle_idx));
    113         Value * next_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, kb->CreateAdd(kb->CreateMul(next_blk_idx, kb->getSize(mSwizzleFactor)), next_swizzle_idx));
    114 
    115         Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
    116         Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
    117 //        kb->CallPrintInt("ptr", current_swizzle_ptr);
    118 //        kb->CallPrintRegister("current_swizzle", current_swizzle);
    119 
    120         // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
    121         Value * shift_amount = kb->CreateURem(updatedProcessedSourceBits, pdepWidth);
    122         Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
    123 
    124         Value * borrowed_bits = kb->CreateShl(next_swizzle,
    125                                               kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
    126         Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
    127 
    128         Value * segments = kb->fwCast(mPDEPWidth, combined);
    129         Value * result_swizzle = Constant::getNullValue(segments->getType());
    130         // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
    131 
    132 
    133 
    134 
    135         Value * PDEP_mask = PDEP_masks[i];
     150        b->SetInsertPoint(depositBits);
     151        // Apply PDEP to each element of the combined swizzle using the current PDEP mask
     152        Value * result = UndefValue::get(buffer->getType());
     153        Value * const mask = b->CreateExtractElement(selectors, i);
    136154        for (unsigned j = 0; j < mSwizzleFactor; j++) {
    137             Value * source_field = kb->CreateExtractElement(segments, j);
    138             Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask});
    139             result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j);
     155            Value * source_field = b->CreateExtractElement(buffer, j);
     156            Value * PDEP_field = b->CreateCall(pdep, {source_field, mask});
     157            result = b->CreateInsertElement(result, PDEP_field, j);
    140158        }
    141159
    142160        // Store the result
    143         auto outputPos = kb->CreateGEP(outputStreamPtr, kb->CreateAdd(kb->CreateMul(blockOffsetPhi, kb->getSize(mSwizzleFactor)), kb->getSize(i)));
    144         kb->CreateBlockAlignedStore(result_swizzle, outputPos);
    145         updatedProcessedSourceBits = kb->CreateAdd(updatedProcessedSourceBits, mask_popcounts[i]);
    146         updatedSourceItems = kb->CreateSub(updatedSourceItems, mask_popcounts[i]);
     161        Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output", b->getInt32(i), strideIndex);
     162        b->CreateBlockAlignedStore(result, outputStreamPtr);
     163
     164        // Shift away any used bits from the buffer and decrement our buffer size by the number we used
     165        Value * const usedShift = b->simd_fill(pdepWidth, required);
     166        buffer = b->CreateLShr(buffer, usedShift);
     167        bufferSize = b->CreateSub(bufferSize, required);
    147168    }
    148169
    149     updatedProcessedSourceBitsPhi->addIncoming(updatedProcessedSourceBits, processBlock);
    150     blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
    151     blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
    152     sourceItemsRemaining->addIncoming(updatedSourceItems, processBlock);
    153     kb->CreateBr(checkLoopCond);
     170    BasicBlock * const finishedBlock = b->GetInsertBlock();
     171    sourceOffsetPhi->addIncoming(sourceOffset, finishedBlock);
     172    bufferSizePhi->addIncoming(bufferSize, finishedBlock);
     173    bufferPhi->addIncoming(buffer, finishedBlock);
     174    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
     175    strideIndex->addIncoming(nextStrideIndex, finishedBlock);
     176    b->CreateLikelyCondBr(b->CreateICmpNE(nextStrideIndex, numOfBlocks), processBlock, finishedStrides);
    154177
    155     kb->SetInsertPoint(terminate);
    156 //    Value * itemsDone = kb->CreateMul(blockOffsetPhi, blockWidth);
    157 //    itemsDone = kb->CreateSelect(kb->CreateICmpULT(itemsToDo, itemsDone), itemsToDo, itemsDone);
    158 //    kb->setProcessedItemCount("PDEPmarkerStream", kb->CreateAdd(itemsDone, kb->getProcessedItemCount("PDEPmarkerStream")));
    159     kb->setProcessedItemCount("sourceStreamSet", updatedProcessedSourceBitsPhi);
    160 
    161 
    162 //    kb->CallPrintInt("itemsDone:", itemsDone);
    163 //    kb->CallPrintInt("produced:", kb->getProducedItemCount("outputStreamSet"));
    164 //    kb->CallPrintInt("--------------" + this->getName() + " doMultiBlock End:", kb->getSize(0));
     178    b->SetInsertPoint(finishedStrides);
     179    Value * const sourceItemsProcessed = b->CreateMul(b->CreateUDiv(sourceOffset, BLOCK_WIDTH), BLOCK_WIDTH);
     180    b->setProcessedItemCount("source", b->CreateAdd(b->getProcessedItemCount("source"), sourceItemsProcessed));
     181    b->setScalarField("buffer", buffer);
     182    b->setScalarField("buffered", bufferSize);
     183    b->setScalarField("sourceOffset", b->CreateURem(sourceOffset, BLOCK_WIDTH));
    165184}
    166185
    167 std::vector<Value *> PDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
    168     Value * pop_counts = kb->simd_popcount(field_width, blk);
    169     std::vector<Value *> counts;
    170     for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
    171         // Store the pop counts for each blk_width field in blk
    172         counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
    173     }
    174     return counts;
    175186}
    176 
    177 std::vector<Value *> PDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
    178     // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
    179     // Split the PDEP marker stream block into mPDEPWidth segments.
    180     Value * masks = kb->fwCast(mask_width, PDEP_ms_blk);
    181     std::vector<Value *> PDEP_masks;
    182     for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
    183         PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
    184     }
    185     return PDEP_masks;
    186 }
    187 }
Note: See TracChangeset for help on using the changeset viewer.