Changeset 5870


Ignore:
Timestamp:
Feb 13, 2018, 5:05:51 PM (8 months ago)
Author:
nmedfort
Message:

Modified PDEP kernel

Location:
icGREP/icgrep-devel/icgrep/kernels
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r5865 r5870  
    66#include <kernels/kernel_builder.h>
    77#include <llvm/Support/raw_ostream.h>
    8 #include <iostream>
     8#include <toolchain/toolchain.h>
    99
    1010using namespace llvm;
     
    1212namespace kernel {
    1313
    14 PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width, std::string name)
    15 : MultiBlockKernel(name + "",
    16                   {Binding{kb->getStreamSetTy(), "PDEPmarkerStream"},
    17                    Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", BoundedRate(0, 1)}},
    18                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
    19                   {}, {}, {})
    20 , mSwizzleFactor(swizzleFactor)
    21 , mPDEPWidth(PDEP_width)
    22 {
    23     assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
    24     assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
     14PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned swizzleFactor, std::string name)
     15: MultiBlockKernel(std::move(name),
     16// input stream sets
     17{Binding{b->getStreamSetTy(), "marker", FixedRate(), Principal()},
     18Binding{b->getStreamSetTy(swizzleFactor), "source", FixedRate(), Deferred()}},
     19// output stream set
     20{Binding{b->getStreamSetTy(swizzleFactor), "output"}},
     21{}, {},
     22// internal scalars
     23{Binding{b->getBitBlockType(), "buffer"},
     24Binding{b->getSizeTy(), "buffered"},
     25Binding{b->getSizeTy(), "sourceOffset"}})
     26, mSwizzleFactor(swizzleFactor) {
     27
    2528}
    2629
    27 void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
    28     BasicBlock * entry = kb->GetInsertBlock();
    29 //    kb->CallPrintInt("--------------" + this->getName() + " doMultiBlock Start:", kb->getSize(0));
    30     BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
    31     BasicBlock * checkSourceCount = kb->CreateBasicBlock("checkSourceCount");
    32     BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
    33     BasicBlock * terminate = kb->CreateBasicBlock("terminate");
     30void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
     31    BasicBlock * const entry = b->GetInsertBlock();
     32    BasicBlock * const processBlock = b->CreateBasicBlock("processBlock");
     33    BasicBlock * const finishedStrides = b->CreateBasicBlock("finishedStrides");
     34    const auto pdepWidth = b->getBitBlockWidth() / mSwizzleFactor;
     35    ConstantInt * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
     36    ConstantInt * const PDEP_WIDTH = b->getSize(pdepWidth);
    3437
    35     Value * itemsToDo = mAvailableItemCount[0];
     38    Function * pdep = nullptr;
     39    if (pdepWidth == 64) {
     40        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_64);
     41    } else if (pdepWidth == 32) {
     42        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_32);
     43    } else {
     44        report_fatal_error(getName() + ": PDEP width must be 32 or 64");
     45    }
    3646
    37     Value * sourceItemsAvail = mAvailableItemCount[1];
    38 //    kb->CallPrintInt("itemsToDo:", itemsToDo);
    39 //    kb->CallPrintInt("sourceItemsAvail:", sourceItemsAvail);
     47    // We store an internal source offset here because this kernel processes items in an unusual way.
     48    // The pipeline and multiblock assume that if we report we're on the i-th bit of a stream we have
     49    // fully processed all of the bits up to the i-th position.
    4050
     51    //                                             v
     52    //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
     53    //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
     54    //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
     55    //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
    4156
    42     Value * PDEPStrmPtr = kb->getInputStreamBlockPtr("PDEPmarkerStream", kb->getInt32(0)); // mStreamBufferPtr[0];
    43     Value * inputSwizzlesPtr = kb->getInputStreamBlockPtr("sourceStreamSet", kb->getInt32(0)); // mStreamBufferPtr[1];
    44     // Get pointer to start of the output StreamSetBlock we're currently writing to
    45     Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(0)); // mStreamBufferPtr[2];
     57    // However, this kernel divides the stream into K elements and fully consumes a single stream of
     58    // the stream set before consuming the next one. So the same i-th position above is actually:
    4659
    47 //    kb->CallPrintInt("aaa", outputStreamPtr)
     60    //                |XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|
     61    //                |XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|
     62    //                |XX          |XX          |XX          |XX          |
     63    //                |            |            |            |            |
    4864
    49     Constant * blockWidth = kb->getSize(kb->getBitBlockWidth()); // 256
    50     Value * blocksToDo = kb->CreateUDivCeil(itemsToDo, blockWidth); // 1 if this is the final block TODO the assumption is incorrect here
    51     Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet");
    52     Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
    53        
    54     Value * pdepWidth = kb->getSize(mPDEPWidth);
    55     Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
    56     Value * PDEP_func = nullptr;
    57     if (mPDEPWidth == 64) {
    58         PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
    59     } else if (mPDEPWidth == 32) {
    60         PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
     65    // In the future, we may want the pipeline and multiblock to understand this style of processing
     66    // but for now, we hide it by delaying writing the actual processed offset until we've fully
     67    // processed the entire block.
     68
     69    Value * const initialBuffer = b->getScalarField("buffer");
     70    Value * const initialBufferSize = b->getScalarField("buffered");
     71    Value * const initialSourceOffset = b->getScalarField("sourceOffset");
     72    b->CreateBr(processBlock);
     73
     74    b->SetInsertPoint(processBlock);
     75    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
     76    strideIndex->addIncoming(b->getSize(0), entry);
     77    PHINode * const bufferPhi = b->CreatePHI(initialBuffer->getType(), 2);
     78    bufferPhi->addIncoming(initialBuffer, entry);
     79    PHINode * const sourceOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
     80    sourceOffsetPhi->addIncoming(initialSourceOffset, entry);
     81    PHINode * const bufferSizePhi = b->CreatePHI(b->getSizeTy(), 2);
     82    bufferSizePhi->addIncoming(initialBufferSize, entry);
     83
     84    // Extract the values we will use in the main processing loop
     85    Value * const markerStream = b->getInputStreamBlockPtr("marker", b->getInt32(0), strideIndex);
     86    Value * const selectors = b->fwCast(pdepWidth, b->CreateBlockAlignedLoad(markerStream));
     87    Value * const numOfSelectors = b->simd_popcount(pdepWidth, selectors);
     88
     89    // If we run out of source items here, it is a failure of the MultiBlockKernel and/or PipelineGenerator
     90    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
     91        Value * requiredSourceItems = b->CreateExtractElement(numOfSelectors, b->getInt64(0));
     92        for (unsigned i = 1; i < mSwizzleFactor; i++) {
     93            requiredSourceItems = b->CreateAdd(requiredSourceItems, b->CreateExtractElement(numOfSelectors, b->getInt64(i)));
     94        }
     95        Value * const availableSourceItems = b->getAvailableItemCount("source");
     96        Value * const unreadSourceItems = b->CreateSub(availableSourceItems, sourceOffsetPhi);
     97        Value * const hasSufficientSourceItems = b->CreateICmpULE(requiredSourceItems, unreadSourceItems);
     98        b->CreateAssert(hasSufficientSourceItems, getName() + " has insufficient source items for the given marker stream");
    6199    }
    62     kb->CreateBr(checkLoopCond);
    63100
    64     kb->SetInsertPoint(checkLoopCond);
    65     // The following PHINodes' values can come from entry or processBlock
    66     PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    67     PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
    68     PHINode * updatedProcessedSourceBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
    69     PHINode * sourceItemsRemaining = kb->CreatePHI(kb->getSizeTy(), 2);
    70     blocksToDoPhi->addIncoming(blocksToDo, entry);
    71     blockOffsetPhi->addIncoming(kb->getSize(0), entry);
    72     updatedProcessedSourceBitsPhi->addIncoming(processedSourceBits, entry);
    73     sourceItemsRemaining->addIncoming(sourceItemsAvail, entry);
     101    // For each element of the marker block
     102    Value * bufferSize = bufferSizePhi;
     103    Value * sourceOffset = sourceOffsetPhi;
     104    Value * buffer = bufferPhi;
     105    for (unsigned i = 0; i < mSwizzleFactor; i++) {
    74106
    75     Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
    76     kb->CreateCondBr(haveRemBlocks, checkSourceCount, terminate);
     107        // How many bits will we deposit?
     108        Value * const required = b->CreateExtractElement(numOfSelectors, b->getInt32(i));
    77109
    78     kb->SetInsertPoint(checkSourceCount);
    79     // Extract the values we will use in the main processing loop
    80     Value * updatedProcessedSourceBits = updatedProcessedSourceBitsPhi;
    81     Value * updatedSourceItems = sourceItemsRemaining;
    82     Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(PDEPStrmPtr, blockOffsetPhi));
     110        // Aggressively enqueue any additional bits
     111        BasicBlock * const entry = b->GetInsertBlock();
     112        BasicBlock * const enqueueBits = b->CreateBasicBlock();
     113        b->CreateBr(enqueueBits);
    83114
    84     const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);   
    85     const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
    86    
    87     Value * total_count = mask_popcounts[0];
    88     for (unsigned j = 1; j < mask_popcounts.size(); j++) {
    89         total_count = kb->CreateAdd(total_count, mask_popcounts[j]);
    90     }
    91 //    kb->CallPrintInt("total_count", total_count);
    92 //    kb->CallPrintInt("sourceItemsRemaining", sourceItemsRemaining);
    93     kb->CreateCondBr(kb->CreateICmpULE(total_count, sourceItemsRemaining), processBlock, terminate);
    94     kb->SetInsertPoint(processBlock);
     115        b->SetInsertPoint(enqueueBits);
     116        PHINode * const bufferSize2 = b->CreatePHI(bufferSize->getType(), 2);
     117        bufferSize2->addIncoming(bufferSize, entry);
     118        PHINode * const sourceOffset2 = b->CreatePHI(sourceOffset->getType(), 2);
     119        sourceOffset2->addIncoming(sourceOffset, entry);
     120        PHINode * const buffer2 = b->CreatePHI(buffer->getType(), 2);
     121        buffer2->addIncoming(buffer, entry);
    95122
    96     // For each mask extracted from the PDEP marker block
    97     for (unsigned i = 0; i < mSwizzleFactor; i++) {
    98         // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
     123        // Calculate the block and swizzle index of the "current" swizzle
     124        Value * const block_index = b->CreateUDiv(sourceOffset2, BLOCK_WIDTH);
     125        Value * const stream_index = b->CreateUDiv(b->CreateURem(sourceOffset2, BLOCK_WIDTH), PDEP_WIDTH);
     126        Value * const ptr = b->getInputStreamBlockPtr("source", stream_index, block_index);
     127        Value * const swizzle = b->CreateBlockAlignedLoad(ptr);
     128        Value * const swizzleOffset = b->CreateURem(sourceOffset2, PDEP_WIDTH);
    99129
    100         Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedSourceBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
    101         Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedSourceBits, blockWidth), pdepWidth);
    102         Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedSourceBits);
     130        // Shift the swizzle to the right to clear off any used bits ...
     131        Value * const swizzleShift = b->simd_fill(pdepWidth, swizzleOffset);
     132        Value * const unreadBits = b->CreateLShr(swizzle, swizzleShift);
    103133
    104         Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
    105         Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
     134        // ... then to the left to align the bits with the buffer and combine them.
     135        Value * const bufferShift = b->simd_fill(pdepWidth, bufferSize2);
     136        Value * const pendingBits = b->CreateShl(unreadBits, bufferShift);
     137        buffer = b->CreateOr(buffer, pendingBits);
     138        buffer2->addIncoming(buffer, enqueueBits);
    106139
    107 //        kb->CallPrintInt("current_blk_idx", current_blk_idx);
    108 //        kb->CallPrintInt("current_swizzle_idx", current_swizzle_idx);
     140        // Update the buffer size by the number of bits we have actually enqueued
     141        Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), bufferSize2);
     142        bufferSize = b->CreateUMin(maxBufferSize, PDEP_WIDTH);
     143        // ... and increment the source offset by the number we actually inserted
     144        sourceOffset = b->CreateAdd(sourceOffset2, b->CreateSub(bufferSize, bufferSize2));
     145        bufferSize2->addIncoming(bufferSize, enqueueBits);
     146        sourceOffset2->addIncoming(sourceOffset, enqueueBits);
     147        BasicBlock * const depositBits = b->CreateBasicBlock();
     148        b->CreateCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
    109149
    110         // Load current and next BitBlocks/swizzles
    111         // TODO can not guarantee the two GEP is correct, need to check later
    112         Value * current_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, kb->CreateAdd(kb->CreateMul(current_blk_idx, kb->getSize(mSwizzleFactor)), current_swizzle_idx));
    113         Value * next_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, kb->CreateAdd(kb->CreateMul(next_blk_idx, kb->getSize(mSwizzleFactor)), next_swizzle_idx));
    114 
    115         Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
    116         Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
    117 //        kb->CallPrintInt("ptr", current_swizzle_ptr);
    118 //        kb->CallPrintRegister("current_swizzle", current_swizzle);
    119 
    120         // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
    121         Value * shift_amount = kb->CreateURem(updatedProcessedSourceBits, pdepWidth);
    122         Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
    123 
    124         Value * borrowed_bits = kb->CreateShl(next_swizzle,
    125                                               kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
    126         Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
    127 
    128         Value * segments = kb->fwCast(mPDEPWidth, combined);
    129         Value * result_swizzle = Constant::getNullValue(segments->getType());
    130         // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
    131 
    132 
    133 
    134 
    135         Value * PDEP_mask = PDEP_masks[i];
     150        b->SetInsertPoint(depositBits);
     151        // Apply PDEP to each element of the combined swizzle using the current PDEP mask
     152        Value * result = UndefValue::get(buffer->getType());
     153        Value * const mask = b->CreateExtractElement(selectors, i);
    136154        for (unsigned j = 0; j < mSwizzleFactor; j++) {
    137             Value * source_field = kb->CreateExtractElement(segments, j);
    138             Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask});
    139             result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j);
     155            Value * source_field = b->CreateExtractElement(buffer, j);
     156            Value * PDEP_field = b->CreateCall(pdep, {source_field, mask});
     157            result = b->CreateInsertElement(result, PDEP_field, j);
    140158        }
    141159
    142160        // Store the result
    143         auto outputPos = kb->CreateGEP(outputStreamPtr, kb->CreateAdd(kb->CreateMul(blockOffsetPhi, kb->getSize(mSwizzleFactor)), kb->getSize(i)));
    144         kb->CreateBlockAlignedStore(result_swizzle, outputPos);
    145         updatedProcessedSourceBits = kb->CreateAdd(updatedProcessedSourceBits, mask_popcounts[i]);
    146         updatedSourceItems = kb->CreateSub(updatedSourceItems, mask_popcounts[i]);
     161        Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output", b->getInt32(i), strideIndex);
     162        b->CreateBlockAlignedStore(result, outputStreamPtr);
     163
     164        // Shift away any used bits from the buffer and decrement our buffer size by the number we used
     165        Value * const usedShift = b->simd_fill(pdepWidth, required);
     166        buffer = b->CreateLShr(buffer, usedShift);
     167        bufferSize = b->CreateSub(bufferSize, required);
    147168    }
    148169
    149     updatedProcessedSourceBitsPhi->addIncoming(updatedProcessedSourceBits, processBlock);
    150     blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
    151     blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
    152     sourceItemsRemaining->addIncoming(updatedSourceItems, processBlock);
    153     kb->CreateBr(checkLoopCond);
     170    BasicBlock * const finishedBlock = b->GetInsertBlock();
     171    sourceOffsetPhi->addIncoming(sourceOffset, finishedBlock);
     172    bufferSizePhi->addIncoming(bufferSize, finishedBlock);
     173    bufferPhi->addIncoming(buffer, finishedBlock);
     174    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
     175    strideIndex->addIncoming(nextStrideIndex, finishedBlock);
     176    b->CreateLikelyCondBr(b->CreateICmpNE(nextStrideIndex, numOfBlocks), processBlock, finishedStrides);
    154177
    155     kb->SetInsertPoint(terminate);
    156 //    Value * itemsDone = kb->CreateMul(blockOffsetPhi, blockWidth);
    157 //    itemsDone = kb->CreateSelect(kb->CreateICmpULT(itemsToDo, itemsDone), itemsToDo, itemsDone);
    158 //    kb->setProcessedItemCount("PDEPmarkerStream", kb->CreateAdd(itemsDone, kb->getProcessedItemCount("PDEPmarkerStream")));
    159     kb->setProcessedItemCount("sourceStreamSet", updatedProcessedSourceBitsPhi);
    160 
    161 
    162 //    kb->CallPrintInt("itemsDone:", itemsDone);
    163 //    kb->CallPrintInt("produced:", kb->getProducedItemCount("outputStreamSet"));
    164 //    kb->CallPrintInt("--------------" + this->getName() + " doMultiBlock End:", kb->getSize(0));
     178    b->SetInsertPoint(finishedStrides);
     179    Value * const sourceItemsProcessed = b->CreateMul(b->CreateUDiv(sourceOffset, BLOCK_WIDTH), BLOCK_WIDTH);
     180    b->setProcessedItemCount("source", b->CreateAdd(b->getProcessedItemCount("source"), sourceItemsProcessed));
     181    b->setScalarField("buffer", buffer);
     182    b->setScalarField("buffered", bufferSize);
     183    b->setScalarField("sourceOffset", b->CreateURem(sourceOffset, BLOCK_WIDTH));
    165184}
    166185
    167 std::vector<Value *> PDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
    168     Value * pop_counts = kb->simd_popcount(field_width, blk);
    169     std::vector<Value *> counts;
    170     for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
    171         // Store the pop counts for each blk_width field in blk
    172         counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
    173     }
    174     return counts;
    175186}
    176 
    177 std::vector<Value *> PDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
    178     // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
    179     // Split the PDEP marker stream block into mPDEPWidth segments.
    180     Value * masks = kb->fwCast(mask_width, PDEP_ms_blk);
    181     std::vector<Value *> PDEP_masks;
    182     for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
    183         PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
    184     }
    185     return PDEP_masks;
    186 }
    187 }
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.h

    r5857 r5870  
    99#include <llvm/IR/Value.h>
    1010#include <string>
    11 namespace IDISA { class IDISA_Builder; }
     11
    1212/*
    13 What this kernel does:
    1413
    15 Given a swizzled input stream set and a PDEP marker stream, apply a PDEP operation to each of the input streams in
    16 the input stream set. The PDEPed result streams are returned in a swizzled output stream set.
     14Conceptually, given an unbounded input stream set of k streams and a marker stream, this kernel uses the
     15Parallel Bits Deposit (PDEP) instruction to copy the input items from the i-th input stream to the i-th
     16output stream the positions indicated by the marker bits. All other output items are set to zero. E.g.,
    1717
    18 The length of the input stream set (in bits) must be greater than or equal to the total popcount of the PDEP marker
    19 stream, otherwise the PDEP operation will run out of source bits before the entire PDEP stream has been processed.
     18 SOURCE >  abcdefgh i0000000 00000000 00000000
     19 MARKER >  ...1.1.1 .....11. ..1...1. ...1.1..
     20 OUTPUT >  ...a.b.c .....de. ..f...g. ...h.i..
    2021
    21 How it works:
     22The complicating factor of this Kernel is that it assumes the input streams are *swizzled*. I.e., it
     23"divides" each block of the marker stream into k elements, M_1 ... M_k, and applies the PDEP operation
     24using M_i to the each of the k elements in the i-th input (swizzled) stream.
    2225
    23 You should know how the PDEP operation works before continuing (Wikipedia has a pretty good explanation.)
     26            CONCEPTUAL VIEW OF INPUT STREAM SET                    ACTUAL LAYOUT OF INPUT STREAM SET
    2427
    25 The swizzled configuration of the input streams mean that the first blockWidth/mSwizzleFactor bits of each (unswizzled) input
    26 stream are contained in the first BitBlock of the first input StreamSetBlock. The second BitBlock contains the next
    27 blockWidth/mSwizzleFactor bits for each input stream, and so on. The key observation underpinning the action of the PDEP kernel is that we apply the PDEP operation
    28 using blockWidth/mSwizzleFactor bits of an input stream as the source bits. Since the first BitBlock (i.e. swizzle) contains blockWidth/mSwizzleFactor
    29 bits from each of the input streams, we can begin processing the input streams in the input stream set by applying the first blockWidth/mSwizzleFactor
    30 bits of the PDEP marker stream to each of the swizzle fields in the first BitBlock.
     28 STREAM 0  abcde...  fg......  hijklm..  nopqrst.     SWIZZLE 0  abcde...  uvwxy...  OPQRS...  89abc...
     29 STREAM 1  uvwxy...  zA......  BCDEFG..  HIJKLMN.     SWIZZLE 1  fg......  zA......  TU......  de......
     30 STREAM 2  OPQRS...  TU......  VWXYZ0..  1234567.     SWIZZLE 2  hijklm..  BCDEFG..  VWXYZ0..  fghijk..
     31 STREAM 3  89abc...  de......  fghijk..  lmnopqr.     SWIZZLE 3  nopqrst.  HIJKLMN.  1234567.  lmnopqr.
    3132
    32 We continue using the first blockWidth/mSwizzleFactor bits of each input stream until we have completely consumed them. This occurs
    33 when the combined popcount of the PDEP masks we've used up to this point > blockWidth/mSwizzleFactor. Once we've exhausted the first
    34 BitBlock (i.e. swizzle), we move on to the next one. This pattern continues until we've consumed
    35 the entire PDEP marker stream. Note that it's possible for the kernel to consume the entire PDEP marker
    36 stream without consuming the entirety of the first BitBlock in the first BitStreamBlock, if the PDEP marker stream has a low popcount
    37 (i.e. > blockWidth/mSwizzleFactor).
    3833
    39 There is actually a slight complication that was glossed over in the description above. Consider the following scenario: we've consumed
    40 half of a blockWidth/mSwizzleFactor segment, and we're now starting the PDEP loop again. However, this time the PDEP marker stream segment is
    41 0xffffffff. That is, the popcount is 64. That means we'll consume 64 bits from the source bit stream, but the current segment only contains 64/2 =
    42 32 bits. To get around this issue, we "look ahead" to the next segment, whether that next segment is the next BitBlock in the current StreamSetBlock
    43 or the first BitBlock in the next StreamSetBlock. Regardless of where we find the segment, we combine the current segment and the next segement in
    44 such a way that we're guarenteed to have 64 source bits to pass to the PDEP operation. The logic responsible for creating this "combined" value
    45 can be found immediately after the opening brace of the outer for loop in the definition of the generateDoBlockMethod function:
     34NOTE: this kernel does *NOT* unswizzle the output. This will eventually be the responsibility of the
     35pipeline to ensure it is done when needed.
    4636
    47 Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedBits, blockWidth), base_block_idx);  // blk index == stream set block index
    48 
    49 // kb->CreateUDiv(updatedProcessedBits, blockWidth) gives us the absolute block idx of the current block.
    50 // However, getAdjustedStreamBlockPtr (used later) calculates an offseted block for us based on the processed item count
    51 // of sourceStreamSet. We want to get the index of the block we're currently processing relative to the
    52 // "base block" calculated by getAdjustedInputStreamPtr. That's why we subtract base_block_idx from
    53 // kb->CreateUDiv(updatedProcessedBits, blockWidth)
    54 
    55 Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedBits, blockWidth), pdepWidth);
    56 
    57 // updatedProcessedBits % blockWidth is how many bits of the current block we've processed.
    58 // Divide that by pdepWidth to get which BitBlock/swizzle we're currently processing.
    59 
    60 Value * next_block_idx = kb->CreateSub(kb->CreateUDiv(kb->CreateAdd(pdepWidth, updatedProcessedBits), blockWidth), base_block_idx);
    61 Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(kb->CreateAdd(pdepWidth, updatedProcessedBits), blockWidth), pdepWidth);
    62 // Q: Why add pdepWidth (i.e. 64) and not 256?
    63 // A: Although it is true that each BitBlock/swizzle contains 256 bits, each swizzle only contains 64 bits from each of the streams
    64 // it is composed of. Each 64 bit field is consumed in parallel, at the same rate. Therefore, once we've consumed "64 bits"
    65 // we've actually consumed 64*4 bits, and it's time to move to the next one.
    6637*/
    6738
    6839namespace kernel {
    69 class PDEPkernel : public MultiBlockKernel {
     40
     41class PDEPkernel final : public MultiBlockKernel {
    7042public:
    71     PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width = 64, std::string name = "PDEPdel");
     43    PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned swizzleFactor = 4, std::string name = "PDEP");
    7244    bool isCachable() const override { return true; }
    7345    bool hasSignature() const override { return false; }
    7446private:
     47    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
     48private:
    7549    const unsigned mSwizzleFactor;
    76     const unsigned mPDEPWidth;
    77     void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfStrides) override;
    78     std::vector<llvm::Value *> get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * PDEP_ms_blk,
    79                                               const unsigned mask_width);
    80     std::vector<llvm::Value *> get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * blk,
    81                                                    const unsigned field_width);
    8250};   
     51
    8352}
    8453   
Note: See TracChangeset for help on using the changeset viewer.