Changeset 5627


Ignore:
Timestamp:
Sep 2, 2017, 11:59:14 PM (3 months ago)
Author:
cameron
Message:

PDEP kernels from Adam with pdep_width_less_1 fix

Location:
icGREP/icgrep-devel/icgrep/kernels
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r5588 r5627  
    66#include <kernels/kernel_builder.h>
    77#include <llvm/Support/raw_ostream.h>
     8#include <iostream>
    89
    910using namespace llvm;
     
    1112namespace kernel {
    1213
    13 PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned PDEP_width)
    14 : BlockOrientedKernel("PDEPdel",
     14PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width)
     15: MultiBlockKernel("PDEPdel",
    1516                  {Binding{kb->getStreamSetTy(), "PDEPmarkerStream"}, Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", MaxRatio(1)}},
    16                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}}, {}, {}, {})
    17 , mSwizzleFactor(kb->getBitBlockWidth() / PDEP_width)
     17                  {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
     18                  {}, {}, {})
     19, mSwizzleFactor(swizzleFactor)
    1820, mPDEPWidth(PDEP_width)
    1921{
     22    assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
    2023    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
    2124}
    2225
    23 void PDEPkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
    24     // Extract the values we will use in the main processing loop
    25     Value * PDEP_ms_blk = kb->loadInputStreamBlock("PDEPmarkerStream", kb->getInt32(0));
    26     const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);   
    27     const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
    28     Value * processedBits = kb->getProcessedItemCount("sourceStreamSet");
    29     Value * blockWidth = kb->getSize(kb->getBitBlockWidth());
    30     Value * base_block_idx = kb->CreateUDiv(processedBits, blockWidth);
     26void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb) {   
     27    BasicBlock * entry = kb->GetInsertBlock();
     28    BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
     29    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
     30    BasicBlock * terminate = kb->CreateBasicBlock("terminate");
     31
     32    Function::arg_iterator args = mCurrentMethod->arg_begin();
     33    args++; //self
     34    Value * itemsToDo = &*(args++); // Since PDEP marker stream is a bit stream, this is the number of PDEP marker bits to process
     35    // Get pointer to start of the StreamSetBlock containing unprocessed input items.
     36    args++; //sourceItemsAvail
     37    Value * PDEPStrmPtr = &*(args++);
     38    Value * inputSwizzlesPtr = &*(args++);
     39
     40    // Get pointer to start of the output StreamSetBlock we're currently writing to
     41    Value * outputStreamPtr = &*(args);
     42   
     43    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
     44    Value * blocksToDo = kb->CreateUDivCeil(itemsToDo, blockWidth); // 1 if this is the final block
     45    Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet");
     46    Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
     47       
    3148    Value * pdepWidth = kb->getSize(mPDEPWidth);
     49    Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
    3250    Value * PDEP_func = nullptr;
    3351    if (mPDEPWidth == 64) {
     
    3654        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
    3755    }
    38     Value * updatedProcessedBits = processedBits;
     56    kb->CreateBr(checkLoopCond);
    3957
    40     // For each mask extracted from the PDEP marker stream
     58    kb->SetInsertPoint(checkLoopCond);
     59    // The following PHINodes' values can come from entry or processBlock
     60    PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
     61    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
     62    PHINode * updatedProcessedBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
     63    blocksToDoPhi->addIncoming(blocksToDo, entry);
     64    blockOffsetPhi->addIncoming(kb->getSize(0), entry);
     65    updatedProcessedBitsPhi->addIncoming(processedSourceBits, entry);
     66
     67    Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
     68    kb->CreateCondBr(haveRemBlocks, processBlock, terminate);
     69
     70    kb->SetInsertPoint(processBlock);
     71    // Extract the values we will use in the main processing loop
     72    Value * updatedProcessedBits = updatedProcessedBitsPhi;
     73    Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(PDEPStrmPtr, {blockOffsetPhi, kb->getInt32(0)}));
     74    kb->CallPrintRegister("PDEP_ms_blk", PDEP_ms_blk);
     75
     76    const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);   
     77    const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
     78
     79    // For each mask extracted from the PDEP marker block
    4180    for (unsigned i = 0; i < mSwizzleFactor; i++) {
    42         // Do block and swizzle index calculations, then combine the "src" and "next" swizzles
    43         Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedBits, blockWidth), base_block_idx); // blk index == stream set block index
     81        // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
     82        Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
    4483        Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedBits, blockWidth), pdepWidth);
    45         Value * next_block_idx = kb->CreateSub(kb->CreateUDiv(kb->CreateAdd(pdepWidth, updatedProcessedBits), blockWidth), base_block_idx);
    46         Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(kb->CreateAdd(pdepWidth, updatedProcessedBits), blockWidth), pdepWidth);
     84        Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedBits);
     85       
     86        Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
     87        Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
    4788
    4889        // Load current and next BitBlocks/swizzles
    49         Value * current_blk_ptr = kb->getAdjustedInputStreamBlockPtr(current_blk_idx, "sourceStreamSet", current_swizzle_idx);
    50         Value * next_blk_ptr = kb->getAdjustedInputStreamBlockPtr(next_block_idx, "sourceStreamSet", next_swizzle_idx);
    51         Value * current_swizzle = kb->CreateBlockAlignedLoad(current_blk_ptr);
    52         Value * next_swizzle = kb->CreateBlockAlignedLoad(next_blk_ptr);
     90        Value * current_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, {current_blk_idx, current_swizzle_idx});
     91        Value * next_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, {next_blk_idx, next_swizzle_idx});
     92        Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
     93        Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
    5394
    5495        // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
     
    5697        Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
    5798        Value * borrowed_bits = kb->CreateShl(next_swizzle,
    58                                               kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
     99                                             kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
    59100        Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
    60101
    61         Value * PDEP_mask = PDEP_masks[i];
    62102        Value * segments = kb->fwCast(mPDEPWidth, combined);
    63103        Value * result_swizzle = Constant::getNullValue(segments->getType());
    64104        // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
     105        Value * PDEP_mask = PDEP_masks[i];
    65106        for (unsigned j = 0; j < mSwizzleFactor; j++) {
    66107            Value * source_field = kb->CreateExtractElement(segments, j);
     
    70111
    71112        // Store the result
    72         kb->storeOutputStreamBlock("outputStreamSet", kb->getSize(i), result_swizzle);
     113        kb->CreateBlockAlignedStore(result_swizzle, kb->CreateGEP(outputStreamPtr, {blockOffsetPhi, kb->getSize(i)}));
     114                                    kb->CallPrintRegister("result_swizzle", result_swizzle);
    73115        updatedProcessedBits = kb->CreateAdd(updatedProcessedBits, mask_popcounts[i]);
    74116    }
    75     kb->setProcessedItemCount("sourceStreamSet", updatedProcessedBits);
     117
     118    updatedProcessedBitsPhi->addIncoming(updatedProcessedBits, processBlock);
     119    blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
     120    blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
     121    kb->CreateBr(checkLoopCond);
     122
     123    kb->SetInsertPoint(terminate);
     124    kb->setProcessedItemCount("sourceStreamSet", updatedProcessedBitsPhi);   
    76125}
    77126
     
    79128    Value * pop_counts = kb->simd_popcount(field_width, blk);
    80129    std::vector<Value *> counts;
    81     for (unsigned i = 0; i < mSwizzleFactor; i++) {
     130    for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
    82131        // Store the pop counts for each blk_width field in blk
    83132        counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
     
    91140    Value * masks = kb->fwCast(mask_width, PDEP_ms_blk);
    92141    std::vector<Value *> PDEP_masks;
    93     for (unsigned i = 0; i < mSwizzleFactor; i++) {
     142    for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
    94143        PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
    95144    }
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.h

    r5588 r5627  
    11/*
    2  *  Copyright (c) 2016 International Characters.
     2 *  Copyright (c) 2017 International Characters.
    33 *  This software is licensed to the public under the Open Software License 3.0.
    44 */
     
    1313
    1414Given a swizzled input stream set and a PDEP marker stream, apply a PDEP operation to each of the input streams in
    15 the input stream set. The PDEPed result streams are returned in a swizzled output stream set.
     15the input stream set. The PDEPed result streams are returned in a swizzled output stream set.
     16
     17The length of the input stream set (in bits) must be greater than or equal to the total popcount of the PDEP marker
     18stream; otherwise, the PDEP operation will run out of source bits before the entire PDEP stream has been processed.
    1619
    1720How it works:
     
    1922You should know how the PDEP operation works before continuing (Wikipedia has a pretty good explanation.)
    2023
    21 The swizzled configuration of the input streams mean that the first blockWidth/mSwizzleFactor bits of each input
     24The swizzled configuration of the input streams means that the first blockWidth/mSwizzleFactor bits of each (unswizzled) input
    2225stream are contained in the first BitBlock of the first input StreamSetBlock. The second BitBlock contains the next
    2326blockWidth/mSwizzleFactor bits for each input stream, and so on. The key observation underpinning the action of the PDEP kernel is that we apply the PDEP operation
    24 using blockWidth/mSwizzleFactor bits of an input stream as the source bits. Since the first swizzle contains blockWidth/mSwizzleFactor
     27using blockWidth/mSwizzleFactor bits of an input stream as the source bits. Since the first BitBlock (i.e. swizzle) contains blockWidth/mSwizzleFactor
    2528bits from each of the input streams, we can begin processing the input streams in the input stream set by applying the first blockWidth/mSwizzleFactor
    2629bits of the PDEP marker stream to each of the swizzle fields in the first BitBlock.
    2730
    28 We can continue using the first blockWidth/mSwizzleFactor bits of each input stream until we have completely consumed it. This occurs
     31We continue using the first blockWidth/mSwizzleFactor bits of each input stream until we have completely consumed them. This occurs
    2932when the combined popcount of the PDEP masks we've used up to this point exceeds blockWidth/mSwizzleFactor. Once we've exhausted the first
    3033BitBlock (i.e. swizzle), we move on to the next one. This pattern continues until we've consumed
     
    6366
    6467namespace kernel {
    65 class PDEPkernel final : public BlockOrientedKernel {
     68class PDEPkernel : public MultiBlockKernel {
    6669public:
    67     PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned PDEP_width = 64);
     70    PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width = 64);
    6871    bool isCachable() const override { return true; }
    6972    bool hasSignature() const override { return false; }
     
    7174    const unsigned mSwizzleFactor;
    7275    const unsigned mPDEPWidth;
    73     void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) override;
     76    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb) override;
    7477    std::vector<llvm::Value *> get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * PDEP_ms_blk,
    7578                                              const unsigned mask_width);
    7679    std::vector<llvm::Value *> get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * blk,
    77                                                   const unsigned field_width);
    78 
     80                                                   const unsigned field_width);
    7981};   
    8082}
    8183   
    8284#endif
    83 
Note: See TracChangeset for help on using the changeset viewer.