Ignore:
Timestamp:
Sep 2, 2017, 11:59:14 PM (20 months ago)
Author:
cameron
Message:

PDEP kernels from Adam with pdep_width_less_1 fix

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r5588 r5627  
    66#include <kernels/kernel_builder.h>
    77#include <llvm/Support/raw_ostream.h>
     8#include <iostream>
    89
    910using namespace llvm;
     
    1112namespace kernel {
    1213
    13 PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned PDEP_width)
    14 : BlockOrientedKernel("PDEPdel",
     14PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width)
     15: MultiBlockKernel("PDEPdel",
    1516                  {Binding{kb->getStreamSetTy(), "PDEPmarkerStream"}, Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", MaxRatio(1)}},
    16                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}}, {}, {}, {})
    17 , mSwizzleFactor(kb->getBitBlockWidth() / PDEP_width)
     17                  {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
     18                  {}, {}, {})
     19, mSwizzleFactor(swizzleFactor)
    1820, mPDEPWidth(PDEP_width)
    1921{
     22    assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
    2023    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
    2124}
    2225
    23 void PDEPkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
    24     // Extract the values we will use in the main processing loop
    25     Value * PDEP_ms_blk = kb->loadInputStreamBlock("PDEPmarkerStream", kb->getInt32(0));
    26     const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);   
    27     const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
    28     Value * processedBits = kb->getProcessedItemCount("sourceStreamSet");
    29     Value * blockWidth = kb->getSize(kb->getBitBlockWidth());
    30     Value * base_block_idx = kb->CreateUDiv(processedBits, blockWidth);
     26void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb) {   
     27    BasicBlock * entry = kb->GetInsertBlock();
     28    BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
     29    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
     30    BasicBlock * terminate = kb->CreateBasicBlock("terminate");
     31
     32    Function::arg_iterator args = mCurrentMethod->arg_begin();
     33    args++; //self
     34    Value * itemsToDo = &*(args++); // Since PDEP marker stream is a bit stream, this is the number of PDEP marker bits to process
     35    // Get pointer to start of the StreamSetBlock containing unprocessed input items.
     36    args++; //sourceItemsAvail
     37    Value * PDEPStrmPtr = &*(args++);
     38    Value * inputSwizzlesPtr = &*(args++);
     39
     40    // Get pointer to start of the output StreamSetBlock we're currently writing to
     41    Value * outputStreamPtr = &*(args);
     42   
     43    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
     44    Value * blocksToDo = kb->CreateUDivCeil(itemsToDo, blockWidth); // 1 if this is the final block
     45    Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet");
     46    Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
     47       
    3148    Value * pdepWidth = kb->getSize(mPDEPWidth);
     49    Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
    3250    Value * PDEP_func = nullptr;
    3351    if (mPDEPWidth == 64) {
     
    3654        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
    3755    }
    38     Value * updatedProcessedBits = processedBits;
     56    kb->CreateBr(checkLoopCond);
    3957
    40     // For each mask extracted from the PDEP marker stream
     58    kb->SetInsertPoint(checkLoopCond);
     59    // The following PHINodes' values can come from entry or processBlock
     60    PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
     61    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
     62    PHINode * updatedProcessedBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
     63    blocksToDoPhi->addIncoming(blocksToDo, entry);
     64    blockOffsetPhi->addIncoming(kb->getSize(0), entry);
     65    updatedProcessedBitsPhi->addIncoming(processedSourceBits, entry);
     66
     67    Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
     68    kb->CreateCondBr(haveRemBlocks, processBlock, terminate);
     69
     70    kb->SetInsertPoint(processBlock);
     71    // Extract the values we will use in the main processing loop
     72    Value * updatedProcessedBits = updatedProcessedBitsPhi;
     73    Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(PDEPStrmPtr, {blockOffsetPhi, kb->getInt32(0)}));
     74    kb->CallPrintRegister("PDEP_ms_blk", PDEP_ms_blk);
     75
     76    const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);   
     77    const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
     78
     79    // For each mask extracted from the PDEP marker block
    4180    for (unsigned i = 0; i < mSwizzleFactor; i++) {
    42         // Do block and swizzle index calculations, then combine the "src" and "next" swizzles
    43         Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedBits, blockWidth), base_block_idx); // blk index == stream set block index
     81        // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
     82        Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
    4483        Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedBits, blockWidth), pdepWidth);
    45         Value * next_block_idx = kb->CreateSub(kb->CreateUDiv(kb->CreateAdd(pdepWidth, updatedProcessedBits), blockWidth), base_block_idx);
    46         Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(kb->CreateAdd(pdepWidth, updatedProcessedBits), blockWidth), pdepWidth);
     84        Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedBits);
     85       
     86        Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
     87        Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
    4788
    4889        // Load current and next BitBlocks/swizzles
    49         Value * current_blk_ptr = kb->getAdjustedInputStreamBlockPtr(current_blk_idx, "sourceStreamSet", current_swizzle_idx);
    50         Value * next_blk_ptr = kb->getAdjustedInputStreamBlockPtr(next_block_idx, "sourceStreamSet", next_swizzle_idx);
    51         Value * current_swizzle = kb->CreateBlockAlignedLoad(current_blk_ptr);
    52         Value * next_swizzle = kb->CreateBlockAlignedLoad(next_blk_ptr);
     90        Value * current_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, {current_blk_idx, current_swizzle_idx});
     91        Value * next_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, {next_blk_idx, next_swizzle_idx});
     92        Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
     93        Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
    5394
    5495        // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
     
    5697        Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
    5798        Value * borrowed_bits = kb->CreateShl(next_swizzle,
    58                                               kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
     99                                             kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
    59100        Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
    60101
    61         Value * PDEP_mask = PDEP_masks[i];
    62102        Value * segments = kb->fwCast(mPDEPWidth, combined);
    63103        Value * result_swizzle = Constant::getNullValue(segments->getType());
    64104        // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
     105        Value * PDEP_mask = PDEP_masks[i];
    65106        for (unsigned j = 0; j < mSwizzleFactor; j++) {
    66107            Value * source_field = kb->CreateExtractElement(segments, j);
     
    70111
    71112        // Store the result
    72         kb->storeOutputStreamBlock("outputStreamSet", kb->getSize(i), result_swizzle);
     113        kb->CreateBlockAlignedStore(result_swizzle, kb->CreateGEP(outputStreamPtr, {blockOffsetPhi, kb->getSize(i)}));
     114                                    kb->CallPrintRegister("result_swizzle", result_swizzle);
    73115        updatedProcessedBits = kb->CreateAdd(updatedProcessedBits, mask_popcounts[i]);
    74116    }
    75     kb->setProcessedItemCount("sourceStreamSet", updatedProcessedBits);
     117
     118    updatedProcessedBitsPhi->addIncoming(updatedProcessedBits, processBlock);
     119    blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
     120    blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
     121    kb->CreateBr(checkLoopCond);
     122
     123    kb->SetInsertPoint(terminate);
     124    kb->setProcessedItemCount("sourceStreamSet", updatedProcessedBitsPhi);   
    76125}
    77126
     
    79128    Value * pop_counts = kb->simd_popcount(field_width, blk);
    80129    std::vector<Value *> counts;
    81     for (unsigned i = 0; i < mSwizzleFactor; i++) {
     130    for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
    82131        // Store the pop counts for each blk_width field in blk
    83132        counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
     
    91140    Value * masks = kb->fwCast(mask_width, PDEP_ms_blk);
    92141    std::vector<Value *> PDEP_masks;
    93     for (unsigned i = 0; i < mSwizzleFactor; i++) {
     142    for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
    94143        PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
    95144    }
Note: See TracChangeset for help on using the changeset viewer.