source: icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp @ 5611

Last change on this file since 5611 was 5588, checked in by cameron, 23 months ago

PDEP kernel - initial check-in from Adam

File size: 5.6 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
#include "pdep_kernel.h"
#include <kernels/kernel_builder.h>
#include <llvm/Support/ErrorHandling.h>
#include <llvm/Support/raw_ostream.h>
8
9using namespace llvm;
10
11namespace kernel {
12
13PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned PDEP_width)
14: BlockOrientedKernel("PDEPdel",
15                  {Binding{kb->getStreamSetTy(), "PDEPmarkerStream"}, Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", MaxRatio(1)}},
16                  {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}}, {}, {}, {})
17, mSwizzleFactor(kb->getBitBlockWidth() / PDEP_width)
18, mPDEPWidth(PDEP_width)
19{
20    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
21}
22
23void PDEPkernel::generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & kb) {
24    // Extract the values we will use in the main processing loop
25    Value * PDEP_ms_blk = kb->loadInputStreamBlock("PDEPmarkerStream", kb->getInt32(0));
26    const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);   
27    const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
28    Value * processedBits = kb->getProcessedItemCount("sourceStreamSet");
29    Value * blockWidth = kb->getSize(kb->getBitBlockWidth());
30    Value * base_block_idx = kb->CreateUDiv(processedBits, blockWidth);
31    Value * pdepWidth = kb->getSize(mPDEPWidth);
32    Value * PDEP_func = nullptr;
33    if (mPDEPWidth == 64) {
34        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
35    } else if (mPDEPWidth == 32) {
36        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
37    }
38    Value * updatedProcessedBits = processedBits;
39
40    // For each mask extracted from the PDEP marker stream
41    for (unsigned i = 0; i < mSwizzleFactor; i++) {
42        // Do block and swizzle index calculations, then combine the "src" and "next" swizzles
43        Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedBits, blockWidth), base_block_idx); // blk index == stream set block index
44        Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedBits, blockWidth), pdepWidth);
45        Value * next_block_idx = kb->CreateSub(kb->CreateUDiv(kb->CreateAdd(pdepWidth, updatedProcessedBits), blockWidth), base_block_idx);
46        Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(kb->CreateAdd(pdepWidth, updatedProcessedBits), blockWidth), pdepWidth);
47
48        // Load current and next BitBlocks/swizzles
49        Value * current_blk_ptr = kb->getAdjustedInputStreamBlockPtr(current_blk_idx, "sourceStreamSet", current_swizzle_idx);
50        Value * next_blk_ptr = kb->getAdjustedInputStreamBlockPtr(next_block_idx, "sourceStreamSet", next_swizzle_idx);
51        Value * current_swizzle = kb->CreateBlockAlignedLoad(current_blk_ptr);
52        Value * next_swizzle = kb->CreateBlockAlignedLoad(next_blk_ptr);
53
54        // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
55        Value * shift_amount = kb->CreateURem(updatedProcessedBits, pdepWidth);
56        Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
57        Value * borrowed_bits = kb->CreateShl(next_swizzle,
58                                              kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
59        Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
60
61        Value * PDEP_mask = PDEP_masks[i];
62        Value * segments = kb->fwCast(mPDEPWidth, combined);
63        Value * result_swizzle = Constant::getNullValue(segments->getType());
64        // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
65        for (unsigned j = 0; j < mSwizzleFactor; j++) { 
66            Value * source_field = kb->CreateExtractElement(segments, j);
67            Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask}); 
68            result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j); 
69        }
70
71        // Store the result
72        kb->storeOutputStreamBlock("outputStreamSet", kb->getSize(i), result_swizzle);
73        updatedProcessedBits = kb->CreateAdd(updatedProcessedBits, mask_popcounts[i]);
74    }
75    kb->setProcessedItemCount("sourceStreamSet", updatedProcessedBits);
76}
77
78std::vector<Value *> PDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
79    Value * pop_counts = kb->simd_popcount(field_width, blk);
80    std::vector<Value *> counts;
81    for (unsigned i = 0; i < mSwizzleFactor; i++) {
82        // Store the pop counts for each blk_width field in blk
83        counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
84    }
85    return counts;
86}
87
88std::vector<Value *> PDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
89    // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
90    // Split the PDEP marker stream block into mPDEPWidth segments.
91    Value * masks = kb->fwCast(mask_width, PDEP_ms_blk); 
92    std::vector<Value *> PDEP_masks;
93    for (unsigned i = 0; i < mSwizzleFactor; i++) {
94        PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
95    }
96    return PDEP_masks;
97}
98}
Note: See TracBrowser for help on using the repository browser.