source: icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp @ 5627

Last change on this file since 5627 was 5627, checked in by cameron, 20 months ago

PDEP kernels from Adam with pdep_width_less_1 fix

File size: 8.2 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <iostream>
9
10using namespace llvm;
11
12namespace kernel {
13
14PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width)
15: MultiBlockKernel("PDEPdel",
16                  {Binding{kb->getStreamSetTy(), "PDEPmarkerStream"}, Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", MaxRatio(1)}},
17                  {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
18                  {}, {}, {})
19, mSwizzleFactor(swizzleFactor)
20, mPDEPWidth(PDEP_width)
21{
22    assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
23    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
24}
25
26void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb) {   
27    BasicBlock * entry = kb->GetInsertBlock();
28    BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
29    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
30    BasicBlock * terminate = kb->CreateBasicBlock("terminate");
31
32    Function::arg_iterator args = mCurrentMethod->arg_begin();
33    args++; //self
34    Value * itemsToDo = &*(args++); // Since PDEP marker stream is a bit stream, this is the number of PDEP marker bits to process
35    // Get pointer to start of the StreamSetBlock containing unprocessed input items.
36    args++; //sourceItemsAvail
37    Value * PDEPStrmPtr = &*(args++);
38    Value * inputSwizzlesPtr = &*(args++);
39
40    // Get pointer to start of the output StreamSetBlock we're currently writing to
41    Value * outputStreamPtr = &*(args);
42   
43    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
44    Value * blocksToDo = kb->CreateUDivCeil(itemsToDo, blockWidth); // 1 if this is the final block
45    Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet");
46    Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
47       
48    Value * pdepWidth = kb->getSize(mPDEPWidth);
49    Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
50    Value * PDEP_func = nullptr;
51    if (mPDEPWidth == 64) {
52        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
53    } else if (mPDEPWidth == 32) {
54        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
55    }
56    kb->CreateBr(checkLoopCond);
57
58    kb->SetInsertPoint(checkLoopCond);
59    // The following PHINodes' values can come from entry or processBlock
60    PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
61    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
62    PHINode * updatedProcessedBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
63    blocksToDoPhi->addIncoming(blocksToDo, entry);
64    blockOffsetPhi->addIncoming(kb->getSize(0), entry);
65    updatedProcessedBitsPhi->addIncoming(processedSourceBits, entry);
66
67    Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
68    kb->CreateCondBr(haveRemBlocks, processBlock, terminate);
69
70    kb->SetInsertPoint(processBlock);
71    // Extract the values we will use in the main processing loop
72    Value * updatedProcessedBits = updatedProcessedBitsPhi;
73    Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(PDEPStrmPtr, {blockOffsetPhi, kb->getInt32(0)}));
74    kb->CallPrintRegister("PDEP_ms_blk", PDEP_ms_blk);
75
76    const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);   
77    const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
78
79    // For each mask extracted from the PDEP marker block
80    for (unsigned i = 0; i < mSwizzleFactor; i++) {
81        // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
82        Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
83        Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedBits, blockWidth), pdepWidth);
84        Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedBits);
85       
86        Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
87        Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
88
89        // Load current and next BitBlocks/swizzles
90        Value * current_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, {current_blk_idx, current_swizzle_idx});
91        Value * next_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, {next_blk_idx, next_swizzle_idx});
92        Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
93        Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
94
95        // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
96        Value * shift_amount = kb->CreateURem(updatedProcessedBits, pdepWidth);
97        Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
98        Value * borrowed_bits = kb->CreateShl(next_swizzle,
99                                             kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
100        Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
101
102        Value * segments = kb->fwCast(mPDEPWidth, combined);
103        Value * result_swizzle = Constant::getNullValue(segments->getType());
104        // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
105        Value * PDEP_mask = PDEP_masks[i];
106        for (unsigned j = 0; j < mSwizzleFactor; j++) { 
107            Value * source_field = kb->CreateExtractElement(segments, j);
108            Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask}); 
109            result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j); 
110        }
111
112        // Store the result
113        kb->CreateBlockAlignedStore(result_swizzle, kb->CreateGEP(outputStreamPtr, {blockOffsetPhi, kb->getSize(i)}));
114                                    kb->CallPrintRegister("result_swizzle", result_swizzle);
115        updatedProcessedBits = kb->CreateAdd(updatedProcessedBits, mask_popcounts[i]);
116    }
117
118    updatedProcessedBitsPhi->addIncoming(updatedProcessedBits, processBlock);
119    blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
120    blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
121    kb->CreateBr(checkLoopCond);
122
123    kb->SetInsertPoint(terminate);
124    kb->setProcessedItemCount("sourceStreamSet", updatedProcessedBitsPhi);   
125}
126
127std::vector<Value *> PDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
128    Value * pop_counts = kb->simd_popcount(field_width, blk);
129    std::vector<Value *> counts;
130    for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
131        // Store the pop counts for each blk_width field in blk
132        counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
133    }
134    return counts;
135}
136
137std::vector<Value *> PDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
138    // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
139    // Split the PDEP marker stream block into mPDEPWidth segments.
140    Value * masks = kb->fwCast(mask_width, PDEP_ms_blk); 
141    std::vector<Value *> PDEP_masks;
142    for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
143        PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
144    }
145    return PDEP_masks;
146}
147}
Note: See TracBrowser for help on using the repository browser.