source: icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp @ 5635

Last change on this file since 5635 was 5635, checked in by cameron, 21 months ago

PDEP kernel: only process blocks if sufficient source stream data available

File size: 9.1 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <iostream>
9
10using namespace llvm;
11
12namespace kernel {
13
14PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width)
15: MultiBlockKernel("PDEPdel",
16                  {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", MaxRatio(1)}, Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", MaxRatio(1)}},
17                  {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
18                  {}, {}, {})
19, mSwizzleFactor(swizzleFactor)
20, mPDEPWidth(PDEP_width)
21{
22    assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
23    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
24}
25
26void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb) {   
27    BasicBlock * entry = kb->GetInsertBlock();
28    BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
29    BasicBlock * checkSourceCount = kb->CreateBasicBlock("checkSourceCount");
30    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
31    BasicBlock * terminate = kb->CreateBasicBlock("terminate");
32
33    Function::arg_iterator args = mCurrentMethod->arg_begin();
34    args++; //self
35    Value * itemsToDo = &*(args++); // Since PDEP marker stream is a bit stream, this is the number of PDEP marker bits to process
36    // Get pointer to start of the StreamSetBlock containing unprocessed input items.
37    Value * sourceItemsAvail = args++; 
38    Value * PDEPStrmPtr = &*(args++);
39    Value * inputSwizzlesPtr = &*(args++);
40
41    // Get pointer to start of the output StreamSetBlock we're currently writing to
42    Value * outputStreamPtr = &*(args);
43   
44    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
45    Value * blocksToDo = kb->CreateUDivCeil(itemsToDo, blockWidth); // 1 if this is the final block
46    Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet");
47    Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
48       
49    Value * pdepWidth = kb->getSize(mPDEPWidth);
50    Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
51    Value * PDEP_func = nullptr;
52    if (mPDEPWidth == 64) {
53        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
54    } else if (mPDEPWidth == 32) {
55        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
56    }
57    kb->CreateBr(checkLoopCond);
58
59    kb->SetInsertPoint(checkLoopCond);
60    // The following PHINodes' values can come from entry or processBlock
61    PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
62    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
63    PHINode * updatedProcessedBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
64    PHINode * sourceItemsRemaining = kb->CreatePHI(kb->getSizeTy(), 2);
65    blocksToDoPhi->addIncoming(blocksToDo, entry);
66    blockOffsetPhi->addIncoming(kb->getSize(0), entry);
67    updatedProcessedBitsPhi->addIncoming(processedSourceBits, entry);
68    sourceItemsRemaining->addIncoming(sourceItemsAvail, entry);
69
70    Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
71    kb->CreateCondBr(haveRemBlocks, checkSourceCount, terminate);
72
73    kb->SetInsertPoint(checkSourceCount);
74    // Extract the values we will use in the main processing loop
75    Value * updatedProcessedBits = updatedProcessedBitsPhi;
76    Value * updatedSourceItems = sourceItemsRemaining;
77    Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(PDEPStrmPtr, {blockOffsetPhi, kb->getInt32(0)}));
78    kb->CallPrintRegister("PDEP_ms_blk", PDEP_ms_blk);
79
80    const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);   
81    const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
82   
83    Value * total_count = mask_popcounts[0];
84    for (unsigned j = 1; j < mask_popcounts.size(); j++) {
85        total_count = kb->CreateAdd(mask_popcounts[j]);
86    }
87    kb->CreateCondBr(kb->CreateUGE(total_count, sourceItemsRemaining), processBlock, terminate);
88    kb->SetInsertPoint(processBlock);
89
90    // For each mask extracted from the PDEP marker block
91    for (unsigned i = 0; i < mSwizzleFactor; i++) {
92        // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
93        Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
94        Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedBits, blockWidth), pdepWidth);
95        Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedBits);
96       
97        Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
98        Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
99
100        // Load current and next BitBlocks/swizzles
101        Value * current_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, {current_blk_idx, current_swizzle_idx});
102        Value * next_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, {next_blk_idx, next_swizzle_idx});
103        Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
104        Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
105
106        // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
107        Value * shift_amount = kb->CreateURem(updatedProcessedBits, pdepWidth);
108        Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
109        Value * borrowed_bits = kb->CreateShl(next_swizzle,
110                                             kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
111        Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
112
113        Value * segments = kb->fwCast(mPDEPWidth, combined);
114        Value * result_swizzle = Constant::getNullValue(segments->getType());
115        // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
116        Value * PDEP_mask = PDEP_masks[i];
117        for (unsigned j = 0; j < mSwizzleFactor; j++) { 
118            Value * source_field = kb->CreateExtractElement(segments, j);
119            Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask}); 
120            result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j); 
121        }
122
123        // Store the result
124        kb->CreateBlockAlignedStore(result_swizzle, kb->CreateGEP(outputStreamPtr, {blockOffsetPhi, kb->getSize(i)}));
125                                    kb->CallPrintRegister("result_swizzle", result_swizzle);
126        updatedProcessedBits = kb->CreateAdd(updatedProcessedBits, mask_popcounts[i]);
127        updatedSourceItems = kb->CreateSub(updatedSourceItems, mask_popcounts[i]);
128    }
129
130    updatedProcessedBitsPhi->addIncoming(updatedProcessedBits, processBlock);
131    blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
132    blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
133    sourceItemsRemaining->addIncoming(updatedSourceItems, processBlock);
134    kb->CreateBr(checkLoopCond);
135
136    kb->SetInsertPoint(terminate);
137   
138    kb->setProcessedItemCount("PDEPmarkerStream", updatedProcessedBitsPhi);   
139    kb->setProcessedItemCount("sourceStreamSet", updatedProcessedBitsPhi);   
140}
141
142std::vector<Value *> PDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
143    Value * pop_counts = kb->simd_popcount(field_width, blk);
144    std::vector<Value *> counts;
145    for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
146        // Store the pop counts for each blk_width field in blk
147        counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
148    }
149    return counts;
150}
151
152std::vector<Value *> PDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
153    // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
154    // Split the PDEP marker stream block into mPDEPWidth segments.
155    Value * masks = kb->fwCast(mask_width, PDEP_ms_blk); 
156    std::vector<Value *> PDEP_masks;
157    for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
158        PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
159    }
160    return PDEP_masks;
161}
162}
Note: See TracBrowser for help on using the repository browser.