source: icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp @ 5682

Last change on this file since 5682 was 5639, checked in by cameron, 23 months ago

Fixes for multiblock kernel builder

File size: 9.1 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <iostream>
9
10using namespace llvm;
11
12namespace kernel {
13
14PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width)
15: MultiBlockKernel("PDEPdel",
16                  {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", MaxRatio(1)}, Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", MaxRatio(1)}},
17                  {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
18                  {}, {}, {})
19, mSwizzleFactor(swizzleFactor)
20, mPDEPWidth(PDEP_width)
21{
22    assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
23    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
24}
25
26void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb) {   
27    BasicBlock * entry = kb->GetInsertBlock();
28    BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
29    BasicBlock * checkSourceCount = kb->CreateBasicBlock("checkSourceCount");
30    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
31    BasicBlock * terminate = kb->CreateBasicBlock("terminate");
32
33    Function::arg_iterator args = mCurrentMethod->arg_begin();
34    args++; //self
35    Value * itemsToDo = &*(args++); // Since PDEP marker stream is a bit stream, this is the number of PDEP marker bits to process
36    // Get pointer to start of the StreamSetBlock containing unprocessed input items.
37    Value * sourceItemsAvail =  &*(args++); 
38    Value * PDEPStrmPtr = &*(args++);
39    Value * inputSwizzlesPtr = &*(args++);
40    // Get pointer to start of the output StreamSetBlock we're currently writing to
41    Value * outputStreamPtr = &*(args);
42
43    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
44    Value * blocksToDo = kb->CreateUDivCeil(itemsToDo, blockWidth); // 1 if this is the final block
45    Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet");
46    Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
47       
48    Value * pdepWidth = kb->getSize(mPDEPWidth);
49    Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
50    Value * PDEP_func = nullptr;
51    if (mPDEPWidth == 64) {
52        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
53    } else if (mPDEPWidth == 32) {
54        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
55    }
56    kb->CreateBr(checkLoopCond);
57
58    kb->SetInsertPoint(checkLoopCond);
59    // The following PHINodes' values can come from entry or processBlock
60    PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
61    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
62    PHINode * updatedProcessedBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
63    PHINode * sourceItemsRemaining = kb->CreatePHI(kb->getSizeTy(), 2);
64    blocksToDoPhi->addIncoming(blocksToDo, entry);
65    blockOffsetPhi->addIncoming(kb->getSize(0), entry);
66    updatedProcessedBitsPhi->addIncoming(processedSourceBits, entry);
67    sourceItemsRemaining->addIncoming(sourceItemsAvail, entry);
68
69    Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
70    kb->CreateCondBr(haveRemBlocks, checkSourceCount, terminate);
71
72    kb->SetInsertPoint(checkSourceCount);
73    // Extract the values we will use in the main processing loop
74    Value * updatedProcessedBits = updatedProcessedBitsPhi;
75    Value * updatedSourceItems = sourceItemsRemaining;
76    Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(PDEPStrmPtr, {blockOffsetPhi, kb->getInt32(0)}));
77
78    const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);   
79    const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
80   
81    Value * total_count = mask_popcounts[0];
82    for (unsigned j = 1; j < mask_popcounts.size(); j++) {
83        total_count = kb->CreateAdd(total_count, mask_popcounts[j]);
84    }
85    kb->CreateCondBr(kb->CreateICmpULE(total_count, sourceItemsRemaining), processBlock, terminate);
86    kb->SetInsertPoint(processBlock);
87
88    // For each mask extracted from the PDEP marker block
89    for (unsigned i = 0; i < mSwizzleFactor; i++) {
90        // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
91        Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
92        Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedBits, blockWidth), pdepWidth);
93        Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedBits);
94       
95        Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
96        Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
97
98        // Load current and next BitBlocks/swizzles
99        Value * current_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, {current_blk_idx, current_swizzle_idx});
100        Value * next_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, {next_blk_idx, next_swizzle_idx});
101        Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
102        Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
103
104        // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
105        Value * shift_amount = kb->CreateURem(updatedProcessedBits, pdepWidth);
106        Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
107        Value * borrowed_bits = kb->CreateShl(next_swizzle,
108                                             kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
109        Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
110
111        Value * segments = kb->fwCast(mPDEPWidth, combined);
112        Value * result_swizzle = Constant::getNullValue(segments->getType());
113        // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
114        Value * PDEP_mask = PDEP_masks[i];
115        for (unsigned j = 0; j < mSwizzleFactor; j++) { 
116            Value * source_field = kb->CreateExtractElement(segments, j);
117            Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask}); 
118            result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j); 
119        }
120
121        // Store the result
122        kb->CreateBlockAlignedStore(result_swizzle, kb->CreateGEP(outputStreamPtr, {blockOffsetPhi, kb->getSize(i)}));
123        updatedProcessedBits = kb->CreateAdd(updatedProcessedBits, mask_popcounts[i]);
124        updatedSourceItems = kb->CreateSub(updatedSourceItems, mask_popcounts[i]);
125    }
126
127    updatedProcessedBitsPhi->addIncoming(updatedProcessedBits, processBlock);
128    blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
129    blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
130    sourceItemsRemaining->addIncoming(updatedSourceItems, processBlock);
131    kb->CreateBr(checkLoopCond);
132
133    kb->SetInsertPoint(terminate);
134    Value * itemsDone = kb->CreateMul(blockOffsetPhi, blockWidth);
135    itemsDone = kb->CreateSelect(kb->CreateICmpULT(itemsToDo, itemsDone), itemsToDo, itemsDone);
136    kb->setProcessedItemCount("PDEPmarkerStream", kb->CreateAdd(itemsDone, kb->getProcessedItemCount("PDEPmarkerStream")));   
137    kb->setProcessedItemCount("sourceStreamSet", updatedProcessedBitsPhi);   
138}
139
140std::vector<Value *> PDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
141    Value * pop_counts = kb->simd_popcount(field_width, blk);
142    std::vector<Value *> counts;
143    for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
144        // Store the pop counts for each blk_width field in blk
145        counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
146    }
147    return counts;
148}
149
150std::vector<Value *> PDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
151    // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
152    // Split the PDEP marker stream block into mPDEPWidth segments.
153    Value * masks = kb->fwCast(mask_width, PDEP_ms_blk); 
154    std::vector<Value *> PDEP_masks;
155    for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
156        PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
157    }
158    return PDEP_masks;
159}
160}
Note: See TracBrowser for help on using the repository browser.