source: icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp @ 5805

Last change on this file since 5805 was 5755, checked in by nmedfort, 16 months ago

Bug fixes and simplified MultiBlockKernel logic

File size: 9.2 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <iostream>
9
10using namespace llvm;
11
12namespace kernel {
13
14PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width)
15: MultiBlockKernel("PDEPdel",
16                  {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", BoundedRate(0, 1)},
17                   Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", BoundedRate(0, 1)}},
18                  {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
19                  {}, {}, {})
20, mSwizzleFactor(swizzleFactor)
21, mPDEPWidth(PDEP_width)
22{
23    assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
24    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
25}
26
27Value * PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
28    BasicBlock * entry = kb->GetInsertBlock();
29    BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
30    BasicBlock * checkSourceCount = kb->CreateBasicBlock("checkSourceCount");
31    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
32    BasicBlock * terminate = kb->CreateBasicBlock("terminate");
33
34    Value * itemsToDo = mAvailableItemCount[0];
35    Value * sourceItemsAvail = mAvailableItemCount[1];
36
37    Value * PDEPStrmPtr = kb->getInputStreamBlockPtr("PDEPmarkerStream", kb->getInt32(0)); // mStreamBufferPtr[0];
38    Value * inputSwizzlesPtr = kb->getInputStreamBlockPtr("sourceStreamSet", kb->getInt32(0)); // mStreamBufferPtr[1];
39    // Get pointer to start of the output StreamSetBlock we're currently writing to
40    Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(0)); // mStreamBufferPtr[2];
41
42    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
43    Value * blocksToDo = kb->CreateUDivCeil(itemsToDo, blockWidth); // 1 if this is the final block
44    Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet");
45    Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
46       
47    Value * pdepWidth = kb->getSize(mPDEPWidth);
48    Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
49    Value * PDEP_func = nullptr;
50    if (mPDEPWidth == 64) {
51        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
52    } else if (mPDEPWidth == 32) {
53        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
54    }
55    kb->CreateBr(checkLoopCond);
56
57    kb->SetInsertPoint(checkLoopCond);
58    // The following PHINodes' values can come from entry or processBlock
59    PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
60    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
61    PHINode * updatedProcessedBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
62    PHINode * sourceItemsRemaining = kb->CreatePHI(kb->getSizeTy(), 2);
63    blocksToDoPhi->addIncoming(blocksToDo, entry);
64    blockOffsetPhi->addIncoming(kb->getSize(0), entry);
65    updatedProcessedBitsPhi->addIncoming(processedSourceBits, entry);
66    sourceItemsRemaining->addIncoming(sourceItemsAvail, entry);
67
68    Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
69    kb->CreateCondBr(haveRemBlocks, checkSourceCount, terminate);
70
71    kb->SetInsertPoint(checkSourceCount);
72    // Extract the values we will use in the main processing loop
73    Value * updatedProcessedBits = updatedProcessedBitsPhi;
74    Value * updatedSourceItems = sourceItemsRemaining;
75    Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(PDEPStrmPtr, {blockOffsetPhi, kb->getInt32(0)}));
76
77    const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);   
78    const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
79   
80    Value * total_count = mask_popcounts[0];
81    for (unsigned j = 1; j < mask_popcounts.size(); j++) {
82        total_count = kb->CreateAdd(total_count, mask_popcounts[j]);
83    }
84    kb->CreateCondBr(kb->CreateICmpULE(total_count, sourceItemsRemaining), processBlock, terminate);
85    kb->SetInsertPoint(processBlock);
86
87    // For each mask extracted from the PDEP marker block
88    for (unsigned i = 0; i < mSwizzleFactor; i++) {
89        // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
90        Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
91        Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedBits, blockWidth), pdepWidth);
92        Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedBits);
93       
94        Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
95        Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
96
97        // Load current and next BitBlocks/swizzles
98        Value * current_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, {current_blk_idx, current_swizzle_idx});
99        Value * next_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, {next_blk_idx, next_swizzle_idx});
100        Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
101        Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
102
103        // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
104        Value * shift_amount = kb->CreateURem(updatedProcessedBits, pdepWidth);
105        Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
106        Value * borrowed_bits = kb->CreateShl(next_swizzle,
107                                             kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
108        Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
109
110        Value * segments = kb->fwCast(mPDEPWidth, combined);
111        Value * result_swizzle = Constant::getNullValue(segments->getType());
112        // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
113        Value * PDEP_mask = PDEP_masks[i];
114        for (unsigned j = 0; j < mSwizzleFactor; j++) { 
115            Value * source_field = kb->CreateExtractElement(segments, j);
116            Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask}); 
117            result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j); 
118        }
119
120        // Store the result
121        kb->CreateBlockAlignedStore(result_swizzle, kb->CreateGEP(outputStreamPtr, {blockOffsetPhi, kb->getSize(i)}));
122        updatedProcessedBits = kb->CreateAdd(updatedProcessedBits, mask_popcounts[i]);
123        updatedSourceItems = kb->CreateSub(updatedSourceItems, mask_popcounts[i]);
124    }
125
126    updatedProcessedBitsPhi->addIncoming(updatedProcessedBits, processBlock);
127    blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
128    blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
129    sourceItemsRemaining->addIncoming(updatedSourceItems, processBlock);
130    kb->CreateBr(checkLoopCond);
131
132    kb->SetInsertPoint(terminate);
133    Value * itemsDone = kb->CreateMul(blockOffsetPhi, blockWidth);
134    itemsDone = kb->CreateSelect(kb->CreateICmpULT(itemsToDo, itemsDone), itemsToDo, itemsDone);
135    kb->setProcessedItemCount("PDEPmarkerStream", kb->CreateAdd(itemsDone, kb->getProcessedItemCount("PDEPmarkerStream")));   
136    kb->setProcessedItemCount("sourceStreamSet", updatedProcessedBitsPhi);
137
138    return numOfStrides;
139}
140
141std::vector<Value *> PDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
142    Value * pop_counts = kb->simd_popcount(field_width, blk);
143    std::vector<Value *> counts;
144    for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
145        // Store the pop counts for each blk_width field in blk
146        counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
147    }
148    return counts;
149}
150
151std::vector<Value *> PDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
152    // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
153    // Split the PDEP marker stream block into mPDEPWidth segments.
154    Value * masks = kb->fwCast(mask_width, PDEP_ms_blk); 
155    std::vector<Value *> PDEP_masks;
156    for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
157        PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
158    }
159    return PDEP_masks;
160}
161}
Note: See TracBrowser for help on using the repository browser.