source: icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp @ 6004

Last change on this file since 6004 was 5985, checked in by nmedfort, 17 months ago

Restructured MultiBlock kernel. Removal of Swizzled buffers. Inclusion of PopCount rates / non-linear access. Modifications to several kernels to better align them with the kernel and pipeline changes.

File size: 7.4 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <toolchain/toolchain.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned swizzleFactor, std::string name)
15: MultiBlockKernel(std::move(name),
16// input stream sets
17{Binding{b->getStreamSetTy(), "marker", FixedRate(), Principal()},
18Binding{b->getStreamSetTy(swizzleFactor), "source", PopcountOf("marker"), BlockSize(b->getBitBlockWidth() / swizzleFactor) }},
19// output stream set
20{Binding{b->getStreamSetTy(swizzleFactor), "output", FixedRate(), BlockSize(b->getBitBlockWidth() / swizzleFactor)}},
21{}, {}, {})
22, mSwizzleFactor(swizzleFactor) {
23
24}
25
26void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
27    BasicBlock * const entry = b->GetInsertBlock();
28    BasicBlock * const processBlock = b->CreateBasicBlock("processBlock");
29    BasicBlock * const finishedStrides = b->CreateBasicBlock("finishedStrides");
30    const auto pdepWidth = b->getBitBlockWidth() / mSwizzleFactor;
31    ConstantInt * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
32    ConstantInt * const PDEP_WIDTH = b->getSize(pdepWidth);
33
34    Function * pdep = nullptr;
35    if (pdepWidth == 64) {
36        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_64);
37    } else if (pdepWidth == 32) {
38        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_32);
39    } else {
40        report_fatal_error(getName() + ": PDEP width must be 32 or 64");
41    }
42
43    Constant * const ZERO = b->getSize(0);
44    Value * const sourceItemCount = b->getProcessedItemCount("source");
45
46    Value * const initialSourceOffset = b->CreateURem(sourceItemCount, BLOCK_WIDTH);
47    b->CreateBr(processBlock);
48
49    b->SetInsertPoint(processBlock);
50    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
51    strideIndex->addIncoming(ZERO, entry);
52    PHINode * const bufferPhi = b->CreatePHI(b->getBitBlockType(), 2);
53    bufferPhi->addIncoming(Constant::getNullValue(b->getBitBlockType()), entry);
54    PHINode * const sourceOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
55    sourceOffsetPhi->addIncoming(initialSourceOffset, entry);
56    PHINode * const bufferSizePhi = b->CreatePHI(b->getSizeTy(), 2);
57    bufferSizePhi->addIncoming(ZERO, entry);
58
59    // Extract the values we will use in the main processing loop
60    Value * const markerStream = b->getInputStreamBlockPtr("marker", ZERO, strideIndex);
61    Value * const markerValue = b->CreateBlockAlignedLoad(markerStream);
62    Value * const selectors = b->fwCast(pdepWidth, markerValue);
63    Value * const numOfSelectors = b->simd_popcount(pdepWidth, selectors);
64
65    // For each element of the marker block
66    Value * bufferSize = bufferSizePhi;
67    Value * sourceOffset = sourceOffsetPhi;
68    Value * buffer = bufferPhi;
69    for (unsigned i = 0; i < mSwizzleFactor; i++) {
70
71        // How many bits will we deposit?
72        Value * const required = b->CreateExtractElement(numOfSelectors, b->getSize(i));
73
74        // Aggressively enqueue any additional bits
75        BasicBlock * const entry = b->GetInsertBlock();
76        BasicBlock * const enqueueBits = b->CreateBasicBlock();
77        b->CreateBr(enqueueBits);
78
79        b->SetInsertPoint(enqueueBits);
80        PHINode * const updatedBufferSize = b->CreatePHI(bufferSize->getType(), 2);
81        updatedBufferSize->addIncoming(bufferSize, entry);
82        PHINode * const updatedSourceOffset = b->CreatePHI(sourceOffset->getType(), 2);
83        updatedSourceOffset->addIncoming(sourceOffset, entry);
84        PHINode * const updatedBuffer = b->CreatePHI(buffer->getType(), 2);
85        updatedBuffer->addIncoming(buffer, entry);
86
87        // Calculate the block and swizzle index of the current swizzle row
88        Value * const blockOffset = b->CreateUDiv(updatedSourceOffset, BLOCK_WIDTH);
89        Value * const swizzleIndex = b->CreateUDiv(b->CreateURem(updatedSourceOffset, BLOCK_WIDTH), PDEP_WIDTH);
90        Value * const swizzle = b->CreateBlockAlignedLoad(b->getInputStreamBlockPtr("source", swizzleIndex, blockOffset));
91        Value * const swizzleOffset = b->CreateURem(updatedSourceOffset, PDEP_WIDTH);
92
93        // Shift the swizzle to the right to clear off any used bits ...
94        Value * const swizzleShift = b->simd_fill(pdepWidth, swizzleOffset);
95        Value * const unreadBits = b->CreateLShr(swizzle, swizzleShift);
96
97        // ... then to the left to align the bits with the buffer and combine them.
98        Value * const bufferShift = b->simd_fill(pdepWidth, updatedBufferSize);
99        Value * const pendingBits = b->CreateShl(unreadBits, bufferShift);
100
101        buffer = b->CreateOr(updatedBuffer, pendingBits);
102        updatedBuffer->addIncoming(buffer, enqueueBits);
103
104        // Update the buffer size with the number of bits we have actually enqueued
105        Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), updatedBufferSize);
106        bufferSize = b->CreateUMin(maxBufferSize, PDEP_WIDTH);
107        updatedBufferSize->addIncoming(bufferSize, enqueueBits);
108
109        // ... and increment the source offset by the number we actually inserted
110        Value * const inserted = b->CreateSub(bufferSize, updatedBufferSize);
111        sourceOffset = b->CreateAdd(updatedSourceOffset, inserted);
112        updatedSourceOffset->addIncoming(sourceOffset, enqueueBits);
113
114        // INVESTIGATE: we can branch at most once here. I'm not sure whether the potential
115        // branch misprediction is better or worse than always filling from two swizzles to
116        // ensure that we have enough bits to deposit.
117        BasicBlock * const depositBits = b->CreateBasicBlock();
118        b->CreateUnlikelyCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
119
120        b->SetInsertPoint(depositBits);
121
122        // Apply PDEP to each element of the combined swizzle using the current PDEP mask
123        Value * result = UndefValue::get(buffer->getType());
124        Value * const mask = b->CreateExtractElement(selectors, i);
125        for (unsigned j = 0; j < mSwizzleFactor; j++) {
126            Value * source_field = b->CreateExtractElement(buffer, j);
127            Value * PDEP_field = b->CreateCall(pdep, {source_field, mask});
128            result = b->CreateInsertElement(result, PDEP_field, j);
129        }
130
131        // Store the result
132        Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output", b->getSize(i), strideIndex);
133        b->CreateBlockAlignedStore(result, outputStreamPtr);
134
135        // Shift away any used bits from the buffer and decrement our buffer size by the number we used
136        Value * const usedShift = b->simd_fill(pdepWidth, required);
137        buffer = b->CreateLShr(buffer, usedShift);
138        bufferSize = b->CreateSub(bufferSize, required);
139    }
140
141    BasicBlock * const finishedBlock = b->GetInsertBlock();
142    sourceOffsetPhi->addIncoming(sourceOffset, finishedBlock);
143    bufferSizePhi->addIncoming(bufferSize, finishedBlock);
144    bufferPhi->addIncoming(buffer, finishedBlock);
145    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
146    strideIndex->addIncoming(nextStrideIndex, finishedBlock);
147    b->CreateLikelyCondBr(b->CreateICmpNE(nextStrideIndex, numOfBlocks), processBlock, finishedStrides);
148
149    b->SetInsertPoint(finishedStrides);
150}
151
152}
Note: See TracBrowser for help on using the repository browser.