source: icGREP/icgrep-devel/icgrep/kernels/swizzled_multiple_pdep_kernel.cpp @ 6055

Last change on this file since 6055 was 6055, checked in by cameron, 13 months ago

Various small fixes

File size: 9.0 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "swizzled_multiple_pdep_kernel.h"
7#include <kernels/kernel_builder.h>
8#include <toolchain/toolchain.h>
9#include <llvm/IR/Intrinsics.h>
10
11using namespace llvm;
12
13namespace kernel {
14
15SwizzledMultiplePDEPkernel::SwizzledMultiplePDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned swizzleFactor, const unsigned numberOfStreamSet, std::string name)
16: MultiBlockKernel(std::move(name),
17// input stream sets
18{Binding{b->getStreamSetTy(), "marker", FixedRate(), Principal()},
19Binding{b->getStreamSetTy(swizzleFactor), "source0", PopcountOf("marker"), BlockSize(b->getBitBlockWidth() / swizzleFactor) }},
20// output stream set
21{Binding{b->getStreamSetTy(swizzleFactor), "output0", FixedRate(), BlockSize(b->getBitBlockWidth() / swizzleFactor)}},
22{}, {}, {})
23, mSwizzleFactor(swizzleFactor), mNumberOfStreamSet(numberOfStreamSet) {
24    for (unsigned i = 1; i < numberOfStreamSet; i++) {
25        mStreamSetInputs.push_back(Binding{b->getStreamSetTy(swizzleFactor), "source" + std::to_string(i), RateEqualTo("source0"), BlockSize(b->getBitBlockWidth() / swizzleFactor) });
26        mStreamSetOutputs.push_back(Binding{b->getStreamSetTy(swizzleFactor), "output" + std::to_string(i), RateEqualTo("output0"), BlockSize(b->getBitBlockWidth() / swizzleFactor)});
27    }
28}
29
30void SwizzledMultiplePDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
31    BasicBlock * const entry = b->GetInsertBlock();
32    BasicBlock * const processBlock = b->CreateBasicBlock("processBlock");
33    BasicBlock * const finishedStrides = b->CreateBasicBlock("finishedStrides");
34    const auto pdepWidth = b->getBitBlockWidth() / mSwizzleFactor;
35    ConstantInt * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
36    ConstantInt * const PDEP_WIDTH = b->getSize(pdepWidth);
37
38    Function * pdep = nullptr;
39    if (pdepWidth == 64) {
40        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_64);
41    } else if (pdepWidth == 32) {
42        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_32);
43    } else {
44        report_fatal_error(getName() + ": PDEP width must be 32 or 64");
45    }
46
47    Constant * const ZERO = b->getSize(0);
48    Value * const sourceItemCount = b->getProcessedItemCount("source0");
49
50    Value * const initialSourceOffset = b->CreateURem(sourceItemCount, BLOCK_WIDTH);
51    b->CreateBr(processBlock);
52
53    b->SetInsertPoint(processBlock);
54    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
55    strideIndex->addIncoming(ZERO, entry);
56
57    std::vector<PHINode*> bufferPhiArray(mNumberOfStreamSet, NULL);
58    std::vector<Value*> bufferArray(mNumberOfStreamSet, NULL);
59    for (unsigned iStreamSetIndex = 0; iStreamSetIndex < mNumberOfStreamSet; iStreamSetIndex++) {
60        PHINode * const bufferPhi = b->CreatePHI(b->getBitBlockType(), 2);
61        bufferPhi->addIncoming(Constant::getNullValue(b->getBitBlockType()), entry);
62        bufferPhiArray[iStreamSetIndex] = bufferPhi;
63        bufferArray[iStreamSetIndex] = bufferPhi;
64    }
65
66    PHINode * const sourceOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
67    sourceOffsetPhi->addIncoming(initialSourceOffset, entry);
68    PHINode * const bufferSizePhi = b->CreatePHI(b->getSizeTy(), 2);
69    bufferSizePhi->addIncoming(ZERO, entry);
70
71    // Extract the values we will use in the main processing loop
72    Value * const markerStream = b->getInputStreamBlockPtr("marker", ZERO, strideIndex);
73    Value * const markerValue = b->CreateBlockAlignedLoad(markerStream);
74    Value * const selectors = b->fwCast(pdepWidth, markerValue);
75    Value * const numOfSelectors = b->simd_popcount(pdepWidth, selectors);
76
77    // For each element of the marker block
78    Value * bufferSize = bufferSizePhi;
79    Value * sourceOffset = sourceOffsetPhi;
80    for (unsigned i = 0; i < mSwizzleFactor; i++) {
81
82        // How many bits will we deposit?
83        Value * const required = b->CreateExtractElement(numOfSelectors, b->getSize(i));
84
85        // Aggressively enqueue any additional bits
86        BasicBlock * const entry = b->GetInsertBlock();
87        BasicBlock * const enqueueBits = b->CreateBasicBlock();
88        b->CreateBr(enqueueBits);
89
90        b->SetInsertPoint(enqueueBits);
91        PHINode * const updatedBufferSize = b->CreatePHI(bufferSize->getType(), 2);
92        updatedBufferSize->addIncoming(bufferSize, entry);
93        PHINode * const updatedSourceOffset = b->CreatePHI(sourceOffset->getType(), 2);
94        updatedSourceOffset->addIncoming(sourceOffset, entry);
95
96        std::vector<PHINode * > updatedBufferArray(mNumberOfStreamSet, NULL);
97        for (unsigned iStreamSetIndex = 0; iStreamSetIndex < mNumberOfStreamSet; iStreamSetIndex++) {
98            Value* buffer = bufferArray[iStreamSetIndex];
99            PHINode * const updatedBuffer = b->CreatePHI(buffer->getType(), 2);
100            updatedBuffer->addIncoming(buffer, entry);
101            updatedBufferArray[iStreamSetIndex] = updatedBuffer;
102        }
103
104        // Calculate the block and swizzle index of the current swizzle row
105        Value * const blockOffset = b->CreateUDiv(updatedSourceOffset, BLOCK_WIDTH);
106        Value * const swizzleIndex = b->CreateUDiv(b->CreateURem(updatedSourceOffset, BLOCK_WIDTH), PDEP_WIDTH);
107
108        Value * const swizzleOffset = b->CreateURem(updatedSourceOffset, PDEP_WIDTH);
109
110        for (unsigned iStreamSetIndex = 0; iStreamSetIndex < mNumberOfStreamSet; iStreamSetIndex++) {
111            Value * const swizzle = b->CreateBlockAlignedLoad(b->getInputStreamBlockPtr("source" + std::to_string(iStreamSetIndex), swizzleIndex, blockOffset));
112
113            // Shift the swizzle to the right to clear off any used bits ...
114            Value * const swizzleShift = b->simd_fill(pdepWidth, swizzleOffset);
115            Value * const unreadBits = b->CreateLShr(swizzle, swizzleShift);
116
117            // ... then to the left to align the bits with the buffer and combine them.
118            Value * const bufferShift = b->simd_fill(pdepWidth, updatedBufferSize);
119            Value * const pendingBits = b->CreateShl(unreadBits, bufferShift);
120
121            bufferArray[iStreamSetIndex] = b->CreateOr(updatedBufferArray[iStreamSetIndex], pendingBits);
122            updatedBufferArray[iStreamSetIndex]->addIncoming(bufferArray[iStreamSetIndex], enqueueBits);
123        }
124
125        // Update the buffer size with the number of bits we have actually enqueued
126        Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), updatedBufferSize);
127        bufferSize = b->CreateUMin(maxBufferSize, PDEP_WIDTH);
128        updatedBufferSize->addIncoming(bufferSize, enqueueBits);
129
130        // ... and increment the source offset by the number we actually inserted
131        Value * const inserted = b->CreateSub(bufferSize, updatedBufferSize);
132        sourceOffset = b->CreateAdd(updatedSourceOffset, inserted);
133        updatedSourceOffset->addIncoming(sourceOffset, enqueueBits);
134
135        // INVESTIGATE: we can branch at most once here. I'm not sure whether the potential
136        // branch misprediction is better or worse than always filling from two swizzles to
137        // ensure that we have enough bits to deposit.
138        BasicBlock * const depositBits = b->CreateBasicBlock();
139        b->CreateUnlikelyCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
140
141        b->SetInsertPoint(depositBits);
142
143        // Apply PDEP to each element of the combined swizzle using the current PDEP mask
144        Value * const mask = b->CreateExtractElement(selectors, i);
145        Value * const usedShift = b->simd_fill(pdepWidth, required);
146        for (unsigned iStreamSetIndex = 0; iStreamSetIndex < mNumberOfStreamSet; iStreamSetIndex++) {
147            Value* result = b->simd_pdep(pdepWidth, bufferArray[iStreamSetIndex], b->simd_fill(pdepWidth, mask));
148            // Store the result
149            Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output" + std::to_string(iStreamSetIndex), b->getSize(i), strideIndex);
150            b->CreateBlockAlignedStore(result, outputStreamPtr);
151
152            // Shift away any used bits from the buffer and decrement our buffer size by the number we used
153            bufferArray[iStreamSetIndex] = b->CreateLShr(bufferArray[iStreamSetIndex], usedShift);
154        }
155
156        bufferSize = b->CreateSub(bufferSize, required);
157    }
158
159    BasicBlock * const finishedBlock = b->GetInsertBlock();
160    sourceOffsetPhi->addIncoming(sourceOffset, finishedBlock);
161    bufferSizePhi->addIncoming(bufferSize, finishedBlock);
162    for (unsigned iStreamSetIndex = 0; iStreamSetIndex < mNumberOfStreamSet; iStreamSetIndex++) {
163        bufferPhiArray[iStreamSetIndex]->addIncoming(bufferArray[iStreamSetIndex], finishedBlock);
164    }
165
166    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
167    strideIndex->addIncoming(nextStrideIndex, finishedBlock);
168    b->CreateLikelyCondBr(b->CreateICmpNE(nextStrideIndex, numOfBlocks), processBlock, finishedStrides);
169
170    b->SetInsertPoint(finishedStrides);
171}
172
173}
Note: See TracBrowser for help on using the repository browser.