source: icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp @ 5941

Last change on this file since 5941 was 5870, checked in by nmedfort, 16 months ago

Modified PDEP kernel

File size: 9.6 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <toolchain/toolchain.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned swizzleFactor, std::string name)
15: MultiBlockKernel(std::move(name),
16// input stream sets
17{Binding{b->getStreamSetTy(), "marker", FixedRate(), Principal()},
18Binding{b->getStreamSetTy(swizzleFactor), "source", FixedRate(), Deferred()}},
19// output stream set
20{Binding{b->getStreamSetTy(swizzleFactor), "output"}},
21{}, {},
22// internal scalars
23{Binding{b->getBitBlockType(), "buffer"},
24Binding{b->getSizeTy(), "buffered"},
25Binding{b->getSizeTy(), "sourceOffset"}})
26, mSwizzleFactor(swizzleFactor) {
27
28}
29
30void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
31    BasicBlock * const entry = b->GetInsertBlock();
32    BasicBlock * const processBlock = b->CreateBasicBlock("processBlock");
33    BasicBlock * const finishedStrides = b->CreateBasicBlock("finishedStrides");
34    const auto pdepWidth = b->getBitBlockWidth() / mSwizzleFactor;
35    ConstantInt * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
36    ConstantInt * const PDEP_WIDTH = b->getSize(pdepWidth);
37
38    Function * pdep = nullptr;
39    if (pdepWidth == 64) {
40        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_64);
41    } else if (pdepWidth == 32) {
42        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_32);
43    } else {
44        report_fatal_error(getName() + ": PDEP width must be 32 or 64");
45    }
46
47    // We store an internal source offset here because this kernel processes items in an unusual way.
48    // The pipeline and multiblock assume that if we report we're on the i-th bit of a stream we have
49    // fully processed all of the bits up to the i-th position.
50
51    //                                             v
52    //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
53    //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
54    //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
55    //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
56
57    // However, this kernel divides the stream into K elements and fully consumes a single stream of
58    // the stream set before consuming the next one. So the same i-th position above is actually:
59
60    //                |XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|
61    //                |XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|
62    //                |XX          |XX          |XX          |XX          |
63    //                |            |            |            |            |
64
65    // In the future, we may want the pipeline and multiblock to understand this style of processing
66    // but for now, we hide it by delaying writing the actual processed offset until we've fully
67    // processed the entire block.
68
69    Value * const initialBuffer = b->getScalarField("buffer");
70    Value * const initialBufferSize = b->getScalarField("buffered");
71    Value * const initialSourceOffset = b->getScalarField("sourceOffset");
72    b->CreateBr(processBlock);
73
74    b->SetInsertPoint(processBlock);
75    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
76    strideIndex->addIncoming(b->getSize(0), entry);
77    PHINode * const bufferPhi = b->CreatePHI(initialBuffer->getType(), 2);
78    bufferPhi->addIncoming(initialBuffer, entry);
79    PHINode * const sourceOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
80    sourceOffsetPhi->addIncoming(initialSourceOffset, entry);
81    PHINode * const bufferSizePhi = b->CreatePHI(b->getSizeTy(), 2);
82    bufferSizePhi->addIncoming(initialBufferSize, entry);
83
84    // Extract the values we will use in the main processing loop
85    Value * const markerStream = b->getInputStreamBlockPtr("marker", b->getInt32(0), strideIndex);
86    Value * const selectors = b->fwCast(pdepWidth, b->CreateBlockAlignedLoad(markerStream));
87    Value * const numOfSelectors = b->simd_popcount(pdepWidth, selectors);
88
89    // If we run out of source items here, it is a failure of the MultiBlockKernel and/or PipelineGenerator
90    if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
91        Value * requiredSourceItems = b->CreateExtractElement(numOfSelectors, b->getInt64(0));
92        for (unsigned i = 1; i < mSwizzleFactor; i++) {
93            requiredSourceItems = b->CreateAdd(requiredSourceItems, b->CreateExtractElement(numOfSelectors, b->getInt64(i)));
94        }
95        Value * const availableSourceItems = b->getAvailableItemCount("source");
96        Value * const unreadSourceItems = b->CreateSub(availableSourceItems, sourceOffsetPhi);
97        Value * const hasSufficientSourceItems = b->CreateICmpULE(requiredSourceItems, unreadSourceItems);
98        b->CreateAssert(hasSufficientSourceItems, getName() + " has insufficient source items for the given marker stream");
99    }
100
101    // For each element of the marker block
102    Value * bufferSize = bufferSizePhi;
103    Value * sourceOffset = sourceOffsetPhi;
104    Value * buffer = bufferPhi;
105    for (unsigned i = 0; i < mSwizzleFactor; i++) {
106
107        // How many bits will we deposit?
108        Value * const required = b->CreateExtractElement(numOfSelectors, b->getInt32(i));
109
110        // Aggressively enqueue any additional bits
111        BasicBlock * const entry = b->GetInsertBlock();
112        BasicBlock * const enqueueBits = b->CreateBasicBlock();
113        b->CreateBr(enqueueBits);
114
115        b->SetInsertPoint(enqueueBits);
116        PHINode * const bufferSize2 = b->CreatePHI(bufferSize->getType(), 2);
117        bufferSize2->addIncoming(bufferSize, entry);
118        PHINode * const sourceOffset2 = b->CreatePHI(sourceOffset->getType(), 2);
119        sourceOffset2->addIncoming(sourceOffset, entry);
120        PHINode * const buffer2 = b->CreatePHI(buffer->getType(), 2);
121        buffer2->addIncoming(buffer, entry);
122
123        // Calculate the block and swizzle index of the "current" swizzle
124        Value * const block_index = b->CreateUDiv(sourceOffset2, BLOCK_WIDTH);
125        Value * const stream_index = b->CreateUDiv(b->CreateURem(sourceOffset2, BLOCK_WIDTH), PDEP_WIDTH);
126        Value * const ptr = b->getInputStreamBlockPtr("source", stream_index, block_index);
127        Value * const swizzle = b->CreateBlockAlignedLoad(ptr);
128        Value * const swizzleOffset = b->CreateURem(sourceOffset2, PDEP_WIDTH);
129
130        // Shift the swizzle to the right to clear off any used bits ...
131        Value * const swizzleShift = b->simd_fill(pdepWidth, swizzleOffset);
132        Value * const unreadBits = b->CreateLShr(swizzle, swizzleShift);
133
134        // ... then to the left to align the bits with the buffer and combine them.
135        Value * const bufferShift = b->simd_fill(pdepWidth, bufferSize2);
136        Value * const pendingBits = b->CreateShl(unreadBits, bufferShift);
137        buffer = b->CreateOr(buffer, pendingBits);
138        buffer2->addIncoming(buffer, enqueueBits);
139
140        // Update the buffer size by the number of bits we have actually enqueued
141        Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), bufferSize2);
142        bufferSize = b->CreateUMin(maxBufferSize, PDEP_WIDTH);
143        // ... and increment the source offset by the number we actually inserted
144        sourceOffset = b->CreateAdd(sourceOffset2, b->CreateSub(bufferSize, bufferSize2));
145        bufferSize2->addIncoming(bufferSize, enqueueBits);
146        sourceOffset2->addIncoming(sourceOffset, enqueueBits);
147        BasicBlock * const depositBits = b->CreateBasicBlock();
148        b->CreateCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
149
150        b->SetInsertPoint(depositBits);
151        // Apply PDEP to each element of the combined swizzle using the current PDEP mask
152        Value * result = UndefValue::get(buffer->getType());
153        Value * const mask = b->CreateExtractElement(selectors, i);
154        for (unsigned j = 0; j < mSwizzleFactor; j++) {
155            Value * source_field = b->CreateExtractElement(buffer, j);
156            Value * PDEP_field = b->CreateCall(pdep, {source_field, mask});
157            result = b->CreateInsertElement(result, PDEP_field, j);
158        }
159
160        // Store the result
161        Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output", b->getInt32(i), strideIndex);
162        b->CreateBlockAlignedStore(result, outputStreamPtr);
163
164        // Shift away any used bits from the buffer and decrement our buffer size by the number we used
165        Value * const usedShift = b->simd_fill(pdepWidth, required);
166        buffer = b->CreateLShr(buffer, usedShift);
167        bufferSize = b->CreateSub(bufferSize, required);
168    }
169
170    BasicBlock * const finishedBlock = b->GetInsertBlock();
171    sourceOffsetPhi->addIncoming(sourceOffset, finishedBlock);
172    bufferSizePhi->addIncoming(bufferSize, finishedBlock);
173    bufferPhi->addIncoming(buffer, finishedBlock);
174    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
175    strideIndex->addIncoming(nextStrideIndex, finishedBlock);
176    b->CreateLikelyCondBr(b->CreateICmpNE(nextStrideIndex, numOfBlocks), processBlock, finishedStrides);
177
178    b->SetInsertPoint(finishedStrides);
179    Value * const sourceItemsProcessed = b->CreateMul(b->CreateUDiv(sourceOffset, BLOCK_WIDTH), BLOCK_WIDTH);
180    b->setProcessedItemCount("source", b->CreateAdd(b->getProcessedItemCount("source"), sourceItemsProcessed));
181    b->setScalarField("buffer", buffer);
182    b->setScalarField("buffered", bufferSize);
183    b->setScalarField("sourceOffset", b->CreateURem(sourceOffset, BLOCK_WIDTH));
184}
185
186}
Note: See TracBrowser for help on using the repository browser.