source: icGREP/icgrep-devel/icgrep/kernels/swizzled_multiple_pdep_kernel.cpp @ 6034

Last change on this file since 6034 was 6026, checked in by xwa163, 16 months ago
  1. Implement SwizzledMultiplePDEPkernel with the same logic as the new PDEPkernel, remove LZ4MultiplePDEPkernel, and improve performance
  2. Remove some unnecessary includes
  3. Add a prefix to some kernels
  4. Remove a legacy kernel
File size: 9.2 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include "swizzled_multiple_pdep_kernel.h"
7#include <kernels/kernel_builder.h>
8#include <toolchain/toolchain.h>
9
10using namespace llvm;
11
12namespace kernel {
13
14SwizzledMultiplePDEPkernel::SwizzledMultiplePDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned swizzleFactor, const unsigned numberOfStreamSet, std::string name)
15: MultiBlockKernel(std::move(name),
16// input stream sets
17{Binding{b->getStreamSetTy(), "marker", FixedRate(), Principal()},
18Binding{b->getStreamSetTy(swizzleFactor), "source0", PopcountOf("marker"), BlockSize(b->getBitBlockWidth() / swizzleFactor) }},
19// output stream set
20{Binding{b->getStreamSetTy(swizzleFactor), "output0", FixedRate(), BlockSize(b->getBitBlockWidth() / swizzleFactor)}},
21{}, {}, {})
22, mSwizzleFactor(swizzleFactor), mNumberOfStreamSet(numberOfStreamSet) {
23    for (int i = 1; i < numberOfStreamSet; i++) {
24        mStreamSetInputs.push_back(Binding{b->getStreamSetTy(swizzleFactor), "source" + std::to_string(i), RateEqualTo("source0"), BlockSize(b->getBitBlockWidth() / swizzleFactor) });
25        mStreamSetOutputs.push_back(Binding{b->getStreamSetTy(swizzleFactor), "output" + std::to_string(i), RateEqualTo("output0"), BlockSize(b->getBitBlockWidth() / swizzleFactor)});
26    }
27}
28
29void SwizzledMultiplePDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
30    BasicBlock * const entry = b->GetInsertBlock();
31    BasicBlock * const processBlock = b->CreateBasicBlock("processBlock");
32    BasicBlock * const finishedStrides = b->CreateBasicBlock("finishedStrides");
33    const auto pdepWidth = b->getBitBlockWidth() / mSwizzleFactor;
34    ConstantInt * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
35    ConstantInt * const PDEP_WIDTH = b->getSize(pdepWidth);
36
37    Function * pdep = nullptr;
38    if (pdepWidth == 64) {
39        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_64);
40    } else if (pdepWidth == 32) {
41        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_32);
42    } else {
43        report_fatal_error(getName() + ": PDEP width must be 32 or 64");
44    }
45
46    Constant * const ZERO = b->getSize(0);
47    Value * const sourceItemCount = b->getProcessedItemCount("source0");
48
49    Value * const initialSourceOffset = b->CreateURem(sourceItemCount, BLOCK_WIDTH);
50    b->CreateBr(processBlock);
51
52    b->SetInsertPoint(processBlock);
53    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
54    strideIndex->addIncoming(ZERO, entry);
55
56    std::vector<PHINode*> bufferPhiArray(mNumberOfStreamSet, NULL);
57    std::vector<Value*> bufferArray(mNumberOfStreamSet, NULL);
58    for (int iStreamSetIndex = 0; iStreamSetIndex < mNumberOfStreamSet; iStreamSetIndex++) {
59        PHINode * const bufferPhi = b->CreatePHI(b->getBitBlockType(), 2);
60        bufferPhi->addIncoming(Constant::getNullValue(b->getBitBlockType()), entry);
61        bufferPhiArray[iStreamSetIndex] = bufferPhi;
62        bufferArray[iStreamSetIndex] = bufferPhi;
63    }
64
65    PHINode * const sourceOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
66    sourceOffsetPhi->addIncoming(initialSourceOffset, entry);
67    PHINode * const bufferSizePhi = b->CreatePHI(b->getSizeTy(), 2);
68    bufferSizePhi->addIncoming(ZERO, entry);
69
70    // Extract the values we will use in the main processing loop
71    Value * const markerStream = b->getInputStreamBlockPtr("marker", ZERO, strideIndex);
72    Value * const markerValue = b->CreateBlockAlignedLoad(markerStream);
73    Value * const selectors = b->fwCast(pdepWidth, markerValue);
74    Value * const numOfSelectors = b->simd_popcount(pdepWidth, selectors);
75
76    // For each element of the marker block
77    Value * bufferSize = bufferSizePhi;
78    Value * sourceOffset = sourceOffsetPhi;
79    for (unsigned i = 0; i < mSwizzleFactor; i++) {
80
81        // How many bits will we deposit?
82        Value * const required = b->CreateExtractElement(numOfSelectors, b->getSize(i));
83
84        // Aggressively enqueue any additional bits
85        BasicBlock * const entry = b->GetInsertBlock();
86        BasicBlock * const enqueueBits = b->CreateBasicBlock();
87        b->CreateBr(enqueueBits);
88
89        b->SetInsertPoint(enqueueBits);
90        PHINode * const updatedBufferSize = b->CreatePHI(bufferSize->getType(), 2);
91        updatedBufferSize->addIncoming(bufferSize, entry);
92        PHINode * const updatedSourceOffset = b->CreatePHI(sourceOffset->getType(), 2);
93        updatedSourceOffset->addIncoming(sourceOffset, entry);
94
95        std::vector<PHINode * > updatedBufferArray(mNumberOfStreamSet, NULL);
96        for (int iStreamSetIndex = 0; iStreamSetIndex < mNumberOfStreamSet; iStreamSetIndex++) {
97            Value* buffer = bufferArray[iStreamSetIndex];
98            PHINode * const updatedBuffer = b->CreatePHI(buffer->getType(), 2);
99            updatedBuffer->addIncoming(buffer, entry);
100            updatedBufferArray[iStreamSetIndex] = updatedBuffer;
101        }
102
103        // Calculate the block and swizzle index of the current swizzle row
104        Value * const blockOffset = b->CreateUDiv(updatedSourceOffset, BLOCK_WIDTH);
105        Value * const swizzleIndex = b->CreateUDiv(b->CreateURem(updatedSourceOffset, BLOCK_WIDTH), PDEP_WIDTH);
106
107        Value * const swizzleOffset = b->CreateURem(updatedSourceOffset, PDEP_WIDTH);
108
109        for (int iStreamSetIndex = 0; iStreamSetIndex < mNumberOfStreamSet; iStreamSetIndex++) {
110            Value * const swizzle = b->CreateBlockAlignedLoad(b->getInputStreamBlockPtr("source" + std::to_string(iStreamSetIndex), swizzleIndex, blockOffset));
111
112            // Shift the swizzle to the right to clear off any used bits ...
113            Value * const swizzleShift = b->simd_fill(pdepWidth, swizzleOffset);
114            Value * const unreadBits = b->CreateLShr(swizzle, swizzleShift);
115
116            // ... then to the left to align the bits with the buffer and combine them.
117            Value * const bufferShift = b->simd_fill(pdepWidth, updatedBufferSize);
118            Value * const pendingBits = b->CreateShl(unreadBits, bufferShift);
119
120            bufferArray[iStreamSetIndex] = b->CreateOr(updatedBufferArray[iStreamSetIndex], pendingBits);
121            updatedBufferArray[iStreamSetIndex]->addIncoming(bufferArray[iStreamSetIndex], enqueueBits);
122        }
123
124        // Update the buffer size with the number of bits we have actually enqueued
125        Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), updatedBufferSize);
126        bufferSize = b->CreateUMin(maxBufferSize, PDEP_WIDTH);
127        updatedBufferSize->addIncoming(bufferSize, enqueueBits);
128
129        // ... and increment the source offset by the number we actually inserted
130        Value * const inserted = b->CreateSub(bufferSize, updatedBufferSize);
131        sourceOffset = b->CreateAdd(updatedSourceOffset, inserted);
132        updatedSourceOffset->addIncoming(sourceOffset, enqueueBits);
133
134        // INVESTIGATE: we can branch at most once here. I'm not sure whether the potential
135        // branch misprediction is better or worse than always filling from two swizzles to
136        // ensure that we have enough bits to deposit.
137        BasicBlock * const depositBits = b->CreateBasicBlock();
138        b->CreateUnlikelyCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
139
140        b->SetInsertPoint(depositBits);
141
142        // Apply PDEP to each element of the combined swizzle using the current PDEP mask
143        Value * const mask = b->CreateExtractElement(selectors, i);
144        Value * const usedShift = b->simd_fill(pdepWidth, required);
145        for (int iStreamSetIndex = 0; iStreamSetIndex < mNumberOfStreamSet; iStreamSetIndex++) {
146            Value * result = UndefValue::get(bufferArray[iStreamSetIndex]->getType());
147            for (unsigned j = 0; j < mSwizzleFactor; j++) {
148                Value * source_field = b->CreateExtractElement(bufferArray[iStreamSetIndex], j);
149                Value * PDEP_field = b->CreateCall(pdep, {source_field, mask});
150                result = b->CreateInsertElement(result, PDEP_field, j);
151            }
152            // Store the result
153            Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output" + std::to_string(iStreamSetIndex), b->getSize(i), strideIndex);
154            b->CreateBlockAlignedStore(result, outputStreamPtr);
155
156            // Shift away any used bits from the buffer and decrement our buffer size by the number we used
157            bufferArray[iStreamSetIndex] = b->CreateLShr(bufferArray[iStreamSetIndex], usedShift);
158        }
159
160        bufferSize = b->CreateSub(bufferSize, required);
161    }
162
163    BasicBlock * const finishedBlock = b->GetInsertBlock();
164    sourceOffsetPhi->addIncoming(sourceOffset, finishedBlock);
165    bufferSizePhi->addIncoming(bufferSize, finishedBlock);
166    for (int iStreamSetIndex = 0; iStreamSetIndex < mNumberOfStreamSet; iStreamSetIndex++) {
167        bufferPhiArray[iStreamSetIndex]->addIncoming(bufferArray[iStreamSetIndex], finishedBlock);
168    }
169
170    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
171    strideIndex->addIncoming(nextStrideIndex, finishedBlock);
172    b->CreateLikelyCondBr(b->CreateICmpNE(nextStrideIndex, numOfBlocks), processBlock, finishedStrides);
173
174    b->SetInsertPoint(finishedStrides);
175}
176
177}
Note: See TracBrowser for help on using the repository browser.