source: icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp @ 6046

Last change on this file since 6046 was 6046, checked in by cameron, 15 months ago

Some fixes

File size: 19.9 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <toolchain/toolchain.h>
9#include <toolchain/driver.h>
10#include <toolchain/cpudriver.h>
11#include <IR_Gen/idisa_target.h>
12#include <llvm/IR/Module.h>
13
14
15using namespace llvm;
16
17namespace kernel {
18
19PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned swizzleFactor, std::string name)
20: MultiBlockKernel(std::move(name),
21// input stream sets
22{Binding{b->getStreamSetTy(), "marker", FixedRate(), Principal()},
23Binding{b->getStreamSetTy(swizzleFactor), "source", PopcountOf("marker"), BlockSize(b->getBitBlockWidth() / swizzleFactor) }},
24// output stream set
25{Binding{b->getStreamSetTy(swizzleFactor), "output", FixedRate(), BlockSize(b->getBitBlockWidth() / swizzleFactor)}},
26{}, {}, {})
27, mSwizzleFactor(swizzleFactor) {
28
29}
30
31void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
32    BasicBlock * const entry = b->GetInsertBlock();
33    BasicBlock * const processBlock = b->CreateBasicBlock("processBlock");
34    BasicBlock * const finishedStrides = b->CreateBasicBlock("finishedStrides");
35    const auto pdepWidth = b->getBitBlockWidth() / mSwizzleFactor;
36    ConstantInt * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
37    ConstantInt * const PDEP_WIDTH = b->getSize(pdepWidth);
38
39    Function * pdep = nullptr;
40    if (pdepWidth == 64) {
41        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_64);
42    } else if (pdepWidth == 32) {
43        pdep = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_32);
44    } else {
45        report_fatal_error(getName() + ": PDEP width must be 32 or 64");
46    }
47
48    Constant * const ZERO = b->getSize(0);
49    Value * const sourceItemCount = b->getProcessedItemCount("source");
50
51    Value * const initialSourceOffset = b->CreateURem(sourceItemCount, BLOCK_WIDTH);
52    b->CreateBr(processBlock);
53
54    b->SetInsertPoint(processBlock);
55    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
56    strideIndex->addIncoming(ZERO, entry);
57    PHINode * const bufferPhi = b->CreatePHI(b->getBitBlockType(), 2);
58    bufferPhi->addIncoming(Constant::getNullValue(b->getBitBlockType()), entry);
59    PHINode * const sourceOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
60    sourceOffsetPhi->addIncoming(initialSourceOffset, entry);
61    PHINode * const bufferSizePhi = b->CreatePHI(b->getSizeTy(), 2);
62    bufferSizePhi->addIncoming(ZERO, entry);
63
64    // Extract the values we will use in the main processing loop
65    Value * const markerStream = b->getInputStreamBlockPtr("marker", ZERO, strideIndex);
66    Value * const markerValue = b->CreateBlockAlignedLoad(markerStream);
67    Value * const selectors = b->fwCast(pdepWidth, markerValue);
68    Value * const numOfSelectors = b->simd_popcount(pdepWidth, selectors);
69
70    // For each element of the marker block
71    Value * bufferSize = bufferSizePhi;
72    Value * sourceOffset = sourceOffsetPhi;
73    Value * buffer = bufferPhi;
74    for (unsigned i = 0; i < mSwizzleFactor; i++) {
75
76        // How many bits will we deposit?
77        Value * const required = b->CreateExtractElement(numOfSelectors, b->getSize(i));
78
79        // Aggressively enqueue any additional bits
80        BasicBlock * const entry = b->GetInsertBlock();
81        BasicBlock * const enqueueBits = b->CreateBasicBlock();
82        b->CreateBr(enqueueBits);
83
84        b->SetInsertPoint(enqueueBits);
85        PHINode * const updatedBufferSize = b->CreatePHI(bufferSize->getType(), 2);
86        updatedBufferSize->addIncoming(bufferSize, entry);
87        PHINode * const updatedSourceOffset = b->CreatePHI(sourceOffset->getType(), 2);
88        updatedSourceOffset->addIncoming(sourceOffset, entry);
89        PHINode * const updatedBuffer = b->CreatePHI(buffer->getType(), 2);
90        updatedBuffer->addIncoming(buffer, entry);
91
92        // Calculate the block and swizzle index of the current swizzle row
93        Value * const blockOffset = b->CreateUDiv(updatedSourceOffset, BLOCK_WIDTH);
94        Value * const swizzleIndex = b->CreateUDiv(b->CreateURem(updatedSourceOffset, BLOCK_WIDTH), PDEP_WIDTH);
95        Value * const swizzle = b->CreateBlockAlignedLoad(b->getInputStreamBlockPtr("source", swizzleIndex, blockOffset));
96        Value * const swizzleOffset = b->CreateURem(updatedSourceOffset, PDEP_WIDTH);
97
98        // Shift the swizzle to the right to clear off any used bits ...
99        Value * const swizzleShift = b->simd_fill(pdepWidth, swizzleOffset);
100        Value * const unreadBits = b->CreateLShr(swizzle, swizzleShift);
101
102        // ... then to the left to align the bits with the buffer and combine them.
103        Value * const bufferShift = b->simd_fill(pdepWidth, updatedBufferSize);
104        Value * const pendingBits = b->CreateShl(unreadBits, bufferShift);
105
106        buffer = b->CreateOr(updatedBuffer, pendingBits);
107        updatedBuffer->addIncoming(buffer, enqueueBits);
108
109        // Update the buffer size with the number of bits we have actually enqueued
110        Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), updatedBufferSize);
111        bufferSize = b->CreateUMin(maxBufferSize, PDEP_WIDTH);
112        updatedBufferSize->addIncoming(bufferSize, enqueueBits);
113
114        // ... and increment the source offset by the number we actually inserted
115        Value * const inserted = b->CreateSub(bufferSize, updatedBufferSize);
116        sourceOffset = b->CreateAdd(updatedSourceOffset, inserted);
117        updatedSourceOffset->addIncoming(sourceOffset, enqueueBits);
118
119        // INVESTIGATE: we can branch at most once here. I'm not sure whether the potential
120        // branch misprediction is better or worse than always filling from two swizzles to
121        // ensure that we have enough bits to deposit.
122        BasicBlock * const depositBits = b->CreateBasicBlock();
123        b->CreateUnlikelyCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
124
125        b->SetInsertPoint(depositBits);
126
127        // Apply PDEP to each element of the combined swizzle using the current PDEP mask
128        Value * const mask = b->CreateExtractElement(selectors, i);
129        Value* result = b->simd_pdep(pdepWidth, buffer, b->simd_fill(pdepWidth, mask));
130
131        // Store the result
132        Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output", b->getSize(i), strideIndex);
133        b->CreateBlockAlignedStore(result, outputStreamPtr);
134
135        // Shift away any used bits from the buffer and decrement our buffer size by the number we used
136        Value * const usedShift = b->simd_fill(pdepWidth, required);
137        buffer = b->CreateLShr(buffer, usedShift);
138        bufferSize = b->CreateSub(bufferSize, required);
139    }
140
141    BasicBlock * const finishedBlock = b->GetInsertBlock();
142    sourceOffsetPhi->addIncoming(sourceOffset, finishedBlock);
143    bufferSizePhi->addIncoming(bufferSize, finishedBlock);
144    bufferPhi->addIncoming(buffer, finishedBlock);
145    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
146    strideIndex->addIncoming(nextStrideIndex, finishedBlock);
147    b->CreateLikelyCondBr(b->CreateICmpNE(nextStrideIndex, numOfBlocks), processBlock, finishedStrides);
148
149    b->SetInsertPoint(finishedStrides);
150}
151   
152StreamExpandKernel::StreamExpandKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
153: MultiBlockKernel("streamExpand" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
154                   {Binding{kb->getStreamSetTy(), "marker", FixedRate(), Principal()},
155                       Binding{kb->getStreamSetTy(streamCount), "source", PopcountOf("marker")}},
156                   {Binding{kb->getStreamSetTy(streamCount), "output", FixedRate()}},
157                   {}, {}, {})
158, mFieldWidth(fieldWidth)
159, mStreamCount(streamCount) {
160    for (unsigned i = 0; i < streamCount; i++) {
161        addScalar(kb->getBitBlockType(), "pendingSourceBlock_" + std::to_string(i));
162    }
163}
164
165void StreamExpandKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
166    const unsigned fw = mFieldWidth;
167    Type * fwTy = b->getIntNTy(fw);
168    Type * sizeTy = b->getSizeTy();
169    const unsigned numFields = b->getBitBlockWidth()/fw;
170   
171    Constant * const ZERO = b->getSize(0);
172    Constant * bwConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
173    Constant * bw_sub1Const = ConstantInt::get(sizeTy, b->getBitBlockWidth() -1);
174    Constant * fwConst = ConstantInt::get(sizeTy, fw);
175    Constant * fw_sub1Const = ConstantInt::get(sizeTy, fw-1);
176    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
177    Constant * fw_sub1Splat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
178   
179    BasicBlock * entry = b->GetInsertBlock();
180    BasicBlock * expandLoop = b->CreateBasicBlock("expandLoop");
181    BasicBlock * expansionDone = b->CreateBasicBlock("expansionDone");
182   
183    Value * processedSourceItems = b->getProcessedItemCount("source");
184    Value * sourceOffset = b->CreateURem(processedSourceItems, bwConst);
185   
186    std::vector<Value *> pendingData(mStreamCount);
187    for (unsigned i = 0; i < mStreamCount; i++) {
188        pendingData[i] = b->getScalarField("pendingSourceBlock_" + std::to_string(i));
189    }
190   
191    b->CreateBr(expandLoop);
192    // Main Loop
193    b->SetInsertPoint(expandLoop);
194    PHINode * blockNoPhi = b->CreatePHI(b->getSizeTy(), 2);
195    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
196    PHINode * pendingDataPhi[mStreamCount];
197    blockNoPhi->addIncoming(ZERO, entry);
198    pendingItemsPhi->addIncoming(sourceOffset, entry);
199    for (unsigned i = 0; i < mStreamCount; i++) {
200        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
201        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
202    }
203    Value * deposit_mask = b->loadInputStreamBlock("marker", ZERO, blockNoPhi);
204    // The source stream may not be positioned at a block boundary.  Partial data
205    // has been saved in the kernel state, determine the next full block number
206    // for loading source streams.
207    Value * pendingBlockEnd = b->CreateAdd(pendingItemsPhi, bw_sub1Const);
208    Value * srcBlockNo = b->CreateUDiv(pendingBlockEnd, bwConst);
209   
210    // Calculate the field values and offsets we need for assembling a
211    // a full block of source bits.  Assembly will use the following operations.
212    // A = b->simd_srl(fw, b->mvmd_dsll(fw, source, pending, field_offset_lo), bit_offset);
213    // B = b->simd_sll(fw, b->mvmd_dsll(fw, source, pending, field_offset_hi), shift_fwd);
214    // all_source_bits = simd_or(A, B);
215    Value * pendingOffset = b->CreateURem(pendingBlockEnd, bwConst);
216    Value * field_offset_lo =  b->CreateUDiv(pendingOffset, fwConst);
217    Value * bit_offset = b->simd_fill(fw, b->CreateURem(pendingOffset, fwConst));
218   
219    // Carefully avoid a shift by the full fieldwith (which gives a poison value).
220    // field_offset_lo + 1 unless the bit_offset is 0, in which case it is just field_offset_lo.
221    Value * field_offset_hi =  b->CreateUDiv(b->CreateAdd(pendingOffset, fw_sub1Const), fwConst);
222    // fw - bit_offset, unless bit_offset is 0, in which case, the shift_fwd is 0.
223    Value * shift_fwd = b->CreateURem(b->CreateSub(fwSplat, bit_offset), fwSplat);
224   
225    // Once all source bits are assembled, they need to be distributed to the
226    // output fields in accord with the popcounts of the deposit mask fields.
227    // The bits for each output field will typically come from (at most) two
228    // source fields, with offsets.  Calculate the field numbers and offsets.
229   
230    Value * fieldPopCounts = b->simd_popcount(fw, deposit_mask);
231    // For each field determine the (partial) sum popcount of all fields prior to
232    // the current field.
233    Value * partialSum = fieldPopCounts;
234    for (unsigned i = 1; i < numFields; i *= 2) {
235        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
236    }
237    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
238    partialSum = b->mvmd_slli(fw, partialSum, 1);
239   
240    Value * source_field_lo = b->CreateUDiv(partialSum, fwSplat);
241    Value * source_field_hi = b->CreateUDiv(b->CreateAdd(partialSum, fw_sub1Splat), fwSplat);
242    Value * source_shift_lo = b->CreateAnd(partialSum, fw_sub1Splat);  // parallel URem
243    Value * source_shift_hi = b->CreateAnd(b->CreateSub(fwSplat, source_shift_lo), fw_sub1Splat);
244   
245    // Now load and process source streams.
246    for (unsigned i = 0; i < mStreamCount; i++) {
247        Value * source = b->loadInputStreamBlock("source", b->getInt32(i), srcBlockNo);
248        Value * A = b->simd_srlv(fw, b->mvmd_dsll(fw, source, pendingDataPhi[i], field_offset_lo), bit_offset);
249        Value * B = b->simd_sllv(fw, b->mvmd_dsll(fw, source, pendingDataPhi[i], field_offset_hi), shift_fwd);
250        Value * full_source_block = b->simd_or(A, B);
251       
252        Value * C = b->simd_srlv(fw, b->mvmd_shuffle(fw, full_source_block, source_field_lo), source_shift_lo);
253        Value * D = b->simd_sllv(fw, b->mvmd_shuffle(fw, full_source_block, source_field_hi), source_shift_hi);
254        Value * output = b->bitCast(b->simd_or(C, D));
255        b->storeOutputStreamBlock("output", b->getInt32(i), blockNoPhi, output);
256        pendingDataPhi[i]->addIncoming(source, expandLoop);
257    }
258    //
259    // Update loop control Phis for the next iteration.
260    //
261    Value * nextBlk = b->CreateAdd(blockNoPhi, b->getSize(1));
262    blockNoPhi->addIncoming(nextBlk, expandLoop);
263    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
264    pendingItemsPhi->addIncoming(newPending, expandLoop);
265    //
266    // Now continue the loop if there are more blocks to process.
267    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
268    b->CreateCondBr(moreToDo, expandLoop, expansionDone);
269   
270    b->SetInsertPoint(expansionDone);
271    // Update kernel state.
272    for (unsigned i = 0; i < mStreamCount; i++) {
273        b->setScalarField("pendingSourceBlock_" + std::to_string(i), b->bitCast(pendingDataPhi[i]));
274    }
275}
276
277FieldDepositKernel::FieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
278: MultiBlockKernel("FieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
279                   {Binding{kb->getStreamSetTy(), "depositMask"}, Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"}},
280                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
281                   {}, {}, {})
282, mFieldWidth(fieldWidth)
283, mStreamCount(streamCount) {
284}
285   
286void FieldDepositKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
287    BasicBlock * entry = kb->GetInsertBlock();
288    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
289    BasicBlock * done = kb->CreateBasicBlock("done");
290    Constant * const ZERO = kb->getSize(0);
291    kb->CreateBr(processBlock);
292    kb->SetInsertPoint(processBlock);
293    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
294    blockOffsetPhi->addIncoming(ZERO, entry);
295    Value * depositMask = kb->loadInputStreamBlock("depositMask", ZERO, blockOffsetPhi);
296    for (unsigned j = 0; j < mStreamCount; ++j) {
297        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
298        Value * output = kb->simd_pdep(mFieldWidth, input, depositMask);
299        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
300    }
301    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
302    blockOffsetPhi->addIncoming(nextBlk, processBlock);
303    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
304    kb->CreateCondBr(moreToDo, processBlock, done);
305    kb->SetInsertPoint(done);
306}
307
308PDEPFieldDepositKernel::PDEPFieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
309: MultiBlockKernel("PDEPFieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
310                   {Binding{kb->getStreamSetTy(), "depositMask"}, Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"}},
311                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
312                   {}, {}, {})
313, mPDEPWidth(fieldWidth)
314, mStreamCount(streamCount) {
315    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PDEP width for PDEPFieldCompressKernel");
316}
317
318void PDEPFieldDepositKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
319    Type * fieldTy = kb->getIntNTy(mPDEPWidth);
320    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
321    Constant * PDEP_func = nullptr;
322    if (mPDEPWidth == 64) {
323        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
324    } else if (mPDEPWidth == 32) {
325        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
326    }
327    BasicBlock * entry = kb->GetInsertBlock();
328    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
329    BasicBlock * done = kb->CreateBasicBlock("done");
330    Constant * const ZERO = kb->getSize(0);
331    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPDEPWidth;
332    kb->CreateBr(processBlock);
333    kb->SetInsertPoint(processBlock);
334    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
335    blockOffsetPhi->addIncoming(ZERO, entry);
336    std::vector<Value *> mask(fieldsPerBlock);
337    Value * extractionMaskPtr = kb->getInputStreamBlockPtr("depositMask", ZERO, blockOffsetPhi);
338    extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
339    for (unsigned i = 0; i < fieldsPerBlock; i++) {
340        mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
341    }
342    for (unsigned j = 0; j < mStreamCount; ++j) {
343        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
344        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
345        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
346        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
347        for (unsigned i = 0; i < fieldsPerBlock; i++) {
348            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
349            Value * compressed = kb->CreateCall(PDEP_func, {field, mask[i]});
350            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
351        }
352    }
353    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
354    blockOffsetPhi->addIncoming(nextBlk, processBlock);
355    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
356    kb->CreateCondBr(moreToDo, processBlock, done);
357    kb->SetInsertPoint(done);
358}
359
360void StreamDepositCompiler::makeCall(parabix::StreamSetBuffer * depositMask, parabix::StreamSetBuffer * inputs, parabix::StreamSetBuffer * outputs) {
361    if (mBufferBlocks == 0) {
362        llvm::report_fatal_error("StreamDepositCompiler needs a non-zero bufferBlocks parameter (for now).");
363    }
364    auto & iBuilder = mDriver.getBuilder();
365    unsigned N = IDISA::getNumOfStreams(ssType);
366    if (IDISA::getStreamFieldWidth(ssType) != 1) {
367        llvm::report_fatal_error("StreamDepositCompiler only compresses bit streams (for now)");
368    }
369    parabix::StreamSetBuffer * expandedStreams = mDriver.addBuffer<parabix::CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(N), mBufferBlocks);
370    Kernel * streamK = mDriver.addKernelInstance<StreamExpandKernel>(iBuilder, mFieldWidth, N);
371    mDriver.makeKernelCall(streamK, {depositMask, inputs}, {expandedStreams});
372
373    Kernel * depositK = nullptr;
374    if (AVX2_available()) {
375        depositK = mDriver.addKernelInstance<PDEPFieldDepositKernel>(iBuilder, mFieldWidth, N);
376    } else {
377        depositK = mDriver.addKernelInstance<FieldDepositKernel>(iBuilder, mFieldWidth, N);
378    }
379    mDriver.makeKernelCall(depositK, {depositMask, expandedStreams}, {outputs});
380}
381
382}
Note: See TracBrowser for help on using the repository browser.