source: icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_multiple_pdep_kernel.cpp @ 5985

Last change on this file since 5985 was 5985, checked in by nmedfort, 12 months ago

Restructured MultiBlock kernel. Removal of Swizzled buffers. Inclusion of PopCount rates / non-linear access. Modifications to several kernels to better align them with the kernel and pipeline changes.

File size: 7.9 KB
Line 
1//
2// Created by wxy325 on 2018/2/9.
3//
4
5#include "lz4_multiple_pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7
8using namespace llvm;
9
10namespace kernel {
11
12LZ4MultiplePDEPkernel::LZ4MultiplePDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned streamSize, unsigned swizzleFactor, unsigned PDEP_width, std::string name)
13: MultiBlockKernel(name + "",
14                   {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", FixedRate(), Principal()}},
15                   {},
16                   {}, {}, {})
17, mSwizzleFactor(swizzleFactor)
18, mPDEPWidth(PDEP_width)
19, mStreamSize(streamSize)
20{
21    assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
22    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
23
24    mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet0", PopcountOf("PDEPmarkerStream"), Swizzled()});
25    mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet0"});
26
27    for (unsigned i = 1; i < streamSize; i++) {
28        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet" + std::to_string(i), RateEqualTo("sourceStreamSet0"), Swizzled()});
29        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet" + std::to_string(i)});
30    }
31}
32
33void LZ4MultiplePDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const blocksToDo) {
34    BasicBlock * entry = kb->GetInsertBlock();
35
36    BasicBlock * loopBody = kb->CreateBasicBlock("loopBody");
37    BasicBlock * terminate = kb->CreateBasicBlock("terminate");
38    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
39    Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet0");
40    Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
41
42    Value * pdepWidth = kb->getSize(mPDEPWidth);
43    Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
44    Value * PDEP_func = nullptr;
45    if (mPDEPWidth == 64) {
46        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
47    } else if (mPDEPWidth == 32) {
48        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
49    }
50    kb->CreateBr(loopBody);
51
52    kb->SetInsertPoint(loopBody);
53    // The following PHINodes' values can come from entry or processBlock
54    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
55    PHINode * updatedProcessedSourceBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
56    blockOffsetPhi->addIncoming(kb->getSize(0), entry);
57    updatedProcessedSourceBitsPhi->addIncoming(processedSourceBits, entry);
58    Value * updatedProcessedSourceBits = updatedProcessedSourceBitsPhi;
59    Value * PDEP_ms_blk = kb->loadInputStreamBlock("PDEPmarkerStream", kb->getInt32(0), blockOffsetPhi);
60
61    const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);
62    const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
63
64    // For each mask extracted from the PDEP marker block
65    for (unsigned i = 0; i < mSwizzleFactor; i++) {
66        // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
67
68        Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedSourceBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
69        Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedSourceBits, blockWidth), pdepWidth);
70        Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedSourceBits);
71
72
73        Value * shift_amount = kb->CreateURem(updatedProcessedSourceBits, pdepWidth);
74
75        for (unsigned iStreamIndex = 0; iStreamIndex < mStreamSize; iStreamIndex++) {
76            Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
77            Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
78
79            // Load current and next BitBlocks/swizzles
80            Value* current_swizzle_ptr = kb->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(iStreamIndex), current_swizzle_idx, current_blk_idx);
81
82            Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
83
84
85            Value* next_swizzle_ptr = kb->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(iStreamIndex), next_swizzle_idx, next_blk_idx);
86            Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
87
88            // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
89            Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
90
91            Value * borrowed_bits = kb->CreateShl(next_swizzle,
92                                                  kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
93            Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
94
95            Value * segments = kb->fwCast(mPDEPWidth, combined);
96
97            Value * result_swizzle = Constant::getNullValue(segments->getType());
98            // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
99
100            Value * PDEP_mask = PDEP_masks[i];
101            for (unsigned j = 0; j < mSwizzleFactor; j++) {
102                Value * source_field = kb->CreateExtractElement(segments, j);
103                Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask});
104                result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j);
105
106            }
107
108            // Store the result
109            Value* outputPos = kb->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(iStreamIndex), kb->getSize(i), blockOffsetPhi);
110
111            kb->CreateBlockAlignedStore(result_swizzle, outputPos);
112        }
113
114        updatedProcessedSourceBits = kb->CreateAdd(updatedProcessedSourceBits, mask_popcounts[i]);
115    }
116
117    updatedProcessedSourceBitsPhi->addIncoming(updatedProcessedSourceBits, loopBody);
118    blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), loopBody);
119    Value * haveRemBlocks = kb->CreateICmpNE(blockOffsetPhi, blocksToDo);
120    kb->CreateCondBr(haveRemBlocks, loopBody, terminate);
121
122    kb->SetInsertPoint(terminate);
123    kb->setProcessedItemCount("sourceStreamSet0", updatedProcessedSourceBitsPhi);
124
125}
126
127std::vector<Value *> LZ4MultiplePDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
128    Value * pop_counts = kb->simd_popcount(field_width, blk);
129    std::vector<Value *> counts;
130    for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
131        // Store the pop counts for each blk_width field in blk
132        counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
133    }
134    return counts;
135}
136
137std::vector<Value *> LZ4MultiplePDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
138    // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
139    // Split the PDEP marker stream block into mPDEPWidth segments.
140    Value * masks = kb->fwCast(mask_width, PDEP_ms_blk);
141    std::vector<Value *> PDEP_masks;
142    for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
143        PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
144    }
145    return PDEP_masks;
146}
147
148}
Note: See TracBrowser for help on using the repository browser.