source: icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_multiple_pdep_kernel.cpp @ 6020

Last change on this file since 6020 was 6020, checked in by xwa163, 3 months ago
  1. New version of lz4_swizzled_match_copy kernel with higher performance
  2. Adjust related pipeline code
  3. Remove legacy comments
File size: 7.8 KB
Line 
1
2
3#include "lz4_multiple_pdep_kernel.h"
4#include <kernels/kernel_builder.h>
5
6using namespace llvm;
7
8namespace kernel {
9
10LZ4MultiplePDEPkernel::LZ4MultiplePDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned streamSize, unsigned swizzleFactor, unsigned PDEP_width, std::string name)
11: MultiBlockKernel(name + "",
12                   {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", FixedRate(), Principal()}},
13                   {},
14                   {}, {}, {})
15, mSwizzleFactor(swizzleFactor)
16, mPDEPWidth(PDEP_width)
17, mStreamSize(streamSize)
18{
19    assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
20    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
21
22    mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet0", PopcountOf("PDEPmarkerStream"), Swizzled()});
23    mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet0"});
24
25    for (unsigned i = 1; i < streamSize; i++) {
26        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet" + std::to_string(i), RateEqualTo("sourceStreamSet0"), Swizzled()});
27        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet" + std::to_string(i)});
28    }
29}
30
31void LZ4MultiplePDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const blocksToDo) {
32    BasicBlock * entry = kb->GetInsertBlock();
33
34    BasicBlock * loopBody = kb->CreateBasicBlock("loopBody");
35    BasicBlock * terminate = kb->CreateBasicBlock("terminate");
36    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
37    Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet0");
38    Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
39
40    Value * pdepWidth = kb->getSize(mPDEPWidth);
41    Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
42    Value * PDEP_func = nullptr;
43    if (mPDEPWidth == 64) {
44        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
45    } else if (mPDEPWidth == 32) {
46        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
47    }
48    kb->CreateBr(loopBody);
49
50    kb->SetInsertPoint(loopBody);
51    // The following PHINodes' values can come from entry or processBlock
52    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
53    PHINode * updatedProcessedSourceBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
54    blockOffsetPhi->addIncoming(kb->getSize(0), entry);
55    updatedProcessedSourceBitsPhi->addIncoming(processedSourceBits, entry);
56    Value * updatedProcessedSourceBits = updatedProcessedSourceBitsPhi;
57    Value * PDEP_ms_blk = kb->loadInputStreamBlock("PDEPmarkerStream", kb->getInt32(0), blockOffsetPhi);
58
59    const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);
60    const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
61
62    // For each mask extracted from the PDEP marker block
63    for (unsigned i = 0; i < mSwizzleFactor; i++) {
64        // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
65
66        Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedSourceBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
67        Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedSourceBits, blockWidth), pdepWidth);
68        Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedSourceBits);
69
70
71        Value * shift_amount = kb->CreateURem(updatedProcessedSourceBits, pdepWidth);
72
73        for (unsigned iStreamIndex = 0; iStreamIndex < mStreamSize; iStreamIndex++) {
74            Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
75            Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
76
77            // Load current and next BitBlocks/swizzles
78            Value* current_swizzle_ptr = kb->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(iStreamIndex), current_swizzle_idx, current_blk_idx);
79
80            Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
81
82
83            Value* next_swizzle_ptr = kb->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(iStreamIndex), next_swizzle_idx, next_blk_idx);
84            Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
85
86            // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
87            Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
88
89            Value * borrowed_bits = kb->CreateShl(next_swizzle,
90                                                  kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
91            Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
92
93            Value * segments = kb->fwCast(mPDEPWidth, combined);
94
95            Value * result_swizzle = Constant::getNullValue(segments->getType());
96            // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
97
98            Value * PDEP_mask = PDEP_masks[i];
99            for (unsigned j = 0; j < mSwizzleFactor; j++) {
100                Value * source_field = kb->CreateExtractElement(segments, j);
101                Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask});
102                result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j);
103
104            }
105
106            // Store the result
107            Value* outputPos = kb->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(iStreamIndex), kb->getSize(i), blockOffsetPhi);
108
109            kb->CreateBlockAlignedStore(result_swizzle, outputPos);
110        }
111
112        updatedProcessedSourceBits = kb->CreateAdd(updatedProcessedSourceBits, mask_popcounts[i]);
113    }
114
115    updatedProcessedSourceBitsPhi->addIncoming(updatedProcessedSourceBits, loopBody);
116    blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), loopBody);
117    Value * haveRemBlocks = kb->CreateICmpNE(blockOffsetPhi, blocksToDo);
118    kb->CreateCondBr(haveRemBlocks, loopBody, terminate);
119
120    kb->SetInsertPoint(terminate);
121    kb->setProcessedItemCount("sourceStreamSet0", updatedProcessedSourceBitsPhi);
122
123}
124
125std::vector<Value *> LZ4MultiplePDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
126    Value * pop_counts = kb->simd_popcount(field_width, blk);
127    std::vector<Value *> counts;
128    for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
129        // Store the pop counts for each blk_width field in blk
130        counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
131    }
132    return counts;
133}
134
135std::vector<Value *> LZ4MultiplePDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
136    // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
137    // Split the PDEP marker stream block into mPDEPWidth segments.
138    Value * masks = kb->fwCast(mask_width, PDEP_ms_blk);
139    std::vector<Value *> PDEP_masks;
140    for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
141        PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
142    }
143    return PDEP_masks;
144}
145
146}
Note: See TracBrowser for help on using the repository browser.