source: icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_multiple_pdep_kernel.cpp @ 5926

Last change on this file since 5926 was 5926, checked in by xwa163, 15 months ago

Fix lz4 related GEP instructions and TODO

File size: 11.7 KB
Line 
1//
2// Created by wxy325 on 2018/2/9.
3//
4
5#include "lz4_multiple_pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <iostream>
9#include <vector>
10
11
12using namespace llvm;
13
14
15namespace kernel {
16
17    LZ4MultiplePDEPkernel::LZ4MultiplePDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned streamSize, unsigned swizzleFactor, unsigned PDEP_width, std::string name)
18            : MultiBlockKernel(name + "",
19                               {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", BoundedRate(0, 1)}},
20                               {},
21                               {}, {}, {})
22            , mSwizzleFactor(swizzleFactor)
23            , mPDEPWidth(PDEP_width)
24            , mStreamSize(streamSize)
25    {
26        assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
27        assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
28
29        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet0", BoundedRate(0, 1), Swizzled()});
30        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet0", RateEqualTo("PDEPmarkerStream")});
31
32        for (int i = 1; i < streamSize; i++) {
33            mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet" + std::to_string(i), BoundedRate(0, 1), Swizzled()});
34            mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet" + std::to_string(i), RateEqualTo("outputStreamSet0")});
35        }
36    }
37
38    void LZ4MultiplePDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
39        BasicBlock * entry = kb->GetInsertBlock();
40//        kb->CallPrintInt("--------------" + this->getName() + " doMultiBlock Start:", kb->getSize(0));
41        BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
42        BasicBlock * checkSourceCount = kb->CreateBasicBlock("checkSourceCount");
43        BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
44        BasicBlock * terminate = kb->CreateBasicBlock("terminate");
45
46        Value * itemsToDo = mAvailableItemCount[0];
47
48        Value * sourceItemsAvail = mAvailableItemCount[1]; //TODO need to be calculated from numOfStrides
49
50        Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
51        Value * blocksToDo = kb->CreateSelect(mIsFinal, kb->CreateUDivCeil(itemsToDo, blockWidth), kb->CreateUDiv(itemsToDo, blockWidth));
52        Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet0");
53        Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
54
55        Value * pdepWidth = kb->getSize(mPDEPWidth);
56        Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
57        Value * PDEP_func = nullptr;
58        if (mPDEPWidth == 64) {
59            PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
60        } else if (mPDEPWidth == 32) {
61            PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
62        }
63        kb->CreateBr(checkLoopCond);
64
65        kb->SetInsertPoint(checkLoopCond);
66        // The following PHINodes' values can come from entry or processBlock
67        PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
68        PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
69        PHINode * updatedProcessedSourceBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
70        PHINode * sourceItemsRemaining = kb->CreatePHI(kb->getSizeTy(), 2);
71        blocksToDoPhi->addIncoming(blocksToDo, entry);
72        blockOffsetPhi->addIncoming(kb->getSize(0), entry);
73        updatedProcessedSourceBitsPhi->addIncoming(processedSourceBits, entry);
74        sourceItemsRemaining->addIncoming(sourceItemsAvail, entry);
75
76        Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
77        kb->CreateCondBr(haveRemBlocks, checkSourceCount, terminate);
78
79        kb->SetInsertPoint(checkSourceCount);
80        // Extract the values we will use in the main processing loop
81        Value * updatedProcessedSourceBits = updatedProcessedSourceBitsPhi;
82        Value * updatedSourceItems = sourceItemsRemaining;
83        Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->getInputStreamBlockPtr("PDEPmarkerStream", kb->getInt32(0), blockOffsetPhi));
84
85        const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);
86        const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
87
88        Value * total_count = mask_popcounts[0];
89        for (unsigned j = 1; j < mask_popcounts.size(); j++) {
90            total_count = kb->CreateAdd(total_count, mask_popcounts[j]);
91        }
92//    kb->CallPrintInt("total_count", total_count);
93//    kb->CallPrintInt("sourceItemsRemaining", sourceItemsRemaining);
94        // Do not check popcount in final block, since there may be some useless pdep marker in the end
95        kb->CreateCondBr(kb->CreateOr(kb->CreateICmpULE(total_count, sourceItemsRemaining), mIsFinal), processBlock, terminate);
96        kb->SetInsertPoint(processBlock);
97
98        // For each mask extracted from the PDEP marker block
99        for (unsigned i = 0; i < mSwizzleFactor; i++) {
100            // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
101
102            Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedSourceBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
103            Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedSourceBits, blockWidth), pdepWidth);
104            Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedSourceBits);
105
106
107            Value * shift_amount = kb->CreateURem(updatedProcessedSourceBits, pdepWidth);
108
109            for (int iStreamIndex = 0; iStreamIndex < mStreamSize; iStreamIndex++) {
110                Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
111                Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
112
113                // Load current and next BitBlocks/swizzles
114                Value* current_swizzle_ptr = kb->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(iStreamIndex), current_swizzle_idx, current_blk_idx);
115
116                Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
117
118
119                Value* next_swizzle_ptr = kb->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(iStreamIndex), next_swizzle_idx, next_blk_idx);
120                Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
121
122                // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
123                Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
124
125                Value * borrowed_bits = kb->CreateShl(next_swizzle,
126                                                      kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
127                Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
128
129                Value * segments = kb->fwCast(mPDEPWidth, combined);
130
131//                kb->CallPrintInt("current_swizzle_idx", current_swizzle_idx);
132//                kb->CallPrintInt("next_swizzle_idx", next_swizzle_idx);
133//                if (iStreamIndex == 1) {
134//                    kb->CallPrintInt("current_swizzle_ptr"  + std::to_string(iStreamIndex) , current_swizzle_ptr);
135//                    kb->CallPrintRegister("current_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), current_swizzle);
136//
137//                    kb->CallPrintRegister("next_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), next_swizzle);
138//                    kb->CallPrintRegister("segments_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), segments);
139//                }
140
141                Value * result_swizzle = Constant::getNullValue(segments->getType());
142                // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
143
144                Value * PDEP_mask = PDEP_masks[i];
145                for (unsigned j = 0; j < mSwizzleFactor; j++) {
146                    Value * source_field = kb->CreateExtractElement(segments, j);
147                    Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask});
148                    result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j);
149
150                }
151
152                // Store the result
153                Value* outputPos = kb->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(iStreamIndex), kb->getSize(i), blockOffsetPhi);
154
155                kb->CreateBlockAlignedStore(result_swizzle, outputPos);
156            }
157
158            updatedProcessedSourceBits = kb->CreateAdd(updatedProcessedSourceBits, mask_popcounts[i]);
159            updatedSourceItems = kb->CreateSub(updatedSourceItems, mask_popcounts[i]);
160        }
161
162        updatedProcessedSourceBitsPhi->addIncoming(updatedProcessedSourceBits, processBlock);
163        blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
164        blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
165        sourceItemsRemaining->addIncoming(updatedSourceItems, processBlock);
166        kb->CreateBr(checkLoopCond);
167
168        kb->SetInsertPoint(terminate);
169        for (int i = 0; i < mStreamSize; i++) {
170            kb->setProcessedItemCount("sourceStreamSet" + std::to_string(i), updatedProcessedSourceBitsPhi);
171        }
172
173        Value* processedBlock = kb->CreateSub(blocksToDo, blocksToDoPhi);
174//        kb->CallPrintInt("blocksToDoPhi", blocksToDoPhi);
175
176        kb->setProcessedItemCount("PDEPmarkerStream",
177                                  kb->CreateSelect(mIsFinal,
178                                                   kb->CreateAdd(kb->getProcessedItemCount("PDEPmarkerStream"), itemsToDo),
179                                                   kb->CreateAdd(kb->getProcessedItemCount("PDEPmarkerStream"),kb->CreateMul(processedBlock, blockWidth))
180                                  )
181        );
182    }
183
184    std::vector<Value *> LZ4MultiplePDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
185        Value * pop_counts = kb->simd_popcount(field_width, blk);
186        std::vector<Value *> counts;
187        for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
188            // Store the pop counts for each blk_width field in blk
189            counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
190        }
191        return counts;
192    }
193
194    std::vector<Value *> LZ4MultiplePDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
195        // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
196        // Split the PDEP marker stream block into mPDEPWidth segments.
197        Value * masks = kb->fwCast(mask_width, PDEP_ms_blk);
198        std::vector<Value *> PDEP_masks;
199        for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
200            PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
201        }
202        return PDEP_masks;
203    }
204}
Note: See TracBrowser for help on using the repository browser.