source: icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_multiple_pdep_kernel.cpp @ 5873

Last change on this file since 5873 was 5873, checked in by xwa163, 14 months ago
  1. Implement LZ4MultiplePdepKernel, which can do PDEP for more than one input streamset
  2. Add the Swizzled attribute to StreamSet; copy at least one whole block when doing a temporary buffer copy for a Swizzled InputStreamSet
  3. Bug fixing for character_deposit pipeline
  4. Add more test files for character_deposit pipeline
File size: 12.9 KB
Line 
1//
2// Created by wxy325 on 2018/2/9.
3//
4
5#include "lz4_multiple_pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <iostream>
9#include <vector>
10
11
12using namespace llvm;
13
14
15namespace kernel {
16
17    LZ4MultiplePDEPkernel::LZ4MultiplePDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned streamSize, unsigned swizzleFactor, unsigned PDEP_width, std::string name)
18            : MultiBlockKernel(name + "",
19                               {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", BoundedRate(0, 1)}},
20                               {},
21                               {}, {}, {})
22            , mSwizzleFactor(swizzleFactor)
23            , mPDEPWidth(PDEP_width)
24            , mStreamSize(streamSize)
25    {
26        assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
27        assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
28
29        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet0", BoundedRate(0, 1), Swizzled()});
30        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet0", RateEqualTo("PDEPmarkerStream")});
31
32        for (int i = 1; i < streamSize; i++) {
33            mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet" + std::to_string(i), BoundedRate(0, 1), Swizzled()});
34            mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet" + std::to_string(i), RateEqualTo("outputStreamSet0")});
35        }
36    }
37
38    void LZ4MultiplePDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
39        BasicBlock * entry = kb->GetInsertBlock();
40//        kb->CallPrintInt("--------------" + this->getName() + " doMultiBlock Start:", kb->getSize(0));
41        BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
42        BasicBlock * checkSourceCount = kb->CreateBasicBlock("checkSourceCount");
43        BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
44        BasicBlock * terminate = kb->CreateBasicBlock("terminate");
45
46        Value * itemsToDo = mAvailableItemCount[0];
47
48        Value * sourceItemsAvail = mAvailableItemCount[1]; //TODO need to be calculated from numOfStrides
49//    kb->CallPrintInt("itemsToDo:", itemsToDo);
50//        kb->CallPrintInt("sourceItemsAvail:", sourceItemsAvail);
51//        kb->getProcessedItemCount("")
52//        kb->CallPrintInt("sourceItemsAvail2:", sourceItemsAvail2);
53
54
55        Value * PDEPStrmPtr = kb->getInputStreamBlockPtr("PDEPmarkerStream", kb->getInt32(0)); // mStreamBufferPtr[0];
56
57
58        std::vector<Value*> inputSwizzlesPtrs = std::vector<Value*>(mStreamSize, NULL);
59        std::vector<Value*> outputStreamPtrs = std::vector<Value*>(mStreamSize, NULL);
60        for (int i = 0; i < mStreamSize; i++) {
61            inputSwizzlesPtrs[i] = kb->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(i), kb->getInt32(0));
62//            kb->CallPrintInt("@@inputSwizzlesPtrs_" + std::to_string(i), inputSwizzlesPtrs[i]);
63            // Get pointer to start of the output StreamSetBlock we're currently writing to
64            outputStreamPtrs[i] = kb->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), kb->getInt32(0));
65        }
66
67        Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
68        Value * blocksToDo = kb->CreateSelect(mIsFinal, kb->CreateUDivCeil(itemsToDo, blockWidth), kb->CreateUDiv(itemsToDo, blockWidth));
69        Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet0");
70        Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
71
72        Value * pdepWidth = kb->getSize(mPDEPWidth);
73        Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
74        Value * PDEP_func = nullptr;
75        if (mPDEPWidth == 64) {
76            PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
77        } else if (mPDEPWidth == 32) {
78            PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
79        }
80        kb->CreateBr(checkLoopCond);
81
82        kb->SetInsertPoint(checkLoopCond);
83        // The following PHINodes' values can come from entry or processBlock
84        PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
85        PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
86        PHINode * updatedProcessedSourceBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
87        PHINode * sourceItemsRemaining = kb->CreatePHI(kb->getSizeTy(), 2);
88        blocksToDoPhi->addIncoming(blocksToDo, entry);
89        blockOffsetPhi->addIncoming(kb->getSize(0), entry);
90        updatedProcessedSourceBitsPhi->addIncoming(processedSourceBits, entry);
91        sourceItemsRemaining->addIncoming(sourceItemsAvail, entry);
92
93        Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
94        kb->CreateCondBr(haveRemBlocks, checkSourceCount, terminate);
95
96        kb->SetInsertPoint(checkSourceCount);
97        // Extract the values we will use in the main processing loop
98        Value * updatedProcessedSourceBits = updatedProcessedSourceBitsPhi;
99        Value * updatedSourceItems = sourceItemsRemaining;
100        Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(PDEPStrmPtr, blockOffsetPhi));
101
102        const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);
103        const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
104
105        Value * total_count = mask_popcounts[0];
106        for (unsigned j = 1; j < mask_popcounts.size(); j++) {
107            total_count = kb->CreateAdd(total_count, mask_popcounts[j]);
108        }
109//    kb->CallPrintInt("total_count", total_count);
110//    kb->CallPrintInt("sourceItemsRemaining", sourceItemsRemaining);
111        kb->CreateCondBr(kb->CreateICmpULE(total_count, sourceItemsRemaining), processBlock, terminate);
112        kb->SetInsertPoint(processBlock);
113
114        // For each mask extracted from the PDEP marker block
115        for (unsigned i = 0; i < mSwizzleFactor; i++) {
116            // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
117
118            Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedSourceBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
119            Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedSourceBits, blockWidth), pdepWidth);
120            Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedSourceBits);
121
122
123            Value * shift_amount = kb->CreateURem(updatedProcessedSourceBits, pdepWidth);
124
125            for (int iStreamIndex = 0; iStreamIndex < mStreamSize; iStreamIndex++) {
126                Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
127                Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
128
129                // Load current and next BitBlocks/swizzles
130                Value * current_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtrs[iStreamIndex], kb->CreateAdd(kb->CreateMul(current_blk_idx, kb->getSize(mSwizzleFactor)), current_swizzle_idx));
131                Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
132
133                Value * next_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtrs[iStreamIndex], kb->CreateAdd(kb->CreateMul(next_blk_idx, kb->getSize(mSwizzleFactor)), next_swizzle_idx));
134                Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
135
136                // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
137                Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
138
139                Value * borrowed_bits = kb->CreateShl(next_swizzle,
140                                                      kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
141                Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
142
143                Value * segments = kb->fwCast(mPDEPWidth, combined);
144
145//                kb->CallPrintInt("current_swizzle_idx", current_swizzle_idx);
146//                kb->CallPrintInt("next_swizzle_idx", next_swizzle_idx);
147//                if (iStreamIndex == 1) {
148//                    kb->CallPrintInt("current_swizzle_ptr"  + std::to_string(iStreamIndex) , current_swizzle_ptr);
149//                    kb->CallPrintRegister("current_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), current_swizzle);
150//
151//                    kb->CallPrintRegister("next_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), next_swizzle);
152//                    kb->CallPrintRegister("segments_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), segments);
153//                }
154
155                Value * result_swizzle = Constant::getNullValue(segments->getType());
156                // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
157
158                Value * PDEP_mask = PDEP_masks[i];
159                for (unsigned j = 0; j < mSwizzleFactor; j++) {
160                    Value * source_field = kb->CreateExtractElement(segments, j);
161                    Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask});
162                    result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j);
163
164                }
165
166                // Store the result
167                auto outputPos = kb->CreateGEP(outputStreamPtrs[iStreamIndex], kb->CreateAdd(kb->CreateMul(blockOffsetPhi, kb->getSize(mSwizzleFactor)), kb->getSize(i)));
168//                if (iStreamIndex == 0) {
169//                    kb->CallPrintInt("dataPtr_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), outputPos);
170//                    kb->CallPrintRegister("data_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), result_swizzle);
171//                }
172
173                kb->CreateBlockAlignedStore(result_swizzle, outputPos);
174            }
175
176            updatedProcessedSourceBits = kb->CreateAdd(updatedProcessedSourceBits, mask_popcounts[i]);
177            updatedSourceItems = kb->CreateSub(updatedSourceItems, mask_popcounts[i]);
178        }
179
180        updatedProcessedSourceBitsPhi->addIncoming(updatedProcessedSourceBits, processBlock);
181        blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
182        blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
183        sourceItemsRemaining->addIncoming(updatedSourceItems, processBlock);
184        kb->CreateBr(checkLoopCond);
185
186        kb->SetInsertPoint(terminate);
187        for (int i = 0; i < mStreamSize; i++) {
188            kb->setProcessedItemCount("sourceStreamSet" + std::to_string(i), updatedProcessedSourceBitsPhi);
189        }
190
191        Value* processedBlock = kb->CreateSub(blocksToDo, blocksToDoPhi);
192//        kb->CallPrintInt("blocksToDoPhi", blocksToDoPhi);
193
194        kb->setProcessedItemCount("PDEPmarkerStream",
195                                  kb->CreateSelect(mIsFinal,
196                                                   kb->CreateAdd(kb->getProcessedItemCount("PDEPmarkerStream"), itemsToDo),
197                                                   kb->CreateAdd(kb->getProcessedItemCount("PDEPmarkerStream"),kb->CreateMul(processedBlock, blockWidth))
198                                  )
199        );
200    }
201
202    std::vector<Value *> LZ4MultiplePDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
203        Value * pop_counts = kb->simd_popcount(field_width, blk);
204        std::vector<Value *> counts;
205        for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
206            // Store the pop counts for each blk_width field in blk
207            counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
208        }
209        return counts;
210    }
211
212    std::vector<Value *> LZ4MultiplePDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
213        // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
214        // Split the PDEP marker stream block into mPDEPWidth segments.
215        Value * masks = kb->fwCast(mask_width, PDEP_ms_blk);
216        std::vector<Value *> PDEP_masks;
217        for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
218            PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
219        }
220        return PDEP_masks;
221    }
222}
Note: See TracBrowser for help on using the repository browser.