source: icGREP/icgrep-devel/icgrep/kernels/lz4/lz4_multiple_pdep_kernel.cpp @ 5905

Last change on this file since 5905 was 5885, checked in by xwa163, 18 months ago

Implement lz4_numbers_to_bitstream_kernel in new kernel infrastructure, fix bug of extract and deposit processes of lz4_ext_dep in large data.

File size: 12.8 KB
Line 
1//
2// Created by wxy325 on 2018/2/9.
3//
4
5#include "lz4_multiple_pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <iostream>
9#include <vector>
10
11
12using namespace llvm;
13
14
15namespace kernel {
16
17    LZ4MultiplePDEPkernel::LZ4MultiplePDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned streamSize, unsigned swizzleFactor, unsigned PDEP_width, std::string name)
18            : MultiBlockKernel(name + "",
19                               {Binding{kb->getStreamSetTy(), "PDEPmarkerStream", BoundedRate(0, 1)}},
20                               {},
21                               {}, {}, {})
22            , mSwizzleFactor(swizzleFactor)
23            , mPDEPWidth(PDEP_width)
24            , mStreamSize(streamSize)
25    {
26        assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
27        assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
28
29        mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet0", BoundedRate(0, 1), Swizzled()});
30        mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet0", RateEqualTo("PDEPmarkerStream")});
31
32        for (int i = 1; i < streamSize; i++) {
33            mStreamSetInputs.push_back(Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet" + std::to_string(i), BoundedRate(0, 1), Swizzled()});
34            mStreamSetOutputs.push_back(Binding{kb->getStreamSetTy(streamCount), "outputStreamSet" + std::to_string(i), RateEqualTo("outputStreamSet0")});
35        }
36    }
37
38    void LZ4MultiplePDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
39        BasicBlock * entry = kb->GetInsertBlock();
40//        kb->CallPrintInt("--------------" + this->getName() + " doMultiBlock Start:", kb->getSize(0));
41        BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
42        BasicBlock * checkSourceCount = kb->CreateBasicBlock("checkSourceCount");
43        BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
44        BasicBlock * terminate = kb->CreateBasicBlock("terminate");
45
46        Value * itemsToDo = mAvailableItemCount[0];
47
48        Value * sourceItemsAvail = mAvailableItemCount[1]; //TODO need to be calculated from numOfStrides
49
50        Value * PDEPStrmPtr = kb->getInputStreamBlockPtr("PDEPmarkerStream", kb->getInt32(0)); // mStreamBufferPtr[0];
51
52
53        std::vector<Value*> inputSwizzlesPtrs = std::vector<Value*>(mStreamSize, NULL);
54        std::vector<Value*> outputStreamPtrs = std::vector<Value*>(mStreamSize, NULL);
55        for (int i = 0; i < mStreamSize; i++) {
56            inputSwizzlesPtrs[i] = kb->getInputStreamBlockPtr("sourceStreamSet" + std::to_string(i), kb->getInt32(0));
57//            kb->CallPrintInt("@@inputSwizzlesPtrs_" + std::to_string(i), inputSwizzlesPtrs[i]);
58            // Get pointer to start of the output StreamSetBlock we're currently writing to
59            outputStreamPtrs[i] = kb->getOutputStreamBlockPtr("outputStreamSet" + std::to_string(i), kb->getInt32(0));
60        }
61
62        Constant * blockWidth = kb->getSize(kb->getBitBlockWidth());
63        Value * blocksToDo = kb->CreateSelect(mIsFinal, kb->CreateUDivCeil(itemsToDo, blockWidth), kb->CreateUDiv(itemsToDo, blockWidth));
64        Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet0");
65        Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
66
67        Value * pdepWidth = kb->getSize(mPDEPWidth);
68        Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
69        Value * PDEP_func = nullptr;
70        if (mPDEPWidth == 64) {
71            PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
72        } else if (mPDEPWidth == 32) {
73            PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
74        }
75        kb->CreateBr(checkLoopCond);
76
77        kb->SetInsertPoint(checkLoopCond);
78        // The following PHINodes' values can come from entry or processBlock
79        PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
80        PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
81        PHINode * updatedProcessedSourceBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
82        PHINode * sourceItemsRemaining = kb->CreatePHI(kb->getSizeTy(), 2);
83        blocksToDoPhi->addIncoming(blocksToDo, entry);
84        blockOffsetPhi->addIncoming(kb->getSize(0), entry);
85        updatedProcessedSourceBitsPhi->addIncoming(processedSourceBits, entry);
86        sourceItemsRemaining->addIncoming(sourceItemsAvail, entry);
87
88        Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
89        kb->CreateCondBr(haveRemBlocks, checkSourceCount, terminate);
90
91        kb->SetInsertPoint(checkSourceCount);
92        // Extract the values we will use in the main processing loop
93        Value * updatedProcessedSourceBits = updatedProcessedSourceBitsPhi;
94        Value * updatedSourceItems = sourceItemsRemaining;
95        Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(PDEPStrmPtr, blockOffsetPhi));
96
97        const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);
98        const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
99
100        Value * total_count = mask_popcounts[0];
101        for (unsigned j = 1; j < mask_popcounts.size(); j++) {
102            total_count = kb->CreateAdd(total_count, mask_popcounts[j]);
103        }
104//    kb->CallPrintInt("total_count", total_count);
105//    kb->CallPrintInt("sourceItemsRemaining", sourceItemsRemaining);
106        // Do not check popcount in final block, since there may be some useless pdep marker in the end
107        kb->CreateCondBr(kb->CreateOr(kb->CreateICmpULE(total_count, sourceItemsRemaining), mIsFinal), processBlock, terminate);
108        kb->SetInsertPoint(processBlock);
109
110        // For each mask extracted from the PDEP marker block
111        for (unsigned i = 0; i < mSwizzleFactor; i++) {
112            // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
113
114            Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedSourceBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
115            Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedSourceBits, blockWidth), pdepWidth);
116            Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedSourceBits);
117
118
119            Value * shift_amount = kb->CreateURem(updatedProcessedSourceBits, pdepWidth);
120
121            for (int iStreamIndex = 0; iStreamIndex < mStreamSize; iStreamIndex++) {
122                Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
123                Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
124
125                // Load current and next BitBlocks/swizzles
126                Value * current_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtrs[iStreamIndex], kb->CreateAdd(kb->CreateMul(current_blk_idx, kb->getSize(mSwizzleFactor)), current_swizzle_idx));
127                Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
128
129                Value * next_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtrs[iStreamIndex], kb->CreateAdd(kb->CreateMul(next_blk_idx, kb->getSize(mSwizzleFactor)), next_swizzle_idx));
130                Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
131
132                // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
133                Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
134
135                Value * borrowed_bits = kb->CreateShl(next_swizzle,
136                                                      kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
137                Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
138
139                Value * segments = kb->fwCast(mPDEPWidth, combined);
140
141//                kb->CallPrintInt("current_swizzle_idx", current_swizzle_idx);
142//                kb->CallPrintInt("next_swizzle_idx", next_swizzle_idx);
143//                if (iStreamIndex == 1) {
144//                    kb->CallPrintInt("current_swizzle_ptr"  + std::to_string(iStreamIndex) , current_swizzle_ptr);
145//                    kb->CallPrintRegister("current_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), current_swizzle);
146//
147//                    kb->CallPrintRegister("next_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), next_swizzle);
148//                    kb->CallPrintRegister("segments_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), segments);
149//                }
150
151                Value * result_swizzle = Constant::getNullValue(segments->getType());
152                // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
153
154                Value * PDEP_mask = PDEP_masks[i];
155                for (unsigned j = 0; j < mSwizzleFactor; j++) {
156                    Value * source_field = kb->CreateExtractElement(segments, j);
157                    Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask});
158                    result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j);
159
160                }
161
162                // Store the result
163                auto outputPos = kb->CreateGEP(outputStreamPtrs[iStreamIndex], kb->CreateAdd(kb->CreateMul(blockOffsetPhi, kb->getSize(mSwizzleFactor)), kb->getSize(i)));
164//                if (iStreamIndex == 0) {
165//                    kb->CallPrintInt("dataPtr_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), outputPos);
166//                    kb->CallPrintRegister("data_" + std::to_string(iStreamIndex) + "_" + std::to_string(i), result_swizzle);
167//                }
168
169                kb->CreateBlockAlignedStore(result_swizzle, outputPos);
170            }
171
172            updatedProcessedSourceBits = kb->CreateAdd(updatedProcessedSourceBits, mask_popcounts[i]);
173            updatedSourceItems = kb->CreateSub(updatedSourceItems, mask_popcounts[i]);
174        }
175
176        updatedProcessedSourceBitsPhi->addIncoming(updatedProcessedSourceBits, processBlock);
177        blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
178        blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
179        sourceItemsRemaining->addIncoming(updatedSourceItems, processBlock);
180        kb->CreateBr(checkLoopCond);
181
182        kb->SetInsertPoint(terminate);
183        for (int i = 0; i < mStreamSize; i++) {
184            kb->setProcessedItemCount("sourceStreamSet" + std::to_string(i), updatedProcessedSourceBitsPhi);
185        }
186
187        Value* processedBlock = kb->CreateSub(blocksToDo, blocksToDoPhi);
188//        kb->CallPrintInt("blocksToDoPhi", blocksToDoPhi);
189
190        kb->setProcessedItemCount("PDEPmarkerStream",
191                                  kb->CreateSelect(mIsFinal,
192                                                   kb->CreateAdd(kb->getProcessedItemCount("PDEPmarkerStream"), itemsToDo),
193                                                   kb->CreateAdd(kb->getProcessedItemCount("PDEPmarkerStream"),kb->CreateMul(processedBlock, blockWidth))
194                                  )
195        );
196    }
197
198    std::vector<Value *> LZ4MultiplePDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
199        Value * pop_counts = kb->simd_popcount(field_width, blk);
200        std::vector<Value *> counts;
201        for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
202            // Store the pop counts for each blk_width field in blk
203            counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
204        }
205        return counts;
206    }
207
208    std::vector<Value *> LZ4MultiplePDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
209        // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
210        // Split the PDEP marker stream block into mPDEPWidth segments.
211        Value * masks = kb->fwCast(mask_width, PDEP_ms_blk);
212        std::vector<Value *> PDEP_masks;
213        for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
214            PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
215        }
216        return PDEP_masks;
217    }
218}
Note: See TracBrowser for help on using the repository browser.