source: icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp @ 5866

Last change on this file since 5866 was 5865, checked in by nmedfort, 19 months ago

More work on the pipeline I/O rate handling

File size: 10.4 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5#include "pdep_kernel.h"
6#include <kernels/kernel_builder.h>
7#include <llvm/Support/raw_ostream.h>
8#include <iostream>
9
10using namespace llvm;
11
12namespace kernel {
13
14PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & kb, unsigned streamCount, unsigned swizzleFactor, unsigned PDEP_width, std::string name)
15: MultiBlockKernel(name + "",
16                  {Binding{kb->getStreamSetTy(), "PDEPmarkerStream"},
17                   Binding{kb->getStreamSetTy(streamCount), "sourceStreamSet", BoundedRate(0, 1)}},
18                  {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
19                  {}, {}, {})
20, mSwizzleFactor(swizzleFactor)
21, mPDEPWidth(PDEP_width)
22{
23    assert((mSwizzleFactor == (kb->getBitBlockWidth() / PDEP_width)) && "swizzle factor must equal bitBlockWidth / PDEP_width");
24    assert((mPDEPWidth == 64 || mPDEPWidth == 32) && "PDEP width must be 32 or 64");
25}
26
27void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, Value * const numOfStrides) {
28    BasicBlock * entry = kb->GetInsertBlock();
29//    kb->CallPrintInt("--------------" + this->getName() + " doMultiBlock Start:", kb->getSize(0));
30    BasicBlock * checkLoopCond = kb->CreateBasicBlock("checkLoopCond");
31    BasicBlock * checkSourceCount = kb->CreateBasicBlock("checkSourceCount");
32    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
33    BasicBlock * terminate = kb->CreateBasicBlock("terminate");
34
35    Value * itemsToDo = mAvailableItemCount[0];
36
37    Value * sourceItemsAvail = mAvailableItemCount[1];
38//    kb->CallPrintInt("itemsToDo:", itemsToDo);
39//    kb->CallPrintInt("sourceItemsAvail:", sourceItemsAvail);
40
41
42    Value * PDEPStrmPtr = kb->getInputStreamBlockPtr("PDEPmarkerStream", kb->getInt32(0)); // mStreamBufferPtr[0];
43    Value * inputSwizzlesPtr = kb->getInputStreamBlockPtr("sourceStreamSet", kb->getInt32(0)); // mStreamBufferPtr[1];
44    // Get pointer to start of the output StreamSetBlock we're currently writing to
45    Value * outputStreamPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(0)); // mStreamBufferPtr[2];
46
47//    kb->CallPrintInt("aaa", outputStreamPtr)
48
49    Constant * blockWidth = kb->getSize(kb->getBitBlockWidth()); // 256
50    Value * blocksToDo = kb->CreateUDivCeil(itemsToDo, blockWidth); // 1 if this is the final block TODO the assumption is incorrect here
51    Value * processedSourceBits = kb->getProcessedItemCount("sourceStreamSet");
52    Value * base_src_blk_idx = kb->CreateUDiv(processedSourceBits, blockWidth);
53       
54    Value * pdepWidth = kb->getSize(mPDEPWidth);
55    Value * pdepWidth_1 = kb->getSize(mPDEPWidth - 1);
56    Value * PDEP_func = nullptr;
57    if (mPDEPWidth == 64) {
58        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
59    } else if (mPDEPWidth == 32) {
60        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
61    }
62    kb->CreateBr(checkLoopCond);
63
64    kb->SetInsertPoint(checkLoopCond);
65    // The following PHINodes' values can come from entry or processBlock
66    PHINode * blocksToDoPhi = kb->CreatePHI(kb->getSizeTy(), 2);
67    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2); // block offset from the base block, e.g. 0, 1, 2, ...
68    PHINode * updatedProcessedSourceBitsPhi = kb->CreatePHI(kb->getSizeTy(), 2);
69    PHINode * sourceItemsRemaining = kb->CreatePHI(kb->getSizeTy(), 2);
70    blocksToDoPhi->addIncoming(blocksToDo, entry);
71    blockOffsetPhi->addIncoming(kb->getSize(0), entry);
72    updatedProcessedSourceBitsPhi->addIncoming(processedSourceBits, entry);
73    sourceItemsRemaining->addIncoming(sourceItemsAvail, entry);
74
75    Value * haveRemBlocks = kb->CreateICmpUGT(blocksToDoPhi, kb->getSize(0));
76    kb->CreateCondBr(haveRemBlocks, checkSourceCount, terminate);
77
78    kb->SetInsertPoint(checkSourceCount);
79    // Extract the values we will use in the main processing loop
80    Value * updatedProcessedSourceBits = updatedProcessedSourceBitsPhi;
81    Value * updatedSourceItems = sourceItemsRemaining;
82    Value * PDEP_ms_blk = kb->CreateBlockAlignedLoad(kb->CreateGEP(PDEPStrmPtr, blockOffsetPhi));
83
84    const auto PDEP_masks = get_PDEP_masks(kb, PDEP_ms_blk, mPDEPWidth);   
85    const auto mask_popcounts = get_block_popcounts(kb, PDEP_ms_blk, mPDEPWidth);
86   
87    Value * total_count = mask_popcounts[0];
88    for (unsigned j = 1; j < mask_popcounts.size(); j++) {
89        total_count = kb->CreateAdd(total_count, mask_popcounts[j]);
90    }
91//    kb->CallPrintInt("total_count", total_count);
92//    kb->CallPrintInt("sourceItemsRemaining", sourceItemsRemaining);
93    kb->CreateCondBr(kb->CreateICmpULE(total_count, sourceItemsRemaining), processBlock, terminate);
94    kb->SetInsertPoint(processBlock);
95
96    // For each mask extracted from the PDEP marker block
97    for (unsigned i = 0; i < mSwizzleFactor; i++) {
98        // Do block and swizzle index calculations, then combine the "current" and "next" swizzles
99
100        Value * current_blk_idx = kb->CreateSub(kb->CreateUDiv(updatedProcessedSourceBits, blockWidth), base_src_blk_idx); // blk index == stream set block index
101        Value * current_swizzle_idx = kb->CreateUDiv(kb->CreateURem(updatedProcessedSourceBits, blockWidth), pdepWidth);
102        Value * ahead_pdep_width_less_1 = kb->CreateAdd(pdepWidth_1, updatedProcessedSourceBits);
103
104        Value * next_blk_idx = kb->CreateSub(kb->CreateUDiv(ahead_pdep_width_less_1, blockWidth), base_src_blk_idx);
105        Value * next_swizzle_idx = kb->CreateUDiv(kb->CreateURem(ahead_pdep_width_less_1, blockWidth), pdepWidth);
106
107//        kb->CallPrintInt("current_blk_idx", current_blk_idx);
108//        kb->CallPrintInt("current_swizzle_idx", current_swizzle_idx);
109
110        // Load current and next BitBlocks/swizzles
111        // TODO can not guarantee the two GEP is correct, need to check later
112        Value * current_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, kb->CreateAdd(kb->CreateMul(current_blk_idx, kb->getSize(mSwizzleFactor)), current_swizzle_idx));
113        Value * next_swizzle_ptr = kb->CreateGEP(inputSwizzlesPtr, kb->CreateAdd(kb->CreateMul(next_blk_idx, kb->getSize(mSwizzleFactor)), next_swizzle_idx));
114
115        Value * current_swizzle = kb->CreateBlockAlignedLoad(current_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
116        Value * next_swizzle = kb->CreateBlockAlignedLoad(next_swizzle_ptr);//Constant::getNullValue(cast<PointerType>(current_swizzle_ptr->getType())->getElementType());
117//        kb->CallPrintInt("ptr", current_swizzle_ptr);
118//        kb->CallPrintRegister("current_swizzle", current_swizzle);
119
120        // Combine the two swizzles to guarantee we'll have enough source bits for the PDEP operation
121        Value * shift_amount = kb->CreateURem(updatedProcessedSourceBits, pdepWidth);
122        Value * remaining_bits = kb->CreateLShr(current_swizzle, kb->simd_fill(mPDEPWidth, shift_amount)); // shift away bits that have already been used
123
124        Value * borrowed_bits = kb->CreateShl(next_swizzle,
125                                              kb->simd_fill(mPDEPWidth, kb->CreateSub(pdepWidth, shift_amount))); // shift next swizzle left by width of first swizzle
126        Value * combined = kb->CreateOr(remaining_bits, borrowed_bits); // combine current swizzle and next swizzle
127
128        Value * segments = kb->fwCast(mPDEPWidth, combined);
129        Value * result_swizzle = Constant::getNullValue(segments->getType());
130        // Apply PDEP to each mPDEPWidth segment of the combined swizzle using the current PDEP mask
131
132
133
134
135        Value * PDEP_mask = PDEP_masks[i];
136        for (unsigned j = 0; j < mSwizzleFactor; j++) {
137            Value * source_field = kb->CreateExtractElement(segments, j);
138            Value * PDEP_field = kb->CreateCall(PDEP_func, {source_field, PDEP_mask});
139            result_swizzle = kb->CreateInsertElement(result_swizzle, PDEP_field, j); 
140        }
141
142        // Store the result
143        auto outputPos = kb->CreateGEP(outputStreamPtr, kb->CreateAdd(kb->CreateMul(blockOffsetPhi, kb->getSize(mSwizzleFactor)), kb->getSize(i)));
144        kb->CreateBlockAlignedStore(result_swizzle, outputPos);
145        updatedProcessedSourceBits = kb->CreateAdd(updatedProcessedSourceBits, mask_popcounts[i]);
146        updatedSourceItems = kb->CreateSub(updatedSourceItems, mask_popcounts[i]);
147    }
148
149    updatedProcessedSourceBitsPhi->addIncoming(updatedProcessedSourceBits, processBlock);
150    blocksToDoPhi->addIncoming(kb->CreateSub(blocksToDoPhi, kb->getSize(1)), processBlock);
151    blockOffsetPhi->addIncoming(kb->CreateAdd(blockOffsetPhi, kb->getSize(1)), processBlock);
152    sourceItemsRemaining->addIncoming(updatedSourceItems, processBlock);
153    kb->CreateBr(checkLoopCond);
154
155    kb->SetInsertPoint(terminate);
156//    Value * itemsDone = kb->CreateMul(blockOffsetPhi, blockWidth);
157//    itemsDone = kb->CreateSelect(kb->CreateICmpULT(itemsToDo, itemsDone), itemsToDo, itemsDone);
158//    kb->setProcessedItemCount("PDEPmarkerStream", kb->CreateAdd(itemsDone, kb->getProcessedItemCount("PDEPmarkerStream")));
159    kb->setProcessedItemCount("sourceStreamSet", updatedProcessedSourceBitsPhi);
160
161
162//    kb->CallPrintInt("itemsDone:", itemsDone);
163//    kb->CallPrintInt("produced:", kb->getProducedItemCount("outputStreamSet"));
164//    kb->CallPrintInt("--------------" + this->getName() + " doMultiBlock End:", kb->getSize(0));
165}
166
167std::vector<Value *> PDEPkernel::get_block_popcounts(const std::unique_ptr<KernelBuilder> & kb, Value * blk, const unsigned field_width) {
168    Value * pop_counts = kb->simd_popcount(field_width, blk);
169    std::vector<Value *> counts;
170    for (unsigned i = 0; i < kb->getBitBlockWidth() / field_width ; i++) {
171        // Store the pop counts for each blk_width field in blk
172        counts.push_back(kb->CreateExtractElement(pop_counts, i)); // Extract field i from SIMD register popCounts
173    }
174    return counts;
175}
176
177std::vector<Value *> PDEPkernel::get_PDEP_masks(const std::unique_ptr<KernelBuilder> & kb, Value * PDEP_ms_blk, const unsigned mask_width) {
178    // We apply the PDEP operation mPDEPWidth bits at a time (e.g. if block is 256 bits and mPDEPWidth is 64, apply 4 PDEP ops to full process swizzle).
179    // Split the PDEP marker stream block into mPDEPWidth segments.
180    Value * masks = kb->fwCast(mask_width, PDEP_ms_blk); 
181    std::vector<Value *> PDEP_masks;
182    for (unsigned i = 0; i < kb->getBitBlockWidth() / mask_width; i++) {
183        PDEP_masks.push_back(kb->CreateExtractElement(masks, i));
184    }
185    return PDEP_masks;
186}
187}
Note: See TracBrowser for help on using the repository browser.