source: icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp @ 6297

Last change on this file since 6297 was 6261, checked in by nmedfort, 9 months ago

Work on OptimizationBranch; revisited pipeline termination

File size: 20.1 KB
RevLine 
[5588]1/*
[6071]2 *  Copyright (c) 2018 International Characters.
[5588]3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
#include "pdep_kernel.h"
#include <kernels/kernel_builder.h>
#include <llvm/Support/raw_ostream.h>
#include <toolchain/toolchain.h>
#include <toolchain/driver.h>
#include <toolchain/cpudriver.h>
#include <IR_Gen/idisa_target.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/Module.h>
#include <vector>
[5588]14
[6045]15
[5588]16using namespace llvm;
17
18namespace kernel {
19
[5870]20PDEPkernel::PDEPkernel(const std::unique_ptr<kernel::KernelBuilder> & b, const unsigned swizzleFactor, std::string name)
[6261]21: MultiBlockKernel(b, std::move(name),
[5870]22// input stream sets
23{Binding{b->getStreamSetTy(), "marker", FixedRate(), Principal()},
[5985]24Binding{b->getStreamSetTy(swizzleFactor), "source", PopcountOf("marker"), BlockSize(b->getBitBlockWidth() / swizzleFactor) }},
[5870]25// output stream set
[5985]26{Binding{b->getStreamSetTy(swizzleFactor), "output", FixedRate(), BlockSize(b->getBitBlockWidth() / swizzleFactor)}},
27{}, {}, {})
[5870]28, mSwizzleFactor(swizzleFactor) {
29
[5588]30}
31
[5870]32void PDEPkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) {
33    BasicBlock * const entry = b->GetInsertBlock();
34    BasicBlock * const processBlock = b->CreateBasicBlock("processBlock");
35    BasicBlock * const finishedStrides = b->CreateBasicBlock("finishedStrides");
36    const auto pdepWidth = b->getBitBlockWidth() / mSwizzleFactor;
37    ConstantInt * const BLOCK_WIDTH = b->getSize(b->getBitBlockWidth());
38    ConstantInt * const PDEP_WIDTH = b->getSize(pdepWidth);
[5627]39
[5985]40    Constant * const ZERO = b->getSize(0);
41    Value * const sourceItemCount = b->getProcessedItemCount("source");
[5706]42
[5985]43    Value * const initialSourceOffset = b->CreateURem(sourceItemCount, BLOCK_WIDTH);
[5870]44    b->CreateBr(processBlock);
[5627]45
[5870]46    b->SetInsertPoint(processBlock);
47    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
[5985]48    strideIndex->addIncoming(ZERO, entry);
49    PHINode * const bufferPhi = b->CreatePHI(b->getBitBlockType(), 2);
50    bufferPhi->addIncoming(Constant::getNullValue(b->getBitBlockType()), entry);
[5870]51    PHINode * const sourceOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
52    sourceOffsetPhi->addIncoming(initialSourceOffset, entry);
53    PHINode * const bufferSizePhi = b->CreatePHI(b->getSizeTy(), 2);
[5985]54    bufferSizePhi->addIncoming(ZERO, entry);
[5627]55
56    // Extract the values we will use in the main processing loop
[5985]57    Value * const markerStream = b->getInputStreamBlockPtr("marker", ZERO, strideIndex);
58    Value * const markerValue = b->CreateBlockAlignedLoad(markerStream);
59    Value * const selectors = b->fwCast(pdepWidth, markerValue);
[5870]60    Value * const numOfSelectors = b->simd_popcount(pdepWidth, selectors);
[5627]61
[5870]62    // For each element of the marker block
63    Value * bufferSize = bufferSizePhi;
64    Value * sourceOffset = sourceOffsetPhi;
65    Value * buffer = bufferPhi;
[5588]66    for (unsigned i = 0; i < mSwizzleFactor; i++) {
[5857]67
[5870]68        // How many bits will we deposit?
[5985]69        Value * const required = b->CreateExtractElement(numOfSelectors, b->getSize(i));
[5857]70
[5870]71        // Aggressively enqueue any additional bits
72        BasicBlock * const entry = b->GetInsertBlock();
73        BasicBlock * const enqueueBits = b->CreateBasicBlock();
74        b->CreateBr(enqueueBits);
[5588]75
[5870]76        b->SetInsertPoint(enqueueBits);
[5985]77        PHINode * const updatedBufferSize = b->CreatePHI(bufferSize->getType(), 2);
78        updatedBufferSize->addIncoming(bufferSize, entry);
79        PHINode * const updatedSourceOffset = b->CreatePHI(sourceOffset->getType(), 2);
80        updatedSourceOffset->addIncoming(sourceOffset, entry);
81        PHINode * const updatedBuffer = b->CreatePHI(buffer->getType(), 2);
82        updatedBuffer->addIncoming(buffer, entry);
[5857]83
[5985]84        // Calculate the block and swizzle index of the current swizzle row
85        Value * const blockOffset = b->CreateUDiv(updatedSourceOffset, BLOCK_WIDTH);
86        Value * const swizzleIndex = b->CreateUDiv(b->CreateURem(updatedSourceOffset, BLOCK_WIDTH), PDEP_WIDTH);
87        Value * const swizzle = b->CreateBlockAlignedLoad(b->getInputStreamBlockPtr("source", swizzleIndex, blockOffset));
88        Value * const swizzleOffset = b->CreateURem(updatedSourceOffset, PDEP_WIDTH);
[5857]89
[5870]90        // Shift the swizzle to the right to clear off any used bits ...
91        Value * const swizzleShift = b->simd_fill(pdepWidth, swizzleOffset);
92        Value * const unreadBits = b->CreateLShr(swizzle, swizzleShift);
[5588]93
[5870]94        // ... then to the left to align the bits with the buffer and combine them.
[5985]95        Value * const bufferShift = b->simd_fill(pdepWidth, updatedBufferSize);
[5870]96        Value * const pendingBits = b->CreateShl(unreadBits, bufferShift);
[5857]97
[5985]98        buffer = b->CreateOr(updatedBuffer, pendingBits);
99        updatedBuffer->addIncoming(buffer, enqueueBits);
100
101        // Update the buffer size with the number of bits we have actually enqueued
102        Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), updatedBufferSize);
[5870]103        bufferSize = b->CreateUMin(maxBufferSize, PDEP_WIDTH);
[5985]104        updatedBufferSize->addIncoming(bufferSize, enqueueBits);
105
[5870]106        // ... and increment the source offset by the number we actually inserted
[5985]107        Value * const inserted = b->CreateSub(bufferSize, updatedBufferSize);
108        sourceOffset = b->CreateAdd(updatedSourceOffset, inserted);
109        updatedSourceOffset->addIncoming(sourceOffset, enqueueBits);
110
111        // INVESTIGATE: we can branch at most once here. I'm not sure whether the potential
112        // branch misprediction is better or worse than always filling from two swizzles to
113        // ensure that we have enough bits to deposit.
[5870]114        BasicBlock * const depositBits = b->CreateBasicBlock();
[5985]115        b->CreateUnlikelyCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
[5588]116
[5870]117        b->SetInsertPoint(depositBits);
[5985]118
[5870]119        // Apply PDEP to each element of the combined swizzle using the current PDEP mask
120        Value * const mask = b->CreateExtractElement(selectors, i);
[6041]121        Value* result = b->simd_pdep(pdepWidth, buffer, b->simd_fill(pdepWidth, mask));
[5588]122
123        // Store the result
[5985]124        Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output", b->getSize(i), strideIndex);
[5870]125        b->CreateBlockAlignedStore(result, outputStreamPtr);
126
127        // Shift away any used bits from the buffer and decrement our buffer size by the number we used
128        Value * const usedShift = b->simd_fill(pdepWidth, required);
129        buffer = b->CreateLShr(buffer, usedShift);
130        bufferSize = b->CreateSub(bufferSize, required);
[5588]131    }
[5627]132
[5870]133    BasicBlock * const finishedBlock = b->GetInsertBlock();
134    sourceOffsetPhi->addIncoming(sourceOffset, finishedBlock);
135    bufferSizePhi->addIncoming(bufferSize, finishedBlock);
136    bufferPhi->addIncoming(buffer, finishedBlock);
137    Value * const nextStrideIndex = b->CreateAdd(strideIndex, b->getSize(1));
138    strideIndex->addIncoming(nextStrideIndex, finishedBlock);
139    b->CreateLikelyCondBr(b->CreateICmpNE(nextStrideIndex, numOfBlocks), processBlock, finishedStrides);
[5627]140
[5870]141    b->SetInsertPoint(finishedStrides);
[5588]142}
[6228]143
144StreamExpandKernel::StreamExpandKernel(const std::unique_ptr<kernel::KernelBuilder> & b
[6184]145                                       , StreamSet * source, const unsigned base, StreamSet * mask
146                                       , StreamSet * expanded
147                                       , const unsigned FieldWidth)
[6261]148: MultiBlockKernel(b, "streamExpand" + std::to_string(FieldWidth)
[6184]149+ "_" + std::to_string(source->getNumElements())
150+ "_" + std::to_string(base) + "_" + std::to_string(expanded->getNumElements()),
151
152{Binding{"marker", mask, FixedRate(), Principal()},
[6255]153Binding{"source", source, PopcountOf("marker")}},
[6258]154{Binding{"output", expanded, FixedRate(), BlockSize(1)}},
[6184]155{}, {}, {})
156, mFieldWidth(FieldWidth)
157, mSelectedStreamBase(base)
158, mSelectedStreamCount(expanded->getNumElements()) {
159
[6045]160}
[5588]161
[6045]162void StreamExpandKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
[6184]163    Type * fieldWidthTy = b->getIntNTy(mFieldWidth);
[6045]164    Type * sizeTy = b->getSizeTy();
[6184]165    const unsigned numFields = b->getBitBlockWidth() / mFieldWidth;
[6228]166
[6045]167    Constant * const ZERO = b->getSize(0);
168    Constant * bwConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
[6184]169    Constant * fwConst = ConstantInt::get(sizeTy, mFieldWidth);
170    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fieldWidthTy, mFieldWidth));
171    Constant * fw_sub1Splat = ConstantVector::getSplat(numFields, ConstantInt::get(fieldWidthTy, mFieldWidth - 1));
[6228]172
[6045]173    BasicBlock * entry = b->GetInsertBlock();
174    BasicBlock * expandLoop = b->CreateBasicBlock("expandLoop");
175    BasicBlock * expansionDone = b->CreateBasicBlock("expansionDone");
176    Value * processedSourceItems = b->getProcessedItemCount("source");
[6184]177    Value * initialSourceOffset = b->CreateURem(processedSourceItems, bwConst);
[6228]178
[6184]179    Value * pendingData[mSelectedStreamCount];
[6071]180    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
[6079]181        pendingData[i] = b->loadInputStreamBlock("source", b->getInt32(mSelectedStreamBase + i), ZERO);
[6045]182    }
[6228]183
[6045]184    b->CreateBr(expandLoop);
185    // Main Loop
186    b->SetInsertPoint(expandLoop);
187    PHINode * blockNoPhi = b->CreatePHI(b->getSizeTy(), 2);
[6078]188    PHINode * pendingOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
[6071]189    PHINode * pendingDataPhi[mSelectedStreamCount];
[6045]190    blockNoPhi->addIncoming(ZERO, entry);
[6184]191    pendingOffsetPhi->addIncoming(initialSourceOffset, entry);
[6071]192    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
[6045]193        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
194        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
195    }
[6255]196
[6045]197    Value * deposit_mask = b->loadInputStreamBlock("marker", ZERO, blockNoPhi);
[6184]198
[6045]199    // Calculate the field values and offsets we need for assembling a
200    // a full block of source bits.  Assembly will use the following operations.
[6078]201    // A = b->simd_srlv(fw, b->mvmd_dsll(fw, source, pending, field_offset_lo), bit_offset);
202    // B = b->simd_sllv(fw, b->mvmd_dsll(fw, source, pending, field_offset_hi), shift_fwd);
[6045]203    // all_source_bits = simd_or(A, B);
[6078]204    Value * pendingOffset = b->CreateURem(pendingOffsetPhi, bwConst);
[6184]205    // Value * pendingItems = b->CreateURem(b->CreateSub(bwConst, pendingOffset), bwConst);
206    Value * pendingItems = b->CreateSub(bwConst, pendingOffset);
207
208    Value * field_offset_lo = b->CreateCeilUDiv(pendingItems, fwConst);
209    Value * bit_offset = b->simd_fill(mFieldWidth, b->CreateURem(pendingOffset, fwConst));
[6045]210    // Carefully avoid a shift by the full fieldwith (which gives a poison value).
211    // field_offset_lo + 1 unless the bit_offset is 0, in which case it is just field_offset_lo.
[6078]212    Value * field_offset_hi =  b->CreateUDiv(pendingItems, fwConst);
[6045]213    // fw - bit_offset, unless bit_offset is 0, in which case, the shift_fwd is 0.
214    Value * shift_fwd = b->CreateURem(b->CreateSub(fwSplat, bit_offset), fwSplat);
[6071]215
[6045]216    // Once all source bits are assembled, they need to be distributed to the
217    // output fields in accord with the popcounts of the deposit mask fields.
218    // The bits for each output field will typically come from (at most) two
219    // source fields, with offsets.  Calculate the field numbers and offsets.
[6228]220
[6184]221    Value * fieldPopCounts = b->simd_popcount(mFieldWidth, deposit_mask);
[6045]222    // For each field determine the (partial) sum popcount of all fields prior to
223    // the current field.
224    Value * partialSum = fieldPopCounts;
225    for (unsigned i = 1; i < numFields; i *= 2) {
[6184]226        partialSum = b->simd_add(mFieldWidth, partialSum, b->mvmd_slli(mFieldWidth, partialSum, i));
[6045]227    }
[6184]228    Value * const blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields - 1), sizeTy);
229    partialSum = b->mvmd_slli(mFieldWidth, partialSum, 1);
[6071]230
[6184]231    Value * const source_field_lo = b->CreateUDiv(partialSum, fwSplat);
232    Value * const source_field_hi = b->CreateUDiv(b->CreateAdd(partialSum, fw_sub1Splat), fwSplat);
233    Value * const source_shift_lo = b->CreateAnd(partialSum, fw_sub1Splat);  // parallel URem
234    Value * const source_shift_hi = b->CreateAnd(b->CreateSub(fwSplat, source_shift_lo), fw_sub1Splat);
235
236    // The source stream may not be positioned at a block boundary.  Partial data
237    // has been saved in the kernel state, determine the next full block number
238    // for loading source streams.
239    Value * const newPendingOffset = b->CreateAdd(pendingOffsetPhi, blockPopCount);
240    Value * const srcBlockNo = b->CreateUDiv(newPendingOffset, bwConst);
[6228]241
[6045]242    // Now load and process source streams.
[6184]243    Value * sourceData[mSelectedStreamCount];
[6071]244    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
[6184]245        sourceData[i] = b->loadInputStreamBlock("source", b->getInt32(mSelectedStreamBase + i), srcBlockNo);
246        Value * A = b->simd_srlv(mFieldWidth, b->mvmd_dsll(mFieldWidth, sourceData[i], pendingDataPhi[i], field_offset_lo), bit_offset);
247        Value * B = b->simd_sllv(mFieldWidth, b->mvmd_dsll(mFieldWidth, sourceData[i], pendingDataPhi[i], field_offset_hi), shift_fwd);
[6045]248        Value * full_source_block = b->simd_or(A, B);
[6184]249        Value * C = b->simd_srlv(mFieldWidth, b->mvmd_shuffle(mFieldWidth, full_source_block, source_field_lo), source_shift_lo);
250        Value * D = b->simd_sllv(mFieldWidth, b->mvmd_shuffle(mFieldWidth, full_source_block, source_field_hi), source_shift_hi);
[6045]251        Value * output = b->bitCast(b->simd_or(C, D));
252        b->storeOutputStreamBlock("output", b->getInt32(i), blockNoPhi, output);
253    }
254    //
255    // Update loop control Phis for the next iteration.
256    //
257    Value * nextBlk = b->CreateAdd(blockNoPhi, b->getSize(1));
258    blockNoPhi->addIncoming(nextBlk, expandLoop);
[6184]259    pendingOffsetPhi->addIncoming(newPendingOffset, expandLoop);
[6086]260    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
[6184]261        pendingDataPhi[i]->addIncoming(sourceData[i], expandLoop);
[6086]262    }
[6045]263    //
264    // Now continue the loop if there are more blocks to process.
265    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
266    b->CreateCondBr(moreToDo, expandLoop, expansionDone);
[6228]267
[6045]268    b->SetInsertPoint(expansionDone);
[5588]269}
[6045]270
[6261]271FieldDepositKernel::FieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & b
[6184]272                                       , StreamSet * mask, StreamSet * input, StreamSet * output
273                                       , const unsigned fieldWidth)
[6261]274: MultiBlockKernel(b, "FieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(input->getNumElements()),
[6184]275{Binding{"depositMask", mask}
276, Binding{"inputStreamSet", input}},
277{Binding{"outputStreamSet", output}},
278{}, {}, {})
[6045]279, mFieldWidth(fieldWidth)
[6184]280, mStreamCount(input->getNumElements()) {
281
[6045]282}
[6228]283
[6045]284void FieldDepositKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
285    BasicBlock * entry = kb->GetInsertBlock();
286    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
287    BasicBlock * done = kb->CreateBasicBlock("done");
288    Constant * const ZERO = kb->getSize(0);
289    kb->CreateBr(processBlock);
290    kb->SetInsertPoint(processBlock);
291    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
292    blockOffsetPhi->addIncoming(ZERO, entry);
293    Value * depositMask = kb->loadInputStreamBlock("depositMask", ZERO, blockOffsetPhi);
294    for (unsigned j = 0; j < mStreamCount; ++j) {
295        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
296        Value * output = kb->simd_pdep(mFieldWidth, input, depositMask);
297        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
298    }
299    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
300    blockOffsetPhi->addIncoming(nextBlk, processBlock);
301    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
302    kb->CreateCondBr(moreToDo, processBlock, done);
303    kb->SetInsertPoint(done);
304}
305
[6261]306PDEPFieldDepositKernel::PDEPFieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & b
[6184]307                                               , StreamSet * mask, StreamSet * input, StreamSet * output
308                                               , const unsigned fieldWidth)
[6261]309: MultiBlockKernel(b, "PDEPFieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(input->getNumElements()) ,
[6184]310                   {Binding{"depositMask", mask},
311                    Binding{"inputStreamSet", input}},
312                   {Binding{"outputStreamSet", output}},
[6045]313                   {}, {}, {})
314, mPDEPWidth(fieldWidth)
[6184]315, mStreamCount(input->getNumElements()) {
316    if ((fieldWidth != 32) && (fieldWidth != 64))
317        llvm::report_fatal_error("Unsupported PDEP width for PDEPFieldDepositKernel");
[6045]318}
319
320void PDEPFieldDepositKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
321    Type * fieldTy = kb->getIntNTy(mPDEPWidth);
322    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
323    Constant * PDEP_func = nullptr;
324    if (mPDEPWidth == 64) {
325        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
326    } else if (mPDEPWidth == 32) {
327        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
328    }
329    BasicBlock * entry = kb->GetInsertBlock();
330    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
331    BasicBlock * done = kb->CreateBasicBlock("done");
332    Constant * const ZERO = kb->getSize(0);
333    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPDEPWidth;
334    kb->CreateBr(processBlock);
335    kb->SetInsertPoint(processBlock);
336    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
337    blockOffsetPhi->addIncoming(ZERO, entry);
338    std::vector<Value *> mask(fieldsPerBlock);
[6071]339//  When operating on fields individually, we can use vector load/store with
340//  extract/insert element operations, or we can use individual field load
341//  and stores.   Individual field operations require fewer total operations,
342//  but more memory instructions.   It may be that vector load/extract is better,
343//  while field store is better.   Vector insert then store creates long dependence
344//  chains.
345//
346#define PREFER_FIELD_STORES_OVER_INSERT_ELEMENT
347#ifdef PREFER_FIELD_LOADS_OVER_EXTRACT_ELEMENT
348    Value * depositMaskPtr = kb->getInputStreamBlockPtr("depositMask", ZERO, blockOffsetPhi);
349    depositMaskPtr = kb->CreatePointerCast(depositMaskPtr, fieldPtrTy);
[6046]350    for (unsigned i = 0; i < fieldsPerBlock; i++) {
[6071]351        mask[i] = kb->CreateLoad(kb->CreateGEP(depositMaskPtr, kb->getInt32(i)));
[6046]352    }
[6071]353#else
354    Value * depositMask = kb->fwCast(mPDEPWidth, kb->loadInputStreamBlock("depositMask", ZERO, blockOffsetPhi));
355    for (unsigned i = 0; i < fieldsPerBlock; i++) {
356        mask[i] = kb->CreateExtractElement(depositMask, kb->getInt32(i));
357    }
358#endif
[6045]359    for (unsigned j = 0; j < mStreamCount; ++j) {
[6071]360#ifdef PREFER_FIELD_LOADS_OVER_EXTRACT_ELEMENT
[6045]361        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
362        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
[6071]363#else
364        Value * inputStrm = kb->fwCast(mPDEPWidth, kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi));
365#endif
366#ifdef PREFER_FIELD_STORES_OVER_INSERT_ELEMENT
[6045]367        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
368        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
[6071]369#else
370        Value * outputStrm = kb->fwCast(mPDEPWidth, kb->allZeroes());
371#endif
[6045]372        for (unsigned i = 0; i < fieldsPerBlock; i++) {
[6071]373#ifdef PREFER_FIELD_LOADS_OVER_EXTRACT_ELEMENT
[6045]374            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
[6071]375#else
376            Value * field = kb->CreateExtractElement(inputStrm, kb->getInt32(i));
377#endif
[6045]378            Value * compressed = kb->CreateCall(PDEP_func, {field, mask[i]});
[6071]379#ifdef PREFER_FIELD_STORES_OVER_INSERT_ELEMENT
[6045]380            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
381        }
[6071]382#else
383            outputStrm = kb->CreateInsertElement(outputStrm, compressed, kb->getInt32(i));
384        }
385        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, outputStrm);
386#endif
[6045]387    }
388    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
389    blockOffsetPhi->addIncoming(nextBlk, processBlock);
390    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
391    kb->CreateCondBr(moreToDo, processBlock, done);
392    kb->SetInsertPoint(done);
393}
394
395}
396
Note: See TracBrowser for help on using the repository browser.