Changeset 6045


Ignore:
Timestamp:
May 16, 2018, 1:55:43 PM (3 months ago)
Author:
cameron
Message:

StreamExpand kernel and compiler - initial check-in

Location:
icGREP/icgrep-devel/icgrep
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.cpp

    r6019 r6045  
    343343}
    344344
     345   
     346llvm::Value * IDISA_AVX2_Builder::mvmd_shuffle(unsigned fw, llvm::Value * a, llvm::Value * shuffle_table) {
     347    if (mBitBlockWidth == 256 && fw > 32) {
     348        // Create a table for shuffling with smaller field widths.
     349        unsigned half_fw = fw/2;
     350        unsigned field_count = mBitBlockWidth/half_fw;
     351        // Build a ConstantVector of alternating 0 and 1 values.
     352        Constant * Idxs[field_count];
     353        for (unsigned int i = 0; i < field_count; i++) {
     354            Idxs[i] = getInt32(i & 1);
     355        }
     356        Constant * splat01 = ConstantVector::get({Idxs, field_count});
     357        Value * half_shuffle_table = simd_add(fw, simd_add(fw, shuffle_table, shuffle_table), splat01);
     358        return mvmd_shuffle(half_fw, a, half_shuffle_table);
     359    }
     360    if (mBitBlockWidth == 256 && fw == 32) {
     361        Value * shuf32Func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx2_permd);
     362        return CreateCall(shuf32Func, {fwCast(32, a), fwCast(32, shuffle_table)});
     363    }
     364    return IDISA_Builder::mvmd_shuffle(fw, a, shuffle_table);
     365}
     366
    345367llvm::Value * IDISA_AVX2_Builder::mvmd_compress(unsigned fw, llvm::Value * a, llvm::Value * select_mask) {
    346368    if (mBitBlockWidth == 256 && fw == 64) {
     
    356378        Type * v8xi1Ty = VectorType::get(getInt1Ty(), 8);
    357379        Constant * mask0000000Fsplaat = ConstantVector::getSplat(8, ConstantInt::get(getInt32Ty(), 0xF));
    358         Value * shuf32Func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx2_permd);
    359380        Value * PEXT_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pext_32);
    360381        Value * PDEP_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pdep_32);
     
    376397        }
    377398        Value * shuf = CreateAnd(CreateLShr(bdcst, ConstantVector::get({Shifts, 8})), mask0000000Fsplaat);
    378         Value * compress = CreateCall(shuf32Func, {a, shuf});
     399        Value * compress = mvmd_shuffle(32, a, shuf);
    379400        Value * field_mask = CreateTrunc(CreateSub(CreateShl(getInt32(1), field_count), getInt32(1)), getInt8Ty());
    380401        Value * result = CreateAnd(compress, CreateSExt(CreateBitCast(field_mask, v8xi1Ty), v8xi32Ty));
     
    512533}
    513534
     // AVX-512 override of mvmd_shuffle.  For 512-bit blocks with 32- or 64-bit fields,
     // and with 16-bit fields when hostCPUFeatures.hasAVX512BW is set, this emits a call
     // to the matching x86 vpermt2var intrinsic with an all-ones mask.  Every other
     // configuration is forwarded to IDISA_Builder::mvmd_shuffle.
     535llvm::Value * IDISA_AVX512F_Builder::mvmd_shuffle(unsigned fw, llvm::Value * a, llvm::Value * shuffle_table) {
     536    const unsigned fieldCount = mBitBlockWidth/fw;
     537    if (mBitBlockWidth == 512 && fw == 32) {
     538        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_vpermt2var_d_512);
                // One mask bit per field, all set.
     539        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
                // Only one data block is supplied; the second table operand is an undef value.
     540        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), UndefValue::get(fwVectorType(fw)), mask});
     541    }
     542    if (mBitBlockWidth == 512 && fw == 64) {
     543        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_vpermt2var_q_512);
     544        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     545        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), UndefValue::get(fwVectorType(fw)), mask});
     546    }
     547    if (mBitBlockWidth == 512 && fw == 16 && hostCPUFeatures.hasAVX512BW) {
                // The 16-bit case uses the maskz variant of the intrinsic, unlike the two cases above.
     548        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_hi_512);
     549        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     550        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), UndefValue::get(fwVectorType(fw)), mask});
     551    }
     552    return IDISA_Builder::mvmd_shuffle(fw, a, shuffle_table);
     553}
     554
     555llvm::Value * IDISA_AVX512F_Builder::mvmd_shuffle2(unsigned fw, Value * a, Value * b, llvm::Value * shuffle_table) {
     556    const unsigned fieldCount = mBitBlockWidth/fw;
     557    if (mBitBlockWidth == 512 && fw == 32) {
     558        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_vpermt2var_d_512);
     559        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     560        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), fwCast(fw, b), mask});
     561    }
     562    if (mBitBlockWidth == 512 && fw == 64) {
     563        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_vpermt2var_q_512);
     564        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     565        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), fwCast(fw, b), mask});
     566    }
     567    if (mBitBlockWidth == 512 && fw == 16 && hostCPUFeatures.hasAVX512BW) {
     568        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_hi_512);
     569        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     570        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), fwCast(fw, b), mask});
     571    }
     572    return IDISA_Builder::mvmd_shuffle(fw, a, shuffle_table);
     573}
    514574
    515575llvm::Value * IDISA_AVX512F_Builder::mvmd_compress(unsigned fw, llvm::Value * a, llvm::Value * select_mask) {
    516    
    517576    if (mBitBlockWidth == 512 && fw == 32) {
    518577        Value * compressFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_compress_d_512);
     
    554613    }
    555614}
     615
     616Value * IDISA_AVX512F_Builder:: mvmd_dslli(unsigned fw, llvm::Value * a, llvm::Value * b, unsigned shift) {
     617    if (shift == 0) return a;
     618    if (fw > 32) {
     619        return mvmd_dslli(32, a, b, shift * (fw/32));
     620    } else if (((shift % 2) == 0) && (fw < 32)) {
     621        return mvmd_dslli(2 * fw, a, b, shift / 2);
     622    }
     623    const unsigned field_count = mBitBlockWidth/fw;
     624    if ((fw == 32) || (hostCPUFeatures.hasAVX512BW && (fw == 16)))   {
     625        Type * fwTy = getIntNTy(fw);
     626        Value * permute_func = nullptr;
     627        if (fw == 32) permute_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_d_512);
     628        else permute_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_hi_512);
     629        Constant * indices[field_count];
     630        for (unsigned i = 0; i < field_count; i++) {
     631            indices[i] = ConstantInt::get(fwTy, i + field_count - shift);
     632        }
     633        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(field_count));
     634        Value * args[4] = {ConstantVector::get({indices, field_count}), fwCast(fw, a), fwCast(fw, b), mask};
     635        return bitCast(CreateCall(permute_func, args));
     636    } else {
     637        unsigned field32_shift = (shift * fw) / 32;
     638        unsigned bit_shift = (shift * fw) % 32;
     639        return simd_or(simd_slli(32, mvmd_slli(32, a, field32_shift), bit_shift),
     640                       simd_srli(32, mvmd_slli(32, a, field32_shift + 1), 32-bit_shift));
     641    }
     642}
     643
    556644llvm::Value * IDISA_AVX512F_Builder::simd_popcount(unsigned fw, llvm::Value * a) {
    557645     if (fw == 512) {
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.h

    r6017 r6045  
    5050    llvm::Value * mvmd_srl(unsigned fw, llvm::Value * a, llvm::Value * shift) override;
    5151    llvm::Value * mvmd_sll(unsigned fw, llvm::Value * a, llvm::Value * shift) override;
     52    llvm::Value * mvmd_shuffle(unsigned fw, llvm::Value * a, llvm::Value * shuffle_table) override;
    5253    llvm::Value * mvmd_compress(unsigned fw, llvm::Value * a, llvm::Value * select_mask) override;
    5354    llvm::Value * simd_pext(unsigned fw, llvm::Value * v, llvm::Value * extract_mask) override;
     
    7475    llvm::Value * simd_popcount(unsigned fw, llvm::Value * a) override;
    7576    llvm::Value * mvmd_slli(unsigned fw, llvm::Value * a, unsigned shift) override;
     77    llvm::Value * mvmd_dslli(unsigned fw, llvm::Value * a, llvm::Value * b, unsigned shift) override;
    7678    llvm::Value * hsimd_signmask(unsigned fw, llvm::Value * a) override;
     79    llvm::Value * mvmd_shuffle(unsigned fw, llvm::Value * a, llvm::Value * shuffle_table) override;
     80    llvm::Value * mvmd_shuffle2(unsigned fw, llvm::Value * a, llvm::Value * b, llvm::Value * shuffle_table) override;
    7781    llvm::Value * mvmd_compress(unsigned fw, llvm::Value * a, llvm::Value * select_mask) override;
    7882    llvm::Value * mvmd_srl(unsigned fw, llvm::Value * a, llvm::Value * shift) override;
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_builder.cpp

    r6007 r6045  
    180180    shift = CreateZExtOrTrunc(CreateMul(shift, ConstantInt::get(shift->getType(), fw)), intTy);
    181181    return CreateBitCast(CreateShl(value, shift), vecTy);
     182}
     183
     // Generic double-block shift by a run-time field count, expressed as a two-input
     // shuffle.  Field widths below 8 are rejected with a fatal error.  A shuffle table
     // is built whose entry i is (shift + i + field_count), and mvmd_shuffle2 is applied
     // with b as its first data operand and a as its second.
     // NOTE(review): the shift is added to the base index here, whereas the constant
     // variant IDISA_AVX512F_Builder::mvmd_dslli uses (i + field_count - shift) -
     // confirm the intended direction.
     184Value * IDISA_Builder::mvmd_dsll(unsigned fw, Value * a, Value * b, Value * shift) {
     185    if (fw < 8) report_fatal_error("Unsupported field width: mvmd_dsll " + std::to_string(fw));
     186    const auto field_count = mBitBlockWidth/fw;
     187    Type * fwTy = getIntNTy(fw);
     188   
            // Base indices field_count .. 2*field_count-1.
     189    Constant * Idxs[field_count];
     190    for (unsigned i = 0; i < field_count; i++) {
     191        Idxs[i] = ConstantInt::get(fwTy, i + field_count);
     192    }
     193    Value * shuffle = simd_add(fw, simd_fill(fw, shift), ConstantVector::get({Idxs, field_count}));
     194    return mvmd_shuffle2(fw, fwCast(fw, b), fwCast(fw, a), shuffle);
    182195}
    183196
     
    513526    return CreateShuffleVector(fwCast(fw, b), fwCast(fw, a), ConstantVector::get({Idxs, field_count}));
    514527}
     528
     // Base-class mvmd_shuffle: there is no generic implementation.  It always raises a
     // fatal error naming the requested field width; a and shuffle_table are not used.
     // IDISA_AVX2_Builder and IDISA_AVX512F_Builder override it.
     529Value * IDISA_Builder::mvmd_shuffle(unsigned fw, Value * a, Value * shuffle_table) {
     530    report_fatal_error("Unsupported field width: mvmd_shuffle " + std::to_string(fw));
     531}
     532   
     533Value * IDISA_Builder::mvmd_shuffle2(unsigned fw, Value * a, Value *b, Value * shuffle_table) {
     534    //  Use two shuffles, with selection by the bit value within the shuffle_table.
     535    const auto field_count = mBitBlockWidth/fw;
     536    Constant * selectorSplat = ConstantVector::getSplat(field_count, ConstantInt::get(getIntNTy(fw), 1<<field_count));
     537    Value * selectMask = simd_eq(fw, simd_and(shuffle_table, selectorSplat), selectorSplat);
     538    Value * negSelect = simd_not(selectMask);
     539    Value * tbl = simd_and(shuffle_table, negSelect);
     540    return simd_or(simd_and(mvmd_shuffle(fw, a, tbl), negSelect), simd_and(mvmd_shuffle(fw, b, tbl), selectMask));
     541}
     542   
    515543
    516544llvm::Value * IDISA_Builder::mvmd_compress(unsigned fw, llvm::Value * a, llvm::Value * select_mask) {
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_builder.h

    r6007 r6045  
    149149    virtual llvm::Value * mvmd_srli(unsigned fw, llvm::Value * a, unsigned shift);
    150150    virtual llvm::Value * mvmd_dslli(unsigned fw, llvm::Value * a, llvm::Value * b, unsigned shift);
     151    virtual llvm::Value * mvmd_dsll(unsigned fw, llvm::Value * a, llvm::Value * b, llvm::Value * shift);
     152    virtual llvm::Value * mvmd_shuffle(unsigned fw, llvm::Value * a, llvm::Value * shuffle_table);
     153    virtual llvm::Value * mvmd_shuffle2(unsigned fw, llvm::Value * a, llvm::Value *b, llvm::Value * shuffle_table);
    151154    virtual llvm::Value * mvmd_compress(unsigned fw, llvm::Value * a, llvm::Value * select_mask);
    152155
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r6041 r6045  
    77#include <llvm/Support/raw_ostream.h>
    88#include <toolchain/toolchain.h>
     9#include <toolchain/driver.h>
     10#include <toolchain/cpudriver.h>
     11#include <IR_Gen/idisa_target.h>
     12#include <llvm/IR/Module.h>
     13
    914
    1015using namespace llvm;
     
    144149    b->SetInsertPoint(finishedStrides);
    145150}
    146 
    147 }
     151   
     // Constructs a StreamExpandKernel.  The kernel name encodes fieldWidth and
     // streamCount.  Inputs: a principal fixed-rate "marker" stream set and a
     // streamCount-wide "source" stream set consumed at the popcount rate of "marker".
     // Output: a streamCount-wide fixed-rate "output" stream set.  One bit-block scalar
     // "pendingSourceBlock_<i>" is registered per stream.
     152StreamExpandKernel::StreamExpandKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
     153: MultiBlockKernel("streamExpand" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
     154                   {Binding{kb->getStreamSetTy(), "marker", FixedRate(), Principal()},
     155                       Binding{kb->getStreamSetTy(streamCount), "source", PopcountOf("marker")}},
     156                   {Binding{kb->getStreamSetTy(streamCount), "output", FixedRate()}},
     157                   {}, {}, {})
     158, mFieldWidth(fieldWidth)
     159, mStreamCount(streamCount) {
            // Per-stream kernel state; read at the start and written at the end of generateMultiBlockLogic.
     160    for (unsigned i = 0; i < streamCount; i++) {
     161        addScalar(kb->getBitBlockType(), "pendingSourceBlock_" + std::to_string(i));
     162    }
     163}
     164
     165void StreamExpandKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfBlocks) {
     166    const unsigned fw = mFieldWidth;
     167    Type * fwTy = b->getIntNTy(fw);
     168    Type * sizeTy = b->getSizeTy();
     169    const unsigned numFields = b->getBitBlockWidth()/fw;
     170   
     171    Constant * const ZERO = b->getSize(0);
     172    Constant * bwConst = ConstantInt::get(sizeTy, b->getBitBlockWidth());
     173    Constant * bw_sub1Const = ConstantInt::get(sizeTy, b->getBitBlockWidth() -1);
     174    Constant * fwConst = ConstantInt::get(sizeTy, fw);
     175    Constant * fw_sub1Const = ConstantInt::get(sizeTy, fw-1);
     176    Constant * fwSplat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw));
     177    Constant * fw_sub1Splat = ConstantVector::getSplat(numFields, ConstantInt::get(fwTy, fw-1));
     178   
     179    BasicBlock * entry = b->GetInsertBlock();
     180    BasicBlock * expandLoop = b->CreateBasicBlock("expandLoop");
     181    BasicBlock * expansionDone = b->CreateBasicBlock("expansionDone");
     182   
     183    Value * processedSourceItems = b->getProcessedItemCount("source");
     184    Value * sourceOffset = b->CreateURem(processedSourceItems, bwConst);
     185   
     186    std::vector<Value *> pendingData(mStreamCount);
     187    for (unsigned i = 0; i < mStreamCount; i++) {
     188        pendingData[i] = b->getScalarField("pendingSourceBlock_" + std::to_string(i));
     189    }
     190   
     191    b->CreateBr(expandLoop);
     192    // Main Loop
     193    b->SetInsertPoint(expandLoop);
     194    PHINode * blockNoPhi = b->CreatePHI(b->getSizeTy(), 2);
     195    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
     196    PHINode * pendingDataPhi[mStreamCount];
     197    blockNoPhi->addIncoming(ZERO, entry);
     198    pendingItemsPhi->addIncoming(sourceOffset, entry);
     199    for (unsigned i = 0; i < mStreamCount; i++) {
     200        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
     201        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
     202    }
     203    Value * deposit_mask = b->loadInputStreamBlock("marker", ZERO, blockNoPhi);
     204    // The source stream may not be positioned at a block boundary.  Partial data
     205    // has been saved in the kernel state, determine the next full block number
     206    // for loading source streams.
     207    Value * pendingBlockEnd = b->CreateAdd(pendingItemsPhi, bw_sub1Const);
     208    Value * srcBlockNo = b->CreateUDiv(pendingBlockEnd, bwConst);
     209   
     210    // Calculate the field values and offsets we need for assembling a
     211    // a full block of source bits.  Assembly will use the following operations.
     212    // A = b->simd_srli(fw, b->mvmd_dslli(fw, source, pending, field_offset_lo), bit_offset);
     213    // B = b->simd_slli(fw, b->mvmd_dslli(fw, source, pending, field_offset_hi), shift_fwd);
     214    // all_source_bits = simd_or(A, B);
     215    Value * pendingOffset = b->CreateURem(pendingBlockEnd, bwConst);
     216    Value * field_offset_lo =  b->simd_fill(fw, b->CreateUDiv(pendingOffset, fwConst));
     217    Value * bit_offset = b->simd_fill(fw, b->CreateURem(pendingOffset, fwConst));
     218   
     219    // Carefully avoid a shift by the full fieldwith (which gives a poison value).
     220    // field_offset_lo + 1 unless the bit_offset is 0, in which case it is just field_offset_lo.
     221    Value * field_offset_hi =  b->simd_fill(fw, b->CreateUDiv(b->CreateAdd(pendingOffset, fw_sub1Const), fwConst));
     222    // fw - bit_offset, unless bit_offset is 0, in which case, the shift_fwd is 0.
     223    Value * shift_fwd = b->CreateURem(b->CreateSub(fwSplat, bit_offset), fwSplat);
     224   
     225    // Once all source bits are assembled, they need to be distributed to the
     226    // output fields in accord with the popcounts of the deposit mask fields.
     227    // The bits for each output field will typically come from (at most) two
     228    // source fields, with offsets.  Calculate the field numbers and offsets.
     229   
     230    Value * fieldPopCounts = b->simd_popcount(fw, deposit_mask);
     231    // For each field determine the (partial) sum popcount of all fields prior to
     232    // the current field.
     233    Value * partialSum = fieldPopCounts;
     234    for (unsigned i = 1; i < numFields; i *= 2) {
     235        partialSum = b->simd_add(fw, partialSum, b->mvmd_slli(fw, partialSum, i));
     236    }
     237    Value * blockPopCount = b->CreateZExtOrTrunc(b->CreateExtractElement(partialSum, numFields-1), sizeTy);
     238    partialSum = b->mvmd_slli(fw, partialSum, 1);
     239   
     240    Value * source_field_lo = b->CreateUDiv(partialSum, fwSplat);
     241    Value * source_field_hi = b->CreateUDiv(b->CreateAdd(partialSum, fw_sub1Splat), fwSplat);
     242    Value * source_shift_lo = b->CreateAnd(partialSum, fw_sub1Splat);  // parallel URem
     243    Value * source_shift_hi = b->CreateAnd(b->CreateSub(fwSplat, source_shift_lo), fw_sub1Splat);
     244   
     245    // Now load and process source streams.
     246    for (unsigned i = 0; i < mStreamCount; i++) {
     247        Value * source = b->loadInputStreamBlock("source", b->getInt32(i), srcBlockNo);
     248        Value * A = b->simd_srlv(fw, b->mvmd_dsll(fw, source, pendingDataPhi[i], field_offset_lo), bit_offset);
     249        Value * B = b->simd_sllv(fw, b->mvmd_dsll(fw, source, pendingDataPhi[i], field_offset_hi), shift_fwd);
     250        Value * full_source_block = b->simd_or(A, B);
     251       
     252        Value * C = b->simd_srlv(fw, b->mvmd_shuffle(fw, full_source_block, source_field_lo), source_shift_lo);
     253        Value * D = b->simd_sllv(fw, b->mvmd_shuffle(fw, full_source_block, source_field_hi), source_shift_hi);
     254        Value * output = b->bitCast(b->simd_or(C, D));
     255        b->storeOutputStreamBlock("output", b->getInt32(i), blockNoPhi, output);
     256        pendingDataPhi[i]->addIncoming(source, expandLoop);
     257    }
     258    //
     259    // Update loop control Phis for the next iteration.
     260    //
     261    Value * nextBlk = b->CreateAdd(blockNoPhi, b->getSize(1));
     262    blockNoPhi->addIncoming(nextBlk, expandLoop);
     263    Value * newPending = b->CreateAdd(pendingItemsPhi, blockPopCount);
     264    pendingItemsPhi->addIncoming(newPending, expandLoop);
     265    //
     266    // Now continue the loop if there are more blocks to process.
     267    Value * moreToDo = b->CreateICmpNE(nextBlk, numOfBlocks);
     268    b->CreateCondBr(moreToDo, expandLoop, expansionDone);
     269   
     270    b->SetInsertPoint(expansionDone);
     271    // Update kernel state.
     272    for (unsigned i = 0; i < mStreamCount; i++) {
     273        b->setScalarField("pendingSourceBlock_" + std::to_string(i), b->bitCast(pendingDataPhi[i]));
     274    }
     275    b->getModule()->dump();
     276}
     277
     // Constructs a FieldDepositKernel.  The kernel name encodes fieldWidth and
     // streamCount.  Inputs: "depositMask" and a streamCount-wide "inputStreamSet".
     // Output: a streamCount-wide "outputStreamSet".  No scalars are registered.
     278FieldDepositKernel::FieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
     279: MultiBlockKernel("FieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
     280                   {Binding{kb->getStreamSetTy(), "depositMask"}, Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"}},
     281                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
     282                   {}, {}, {})
     283, mFieldWidth(fieldWidth)
     284, mStreamCount(streamCount) {
     285}
     286   
     // Emits a loop over block offsets 0 .. numOfBlocks-1.  Each iteration loads one
     // "depositMask" block and, for every stream j, stores
     // simd_pdep(mFieldWidth, input, depositMask) to "outputStreamSet".
     // The exit test runs after the body, so the body is emitted to run at least once.
     287void FieldDepositKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
     288    BasicBlock * entry = kb->GetInsertBlock();
     289    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
     290    BasicBlock * done = kb->CreateBasicBlock("done");
     291    Constant * const ZERO = kb->getSize(0);
     292    kb->CreateBr(processBlock);
     293    kb->SetInsertPoint(processBlock);
            // Loop counter: 0 on entry, nextBlk on the back edge.
     294    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
     295    blockOffsetPhi->addIncoming(ZERO, entry);
     296    Value * depositMask = kb->loadInputStreamBlock("depositMask", ZERO, blockOffsetPhi);
     297    for (unsigned j = 0; j < mStreamCount; ++j) {
     298        Value * input = kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
     299        Value * output = kb->simd_pdep(mFieldWidth, input, depositMask);
     300        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, output);
     301    }
     302    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
     303    blockOffsetPhi->addIncoming(nextBlk, processBlock);
     304    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
     305    kb->CreateCondBr(moreToDo, processBlock, done);
     306    kb->SetInsertPoint(done);
     307}
     308
     309PDEPFieldDepositKernel::PDEPFieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
     310: MultiBlockKernel("PDEPFieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
     311                   {Binding{kb->getStreamSetTy(), "depositMask"}, Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"}},
     312                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
     313                   {}, {}, {})
     314, mPDEPWidth(fieldWidth)
     315, mStreamCount(streamCount) {
     316    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PDEP width for PDEPFieldCompressKernel");
     317}
     318
     319void PDEPFieldDepositKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
     320    Type * fieldTy = kb->getIntNTy(mPDEPWidth);
     321    Type * fieldPtrTy = PointerType::get(fieldTy, 0);
     322    Constant * PDEP_func = nullptr;
     323    if (mPDEPWidth == 64) {
     324        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_64);
     325    } else if (mPDEPWidth == 32) {
     326        PDEP_func = Intrinsic::getDeclaration(kb->getModule(), Intrinsic::x86_bmi_pdep_32);
     327    }
     328    BasicBlock * entry = kb->GetInsertBlock();
     329    BasicBlock * processBlock = kb->CreateBasicBlock("processBlock");
     330    BasicBlock * done = kb->CreateBasicBlock("done");
     331    Constant * const ZERO = kb->getSize(0);
     332    const unsigned fieldsPerBlock = kb->getBitBlockWidth()/mPDEPWidth;
     333    kb->CreateBr(processBlock);
     334    kb->SetInsertPoint(processBlock);
     335    PHINode * blockOffsetPhi = kb->CreatePHI(kb->getSizeTy(), 2);
     336    blockOffsetPhi->addIncoming(ZERO, entry);
     337    std::vector<Value *> mask(fieldsPerBlock);
     338    Value * extractionMaskPtr = kb->getInputStreamBlockPtr("depositMask", ZERO, blockOffsetPhi);
     339    extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
     340    for (unsigned j = 0; j < mStreamCount; ++j) {
     341        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
     342        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
     343        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
     344        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
     345        for (unsigned i = 0; i < fieldsPerBlock; i++) {
     346            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
     347            Value * compressed = kb->CreateCall(PDEP_func, {field, mask[i]});
     348            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
     349        }
     350    }
     351    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
     352    blockOffsetPhi->addIncoming(nextBlk, processBlock);
     353    Value * moreToDo = kb->CreateICmpNE(nextBlk, numOfBlocks);
     354    kb->CreateCondBr(moreToDo, processBlock, done);
     355    kb->SetInsertPoint(done);
     356}
     357
     // Wires a two-stage deposit pipeline into mDriver.
     //   1. A StreamExpandKernel reads (depositMask, inputs) and writes a new circular
     //      buffer of mBufferBlocks blocks with one stream per stream of ssType.
     //   2. A deposit kernel reads (depositMask, that buffer) and writes outputs:
     //      PDEPFieldDepositKernel when AVX2_available() is true, FieldDepositKernel otherwise.
     // Raises a fatal error if mBufferBlocks is 0 or if the stream field width of
     // ssType is not 1.
     358void StreamDepositCompiler::makeCall(parabix::StreamSetBuffer * depositMask, parabix::StreamSetBuffer * inputs, parabix::StreamSetBuffer * outputs) {
     359    if (mBufferBlocks == 0) {
     360        llvm::report_fatal_error("StreamDepositCompiler needs a non-zero bufferBlocks parameter (for now).");
     361    }
     362    auto & iBuilder = mDriver.getBuilder();
            // N: number of streams in the stream set type given at construction.
     363    unsigned N = IDISA::getNumOfStreams(ssType);
     364    if (IDISA::getStreamFieldWidth(ssType) != 1) {
     365        llvm::report_fatal_error("StreamDepositCompiler only compresses bit streams (for now)");
     366    }
     367    parabix::StreamSetBuffer * expandedStreams = mDriver.addBuffer<parabix::CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(N), mBufferBlocks);
     368    Kernel * streamK = mDriver.addKernelInstance<StreamExpandKernel>(iBuilder, mFieldWidth, N);
     369    mDriver.makeKernelCall(streamK, {depositMask, inputs}, {expandedStreams});
     370
     371    Kernel * depositK = nullptr;
     372    if (AVX2_available()) {
     373        depositK = mDriver.addKernelInstance<PDEPFieldDepositKernel>(iBuilder, mFieldWidth, N);
     374    } else {
     375        depositK = mDriver.addKernelInstance<FieldDepositKernel>(iBuilder, mFieldWidth, N);
     376    }
     377    mDriver.makeKernelCall(depositK, {depositMask, expandedStreams}, {outputs});
     378}
     379
     380}
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.h

    r5870 r6045  
    99#include <llvm/IR/Value.h>
    1010#include <string>
     11#include <toolchain/driver.h>
    1112
    1213/*
     
    5051};   
    5152
     // Multi-block kernel parameterised by a field width (fw) and a stream count.
     // Reports itself as cachable and as having no signature.
     53class StreamExpandKernel final : public MultiBlockKernel {
     54public:
     55    StreamExpandKernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount);
     56    bool isCachable() const override { return true; }
     57    bool hasSignature() const override { return false; }
     58protected:
     59    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfBlocks) override;
     60private:
           // Constructor arguments, fixed for the lifetime of the kernel.
     61    const unsigned mFieldWidth;
     62    const unsigned mStreamCount;
     63};
     64
     // Multi-block kernel parameterised by a field width (fw) and a stream count.
     // Reports itself as cachable and as having no signature.
     65class FieldDepositKernel final : public MultiBlockKernel {
     66public:
     67    FieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount);
     68    bool isCachable() const override { return true; }
     69    bool hasSignature() const override { return false; }
     70protected:
           // NOTE(review): the parameter is named numOfStrides here but numOfBlocks in the definition.
     71    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfStrides) override;
     72private:
     73    const unsigned mFieldWidth;
     74    const unsigned mStreamCount;
     75};
     76
     // Multi-block kernel parameterised by a PDEP field width (fw) and a stream count.
     // Reports itself as cachable and as having no signature.
     77class PDEPFieldDepositKernel final : public MultiBlockKernel {
     78public:
     79    PDEPFieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & b, unsigned fw, unsigned streamCount);
     80    bool isCachable() const override { return true; }
     81    bool hasSignature() const override { return false; }
     82protected:
           // NOTE(review): the parameter is named numOfStrides here but numOfBlocks in the definition.
     83    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & kb, llvm::Value * const numOfStrides) override;
     84private:
           // The field width passed to the constructor as fw.
     85    const unsigned mPDEPWidth;
     86    const unsigned mStreamCount;
     87};
     88
     // Helper that holds a Driver reference, a stream set type, a buffer size in blocks
     // and a deposit field width, and exposes makeCall to build a deposit pipeline from
     // a mask, an input stream set and an output stream set.
     89class StreamDepositCompiler {
     90public:
           // bufferBlocks defaults to 0; the deposit field width starts at 64.
     91    StreamDepositCompiler(Driver & driver, llvm::Type * streamSetType, unsigned bufferBlocks = 0) :
     92    mDriver(driver), ssType(streamSetType), mBufferBlocks(bufferBlocks), mFieldWidth(64) {}
           // Overrides the default deposit field width.
     93    void setDepositFieldWidth(unsigned fw) {mFieldWidth = fw;}
     94    void makeCall(parabix::StreamSetBuffer * mask, parabix::StreamSetBuffer * inputs, parabix::StreamSetBuffer * outputs);
     95private:
     96    Driver & mDriver;
     97    llvm::Type * ssType;
     98    unsigned mBufferBlocks;
     99    unsigned mFieldWidth;
     100};
     101
    52102}
    53    
     103
    54104#endif
Note: See TracChangeset for help on using the changeset viewer.