Changeset 6056


Ignore:
Timestamp:
May 23, 2018, 12:21:40 PM (5 months ago)
Author:
cameron
Message:

mvmd_dslli, mvmd_shuffle fixes for AVX-512

Location:
icGREP/icgrep-devel/icgrep
Files:
2 edited

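Legend:

Unmodified (old and new line numbers both shown, run together, e.g. "540540")
Added (only the new line number shown)
Removed (only the old line number shown)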
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.cpp

    r6055 r6056  
    539539
    540540llvm::Value * IDISA_AVX512F_Builder::mvmd_shuffle(unsigned fw, llvm::Value * a, llvm::Value * shuffle_table) {
     541    Type * fwTy = getIntNTy(fw);
    541542    const unsigned fieldCount = mBitBlockWidth/fw;
    542543    if (mBitBlockWidth == 512 && fw == 32) {
    543544        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_vpermt2var_d_512);
    544         Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
    545         return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), UndefValue::get(fwVectorType(fw)), mask});
     545        Constant * mask = Constant::getIntegerValue(fwTy, APInt::getLowBitsSet(fw, fieldCount));
     546        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), fwCast(fw, a), mask});
    546547    }
    547548    if (mBitBlockWidth == 512 && fw == 64) {
    548549        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_vpermt2var_q_512);
    549         Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
    550         return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), UndefValue::get(fwVectorType(fw)), mask});
     550        Constant * mask = Constant::getIntegerValue(fwTy, APInt::getLowBitsSet(fw, fieldCount));
     551        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), fwCast(fw, a), mask});
    551552    }
    552553    if (mBitBlockWidth == 512 && fw == 16 && hostCPUFeatures.hasAVX512BW) {
    553554        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_hi_512);
    554         Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
    555         return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), UndefValue::get(fwVectorType(fw)), mask});
     555        Constant * mask = Constant::getIntegerValue(fwTy, APInt::getLowBitsSet(fw, fieldCount));
     556        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), fwCast(fw, a), mask});
    556557    }
    557558    return IDISA_Builder::mvmd_shuffle(fw, a, shuffle_table);
     
    559560
    560561llvm::Value * IDISA_AVX512F_Builder::mvmd_shuffle2(unsigned fw, Value * a, Value * b, llvm::Value * shuffle_table) {
     562    Type * fwTy = getIntNTy(fw);
    561563    const unsigned fieldCount = mBitBlockWidth/fw;
    562564    if (mBitBlockWidth == 512 && fw == 32) {
    563565        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_vpermt2var_d_512);
    564         Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     566        Constant * mask = Constant::getIntegerValue(fwTy, APInt::getLowBitsSet(fw, fieldCount));
    565567        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), fwCast(fw, b), mask});
    566568    }
    567569    if (mBitBlockWidth == 512 && fw == 64) {
    568570        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_vpermt2var_q_512);
    569         Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     571        Constant * mask = Constant::getIntegerValue(fwTy, APInt::getLowBitsSet(fw, fieldCount));
    570572        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), fwCast(fw, b), mask});
    571573    }
    572574    if (mBitBlockWidth == 512 && fw == 16 && hostCPUFeatures.hasAVX512BW) {
    573575        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_hi_512);
    574         Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     576        Constant * mask = Constant::getIntegerValue(fwTy, APInt::getLowBitsSet(fw, fieldCount));
    575577        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), fwCast(fw, b), mask});
    576578    }
     
    597599        return mvmd_slli(2 * fw, a, shift / 2);
    598600    }
    599     const unsigned field_count = mBitBlockWidth/fw;
     601    const unsigned fieldCount = mBitBlockWidth/fw;
    600602    if ((fw == 32) || (hostCPUFeatures.hasAVX512BW && (fw == 16)))   {
    601603        // Mask with 1 bit per field indicating which fields are not zeroed out.
    602604        Type * fwTy = getIntNTy(fw);
    603         Constant * fieldMask = ConstantInt::get(getIntNTy(field_count), (1 << field_count) - (1 << shift));
     605        Constant * fieldMask = ConstantInt::get(getIntNTy(fieldCount), (1 << fieldCount) - (1 << shift));
    604606        Value * permute_func = nullptr;
    605607        if (fw == 32) permute_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_d_512);
    606608        else permute_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_hi_512);
    607         Constant * indices[field_count];
    608         for (unsigned i = 0; i < field_count; i++) {
     609        Constant * indices[fieldCount];
     610        for (unsigned i = 0; i < fieldCount; i++) {
    609611            indices[i] = i < shift ? UndefValue::get(fwTy) : ConstantInt::get(fwTy, i - shift);
    610612        }
    611         Value * args[4] = {ConstantVector::get({indices, field_count}), fwCast(fw, a), UndefValue::get(fwVectorType(fw)), fieldMask};
     613        Value * args[4] = {ConstantVector::get({indices, fieldCount}), fwCast(fw, a), UndefValue::get(fwVectorType(fw)), fieldMask};
    612614        return bitCast(CreateCall(permute_func, args));
    613615    } else {
     
    626628        return mvmd_dslli(2 * fw, a, b, shift / 2);
    627629    }
    628     const unsigned field_count = mBitBlockWidth/fw;
     630    const unsigned fieldCount = mBitBlockWidth/fw;
    629631    if ((fw == 32) || (hostCPUFeatures.hasAVX512BW && (fw == 16)))   {
    630632        Type * fwTy = getIntNTy(fw);
     
    632634        if (fw == 32) permute_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_d_512);
    633635        else permute_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_hi_512);
    634         Constant * indices[field_count];
    635         for (unsigned i = 0; i < field_count; i++) {
    636             indices[i] = ConstantInt::get(fwTy, i + field_count - shift);
    637         }
    638         Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(field_count));
    639         Value * args[4] = {ConstantVector::get({indices, field_count}), fwCast(fw, a), fwCast(fw, b), mask};
     636        Constant * indices[fieldCount];
     637        for (unsigned i = 0; i < fieldCount; i++) {
     638            indices[i] = ConstantInt::get(fwTy, i + fieldCount - shift);
     639        }
     640        Constant * mask = Constant::getIntegerValue(fwTy, APInt::getLowBitsSet(fw, fieldCount));
     641        Value * args[4] = {ConstantVector::get({indices, fieldCount}), fwCast(fw, b), fwCast(fw, a), mask};
    640642        return bitCast(CreateCall(permute_func, args));
    641643    } else {
    642644        unsigned field32_shift = (shift * fw) / 32;
    643645        unsigned bit_shift = (shift * fw) % 32;
    644         return simd_or(simd_slli(32, mvmd_slli(32, a, field32_shift), bit_shift),
    645                        simd_srli(32, mvmd_slli(32, a, field32_shift + 1), 32-bit_shift));
     646        return simd_or(simd_slli(32, mvmd_dslli(32, a, b, field32_shift), bit_shift),
     647                       simd_srli(32, mvmd_dslli(32, a, b, field32_shift + 1), 32-bit_shift));
    646648    }
    647649}
  • icGREP/icgrep-devel/icgrep/idisa_test.cpp

    r6054 r6056  
    4040static cl::opt<bool> QuietMode("q", cl::desc("Suppress output, set the return code only."), cl::cat(testFlags));
    4141static cl::opt<int> ShiftLimit("ShiftLimit", cl::desc("Upper limit for the shift operand (2nd operand) of sllv, srlv, srav."), cl::init(0));
     42static cl::opt<int> Immediate("i", cl::desc("Immediate value for mvmd_dslli"), cl::init(1));
    4243
    4344class ShiftLimitKernel : public kernel::BlockOrientedKernel {
     
    7172class IdisaBinaryOpTestKernel : public kernel::MultiBlockKernel {
    7273public:
    73     IdisaBinaryOpTestKernel(const std::unique_ptr<kernel::KernelBuilder> & b, std::string idisa_op, unsigned fw);
     74    IdisaBinaryOpTestKernel(const std::unique_ptr<kernel::KernelBuilder> & b, std::string idisa_op, unsigned fw, unsigned imm=0);
    7475    bool isCachable() const override { return true; }
    7576    bool hasSignature() const override { return false; }
     
    7980    const std::string mIdisaOperation;
    8081    const unsigned mTestFw;
     82    const unsigned mImmediateShift;
    8183};
    8284
    83 IdisaBinaryOpTestKernel::IdisaBinaryOpTestKernel(const std::unique_ptr<kernel::KernelBuilder> & b, std::string idisa_op, unsigned fw)
     85IdisaBinaryOpTestKernel::IdisaBinaryOpTestKernel(const std::unique_ptr<kernel::KernelBuilder> & b, std::string idisa_op, unsigned fw, unsigned imm)
    8486: kernel::MultiBlockKernel(idisa_op + std::to_string(fw) + "_test",
    8587     {kernel::Binding{b->getStreamSetTy(1, 1), "operand1"}, kernel::Binding{b->getStreamSetTy(1, 1), "operand2"}},
    8688     {kernel::Binding{b->getStreamSetTy(1, 1), "result"}},
    8789     {}, {}, {}),
    88 mIdisaOperation(idisa_op), mTestFw(fw) {}
     90mIdisaOperation(idisa_op), mTestFw(fw), mImmediateShift(imm) {}
    8991
    9092void IdisaBinaryOpTestKernel::generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & kb, llvm::Value * const numOfBlocks) {
     
    148150    } else if (mIdisaOperation == "mvmd_compress") {
    149151        result = kb->mvmd_compress(mTestFw, operand1, operand2);
     152    } else if (mIdisaOperation == "mvmd_dslli") {
     153        result = kb->mvmd_dslli(mTestFw, operand1, operand2, mImmediateShift);
    150154    } else {
    151155        llvm::report_fatal_error("Binary operation " + mIdisaOperation + " is unknown to the IdisaBinaryOpTestKernel kernel.");
     
    161165class IdisaBinaryOpCheckKernel : public kernel::BlockOrientedKernel {
    162166public:
    163     IdisaBinaryOpCheckKernel(const std::unique_ptr<kernel::KernelBuilder> & b, std::string idisa_op, unsigned fw);
     167    IdisaBinaryOpCheckKernel(const std::unique_ptr<kernel::KernelBuilder> & b, std::string idisa_op, unsigned fw, unsigned imm=0);
    164168    bool isCachable() const override { return true; }
    165169    bool hasSignature() const override { return false; }
     
    169173    const std::string mIdisaOperation;
    170174    const unsigned mTestFw;
     175    const unsigned mImmediateShift;
    171176};
    172177
    173 IdisaBinaryOpCheckKernel::IdisaBinaryOpCheckKernel(const std::unique_ptr<kernel::KernelBuilder> & b, std::string idisa_op, unsigned fw)
     178IdisaBinaryOpCheckKernel::IdisaBinaryOpCheckKernel(const std::unique_ptr<kernel::KernelBuilder> & b, std::string idisa_op, unsigned fw, unsigned imm)
    174179: kernel::BlockOrientedKernel(idisa_op + std::to_string(fw) + "_check" + std::to_string(QuietMode),
    175180                           {kernel::Binding{b->getStreamSetTy(1, 1), "operand1"},
     
    178183                           {kernel::Binding{b->getStreamSetTy(1, 1), "expected_result"}},
    179184                           {}, {kernel::Binding{b->getSizeTy(), "totalFailures"}}, {}),
    180 mIdisaOperation(idisa_op), mTestFw(fw) {}
     185mIdisaOperation(idisa_op), mTestFw(fw), mImmediateShift(imm) {}
    181186
    182187void IdisaBinaryOpCheckKernel::generateDoBlockMethod(const std::unique_ptr<kernel::KernelBuilder> & kb) {
     
    191196    Value * expectedBlock = kb->allZeroes();
    192197    if (mIdisaOperation == "mvmd_shuffle") {
    193         for (unsigned i = 0; i < mTestFw; i++) {
    194             Value * iConst = ConstantInt::get(kb->getInt32Ty(), i);
    195             Value * idx = kb->CreateExtractElement(operand2Block, iConst);
    196             expectedBlock = kb->CreateInsertElement(expectedBlock, kb->CreateExtractElement(operand1Block, idx), iConst);
     198        for (unsigned i = 0; i < fieldCount; i++) {
     199            Value * idx = kb->CreateURem(kb->mvmd_extract(mTestFw, operand2Block, i), ConstantInt::get(fwTy, fieldCount));
     200            Value * elt = kb->CreateExtractElement(kb->fwCast(mTestFw, operand1Block), kb->CreateZExtOrTrunc(idx, kb->getInt32Ty()));
     201            expectedBlock = kb->mvmd_insert(mTestFw, expectedBlock, elt, i);
     202        }
     203    } else if (mIdisaOperation == "mvmd_dslli") {
     204        for (unsigned i = 0; i < fieldCount; i++) {
     205            Value * elt = nullptr;
     206            if (i < mImmediateShift) elt = kb->mvmd_extract(mTestFw, operand2Block, fieldCount - mImmediateShift + i);
     207            else elt = kb->mvmd_extract(mTestFw, operand1Block, i - mImmediateShift);
     208            expectedBlock = kb->mvmd_insert(mTestFw, expectedBlock, elt, i);
    197209        }
    198210    } else {
     
    383395
    384396    StreamSetBuffer * ResultBitStream = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), bufferSize);
    385     kernel::Kernel * testK = pxDriver.addKernelInstance<IdisaBinaryOpTestKernel>(idb, TestOperation, TestFieldWidth);
     397    kernel::Kernel * testK = pxDriver.addKernelInstance<IdisaBinaryOpTestKernel>(idb, TestOperation, TestFieldWidth, Immediate);
    386398    pxDriver.makeKernelCall(testK, {Operand1BitStream, Operand2BitStream}, {ResultBitStream});
    387399   
    388400    StreamSetBuffer * ExpectedResultBitStream = pxDriver.addBuffer<StaticBuffer>(idb, idb->getStreamSetTy(1, 1), bufferSize);
    389     kernel::Kernel * checkK = pxDriver.addKernelInstance<IdisaBinaryOpCheckKernel>(idb, TestOperation, TestFieldWidth);
     401    kernel::Kernel * checkK = pxDriver.addKernelInstance<IdisaBinaryOpCheckKernel>(idb, TestOperation, TestFieldWidth, Immediate);
    390402    pxDriver.makeKernelCall(checkK, {Operand1BitStream, Operand2BitStream, ResultBitStream}, {ExpectedResultBitStream});
    391403   
Note: See TracChangeset for help on using the changeset viewer.