Changeset 6113


Ignore:
Timestamp:
Jun 25, 2018, 6:00:30 PM (3 months ago)
Author:
cameron
Message:

hsimd_signmask support for various BlockSizes?

Location:
icGREP/icgrep-devel/icgrep/IR_Gen
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.cpp

    r6108 r6113  
    4747        }
    4848    }
    49     // Otherwise use default SSE logic.
    50     return IDISA_SSE_Builder::hsimd_signmask(fw, a);
     49    // Otherwise use default SSE2 logic.
     50    return IDISA_SSE2_Builder::hsimd_signmask(fw, a);
    5151}
    5252
     
    7474    }
    7575    // Otherwise use default SSE logic.
    76     return IDISA_SSE_Builder::hsimd_packh(fw, a, b);
     76    return IDISA_SSE2_Builder::hsimd_packh(fw, a, b);
    7777}
    7878
     
    9696    }
    9797    // Otherwise use default SSE logic.
    98     return IDISA_SSE_Builder::hsimd_packl(fw, a, b);
     98    return IDISA_SSE2_Builder::hsimd_packl(fw, a, b);
    9999}
    100100
     
    126126    }
    127127    // Otherwise use default SSE logic.
    128     return IDISA_SSE_Builder::esimd_mergeh(fw, a, b);
     128    return IDISA_SSE2_Builder::esimd_mergeh(fw, a, b);
    129129}
    130130
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_sse_builder.cpp

    r6110 r6113  
    1818
    1919Value * IDISA_SSE2_Builder::hsimd_packh(unsigned fw, Value * a, Value * b) {   
    20     if ((fw == 16) && (mBitBlockWidth == 128)) {
     20    if ((fw == 16) && (getVectorBitWidth(a) == SSE_width)) {
    2121        Value * packuswb_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_sse2_packuswb_128);
    2222        return CreateCall(packuswb_func, {simd_srli(16, a, 8), simd_srli(16, b, 8)});
     
    2727
    2828Value * IDISA_SSE2_Builder::hsimd_packl(unsigned fw, Value * a, Value * b) {
    29     if ((fw == 16) && (mBitBlockWidth == 128)) {
     29    if ((fw == 16) && (getVectorBitWidth(a) == SSE_width)) {
    3030        Value * packuswb_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_sse2_packuswb_128);
    3131        Value * mask = simd_lomask(16);
     
    3838Value * IDISA_SSE2_Builder::hsimd_signmask(unsigned fw, Value * a) {
    3939    // SSE2 special case using Intrinsic::x86_sse2_movmsk_pd (fw=32 only)
    40     if (mBitBlockWidth == 128) {
     40    if (getVectorBitWidth(a) == SSE_width) {
    4141        if (fw == 64) {
    4242            Value * signmask_f64func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_sse2_movmsk_pd);
     
    5050        }
    5151    }
    52     const auto fieldCount = mBitBlockWidth / fw;
    53     if ((fieldCount > 4) && (fieldCount <= 16)) {
    54         Value * pmovmskb_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_sse2_pmovmskb_128);
    55         int fieldBytes = fw / 8;
    56         int hiByte = fieldBytes - 1;
    57         Constant * Idxs[16];
    58         for (unsigned i = 0; i < fieldCount; i++) {
    59             Idxs[i] = getInt32(fieldBytes * i + hiByte);
    60         }
    61         for (unsigned i = fieldCount; i < 16; i++) {
    62             Idxs[i] = getInt32(mBitBlockWidth / 8);
    63         }
    64         Value * packh = CreateShuffleVector(fwCast(8, a), fwCast(8, allZeroes()), ConstantVector::get({Idxs, 16}));
    65         return CreateCall(pmovmskb_func, packh);
    66     }
    6752    // Otherwise use default SSE logic.
    6853    return IDISA_SSE_Builder::hsimd_signmask(fw, a);
     
    7055
    7156Value * IDISA_SSE_Builder::hsimd_signmask(const unsigned fw, Value * a) {
     57    const unsigned SSE_blocks = getVectorBitWidth(a)/SSE_width;
     58    if (SSE_blocks > 1) {
     59        Value * a_lo = CreateHalfVectorLow(a);
     60        Value * a_hi = CreateHalfVectorHigh(a);
     61        if ((fw == 8 * SSE_blocks) || (fw >= 32 * SSE_blocks)) {
     62            return hsimd_signmask(fw/2, hsimd_packh(fw, a_hi, a_lo));
     63        }
     64        unsigned maskWidth = getVectorBitWidth(a)/fw;
     65        Type * maskTy = getIntNTy(maskWidth);
     66        Value * mask_lo = CreateZExtOrTrunc(hsimd_signmask(fw, a_lo), maskTy);
     67        Value * mask_hi = CreateZExtOrTrunc(hsimd_signmask(fw, a_hi), maskTy);
     68        return CreateOr(CreateShl(mask_hi, maskWidth/2), mask_lo);
     69    }
    7270    // SSE special cases using Intrinsic::x86_sse_movmsk_ps (fw=32 only)
    7371    if (fw == 32) {
     
    7573        Type * bitBlock_f32type = VectorType::get(getFloatTy(), mBitBlockWidth/32);
    7674        Value * a_as_ps = CreateBitCast(a, bitBlock_f32type);
    77         if (mBitBlockWidth == 128) {
     75        if (getVectorBitWidth(a) == SSE_width) {
    7876            return CreateCall(signmask_f32func, a_as_ps);
    7977        }
    80     } else if ((fw == 64) && (mBitBlockWidth == 256)) {
    81         Type * bitBlock_f32type = VectorType::get(getFloatTy(), mBitBlockWidth/32);
    82         Value * a_as_ps = CreateBitCast(a, bitBlock_f32type);
    83         Constant * Idxs[4];
    84         for (unsigned i = 0; i < 4; i++) {
    85             Idxs[i] = getInt32(2 * i + 1);
    86         }
    87         Value * packh = CreateShuffleVector(a_as_ps, UndefValue::get(bitBlock_f32type), ConstantVector::get({Idxs, 4}));
    88         Type * halfBlock_f32type = VectorType::get(getFloatTy(), mBitBlockWidth/64);
    89         Value * pack_as_ps = CreateBitCast(packh, halfBlock_f32type);
    90         Value * signmask_f32func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_sse_movmsk_ps);
    91         Value * mask = CreateCall(signmask_f32func, pack_as_ps);
    92         return mask;
    9378    }
    9479    // Otherwise use default logic.
Note: See TracChangeset for help on using the changeset viewer.