Changeset 6101


Ignore:
Timestamp:
Jun 18, 2018, 3:00:59 PM (5 months ago)
Author:
cameron
Message:

esimd_mergeh/esimd_mergel for 8-bit fields (fw == 8) on AVX-512; improves p2s performance

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.cpp

    r6100 r6101  
    769769        return simd_or(simd_if(1, simd_himask(16), high_bits, low_bits), simd_or(lo_move_back, hi_move_fwd));
    770770    }
     771    if ((fw == 32) || (hostCPUFeatures.hasAVX512BW && (fw == 16)))   {
     772        const unsigned fieldCount = mBitBlockWidth/fw;
     773        Value * permute_func = nullptr;
     774        if (fw == 32) permute_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_d_512);
     775        else permute_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_hi_512);
     776        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     777        Constant * Idxs[fieldCount];
     778        for (unsigned i = 0; i < fieldCount / 2; i++) {
     779            Idxs[2 * i] = getInt32(i + fieldCount / 2); // selects elements from first reg.
     780            Idxs[2 * i + 1] = getInt32(i + fieldCount / 2 + fieldCount); // selects elements from second reg.
     781        }
     782        Value * args[4] = {ConstantVector::get({Idxs, fieldCount}), fwCast(fw, b), fwCast(fw, a), mask};
     783        return bitCast(CreateCall(permute_func, args));
     784    }
     785    if ((fw == 8) || (hostCPUFeatures.hasAVX512BW && (fw == 8)))   {
     786        const unsigned fieldCount = mBitBlockWidth/fw;
     787        Constant * Idxs[fieldCount/2];
     788        for (unsigned i = 0; i < fieldCount / 2; i++) {
     789            Idxs[i] = getInt32(i+fieldCount/2); // selects elements from first reg.
     790        }
     791        Constant * low_indexes = ConstantVector::get({Idxs, fieldCount/2});
     792        Value * a_low = CreateShuffleVector(fwCast(8, a), UndefValue::get(fwVectorType(8)), low_indexes);
     793        Value * b_low = CreateShuffleVector(fwCast(8, b), UndefValue::get(fwVectorType(8)), low_indexes);
     794        Value * zext_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_pmovzxb_w_512);
     795        Constant * mask = ConstantInt::getAllOnesValue(getInt32Ty());
     796        Constant * zeroes = Constant::getNullValue(fwVectorType(16));
     797        Value * a_ext = CreateCall(zext_func, {a_low, zeroes, mask});
     798        Value * b_ext = CreateCall(zext_func, {b_low, zeroes, mask});
     799        Value * rslt = simd_or(a_ext, simd_slli(16, b_ext, 8));
     800        return rslt;
     801    }
    771802    // Otherwise use default AVX2 logic.
    772803    return IDISA_AVX2_Builder::esimd_mergeh(fw, a, b);
     
    791822        return simd_or(simd_if(1, simd_himask(16), high_bits, low_bits), simd_or(lo_move_back, hi_move_fwd));
    792823    }
     824    if ((fw == 32) || (hostCPUFeatures.hasAVX512BW && (fw == 16)))   {
     825        const unsigned fieldCount = mBitBlockWidth/fw;
     826        Value * permute_func = nullptr;
     827        if (fw == 32) permute_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_d_512);
     828        else permute_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_hi_512);
     829        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     830        Constant * Idxs[fieldCount];
     831        for (unsigned i = 0; i < fieldCount / 2; i++) {
     832            Idxs[2 * i] = getInt32(i); // selects elements from first reg.
     833            Idxs[2 * i + 1] = getInt32(i + fieldCount); // selects elements from second reg.
     834        }
     835        Value * args[4] = {ConstantVector::get({Idxs, fieldCount}), fwCast(fw, b), fwCast(fw, a), mask};
     836        return bitCast(CreateCall(permute_func, args));
     837    }
     838    if ((fw == 8) || (hostCPUFeatures.hasAVX512BW && (fw == 8)))   {
     839        const unsigned fieldCount = mBitBlockWidth/fw;
     840        Constant * Idxs[fieldCount/2];
     841        for (unsigned i = 0; i < fieldCount / 2; i++) {
     842            Idxs[i] = getInt32(i); // selects elements from first reg.
     843        }
     844        Constant * low_indexes = ConstantVector::get({Idxs, fieldCount/2});
     845        Value * a_low = CreateShuffleVector(fwCast(8, a), UndefValue::get(fwVectorType(8)), low_indexes);
     846        Value * b_low = CreateShuffleVector(fwCast(8, b), UndefValue::get(fwVectorType(8)), low_indexes);
     847        Value * zext_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_pmovzxb_w_512);
     848        Constant * mask = ConstantInt::getAllOnesValue(getInt32Ty());
     849        Constant * zeroes = Constant::getNullValue(fwVectorType(16));
     850        Value * a_ext = CreateCall(zext_func, {a_low, zeroes, mask});
     851        Value * b_ext = CreateCall(zext_func, {b_low, zeroes, mask});
     852        Value * rslt = simd_or(a_ext, simd_slli(16, b_ext, 8));
     853        return rslt;
     854    }
    793855    // Otherwise use default AVX2 logic.
    794856    return IDISA_AVX2_Builder::esimd_mergel(fw, a, b);
Note: See TracChangeset for help on using the changeset viewer.