Ignore:
Timestamp:
May 16, 2018, 1:55:43 PM (12 months ago)
Author:
cameron
Message:

StreamExpand kernel and compiler - initial check-in

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.cpp

    r6019 r6045  
    343343}
    344344
     345   
     346llvm::Value * IDISA_AVX2_Builder::mvmd_shuffle(unsigned fw, llvm::Value * a, llvm::Value * shuffle_table) {
     347    if (mBitBlockWidth == 256 && fw > 32) {
     348        // Create a table for shuffling with smaller field widths.
     349        unsigned half_fw = fw/2;
     350        unsigned field_count = mBitBlockWidth/half_fw;
     351        // Build a ConstantVector of alternating 0 and 1 values.
     352        Constant * Idxs[field_count];
     353        for (unsigned int i = 0; i < field_count; i++) {
     354            Idxs[i] = getInt32(i & 1);
     355        }
     356        Constant * splat01 = ConstantVector::get({Idxs, field_count});
     357        Value * half_shuffle_table = simd_add(fw, simd_add(fw, shuffle_table, shuffle_table), splat01);
     358        return mvmd_shuffle(half_fw, a, half_shuffle_table);
     359    }
     360    if (mBitBlockWidth == 256 && fw == 32) {
     361        Value * shuf32Func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx2_permd);
     362        return CreateCall(shuf32Func, {fwCast(32, a), fwCast(32, shuffle_table)});
     363    }
     364    return IDISA_Builder::mvmd_shuffle(fw, a, shuffle_table);
     365}
     366
    345367llvm::Value * IDISA_AVX2_Builder::mvmd_compress(unsigned fw, llvm::Value * a, llvm::Value * select_mask) {
    346368    if (mBitBlockWidth == 256 && fw == 64) {
     
    356378        Type * v8xi1Ty = VectorType::get(getInt1Ty(), 8);
    357379        Constant * mask0000000Fsplaat = ConstantVector::getSplat(8, ConstantInt::get(getInt32Ty(), 0xF));
    358         Value * shuf32Func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx2_permd);
    359380        Value * PEXT_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pext_32);
    360381        Value * PDEP_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_bmi_pdep_32);
     
    376397        }
    377398        Value * shuf = CreateAnd(CreateLShr(bdcst, ConstantVector::get({Shifts, 8})), mask0000000Fsplaat);
    378         Value * compress = CreateCall(shuf32Func, {a, shuf});
     399        Value * compress = mvmd_shuffle(32, a, shuf);
    379400        Value * field_mask = CreateTrunc(CreateSub(CreateShl(getInt32(1), field_count), getInt32(1)), getInt8Ty());
    380401        Value * result = CreateAnd(compress, CreateSExt(CreateBitCast(field_mask, v8xi1Ty), v8xi32Ty));
     
    512533}
    513534
     535llvm::Value * IDISA_AVX512F_Builder::mvmd_shuffle(unsigned fw, llvm::Value * a, llvm::Value * shuffle_table) {
     536    const unsigned fieldCount = mBitBlockWidth/fw;
     537    if (mBitBlockWidth == 512 && fw == 32) {
     538        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_vpermt2var_d_512);
     539        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     540        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), UndefValue::get(fwVectorType(fw)), mask});
     541    }
     542    if (mBitBlockWidth == 512 && fw == 64) {
     543        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_vpermt2var_q_512);
     544        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     545        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), UndefValue::get(fwVectorType(fw)), mask});
     546    }
     547    if (mBitBlockWidth == 512 && fw == 16 && hostCPUFeatures.hasAVX512BW) {
     548        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_hi_512);
     549        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     550        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), UndefValue::get(fwVectorType(fw)), mask});
     551    }
     552    return IDISA_Builder::mvmd_shuffle(fw, a, shuffle_table);
     553}
     554
     555llvm::Value * IDISA_AVX512F_Builder::mvmd_shuffle2(unsigned fw, Value * a, Value * b, llvm::Value * shuffle_table) {
     556    const unsigned fieldCount = mBitBlockWidth/fw;
     557    if (mBitBlockWidth == 512 && fw == 32) {
     558        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_vpermt2var_d_512);
     559        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     560        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), fwCast(fw, b), mask});
     561    }
     562    if (mBitBlockWidth == 512 && fw == 64) {
     563        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_vpermt2var_q_512);
     564        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     565        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), fwCast(fw, b), mask});
     566    }
     567    if (mBitBlockWidth == 512 && fw == 16 && hostCPUFeatures.hasAVX512BW) {
     568        Value * permuteFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_hi_512);
     569        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(fieldCount));
     570        return CreateCall(permuteFunc, {fwCast(fw, shuffle_table), fwCast(fw, a), fwCast(fw, b), mask});
     571    }
     572    return IDISA_Builder::mvmd_shuffle(fw, a, shuffle_table);
     573}
    514574
    515575llvm::Value * IDISA_AVX512F_Builder::mvmd_compress(unsigned fw, llvm::Value * a, llvm::Value * select_mask) {
    516    
    517576    if (mBitBlockWidth == 512 && fw == 32) {
    518577        Value * compressFunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_compress_d_512);
     
    554613    }
    555614}
     615
     616Value * IDISA_AVX512F_Builder:: mvmd_dslli(unsigned fw, llvm::Value * a, llvm::Value * b, unsigned shift) {
     617    if (shift == 0) return a;
     618    if (fw > 32) {
     619        return mvmd_dslli(32, a, b, shift * (fw/32));
     620    } else if (((shift % 2) == 0) && (fw < 32)) {
     621        return mvmd_dslli(2 * fw, a, b, shift / 2);
     622    }
     623    const unsigned field_count = mBitBlockWidth/fw;
     624    if ((fw == 32) || (hostCPUFeatures.hasAVX512BW && (fw == 16)))   {
     625        Type * fwTy = getIntNTy(fw);
     626        Value * permute_func = nullptr;
     627        if (fw == 32) permute_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_d_512);
     628        else permute_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_maskz_vpermt2var_hi_512);
     629        Constant * indices[field_count];
     630        for (unsigned i = 0; i < field_count; i++) {
     631            indices[i] = ConstantInt::get(fwTy, i + field_count - shift);
     632        }
     633        Constant * mask = ConstantInt::getAllOnesValue(getIntNTy(field_count));
     634        Value * args[4] = {ConstantVector::get({indices, field_count}), fwCast(fw, a), fwCast(fw, b), mask};
     635        return bitCast(CreateCall(permute_func, args));
     636    } else {
     637        unsigned field32_shift = (shift * fw) / 32;
     638        unsigned bit_shift = (shift * fw) % 32;
     639        return simd_or(simd_slli(32, mvmd_slli(32, a, field32_shift), bit_shift),
     640                       simd_srli(32, mvmd_slli(32, a, field32_shift + 1), 32-bit_shift));
     641    }
     642}
     643
    556644llvm::Value * IDISA_AVX512F_Builder::simd_popcount(unsigned fw, llvm::Value * a) {
    557645     if (fw == 512) {
Note: See TracChangeset for help on using the changeset viewer.