Ignore:
Timestamp:
Apr 18, 2018, 10:02:24 AM (12 months ago)
Author:
cameron
Message:

AVX512 subfeature detection and popcount from Cole with further modification

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.cpp

    r5976 r5977  
    77#include "idisa_avx_builder.h"
    88#include <toolchain/toolchain.h>
     9#include <llvm/Support/raw_ostream.h>
    910
    1011using namespace llvm;
     
    339340}
    340341
    341 
    342 
    343 }
     342llvm::Value * IDISA_AVX512F_Builder::simd_popcount(unsigned fw, llvm::Value * a) {
     343     if (fw == 512) {
     344         Constant * zero16xi8 = Constant::getNullValue(VectorType::get(getInt8Ty(), 16));
     345         Constant * zeroInt32 = Constant::getNullValue(getInt32Ty());
     346         Value * c = simd_popcount(64, a);
     347         //  Should probably use _mm512_reduce_add_epi64, but not found in LLVM 3.8
     348         Value * pack64_8_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_pmov_qb_512);
     349         // popcounts of 64 bit fields will always fit in 8 bit fields.
     350         // We don't need the masked version of this, but the unmasked intrinsic was not found.
     351         c = CreateCall(pack64_8_func, {c, zero16xi8, Constant::getAllOnesValue(getInt8Ty())});
     352         Value * horizSADfunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_sse2_psad_bw);
     353         c = CreateCall(horizSADfunc, {c, zero16xi8});
     354         return CreateInsertElement(allZeroes(), CreateExtractElement(c, zeroInt32), zeroInt32);
     355    }
     356    if (hostCPUFeatures.hasAVX512VPOPCNTDQ && (fw == 32 || fw == 64)){
     357        //llvm should use vpopcntd or vpopcntq instructions
     358        return CreatePopcount(fwCast(fw, a));
     359    }
     360    if (hostCPUFeatures.hasAVX512BW && (fw == 64)) {
     361        Value * horizSADfunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_psad_bw_512);
     362        return bitCast(CreateCall(horizSADfunc, {fwCast(8, simd_popcount(8, a)), fwCast(8, allZeroes())}));
     363    }
     364    //https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
     365    if((fw == 64) && (mBitBlockWidth == 512)){
     366        Constant * m1Arr[8];
     367        llvm::Constant * m1;
     368        for (unsigned int i = 0; i < 8; i++) {
     369            m1Arr[i] = getInt64(0x5555555555555555);
     370        }
     371        m1 = ConstantVector::get({m1Arr, 8});
     372       
     373        Constant * m2Arr[8];
     374        llvm::Constant * m2;
     375        for (unsigned int i = 0; i < 8; i++) {
     376            m2Arr[i] = getInt64(0x3333333333333333);
     377        }
     378        m2 = ConstantVector::get({m2Arr, 8});
     379       
     380        Constant * m4Arr[8];
     381        llvm::Constant * m4;
     382        for (unsigned int i = 0; i < 8; i++) {
     383            m4Arr[i] = getInt64(0x0f0f0f0f0f0f0f0f);
     384        }
     385        m4 = ConstantVector::get({m4Arr, 8});
     386       
     387        Constant * h01Arr[8];
     388        llvm::Constant * h01;
     389        for (unsigned int i = 0; i < 8; i++) {
     390            h01Arr[i] = getInt64(0x0101010101010101);
     391        }
     392        h01 = ConstantVector::get({h01Arr, 8});
     393       
     394        a = simd_sub(fw, a, simd_and(simd_srli(fw, a, 1), m1));
     395        a = simd_add(fw, simd_and(a, m2), simd_and(simd_srli(fw, a, 2), m2));
     396        a = simd_and(simd_add(fw, a, simd_srli(fw, a, 4)), m4);
     397        return simd_srli(fw, simd_mult(fw, a, h01), 56);
     398       
     399    }
     400    return IDISA_Builder::simd_popcount(fw, a);
     401}
     402
     403
     404void IDISA_AVX512F_Builder::getAVX512Features() {
     405    llvm::StringMap<bool> features;
     406    if (llvm::sys::getHostCPUFeatures(features)) {
     407        hostCPUFeatures.hasAVX512CD = features.lookup("avx512cd");
     408        hostCPUFeatures.hasAVX512BW = features.lookup("avx512bw");
     409        hostCPUFeatures.hasAVX512DQ = features.lookup("avx512dq");
     410        hostCPUFeatures.hasAVX512VL = features.lookup("avx512vl");
     411       
     412        //hostCPUFeatures.hasAVX512VBMI, hostCPUFeatures.hasAVX512VBMI2,
     413        //hostCPUFeatures.hasAVX512VPOPCNTDQ have not been tested as we
     414        //did not have hardware support. It should work in theory (tm)
     415       
     416        hostCPUFeatures.hasAVX512VBMI = features.lookup("avx512_vbmi");
     417        hostCPUFeatures.hasAVX512VBMI2 = features.lookup("avx512_vbmi2");
     418        hostCPUFeatures.hasAVX512VPOPCNTDQ = features.lookup("avx512_vpopcntdq");
     419    }
     420}
     421
     422
     423}
Note: See TracChangeset for help on using the changeset viewer.