Changeset 5977 for icGREP


Ignore:
Timestamp:
Apr 18, 2018, 10:02:24 AM (13 months ago)
Author:
cameron
Message:

AVX512 subfeature detection and popcount from Cole with further modification

Location:
icGREP/icgrep-devel/icgrep
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.cpp

    r5976 r5977  
    77#include "idisa_avx_builder.h"
    88#include <toolchain/toolchain.h>
     9#include <llvm/Support/raw_ostream.h>
    910
    1011using namespace llvm;
     
    339340}
    340341
    341 
    342 
    343 }
     342llvm::Value * IDISA_AVX512F_Builder::simd_popcount(unsigned fw, llvm::Value * a) {
     343     if (fw == 512) {
     344         Constant * zero16xi8 = Constant::getNullValue(VectorType::get(getInt8Ty(), 16));
     345         Constant * zeroInt32 = Constant::getNullValue(getInt32Ty());
     346         Value * c = simd_popcount(64, a);
     347         //  Should probably use _mm512_reduce_add_epi64, but not found in LLVM 3.8
     348         Value * pack64_8_func = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_mask_pmov_qb_512);
     349         // popcounts of 64 bit fields will always fit in 8 bit fields.
     350         // We don't need the masked version of this, but the unmasked intrinsic was not found.
     351         c = CreateCall(pack64_8_func, {c, zero16xi8, Constant::getAllOnesValue(getInt8Ty())});
     352         Value * horizSADfunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_sse2_psad_bw);
     353         c = CreateCall(horizSADfunc, {c, zero16xi8});
     354         return CreateInsertElement(allZeroes(), CreateExtractElement(c, zeroInt32), zeroInt32);
     355    }
     356    if (hostCPUFeatures.hasAVX512VPOPCNTDQ && (fw == 32 || fw == 64)){
     357        //llvm should use vpopcntd or vpopcntq instructions
     358        return CreatePopcount(fwCast(fw, a));
     359    }
     360    if (hostCPUFeatures.hasAVX512BW && (fw == 64)) {
     361        Value * horizSADfunc = Intrinsic::getDeclaration(getModule(), Intrinsic::x86_avx512_psad_bw_512);
     362        return bitCast(CreateCall(horizSADfunc, {fwCast(8, simd_popcount(8, a)), fwCast(8, allZeroes())}));
     363    }
     364    //https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
     365    if((fw == 64) && (mBitBlockWidth == 512)){
     366        Constant * m1Arr[8];
     367        llvm::Constant * m1;
     368        for (unsigned int i = 0; i < 8; i++) {
     369            m1Arr[i] = getInt64(0x5555555555555555);
     370        }
     371        m1 = ConstantVector::get({m1Arr, 8});
     372       
     373        Constant * m2Arr[8];
     374        llvm::Constant * m2;
     375        for (unsigned int i = 0; i < 8; i++) {
     376            m2Arr[i] = getInt64(0x3333333333333333);
     377        }
     378        m2 = ConstantVector::get({m2Arr, 8});
     379       
     380        Constant * m4Arr[8];
     381        llvm::Constant * m4;
     382        for (unsigned int i = 0; i < 8; i++) {
     383            m4Arr[i] = getInt64(0x0f0f0f0f0f0f0f0f);
     384        }
     385        m4 = ConstantVector::get({m4Arr, 8});
     386       
     387        Constant * h01Arr[8];
     388        llvm::Constant * h01;
     389        for (unsigned int i = 0; i < 8; i++) {
     390            h01Arr[i] = getInt64(0x0101010101010101);
     391        }
     392        h01 = ConstantVector::get({h01Arr, 8});
     393       
     394        a = simd_sub(fw, a, simd_and(simd_srli(fw, a, 1), m1));
     395        a = simd_add(fw, simd_and(a, m2), simd_and(simd_srli(fw, a, 2), m2));
     396        a = simd_and(simd_add(fw, a, simd_srli(fw, a, 4)), m4);
     397        return simd_srli(fw, simd_mult(fw, a, h01), 56);
     398       
     399    }
     400    return IDISA_Builder::simd_popcount(fw, a);
     401}
     402
     403
     404void IDISA_AVX512F_Builder::getAVX512Features() {
     405    llvm::StringMap<bool> features;
     406    if (llvm::sys::getHostCPUFeatures(features)) {
     407        hostCPUFeatures.hasAVX512CD = features.lookup("avx512cd");
     408        hostCPUFeatures.hasAVX512BW = features.lookup("avx512bw");
     409        hostCPUFeatures.hasAVX512DQ = features.lookup("avx512dq");
     410        hostCPUFeatures.hasAVX512VL = features.lookup("avx512vl");
     411       
     412        //hostCPUFeatures.hasAVX512VBMI, hostCPUFeatures.hasAVX512VBMI2,
     413        //hostCPUFeatures.hasAVX512VPOPCNTDQ have not been tested as we
     414        //did not have hardware support. It should work in theory (tm)
     415       
     416        hostCPUFeatures.hasAVX512VBMI = features.lookup("avx512_vbmi");
     417        hostCPUFeatures.hasAVX512VBMI2 = features.lookup("avx512_vbmi2");
     418        hostCPUFeatures.hasAVX512VPOPCNTDQ = features.lookup("avx512_vpopcntdq");
     419    }
     420}
     421
     422
     423}
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_avx_builder.h

    r5976 r5977  
    5858    : IDISA_Builder(C, vectorWidth, stride)
    5959    , IDISA_AVX2_Builder(C, vectorWidth, stride) {
     60        getAVX512Features();
    6061    }
    6162
    6263    virtual std::string getBuilderUniqueName() override;
     64    void getAVX512Features();
    6365    llvm::Value * hsimd_packh(unsigned fw, llvm::Value * a, llvm::Value * b) override;
    6466    llvm::Value * hsimd_packl(unsigned fw, llvm::Value * a, llvm::Value * b) override;
    65     llvm::Value * esimd_bitspread(unsigned fw, llvm::Value * bitmask);
     67    llvm::Value * esimd_bitspread(unsigned fw, llvm::Value * bitmask) override;
     68    llvm::Value * simd_popcount(unsigned fw, llvm::Value * a) override;
    6669
    67     ~IDISA_AVX512F_Builder() {}
     70    ~IDISA_AVX512F_Builder() {
     71    }
     72private:
     73    struct Features {
     74        //not an exhaustive list, can be extended if needed
     75        bool hasAVX512CD = false;
     76        bool hasAVX512BW = false;
     77        bool hasAVX512DQ = false;
     78        bool hasAVX512VL = false;
     79        bool hasAVX512VBMI = false;
     80        bool hasAVX512VBMI2 = false;
     81        bool hasAVX512VPOPCNTDQ = false;
     82    };
     83    Features hostCPUFeatures;
    6884};
    6985
  • icGREP/icgrep-devel/icgrep/IR_Gen/idisa_builder.cpp

    r5972 r5977  
    299299        // case 11:  ab - 0a = 11 - 01 = 10
    300300        return simd_sub(64, a, simd_srli(64, simd_and(simd_himask(2), a), 1));
    301     } else if (fw == 4) {
     301    } else if (fw <= 8) {
    302302        Value * c = simd_popcount(fw/2, a);
    303303        c = simd_add(64, simd_and(c, simd_lomask(fw)), simd_srli(fw, c, fw/2));
  • icGREP/icgrep-devel/icgrep/pablo/pablo_compiler.cpp

    r5973 r5977  
    506506            Value * const countSoFar = b->CreateAlignedLoad(ptr, alignment, c->getName() + "_accumulator");
    507507            const auto fieldWidth = b->getSizeTy()->getBitWidth();
    508             auto fields = (b->getBitBlockWidth() / fieldWidth);
    509             Value * fieldCounts = b->simd_popcount(fieldWidth, to_count);
    510             while (fields > 1) {
    511                 fields /= 2;
    512                 fieldCounts = b->CreateAdd(fieldCounts, b->mvmd_srli(fieldWidth, fieldCounts, fields));
    513             }
    514             value = b->CreateAdd(b->mvmd_extract(fieldWidth, fieldCounts, 0), countSoFar, "countSoFar");
     508            Value * bitBlockCount = b->simd_popcount(b->getBitBlockWidth(), to_count);
     509            value = b->CreateAdd(b->mvmd_extract(fieldWidth, bitBlockCount, 0), countSoFar, "countSoFar");
    515510            b->CreateAlignedStore(value, ptr, alignment);
    516511        } else if (const Lookahead * l = dyn_cast<Lookahead>(stmt)) {
Note: See TracChangeset for help on using the changeset viewer.