Ignore:
Timestamp:
Sep 11, 2013, 5:04:25 PM (6 years ago)
Author:
linmengl
Message:

Update mvmd_srli, mvmd_slli, and bitblock_popcount; they now perform slightly better.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/idisa_cpp/idisa_avx2.cpp

    r3453 r3462  
    11531153}
    11541154
    1155 //The total number of operations is 103.166666667
     1155//The total number of operations is 62.1666666667
    11561156template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ctz(bitblock256_t arg1)
    11571157{
     
    11771177}
    11781178
    1179 //The total number of operations is 17.75
     1179//The total number of operations is 6.875
    11801180template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sll(bitblock256_t arg1, bitblock256_t shift_mask)
    11811181{
     
    14151415}
    14161416
    1417 //The total number of operations is 59.0
     1417//The total number of operations is 18.0
    14181418template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1)
    14191419{
    1420         bitblock256_t tmpAns = simd256<(128)>::popcount(arg1);
    1421         return simd256<(128)>::add(simd_and(tmpAns, simd256<256>::lomask()), simd256<256>::srli<(128)>(tmpAns));
     1420        return _mm256_castsi128_si256(_mm_cvtsi64_si128((int64_t)(bitblock256::popcount(arg1))));
    14221421}
    14231422
     
    17501749}
    17511750
    1752 //The total number of operations is 18.5
     1751//The total number of operations is 6.875
    17531752template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srl(bitblock256_t arg1, bitblock256_t shift_mask)
    17541753{
     
    25062505//The total number of operations is 1.0
    25072506template <> IDISA_ALWAYS_INLINE FieldType<256/8>::T hsimd256<8>::signmask(bitblock256_t arg1)
    2508 {       
     2507{
    25092508        return _mm256_movemask_epi8(arg1);
    25102509}
     
    30913090}
    30923091
    3093 //The total number of operations is 20.5
     3092//The total number of operations is 5.5
    30943093template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
    30953094{
     
    30973096}
    30983097
    3099 //The total number of operations is 20.5
     3098//The total number of operations is 5.5
    31003099template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
    31013100{
     
    31033102}
    31043103
    3105 //The total number of operations is 20.5
     3104//The total number of operations is 5.5
    31063105template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
    31073106{
     
    33613360}
    33623361
    3363 //The total number of operations is 10.0
     3362//The total number of operations is 2.25
    33643363template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1)
    33653364{
    3366         return simd256<256>::srli<(sh*64)>(arg1);
    3367 }
    3368 
    3369 //The total number of operations is 10.0
     3365        return ((sh == 3) ? simd_and(_mm256_set_epi64x((int64_t)(0), (int64_t)(0), (int64_t)(0), (int64_t)(-1)), _mm256_permute4x64_epi64(arg1, (int32_t)(3))) : ((sh == 2) ? simd_and(_mm256_set_epi64x((int64_t)(0), (int64_t)(0), (int64_t)(-1), (int64_t)(-1)), _mm256_permute4x64_epi64(arg1, (int32_t)(14))) : ((sh == 1) ? simd_and(_mm256_set_epi64x((int64_t)(0), (int64_t)(-1), (int64_t)(-1), (int64_t)(-1)), _mm256_permute4x64_epi64(arg1, (int32_t)(57))) : ((sh == 0) ? arg1 : simd256<32>::constant<0>()))));
     3366}
     3367
     3368//The total number of operations is 2.25
    33703369template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1)
    33713370{
    3372         return simd256<256>::srli<(sh*128)>(arg1);
    3373 }
    3374 
    3375 //The total number of operations is 10.0
     3371        return mvmd256<(64)>::srli<(sh*2)>(arg1);
     3372}
     3373
     3374//The total number of operations is 2.25
    33763375template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1)
    33773376{
    3378         return simd256<256>::srli<(sh*256)>(arg1);
     3377        return mvmd256<(128)>::srli<(sh*2)>(arg1);
    33793378}
    33803379
     
    34573456}
    34583457
    3459 //The total number of operations is 20.5
     3458//The total number of operations is 5.5
    34603459template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    34613460{
     
    34633462}
    34643463
    3465 //The total number of operations is 20.5
     3464//The total number of operations is 5.5
    34663465template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    34673466{
     
    34693468}
    34703469
    3471 //The total number of operations is 20.5
     3470//The total number of operations is 5.5
    34723471template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    34733472{
     
    35053504}
    35063505
    3507 //The total number of operations is 9.5
     3506//The total number of operations is 2.25
    35083507template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1)
    35093508{
    3510         return simd256<256>::slli<(sh*64)>(arg1);
    3511 }
    3512 
    3513 //The total number of operations is 9.5
     3509        return ((sh == 1) ? simd_and(_mm256_set_epi64x((int64_t)(-1), (int64_t)(-1), (int64_t)(-1), (int64_t)(0)), _mm256_permute4x64_epi64(arg1, (int32_t)((144)))) : ((sh == 2) ? simd_and(_mm256_set_epi64x((int64_t)(-1), (int64_t)(-1), (int64_t)(0), (int64_t)(0)), _mm256_permute4x64_epi64(arg1, (int32_t)(64))) : ((sh == 3) ? simd_and(_mm256_set_epi64x((int64_t)(-1), (int64_t)(0), (int64_t)(0), (int64_t)(0)), _mm256_permute4x64_epi64(arg1, (int32_t)(0))) : ((sh == 0) ? arg1 : simd256<32>::constant<0>()))));
     3510}
     3511
     3512//The total number of operations is 2.25
    35143513template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1)
    35153514{
    3516         return simd256<256>::slli<(sh*128)>(arg1);
    3517 }
    3518 
    3519 //The total number of operations is 9.5
     3515        return mvmd256<(64)>::slli<(sh*2)>(arg1);
     3516}
     3517
     3518//The total number of operations is 2.25
    35203519template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1)
    35213520{
    3522         return simd256<256>::slli<(sh*256)>(arg1);
     3521        return mvmd256<(128)>::slli<(sh*2)>(arg1);
    35233522}
    35243523
     
    35593558}
    35603559
    3561 //The total number of operations is 17.75
     3560//The total number of operations is 6.875
    35623561IDISA_ALWAYS_INLINE bitblock256_t bitblock256::sll(bitblock256_t arg1, bitblock256_t arg2)
    35633562{
     
    35773576}
    35783577
    3579 //The total number of operations is 62.0
     3578//The total number of operations is 16.0
    35803579IDISA_ALWAYS_INLINE uint16_t bitblock256::popcount(bitblock256_t arg1)
    35813580{
    3582         return mvmd256<64>::extract<0>(simd256<256>::popcount(arg1));
     3581        return (((__builtin_popcountll((uint64_t)(mvmd256<64>::extract<0>(arg1)))+__builtin_popcountll((uint64_t)(mvmd256<64>::extract<1>(arg1))))+__builtin_popcountll((uint64_t)(mvmd256<64>::extract<2>(arg1))))+__builtin_popcountll((uint64_t)(mvmd256<64>::extract<3>(arg1))));
    35833582}
    35843583
     
    35893588}
    35903589
    3591 //The total number of operations is 18.5
     3590//The total number of operations is 6.875
    35923591IDISA_ALWAYS_INLINE bitblock256_t bitblock256::srl(bitblock256_t arg1, bitblock256_t arg2)
    35933592{
Note: See TracChangeset for help on using the changeset viewer.