Changeset 3448


Ignore:
Timestamp:
Sep 8, 2013, 5:48:04 PM (6 years ago)
Author:
linmengl
Message:

update newly defined simd_sll and simd_srl, small optimize on bitblock_sll

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/idisa_cpp/idisa_avx2.cpp

    r3441 r3448  
    1212
    1313#include "immintrin.h"
     14
    1415#include "emmintrin.h"
    1516
    1617typedef __m256i bitblock256_t;
    17 
     18               
    1819#ifndef FIELD_TYPE
    19 #define FIELD_TYPE
     20#define FIELD_TYPE     
    2021template <uint32_t fw> struct FieldType {
    2122   typedef int T;  //default for FieldType::T is int
     
    3031template <> struct FieldType<64> {typedef uint64_t T;};
    3132template <> struct FieldType<128> {typedef uint64_t T;};
     33template <> struct FieldType<256> {typedef uint64_t T;};
    3234#endif
    3335
     
    4345        static IDISA_ALWAYS_INLINE bitblock256_t all(bitblock256_t arg1);
    4446        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
     47        static IDISA_ALWAYS_INLINE bitblock256_t ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    4548        static IDISA_ALWAYS_INLINE bitblock256_t ctz(bitblock256_t arg1);
     49        static IDISA_ALWAYS_INLINE bitblock256_t sll(bitblock256_t arg1, bitblock256_t shift_mask);
    4650        static IDISA_ALWAYS_INLINE bitblock256_t eq(bitblock256_t arg1, bitblock256_t arg2);
    4751        static IDISA_ALWAYS_INLINE bitblock256_t popcount(bitblock256_t arg1);
     52        static IDISA_ALWAYS_INLINE bitblock256_t sra(bitblock256_t arg1, bitblock256_t shift_mask);
    4853        static IDISA_ALWAYS_INLINE bitblock256_t neg(bitblock256_t arg1);
    4954        static IDISA_ALWAYS_INLINE bitblock256_t himask();
    5055        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
    51         static IDISA_ALWAYS_INLINE bitblock256_t ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    5256        static IDISA_ALWAYS_INLINE bitblock256_t sub(bitblock256_t arg1, bitblock256_t arg2);
    5357        static IDISA_ALWAYS_INLINE bitblock256_t add_hl(bitblock256_t arg1);
     58        static IDISA_ALWAYS_INLINE bitblock256_t srl(bitblock256_t arg1, bitblock256_t shift_mask);
    5459        static IDISA_ALWAYS_INLINE bitblock256_t lomask();
    5560        static IDISA_ALWAYS_INLINE bitblock256_t umin(bitblock256_t arg1, bitblock256_t arg2);
    5661        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock256_t constant();
    5762        static IDISA_ALWAYS_INLINE bitblock256_t min(bitblock256_t arg1, bitblock256_t arg2);
     63        static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2);
    5864        static IDISA_ALWAYS_INLINE bitblock256_t umax(bitblock256_t arg1, bitblock256_t arg2);
    5965        static IDISA_ALWAYS_INLINE bitblock256_t abs(bitblock256_t arg1);
     
    6268        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srai(bitblock256_t arg1);
    6369        static IDISA_ALWAYS_INLINE bitblock256_t lt(bitblock256_t arg1, bitblock256_t arg2);
    64         static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2);
    6570        static IDISA_ALWAYS_INLINE bitblock256_t ugt(bitblock256_t arg1, bitblock256_t arg2);
    6671};
     
    112117{
    113118public:
     119        static IDISA_ALWAYS_INLINE bitblock256_t sll(bitblock256_t arg1, bitblock256_t arg2);
    114120        static IDISA_ALWAYS_INLINE bitblock256_t load_unaligned(const bitblock256_t* arg1);
    115121        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
    116         static IDISA_ALWAYS_INLINE void store_aligned(bitblock256_t arg1, bitblock256_t* arg2);
     122        static IDISA_ALWAYS_INLINE bitblock256_t srl(bitblock256_t arg1, bitblock256_t arg2);
    117123        static IDISA_ALWAYS_INLINE bool all(bitblock256_t arg1);
    118124        static IDISA_ALWAYS_INLINE bool any(bitblock256_t arg1);
     
    121127        static IDISA_ALWAYS_INLINE bitblock256_t load_aligned(const bitblock256_t* arg1);
    122128        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock256_t arg1, bitblock256_t* arg2);
    123 
    124         // Add by hand
    125         static IDISA_ALWAYS_INLINE bitblock256_t sll(bitblock256_t arg1, int shift);
    126         static IDISA_ALWAYS_INLINE bitblock256_t srl(bitblock256_t arg1, int shift);
    127 
    128         static IDISA_ALWAYS_INLINE bitblock256_t sll(bitblock256_t arg1, bitblock256_t shift);
    129         static IDISA_ALWAYS_INLINE bitblock256_t srl(bitblock256_t arg1, bitblock256_t shift);
     129        static IDISA_ALWAYS_INLINE void store_aligned(bitblock256_t arg1, bitblock256_t* arg2);
    130130};
    131131
     
    133133IDISA_ALWAYS_INLINE bitblock256_t simd_nor(bitblock256_t arg1, bitblock256_t arg2);
    134134IDISA_ALWAYS_INLINE bitblock256_t simd_not(bitblock256_t arg1);
     135IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2);
    135136IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2);
    136 IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2);
    137137IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2);
    138138IDISA_ALWAYS_INLINE bitblock256_t simd_xor(bitblock256_t arg1, bitblock256_t arg2);
     
    206206template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ctz(bitblock256_t arg1);
    207207template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ctz(bitblock256_t arg1);
     208template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sll(bitblock256_t arg1, bitblock256_t shift_mask);
     209template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sll(bitblock256_t arg1, bitblock256_t shift_mask);
     210template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sll(bitblock256_t arg1, bitblock256_t shift_mask);
     211template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sll(bitblock256_t arg1, bitblock256_t shift_mask);
     212template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2);
     213template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::sub(bitblock256_t arg1, bitblock256_t arg2);
     214template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::sub(bitblock256_t arg1, bitblock256_t arg2);
     215template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::sub(bitblock256_t arg1, bitblock256_t arg2);
     216template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sub(bitblock256_t arg1, bitblock256_t arg2);
     217template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sub(bitblock256_t arg1, bitblock256_t arg2);
     218template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sub(bitblock256_t arg1, bitblock256_t arg2);
     219template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::sub(bitblock256_t arg1, bitblock256_t arg2);
     220template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sub(bitblock256_t arg1, bitblock256_t arg2);
    208221template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2);
    209222template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2);
     
    240253template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::any(bitblock256_t arg1);
    241254template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::any(bitblock256_t arg1);
     255template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sra(bitblock256_t arg1, bitblock256_t shift_mask);
     256template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sra(bitblock256_t arg1, bitblock256_t shift_mask);
    242257template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1);
    243258template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1);
     
    265280template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    266281template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    267 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2);
    268 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::sub(bitblock256_t arg1, bitblock256_t arg2);
    269 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::sub(bitblock256_t arg1, bitblock256_t arg2);
    270 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::sub(bitblock256_t arg1, bitblock256_t arg2);
    271 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sub(bitblock256_t arg1, bitblock256_t arg2);
    272 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sub(bitblock256_t arg1, bitblock256_t arg2);
    273 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sub(bitblock256_t arg1, bitblock256_t arg2);
    274 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::sub(bitblock256_t arg1, bitblock256_t arg2);
    275 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sub(bitblock256_t arg1, bitblock256_t arg2);
     282template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
     283template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
     284template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
     285template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
     286template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
     287template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
     288template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
     289template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
    276290template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1);
    277291template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add_hl(bitblock256_t arg1);
     
    282296template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1);
    283297template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1);
     298template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srl(bitblock256_t arg1, bitblock256_t shift_mask);
     299template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srl(bitblock256_t arg1, bitblock256_t shift_mask);
     300template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srl(bitblock256_t arg1, bitblock256_t shift_mask);
     301template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srl(bitblock256_t arg1, bitblock256_t shift_mask);
     302template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
     303template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
     304template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
     305template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
     306template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
     307template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
     308template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
     309template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
    284310template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant();
    285311template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant();
     
    300326template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::min(bitblock256_t arg1, bitblock256_t arg2);
    301327template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::min(bitblock256_t arg1, bitblock256_t arg2);
    302 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
    303 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
    304 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
    305 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
    306 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
    307 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
    308 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
    309 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
     328template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2);
     329template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2);
     330template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2);
     331template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2);
     332template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2);
     333template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2);
     334template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2);
     335template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2);
     336template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2);
     337template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
     338template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
     339template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
     340template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
     341template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
     342template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
     343template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
     344template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
     345template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
     346template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2);
     347template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2);
     348template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2);
     349template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2);
     350template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2);
     351template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2);
     352template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2);
     353template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2);
     354template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2);
     355template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2);
     356template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2);
     357template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2);
     358template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2);
     359template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2);
     360template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2);
     361template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2);
     362template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2);
     363template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2);
     364template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask();
     365template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask();
     366template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask();
     367template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask();
     368template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask();
     369template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask();
     370template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask();
     371template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask();
    310372template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::add(bitblock256_t arg1, bitblock256_t arg2);
    311373template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add(bitblock256_t arg1, bitblock256_t arg2);
     
    317379template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2);
    318380template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2);
    319 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2);
    320 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2);
    321 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2);
    322 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2);
    323 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2);
    324 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2);
    325 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2);
    326 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2);
    327 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2);
    328381template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1);
    329382template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1);
     
    334387template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1);
    335388template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1);
    336 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2);
    337 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2);
    338 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2);
    339 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2);
    340 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2);
    341 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2);
    342 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2);
    343 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2);
    344 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2);
    345 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
    346 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
    347 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
    348 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
    349 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
    350 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
    351 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
    352 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
    353 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask();
    354 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask();
    355 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask();
    356 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask();
    357 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask();
    358 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask();
    359 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask();
    360 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask();
    361 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2);
    362 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2);
    363 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2);
    364 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2);
    365 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2);
    366 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2);
    367 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2);
    368 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2);
    369 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2);
    370 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
    371 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
    372 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
    373 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
    374 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
    375 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
    376 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
    377 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
    378 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
    379389template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
    380390template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
     
    495505template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
    496506template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
     507template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16);
     508template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16);
     509template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16);
     510template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16);
     511template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16);
    497512template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(FieldType<1>::T val1);
    498513template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill(FieldType<2>::T val1);
     
    520535template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1);
    521536template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1);
    522 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16);
    523 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16);
    524 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16);
    525 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16);
    526 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16);
    527537template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4);
    528538template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4);
     
    606616
    607617//The total number of operations is 1.0
     618IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2)
     619{
     620        return _mm256_or_si256(arg1, arg2);
     621}
     622
     623//The total number of operations is 1.0
    608624IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2)
    609625{
    610626        return _mm256_andnot_si256(arg2, arg1);
    611 }
    612 
    613 //The total number of operations is 1.0
    614 IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2)
    615 {
    616         return _mm256_or_si256(arg1, arg2);
    617627}
    618628
     
    11501160
    11511161//The total number of operations is 1.0
    1152 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1153 {
    1154         return simd_andc(arg1, arg2);
    1155 }
    1156 
    1157 //The total number of operations is 14.0
    1158 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1159 {
    1160         bitblock256_t tmpAns = simd256<(1)>::ugt(arg1, arg2);
    1161         bitblock256_t mask = simd_and(tmpAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
    1162         mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
    1163         return simd_or(simd256<2>::srai<(1)>(tmpAns), mask);
    1164 }
    1165 
    1166 //The total number of operations is 12.0
    1167 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1168 {
    1169         return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::ugt(simd_and(simd256<(8)>::himask(), arg1), arg2), simd256<(8)>::ugt(simd_andc(arg1, simd256<(8)>::himask()), simd_andc(arg2, simd256<(8)>::himask())));
    1170 }
    1171 
    1172 //The total number of operations is 3.0
    1173 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1174 {
    1175         bitblock256_t high_bit = simd256<8>::constant<(128)>();
    1176         return simd256<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1177 }
    1178 
    1179 //The total number of operations is 3.0
    1180 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1181 {
    1182         bitblock256_t high_bit = simd256<16>::constant<(32768)>();
    1183         return simd256<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1184 }
    1185 
    1186 //The total number of operations is 3.0
    1187 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1188 {
    1189         bitblock256_t high_bit = simd256<32>::constant<(2147483648ULL)>();
    1190         return simd256<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1191 }
    1192 
    1193 //The total number of operations is 3.0
    1194 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1195 {
    1196         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
    1197         return simd256<64>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1198 }
    1199 
    1200 //The total number of operations is 31.75
    1201 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1202 {
    1203         bitblock256_t tmpAns = simd256<(64)>::ugt(arg1, arg2);
    1204         bitblock256_t mask = simd_and(tmpAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
    1205         mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
    1206         return simd_or(simd256<128>::srai<(64)>(tmpAns), mask);
    1207 }
    1208 
    1209 //The total number of operations is 101.041666667
    1210 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1211 {
    1212         bitblock256_t tmpAns = simd256<(128)>::ugt(arg1, arg2);
    1213         bitblock256_t mask = simd_and(tmpAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
    1214         mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
    1215         return simd_or(simd256<256>::srai<(128)>(tmpAns), mask);
    1216 }
    1217 
    1218 //The total number of operations is 4.0
    1219 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::xor_hl(bitblock256_t arg1)
    1220 {
    1221         return simd_xor(simd256<2>::srli<(1)>(arg1), simd_and(arg1, simd256<2>::lomask()));
    1222 }
    1223 
    1224 //The total number of operations is 4.0
    1225 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::xor_hl(bitblock256_t arg1)
    1226 {
    1227         return simd_xor(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));
    1228 }
    1229 
    1230 //The total number of operations is 4.0
    1231 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::xor_hl(bitblock256_t arg1)
    1232 {
    1233         return simd_xor(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));
    1234 }
    1235 
    1236 //The total number of operations is 3.0
    1237 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::xor_hl(bitblock256_t arg1)
    1238 {
    1239         return simd_xor(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));
    1240 }
    1241 
    1242 //The total number of operations is 3.0
    1243 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::xor_hl(bitblock256_t arg1)
    1244 {
    1245         return simd_xor(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));
    1246 }
    1247 
    1248 //The total number of operations is 3.0
    1249 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::xor_hl(bitblock256_t arg1)
    1250 {
    1251         return simd_xor(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));
    1252 }
    1253 
    1254 //The total number of operations is 7.33333333333
    1255 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::xor_hl(bitblock256_t arg1)
    1256 {
    1257         return simd_xor(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));
    1258 }
    1259 
    1260 //The total number of operations is 12.0
    1261 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::xor_hl(bitblock256_t arg1)
    1262 {
    1263         return simd_xor(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
    1264 }
    1265 
    1266 //The total number of operations is 0
    1267 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::popcount(bitblock256_t arg1)
    1268 {
    1269         return arg1;
    1270 }
    1271 
    1272 //The total number of operations is 3.0
    1273 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::popcount(bitblock256_t arg1)
    1274 {
    1275         return simd256<2>::add_hl(simd256<(1)>::popcount(arg1));
    1276 }
    1277 
    1278 //The total number of operations is 7.0
    1279 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::popcount(bitblock256_t arg1)
    1280 {
    1281         return simd256<4>::add_hl(simd256<(2)>::popcount(arg1));
    1282 }
    1283 
    1284 //The total number of operations is 11.0
    1285 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::popcount(bitblock256_t arg1)
    1286 {
    1287         return simd256<8>::add_hl(simd256<(4)>::popcount(arg1));
    1288 }
    1289 
    1290 //The total number of operations is 14.0
    1291 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::popcount(bitblock256_t arg1)
    1292 {
    1293         return simd256<16>::add_hl(simd256<(8)>::popcount(arg1));
    1294 }
    1295 
    1296 //The total number of operations is 17.0
    1297 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::popcount(bitblock256_t arg1)
    1298 {
    1299         return simd256<32>::add_hl(simd256<(16)>::popcount(arg1));
    1300 }
    1301 
    1302 //The total number of operations is 17.0
    1303 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::popcount(bitblock256_t arg1)
    1304 {
    1305         bitblock256_t tmpAns = simd256<8>::popcount(arg1);
    1306         return avx_general_combine256(_mm_sad_epu8(avx_select_hi128(tmpAns), _mm_set1_epi32((int32_t)(0))), _mm_sad_epu8(avx_select_lo128(tmpAns), _mm_set1_epi32((int32_t)(0))));
    1307 }
    1308 
    1309 //The total number of operations is 35.6666666667
    1310 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1)
    1311 {
    1312         return simd256<128>::add_hl(simd256<(64)>::popcount(arg1));
    1313 }
    1314 
    1315 //The total number of operations is 59.0
    1316 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1)
    1317 {
    1318         bitblock256_t tmpAns = simd256<(128)>::popcount(arg1);
    1319         return simd256<(128)>::add(simd_and(tmpAns, simd256<256>::lomask()), simd256<256>::srli<(128)>(tmpAns));
    1320 }
    1321 
    1322 //The total number of operations is 8.0
    1323 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::any(bitblock256_t arg1)
    1324 {
    1325         bitblock256_t t0 = simd256<2>::srli<1>(arg1);
    1326         bitblock256_t f0 = simd_or(t0, simd_and(arg1, simd_xor(t0, simd256<8>::constant<255>())));
    1327         return simd_or(f0, simd256<2>::slli<1>(f0));
    1328 }
    1329 
    1330 //The total number of operations is 12.0
    1331 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::any(bitblock256_t arg1)
    1332 {
    1333         return simd256<4>::ugt(arg1, simd256<8>::constant<0>());
    1334 }
    1335 
    1336 //The total number of operations is 3.0
    1337 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::any(bitblock256_t arg1)
    1338 {
    1339         return simd256<8>::ugt(arg1, simd256<8>::constant<0>());
    1340 }
    1341 
    1342 //The total number of operations is 3.0
    1343 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::any(bitblock256_t arg1)
    1344 {
    1345         return simd256<16>::ugt(arg1, simd256<8>::constant<0>());
    1346 }
    1347 
    1348 //The total number of operations is 3.0
    1349 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::any(bitblock256_t arg1)
    1350 {
    1351         return simd256<32>::ugt(arg1, simd256<8>::constant<0>());
    1352 }
    1353 
    1354 //The total number of operations is 3.0
    1355 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::any(bitblock256_t arg1)
    1356 {
    1357         return simd256<64>::ugt(arg1, simd256<8>::constant<0>());
    1358 }
    1359 
    1360 //The total number of operations is 31.75
    1361 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::any(bitblock256_t arg1)
    1362 {
    1363         return simd256<128>::ugt(arg1, simd256<8>::constant<0>());
    1364 }
    1365 
    1366 //The total number of operations is 1.0
    1367 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::any(bitblock256_t arg1)
    1368 {
    1369         return ((bitblock256::any(arg1)) ? simd256<8>::constant<255>() : simd256<8>::constant<0>());
    1370 }
    1371 
    1372 //The total number of operations is 13.0
    1373 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1)
    1374 {
    1375         return simd256<2>::sub(simd256<2>::constant<0>(), arg1);
    1376 }
    1377 
    1378 //The total number of operations is 6.0
    1379 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1)
    1380 {
    1381         return simd256<4>::sub(simd256<4>::constant<0>(), arg1);
    1382 }
    1383 
    1384 //The total number of operations is 1.0
    1385 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::neg(bitblock256_t arg1)
    1386 {
    1387         return simd256<8>::sub(simd256<8>::constant<0>(), arg1);
    1388 }
    1389 
    1390 //The total number of operations is 1.0
    1391 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::neg(bitblock256_t arg1)
    1392 {
    1393         return simd256<16>::sub(simd256<16>::constant<0>(), arg1);
    1394 }
    1395 
    1396 //The total number of operations is 1.0
    1397 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::neg(bitblock256_t arg1)
    1398 {
    1399         return simd256<32>::sub(simd256<32>::constant<0>(), arg1);
    1400 }
    1401 
    1402 //The total number of operations is 1.0
    1403 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::neg(bitblock256_t arg1)
    1404 {
    1405         return simd256<64>::sub(simd256<64>::constant<0>(), arg1);
    1406 }
    1407 
    1408 //The total number of operations is 12.3333333333
    1409 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1)
    1410 {
    1411         return simd256<128>::sub(simd256<128>::constant<0>(), arg1);
    1412 }
    1413 
    1414 //The total number of operations is 43.1666666667
    1415 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1)
    1416 {
    1417         return simd256<256>::sub(simd256<256>::constant<0>(), arg1);
    1418 }
    1419 
    1420 //The total number of operations is 2.0
    1421 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1)
    1422 {
    1423         return simd_and(simd256<32>::slli<sh>(arg1), simd256<2>::constant<(((3)<<sh)&(3))>());
    1424 }
    1425 
    1426 //The total number of operations is 2.0
    1427 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1)
    1428 {
    1429         return simd_and(simd256<32>::slli<sh>(arg1), simd256<4>::constant<(((15)<<sh)&(15))>());
    1430 }
    1431 
    1432 //The total number of operations is 2.0
    1433 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1)
    1434 {
    1435         return simd_and(simd256<32>::slli<sh>(arg1), simd256<8>::constant<(((255)<<sh)&(255))>());
    1436 }
    1437 
    1438 //The total number of operations is 1.0
    1439 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1)
    1440 {
    1441         return _mm256_slli_epi16(arg1, (int32_t)(sh));
    1442 }
    1443 
    1444 //The total number of operations is 1.0
    1445 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1)
    1446 {
    1447         return _mm256_slli_epi32(arg1, (int32_t)(sh));
    1448 }
    1449 
    1450 //The total number of operations is 1.0
    1451 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1)
    1452 {
    1453         return _mm256_slli_epi64(arg1, (int32_t)(sh));
    1454 }
    1455 
    1456 //The total number of operations is 5.33333333333
    1457 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1)
    1458 {
    1459         return (((sh%8) == 0) ? avx_byte_shift_left(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::slli<(sh&63)>(avx_byte_shift_left(arg1, 8)) : simd_or(simd256<64>::slli<sh>(arg1), avx_byte_shift_left(simd256<64>::srli<((128-sh)&63)>(arg1), 8))));
    1460 }
    1461 
    1462 //The total number of operations is 9.5
    1463 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1)
    1464 {
    1465         return ((sh < 128) ? simd_or(simd256<128>::slli<sh>(arg1), avx_move_lo128_to_hi128(simd256<128>::srli<(128-sh)>(arg1))) : simd256<128>::slli<(sh-128)>(avx_move_lo128_to_hi128(arg1)));
    1466 }
    1467 
    1468 //The total number of operations is 3.0
    1469 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1470 {
    1471         return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
    1472 }
    1473 
    1474 //The total number of operations is 8.0
    1475 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1476 {
    1477         return simd256<(1)>::ifh(simd256<1>::ifh(simd256<2>::himask(), arg1, simd256<2>::srli<(1)>(arg1)), arg2, arg3);
    1478 }
    1479 
    1480 //The total number of operations is 13.0
    1481 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1482 {
    1483         return simd256<1>::ifh(simd256<4>::gt(simd256<4>::constant<0>(), arg1), arg2, arg3);
    1484 }
    1485 
    1486 //The total number of operations is 1.0
    1487 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1488 {
    1489         return _mm256_blendv_epi8(arg3, arg2, arg1);
    1490 }
    1491 
    1492 //The total number of operations is 4.0
    1493 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1494 {
    1495         return simd256<1>::ifh(simd256<16>::gt(simd256<16>::constant<0>(), arg1), arg2, arg3);
    1496 }
    1497 
    1498 //The total number of operations is 4.0
    1499 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1500 {
    1501         return simd256<1>::ifh(simd256<32>::gt(simd256<32>::constant<0>(), arg1), arg2, arg3);
    1502 }
    1503 
    1504 //The total number of operations is 4.0
    1505 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1506 {
    1507         return simd256<1>::ifh(simd256<64>::gt(simd256<64>::constant<0>(), arg1), arg2, arg3);
    1508 }
    1509 
    1510 //The total number of operations is 12.3333333333
    1511 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1512 {
    1513         return simd256<(64)>::ifh(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg1)), arg2, arg3);
    1514 }
    1515 
    1516 //The total number of operations is 25.3333333333
    1517 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1518 {
    1519         return simd256<(128)>::ifh(simd256<1>::ifh(simd256<256>::himask(), arg1, simd256<256>::srli<(128)>(arg1)), arg2, arg3);
     1162template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sll(bitblock256_t arg1, bitblock256_t shift_mask)
     1163{
     1164        return _mm256_sll_epi16(arg1, avx_select_lo128(shift_mask));
     1165}
     1166
     1167//The total number of operations is 1.0
     1168template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sll(bitblock256_t arg1, bitblock256_t shift_mask)
     1169{
     1170        return _mm256_sll_epi32(arg1, avx_select_lo128(shift_mask));
     1171}
     1172
     1173//The total number of operations is 1.0
     1174template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sll(bitblock256_t arg1, bitblock256_t shift_mask)
     1175{
     1176        return _mm256_sll_epi64(arg1, avx_select_lo128(shift_mask));
     1177}
     1178
     1179//The total number of operations is 17.75
     1180template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sll(bitblock256_t arg1, bitblock256_t shift_mask)
     1181{
     1182        uint32_t shift = _mm_cvtsi128_si32(avx_select_lo128(shift_mask));
     1183        uint32_t n = (shift/64);
     1184        bitblock256_t arg2 = ((n == 1) ? mvmd256<64>::slli<1>(arg1) : ((n == 2) ? mvmd256<64>::slli<2>(arg1) : ((n == 3) ? mvmd256<64>::slli<3>(arg1) : arg1)));
     1185        return ((n >= 4) ? simd256<32>::constant<0>() : (((shift&63) > 0) ? simd_or(_mm256_sll_epi64(arg2, _mm_cvtsi32_si128((int32_t)((shift&63)))), mvmd256<64>::slli<1>(_mm256_srl_epi64(arg2, _mm_cvtsi32_si128((int32_t)((64-(shift&63))))))) : arg2));
    15201186}
    15211187
     
    15851251}
    15861252
     1253//The total number of operations is 1.0
     1254template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1255{
     1256        return simd_andc(arg1, arg2);
     1257}
     1258
     1259//The total number of operations is 14.0
     1260template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1261{
     1262        bitblock256_t tmpAns = simd256<(1)>::ugt(arg1, arg2);
     1263        bitblock256_t mask = simd_and(tmpAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
     1264        mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
     1265        return simd_or(simd256<2>::srai<(1)>(tmpAns), mask);
     1266}
     1267
     1268//The total number of operations is 12.0
     1269template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1270{
     1271        return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::ugt(simd_and(simd256<(8)>::himask(), arg1), arg2), simd256<(8)>::ugt(simd_andc(arg1, simd256<(8)>::himask()), simd_andc(arg2, simd256<(8)>::himask())));
     1272}
     1273
     1274//The total number of operations is 3.0
     1275template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1276{
     1277        bitblock256_t high_bit = simd256<8>::constant<(128)>();
     1278        return simd256<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1279}
     1280
     1281//The total number of operations is 3.0
     1282template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1283{
     1284        bitblock256_t high_bit = simd256<16>::constant<(32768)>();
     1285        return simd256<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1286}
     1287
     1288//The total number of operations is 3.0
     1289template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1290{
     1291        bitblock256_t high_bit = simd256<32>::constant<(2147483648ULL)>();
     1292        return simd256<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1293}
     1294
     1295//The total number of operations is 3.0
     1296template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1297{
     1298        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     1299        return simd256<64>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1300}
     1301
     1302//The total number of operations is 31.75
     1303template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1304{
     1305        bitblock256_t tmpAns = simd256<(64)>::ugt(arg1, arg2);
     1306        bitblock256_t mask = simd_and(tmpAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
     1307        mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
     1308        return simd_or(simd256<128>::srai<(64)>(tmpAns), mask);
     1309}
     1310
     1311//The total number of operations is 101.041666667
     1312template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1313{
     1314        bitblock256_t tmpAns = simd256<(128)>::ugt(arg1, arg2);
     1315        bitblock256_t mask = simd_and(tmpAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
     1316        mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
     1317        return simd_or(simd256<256>::srai<(128)>(tmpAns), mask);
     1318}
     1319
     1320//The total number of operations is 4.0
     1321template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::xor_hl(bitblock256_t arg1)
     1322{
     1323        return simd_xor(simd256<2>::srli<(1)>(arg1), simd_and(arg1, simd256<2>::lomask()));
     1324}
     1325
     1326//The total number of operations is 4.0
     1327template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::xor_hl(bitblock256_t arg1)
     1328{
     1329        return simd_xor(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));
     1330}
     1331
     1332//The total number of operations is 4.0
     1333template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::xor_hl(bitblock256_t arg1)
     1334{
     1335        return simd_xor(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));
     1336}
     1337
     1338//The total number of operations is 3.0
     1339template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::xor_hl(bitblock256_t arg1)
     1340{
     1341        return simd_xor(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));
     1342}
     1343
     1344//The total number of operations is 3.0
     1345template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::xor_hl(bitblock256_t arg1)
     1346{
     1347        return simd_xor(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));
     1348}
     1349
     1350//The total number of operations is 3.0
     1351template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::xor_hl(bitblock256_t arg1)
     1352{
     1353        return simd_xor(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));
     1354}
     1355
     1356//The total number of operations is 7.33333333333
     1357template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::xor_hl(bitblock256_t arg1)
     1358{
     1359        return simd_xor(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));
     1360}
     1361
     1362//The total number of operations is 12.0
     1363template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::xor_hl(bitblock256_t arg1)
     1364{
     1365        return simd_xor(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
     1366}
     1367
     1368//The total number of operations is 0
     1369template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::popcount(bitblock256_t arg1)
     1370{
     1371        return arg1;
     1372}
     1373
     1374//The total number of operations is 3.0
     1375template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::popcount(bitblock256_t arg1)
     1376{
     1377        return simd256<2>::add_hl(simd256<(1)>::popcount(arg1));
     1378}
     1379
     1380//The total number of operations is 7.0
     1381template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::popcount(bitblock256_t arg1)
     1382{
     1383        return simd256<4>::add_hl(simd256<(2)>::popcount(arg1));
     1384}
     1385
     1386//The total number of operations is 11.0
     1387template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::popcount(bitblock256_t arg1)
     1388{
     1389        return simd256<8>::add_hl(simd256<(4)>::popcount(arg1));
     1390}
     1391
     1392//The total number of operations is 14.0
     1393template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::popcount(bitblock256_t arg1)
     1394{
     1395        return simd256<16>::add_hl(simd256<(8)>::popcount(arg1));
     1396}
     1397
     1398//The total number of operations is 17.0
     1399template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::popcount(bitblock256_t arg1)
     1400{
     1401        return simd256<32>::add_hl(simd256<(16)>::popcount(arg1));
     1402}
     1403
     1404//The total number of operations is 17.0
     1405template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::popcount(bitblock256_t arg1)
     1406{
     1407        bitblock256_t tmpAns = simd256<8>::popcount(arg1);
     1408        return avx_general_combine256(_mm_sad_epu8(avx_select_hi128(tmpAns), _mm_set1_epi32((int32_t)(0))), _mm_sad_epu8(avx_select_lo128(tmpAns), _mm_set1_epi32((int32_t)(0))));
     1409}
     1410
     1411//The total number of operations is 35.6666666667
     1412template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1)
     1413{
     1414        return simd256<128>::add_hl(simd256<(64)>::popcount(arg1));
     1415}
     1416
     1417//The total number of operations is 59.0
     1418template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1)
     1419{
     1420        bitblock256_t tmpAns = simd256<(128)>::popcount(arg1);
     1421        return simd256<(128)>::add(simd_and(tmpAns, simd256<256>::lomask()), simd256<256>::srli<(128)>(tmpAns));
     1422}
     1423
     1424//The total number of operations is 8.0
     1425template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::any(bitblock256_t arg1)
     1426{
     1427        bitblock256_t t0 = simd256<2>::srli<1>(arg1);
     1428        bitblock256_t f0 = simd_or(t0, simd_and(arg1, simd_xor(t0, simd256<8>::constant<255>())));
     1429        return simd_or(f0, simd256<2>::slli<1>(f0));
     1430}
     1431
     1432//The total number of operations is 12.0
     1433template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::any(bitblock256_t arg1)
     1434{
     1435        return simd256<4>::ugt(arg1, simd256<8>::constant<0>());
     1436}
     1437
     1438//The total number of operations is 3.0
     1439template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::any(bitblock256_t arg1)
     1440{
     1441        return simd256<8>::ugt(arg1, simd256<8>::constant<0>());
     1442}
     1443
     1444//The total number of operations is 3.0
     1445template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::any(bitblock256_t arg1)
     1446{
     1447        return simd256<16>::ugt(arg1, simd256<8>::constant<0>());
     1448}
     1449
     1450//The total number of operations is 3.0
     1451template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::any(bitblock256_t arg1)
     1452{
     1453        return simd256<32>::ugt(arg1, simd256<8>::constant<0>());
     1454}
     1455
     1456//The total number of operations is 3.0
     1457template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::any(bitblock256_t arg1)
     1458{
     1459        return simd256<64>::ugt(arg1, simd256<8>::constant<0>());
     1460}
     1461
     1462//The total number of operations is 31.75
     1463template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::any(bitblock256_t arg1)
     1464{
     1465        return simd256<128>::ugt(arg1, simd256<8>::constant<0>());
     1466}
     1467
     1468//The total number of operations is 1.0
     1469template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::any(bitblock256_t arg1)
     1470{
     1471        return ((bitblock256::any(arg1)) ? simd256<8>::constant<255>() : simd256<8>::constant<0>());
     1472}
     1473
     1474//The total number of operations is 1.0
     1475template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sra(bitblock256_t arg1, bitblock256_t shift_mask)
     1476{
     1477        return _mm256_sra_epi16(arg1, avx_select_lo128(shift_mask));
     1478}
     1479
     1480//The total number of operations is 1.0
     1481template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sra(bitblock256_t arg1, bitblock256_t shift_mask)
     1482{
     1483        return _mm256_sra_epi32(arg1, avx_select_lo128(shift_mask));
     1484}
     1485
     1486//The total number of operations is 13.0
     1487template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1)
     1488{
     1489        return simd256<2>::sub(simd256<2>::constant<0>(), arg1);
     1490}
     1491
     1492//The total number of operations is 6.0
     1493template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1)
     1494{
     1495        return simd256<4>::sub(simd256<4>::constant<0>(), arg1);
     1496}
     1497
     1498//The total number of operations is 1.0
     1499template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::neg(bitblock256_t arg1)
     1500{
     1501        return simd256<8>::sub(simd256<8>::constant<0>(), arg1);
     1502}
     1503
     1504//The total number of operations is 1.0
     1505template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::neg(bitblock256_t arg1)
     1506{
     1507        return simd256<16>::sub(simd256<16>::constant<0>(), arg1);
     1508}
     1509
     1510//The total number of operations is 1.0
     1511template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::neg(bitblock256_t arg1)
     1512{
     1513        return simd256<32>::sub(simd256<32>::constant<0>(), arg1);
     1514}
     1515
     1516//The total number of operations is 1.0
     1517template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::neg(bitblock256_t arg1)
     1518{
     1519        return simd256<64>::sub(simd256<64>::constant<0>(), arg1);
     1520}
     1521
     1522//The total number of operations is 12.3333333333
     1523template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1)
     1524{
     1525        return simd256<128>::sub(simd256<128>::constant<0>(), arg1);
     1526}
     1527
     1528//The total number of operations is 43.1666666667
     1529template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1)
     1530{
     1531        return simd256<256>::sub(simd256<256>::constant<0>(), arg1);
     1532}
     1533
     1534//The total number of operations is 2.0
     1535template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1)
     1536{
     1537        return simd_and(simd256<32>::slli<sh>(arg1), simd256<2>::constant<(((3)<<sh)&(3))>());
     1538}
     1539
     1540//The total number of operations is 2.0
     1541template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1)
     1542{
     1543        return simd_and(simd256<32>::slli<sh>(arg1), simd256<4>::constant<(((15)<<sh)&(15))>());
     1544}
     1545
     1546//The total number of operations is 2.0
     1547template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1)
     1548{
     1549        return simd_and(simd256<32>::slli<sh>(arg1), simd256<8>::constant<(((255)<<sh)&(255))>());
     1550}
     1551
     1552//The total number of operations is 1.0
     1553template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1)
     1554{
     1555        return _mm256_slli_epi16(arg1, (int32_t)(sh));
     1556}
     1557
     1558//The total number of operations is 1.0
     1559template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1)
     1560{
     1561        return _mm256_slli_epi32(arg1, (int32_t)(sh));
     1562}
     1563
     1564//The total number of operations is 1.0
     1565template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1)
     1566{
     1567        return _mm256_slli_epi64(arg1, (int32_t)(sh));
     1568}
     1569
     1570//The total number of operations is 5.33333333333
     1571template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1)
     1572{
     1573        return (((sh%8) == 0) ? avx_byte_shift_left(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::slli<(sh&63)>(avx_byte_shift_left(arg1, 8)) : simd_or(simd256<64>::slli<sh>(arg1), avx_byte_shift_left(simd256<64>::srli<((128-sh)&63)>(arg1), 8))));
     1574}
     1575
     1576//The total number of operations is 9.5
     1577template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1)
     1578{
     1579        return ((sh < 128) ? simd_or(simd256<128>::slli<sh>(arg1), avx_move_lo128_to_hi128(simd256<128>::srli<(128-sh)>(arg1))) : simd256<128>::slli<(sh-128)>(avx_move_lo128_to_hi128(arg1)));
     1580}
     1581
     1582//The total number of operations is 3.0
     1583template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1584{
     1585        return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
     1586}
     1587
     1588//The total number of operations is 8.0
     1589template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1590{
     1591        return simd256<(1)>::ifh(simd256<1>::ifh(simd256<2>::himask(), arg1, simd256<2>::srli<(1)>(arg1)), arg2, arg3);
     1592}
     1593
     1594//The total number of operations is 13.0
     1595template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1596{
     1597        return simd256<1>::ifh(simd256<4>::gt(simd256<4>::constant<0>(), arg1), arg2, arg3);
     1598}
     1599
     1600//The total number of operations is 1.0
     1601template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1602{
     1603        return _mm256_blendv_epi8(arg3, arg2, arg1);
     1604}
     1605
     1606//The total number of operations is 4.0
     1607template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1608{
     1609        return simd256<1>::ifh(simd256<16>::gt(simd256<16>::constant<0>(), arg1), arg2, arg3);
     1610}
     1611
     1612//The total number of operations is 4.0
     1613template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1614{
     1615        return simd256<1>::ifh(simd256<32>::gt(simd256<32>::constant<0>(), arg1), arg2, arg3);
     1616}
     1617
     1618//The total number of operations is 4.0
     1619template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1620{
     1621        return simd256<1>::ifh(simd256<64>::gt(simd256<64>::constant<0>(), arg1), arg2, arg3);
     1622}
     1623
     1624//The total number of operations is 12.3333333333
     1625template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1626{
     1627        return simd256<(64)>::ifh(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg1)), arg2, arg3);
     1628}
     1629
     1630//The total number of operations is 25.3333333333
     1631template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1632{
     1633        return simd256<(128)>::ifh(simd256<1>::ifh(simd256<256>::himask(), arg1, simd256<256>::srli<(128)>(arg1)), arg2, arg3);
     1634}
     1635
     1636//The total number of operations is 4.0
     1637template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
     1638{
     1639        return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
     1640}
     1641
     1642//The total number of operations is 10.0
     1643template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
     1644{
     1645        bitblock256_t tmp = simd256<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1646        return simd_or(tmp, simd256<4>::sub(simd256<4>::constant<0>(), simd_and(simd256<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1647}
     1648
     1649//The total number of operations is 5.0
     1650template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
     1651{
     1652        bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1653        return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1654}
     1655
     1656//The total number of operations is 1.0
     1657template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
     1658{
     1659        return _mm256_srai_epi16(arg1, (int32_t)(sh));
     1660}
     1661
     1662//The total number of operations is 1.0
     1663template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
     1664{
     1665        return _mm256_srai_epi32(arg1, (int32_t)(sh));
     1666}
     1667
     1668//The total number of operations is 4.5
     1669template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
     1670{
     1671        return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));
     1672}
     1673
     1674//The total number of operations is 14.0833333333
     1675template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
     1676{
     1677        return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));
     1678}
     1679
     1680//The total number of operations is 33.125
     1681template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
     1682{
     1683        return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));
     1684}
     1685
    15871686//The total number of operations is 3.0
    15881687template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1)
     
    16311730{
    16321731        return simd256<256>::add(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
     1732}
     1733
     1734//The total number of operations is 1.0
     1735template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srl(bitblock256_t arg1, bitblock256_t shift_mask)
     1736{
     1737        return _mm256_srl_epi16(arg1, avx_select_lo128(shift_mask));
     1738}
     1739
     1740//The total number of operations is 1.0
     1741template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srl(bitblock256_t arg1, bitblock256_t shift_mask)
     1742{
     1743        return _mm256_srl_epi32(arg1, avx_select_lo128(shift_mask));
     1744}
     1745
     1746//The total number of operations is 1.0
     1747template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srl(bitblock256_t arg1, bitblock256_t shift_mask)
     1748{
     1749        return _mm256_srl_epi64(arg1, avx_select_lo128(shift_mask));
     1750}
     1751
     1752//The total number of operations is 18.5
     1753template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srl(bitblock256_t arg1, bitblock256_t shift_mask)
     1754{
     1755        uint32_t shift = _mm_cvtsi128_si32(avx_select_lo128(shift_mask));
     1756        uint32_t n = (shift/64);
     1757        bitblock256_t arg2 = ((n == 1) ? mvmd256<64>::srli<1>(arg1) : ((n == 2) ? mvmd256<64>::srli<2>(arg1) : ((n == 3) ? mvmd256<64>::srli<3>(arg1) : arg1)));
     1758        return ((n >= 4) ? simd256<32>::constant<0>() : (((shift&63) > 0) ? simd_or(_mm256_srl_epi64(arg2, _mm_cvtsi32_si128((int32_t)((shift&63)))), mvmd256<64>::srli<1>(_mm256_sll_epi64(arg2, _mm_cvtsi32_si128((int32_t)((64-(shift&63))))))) : arg2));
     1759}
     1760
     1761//The total number of operations is 0
     1762template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
     1763{
     1764        return simd256<2>::constant<(1)>();
     1765}
     1766
     1767//The total number of operations is 0
     1768template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
     1769{
     1770        return simd256<4>::constant<(3)>();
     1771}
     1772
     1773//The total number of operations is 0
     1774template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
     1775{
     1776        return simd256<8>::constant<(15)>();
     1777}
     1778
     1779//The total number of operations is 0
     1780template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
     1781{
     1782        return simd256<16>::constant<(255)>();
     1783}
     1784
     1785//The total number of operations is 0
     1786template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
     1787{
     1788        return simd256<32>::constant<(65535)>();
     1789}
     1790
     1791//The total number of operations is 0
     1792template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
     1793{
     1794        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1))));
     1795}
     1796
     1797//The total number of operations is 0
     1798template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
     1799{
     1800        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1))));
     1801}
     1802
     1803//The total number of operations is 0
     1804template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
     1805{
     1806        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1))));
    16331807}
    16341808
     
    17501924}
    17511925
     1926//The total number of operations is 1.0
     1927template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1928{
     1929        return simd_and(arg1, arg2);
     1930}
     1931
     1932//The total number of operations is 16.0
     1933template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1934{
     1935        return simd_or(simd_and(simd256<(4)>::himask(), simd256<(4)>::umin(arg1, arg2)), simd256<(4)>::umin(simd_and(simd256<(4)>::lomask(), arg1), simd_and(simd256<(4)>::lomask(), arg2)));
     1936}
     1937
     1938//The total number of operations is 6.0
     1939template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1940{
     1941        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umin(arg1, arg2)), simd256<(8)>::umin(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
     1942}
     1943
     1944//The total number of operations is 1.0
     1945template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1946{
     1947        return _mm256_min_epu8(arg1, arg2);
     1948}
     1949
     1950//The total number of operations is 1.0
     1951template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1952{
     1953        return _mm256_min_epu16(arg1, arg2);
     1954}
     1955
     1956//The total number of operations is 1.0
     1957template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1958{
     1959        return _mm256_min_epu32(arg1, arg2);
     1960}
     1961
     1962//The total number of operations is 7.0
     1963template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1964{
     1965        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     1966        return simd_xor(simd256<64>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1967}
     1968
     1969//The total number of operations is 28.6666666667
     1970template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1971{
     1972        bitblock256_t tmpAns = simd256<(64)>::umin(arg1, arg2);
     1973        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
     1974        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
     1975        return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1976}
     1977
     1978//The total number of operations is 85.0
     1979template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1980{
     1981        bitblock256_t tmpAns = simd256<(128)>::umin(arg1, arg2);
     1982        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
     1983        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
     1984        return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1985}
     1986
     1987//The total number of operations is 1.0
     1988template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1989{
     1990        return simd_or(arg1, arg2);
     1991}
     1992
     1993//The total number of operations is 16.0
     1994template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1995{
     1996        return simd_or(simd_and(simd256<(4)>::himask(), simd256<(4)>::umax(arg1, arg2)), simd256<(4)>::umax(simd_and(simd256<(4)>::lomask(), arg1), simd_and(simd256<(4)>::lomask(), arg2)));
     1997}
     1998
     1999//The total number of operations is 6.0
     2000template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2001{
     2002        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
     2003}
     2004
     2005//The total number of operations is 1.0
     2006template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2007{
     2008        return _mm256_max_epu8(arg1, arg2);
     2009}
     2010
     2011//The total number of operations is 1.0
     2012template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2013{
     2014        return _mm256_max_epu16(arg1, arg2);
     2015}
     2016
     2017//The total number of operations is 1.0
     2018template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2019{
     2020        return _mm256_max_epu32(arg1, arg2);
     2021}
     2022
     2023//The total number of operations is 7.0
     2024template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2025{
     2026        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     2027        return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     2028}
     2029
     2030//The total number of operations is 28.6666666667
     2031template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2032{
     2033        bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
     2034        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
     2035        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
     2036        return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     2037}
     2038
     2039//The total number of operations is 85.0
     2040template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2041{
     2042        bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
     2043        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
     2044        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
     2045        return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     2046}
     2047
     2048//The total number of operations is 1.0
     2049template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2050{
     2051        return simd_andc(arg1, arg2);
     2052}
     2053
     2054//The total number of operations is 15.0
     2055template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2056{
     2057        bitblock256_t hiAns = simd256<(1)>::lt(arg1, arg2);
     2058        bitblock256_t loAns = simd256<(1)>::ult(arg1, arg2);
     2059        bitblock256_t mask = simd_and(loAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
     2060        mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
     2061        return simd_or(simd256<2>::srai<(1)>(hiAns), mask);
     2062}
     2063
     2064//The total number of operations is 18.0
     2065template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2066{
     2067        return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::lt(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::lt(simd256<(8)>::slli<4>(arg1), simd256<(8)>::slli<4>(arg2)));
     2068}
     2069
     2070//The total number of operations is 5.0
     2071template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2072{
     2073        return simd_and(simd_not(simd256<8>::gt(arg1, arg2)), simd_not(simd256<8>::eq(arg1, arg2)));
     2074}
     2075
     2076//The total number of operations is 5.0
     2077template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2078{
     2079        return simd_and(simd_not(simd256<16>::gt(arg1, arg2)), simd_not(simd256<16>::eq(arg1, arg2)));
     2080}
     2081
     2082//The total number of operations is 5.0
     2083template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2084{
     2085        return simd_and(simd_not(simd256<32>::gt(arg1, arg2)), simd_not(simd256<32>::eq(arg1, arg2)));
     2086}
     2087
     2088//The total number of operations is 5.0
     2089template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2090{
     2091        return simd_and(simd_not(simd256<64>::gt(arg1, arg2)), simd_not(simd256<64>::eq(arg1, arg2)));
     2092}
     2093
     2094//The total number of operations is 40.75
     2095template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2096{
     2097        bitblock256_t hiAns = simd256<(64)>::lt(arg1, arg2);
     2098        bitblock256_t loAns = simd256<(64)>::ult(arg1, arg2);
     2099        bitblock256_t mask = simd_and(loAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
     2100        mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
     2101        return simd_or(simd256<128>::srai<(64)>(hiAns), mask);
     2102}
     2103
     2104//The total number of operations is 145.791666667
     2105template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2106{
     2107        bitblock256_t hiAns = simd256<(128)>::lt(arg1, arg2);
     2108        bitblock256_t loAns = simd256<(128)>::ult(arg1, arg2);
     2109        bitblock256_t mask = simd_and(loAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
     2110        mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
     2111        return simd_or(simd256<256>::srai<(128)>(hiAns), mask);
     2112}
     2113
     2114//The total number of operations is 2.0
     2115template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2116{
     2117        return simd_not(simd_xor(arg1, arg2));
     2118}
     2119
     2120//The total number of operations is 8.0
     2121template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2122{
     2123        bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);
     2124        bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));
     2125        bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);
     2126        return simd_or(loMask, hiMask);
     2127}
     2128
     2129//The total number of operations is 9.0
     2130template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2131{
     2132        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
     2133}
     2134
     2135//The total number of operations is 1.0
     2136template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2137{
     2138        return _mm256_cmpeq_epi8(arg1, arg2);
     2139}
     2140
     2141//The total number of operations is 1.0
     2142template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2143{
     2144        return _mm256_cmpeq_epi16(arg1, arg2);
     2145}
     2146
     2147//The total number of operations is 1.0
     2148template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2149{
     2150        return _mm256_cmpeq_epi32(arg1, arg2);
     2151}
     2152
     2153//The total number of operations is 1.0
     2154template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2155{
     2156        return _mm256_cmpeq_epi64(arg1, arg2);
     2157}
     2158
     2159//The total number of operations is 13.6666666667
     2160template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2161{
     2162        bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
     2163        bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
     2164        bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
     2165        return simd_or(loMask, hiMask);
     2166}
     2167
     2168//The total number of operations is 35.1666666667
     2169template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2170{
     2171        bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
     2172        bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
     2173        bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
     2174        return simd_or(loMask, hiMask);
     2175}
     2176
    17522177//The total number of operations is 0
    1753 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
    1754 {
    1755         return simd256<2>::constant<(1)>();
     2178template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask()
     2179{
     2180        return simd256<2>::constant<(2)>();
    17562181}
    17572182
    17582183//The total number of operations is 0
    1759 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
    1760 {
    1761         return simd256<4>::constant<(3)>();
     2184template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask()
     2185{
     2186        return simd256<4>::constant<(12)>();
    17622187}
    17632188
    17642189//The total number of operations is 0
    1765 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
    1766 {
    1767         return simd256<8>::constant<(15)>();
     2190template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask()
     2191{
     2192        return simd256<8>::constant<(240)>();
    17682193}
    17692194
    17702195//The total number of operations is 0
    1771 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
    1772 {
    1773         return simd256<16>::constant<(255)>();
     2196template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask()
     2197{
     2198        return simd256<16>::constant<(65280)>();
    17742199}
    17752200
    17762201//The total number of operations is 0
    1777 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
    1778 {
    1779         return simd256<32>::constant<(65535)>();
     2202template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask()
     2203{
     2204        return simd256<32>::constant<-65536>();
    17802205}
    17812206
    17822207//The total number of operations is 0
    1783 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
    1784 {
    1785         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1))));
     2208template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask()
     2209{
     2210        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0))));
    17862211}
    17872212
    17882213//The total number of operations is 0
    1789 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
    1790 {
    1791         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1))));
     2214template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask()
     2215{
     2216        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0))));
    17922217}
    17932218
    17942219//The total number of operations is 0
    1795 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
    1796 {
    1797         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1))));
     2220template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask()
     2221{
     2222        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));
    17982223}
    17992224
     
    18632288}
    18642289
    1865 //The total number of operations is 1.0
    1866 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1867 {
    1868         return simd_and(arg1, arg2);
    1869 }
    1870 
    1871 //The total number of operations is 16.0
    1872 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1873 {
    1874         return simd_or(simd_and(simd256<(4)>::himask(), simd256<(4)>::umin(arg1, arg2)), simd256<(4)>::umin(simd_and(simd256<(4)>::lomask(), arg1), simd_and(simd256<(4)>::lomask(), arg2)));
    1875 }
    1876 
    1877 //The total number of operations is 6.0
    1878 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1879 {
    1880         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umin(arg1, arg2)), simd256<(8)>::umin(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
    1881 }
    1882 
    1883 //The total number of operations is 1.0
    1884 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1885 {
    1886         return _mm256_min_epu8(arg1, arg2);
    1887 }
    1888 
    1889 //The total number of operations is 1.0
    1890 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1891 {
    1892         return _mm256_min_epu16(arg1, arg2);
    1893 }
    1894 
    1895 //The total number of operations is 1.0
    1896 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1897 {
    1898         return _mm256_min_epu32(arg1, arg2);
    1899 }
    1900 
    1901 //The total number of operations is 7.0
    1902 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1903 {
    1904         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
    1905         return simd_xor(simd256<64>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1906 }
    1907 
    1908 //The total number of operations is 28.6666666667
    1909 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1910 {
    1911         bitblock256_t tmpAns = simd256<(64)>::umin(arg1, arg2);
    1912         bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
    1913         bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
    1914         return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1915 }
    1916 
    1917 //The total number of operations is 85.0
    1918 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1919 {
    1920         bitblock256_t tmpAns = simd256<(128)>::umin(arg1, arg2);
    1921         bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
    1922         bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
    1923         return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1924 }
    1925 
    19262290//The total number of operations is 14.5
    19272291template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)
     
    19742338        bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1);
    19752339        return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1));
    1976 }
    1977 
    1978 //The total number of operations is 2.0
    1979 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1980 {
    1981         return simd_not(simd_xor(arg1, arg2));
    1982 }
    1983 
    1984 //The total number of operations is 8.0
    1985 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1986 {
    1987         bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);
    1988         bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));
    1989         bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);
    1990         return simd_or(loMask, hiMask);
    1991 }
    1992 
    1993 //The total number of operations is 9.0
    1994 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1995 {
    1996         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
    1997 }
    1998 
    1999 //The total number of operations is 1.0
    2000 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2001 {
    2002         return _mm256_cmpeq_epi8(arg1, arg2);
    2003 }
    2004 
    2005 //The total number of operations is 1.0
    2006 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2007 {
    2008         return _mm256_cmpeq_epi16(arg1, arg2);
    2009 }
    2010 
    2011 //The total number of operations is 1.0
    2012 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2013 {
    2014         return _mm256_cmpeq_epi32(arg1, arg2);
    2015 }
    2016 
    2017 //The total number of operations is 1.0
    2018 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2019 {
    2020         return _mm256_cmpeq_epi64(arg1, arg2);
    2021 }
    2022 
    2023 //The total number of operations is 13.6666666667
    2024 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2025 {
    2026         bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
    2027         bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
    2028         bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
    2029         return simd_or(loMask, hiMask);
    2030 }
    2031 
    2032 //The total number of operations is 35.1666666667
    2033 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2034 {
    2035         bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
    2036         bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
    2037         bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
    2038         return simd_or(loMask, hiMask);
    2039 }
    2040 
    2041 //The total number of operations is 4.0
    2042 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
    2043 {
    2044         return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
    2045 }
    2046 
    2047 //The total number of operations is 10.0
    2048 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
    2049 {
    2050         bitblock256_t tmp = simd256<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    2051         return simd_or(tmp, simd256<4>::sub(simd256<4>::constant<0>(), simd_and(simd256<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    2052 }
    2053 
    2054 //The total number of operations is 5.0
    2055 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
    2056 {
    2057         bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    2058         return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    2059 }
    2060 
    2061 //The total number of operations is 1.0
    2062 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
    2063 {
    2064         return _mm256_srai_epi16(arg1, (int32_t)(sh));
    2065 }
    2066 
    2067 //The total number of operations is 1.0
    2068 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
    2069 {
    2070         return _mm256_srai_epi32(arg1, (int32_t)(sh));
    2071 }
    2072 
    2073 //The total number of operations is 4.5
    2074 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
    2075 {
    2076         return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));
    2077 }
    2078 
    2079 //The total number of operations is 14.0833333333
    2080 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
    2081 {
    2082         return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));
    2083 }
    2084 
    2085 //The total number of operations is 33.125
    2086 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
    2087 {
    2088         return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));
    2089 }
    2090 
    2091 //The total number of operations is 0
    2092 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask()
    2093 {
    2094         return simd256<2>::constant<(2)>();
    2095 }
    2096 
    2097 //The total number of operations is 0
    2098 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask()
    2099 {
    2100         return simd256<4>::constant<(12)>();
    2101 }
    2102 
    2103 //The total number of operations is 0
    2104 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask()
    2105 {
    2106         return simd256<8>::constant<(240)>();
    2107 }
    2108 
    2109 //The total number of operations is 0
    2110 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask()
    2111 {
    2112         return simd256<16>::constant<(65280)>();
    2113 }
    2114 
    2115 //The total number of operations is 0
    2116 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask()
    2117 {
    2118         return simd256<32>::constant<-65536>();
    2119 }
    2120 
    2121 //The total number of operations is 0
    2122 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask()
    2123 {
    2124         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0))));
    2125 }
    2126 
    2127 //The total number of operations is 0
    2128 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask()
    2129 {
    2130         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0))));
    2131 }
    2132 
    2133 //The total number of operations is 0
    2134 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask()
    2135 {
    2136         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));
    2137 }
    2138 
    2139 //The total number of operations is 1.0
    2140 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2141 {
    2142         return simd_andc(arg1, arg2);
    2143 }
    2144 
    2145 //The total number of operations is 15.0
    2146 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2147 {
    2148         bitblock256_t hiAns = simd256<(1)>::lt(arg1, arg2);
    2149         bitblock256_t loAns = simd256<(1)>::ult(arg1, arg2);
    2150         bitblock256_t mask = simd_and(loAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
    2151         mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
    2152         return simd_or(simd256<2>::srai<(1)>(hiAns), mask);
    2153 }
    2154 
    2155 //The total number of operations is 18.0
    2156 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2157 {
    2158         return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::lt(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::lt(simd256<(8)>::slli<4>(arg1), simd256<(8)>::slli<4>(arg2)));
    2159 }
    2160 
    2161 //The total number of operations is 5.0
    2162 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2163 {
    2164         return simd_and(simd_not(simd256<8>::gt(arg1, arg2)), simd_not(simd256<8>::eq(arg1, arg2)));
    2165 }
    2166 
    2167 //The total number of operations is 5.0
    2168 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2169 {
    2170         return simd_and(simd_not(simd256<16>::gt(arg1, arg2)), simd_not(simd256<16>::eq(arg1, arg2)));
    2171 }
    2172 
    2173 //The total number of operations is 5.0
    2174 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2175 {
    2176         return simd_and(simd_not(simd256<32>::gt(arg1, arg2)), simd_not(simd256<32>::eq(arg1, arg2)));
    2177 }
    2178 
    2179 //The total number of operations is 5.0
    2180 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2181 {
    2182         return simd_and(simd_not(simd256<64>::gt(arg1, arg2)), simd_not(simd256<64>::eq(arg1, arg2)));
    2183 }
    2184 
    2185 //The total number of operations is 40.75
    2186 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2187 {
    2188         bitblock256_t hiAns = simd256<(64)>::lt(arg1, arg2);
    2189         bitblock256_t loAns = simd256<(64)>::ult(arg1, arg2);
    2190         bitblock256_t mask = simd_and(loAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
    2191         mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
    2192         return simd_or(simd256<128>::srai<(64)>(hiAns), mask);
    2193 }
    2194 
    2195 //The total number of operations is 145.791666667
    2196 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2197 {
    2198         bitblock256_t hiAns = simd256<(128)>::lt(arg1, arg2);
    2199         bitblock256_t loAns = simd256<(128)>::ult(arg1, arg2);
    2200         bitblock256_t mask = simd_and(loAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
    2201         mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
    2202         return simd_or(simd256<256>::srai<(128)>(hiAns), mask);
    2203 }
    2204 
    2205 //The total number of operations is 1.0
    2206 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2207 {
    2208         return simd_or(arg1, arg2);
    2209 }
    2210 
    2211 //The total number of operations is 16.0
    2212 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2213 {
    2214         return simd_or(simd_and(simd256<(4)>::himask(), simd256<(4)>::umax(arg1, arg2)), simd256<(4)>::umax(simd_and(simd256<(4)>::lomask(), arg1), simd_and(simd256<(4)>::lomask(), arg2)));
    2215 }
    2216 
    2217 //The total number of operations is 6.0
    2218 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2219 {
    2220         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
    2221 }
    2222 
    2223 //The total number of operations is 1.0
    2224 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2225 {
    2226         return _mm256_max_epu8(arg1, arg2);
    2227 }
    2228 
    2229 //The total number of operations is 1.0
    2230 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2231 {
    2232         return _mm256_max_epu16(arg1, arg2);
    2233 }
    2234 
    2235 //The total number of operations is 1.0
    2236 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2237 {
    2238         return _mm256_max_epu32(arg1, arg2);
    2239 }
    2240 
    2241 //The total number of operations is 7.0
    2242 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2243 {
    2244         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
    2245         return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    2246 }
    2247 
    2248 //The total number of operations is 28.6666666667
    2249 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2250 {
    2251         bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
    2252         bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
    2253         bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
    2254         return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    2255 }
    2256 
    2257 //The total number of operations is 85.0
    2258 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2259 {
    2260         bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
    2261         bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
    2262         bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
    2263         return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    22642340}
    22652341
     
    30333109}
    30343110
     3111//The total number of operations is 15.0
     3112template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16)
     3113{
     3114        return simd_or(mvmd256<(2)>::fill16((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd256<(2)>::fill16((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
     3115}
     3116
     3117//The total number of operations is 7.0
     3118template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16)
     3119{
     3120        return simd_or(mvmd256<(4)>::fill16((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd256<(4)>::fill16((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
     3121}
     3122
     3123//The total number of operations is 3.0
     3124template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16)
     3125{
     3126        return simd_or(mvmd256<(8)>::fill16((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd256<(8)>::fill16((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
     3127}
     3128
     3129//The total number of operations is 1.0
     3130template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16)
     3131{
     3132        return (bitblock256_t)_mm256_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
     3133}
     3134
     3135//The total number of operations is 5.0
     3136template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16)
     3137{
     3138        return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<16>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd256<16>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
     3139}
     3140
    30353141//The total number of operations is 1.0
    30363142template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(FieldType<1>::T val1)
     
    31833289}
    31843290
    3185 //The total number of operations is 15.0
    3186 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16)
    3187 {
    3188         return simd_or(mvmd256<(2)>::fill16((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd256<(2)>::fill16((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
    3189 }
    3190 
    3191 //The total number of operations is 7.0
    3192 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16)
    3193 {
    3194         return simd_or(mvmd256<(4)>::fill16((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd256<(4)>::fill16((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
    3195 }
    3196 
    3197 //The total number of operations is 3.0
    3198 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16)
    3199 {
    3200         return simd_or(mvmd256<(8)>::fill16((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd256<(8)>::fill16((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
    3201 }
    3202 
    3203 //The total number of operations is 1.0
    3204 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16)
    3205 {
    3206         return (bitblock256_t)_mm256_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
    3207 }
    3208 
    3209 //The total number of operations is 5.0
    3210 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16)
    3211 {
    3212         return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<16>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd256<16>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
    3213 }
    3214 
    32153291//The total number of operations is 5.0
    32163292template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4)
     
    34833559}
    34843560
     3561//The total number of operations is 17.75
     3562IDISA_ALWAYS_INLINE bitblock256_t bitblock256::sll(bitblock256_t arg1, bitblock256_t arg2)
     3563{
     3564        return simd256<256>::sll(arg1, arg2);
     3565}
     3566
    34853567//The total number of operations is 1.0
    34863568IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_unaligned(const bitblock256_t* arg1)
     
    34953577}
    34963578
    3497 //The total number of operations is 1.0
    3498 IDISA_ALWAYS_INLINE void bitblock256::store_aligned(bitblock256_t arg1, bitblock256_t* arg2)
    3499 {
    3500         _mm256_store_si256((bitblock256_t*)(arg2), arg1);
    3501 }
    3502 
    35033579//The total number of operations is 62.0
    35043580IDISA_ALWAYS_INLINE uint16_t bitblock256::popcount(bitblock256_t arg1)
     
    35133589}
    35143590
     3591//The total number of operations is 18.5
     3592IDISA_ALWAYS_INLINE bitblock256_t bitblock256::srl(bitblock256_t arg1, bitblock256_t arg2)
     3593{
     3594        return simd256<256>::srl(arg1, arg2);
     3595}
     3596
     3597//The total number of operations is 1.0
     3598IDISA_ALWAYS_INLINE bool bitblock256::any(bitblock256_t arg1)
     3599{
     3600        return _mm256_testz_si256(((__m256i)(arg1)), ((__m256i)(arg1))) == 0;
     3601}
     3602
     3603//The total number of operations is 1.0
     3604IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_aligned(const bitblock256_t* arg1)
     3605{
     3606        return _mm256_load_si256((bitblock256_t*)(arg1));
     3607}
     3608
     3609//The total number of operations is 1.0
     3610IDISA_ALWAYS_INLINE void bitblock256::store_aligned(bitblock256_t arg1, bitblock256_t* arg2)
     3611{
     3612        _mm256_store_si256((bitblock256_t*)(arg2), arg1);
     3613}
     3614
     3615//The total number of operations is 1.0
     3616IDISA_ALWAYS_INLINE void bitblock256::store_unaligned(bitblock256_t arg1, bitblock256_t* arg2)
     3617{
     3618        _mm256_storeu_si256((bitblock256_t*)(arg2), arg1);
     3619}
     3620
    35153621//The total number of operations is 9.5
    35163622template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::slli(bitblock256_t arg1)
     
    35193625}
    35203626
    3521 //The total number of operations is 1.0
    3522 IDISA_ALWAYS_INLINE bool bitblock256::any(bitblock256_t arg1)
    3523 {
    3524         return _mm256_testz_si256(((__m256i)(arg1)), ((__m256i)(arg1))) == 0;
    3525 }
    3526 
    3527 //The total number of operations is 1.0
    3528 IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_aligned(const bitblock256_t* arg1)
    3529 {
    3530         return _mm256_load_si256((bitblock256_t*)(arg1));
    3531 }
    3532 
    3533 //The total number of operations is 1.0
    3534 IDISA_ALWAYS_INLINE void bitblock256::store_unaligned(bitblock256_t arg1, bitblock256_t* arg2)
    3535 {
    3536         _mm256_storeu_si256((bitblock256_t*)(arg2), arg1);
    3537 }
    3538 
    3539 IDISA_ALWAYS_INLINE bitblock256_t bitblock256::sll(bitblock256_t arg1, int shift)
    3540 {
    3541         int n = shift / 64;
    3542         if (n == 1)
    3543                 arg1 = mvmd256<64>::slli<1>(arg1);
    3544         else if (n == 2)
    3545                 arg1 = mvmd256<64>::slli<2>(arg1);
    3546         else if (n == 3)
    3547                 arg1 = mvmd256<64>::slli<3>(arg1);
    3548         else if (n >= 4)
    3549                 return simd256<32>::constant<0>();
    3550 
    3551         if (shift & 63)
    3552         {
    3553                 __m128i sh = _mm_cvtsi32_si128(shift & 63);
    3554                 __m128i subsh = _mm_cvtsi32_si128(64 - (shift & 63));
    3555 
    3556                 return simd_or(_mm256_sll_epi64(arg1, sh), mvmd256<64>::slli<1>(_mm256_srl_epi64(arg1, subsh)));
    3557         }
    3558 
    3559         return arg1;
    3560 }
    3561 
    3562 IDISA_ALWAYS_INLINE bitblock256_t bitblock256::srl(bitblock256_t arg1, int shift)
    3563 {
    3564         int n = shift / 64;
    3565         if (n == 1)
    3566                 arg1 = mvmd256<64>::srli<1>(arg1);
    3567         else if (n == 2)
    3568                 arg1 = mvmd256<64>::srli<2>(arg1);
    3569         else if (n == 3)
    3570                 arg1 = mvmd256<64>::srli<3>(arg1);
    3571         else if (n >= 4)
    3572                 return simd256<32>::constant<0>();
    3573 
    3574         if (shift & 63)
    3575         {
    3576                 __m128i sh = _mm_cvtsi32_si128(shift & 63);
    3577                 __m128i subsh = _mm_cvtsi32_si128(64 - (shift & 63));
    3578 
    3579                 return simd_or(_mm256_srl_epi64(arg1, sh), mvmd256<64>::srli<1>(_mm256_sll_epi64(arg1, subsh)));
    3580         }
    3581 
    3582         return arg1;
    3583 }
    3584 
    3585 IDISA_ALWAYS_INLINE bitblock256_t bitblock256::sll(bitblock256_t arg1, bitblock256_t shift)
    3586 {
    3587         return bitblock256::sll(arg1, _mm_cvtsi128_si32(avx_select_lo128(shift)));
    3588 }
    3589 
    3590 IDISA_ALWAYS_INLINE bitblock256_t bitblock256::srl(bitblock256_t arg1, bitblock256_t shift)
    3591 {
    3592         return bitblock256::srl(arg1, _mm_cvtsi128_si32(avx_select_lo128(shift)));
    3593 }
    35943627#endif
Note: See TracChangeset for help on using the changeset viewer.