Ignore:
Timestamp:
Sep 22, 2013, 3:09:25 PM (6 years ago)
Author:
linmengl
Message:

Add mvmd_insert to SSE and AVX. Update all SSE libraries. The hand-modified SSE2 version is saved as idisa_sse2_hand.cpp.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/idisa_cpp/idisa_ssse3.cpp

    r2275 r3525  
    1414
    1515typedef __m128i bitblock128_t;
     16               
     17#ifndef FIELD_TYPE
     18#define FIELD_TYPE     
     19template <uint32_t fw> struct FieldType {
     20   typedef int T;  //default for FieldType::T is int
     21};
     22
     23template <> struct FieldType<1> {typedef uint8_t T;};
     24template <> struct FieldType<2> {typedef uint8_t T;};
     25template <> struct FieldType<4> {typedef uint8_t T;};
     26template <> struct FieldType<8> {typedef uint8_t T;};
     27template <> struct FieldType<16> {typedef uint16_t T;};
     28template <> struct FieldType<32> {typedef uint32_t T;};
     29template <> struct FieldType<64> {typedef uint64_t T;};
     30template <> struct FieldType<128> {typedef uint64_t T;};
     31template <> struct FieldType<256> {typedef uint64_t T;};
     32#endif
     33
    1634template <uint32_t fw>
    1735class simd128
     
    2341        static IDISA_ALWAYS_INLINE bitblock128_t umult(bitblock128_t arg1, bitblock128_t arg2);
    2442        static IDISA_ALWAYS_INLINE bitblock128_t ult(bitblock128_t arg1, bitblock128_t arg2);
    25         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     43        static IDISA_ALWAYS_INLINE bitblock128_t all(bitblock128_t arg1);
     44        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     45        static IDISA_ALWAYS_INLINE bitblock128_t ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    2646        static IDISA_ALWAYS_INLINE bitblock128_t ctz(bitblock128_t arg1);
    2747        static IDISA_ALWAYS_INLINE bitblock128_t sll(bitblock128_t arg1, bitblock128_t shift_mask);
     48        static IDISA_ALWAYS_INLINE bitblock128_t vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
    2849        static IDISA_ALWAYS_INLINE bitblock128_t eq(bitblock128_t arg1, bitblock128_t arg2);
    2950        static IDISA_ALWAYS_INLINE bitblock128_t popcount(bitblock128_t arg1);
    3051        static IDISA_ALWAYS_INLINE bitblock128_t neg(bitblock128_t arg1);
    3152        static IDISA_ALWAYS_INLINE bitblock128_t himask();
    32         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    33         static IDISA_ALWAYS_INLINE bitblock128_t ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
     53        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    3454        static IDISA_ALWAYS_INLINE bitblock128_t sub(bitblock128_t arg1, bitblock128_t arg2);
    3555        static IDISA_ALWAYS_INLINE bitblock128_t add_hl(bitblock128_t arg1);
    3656        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t shift_mask);
    3757        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
     58        static IDISA_ALWAYS_INLINE bitblock128_t vsll(bitblock128_t arg1, bitblock128_t shift_mask);
    3859        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    39         template <uint64_t val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
     60        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    4061        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
     62        static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    4163        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    4264        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
    4365        static IDISA_ALWAYS_INLINE bitblock128_t xor_hl(bitblock128_t arg1);
    44         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srai(bitblock128_t arg1);
     66        static IDISA_ALWAYS_INLINE bitblock128_t any(bitblock128_t arg1);
     67        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srai(bitblock128_t arg1);
    4568        static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
    46         static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    4769        static IDISA_ALWAYS_INLINE bitblock128_t ugt(bitblock128_t arg1, bitblock128_t arg2);
    4870};
     
    5678        static IDISA_ALWAYS_INLINE bitblock128_t packss(bitblock128_t arg1, bitblock128_t arg2);
    5779        static IDISA_ALWAYS_INLINE bitblock128_t packh(bitblock128_t arg1, bitblock128_t arg2);
    58         static IDISA_ALWAYS_INLINE uint64_t signmask(bitblock128_t arg1);
     80        static IDISA_ALWAYS_INLINE typename FieldType<128/fw>::T signmask(bitblock128_t arg1);
    5981        static IDISA_ALWAYS_INLINE bitblock128_t packl(bitblock128_t arg1, bitblock128_t arg2);
    6082        static IDISA_ALWAYS_INLINE bitblock128_t min_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    79101public:
    80102        template <uint64_t msk> static IDISA_ALWAYS_INLINE bitblock128_t shufflei(bitblock128_t arg1);
    81         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t dsrli(bitblock128_t arg1, bitblock128_t arg2);
    82         static IDISA_ALWAYS_INLINE bitblock128_t fill(uint64_t val1);
     103        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t dsrli(bitblock128_t arg1, bitblock128_t arg2);
     104        static IDISA_ALWAYS_INLINE bitblock128_t fill(typename FieldType<fw>::T val1);
    83105        static IDISA_ALWAYS_INLINE bitblock128_t shuffle(bitblock128_t arg1, bitblock128_t arg2);
    84         template <uint64_t pos> static IDISA_ALWAYS_INLINE uint64_t extract(bitblock128_t arg1);
    85         template <uint64_t pos> static IDISA_ALWAYS_INLINE bitblock128_t splat(bitblock128_t arg1);
    86         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    87         static IDISA_ALWAYS_INLINE bitblock128_t fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    88         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
    89         static IDISA_ALWAYS_INLINE bitblock128_t fill2(uint64_t val1, uint64_t val2);
    90         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t dslli(bitblock128_t arg1, bitblock128_t arg2);
    91         static IDISA_ALWAYS_INLINE bitblock128_t fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    92         static IDISA_ALWAYS_INLINE bitblock128_t fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
     106        template <uint16_t pos> static IDISA_ALWAYS_INLINE typename FieldType<fw>::T extract(bitblock128_t arg1);
     107        template <uint16_t pos> static IDISA_ALWAYS_INLINE bitblock128_t splat(bitblock128_t arg1);
     108        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
     109        static IDISA_ALWAYS_INLINE bitblock128_t fill4(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4);
     110        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     111        static IDISA_ALWAYS_INLINE bitblock128_t fill2(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2);
     112        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t dslli(bitblock128_t arg1, bitblock128_t arg2);
     113        static IDISA_ALWAYS_INLINE bitblock128_t fill8(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4, typename FieldType<fw>::T val5, typename FieldType<fw>::T val6, typename FieldType<fw>::T val7, typename FieldType<fw>::T val8);
     114        static IDISA_ALWAYS_INLINE bitblock128_t fill16(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4, typename FieldType<fw>::T val5, typename FieldType<fw>::T val6, typename FieldType<fw>::T val7, typename FieldType<fw>::T val8, typename FieldType<fw>::T val9, typename FieldType<fw>::T val10, typename FieldType<fw>::T val11, typename FieldType<fw>::T val12, typename FieldType<fw>::T val13, typename FieldType<fw>::T val14, typename FieldType<fw>::T val15, typename FieldType<fw>::T val16);
     115        template <uint16_t pos> static IDISA_ALWAYS_INLINE bitblock128_t insert(bitblock128_t arg1, typename FieldType<fw>::T arg2);
    93116};
    94117
     
    98121        static IDISA_ALWAYS_INLINE bitblock128_t sll(bitblock128_t arg1, bitblock128_t arg2);
    99122        static IDISA_ALWAYS_INLINE bitblock128_t load_unaligned(const bitblock128_t* arg1);
    100         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     123        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
    101124        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t arg2);
    102         static IDISA_ALWAYS_INLINE void store_aligned(bitblock128_t arg1, bitblock128_t* arg2);
    103125        static IDISA_ALWAYS_INLINE bool all(bitblock128_t arg1);
    104126        static IDISA_ALWAYS_INLINE bool any(bitblock128_t arg1);
    105         static IDISA_ALWAYS_INLINE uint64_t popcount(bitblock128_t arg1);
    106         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
     127        static IDISA_ALWAYS_INLINE uint16_t popcount(bitblock128_t arg1);
     128        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    107129        static IDISA_ALWAYS_INLINE bitblock128_t load_aligned(const bitblock128_t* arg1);
     130        static IDISA_ALWAYS_INLINE void store_aligned(bitblock128_t arg1, bitblock128_t* arg2);
    108131        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock128_t arg1, bitblock128_t* arg2);
    109132};
     
    112135IDISA_ALWAYS_INLINE bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
    113136IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1);
     137IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
    114138IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    115 IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
     139IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
    116140IDISA_ALWAYS_INLINE bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
    117 IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
    118141template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::max(bitblock128_t arg1, bitblock128_t arg2);
    119142template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::max(bitblock128_t arg1, bitblock128_t arg2);
     
    155178template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ult(bitblock128_t arg1, bitblock128_t arg2);
    156179template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ult(bitblock128_t arg1, bitblock128_t arg2);
    157 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2);
    158 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lt(bitblock128_t arg1, bitblock128_t arg2);
    159 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lt(bitblock128_t arg1, bitblock128_t arg2);
    160 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lt(bitblock128_t arg1, bitblock128_t arg2);
    161 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lt(bitblock128_t arg1, bitblock128_t arg2);
    162 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lt(bitblock128_t arg1, bitblock128_t arg2);
    163 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lt(bitblock128_t arg1, bitblock128_t arg2);
    164 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lt(bitblock128_t arg1, bitblock128_t arg2);
    165 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srli(bitblock128_t arg1);
    166 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srli(bitblock128_t arg1);
    167 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srli(bitblock128_t arg1);
    168 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srli(bitblock128_t arg1);
    169 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srli(bitblock128_t arg1);
    170 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srli(bitblock128_t arg1);
    171 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srli(bitblock128_t arg1);
     180template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::all(bitblock128_t arg1);
     181template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::all(bitblock128_t arg1);
     182template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::all(bitblock128_t arg1);
     183template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::all(bitblock128_t arg1);
     184template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::all(bitblock128_t arg1);
     185template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::all(bitblock128_t arg1);
     186template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::all(bitblock128_t arg1);
     187template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srli(bitblock128_t arg1);
     188template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srli(bitblock128_t arg1);
     189template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srli(bitblock128_t arg1);
     190template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srli(bitblock128_t arg1);
     191template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srli(bitblock128_t arg1);
     192template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srli(bitblock128_t arg1);
     193template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srli(bitblock128_t arg1);
    172194template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ctz(bitblock128_t arg1);
    173195template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ctz(bitblock128_t arg1);
     
    178200template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ctz(bitblock128_t arg1);
    179201template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ctz(bitblock128_t arg1);
    180 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sll(bitblock128_t arg1, bitblock128_t shift_mask);
    181202template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sll(bitblock128_t arg1, bitblock128_t shift_mask);
     203template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::sub(bitblock128_t arg1, bitblock128_t arg2);
     204template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::sub(bitblock128_t arg1, bitblock128_t arg2);
     205template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::sub(bitblock128_t arg1, bitblock128_t arg2);
     206template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::sub(bitblock128_t arg1, bitblock128_t arg2);
     207template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::sub(bitblock128_t arg1, bitblock128_t arg2);
     208template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::sub(bitblock128_t arg1, bitblock128_t arg2);
     209template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sub(bitblock128_t arg1, bitblock128_t arg2);
     210template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sub(bitblock128_t arg1, bitblock128_t arg2);
    182211template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ugt(bitblock128_t arg1, bitblock128_t arg2);
    183212template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ugt(bitblock128_t arg1, bitblock128_t arg2);
     
    203232template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::popcount(bitblock128_t arg1);
    204233template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::popcount(bitblock128_t arg1);
     234template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::any(bitblock128_t arg1);
     235template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::any(bitblock128_t arg1);
     236template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::any(bitblock128_t arg1);
     237template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::any(bitblock128_t arg1);
     238template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::any(bitblock128_t arg1);
     239template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::any(bitblock128_t arg1);
     240template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::any(bitblock128_t arg1);
    205241template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::neg(bitblock128_t arg1);
    206242template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::neg(bitblock128_t arg1);
     
    210246template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::neg(bitblock128_t arg1);
    211247template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::neg(bitblock128_t arg1);
    212 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::slli(bitblock128_t arg1);
    213 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::slli(bitblock128_t arg1);
    214 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::slli(bitblock128_t arg1);
    215 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::slli(bitblock128_t arg1);
    216 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::slli(bitblock128_t arg1);
    217 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::slli(bitblock128_t arg1);
    218 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::slli(bitblock128_t arg1);
     248template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::slli(bitblock128_t arg1);
     249template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::slli(bitblock128_t arg1);
     250template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::slli(bitblock128_t arg1);
     251template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::slli(bitblock128_t arg1);
     252template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::slli(bitblock128_t arg1);
     253template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::slli(bitblock128_t arg1);
     254template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::slli(bitblock128_t arg1);
    219255template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    220256template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
     
    225261template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    226262template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    227 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::sub(bitblock128_t arg1, bitblock128_t arg2);
    228 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::sub(bitblock128_t arg1, bitblock128_t arg2);
    229 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::sub(bitblock128_t arg1, bitblock128_t arg2);
    230 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::sub(bitblock128_t arg1, bitblock128_t arg2);
    231 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::sub(bitblock128_t arg1, bitblock128_t arg2);
    232 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::sub(bitblock128_t arg1, bitblock128_t arg2);
    233 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sub(bitblock128_t arg1, bitblock128_t arg2);
    234 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sub(bitblock128_t arg1, bitblock128_t arg2);
     263template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
     264template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
     265template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
     266template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
     267template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
     268template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
     269template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
     270template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
     271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
    235272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add_hl(bitblock128_t arg1);
    236273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add_hl(bitblock128_t arg1);
     
    240277template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add_hl(bitblock128_t arg1);
    241278template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1);
    242 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    243279template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    244 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
    245 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
    246 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::constant();
    247 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::constant();
    248 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::constant();
    249 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::constant();
    250 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::constant();
    251 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::constant();
     280template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     281template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     282template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     283template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     284template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     285template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     286template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
     287template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
     288template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
     289template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
     290template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
     291template <> template <FieldType<4>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::constant();
     292template <> template <FieldType<8>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::constant();
     293template <> template <FieldType<16>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::constant();
     294template <> template <FieldType<32>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::constant();
     295template <> template <FieldType<64>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::constant();
     296template <> template <FieldType<128>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::constant();
    252297template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::min(bitblock128_t arg1, bitblock128_t arg2);
    253298template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::min(bitblock128_t arg1, bitblock128_t arg2);
     
    258303template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    259304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
    260 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    261 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    262 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    263 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    264 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    265 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    266 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    267305template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    268306template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    273311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    274312template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    275 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    276 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    277 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    278 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    279 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    280 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    281 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     313template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     314template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     315template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     316template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     317template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     318template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     319template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     320template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     321template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2);
     322template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lt(bitblock128_t arg1, bitblock128_t arg2);
     323template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lt(bitblock128_t arg1, bitblock128_t arg2);
     324template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lt(bitblock128_t arg1, bitblock128_t arg2);
     325template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lt(bitblock128_t arg1, bitblock128_t arg2);
     326template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lt(bitblock128_t arg1, bitblock128_t arg2);
     327template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lt(bitblock128_t arg1, bitblock128_t arg2);
     328template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lt(bitblock128_t arg1, bitblock128_t arg2);
    282329template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    283330template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     
    288335template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
    289336template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
    290 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
    291 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
    292 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
    293 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
    294 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
    295 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
    296 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    297337template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask();
    298338template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask();
     
    310350template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    311351template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    312 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    313 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    314 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    315 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    316 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    317 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    318 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    319 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     352template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     353template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     354template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     355template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     356template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     357template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     358template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
    320359template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    321360template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    339378template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<64>::packss(bitblock128_t arg1, bitblock128_t arg2);
    340379template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<128>::packss(bitblock128_t arg1, bitblock128_t arg2);
    341 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<4>::signmask(bitblock128_t arg1);
    342 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<8>::signmask(bitblock128_t arg1);
    343 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<16>::signmask(bitblock128_t arg1);
    344 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<32>::signmask(bitblock128_t arg1);
    345 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<64>::signmask(bitblock128_t arg1);
    346 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<128>::signmask(bitblock128_t arg1);
     380template <> IDISA_ALWAYS_INLINE FieldType<128/4>::T hsimd128<4>::signmask(bitblock128_t arg1);
     381template <> IDISA_ALWAYS_INLINE FieldType<128/8>::T hsimd128<8>::signmask(bitblock128_t arg1);
     382template <> IDISA_ALWAYS_INLINE FieldType<128/16>::T hsimd128<16>::signmask(bitblock128_t arg1);
     383template <> IDISA_ALWAYS_INLINE FieldType<128/32>::T hsimd128<32>::signmask(bitblock128_t arg1);
     384template <> IDISA_ALWAYS_INLINE FieldType<128/64>::T hsimd128<64>::signmask(bitblock128_t arg1);
     385template <> IDISA_ALWAYS_INLINE FieldType<128/128>::T hsimd128<128>::signmask(bitblock128_t arg1);
    347386template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::packl(bitblock128_t arg1, bitblock128_t arg2);
    348387template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::packl(bitblock128_t arg1, bitblock128_t arg2);
     
    418457template <> template <uint64_t msk> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::shufflei(bitblock128_t arg1);
    419458template <> template <uint64_t msk> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::shufflei(bitblock128_t arg1);
    420 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    421 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    422 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    423 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    424 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    425 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    426 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    427 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill(uint64_t val1);
    428 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill(uint64_t val1);
    429 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill(uint64_t val1);
    430 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill(uint64_t val1);
    431 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill(uint64_t val1);
    432 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill(uint64_t val1);
    433 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill(uint64_t val1);
    434 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::fill(uint64_t val1);
     459template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     460template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     461template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     462template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     463template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     464template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     465template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     466template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16);
     467template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16);
     468template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16);
     469template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16);
     470template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill(FieldType<1>::T val1);
     471template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill(FieldType<2>::T val1);
     472template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill(FieldType<4>::T val1);
     473template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill(FieldType<8>::T val1);
     474template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill(FieldType<16>::T val1);
     475template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill(FieldType<32>::T val1);
     476template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill(FieldType<64>::T val1);
     477template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::fill(FieldType<128>::T val1);
    435478template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::shuffle(bitblock128_t arg1, bitblock128_t arg2);
    436479template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::shuffle(bitblock128_t arg1, bitblock128_t arg2);
    437480template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::shuffle(bitblock128_t arg1, bitblock128_t arg2);
    438481template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::shuffle(bitblock128_t arg1, bitblock128_t arg2);
    439 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<1>::extract(bitblock128_t arg1);
    440 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<2>::extract(bitblock128_t arg1);
    441 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<4>::extract(bitblock128_t arg1);
    442 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<8>::extract(bitblock128_t arg1);
    443 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<16>::extract(bitblock128_t arg1);
    444 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<32>::extract(bitblock128_t arg1);
    445 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<64>::extract(bitblock128_t arg1);
    446 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::splat(bitblock128_t arg1);
    447 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::splat(bitblock128_t arg1);
    448 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::splat(bitblock128_t arg1);
    449 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::splat(bitblock128_t arg1);
    450 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::splat(bitblock128_t arg1);
    451 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::splat(bitblock128_t arg1);
    452 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::splat(bitblock128_t arg1);
    453 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::splat(bitblock128_t arg1);
    454 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    455 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    456 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    457 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    458 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    459 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    460 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    461 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    462 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    463 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    464 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::srli(bitblock128_t arg1);
    465 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::srli(bitblock128_t arg1);
    466 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::srli(bitblock128_t arg1);
    467 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::srli(bitblock128_t arg1);
    468 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::srli(bitblock128_t arg1);
    469 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::srli(bitblock128_t arg1);
    470 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::srli(bitblock128_t arg1);
    471 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill2(uint64_t val1, uint64_t val2);
    472 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill2(uint64_t val1, uint64_t val2);
    473 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill2(uint64_t val1, uint64_t val2);
    474 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill2(uint64_t val1, uint64_t val2);
    475 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill2(uint64_t val1, uint64_t val2);
    476 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill2(uint64_t val1, uint64_t val2);
    477 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill2(uint64_t val1, uint64_t val2);
    478 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    479 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    480 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    481 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    482 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    483 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    484 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    485 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
    486 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
    487 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
    488 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
    489 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
    490 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
    491 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
    492 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    493 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    494 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    495 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    496 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
     482template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd128<1>::extract(bitblock128_t arg1);
     483template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd128<2>::extract(bitblock128_t arg1);
     484template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd128<4>::extract(bitblock128_t arg1);
     485template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd128<8>::extract(bitblock128_t arg1);
     486template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd128<16>::extract(bitblock128_t arg1);
     487template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd128<32>::extract(bitblock128_t arg1);
     488template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd128<64>::extract(bitblock128_t arg1);
     489template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::splat(bitblock128_t arg1);
     490template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::splat(bitblock128_t arg1);
     491template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::splat(bitblock128_t arg1);
     492template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::splat(bitblock128_t arg1);
     493template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::splat(bitblock128_t arg1);
     494template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::splat(bitblock128_t arg1);
     495template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::splat(bitblock128_t arg1);
     496template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::splat(bitblock128_t arg1);
     497template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::insert(bitblock128_t arg1, FieldType<2>::T arg2);
     498template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::insert(bitblock128_t arg1, FieldType<4>::T arg2);
     499template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::insert(bitblock128_t arg1, FieldType<8>::T arg2);
     500template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::insert(bitblock128_t arg1, FieldType<16>::T arg2);
     501template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::insert(bitblock128_t arg1, FieldType<32>::T arg2);
     502template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::insert(bitblock128_t arg1, FieldType<64>::T arg2);
     503template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4);
     504template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4);
     505template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill4(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4);
     506template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill4(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4);
     507template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill4(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4);
     508template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill4(FieldType<32>::T val1, FieldType<32>::T val2, FieldType<32>::T val3, FieldType<32>::T val4);
     509template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::srli(bitblock128_t arg1);
     510template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::srli(bitblock128_t arg1);
     511template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::srli(bitblock128_t arg1);
     512template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::srli(bitblock128_t arg1);
     513template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::srli(bitblock128_t arg1);
     514template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::srli(bitblock128_t arg1);
     515template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::srli(bitblock128_t arg1);
     516template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill2(FieldType<1>::T val1, FieldType<1>::T val2);
     517template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill2(FieldType<2>::T val1, FieldType<2>::T val2);
     518template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill2(FieldType<4>::T val1, FieldType<4>::T val2);
     519template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill2(FieldType<8>::T val1, FieldType<8>::T val2);
     520template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill2(FieldType<16>::T val1, FieldType<16>::T val2);
     521template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill2(FieldType<32>::T val1, FieldType<32>::T val2);
     522template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill2(FieldType<64>::T val1, FieldType<64>::T val2);
     523template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     524template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     525template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     526template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     527template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     528template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     529template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     530template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
     531template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
     532template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
     533template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
     534template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
     535template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
     536template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
     537template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8);
     538template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8);
     539template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill8(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8);
     540template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill8(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8);
     541template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill8(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8);
    497542
    498543//Implementation Part
     
    519564
    520565//The total number of operations is 1.0
     // simd_or: bitwise OR of the two 128-bit blocks, implemented directly
     // with the SSE2 intrinsic _mm_or_si128.
     566IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2)
     567{
     568        return _mm_or_si128(arg1, arg2);
     569}
     570
     571//The total number of operations is 1.0
    521572IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
    522573{
     
    525576
    526577//The total number of operations is 1.0
    527 IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2)
    528 {
    529         return _mm_or_si128(arg1, arg2);
     // simd_and: bitwise AND of the two 128-bit blocks, implemented directly
     // with the SSE2 intrinsic _mm_and_si128.
     578IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2)
     579{
     580        return _mm_and_si128(arg1, arg2);
    530581}
    531582
     
    534585{
    535586        return _mm_xor_si128(arg1, arg2);
    536 }
    537 
    538 //The total number of operations is 1.0
    539 IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2)
    540 {
    541         return _mm_and_si128(arg1, arg2);
    542587}
    543588
     
    849894}
    850895
     896//The total number of operations is 6.0
     // simd128<2>::all: f0 is arg1 ANDed with its own 2-bit-field right shift
     // by one; the result ORs f0 with f0 shifted left by one within 2-bit fields.
     // NOTE(review): presumably this leaves each 2-bit field all-ones when both
     // of its bits were set and zero otherwise - confirm against simd128<2>::slli,
     // which is not visible in this excerpt.
     897template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::all(bitblock128_t arg1)
     898{
     899        bitblock128_t f0 = simd_and(arg1, simd128<2>::srli<1>(arg1));
     900        return simd_or(f0, simd128<2>::slli<1>(f0));
     901}
     902
     903//The total number of operations is 9.0
     // simd128<4>::all: field-wise equality (simd128<4>::eq) of arg1 against
     // simd128<8>::constant<255>().
     // NOTE(review): constant<255> with 8-bit fields is presumably an all-ones
     // block, making this an "every bit of the field is set" test - confirm.
     904template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::all(bitblock128_t arg1)
     905{
     906        return simd128<4>::eq(arg1, simd128<8>::constant<255>());
     907}
     908
     909//The total number of operations is 1.0
     // simd128<8>::all: field-wise equality (simd128<8>::eq) of arg1 against
     // simd128<8>::constant<255>().
     // NOTE(review): constant<255> with 8-bit fields is presumably an all-ones
     // block, making this an "every bit of the field is set" test - confirm.
     910template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::all(bitblock128_t arg1)
     911{
     912        return simd128<8>::eq(arg1, simd128<8>::constant<255>());
     913}
     914
     915//The total number of operations is 1.0
     // simd128<16>::all: field-wise equality (simd128<16>::eq) of arg1 against
     // simd128<8>::constant<255>().
     // NOTE(review): the 8-bit constant is reused for 16-bit fields, presumably
     // because it is an all-ones block regardless of field width - confirm.
     916template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::all(bitblock128_t arg1)
     917{
     918        return simd128<16>::eq(arg1, simd128<8>::constant<255>());
     919}
     920
     921//The total number of operations is 1.0
     // simd128<32>::all: field-wise equality (simd128<32>::eq) of arg1 against
     // simd128<8>::constant<255>().
     // NOTE(review): the 8-bit constant is reused for 32-bit fields, presumably
     // because it is an all-ones block regardless of field width - confirm.
     922template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::all(bitblock128_t arg1)
     923{
     924        return simd128<32>::eq(arg1, simd128<8>::constant<255>());
     925}
     926
     927//The total number of operations is 5.0
     // simd128<64>::all: field-wise equality (simd128<64>::eq) of arg1 against
     // simd128<8>::constant<255>().
     // NOTE(review): the 8-bit constant is reused for 64-bit fields, presumably
     // because it is an all-ones block regardless of field width - confirm.
     928template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::all(bitblock128_t arg1)
     929{
     930        return simd128<64>::eq(arg1, simd128<8>::constant<255>());
     931}
     932
     933//The total number of operations is 2.0
     // simd128<128>::all: whole-register variant. Uses the boolean result of
     // bitblock128::all(arg1) (defined elsewhere) to pick between
     // simd128<8>::constant<255>() and simd128<8>::constant<0>().
     934template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::all(bitblock128_t arg1)
     935{
     936        return ((bitblock128::all(arg1)) ? simd128<8>::constant<255>() : simd128<8>::constant<0>());
     937}
     938
     939//The total number of operations is 2.0
     // simd128<2>::srli<sh>: immediate right shift for 2-bit fields. There is no
     // native 2-bit shift, so the block is shifted with the 32-bit-field shift
     // and then ANDed with simd128<2>::constant<(3 >> sh)>().
     // NOTE(review): the mask presumably clears bits that crossed over from the
     // neighbouring field - confirm constant<> replicates its value per field.
     940template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srli(bitblock128_t arg1)
     941{
     942        return simd_and(simd128<32>::srli<sh>(arg1), simd128<2>::constant<((3)>>sh)>());
     943}
     944
     945//The total number of operations is 2.0
     // simd128<4>::srli<sh>: immediate right shift for 4-bit fields. The block
     // is shifted with the 32-bit-field shift and then ANDed with
     // simd128<4>::constant<(15 >> sh)>().
     // NOTE(review): the mask presumably clears bits that crossed over from the
     // neighbouring field - confirm constant<> replicates its value per field.
     946template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srli(bitblock128_t arg1)
     947{
     948        return simd_and(simd128<32>::srli<sh>(arg1), simd128<4>::constant<((15)>>sh)>());
     949}
     950
     951//The total number of operations is 2.0
     // simd128<8>::srli<sh>: immediate right shift for 8-bit fields. The block
     // is shifted with the 32-bit-field shift and then ANDed with
     // simd128<8>::constant<(255 >> sh)>().
     // NOTE(review): the mask presumably clears bits that crossed over from the
     // neighbouring byte - confirm constant<> replicates its value per field.
     952template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srli(bitblock128_t arg1)
     953{
     954        return simd_and(simd128<32>::srli<sh>(arg1), simd128<8>::constant<((255)>>sh)>());
     955}
     956
     957//The total number of operations is 1.0
     // simd128<16>::srli<sh>: logical right shift of every 16-bit lane by the
     // compile-time count sh, mapped directly onto _mm_srli_epi16.
     958template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srli(bitblock128_t arg1)
     959{
     960        return _mm_srli_epi16(arg1, (int32_t)(sh));
     961}
     962
     963//The total number of operations is 1.0
     // simd128<32>::srli<sh>: logical right shift of every 32-bit lane by the
     // compile-time count sh, mapped directly onto _mm_srli_epi32.
     964template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srli(bitblock128_t arg1)
     965{
     966        return _mm_srli_epi32(arg1, (int32_t)(sh));
     967}
     968
     969//The total number of operations is 1.0
     // simd128<64>::srli<sh>: logical right shift of every 64-bit lane by the
     // compile-time count sh, mapped directly onto _mm_srli_epi64.
     970template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srli(bitblock128_t arg1)
     971{
     972        return _mm_srli_epi64(arg1, (int32_t)(sh));
     973}
     974
     975//The total number of operations is 2.33333333333
     // simd128<128>::srli<sh>: right shift of the whole 128-bit register by the
     // compile-time count sh. Three cases are chosen with nested conditionals:
     //  - sh is a multiple of 8: a single byte shift, _mm_srli_si128 by sh/8;
     //  - sh >= 64: move the register down 8 bytes, then shift the 64-bit
     //    lanes right by sh & 63;
     //  - otherwise: OR the 64-bit-lane right shift by sh with the bits obtained
     //    by shifting the lanes left by (128 - sh) & 63 and moving that result
     //    down 8 bytes (the bits that cross from the upper into the lower lane).
     976template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srli(bitblock128_t arg1)
     977{
     978        return (((sh%8) == 0) ? _mm_srli_si128(arg1, (int32_t)((sh/8))) : ((sh >= 64) ? simd128<64>::srli<(sh&63)>(_mm_srli_si128(arg1, (int32_t)(8))) : simd_or(simd128<64>::srli<sh>(arg1), _mm_srli_si128(simd128<64>::slli<((128-sh)&63)>(arg1), (int32_t)(8)))));
     979}
     980
     981//The total number of operations is 1.0
     // simd128<1>::ctz: for 1-bit fields the result is simply simd_not(arg1),
     // i.e. a field yields 1 exactly when its single bit is 0.
     982template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ctz(bitblock128_t arg1)
     983{
     984        return simd_not(arg1);
     985}
     986
     987//The total number of operations is 10.6666666667
     // simd128<2>::ctz: with tmp = simd_not(arg1), combines two candidate blocks
     // through simd128<1>::ifh using simd128<2>::himask() as the selector:
     //   - tmp AND (tmp shifted left one bit across the whole register)
     //   - (arg1 shifted right one bit across the whole register) AND tmp
     // NOTE(review): presumably ifh takes the first candidate where the mask bit
     // is set (the high bit of each 2-bit field) and the second elsewhere -
     // confirm against the ifh definition, which is not in this excerpt.
     988template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ctz(bitblock128_t arg1)
     989{
     990        bitblock128_t tmp = simd_not(arg1);
     991        return simd128<1>::ifh(simd128<2>::himask(), simd_and(tmp, simd128<128>::slli<1>(tmp)), simd_and(simd128<128>::srli<1>(arg1), tmp));
     992}
     993
     994//The total number of operations is 14.0
     // simd128<4>::ctz: per 4-bit field, subtracts the constant 1 from arg1,
     // combines that with arg1 through simd_andc, and takes the field-wise
     // popcount of the result.
     // NOTE(review): assuming simd_andc(a, b) computes a AND NOT b, this is the
     // (x - 1) & ~x trailing-zero mask whose popcount is the zero count - confirm.
     995template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::ctz(bitblock128_t arg1)
     996{
     997        return simd128<4>::popcount(simd_andc(simd128<4>::sub(arg1, simd128<4>::constant<1>()), arg1));
     998}
     999
     1000//The total number of operations is 13.0
     // simd128<8>::ctz: per 8-bit field, subtracts the constant 1 from arg1,
     // combines that with arg1 through simd_andc, and takes the field-wise
     // popcount of the result.
     // NOTE(review): assuming simd_andc(a, b) computes a AND NOT b, this is the
     // (x - 1) & ~x trailing-zero mask whose popcount is the zero count - confirm.
     1001template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::ctz(bitblock128_t arg1)
     1002{
     1003        return simd128<8>::popcount(simd_andc(simd128<8>::sub(arg1, simd128<8>::constant<1>()), arg1));
     1004}
     1005
     1006//The total number of operations is 16.0
     // simd128<16>::ctz: per 16-bit field, subtracts the constant 1 from arg1,
     // combines that with arg1 through simd_andc, and takes the field-wise
     // popcount of the result.
     // NOTE(review): assuming simd_andc(a, b) computes a AND NOT b, this is the
     // (x - 1) & ~x trailing-zero mask whose popcount is the zero count - confirm.
     1007template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::ctz(bitblock128_t arg1)
     1008{
     1009        return simd128<16>::popcount(simd_andc(simd128<16>::sub(arg1, simd128<16>::constant<1>()), arg1));
     1010}
     1011
     1012//The total number of operations is 19.0
     // simd128<32>::ctz: per 32-bit field, subtracts the constant 1 from arg1,
     // combines that with arg1 through simd_andc, and takes the field-wise
     // popcount of the result.
     // NOTE(review): assuming simd_andc(a, b) computes a AND NOT b, this is the
     // (x - 1) & ~x trailing-zero mask whose popcount is the zero count - confirm.
     1013template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::ctz(bitblock128_t arg1)
     1014{
     1015        return simd128<32>::popcount(simd_andc(simd128<32>::sub(arg1, simd128<32>::constant<1>()), arg1));
     1016}
     1017
     1018//The total number of operations is 14.0
     // simd128<64>::ctz: per 64-bit field, subtracts the constant 1 from arg1,
     // combines that with arg1 through simd_andc, and takes the field-wise
     // popcount of the result.
     // NOTE(review): assuming simd_andc(a, b) computes a AND NOT b, this is the
     // (x - 1) & ~x trailing-zero mask whose popcount is the zero count - confirm.
     1019template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ctz(bitblock128_t arg1)
     1020{
     1021        return simd128<64>::popcount(simd_andc(simd128<64>::sub(arg1, simd128<64>::constant<1>()), arg1));
     1022}
     1023
     1024//The total number of operations is 26.6666666667
     // simd128<128>::ctz: whole-register variant. Subtracts the 128-bit constant
     // 1 from arg1, combines that with arg1 through simd_andc, and takes the
     // 128-bit popcount of the result.
     // NOTE(review): assuming simd_andc(a, b) computes a AND NOT b, this is the
     // (x - 1) & ~x trailing-zero mask whose popcount is the zero count - confirm.
     1025template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ctz(bitblock128_t arg1)
     1026{
     1027        return simd128<128>::popcount(simd_andc(simd128<128>::sub(arg1, simd128<128>::constant<1>()), arg1));
     1028}
     1029
     1030//The total number of operations is 13.0
     // simd128<128>::sll: left shift of the whole 128-bit register by a run-time
     // count taken from shift_mask.
     // The count is first reduced to the low 32 bits of shift_mask: the AND mask
     // is _mm_cvtsi32_si128 of (2^32 - 1) cast to int32_t.
     // The result ORs three terms:
     //   - each 64-bit lane shifted left by shift;
     //   - the lanes shifted left by (shift - 64), moved up 8 bytes;
     //   - the lanes shifted right by (64 - shift), moved up 8 bytes.
     // The (shift - 64) and (64 - shift) counts are formed with 32-bit
     // subtraction (simd128<32>::sub).
     // NOTE(review): presumably at most one of the last two terms is non-zero
     // for a given count, because the 64-bit shift intrinsics yield zero for
     // counts above 63 - confirm the intended behaviour for shift >= 128.
     1031template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sll(bitblock128_t arg1, bitblock128_t shift_mask)
     1032{
     1033        bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(((4294967296ULL)-1))));
     1034        return simd_or(_mm_sll_epi64(arg1, shift), simd_or(_mm_slli_si128(_mm_sll_epi64(arg1, simd128<32>::sub(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_slli_si128(_mm_srl_epi64(arg1, simd128<32>::sub(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
     1035}
     1036
     1037//The total number of operations is 1.0
     1038template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::sub(bitblock128_t arg1, bitblock128_t arg2)
     1039{
     1040        return simd_xor(arg1, arg2);
     1041}
     1042
     1043//The total number of operations is 9.33333333333
     1044template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::sub(bitblock128_t arg1, bitblock128_t arg2)
     1045{
     1046        bitblock128_t tmp = simd_xor(arg1, arg2);
     1047        return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(simd_not(arg1), arg2))), tmp);
     1048}
     1049
     1050//The total number of operations is 6.0
     1051template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::sub(bitblock128_t arg1, bitblock128_t arg2)
     1052{
     1053        return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::sub(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::sub(arg1, arg2));
     1054}
     1055
     1056//The total number of operations is 1.0
     1057template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::sub(bitblock128_t arg1, bitblock128_t arg2)
     1058{
     1059        return _mm_sub_epi8(arg1, arg2);
     1060}
     1061
     1062//The total number of operations is 1.0
     1063template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::sub(bitblock128_t arg1, bitblock128_t arg2)
     1064{
     1065        return _mm_sub_epi16(arg1, arg2);
     1066}
     1067
     1068//The total number of operations is 1.0
     1069template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::sub(bitblock128_t arg1, bitblock128_t arg2)
     1070{
     1071        return _mm_sub_epi32(arg1, arg2);
     1072}
     1073
     1074//The total number of operations is 1.0
     1075template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sub(bitblock128_t arg1, bitblock128_t arg2)
     1076{
     1077        return _mm_sub_epi64(arg1, arg2);
     1078}
     1079
     1080//The total number of operations is 9.33333333333
     1081template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sub(bitblock128_t arg1, bitblock128_t arg2)
     1082{
     1083        bitblock128_t partial = simd128<(64)>::sub(arg1, arg2);
     1084        bitblock128_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_andc(partial, simd_xor(arg1, arg2)));
     1085        bitblock128_t borrow = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(borrowMask));
     1086        return simd128<(64)>::sub(partial, borrow);
     1087}
     1088
     1089//The total number of operations is 1.0
     1090template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     1091{
     1092        return simd_andc(arg1, arg2);
     1093}
     1094
     1095//The total number of operations is 13.6666666667
     1096template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     1097{
     1098        bitblock128_t tmp = simd_not(arg2);
     1099        bitblock128_t tmpAns = simd_or(simd_and(arg1, tmp), simd_and(simd128<128>::slli<1>(simd_and(arg1, tmp)), simd_or(arg1, tmp)));
     1100        return simd128<1>::ifh(simd128<2>::himask(), tmpAns, simd128<128>::srli<1>(tmpAns));
     1101}
     1102
     1103//The total number of operations is 12.0
     1104template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     1105{
     1106        return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::ugt(simd_and(simd128<(8)>::himask(), arg1), arg2), simd128<(8)>::ugt(simd_andc(arg1, simd128<(8)>::himask()), simd_andc(arg2, simd128<(8)>::himask())));
     1107}
     1108
     1109//The total number of operations is 3.0
     1110template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     1111{
     1112        bitblock128_t high_bit = simd128<8>::constant<(128)>();
     1113        return simd128<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1114}
     1115
     1116//The total number of operations is 3.0
     1117template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     1118{
     1119        bitblock128_t high_bit = simd128<16>::constant<(32768)>();
     1120        return simd128<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1121}
     1122
     1123//The total number of operations is 3.0
     1124template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     1125{
     1126        bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
     1127        return simd128<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1128}
     1129
     1130//The total number of operations is 13.5
     1131template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     1132{
     1133        bitblock128_t tmpAns = simd128<(32)>::ugt(arg1, arg2);
     1134        bitblock128_t mask = simd_and(tmpAns, simd128<64>::srli<(32)>(simd128<(32)>::eq(arg1, arg2)));
     1135        mask = simd_or(mask, simd128<64>::slli<(32)>(mask));
     1136        return simd_or(simd128<64>::srai<(32)>(tmpAns), mask);
     1137}
     1138
     1139//The total number of operations is 37.25
     1140template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     1141{
     1142        bitblock128_t tmpAns = simd128<(64)>::ugt(arg1, arg2);
     1143        bitblock128_t mask = simd_and(tmpAns, simd128<128>::srli<(64)>(simd128<(64)>::eq(arg1, arg2)));
     1144        mask = simd_or(mask, simd128<128>::slli<(64)>(mask));
     1145        return simd_or(simd128<128>::srai<(64)>(tmpAns), mask);
     1146}
     1147
     1148//The total number of operations is 4.0
     1149template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::xor_hl(bitblock128_t arg1)
     1150{
     1151        return simd_xor(simd128<2>::srli<(1)>(arg1), simd_and(arg1, simd128<2>::lomask()));
     1152}
     1153
     1154//The total number of operations is 4.0
     1155template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::xor_hl(bitblock128_t arg1)
     1156{
     1157        return simd_xor(simd128<4>::srli<(2)>(arg1), simd_and(arg1, simd128<4>::lomask()));
     1158}
     1159
     1160//The total number of operations is 4.0
     1161template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::xor_hl(bitblock128_t arg1)
     1162{
     1163        return simd_xor(simd128<8>::srli<(4)>(arg1), simd_and(arg1, simd128<8>::lomask()));
     1164}
     1165
     1166//The total number of operations is 3.0
     1167template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::xor_hl(bitblock128_t arg1)
     1168{
     1169        return simd_xor(simd128<16>::srli<(8)>(arg1), simd_and(arg1, simd128<16>::lomask()));
     1170}
     1171
     1172//The total number of operations is 3.0
     1173template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::xor_hl(bitblock128_t arg1)
     1174{
     1175        return simd_xor(simd128<32>::srli<(16)>(arg1), simd_and(arg1, simd128<32>::lomask()));
     1176}
     1177
     1178//The total number of operations is 3.0
     1179template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::xor_hl(bitblock128_t arg1)
     1180{
     1181        return simd_xor(simd128<64>::srli<(32)>(arg1), simd_and(arg1, simd128<64>::lomask()));
     1182}
     1183
     1184//The total number of operations is 4.33333333333
     1185template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::xor_hl(bitblock128_t arg1)
     1186{
     1187        return simd_xor(simd128<128>::srli<(64)>(arg1), simd_and(arg1, simd128<128>::lomask()));
     1188}
     1189
     1190//The total number of operations is 0
     1191template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::popcount(bitblock128_t arg1)
     1192{
     1193        return arg1;
     1194}
     1195
     1196//The total number of operations is 3.0
     1197template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::popcount(bitblock128_t arg1)
     1198{
     1199        return simd128<2>::add_hl(simd128<(1)>::popcount(arg1));
     1200}
     1201
     1202//The total number of operations is 7.0
     1203template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::popcount(bitblock128_t arg1)
     1204{
     1205        return simd128<4>::add_hl(simd128<(2)>::popcount(arg1));
     1206}
     1207
     1208//The total number of operations is 11.0
     1209template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::popcount(bitblock128_t arg1)
     1210{
     1211        return simd128<8>::add_hl(simd128<(4)>::popcount(arg1));
     1212}
     1213
     1214//The total number of operations is 14.0
     1215template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::popcount(bitblock128_t arg1)
     1216{
     1217        return simd128<16>::add_hl(simd128<(8)>::popcount(arg1));
     1218}
     1219
     1220//The total number of operations is 17.0
     1221template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::popcount(bitblock128_t arg1)
     1222{
     1223        return simd128<32>::add_hl(simd128<(16)>::popcount(arg1));
     1224}
     1225
     1226//The total number of operations is 12.0
     1227template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::popcount(bitblock128_t arg1)
     1228{
     1229        return _mm_sad_epu8(simd128<8>::popcount(arg1), simd128<8>::constant<0>());
     1230}
     1231
     1232//The total number of operations is 16.3333333333
     1233template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::popcount(bitblock128_t arg1)
     1234{
     1235        bitblock128_t tmpAns = simd128<(64)>::popcount(arg1);
     1236        return simd128<(64)>::add(simd_and(tmpAns, simd128<128>::lomask()), simd128<128>::srli<(64)>(tmpAns));
     1237}
     1238
     1239//The total number of operations is 8.0
     1240template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::any(bitblock128_t arg1)
     1241{
     1242        bitblock128_t t0 = simd128<2>::srli<1>(arg1);
     1243        bitblock128_t f0 = simd_or(t0, simd_and(arg1, simd_xor(t0, simd128<8>::constant<255>())));
     1244        return simd_or(f0, simd128<2>::slli<1>(f0));
     1245}
     1246
     1247//The total number of operations is 12.0
     1248template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::any(bitblock128_t arg1)
     1249{
     1250        return simd128<4>::ugt(arg1, simd128<8>::constant<0>());
     1251}
     1252
     1253//The total number of operations is 3.0
     1254template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::any(bitblock128_t arg1)
     1255{
     1256        return simd128<8>::ugt(arg1, simd128<8>::constant<0>());
     1257}
     1258
     1259//The total number of operations is 3.0
     1260template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::any(bitblock128_t arg1)
     1261{
     1262        return simd128<16>::ugt(arg1, simd128<8>::constant<0>());
     1263}
     1264
     1265//The total number of operations is 3.0
     1266template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::any(bitblock128_t arg1)
     1267{
     1268        return simd128<32>::ugt(arg1, simd128<8>::constant<0>());
     1269}
     1270
     1271//The total number of operations is 13.5
     1272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::any(bitblock128_t arg1)
     1273{
     1274        return simd128<64>::ugt(arg1, simd128<8>::constant<0>());
     1275}
     1276
     1277//The total number of operations is 2.0
     1278template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::any(bitblock128_t arg1)
     1279{
     1280        return ((bitblock128::any(arg1)) ? simd128<8>::constant<255>() : simd128<8>::constant<0>());
     1281}
     1282
     1283//The total number of operations is 6.33333333333
     1284template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::neg(bitblock128_t arg1)
     1285{
     1286        return simd128<1>::ifh(simd128<2>::himask(), simd_xor(arg1, simd128<128>::slli<1>(arg1)), arg1);
     1287}
     1288
     1289//The total number of operations is 6.0
     1290template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::neg(bitblock128_t arg1)
     1291{
     1292        return simd128<4>::sub(simd128<4>::constant<0>(), arg1);
     1293}
     1294
     1295//The total number of operations is 1.0
     1296template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::neg(bitblock128_t arg1)
     1297{
     1298        return simd128<8>::sub(simd128<8>::constant<0>(), arg1);
     1299}
     1300
     1301//The total number of operations is 1.0
     1302template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::neg(bitblock128_t arg1)
     1303{
     1304        return simd128<16>::sub(simd128<16>::constant<0>(), arg1);
     1305}
     1306
     1307//The total number of operations is 1.0
     1308template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::neg(bitblock128_t arg1)
     1309{
     1310        return _mm_sign_epi32(arg1, simd128<32>::constant<-1>());
     1311}
     1312
     1313//The total number of operations is 1.0
     1314template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::neg(bitblock128_t arg1)
     1315{
     1316        return simd128<64>::sub(simd128<64>::constant<0>(), arg1);
     1317}
     1318
     1319//The total number of operations is 9.33333333333
     1320template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::neg(bitblock128_t arg1)
     1321{
     1322        return simd128<128>::sub(simd128<128>::constant<0>(), arg1);
     1323}
     1324
     1325//The total number of operations is 2.0
     1326template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::slli(bitblock128_t arg1)
     1327{
     1328        return simd_and(simd128<32>::slli<sh>(arg1), simd128<2>::constant<(((3)<<sh)&(3))>());
     1329}
     1330
     1331//The total number of operations is 2.0
     1332template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::slli(bitblock128_t arg1)
     1333{
     1334        return simd_and(simd128<32>::slli<sh>(arg1), simd128<4>::constant<(((15)<<sh)&(15))>());
     1335}
     1336
     1337//The total number of operations is 2.0
     1338template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::slli(bitblock128_t arg1)
     1339{
     1340        return simd_and(simd128<32>::slli<sh>(arg1), simd128<8>::constant<(((255)<<sh)&(255))>());
     1341}
     1342
     1343//The total number of operations is 1.0
     1344template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::slli(bitblock128_t arg1)
     1345{
     1346        return _mm_slli_epi16(arg1, (int32_t)(sh));
     1347}
     1348
     1349//The total number of operations is 1.0
     1350template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::slli(bitblock128_t arg1)
     1351{
     1352        return _mm_slli_epi32(arg1, (int32_t)(sh));
     1353}
     1354
     1355//The total number of operations is 1.0
     1356template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::slli(bitblock128_t arg1)
     1357{
     1358        return _mm_slli_epi64(arg1, (int32_t)(sh));
     1359}
     1360
     1361//The total number of operations is 2.33333333333
     1362template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::slli(bitblock128_t arg1)
     1363{
     1364        return (((sh%8) == 0) ? _mm_slli_si128(arg1, (int32_t)((sh/8))) : ((sh >= 64) ? simd128<64>::slli<(sh&63)>(_mm_slli_si128(arg1, (int32_t)(8))) : simd_or(simd128<64>::slli<sh>(arg1), _mm_slli_si128(simd128<64>::srli<((128-sh)&63)>(arg1), (int32_t)(8)))));
     1365}
     1366
     1367//The total number of operations is 3.0
     1368template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1369{
     1370        return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
     1371}
     1372
     1373//The total number of operations is 8.0
     1374template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1375{
     1376        return simd128<(1)>::ifh(simd128<1>::ifh(simd128<2>::himask(), arg1, simd128<2>::srli<(1)>(arg1)), arg2, arg3);
     1377}
     1378
     1379//The total number of operations is 13.0
     1380template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1381{
     1382        return simd128<1>::ifh(simd128<4>::gt(simd128<4>::constant<0>(), arg1), arg2, arg3);
     1383}
     1384
     1385//The total number of operations is 4.0
     1386template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1387{
     1388        return simd128<1>::ifh(simd128<8>::gt(simd128<8>::constant<0>(), arg1), arg2, arg3);
     1389}
     1390
     1391//The total number of operations is 4.0
     1392template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1393{
     1394        return simd128<1>::ifh(simd128<16>::gt(simd128<16>::constant<0>(), arg1), arg2, arg3);
     1395}
     1396
     1397//The total number of operations is 4.0
     1398template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1399{
     1400        return simd128<1>::ifh(simd128<32>::gt(simd128<32>::constant<0>(), arg1), arg2, arg3);
     1401}
     1402
     1403//The total number of operations is 8.0
     1404template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1405{
     1406        return simd128<(32)>::ifh(simd128<1>::ifh(simd128<64>::himask(), arg1, simd128<64>::srli<(32)>(arg1)), arg2, arg3);
     1407}
     1408
     1409//The total number of operations is 13.3333333333
     1410template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1411{
     1412        return simd128<(64)>::ifh(simd128<1>::ifh(simd128<128>::himask(), arg1, simd128<128>::srli<(64)>(arg1)), arg2, arg3);
     1413}
     1414
     1415//The total number of operations is 4.0
     1416template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
     1417{
     1418        return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
     1419}
     1420
     1421//The total number of operations is 10.0
     1422template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
     1423{
     1424        bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1425        return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1426}
     1427
     1428//The total number of operations is 5.0
     1429template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
     1430{
     1431        bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1432        return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1433}
     1434
     1435//The total number of operations is 1.0
     1436template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
     1437{
     1438        return _mm_srai_epi16(arg1, (int32_t)(sh));
     1439}
     1440
     1441//The total number of operations is 1.0
     1442template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
     1443{
     1444        return _mm_srai_epi32(arg1, (int32_t)(sh));
     1445}
     1446
     1447//The total number of operations is 4.5
     1448template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
     1449{
     1450        return simd_or(simd_and(simd128<64>::himask(), simd128<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd128<64>::srli<sh>(arg1) : simd128<(32)>::srai<(sh-(32))>(simd128<64>::srli<(32)>(arg1))));
     1451}
     1452
     1453//The total number of operations is 11.0833333333
     1454template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
     1455{
     1456        return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
     1457}
     1458
     1459//The total number of operations is 10.0
     1460template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask)
     1461{
     1462        return simd128<1>::ifh(simd128<128>::himask(), _mm_srl_epi64(arg1, simd_and(_mm_srli_si128(shift_mask, (int32_t)(8)), _mm_cvtsi32_si128((int32_t)(63)))), _mm_srl_epi64(arg1, simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(63)))));
     1463}
     1464
     1465//The total number of operations is 13.0
     1466template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask)
     1467{
     1468        bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(127)));
     1469        return simd_or(_mm_srl_epi64(arg1, shift), simd_or(_mm_srli_si128(_mm_srl_epi64(arg1, simd128<32>::sub(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_srli_si128(_mm_sll_epi64(arg1, simd128<32>::sub(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
     1470}
     1471
     1472//The total number of operations is 3.0
     1473template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add_hl(bitblock128_t arg1)
     1474{
     1475        return simd128<16>::sub(arg1, simd_and(simd128<2>::lomask(), simd128<16>::srli<1>(arg1)));
     1476}
     1477
     1478//The total number of operations is 4.0
     1479template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add_hl(bitblock128_t arg1)
     1480{
     1481        return simd128<(8)>::add(simd128<4>::srli<(2)>(arg1), simd_and(arg1, simd128<4>::lomask()));
     1482}
     1483
     1484//The total number of operations is 4.0
     1485template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add_hl(bitblock128_t arg1)
     1486{
     1487        return simd128<(16)>::add(simd128<8>::srli<(4)>(arg1), simd_and(arg1, simd128<8>::lomask()));
     1488}
     1489
     1490//The total number of operations is 3.0
     1491template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add_hl(bitblock128_t arg1)
     1492{
     1493        return simd128<(32)>::add(simd128<16>::srli<(8)>(arg1), simd_and(arg1, simd128<16>::lomask()));
     1494}
     1495
     1496//The total number of operations is 3.0
     1497template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add_hl(bitblock128_t arg1)
     1498{
     1499        return simd128<(64)>::add(simd128<32>::srli<(16)>(arg1), simd_and(arg1, simd128<32>::lomask()));
     1500}
     1501
     1502//The total number of operations is 3.0
     1503template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add_hl(bitblock128_t arg1)
     1504{
     1505        return simd128<64>::add(simd128<64>::srli<(32)>(arg1), simd_and(arg1, simd128<64>::lomask()));
     1506}
     1507
     1508//The total number of operations is 12.6666666667
     1509template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1)
     1510{
     1511        return simd128<128>::add(simd128<128>::srli<(64)>(arg1), simd_and(arg1, simd128<128>::lomask()));
     1512}
     1513
     1514//The total number of operations is 13.0
     1515template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask)
     1516{
     1517        bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(((4294967296ULL)-1))));
     1518        return simd_or(_mm_srl_epi64(arg1, shift), simd_or(_mm_srli_si128(_mm_srl_epi64(arg1, simd128<32>::sub(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_srli_si128(_mm_sll_epi64(arg1, simd128<32>::sub(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
     1519}
     1520
     1521//The total number of operations is 0
     1522template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
     1523{
     1524        return simd128<2>::constant<(1)>();
     1525}
     1526
     1527//The total number of operations is 0
     1528template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
     1529{
     1530        return simd128<4>::constant<(3)>();
     1531}
     1532
     1533//The total number of operations is 0
     1534template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
     1535{
     1536        return simd128<8>::constant<(15)>();
     1537}
     1538
     1539//The total number of operations is 0
     1540template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
     1541{
     1542        return simd128<16>::constant<(255)>();
     1543}
     1544
     1545//The total number of operations is 0
     1546template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
     1547{
     1548        return simd128<32>::constant<(65535)>();
     1549}
     1550
     1551//The total number of operations is 0
     1552template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
     1553{
     1554        return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
     1555}
     1556
     1557//The total number of operations is 0
     1558template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
     1559{
     1560        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
     1561}
     1562
     1563//The total number of operations is 10.0
     1564template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsll(bitblock128_t arg1, bitblock128_t shift_mask)
     1565{
     1566        return simd128<1>::ifh(simd128<128>::himask(), _mm_sll_epi64(arg1, simd_and(_mm_srli_si128(shift_mask, (int32_t)(8)), _mm_cvtsi32_si128((int32_t)(63)))), _mm_sll_epi64(arg1, simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(63)))));
     1567}
     1568
     1569//The total number of operations is 13.0
     1570template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsll(bitblock128_t arg1, bitblock128_t shift_mask)
     1571{
     1572        bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(127)));
     1573        return simd_or(_mm_sll_epi64(arg1, shift), simd_or(_mm_slli_si128(_mm_sll_epi64(arg1, simd128<32>::sub(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_slli_si128(_mm_srl_epi64(arg1, simd128<32>::sub(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
     1574}
     1575
     1576//The total number of operations is 0
     1577template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
     1578{
     1579        return simd128<32>::constant<(-1*val)>();
     1580}
     1581
     1582//The total number of operations is 0
     1583template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant()
     1584{
     1585        return ((val < 0) ? simd128<(4)>::constant<((val<<2)|(val^(-4)))>() : simd128<(4)>::constant<((val<<2)|val)>());
     1586}
     1587
     1588//The total number of operations is 0
     1589template <> template <FieldType<4>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::constant()
     1590{
     1591        return ((val < 0) ? simd128<(8)>::constant<((val<<4)|(val^(-16)))>() : simd128<(8)>::constant<((val<<4)|val)>());
     1592}
     1593
     1594//The total number of operations is 0
     1595template <> template <FieldType<8>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::constant()
     1596{
     1597        return _mm_set1_epi8((int32_t)(val));
     1598}
     1599
     1600//The total number of operations is 0
     1601template <> template <FieldType<16>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::constant()
     1602{
     1603        return _mm_set1_epi16((int32_t)(val));
     1604}
     1605
     1606//The total number of operations is 0
     1607template <> template <FieldType<32>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::constant()
     1608{
     1609        return _mm_set1_epi32((int32_t)(val));
     1610}
     1611
     1612//The total number of operations is 0
     1613template <> template <FieldType<64>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::constant()
     1614{
     1615        return _mm_set_epi32((int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val));
     1616}
     1617
     1618//The total number of operations is 0
     1619template <> template <FieldType<128>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::constant()
     1620{
     1621        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val));
     1622}
     1623
     1624//The total number of operations is 1.0
     1625template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::min(bitblock128_t arg1, bitblock128_t arg2)
     1626{
     1627        return simd_or(arg1, arg2);
     1628}
     1629
     1630//The total number of operations is 16.6666666667
     1631template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::min(bitblock128_t arg1, bitblock128_t arg2)
     1632{
     1633        bitblock128_t tmp1 = simd128<128>::srli<1>(arg1);
     1634        bitblock128_t tmp2 = simd128<128>::srli<1>(arg2);
     1635        return simd128<1>::ifh(simd128<2>::himask(), simd_or(arg1, arg2), simd_or(simd_and(arg1, simd_and(tmp1, simd_not(tmp2))), simd_and(arg2, simd_or(simd_and(simd_not(tmp1), tmp2), arg1))));
     1636}
     1637
     1638//The total number of operations is 9.0
     1639template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::min(bitblock128_t arg1, bitblock128_t arg2)
     1640{
     1641        bitblock128_t high_bit = simd128<4>::constant<(8)>();
     1642        return simd_xor(simd128<4>::umin(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1643}
     1644
     1645//The total number of operations is 4.0
     1646template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::min(bitblock128_t arg1, bitblock128_t arg2)
     1647{
     1648        bitblock128_t high_bit = simd128<8>::constant<(128)>();
     1649        return simd_xor(simd128<8>::umin(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1650}
     1651
     1652//The total number of operations is 1.0
     1653template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::min(bitblock128_t arg1, bitblock128_t arg2)
     1654{
     1655        return _mm_min_epi16(arg1, arg2);
     1656}
     1657
     1658//The total number of operations is 4.0
     1659template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::min(bitblock128_t arg1, bitblock128_t arg2)
     1660{
     1661        return simd128<1>::ifh(simd128<32>::gt(arg1, arg2), arg2, arg1);
     1662}
     1663
     1664//The total number of operations is 17.5
     1665template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2)
     1666{
     1667        return simd128<1>::ifh(simd128<64>::gt(arg1, arg2), arg2, arg1);
     1668}
     1669
     1670//The total number of operations is 54.75
     1671template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2)
     1672{
     1673        return simd128<1>::ifh(simd128<128>::gt(arg1, arg2), arg2, arg1);
     1674}
     1675
     1676//The total number of operations is 1.0
     1677template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1678{
     1679        return simd_and(arg1, arg2);
     1680}
     1681
     1682//The total number of operations is 16.0
     1683template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1684{
     1685        return simd_or(simd_and(simd128<(4)>::himask(), simd128<(4)>::umin(arg1, arg2)), simd128<(4)>::umin(simd_and(simd128<(4)>::lomask(), arg1), simd_and(simd128<(4)>::lomask(), arg2)));
     1686}
     1687
     1688//The total number of operations is 6.0
     1689template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1690{
     1691        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::umin(arg1, arg2)), simd128<(8)>::umin(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2)));
     1692}
     1693
     1694//The total number of operations is 1.0
     1695template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1696{
     1697        return _mm_min_epu8(arg1, arg2);
     1698}
     1699
     1700//The total number of operations is 4.0
     1701template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1702{
     1703        bitblock128_t high_bit = simd128<16>::constant<(32768)>();
     1704        return simd_xor(simd128<16>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1705}
     1706
     1707//The total number of operations is 7.0
     1708template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1709{
     1710        bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
     1711        return simd_xor(simd128<32>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1712}
     1713
     1714//The total number of operations is 20.0
     1715template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1716{
     1717        bitblock128_t tmpAns = simd128<(32)>::umin(arg1, arg2);
     1718        bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
     1719        bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
     1720        return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1721}
     1722
     1723//The total number of operations is 43.6666666667
     1724template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1725{
     1726        bitblock128_t tmpAns = simd128<(64)>::umin(arg1, arg2);
     1727        bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
     1728        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
     1729        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1730}
     1731
     1732//The total number of operations is 1.0
     1733template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1734{
     1735        return simd_or(arg1, arg2);
     1736}
     1737
     1738//The total number of operations is 15.6666666667
     1739template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1740{
     1741        return simd128<1>::ifh(simd128<2>::himask(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd128<128>::srli<1>(simd_or(simd_not(arg1), arg2))), simd_and(arg1, simd128<128>::srli<1>(simd_or(arg1, simd_not(arg2))))));
     1742}
     1743
     1744//The total number of operations is 6.0
     1745template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1746{
     1747        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::umax(arg1, arg2)), simd128<(8)>::umax(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2)));
     1748}
     1749
     1750//The total number of operations is 1.0
     1751template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1752{
     1753        return _mm_max_epu8(arg1, arg2);
     1754}
     1755
     1756//The total number of operations is 4.0
     1757template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1758{
     1759        bitblock128_t high_bit = simd128<16>::constant<(32768)>();
     1760        return simd_xor(simd128<16>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1761}
     1762
     1763//The total number of operations is 7.0
     1764template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1765{
     1766        bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
     1767        return simd_xor(simd128<32>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1768}
     1769
     1770//The total number of operations is 20.0
     1771template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1772{
     1773        bitblock128_t tmpAns = simd128<(32)>::umax(arg1, arg2);
     1774        bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
     1775        bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
     1776        return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1777}
     1778
     1779//The total number of operations is 43.6666666667
     1780template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1781{
     1782        bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
     1783        bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
     1784        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
     1785        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1786}
     1787
    8511788//The total number of operations is 1.0
    8521789template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2)
     
    9051842
    9061843//The total number of operations is 2.0
    907 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srli(bitblock128_t arg1)
    908 {
    909         return simd_and(simd128<32>::srli<sh>(arg1), simd128<2>::constant<((3)>>sh)>());
    910 }
    911 
    912 //The total number of operations is 2.0
    913 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srli(bitblock128_t arg1)
    914 {
    915         return simd_and(simd128<32>::srli<sh>(arg1), simd128<4>::constant<((15)>>sh)>());
    916 }
    917 
    918 //The total number of operations is 2.0
    919 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srli(bitblock128_t arg1)
    920 {
    921         return simd_and(simd128<32>::srli<sh>(arg1), simd128<8>::constant<((255)>>sh)>());
    922 }
    923 
    924 //The total number of operations is 1.0
    925 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srli(bitblock128_t arg1)
    926 {
    927         return _mm_srli_epi16(arg1, (int32_t)(sh));
    928 }
    929 
    930 //The total number of operations is 1.0
    931 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srli(bitblock128_t arg1)
    932 {
    933         return _mm_srli_epi32(arg1, (int32_t)(sh));
    934 }
    935 
    936 //The total number of operations is 1.0
    937 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srli(bitblock128_t arg1)
    938 {
    939         return _mm_srli_epi64(arg1, (int32_t)(sh));
    940 }
    941 
    942 //The total number of operations is 2.33333333333
    943 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srli(bitblock128_t arg1)
    944 {
    945         return (((sh%8) == 0) ? _mm_srli_si128(arg1, (int32_t)((sh/8))) : ((sh >= 64) ? simd128<64>::srli<(sh&63)>(_mm_srli_si128(arg1, (int32_t)(8))) : simd_or(simd128<64>::srli<sh>(arg1), _mm_srli_si128(simd128<64>::slli<((128-sh)&63)>(arg1), (int32_t)(8)))));
    946 }
    947 
    948 //The total number of operations is 1.0
    949 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ctz(bitblock128_t arg1)
    950 {
    951         return simd_not(arg1);
    952 }
    953 
    954 //The total number of operations is 10.6666666667
    955 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ctz(bitblock128_t arg1)
    956 {
    957         bitblock128_t tmp = simd_not(arg1);
    958         return simd128<1>::ifh(simd128<2>::himask(), simd_and(tmp, simd128<128>::slli<1>(tmp)), simd_and(simd128<128>::srli<1>(arg1), tmp));
    959 }
    960 
    961 //The total number of operations is 14.0
    962 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::ctz(bitblock128_t arg1)
    963 {
    964         return simd128<4>::popcount(simd_andc(simd128<4>::sub(arg1, simd128<4>::constant<1>()), arg1));
    965 }
    966 
    967 //The total number of operations is 13.0
    968 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::ctz(bitblock128_t arg1)
    969 {
    970         return simd128<8>::popcount(simd_andc(simd128<8>::sub(arg1, simd128<8>::constant<1>()), arg1));
    971 }
    972 
    973 //The total number of operations is 16.0
    974 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::ctz(bitblock128_t arg1)
    975 {
    976         return simd128<16>::popcount(simd_andc(simd128<16>::sub(arg1, simd128<16>::constant<1>()), arg1));
    977 }
    978 
    979 //The total number of operations is 19.0
    980 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::ctz(bitblock128_t arg1)
    981 {
    982         return simd128<32>::popcount(simd_andc(simd128<32>::sub(arg1, simd128<32>::constant<1>()), arg1));
    983 }
    984 
    985 //The total number of operations is 14.0
    986 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ctz(bitblock128_t arg1)
    987 {
    988         return simd128<64>::popcount(simd_andc(simd128<64>::sub(arg1, simd128<64>::constant<1>()), arg1));
    989 }
    990 
    991 //The total number of operations is 26.6666666667
    992 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ctz(bitblock128_t arg1)
    993 {
    994         return simd128<128>::popcount(simd_andc(simd128<128>::sub(arg1, simd128<128>::constant<1>()), arg1));
    995 }
    996 
    997 //The total number of operations is 10.0
    998 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sll(bitblock128_t arg1, bitblock128_t shift_mask)
    999 {
    1000         return simd128<1>::ifh(simd128<128>::himask(), _mm_sll_epi64(arg1, simd_and(_mm_srli_si128(shift_mask, (int32_t)(8)), _mm_cvtsi32_si128((int32_t)(63)))), _mm_sll_epi64(arg1, simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(63)))));
    1001 }
    1002 
    1003 //The total number of operations is 13.0
    1004 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sll(bitblock128_t arg1, bitblock128_t shift_mask)
    1005 {
    1006         bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(127)));
    1007         return simd_or(_mm_sll_epi64(arg1, shift), simd_or(_mm_slli_si128(_mm_sll_epi64(arg1, simd128<32>::sub(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_slli_si128(_mm_srl_epi64(arg1, simd128<32>::sub(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
    1008 }
    1009 
    1010 //The total number of operations is 1.0
    1011 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    1012 {
    1013         return simd_andc(arg1, arg2);
    1014 }
    1015 
    1016 //The total number of operations is 13.6666666667
    1017 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    1018 {
    1019         bitblock128_t tmp = simd_not(arg2);
    1020         bitblock128_t tmpAns = simd_or(simd_and(arg1, tmp), simd_and(simd128<128>::slli<1>(simd_and(arg1, tmp)), simd_or(arg1, tmp)));
    1021         return simd128<1>::ifh(simd128<2>::himask(), tmpAns, simd128<128>::srli<1>(tmpAns));
    1022 }
    1023 
    1024 //The total number of operations is 12.0
    1025 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    1026 {
    1027         return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::ugt(simd_and(simd128<(8)>::himask(), arg1), arg2), simd128<(8)>::ugt(simd_andc(arg1, simd128<(8)>::himask()), simd_andc(arg2, simd128<(8)>::himask())));
    1028 }
    1029 
    1030 //The total number of operations is 3.0
    1031 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    1032 {
    1033         bitblock128_t high_bit = simd128<8>::constant<(128)>();
    1034         return simd128<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1035 }
    1036 
    1037 //The total number of operations is 3.0
    1038 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    1039 {
    1040         bitblock128_t high_bit = simd128<16>::constant<(32768)>();
    1041         return simd128<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1042 }
    1043 
    1044 //The total number of operations is 3.0
    1045 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    1046 {
    1047         bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
    1048         return simd128<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1049 }
    1050 
    1051 //The total number of operations is 13.5
    1052 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    1053 {
    1054         bitblock128_t tmpAns = simd128<(32)>::ugt(arg1, arg2);
    1055         bitblock128_t mask = simd_and(tmpAns, simd128<64>::srli<(32)>(simd128<(32)>::eq(arg1, arg2)));
    1056         mask = simd_or(mask, simd128<64>::slli<(32)>(mask));
    1057         return simd_or(simd128<64>::srai<(32)>(tmpAns), mask);
    1058 }
    1059 
    1060 //The total number of operations is 37.25
    1061 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    1062 {
    1063         bitblock128_t tmpAns = simd128<(64)>::ugt(arg1, arg2);
    1064         bitblock128_t mask = simd_and(tmpAns, simd128<128>::srli<(64)>(simd128<(64)>::eq(arg1, arg2)));
    1065         mask = simd_or(mask, simd128<128>::slli<(64)>(mask));
    1066         return simd_or(simd128<128>::srai<(64)>(tmpAns), mask);
    1067 }
    1068 
    1069 //The total number of operations is 4.0
    1070 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::xor_hl(bitblock128_t arg1)
    1071 {
    1072         return simd_xor(simd128<2>::srli<(1)>(arg1), simd_and(arg1, simd128<2>::lomask()));
    1073 }
    1074 
    1075 //The total number of operations is 4.0
    1076 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::xor_hl(bitblock128_t arg1)
    1077 {
    1078         return simd_xor(simd128<4>::srli<(2)>(arg1), simd_and(arg1, simd128<4>::lomask()));
    1079 }
    1080 
    1081 //The total number of operations is 4.0
    1082 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::xor_hl(bitblock128_t arg1)
    1083 {
    1084         return simd_xor(simd128<8>::srli<(4)>(arg1), simd_and(arg1, simd128<8>::lomask()));
    1085 }
    1086 
    1087 //The total number of operations is 3.0
    1088 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::xor_hl(bitblock128_t arg1)
    1089 {
    1090         return simd_xor(simd128<16>::srli<(8)>(arg1), simd_and(arg1, simd128<16>::lomask()));
    1091 }
    1092 
    1093 //The total number of operations is 3.0
    1094 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::xor_hl(bitblock128_t arg1)
    1095 {
    1096         return simd_xor(simd128<32>::srli<(16)>(arg1), simd_and(arg1, simd128<32>::lomask()));
    1097 }
    1098 
    1099 //The total number of operations is 3.0
    1100 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::xor_hl(bitblock128_t arg1)
    1101 {
    1102         return simd_xor(simd128<64>::srli<(32)>(arg1), simd_and(arg1, simd128<64>::lomask()));
    1103 }
    1104 
    1105 //The total number of operations is 4.33333333333
    1106 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::xor_hl(bitblock128_t arg1)
    1107 {
    1108         return simd_xor(simd128<128>::srli<(64)>(arg1), simd_and(arg1, simd128<128>::lomask()));
    1109 }
    1110 
    1111 //The total number of operations is 0
    1112 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::popcount(bitblock128_t arg1)
    1113 {
    1114         return arg1;
    1115 }
    1116 
    1117 //The total number of operations is 3.0
    1118 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::popcount(bitblock128_t arg1)
    1119 {
    1120         return simd128<2>::add_hl(simd128<(1)>::popcount(arg1));
    1121 }
    1122 
    1123 //The total number of operations is 7.0
    1124 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::popcount(bitblock128_t arg1)
    1125 {
    1126         return simd128<4>::add_hl(simd128<(2)>::popcount(arg1));
    1127 }
    1128 
    1129 //The total number of operations is 11.0
    1130 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::popcount(bitblock128_t arg1)
    1131 {
    1132         return simd128<8>::add_hl(simd128<(4)>::popcount(arg1));
    1133 }
    1134 
    1135 //The total number of operations is 14.0
    1136 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::popcount(bitblock128_t arg1)
    1137 {
    1138         return simd128<16>::add_hl(simd128<(8)>::popcount(arg1));
    1139 }
    1140 
    1141 //The total number of operations is 17.0
    1142 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::popcount(bitblock128_t arg1)
    1143 {
    1144         return simd128<32>::add_hl(simd128<(16)>::popcount(arg1));
    1145 }
    1146 
    1147 //The total number of operations is 12.0
    1148 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::popcount(bitblock128_t arg1)
    1149 {
    1150         return _mm_sad_epu8(simd128<8>::popcount(arg1), simd128<8>::constant<0>());
    1151 }
    1152 
    1153 //The total number of operations is 16.3333333333
    1154 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::popcount(bitblock128_t arg1)
    1155 {
    1156         bitblock128_t tmpAns = simd128<(64)>::popcount(arg1);
    1157         return simd128<(64)>::add(simd_and(tmpAns, simd128<128>::lomask()), simd128<128>::srli<(64)>(tmpAns));
    1158 }
    1159 
    1160 //The total number of operations is 6.33333333333
    1161 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::neg(bitblock128_t arg1)
    1162 {
    1163         return simd128<1>::ifh(simd128<2>::himask(), simd_xor(arg1, simd128<128>::slli<1>(arg1)), arg1);
    1164 }
    1165 
    1166 //The total number of operations is 6.0
    1167 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::neg(bitblock128_t arg1)
    1168 {
    1169         return simd128<4>::sub(simd128<4>::constant<0>(), arg1);
    1170 }
    1171 
    1172 //The total number of operations is 1.0
    1173 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::neg(bitblock128_t arg1)
    1174 {
    1175         return simd128<8>::sub(simd128<8>::constant<0>(), arg1);
    1176 }
    1177 
    1178 //The total number of operations is 1.0
    1179 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::neg(bitblock128_t arg1)
    1180 {
    1181         return simd128<16>::sub(simd128<16>::constant<0>(), arg1);
    1182 }
    1183 
    1184 //The total number of operations is 1.0
    1185 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::neg(bitblock128_t arg1)
    1186 {
    1187         return _mm_sign_epi32(arg1, simd128<32>::constant<-1>());
    1188 }
    1189 
    1190 //The total number of operations is 1.0
    1191 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::neg(bitblock128_t arg1)
    1192 {
    1193         return simd128<64>::sub(simd128<64>::constant<0>(), arg1);
    1194 }
    1195 
    1196 //The total number of operations is 9.33333333333
    1197 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::neg(bitblock128_t arg1)
    1198 {
    1199         return simd128<128>::sub(simd128<128>::constant<0>(), arg1);
    1200 }
    1201 
    1202 //The total number of operations is 2.0
    1203 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::slli(bitblock128_t arg1)
    1204 {
    1205         return simd_and(simd128<32>::slli<sh>(arg1), simd128<2>::constant<(((3)<<sh)&(3))>());
    1206 }
    1207 
    1208 //The total number of operations is 2.0
    1209 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::slli(bitblock128_t arg1)
    1210 {
    1211         return simd_and(simd128<32>::slli<sh>(arg1), simd128<4>::constant<(((15)<<sh)&(15))>());
    1212 }
    1213 
    1214 //The total number of operations is 2.0
    1215 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::slli(bitblock128_t arg1)
    1216 {
    1217         return simd_and(simd128<32>::slli<sh>(arg1), simd128<8>::constant<(((255)<<sh)&(255))>());
    1218 }
    1219 
    1220 //The total number of operations is 1.0
    1221 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::slli(bitblock128_t arg1)
    1222 {
    1223         return _mm_slli_epi16(arg1, (int32_t)(sh));
    1224 }
    1225 
    1226 //The total number of operations is 1.0
    1227 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::slli(bitblock128_t arg1)
    1228 {
    1229         return _mm_slli_epi32(arg1, (int32_t)(sh));
    1230 }
    1231 
    1232 //The total number of operations is 1.0
    1233 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::slli(bitblock128_t arg1)
    1234 {
    1235         return _mm_slli_epi64(arg1, (int32_t)(sh));
    1236 }
    1237 
    1238 //The total number of operations is 2.33333333333
    1239 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::slli(bitblock128_t arg1)
    1240 {
    1241         return (((sh%8) == 0) ? _mm_slli_si128(arg1, (int32_t)((sh/8))) : ((sh >= 64) ? simd128<64>::slli<(sh&63)>(_mm_slli_si128(arg1, (int32_t)(8))) : simd_or(simd128<64>::slli<sh>(arg1), _mm_slli_si128(simd128<64>::srli<((128-sh)&63)>(arg1), (int32_t)(8)))));
    1242 }
    1243 
    1244 //The total number of operations is 3.0
    1245 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1246 {
    1247         return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
    1248 }
    1249 
    1250 //The total number of operations is 8.0
    1251 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1252 {
    1253         return simd128<(1)>::ifh(simd128<1>::ifh(simd128<2>::himask(), arg1, simd128<2>::srli<(1)>(arg1)), arg2, arg3);
    1254 }
    1255 
    1256 //The total number of operations is 13.0
    1257 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1258 {
    1259         return simd128<1>::ifh(simd128<4>::gt(simd128<4>::constant<0>(), arg1), arg2, arg3);
    1260 }
    1261 
    1262 //The total number of operations is 4.0
    1263 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1264 {
    1265         return simd128<1>::ifh(simd128<8>::gt(simd128<8>::constant<0>(), arg1), arg2, arg3);
    1266 }
    1267 
    1268 //The total number of operations is 4.0
    1269 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1270 {
    1271         return simd128<1>::ifh(simd128<16>::gt(simd128<16>::constant<0>(), arg1), arg2, arg3);
    1272 }
    1273 
    1274 //The total number of operations is 4.0
    1275 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1276 {
    1277         return simd128<1>::ifh(simd128<32>::gt(simd128<32>::constant<0>(), arg1), arg2, arg3);
    1278 }
    1279 
    1280 //The total number of operations is 8.0
    1281 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1282 {
    1283         return simd128<(32)>::ifh(simd128<1>::ifh(simd128<64>::himask(), arg1, simd128<64>::srli<(32)>(arg1)), arg2, arg3);
    1284 }
    1285 
    1286 //The total number of operations is 13.3333333333
    1287 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1288 {
    1289         return simd128<(64)>::ifh(simd128<1>::ifh(simd128<128>::himask(), arg1, simd128<128>::srli<(64)>(arg1)), arg2, arg3);
    1290 }
    1291 
    1292 //The total number of operations is 1.0
    1293 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1294 {
    1295         return simd_xor(arg1, arg2);
    1296 }
    1297 
    1298 //The total number of operations is 9.33333333333
    1299 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1300 {
    1301         bitblock128_t tmp = simd_xor(arg1, arg2);
    1302         return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(simd_not(arg1), arg2))), tmp);
    1303 }
    1304 
    1305 //The total number of operations is 6.0
    1306 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1307 {
    1308         return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::sub(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::sub(arg1, arg2));
    1309 }
    1310 
    1311 //The total number of operations is 1.0
    1312 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1313 {
    1314         return _mm_sub_epi8(arg1, arg2);
    1315 }
    1316 
    1317 //The total number of operations is 1.0
    1318 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1319 {
    1320         return _mm_sub_epi16(arg1, arg2);
    1321 }
    1322 
    1323 //The total number of operations is 1.0
    1324 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1325 {
    1326         return _mm_sub_epi32(arg1, arg2);
    1327 }
    1328 
    1329 //The total number of operations is 1.0
    1330 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1331 {
    1332         return _mm_sub_epi64(arg1, arg2);
    1333 }
    1334 
    1335 //The total number of operations is 9.33333333333
    1336 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1337 {
    1338         bitblock128_t partial = simd128<(64)>::sub(arg1, arg2);
    1339         bitblock128_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_andc(partial, simd_xor(arg1, arg2)));
    1340         bitblock128_t borrow = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(borrowMask));
    1341         return simd128<(64)>::sub(partial, borrow);
    1342 }
    1343 
    1344 //The total number of operations is 3.0
    1345 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add_hl(bitblock128_t arg1)
    1346 {
    1347         return simd128<16>::sub(arg1, simd_and(simd128<2>::lomask(), simd128<16>::srli<1>(arg1)));
    1348 }
    1349 
    1350 //The total number of operations is 4.0
    1351 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add_hl(bitblock128_t arg1)
    1352 {
    1353         return simd128<(8)>::add(simd128<4>::srli<(2)>(arg1), simd_and(arg1, simd128<4>::lomask()));
    1354 }
    1355 
    1356 //The total number of operations is 4.0
    1357 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add_hl(bitblock128_t arg1)
    1358 {
    1359         return simd128<(16)>::add(simd128<8>::srli<(4)>(arg1), simd_and(arg1, simd128<8>::lomask()));
    1360 }
    1361 
    1362 //The total number of operations is 3.0
    1363 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add_hl(bitblock128_t arg1)
    1364 {
    1365         return simd128<(32)>::add(simd128<16>::srli<(8)>(arg1), simd_and(arg1, simd128<16>::lomask()));
    1366 }
    1367 
    1368 //The total number of operations is 3.0
    1369 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add_hl(bitblock128_t arg1)
    1370 {
    1371         return simd128<(64)>::add(simd128<32>::srli<(16)>(arg1), simd_and(arg1, simd128<32>::lomask()));
    1372 }
    1373 
    1374 //The total number of operations is 3.0
    1375 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add_hl(bitblock128_t arg1)
    1376 {
    1377         return simd128<64>::add(simd128<64>::srli<(32)>(arg1), simd_and(arg1, simd128<64>::lomask()));
    1378 }
    1379 
    1380 //The total number of operations is 12.6666666667
    1381 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1)
    1382 {
    1383         return simd128<128>::add(simd128<128>::srli<(64)>(arg1), simd_and(arg1, simd128<128>::lomask()));
    1384 }
    1385 
    1386 //The total number of operations is 10.0
    1387 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srl(bitblock128_t arg1, bitblock128_t shift_mask)
    1388 {
    1389         return simd128<1>::ifh(simd128<128>::himask(), _mm_srl_epi64(arg1, simd_and(_mm_srli_si128(shift_mask, (int32_t)(8)), _mm_cvtsi32_si128((int32_t)(63)))), _mm_srl_epi64(arg1, simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(63)))));
    1390 }
    1391 
    1392 //The total number of operations is 13.0
    1393 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask)
    1394 {
    1395         bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(127)));
    1396         return simd_or(_mm_srl_epi64(arg1, shift), simd_or(_mm_srli_si128(_mm_srl_epi64(arg1, simd128<32>::sub(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_srli_si128(_mm_sll_epi64(arg1, simd128<32>::sub(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
    1397 }
    1398 
    1399 //The total number of operations is 0
    1400 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
    1401 {
    1402         return simd128<32>::constant<(-1*val)>();
    1403 }
    1404 
    1405 //The total number of operations is 0
    1406 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant()
    1407 {
    1408         return simd128<(4)>::constant<((val<<2)|(val&(3)))>();
    1409 }
    1410 
    1411 //The total number of operations is 0
    1412 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::constant()
    1413 {
    1414         return simd128<(8)>::constant<((val<<4)|(val&(15)))>();
    1415 }
    1416 
    1417 //The total number of operations is 0
    1418 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::constant()
    1419 {
    1420         return _mm_set1_epi8((int32_t)(val));
    1421 }
    1422 
    1423 //The total number of operations is 0
    1424 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::constant()
    1425 {
    1426         return _mm_set1_epi16((int32_t)(val));
    1427 }
    1428 
    1429 //The total number of operations is 0
    1430 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::constant()
    1431 {
    1432         return _mm_set1_epi32((int32_t)(val));
    1433 }
    1434 
    1435 //The total number of operations is 0
    1436 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::constant()
    1437 {
    1438         return _mm_set_epi32((int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val));
    1439 }
    1440 
    1441 //The total number of operations is 0
    1442 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::constant()
    1443 {
    1444         return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val));
    1445 }
    1446 
    1447 //The total number of operations is 1.0
    1448 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::min(bitblock128_t arg1, bitblock128_t arg2)
    1449 {
    1450         return simd_or(arg1, arg2);
    1451 }
    1452 
    1453 //The total number of operations is 16.6666666667
    1454 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::min(bitblock128_t arg1, bitblock128_t arg2)
    1455 {
    1456         bitblock128_t tmp1 = simd128<128>::srli<1>(arg1);
    1457         bitblock128_t tmp2 = simd128<128>::srli<1>(arg2);
    1458         return simd128<1>::ifh(simd128<2>::himask(), simd_or(arg1, arg2), simd_or(simd_and(arg1, simd_and(tmp1, simd_not(tmp2))), simd_and(arg2, simd_or(simd_and(simd_not(tmp1), tmp2), arg1))));
    1459 }
    1460 
    1461 //The total number of operations is 9.0
    1462 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::min(bitblock128_t arg1, bitblock128_t arg2)
    1463 {
    1464         bitblock128_t high_bit = simd128<4>::constant<(8)>();
    1465         return simd_xor(simd128<4>::umin(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1466 }
    1467 
    1468 //The total number of operations is 4.0
    1469 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::min(bitblock128_t arg1, bitblock128_t arg2)
    1470 {
    1471         return simd128<1>::ifh(simd128<8>::gt(arg1, arg2), arg2, arg1);
    1472 }
    1473 
    1474 //The total number of operations is 1.0
    1475 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::min(bitblock128_t arg1, bitblock128_t arg2)
    1476 {
    1477         return _mm_min_epi16(arg1, arg2);
    1478 }
    1479 
    1480 //The total number of operations is 4.0
    1481 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::min(bitblock128_t arg1, bitblock128_t arg2)
    1482 {
    1483         return simd128<1>::ifh(simd128<32>::gt(arg1, arg2), arg2, arg1);
    1484 }
    1485 
    1486 //The total number of operations is 17.5
    1487 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2)
    1488 {
    1489         return simd128<1>::ifh(simd128<64>::gt(arg1, arg2), arg2, arg1);
    1490 }
    1491 
    1492 //The total number of operations is 54.75
    1493 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2)
    1494 {
    1495         return simd128<1>::ifh(simd128<128>::gt(arg1, arg2), arg2, arg1);
    1496 }
    1497 
    1498 //The total number of operations is 0
    1499 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
    1500 {
    1501         return simd128<2>::constant<(1)>();
    1502 }
    1503 
    1504 //The total number of operations is 0
    1505 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
    1506 {
    1507         return simd128<4>::constant<(3)>();
    1508 }
    1509 
    1510 //The total number of operations is 0
    1511 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
    1512 {
    1513         return simd128<8>::constant<(15)>();
    1514 }
    1515 
    1516 //The total number of operations is 0
    1517 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
    1518 {
    1519         return simd128<16>::constant<(255)>();
    1520 }
    1521 
    1522 //The total number of operations is 0
    1523 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
    1524 {
    1525         return simd128<32>::constant<(65535)>();
    1526 }
    1527 
    1528 //The total number of operations is 0
    1529 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
    1530 {
    1531         return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
    1532 }
    1533 
    1534 //The total number of operations is 0
    1535 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
    1536 {
    1537         return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
    1538 }
    1539 
    1540 //The total number of operations is 1.0
    1541 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1542 {
    1543         return simd_and(arg1, arg2);
    1544 }
    1545 
    1546 //The total number of operations is 16.0
    1547 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1548 {
    1549         return simd_or(simd_and(simd128<(4)>::himask(), simd128<(4)>::umin(arg1, arg2)), simd128<(4)>::umin(simd_and(simd128<(4)>::lomask(), arg1), simd_and(simd128<(4)>::lomask(), arg2)));
    1550 }
    1551 
    1552 //The total number of operations is 6.0
    1553 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1554 {
    1555         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::umin(arg1, arg2)), simd128<(8)>::umin(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2)));
    1556 }
    1557 
    1558 //The total number of operations is 1.0
    1559 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1560 {
    1561         return _mm_min_epu8(arg1, arg2);
    1562 }
    1563 
    1564 //The total number of operations is 4.0
    1565 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1566 {
    1567         bitblock128_t high_bit = simd128<16>::constant<(32768)>();
    1568         return simd_xor(simd128<16>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1569 }
    1570 
    1571 //The total number of operations is 7.0
    1572 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1573 {
    1574         bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
    1575         return simd_xor(simd128<32>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1576 }
    1577 
    1578 //The total number of operations is 20.0
    1579 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1580 {
    1581         bitblock128_t tmpAns = simd128<(32)>::umin(arg1, arg2);
    1582         bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
    1583         bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
    1584         return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1585 }
    1586 
    1587 //The total number of operations is 43.6666666667
    1588 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1589 {
    1590         bitblock128_t tmpAns = simd128<(64)>::umin(arg1, arg2);
    1591         bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
    1592         bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    1593         return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1594 }
    1595 
    1596 //The total number of operations is 7.33333333333
    1597 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
    1598 {
    1599         return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
    1600 }
    1601 
    1602 //The total number of operations is 19.0
    1603 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
    1604 {
    1605         bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
    1606         return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
    1607 }
    1608 
    1609 //The total number of operations is 1.0
    1610 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
    1611 {
    1612         return _mm_abs_epi8(arg1);
    1613 }
    1614 
    1615 //The total number of operations is 1.0
    1616 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
    1617 {
    1618         return _mm_abs_epi16(arg1);
    1619 }
    1620 
    1621 //The total number of operations is 1.0
    1622 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
    1623 {
    1624         return _mm_abs_epi32(arg1);
    1625 }
    1626 
    1627 //The total number of operations is 13.0
    1628 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
    1629 {
    1630         bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
    1631         return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
    1632 }
    1633 
    1634 //The total number of operations is 40.0
    1635 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
    1636 {
    1637         bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
    1638         return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
    1639 }
    1640 
    1641 //The total number of operations is 2.0
    16421844template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
    16431845{
     
    16961898}
    16971899
    1698 //The total number of operations is 4.0
    1699 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1700 {
    1701         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1702 }
    1703 
    1704 //The total number of operations is 10.0
    1705 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1706 {
    1707         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1708         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1709 }
    1710 
    1711 //The total number of operations is 5.0
    1712 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1713 {
    1714         bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1715         return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1716 }
    1717 
    1718 //The total number of operations is 1.0
    1719 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1720 {
    1721         return _mm_srai_epi16(arg1, (int32_t)(sh));
    1722 }
    1723 
    1724 //The total number of operations is 1.0
    1725 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1726 {
    1727         return _mm_srai_epi32(arg1, (int32_t)(sh));
    1728 }
    1729 
    1730 //The total number of operations is 4.5
    1731 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1732 {
    1733         return simd_or(simd_and(simd128<64>::himask(), simd128<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd128<64>::srli<sh>(arg1) : simd128<(32)>::srai<(sh-(32))>(simd128<64>::srli<(32)>(arg1))));
    1734 }
    1735 
    1736 //The total number of operations is 11.0833333333
    1737 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1738 {
    1739         return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    1740 }
    1741 
    17421900//The total number of operations is 0
    17431901template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
     
    18341992}
    18351993
    1836 //The total number of operations is 1.0
    1837 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1838 {
    1839         return simd_or(arg1, arg2);
    1840 }
    1841 
    1842 //The total number of operations is 15.6666666667
    1843 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1844 {
    1845         return simd128<1>::ifh(simd128<2>::himask(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd128<128>::srli<1>(simd_or(simd_not(arg1), arg2))), simd_and(arg1, simd128<128>::srli<1>(simd_or(arg1, simd_not(arg2))))));
    1846 }
    1847 
    1848 //The total number of operations is 6.0
    1849 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1850 {
    1851         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::umax(arg1, arg2)), simd128<(8)>::umax(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2)));
    1852 }
    1853 
    1854 //The total number of operations is 1.0
    1855 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1856 {
    1857         return _mm_max_epu8(arg1, arg2);
    1858 }
    1859 
    1860 //The total number of operations is 4.0
    1861 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1862 {
    1863         bitblock128_t high_bit = simd128<16>::constant<(32768)>();
    1864         return simd_xor(simd128<16>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1865 }
    1866 
    1867 //The total number of operations is 7.0
    1868 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1869 {
    1870         bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
    1871         return simd_xor(simd128<32>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1872 }
    1873 
    1874 //The total number of operations is 20.0
    1875 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1876 {
    1877         bitblock128_t tmpAns = simd128<(32)>::umax(arg1, arg2);
    1878         bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
    1879         bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
    1880         return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1881 }
    1882 
    1883 //The total number of operations is 43.6666666667. simd128<128>::umax: takes the per-64-bit unsigned maximum, keeps its upper half, and selects the lower half by comparing upper halves.
    1884 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1885 {
    1886         bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2); // maximum of each 64-bit half taken independently
    1887         bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1)); // upper-half "equals arg1" result moved down to the lower half
    1888         bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2)); // upper-half "equals arg2" result moved down to the lower half
    1889         return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2)); // lower half: tie -> tmpAns, only arg1 matches -> arg1, otherwise arg2
     1994//The total number of operations is 7.33333333333
     1995template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
     1996{
     1997        return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
     1998}
     1999
     2000//The total number of operations is 19.0
     2001template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
     2002{
     2003        bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
     2004        return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
     2005}
     2006
     2007//The total number of operations is 1.0
     2008template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
     2009{
     2010        return _mm_abs_epi8(arg1);
     2011}
     2012
     2013//The total number of operations is 1.0
     2014template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
     2015{
     2016        return _mm_abs_epi16(arg1);
     2017}
     2018
     2019//The total number of operations is 1.0
     2020template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
     2021{
     2022        return _mm_abs_epi32(arg1);
     2023}
     2024
     2025//The total number of operations is 13.0
     2026template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
     2027{
     2028        bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
     2029        return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
     2030}
     2031
     2032//The total number of operations is 40.0
     2033template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
     2034{
     2035        bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
     2036        return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
    18902037}
    18912038
     
    20272174
    20282175//The total number of operations is 24.0
    2029 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<4>::signmask(bitblock128_t arg1)
     2176template <> IDISA_ALWAYS_INLINE FieldType<128/4>::T hsimd128<4>::signmask(bitblock128_t arg1)
    20302177{
    20312178        uint64_t tmpAns1 = hsimd128<(8)>::signmask(esimd128<4>::mergeh(arg1, simd128<4>::constant<0>()));
     
    20352182
    20362183//The total number of operations is 1.0
    2037 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<8>::signmask(bitblock128_t arg1)
     2184template <> IDISA_ALWAYS_INLINE FieldType<128/8>::T hsimd128<8>::signmask(bitblock128_t arg1)
    20382185{
    20392186        return _mm_movemask_epi8(arg1);
     
    20412188
    20422189//The total number of operations is 2.0
    2043 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<16>::signmask(bitblock128_t arg1)
     2190template <> IDISA_ALWAYS_INLINE FieldType<128/16>::T hsimd128<16>::signmask(bitblock128_t arg1)
    20442191{
    20452192        return hsimd128<(8)>::signmask(hsimd128<16>::packss(simd128<16>::constant<0>(), arg1));
     
    20472194
    20482195//The total number of operations is 3.0
    2049 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<32>::signmask(bitblock128_t arg1)
     2196template <> IDISA_ALWAYS_INLINE FieldType<128/32>::T hsimd128<32>::signmask(bitblock128_t arg1)
    20502197{
    20512198        return hsimd128<(16)>::signmask(hsimd128<32>::packss(simd128<32>::constant<0>(), arg1));
     
    20532200
    20542201//The total number of operations is 1.0
    2055 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<64>::signmask(bitblock128_t arg1)
     2202template <> IDISA_ALWAYS_INLINE FieldType<128/64>::T hsimd128<64>::signmask(bitblock128_t arg1)
    20562203{
    20572204        return _mm_movemask_pd(_mm_castsi128_pd(arg1));
     
    20592206
    20602207//The total number of operations is 6.33333333333
    2061 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<128>::signmask(bitblock128_t arg1)
     2208template <> IDISA_ALWAYS_INLINE FieldType<128/128>::T hsimd128<128>::signmask(bitblock128_t arg1)
    20622209{
    20632210        return hsimd128<(64)>::signmask(hsimd128<128>::packh(simd128<128>::constant<0>(), arg1));
     
    24992646template <> template <uint64_t msk> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::shufflei(bitblock128_t arg1)
    25002647{
    2501         bitblock128_t tmphi = _mm_shufflehi_epi16(arg1, (int32_t)((shufflemask8_to_shufflemask4(msk)>>8)));
    2502         bitblock128_t tmpAns = _mm_shufflelo_epi16(tmphi, (int32_t)((shufflemask8_to_shufflemask4(msk)&255)));
    2503         bitblock128_t tmplh = _mm_shufflehi_epi16(simd128<128>::slli<64>(arg1), (int32_t)((shufflemask8_to_shufflemask4(msk)>>8)));
    2504         bitblock128_t tmphl = _mm_shufflelo_epi16(simd128<128>::srli<64>(arg1), (int32_t)((shufflemask8_to_shufflemask4(msk)&255)));
    2505         uint32_t a1 = ((((msk>>21)&4) == 0) ? 0 : (131071));
    2506         uint32_t a2 = ((((msk>>18)&4) == 0) ? 0 : (131071));
    2507         uint32_t a3 = ((((msk>>15)&4) == 0) ? 0 : (131071));
    2508         uint32_t a4 = ((((msk>>12)&4) == 0) ? 0 : (131071));
    2509         uint32_t a5 = ((((msk>>9)&4) == 0) ? (131071) : 0);
    2510         uint32_t a6 = ((((msk>>6)&4) == 0) ? (131071) : 0);
    2511         uint32_t a7 = ((((msk>>3)&4) == 0) ? (131071) : 0);
    2512         uint32_t a8 = (((msk&4) == 0) ? (131071) : 0);
    2513         return simd128<1>::ifh(mvmd128<16>::fill8(a1, a2, a3, a4, a5, a6, a7, a8), tmpAns, simd_or(tmplh, tmphl));
     2648        return simd128<1>::ifh(mvmd128<16>::fill8(((((msk>>21)&4) == 0) ? 0 : (131071)), ((((msk>>18)&4) == 0) ? 0 : (131071)), ((((msk>>15)&4) == 0) ? 0 : (131071)), ((((msk>>12)&4) == 0) ? 0 : (131071)), ((((msk>>9)&4) == 0) ? (131071) : 0), ((((msk>>6)&4) == 0) ? (131071) : 0), ((((msk>>3)&4) == 0) ? (131071) : 0), (((msk&4) == 0) ? (131071) : 0)), _mm_shufflelo_epi16(_mm_shufflehi_epi16(arg1, (int32_t)((shufflemask8_to_shufflemask4(msk)>>8))), (int32_t)((shufflemask8_to_shufflemask4(msk)&255))), simd_or(_mm_shufflehi_epi16(simd128<128>::slli<64>(arg1), (int32_t)((shufflemask8_to_shufflemask4(msk)>>8))), _mm_shufflelo_epi16(simd128<128>::srli<64>(arg1), (int32_t)((shufflemask8_to_shufflemask4(msk)&255)))));
    25142649}
    25152650
     
    25272662
    25282663//The total number of operations is 5.66666666667
    2529 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2664template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    25302665{
    25312666        return simd_or(mvmd128<2>::srli<sh>(arg1), mvmd128<2>::slli<((64)-sh)>(arg2));
     
    25332668
    25342669//The total number of operations is 5.66666666667
    2535 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2670template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    25362671{
    25372672        return simd_or(mvmd128<4>::srli<sh>(arg1), mvmd128<4>::slli<((32)-sh)>(arg2));
     
    25392674
    25402675//The total number of operations is 3.0
    2541 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2676template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    25422677{
    25432678        return simd_or(mvmd128<8>::srli<sh>(arg1), mvmd128<8>::slli<((16)-sh)>(arg2));
     
    25452680
    25462681//The total number of operations is 3.0
    2547 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2682template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    25482683{
    25492684        return simd_or(mvmd128<16>::srli<sh>(arg1), mvmd128<16>::slli<((8)-sh)>(arg2));
     
    25512686
    25522687//The total number of operations is 3.0
    2553 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2688template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    25542689{
    25552690        return simd_or(mvmd128<32>::srli<sh>(arg1), mvmd128<32>::slli<((4)-sh)>(arg2));
     
    25572692
    25582693//The total number of operations is 3.0
    2559 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2694template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    25602695{
    25612696        return simd_or(mvmd128<64>::srli<sh>(arg1), mvmd128<64>::slli<((2)-sh)>(arg2));
     
    25632698
    25642699//The total number of operations is 3.0
    2565 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2700template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    25662701{
    25672702        return simd_or(mvmd128<128>::srli<sh>(arg1), mvmd128<128>::slli<((1)-sh)>(arg2));
    25682703}
    25692704
    2570 //The total number of operations is 1.0
    2571 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill(uint64_t val1)
     2705//The total number of operations is 15.0
     2706template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16)
     2707{
     2708        return simd_or(mvmd128<(2)>::fill16((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd128<(2)>::fill16((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
     2709}
     2710
     2711//The total number of operations is 7.0
     2712template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16)
     2713{
     2714        return simd_or(mvmd128<(4)>::fill16((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd128<(4)>::fill16((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
     2715}
     2716
     2717//The total number of operations is 3.0
     2718template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16)
     2719{
     2720        return simd_or(mvmd128<(8)>::fill16((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd128<(8)>::fill16((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
     2721}
     2722
     2723//The total number of operations is 1.0
     2724template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16)
     2725{
     2726        return _mm_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
     2727}
     2728
     2729//The total number of operations is 1.0
     2730template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill(FieldType<1>::T val1)
    25722731{
    25732732        return mvmd128<32>::fill((-1*val1));
     
    25752734
    25762735//The total number of operations is 1.0
    2577 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill(uint64_t val1)
     2736template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill(FieldType<2>::T val1)
    25782737{
    25792738        return mvmd128<(4)>::fill(((val1<<2)|val1));
     
    25812740
    25822741//The total number of operations is 1.0
    2583 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill(uint64_t val1)
     2742template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill(FieldType<4>::T val1)
    25842743{
    25852744        return mvmd128<(8)>::fill(((val1<<4)|val1));
     
    25872746
    25882747//The total number of operations is 1.0
    2589 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill(uint64_t val1)
     2748template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill(FieldType<8>::T val1)
    25902749{
    25912750        return _mm_set1_epi8((int32_t)(val1));
     
    25932752
    25942753//The total number of operations is 1.0
    2595 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill(uint64_t val1)
     2754template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill(FieldType<16>::T val1)
    25962755{
    25972756        return _mm_set1_epi16((int32_t)(val1));
     
    25992758
    26002759//The total number of operations is 1.0
    2601 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill(uint64_t val1)
     2760template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill(FieldType<32>::T val1)
    26022761{
    26032762        return _mm_set1_epi32((int32_t)(val1));
     
    26052764
    26062765//The total number of operations is 1.0
    2607 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill(uint64_t val1)
     2766template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill(FieldType<64>::T val1)
    26082767{
    26092768        return _mm_set_epi32((int32_t)((val1>>32)), (int32_t)(val1), (int32_t)((val1>>32)), (int32_t)(val1));
     
    26112770
    26122771//The total number of operations is 1.0
    2613 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::fill(uint64_t val1)
     2772template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::fill(FieldType<128>::T val1)
    26142773{
    26152774        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((val1>>32)), (int32_t)(val1));
     
    26532812
    26542813//The total number of operations is 1.0
    2655 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<1>::extract(bitblock128_t arg1)
     2814template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd128<1>::extract(bitblock128_t arg1)
    26562815{
    26572816        return (((pos%2) == 0) ? (mvmd128<(2)>::extract<(pos/2)>(arg1)&(1)) : (mvmd128<(2)>::extract<(pos/2)>(arg1)>>1));
     
    26592818
    26602819//The total number of operations is 1.0
    2661 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<2>::extract(bitblock128_t arg1)
     2820template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd128<2>::extract(bitblock128_t arg1)
    26622821{
    26632822        return (((pos%2) == 0) ? (mvmd128<(4)>::extract<(pos/2)>(arg1)&(3)) : (mvmd128<(4)>::extract<(pos/2)>(arg1)>>2));
     
    26652824
    26662825//The total number of operations is 1.0
    2667 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<4>::extract(bitblock128_t arg1)
     2826template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd128<4>::extract(bitblock128_t arg1)
    26682827{
    26692828        return (((pos%2) == 0) ? (mvmd128<(8)>::extract<(pos/2)>(arg1)&(15)) : (mvmd128<(8)>::extract<(pos/2)>(arg1)>>4));
     
    26712830
    26722831//The total number of operations is 1.0
    2673 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<8>::extract(bitblock128_t arg1)
     2832template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd128<8>::extract(bitblock128_t arg1)
    26742833{
    26752834        return (((pos%2) == 0) ? (mvmd128<(16)>::extract<(pos/2)>(arg1)&(255)) : (mvmd128<(16)>::extract<(pos/2)>(arg1)>>8));
     
    26772836
    26782837//The total number of operations is 1.0
    2679 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<16>::extract(bitblock128_t arg1)
     2838template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd128<16>::extract(bitblock128_t arg1)
    26802839{
    26812840        return (65535&_mm_extract_epi16(arg1, (int32_t)(pos)));
     
    26832842
    26842843//The total number of operations is 2.0
    2685 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<32>::extract(bitblock128_t arg1)
    2686 {
    2687         return ((mvmd128<(16)>::extract<((2*pos)+1)>(arg1)<<(16))|mvmd128<(16)>::extract<(2*pos)>(arg1));
     2844template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd128<32>::extract(bitblock128_t arg1)
     2845{
     2846        return ((((uint64_t)(mvmd128<(16)>::extract<((2*pos)+1)>(arg1)))<<(16))|mvmd128<(16)>::extract<(2*pos)>(arg1));
    26882847}
    26892848
    26902849//The total number of operations is 4.0
    2691 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<64>::extract(bitblock128_t arg1)
    2692 {
    2693         return ((mvmd128<(32)>::extract<((2*pos)+1)>(arg1)<<(32))|mvmd128<(32)>::extract<(2*pos)>(arg1));
     2850template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd128<64>::extract(bitblock128_t arg1)
     2851{
     2852        return ((((uint64_t)(mvmd128<(32)>::extract<((2*pos)+1)>(arg1)))<<(32))|mvmd128<(32)>::extract<(2*pos)>(arg1));
    26942853}
    26952854
    26962855//The total number of operations is 12.6666666667
    2697 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::splat(bitblock128_t arg1)
     2856template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::splat(bitblock128_t arg1)
    26982857{
    26992858        return simd128<128>::sub(simd128<128>::constant<0>(), simd_and(simd128<128>::constant<1>(), simd128<128>::srli<pos>(arg1)));
     
    27012860
    27022861//The total number of operations is 13.0
    2703 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::splat(bitblock128_t arg1)
    2704 {
    2705         bitblock128_t tmpArg = (((pos%2) == 0) ? simd128<(4)>::slli<2>(arg1) : simd128<(4)>::srli<2>(arg1));
    2706         bitblock128_t arg11 = (((pos%2) == 0) ? simd_and(simd128<(4)>::lomask(), arg1) : simd_and(simd128<(4)>::himask(), arg1));
    2707         return mvmd128<(4)>::splat<(pos/2)>(simd_or(tmpArg, arg11));
     2862template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::splat(bitblock128_t arg1)
     2863{
     2864        return mvmd128<(4)>::splat<(pos/2)>(simd_or((((pos%2) == 0) ? simd128<(4)>::slli<2>(arg1) : simd128<(4)>::srli<2>(arg1)), (((pos%2) == 0) ? simd_and(simd128<(4)>::lomask(), arg1) : simd_and(simd128<(4)>::himask(), arg1))));
    27082865}
    27092866
    27102867//The total number of operations is 9.0
    2711 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::splat(bitblock128_t arg1)
    2712 {
    2713         bitblock128_t tmpArg = (((pos%2) == 0) ? simd128<(8)>::slli<4>(arg1) : simd128<(8)>::srli<4>(arg1));
    2714         bitblock128_t arg11 = (((pos%2) == 0) ? simd_and(simd128<(8)>::lomask(), arg1) : simd_and(simd128<(8)>::himask(), arg1));
    2715         return mvmd128<(8)>::splat<(pos/2)>(simd_or(tmpArg, arg11));
     2868template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::splat(bitblock128_t arg1)
     2869{
     2870        return mvmd128<(8)>::splat<(pos/2)>(simd_or((((pos%2) == 0) ? simd128<(8)>::slli<4>(arg1) : simd128<(8)>::srli<4>(arg1)), (((pos%2) == 0) ? simd_and(simd128<(8)>::lomask(), arg1) : simd_and(simd128<(8)>::himask(), arg1))));
    27162871}
    27172872
    27182873//The total number of operations is 5.0
    2719 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::splat(bitblock128_t arg1)
    2720 {
    2721         bitblock128_t tmpArg = (((pos%2) == 0) ? simd128<(16)>::slli<8>(arg1) : simd128<(16)>::srli<8>(arg1));
    2722         bitblock128_t arg11 = (((pos%2) == 0) ? simd_and(simd128<(16)>::lomask(), arg1) : simd_and(simd128<(16)>::himask(), arg1));
    2723         return mvmd128<(16)>::splat<(pos/2)>(simd_or(tmpArg, arg11));
     2874template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::splat(bitblock128_t arg1)
     2875{
     2876        return mvmd128<(16)>::splat<(pos/2)>(simd_or((((pos%2) == 0) ? simd128<(16)>::slli<8>(arg1) : simd128<(16)>::srli<8>(arg1)), (((pos%2) == 0) ? simd_and(simd128<(16)>::lomask(), arg1) : simd_and(simd128<(16)>::himask(), arg1))));
    27242877}
    27252878
    27262879//The total number of operations is 2.0
    2727 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::splat(bitblock128_t arg1)
     2880template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::splat(bitblock128_t arg1)
    27282881{
    27292882        return mvmd128<16>::fill(_mm_extract_epi16(arg1, (int32_t)(pos)));
     
    27312884
    27322885//The total number of operations is 1.0
    2733 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::splat(bitblock128_t arg1)
     2886template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::splat(bitblock128_t arg1)
    27342887{
    27352888        return mvmd128<32>::shufflei<shufflemask4(pos, pos, pos, pos)>(arg1);
     
    27372890
    27382891//The total number of operations is 5.0
    2739 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::splat(bitblock128_t arg1)
     2892template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::splat(bitblock128_t arg1)
    27402893{
    27412894        return simd128<1>::ifh(simd128<64>::himask(), mvmd128<(32)>::splat<((2*pos)+1)>(arg1), mvmd128<(32)>::splat<(2*pos)>(arg1));
     
    27432896
    27442897//The total number of operations is 13.0
    2745 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::splat(bitblock128_t arg1)
     2898template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::splat(bitblock128_t arg1)
    27462899{
    27472900        return simd128<1>::ifh(simd128<128>::himask(), mvmd128<(64)>::splat<((2*pos)+1)>(arg1), mvmd128<(64)>::splat<(2*pos)>(arg1));
    27482901}
    27492902
    // Removed-side diff rows (old line numbers only): the uint64_t-typed mvmd128<1>::fill16,
    // which ORs two mvmd128<2>::fill16 calls (odd-numbered values shifted left by 1, even-numbered
    // values masked to 1 bit). NOTE(review): no closing brace row of its own is visible here.
    2750 //The total number of operations is 15.0
    2751 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
    2752 {
    2753         return simd_or(mvmd128<(2)>::fill16((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd128<(2)>::fill16((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
     2903//The total number of operations is 4.0
     2904template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::insert(bitblock128_t arg1, FieldType<2>::T arg2)
     2905{
     2906        uint32_t v = (arg2&(3));
     2907        uint64_t doublev = mvmd128<(4)>::extract<(pos/2)>(arg1);
     2908        return mvmd128<(4)>::insert<(pos/2)>(arg1, (((pos&1) == 0) ? (((doublev>>2)<<2)|v) : ((doublev&(3))|(v<<2))));
     2909}
     2910
     2911//The total number of operations is 3.0
     2912template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::insert(bitblock128_t arg1, FieldType<4>::T arg2)
     2913{
     2914        uint32_t v = (arg2&(15));
     2915        uint64_t doublev = mvmd128<(8)>::extract<(pos/2)>(arg1);
     2916        return mvmd128<(8)>::insert<(pos/2)>(arg1, (((pos&1) == 0) ? (((doublev>>4)<<4)|v) : ((doublev&(15))|(v<<4))));
     2917}
     2918
     2919//The total number of operations is 2.0
     2920template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::insert(bitblock128_t arg1, FieldType<8>::T arg2)
     2921{
     2922        uint32_t v = (arg2&(255));
     2923        uint64_t doublev = mvmd128<(16)>::extract<(pos/2)>(arg1);
     2924        return mvmd128<(16)>::insert<(pos/2)>(arg1, (((pos&1) == 0) ? (((doublev>>8)<<8)|v) : ((doublev&(255))|(v<<8))));
     2925}
     2926
     2927//The total number of operations is 1.0
     2928template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::insert(bitblock128_t arg1, FieldType<16>::T arg2)
     2929{
     2930        return _mm_insert_epi16(arg1, (int32_t)(arg2), (int32_t)(pos));
     2931}
     2932
     2933//The total number of operations is 2.0
     2934template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::insert(bitblock128_t arg1, FieldType<32>::T arg2)
     2935{
     2936        return mvmd128<(16)>::insert<(2*pos)>(mvmd128<(16)>::insert<((2*pos)+1)>(arg1, (arg2>>(16))), (arg2&(65535)));
     2937}
     2938
     2939//The total number of operations is 4.0
     2940template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::insert(bitblock128_t arg1, FieldType<64>::T arg2)
     2941{
     2942        return mvmd128<(32)>::insert<(2*pos)>(mvmd128<(32)>::insert<((2*pos)+1)>(arg1, (arg2>>(32))), (arg2&((4294967296ULL)-1)));
     2943}
     2944
     2945//The total number of operations is 5.0
     2946template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4)
     2947{
     2948        return simd128<1>::ifh(simd128<(4)>::himask(), mvmd128<1>::fill2(val1, val2), mvmd128<1>::fill2(val3, val4));
     2949}
     2950
     2951//The total number of operations is 5.0
     2952template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4)
     2953{
     2954        return simd128<1>::ifh(simd128<(8)>::himask(), mvmd128<2>::fill2(val1, val2), mvmd128<2>::fill2(val3, val4));
     2955}
     2956
     2957//The total number of operations is 5.0
     2958template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill4(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4)
     2959{
     2960        return simd128<1>::ifh(simd128<(16)>::himask(), mvmd128<4>::fill2(val1, val2), mvmd128<4>::fill2(val3, val4));
     2961}
     2962
     2963//The total number of operations is 5.0
     2964template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill4(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4)
     2965{
     2966        return simd128<1>::ifh(simd128<(32)>::himask(), mvmd128<8>::fill2(val1, val2), mvmd128<8>::fill2(val3, val4));
     2967}
     2968
     2969//The total number of operations is 3.0
     2970template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill4(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4)
     2971{
     2972        return simd_or(mvmd128<(32)>::fill4((val1<<16), (val3<<16), (val1<<16), (val3<<16)), mvmd128<(32)>::fill4((val2&(65535)), (val4&(65535)), (val2&(65535)), (val4&(65535))));
     2973}
     2974
     2975//The total number of operations is 1.0
     2976template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill4(FieldType<32>::T val1, FieldType<32>::T val2, FieldType<32>::T val3, FieldType<32>::T val4)
     2977{
     2978        return _mm_set_epi32((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4));
     2979}
     2980
     2981//The total number of operations is 2.33333333333
     2982template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::srli(bitblock128_t arg1)
     2983{
     2984        return simd128<128>::srli<(sh*2)>(arg1);
     2985}
     2986
     2987//The total number of operations is 2.33333333333
     2988template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::srli(bitblock128_t arg1)
     2989{
     2990        return simd128<128>::srli<(sh*4)>(arg1);
     2991}
     2992
     2993//The total number of operations is 1.0
     2994template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::srli(bitblock128_t arg1)
     2995{
     2996        return _mm_srli_si128(arg1, (int32_t)(sh));
     2997}
     2998
     2999//The total number of operations is 1.0
     3000template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::srli(bitblock128_t arg1)
     3001{
     3002        return mvmd128<(8)>::srli<(sh*2)>(arg1);
     3003}
     3004
     3005//The total number of operations is 1.0
     3006template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::srli(bitblock128_t arg1)
     3007{
     3008        return mvmd128<(16)>::srli<(sh*2)>(arg1);
     3009}
     3010
     3011//The total number of operations is 1.0
     3012template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::srli(bitblock128_t arg1)
     3013{
     3014        return mvmd128<(32)>::srli<(sh*2)>(arg1);
     3015}
     3016
     3017//The total number of operations is 1.0
     3018template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::srli(bitblock128_t arg1)
     3019{
     3020        return mvmd128<(64)>::srli<(sh*2)>(arg1);
     3021}
     3022
     3023//The total number of operations is 1.0
     3024template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill2(FieldType<1>::T val1, FieldType<1>::T val2)
     3025{
     3026        return mvmd128<(2)>::fill(((val1<<1)|(val2&(1))));
     3027}
     3028
     3029//The total number of operations is 1.0
     3030template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill2(FieldType<2>::T val1, FieldType<2>::T val2)
     3031{
     3032        return mvmd128<(4)>::fill(((val1<<2)|(val2&(3))));
     3033}
     3034
     3035//The total number of operations is 1.0
     3036template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill2(FieldType<4>::T val1, FieldType<4>::T val2)
     3037{
     3038        return mvmd128<(8)>::fill(((val1<<4)|(val2&(15))));
     3039}
     3040
     3041//The total number of operations is 1.0
     3042template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill2(FieldType<8>::T val1, FieldType<8>::T val2)
     3043{
     3044        return mvmd128<(16)>::fill(((val1<<8)|(val2&(255))));
     3045}
     3046
     3047//The total number of operations is 1.0
     3048template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill2(FieldType<16>::T val1, FieldType<16>::T val2)
     3049{
     3050        return mvmd128<(32)>::fill(((val1<<16)|(val2&(65535))));
     3051}
     3052
     3053//The total number of operations is 5.0
     3054template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill2(FieldType<32>::T val1, FieldType<32>::T val2)
     3055{
     3056        return simd128<1>::ifh(simd128<(64)>::himask(), mvmd128<32>::fill(val1), mvmd128<32>::fill(val2));
     3057}
     3058
     3059//The total number of operations is 5.0
     3060template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill2(FieldType<64>::T val1, FieldType<64>::T val2)
     3061{
     3062        return simd128<1>::ifh(simd128<(128)>::himask(), mvmd128<64>::fill(val1), mvmd128<64>::fill(val2));
     3063}
     3064
     3065//The total number of operations is 5.66666666667
     3066template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     3067{
     3068        return simd_or(mvmd128<2>::slli<sh>(arg1), mvmd128<2>::srli<((64)-sh)>(arg2));
     3069}
     3070
     3071//The total number of operations is 5.66666666667
     3072template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     3073{
     3074        return simd_or(mvmd128<4>::slli<sh>(arg1), mvmd128<4>::srli<((32)-sh)>(arg2));
     3075}
     3076
     3077//The total number of operations is 3.0
     3078template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     3079{
     3080        return simd_or(mvmd128<8>::slli<sh>(arg1), mvmd128<8>::srli<((16)-sh)>(arg2));
     3081}
     3082
     3083//The total number of operations is 3.0
     3084template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     3085{
     3086        return simd_or(mvmd128<16>::slli<sh>(arg1), mvmd128<16>::srli<((8)-sh)>(arg2));
     3087}
     3088
     3089//The total number of operations is 3.0
     3090template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     3091{
     3092        return simd_or(mvmd128<32>::slli<sh>(arg1), mvmd128<32>::srli<((4)-sh)>(arg2));
     3093}
     3094
     3095//The total number of operations is 3.0
     3096template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     3097{
     3098        return simd_or(mvmd128<64>::slli<sh>(arg1), mvmd128<64>::srli<((2)-sh)>(arg2));
     3099}
     3100
     3101//The total number of operations is 3.0
     3102template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     3103{
     3104        return simd_or(mvmd128<128>::slli<sh>(arg1), mvmd128<128>::srli<((1)-sh)>(arg2));
     3105}
     3106
     3107//The total number of operations is 2.33333333333
     3108template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
     3109{
     3110        return simd128<128>::slli<(sh*2)>(arg1);
     3111}
     3112
     3113//The total number of operations is 2.33333333333
     3114template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
     3115{
     3116        return simd128<128>::slli<(sh*4)>(arg1);
     3117}
     3118
     3119//The total number of operations is 1.0
     3120template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
     3121{
     3122        return _mm_slli_si128(arg1, (int32_t)(sh));
     3123}
     3124
     3125//The total number of operations is 1.0
     3126template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
     3127{
     3128        return mvmd128<(8)>::slli<(sh*2)>(arg1);
     3129}
     3130
     3131//The total number of operations is 1.0
     3132template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
     3133{
     3134        return mvmd128<(16)>::slli<(sh*2)>(arg1);
     3135}
     3136
     3137//The total number of operations is 1.0
     3138template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
     3139{
     3140        return mvmd128<(32)>::slli<(sh*2)>(arg1);
     3141}
     3142
     3143//The total number of operations is 1.0
     3144template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
     3145{
     3146        return mvmd128<(64)>::slli<(sh*2)>(arg1);
     3147}
     3148
     3149//The total number of operations is 13.0
     3150template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8)
     3151{
     3152        return simd128<1>::ifh(simd128<(8)>::himask(), mvmd128<1>::fill4(val1, val2, val3, val4), mvmd128<1>::fill4(val5, val6, val7, val8));
     3153}
     3154
     3155//The total number of operations is 13.0
     3156template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8)
     3157{
     3158        return simd128<1>::ifh(simd128<(16)>::himask(), mvmd128<2>::fill4(val1, val2, val3, val4), mvmd128<2>::fill4(val5, val6, val7, val8));
    27543159}
    27553160
    27563161//The total number of operations is 7.0
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<2>::fill16, which ORs two
    // mvmd128<4>::fill16 calls (odd-numbered values shifted left by 2, even-numbered values masked to 2 bits).
    2757 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
    2758 {
    2759         return simd_or(mvmd128<(4)>::fill16((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd128<(4)>::fill16((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
    2760 }
    2761 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<4>::fill16, which ORs two
    // mvmd128<8>::fill16 calls (odd-numbered values shifted left by 4, even-numbered values masked to 4 bits).
    2762 //The total number of operations is 3.0
    2763 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
    2764 {
    2765         return simd_or(mvmd128<(8)>::fill16((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd128<(8)>::fill16((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
    2766 }
    2767 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<8>::fill16,
    // a single _mm_set_epi8 call with val1 as the first argument and val16 as the last.
    2768 //The total number of operations is 1.0
    2769 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
    2770 {
    2771         return _mm_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
    2772 }
    2773 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<1>::fill4,
    // merging two fill2 results with simd128<1>::ifh under simd128<4>::himask().
    2774 //The total number of operations is 5.0
    2775 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
    2776 {
    2777         return simd128<1>::ifh(simd128<(4)>::himask(), mvmd128<1>::fill2(val1, val2), mvmd128<1>::fill2(val3, val4));
    2778 }
    2779 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<2>::fill4,
    // merging two fill2 results with simd128<1>::ifh under simd128<8>::himask().
    2780 //The total number of operations is 5.0
    2781 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
    2782 {
    2783         return simd128<1>::ifh(simd128<(8)>::himask(), mvmd128<2>::fill2(val1, val2), mvmd128<2>::fill2(val3, val4));
    2784 }
    2785 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<4>::fill4,
    // merging two fill2 results with simd128<1>::ifh under simd128<16>::himask().
    2786 //The total number of operations is 5.0
    2787 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
    2788 {
    2789         return simd128<1>::ifh(simd128<(16)>::himask(), mvmd128<4>::fill2(val1, val2), mvmd128<4>::fill2(val3, val4));
    2790 }
    2791 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<8>::fill4,
    // merging two fill2 results with simd128<1>::ifh under simd128<32>::himask().
    2792 //The total number of operations is 5.0
    2793 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
    2794 {
    2795         return simd128<1>::ifh(simd128<(32)>::himask(), mvmd128<8>::fill2(val1, val2), mvmd128<8>::fill2(val3, val4));
    2796 }
    2797 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<16>::fill4, ORing two
    // mvmd128<32>::fill4 calls (val1/val3 shifted left by 16, val2/val4 masked to 16 bits).
    2798 //The total number of operations is 3.0
    2799 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
    2800 {
    2801         return simd_or(mvmd128<(32)>::fill4((val1<<16), (val3<<16), (val1<<16), (val3<<16)), mvmd128<(32)>::fill4((val2&(65535)), (val4&(65535)), (val2&(65535)), (val4&(65535))));
    2802 }
    2803 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<32>::fill4,
    // a single _mm_set_epi32 call with val1..val4 in argument order.
    2804 //The total number of operations is 1.0
    2805 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
    2806 {
    2807         return _mm_set_epi32((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4));
    2808 }
    2809 
    // Removed-side diff rows (old line numbers only): mvmd128<2>::srli with a uint64_t
    // template parameter; delegates to simd128<128>::srli<sh*2>.
    2810 //The total number of operations is 2.33333333333
    2811 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::srli(bitblock128_t arg1)
    2812 {
    2813         return simd128<128>::srli<(sh*2)>(arg1);
    2814 }
    2815 
    // Removed-side diff rows (old line numbers only): mvmd128<4>::srli with a uint64_t
    // template parameter; delegates to simd128<128>::srli<sh*4>.
    2816 //The total number of operations is 2.33333333333
    2817 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::srli(bitblock128_t arg1)
    2818 {
    2819         return simd128<128>::srli<(sh*4)>(arg1);
    2820 }
    2821 
    // Removed-side diff rows (old line numbers only): mvmd128<8>::srli with a uint64_t
    // template parameter; wraps _mm_srli_si128.
    2822 //The total number of operations is 1.0
    2823 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::srli(bitblock128_t arg1)
    2824 {
    2825         return _mm_srli_si128(arg1, (int32_t)(sh));
    2826 }
    2827 
    // Removed-side diff rows (old line numbers only): mvmd128<16>::srli with a uint64_t
    // template parameter; delegates to mvmd128<8>::srli<sh*2>.
    2828 //The total number of operations is 1.0
    2829 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::srli(bitblock128_t arg1)
    2830 {
    2831         return mvmd128<(8)>::srli<(sh*2)>(arg1);
    2832 }
    2833 
    // Removed-side diff rows (old line numbers only): mvmd128<32>::srli with a uint64_t
    // template parameter; delegates to mvmd128<16>::srli<sh*2>.
    2834 //The total number of operations is 1.0
    2835 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::srli(bitblock128_t arg1)
    2836 {
    2837         return mvmd128<(16)>::srli<(sh*2)>(arg1);
    2838 }
    2839 
    // Removed-side diff rows (old line numbers only): mvmd128<64>::srli with a uint64_t
    // template parameter; delegates to mvmd128<32>::srli<sh*2>.
    2840 //The total number of operations is 1.0
    2841 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::srli(bitblock128_t arg1)
    2842 {
    2843         return mvmd128<(32)>::srli<(sh*2)>(arg1);
    2844 }
    2845 
    // Removed-side diff rows (old line numbers only): mvmd128<128>::srli with a uint64_t
    // template parameter; delegates to mvmd128<64>::srli<sh*2>.
    2846 //The total number of operations is 1.0
    2847 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::srli(bitblock128_t arg1)
    2848 {
    2849         return mvmd128<(64)>::srli<(sh*2)>(arg1);
    2850 }
    2851 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<1>::fill2;
    // packs (val1<<1)|(val2&1) and passes it to mvmd128<2>::fill.
    2852 //The total number of operations is 1.0
    2853 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill2(uint64_t val1, uint64_t val2)
    2854 {
    2855         return mvmd128<(2)>::fill(((val1<<1)|(val2&(1))));
    2856 }
    2857 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<2>::fill2;
    // packs (val1<<2)|(val2&3) and passes it to mvmd128<4>::fill.
    2858 //The total number of operations is 1.0
    2859 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill2(uint64_t val1, uint64_t val2)
    2860 {
    2861         return mvmd128<(4)>::fill(((val1<<2)|(val2&(3))));
    2862 }
    2863 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<4>::fill2;
    // packs (val1<<4)|(val2&15) and passes it to mvmd128<8>::fill.
    2864 //The total number of operations is 1.0
    2865 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill2(uint64_t val1, uint64_t val2)
    2866 {
    2867         return mvmd128<(8)>::fill(((val1<<4)|(val2&(15))));
    2868 }
    2869 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<8>::fill2;
    // packs (val1<<8)|(val2&255) and passes it to mvmd128<16>::fill.
    2870 //The total number of operations is 1.0
    2871 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill2(uint64_t val1, uint64_t val2)
    2872 {
    2873         return mvmd128<(16)>::fill(((val1<<8)|(val2&(255))));
    2874 }
    2875 
    // Removed-side diff rows (old line numbers only): uint64_t-typed mvmd128<16>::fill2;
    // packs (val1<<16)|(val2&65535) and passes it to mvmd128<32>::fill.
    2876 //The total number of operations is 1.0
    2877 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill2(uint64_t val1, uint64_t val2)
    2878 {
    2879         return mvmd128<(32)>::fill(((val1<<16)|(val2&(65535))));
    2880 }
    2881 
    2882 //The total number of operations is 5.0
         // mvmd128<32>::fill2: unlike the narrower widths, the pair is not packed into
         // one scalar. Two blocks are built, mvmd128<32>::fill(val1) and
         // mvmd128<32>::fill(val2), and simd128<1>::ifh chooses between them using
         // simd128<64>::himask() as the selector.
         // NOTE(review): presumably val1 lands in the high 32 bits of every 64-bit
         // field and val2 in the low 32 bits; confirm against ifh/himask.
    2883 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill2(uint64_t val1, uint64_t val2)
    2884 {
    2885         return simd128<1>::ifh(simd128<(64)>::himask(), mvmd128<32>::fill(val1), mvmd128<32>::fill(val2));
    2886 }
    2887 
    2888 //The total number of operations is 5.0
         // mvmd128<64>::fill2: builds mvmd128<64>::fill(val1) and
         // mvmd128<64>::fill(val2), then lets simd128<1>::ifh choose between them
         // using simd128<128>::himask() as the selector.
         // NOTE(review): presumably val1 ends up in the high 64 bits of the block and
         // val2 in the low 64 bits; confirm against ifh/himask.
    2889 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill2(uint64_t val1, uint64_t val2)
    2890 {
    2891         return simd128<1>::ifh(simd128<(128)>::himask(), mvmd128<64>::fill(val1), mvmd128<64>::fill(val2));
    2892 }
    2893 
    2894 //The total number of operations is 5.66666666667
         // mvmd128<2>::dslli<sh>: ORs together arg1 moved by sh fields through
         // mvmd128<2>::slli and arg2 moved by (64 - sh) fields through mvmd128<2>::srli.
         // 64 is the number of 2-bit fields in a 128-bit block (128 / 2).
         // sh is unsigned, so a value above 64 would wrap the srli count rather than
         // go negative.
    2895 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    2896 {
    2897         return simd_or(mvmd128<2>::slli<sh>(arg1), mvmd128<2>::srli<((64)-sh)>(arg2));
    2898 }
    2899 
    2900 //The total number of operations is 5.66666666667
         // mvmd128<4>::dslli<sh>: ORs together arg1 moved by sh fields through
         // mvmd128<4>::slli and arg2 moved by (32 - sh) fields through mvmd128<4>::srli.
         // 32 is the number of 4-bit fields in a 128-bit block (128 / 4).
         // sh is unsigned, so a value above 32 would wrap the srli count rather than
         // go negative.
    2901 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    2902 {
    2903         return simd_or(mvmd128<4>::slli<sh>(arg1), mvmd128<4>::srli<((32)-sh)>(arg2));
    2904 }
    2905 
    2906 //The total number of operations is 3.0
         // mvmd128<8>::dslli<sh>: ORs together arg1 moved by sh fields through
         // mvmd128<8>::slli and arg2 moved by (16 - sh) fields through mvmd128<8>::srli.
         // 16 is the number of 8-bit fields in a 128-bit block (128 / 8).
         // sh is unsigned, so a value above 16 would wrap the srli count rather than
         // go negative.
    2907 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    2908 {
    2909         return simd_or(mvmd128<8>::slli<sh>(arg1), mvmd128<8>::srli<((16)-sh)>(arg2));
    2910 }
    2911 
    2912 //The total number of operations is 3.0
         // mvmd128<16>::dslli<sh>: ORs together arg1 moved by sh fields through
         // mvmd128<16>::slli and arg2 moved by (8 - sh) fields through mvmd128<16>::srli.
         // 8 is the number of 16-bit fields in a 128-bit block (128 / 16).
         // sh is unsigned, so a value above 8 would wrap the srli count rather than
         // go negative.
    2913 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    2914 {
    2915         return simd_or(mvmd128<16>::slli<sh>(arg1), mvmd128<16>::srli<((8)-sh)>(arg2));
    2916 }
    2917 
    2918 //The total number of operations is 3.0
         // mvmd128<32>::dslli<sh>: ORs together arg1 moved by sh fields through
         // mvmd128<32>::slli and arg2 moved by (4 - sh) fields through mvmd128<32>::srli.
         // 4 is the number of 32-bit fields in a 128-bit block (128 / 32).
         // sh is unsigned, so a value above 4 would wrap the srli count rather than
         // go negative.
    2919 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    2920 {
    2921         return simd_or(mvmd128<32>::slli<sh>(arg1), mvmd128<32>::srli<((4)-sh)>(arg2));
    2922 }
    2923 
    2924 //The total number of operations is 3.0
         // mvmd128<64>::dslli<sh>: ORs together arg1 moved by sh fields through
         // mvmd128<64>::slli and arg2 moved by (2 - sh) fields through mvmd128<64>::srli.
         // 2 is the number of 64-bit fields in a 128-bit block (128 / 64).
         // sh is unsigned, so a value above 2 would wrap the srli count rather than
         // go negative.
    2925 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    2926 {
    2927         return simd_or(mvmd128<64>::slli<sh>(arg1), mvmd128<64>::srli<((2)-sh)>(arg2));
    2928 }
    2929 
    2930 //The total number of operations is 3.0
         // mvmd128<128>::dslli<sh>: ORs together arg1 moved by sh fields through
         // mvmd128<128>::slli and arg2 moved by (1 - sh) fields through
         // mvmd128<128>::srli. A 128-bit block holds exactly one 128-bit field, hence
         // the constant 1.
         // sh is unsigned, so a value above 1 would wrap the srli count rather than
         // go negative.
    2931 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    2932 {
    2933         return simd_or(mvmd128<128>::slli<sh>(arg1), mvmd128<128>::srli<((1)-sh)>(arg2));
    2934 }
    2935 
    2936 //The total number of operations is 2.33333333333
         // mvmd128<2>::slli<sh>: leaves the mvmd128 family and delegates to
         // simd128<128>::slli with the count doubled (sh*2).
         // NOTE(review): presumably sh*2 is the distance in bits for sh 2-bit fields
         // and simd128<128>::slli shifts the block as one 128-bit field; confirm.
    2937 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
    2938 {
    2939         return simd128<128>::slli<(sh*2)>(arg1);
    2940 }
    2941 
    2942 //The total number of operations is 2.33333333333
         // mvmd128<4>::slli<sh>: does no work of its own. It re-expresses the count in
         // 2-bit units (sh*2, since 4 = 2*2) and delegates to mvmd128<2>::slli.
    2943 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
    2944 {
    2945         return mvmd128<(2)>::slli<(sh*2)>(arg1);
    2946 }
    2947 
    2948 //The total number of operations is 1.0
         // mvmd128<8>::slli<sh>: maps directly onto the _mm_slli_si128 intrinsic,
         // which shifts the whole 128-bit register left by sh bytes and fills with
         // zeros. The 16/32/64/128 specializations of slli reduce to this one by
         // repeatedly doubling their count.
         // sh is cast to int32_t because the intrinsic takes an int immediate.
    2949 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
    2950 {
    2951         return _mm_slli_si128(arg1, (int32_t)(sh));
    2952 }
    2953 
    2954 //The total number of operations is 1.0
         // mvmd128<16>::slli<sh>: does no work of its own. It re-expresses the count in
         // 8-bit units (sh*2, since 16 = 2*8) and delegates to mvmd128<8>::slli.
    2955 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
    2956 {
    2957         return mvmd128<(8)>::slli<(sh*2)>(arg1);
    2958 }
    2959 
    2960 //The total number of operations is 1.0
         // mvmd128<32>::slli<sh>: does no work of its own. It re-expresses the count in
         // 16-bit units (sh*2, since 32 = 2*16) and delegates to mvmd128<16>::slli.
    2961 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
    2962 {
    2963         return mvmd128<(16)>::slli<(sh*2)>(arg1);
    2964 }
    2965 
    2966 //The total number of operations is 1.0
         // mvmd128<64>::slli<sh>: does no work of its own. It re-expresses the count in
         // 32-bit units (sh*2, since 64 = 2*32) and delegates to mvmd128<32>::slli.
    2967 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
    2968 {
    2969         return mvmd128<(32)>::slli<(sh*2)>(arg1);
    2970 }
    2971 
    2972 //The total number of operations is 1.0
         // mvmd128<128>::slli<sh>: does no work of its own. It re-expresses the count
         // in 64-bit units (sh*2, since 128 = 2*64) and delegates to mvmd128<64>::slli.
    2973 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
    2974 {
    2975         return mvmd128<(64)>::slli<(sh*2)>(arg1);
    2976 }
    2977 
    2978 //The total number of operations is 13.0
         // mvmd128<1>::fill8: splits the eight values into two groups of four, builds
         // a block from each with mvmd128<1>::fill4, and lets simd128<1>::ifh choose
         // between the two blocks using simd128<8>::himask() as the selector.
         // NOTE(review): presumably val1..val4 occupy the high 4 bits of every 8-bit
         // field and val5..val8 the low 4 bits; confirm against ifh/himask.
    2979 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
    2980 {
    2981         return simd128<1>::ifh(simd128<(8)>::himask(), mvmd128<1>::fill4(val1, val2, val3, val4), mvmd128<1>::fill4(val5, val6, val7, val8));
    2982 }
    2983 
    2984 //The total number of operations is 13.0
         // mvmd128<2>::fill8: splits the eight values into two groups of four, builds
         // a block from each with mvmd128<2>::fill4, and lets simd128<1>::ifh choose
         // between the two blocks using simd128<16>::himask() as the selector.
         // NOTE(review): presumably val1..val4 occupy the high 8 bits of every 16-bit
         // field and val5..val8 the low 8 bits; confirm against ifh/himask.
    2985 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
    2986 {
    2987         return simd128<1>::ifh(simd128<(16)>::himask(), mvmd128<2>::fill4(val1, val2, val3, val4), mvmd128<2>::fill4(val5, val6, val7, val8));
    2988 }
    2989 
    2990 //The total number of operations is 7.0
    2991 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     3162template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill8(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8)
    29923163{
    29933164        return simd_or(mvmd128<(8)>::fill8((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4)), mvmd128<(8)>::fill8((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15))));
     
    29953166
    29963167//The total number of operations is 3.0
    2997 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     3168template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill8(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8)
    29983169{
    29993170        return simd_or(mvmd128<(16)>::fill8((val1<<8), (val3<<8), (val5<<8), (val7<<8), (val1<<8), (val3<<8), (val5<<8), (val7<<8)), mvmd128<(16)>::fill8((val2&(255)), (val4&(255)), (val6&(255)), (val8&(255)), (val2&(255)), (val4&(255)), (val6&(255)), (val8&(255))));
     
    30013172
    30023173//The total number of operations is 1.0
    3003 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     3174template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill8(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8)
    30043175{
    30053176        return _mm_set_epi16((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8));
     
    30193190
    30203191//The total number of operations is 2.33333333333
            // bitblock128::srli<sh>: thin wrapper that forwards the block and the
            // compile-time count unchanged to simd128<128>::srli<sh>.
    3021 template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srli(bitblock128_t arg1)
     3192template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srli(bitblock128_t arg1)
    30223193{
    30233194        return simd128<128>::srli<sh>(arg1);
    30243195}
    30253196
    3026 //The total number of operations is 1.0
         // bitblock128::store_aligned: writes the block arg1 to the location arg2
         // using _mm_store_si128. That intrinsic requires a 16-byte-aligned
         // destination, so callers must pass an aligned pointer.
    3027 IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
    3028 {
    3029         _mm_store_si128((bitblock128_t*)(arg2), arg1);
    3030 }
    3031 
    30323197//The total number of operations is 20.3333333333
    3033 IDISA_ALWAYS_INLINE uint64_t bitblock128::popcount(bitblock128_t arg1)
     3198IDISA_ALWAYS_INLINE uint16_t bitblock128::popcount(bitblock128_t arg1)
    30343199{
    30353200        return mvmd128<64>::extract<0>(simd128<128>::popcount(arg1));
     
    30483213}
    30493214
    3050 //The total number of operations is 2.33333333333
         // bitblock128::slli<sh>: thin wrapper that forwards the block and the
         // compile-time count unchanged to simd128<128>::slli<sh>.
    3051 template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
    3052 {
    3053         return simd128<128>::slli<sh>(arg1);
    3054 }
    3055 
    30563215//The total number of operations is 2.0
    30573216IDISA_ALWAYS_INLINE bool bitblock128::any(bitblock128_t arg1)
     
    30723231}
    30733232
     3233//The total number of operations is 2.33333333333
         // bitblock128::slli<sh>: thin wrapper that forwards the block and the
         // compile-time count (a uint16_t template argument) unchanged to
         // simd128<128>::slli<sh>.
     3234template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
     3235{
     3236        return simd128<128>::slli<sh>(arg1);
     3237}
     3238
     3239//The total number of operations is 1.0
         // bitblock128::store_aligned: writes the block arg1 to the location arg2
         // using _mm_store_si128. That intrinsic requires a 16-byte-aligned
         // destination, so callers must pass an aligned pointer.
     3240IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
     3241{
     3242        _mm_store_si128((bitblock128_t*)(arg2), arg1);
     3243}
     3244
    30743245#endif
Note: See TracChangeset for help on using the changeset viewer.