Changeset 3441 for trunk


Ignore:
Timestamp:
Sep 7, 2013, 3:05:51 PM (6 years ago)
Author:
linmengl
Message:

Make AVX2 runnable now

Location:
trunk/lib
Files:
1 added
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/bitblock256.hpp

    r3439 r3441  
    1818union ubitblock {
    1919        bitblock256_t _256;
    20         bitblock256_t _128[sizeof(bitblock256_t)/sizeof(bitblock256_t)];
     20        bitblock128_t _128[sizeof(bitblock256_t)/sizeof(bitblock256_t)];
    2121        uint64_t _64[sizeof(bitblock256_t)/sizeof(uint64_t)];
    2222        uint32_t _32[sizeof(bitblock256_t)/sizeof(uint32_t)];
     
    2828typedef bitblock256_t carry_t;
    2929
     30static IDISA_ALWAYS_INLINE void add_ci_co(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum);
     31static IDISA_ALWAYS_INLINE void sub_bi_bo(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference);
     32static IDISA_ALWAYS_INLINE void adv_ci_co(bitblock256_t cursor, bitblock256_t carry_in, bitblock256_t & carry_out, bitblock256_t & rslt);
     33
     34
     35
     36
    3037static IDISA_ALWAYS_INLINE bitblock256_t carry2bitblock(carry_t carry);
    3138static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry);
     
    4653static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry) {  return carry;}
    4754
    48 static inline void add_ci_co(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum);
    49 static inline void sub_bi_bo(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference);
    50 static IDISA_ALWAYS_INLINE void adv_ci_co(bitblock256_t cursor, bitblock256_t carry_in, bitblock256_t & carry_out, bitblock256_t & rslt);
    51 
    52 
    5355static inline void add_ci_co(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum) {
    54 bitblock256_t all_ones = simd256<1>::constant<1>();
    55 bitblock256_t gen = simd_and(x, y);
    56 bitblock256_t prop = simd_xor(x, y);
    57 bitblock256_t partial_sum = simd256<64>::add(x, y);
    58 bitblock256_t carry = simd_or(gen, simd_andc(prop, partial_sum));
    59 bitblock256_t bubble = simd256<64>::eq(partial_sum, all_ones);
    60 uint64_t carry_mask = hsimd256<64>::signmask(carry) * 2 + convert(carry_in);
    61 uint64_t bubble_mask = hsimd256<64>::signmask(bubble);
    62 uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) &~ bubble_mask;
    63 uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask);
    64 carry_out = convert(increments >> 4);
    65 uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001;
    66 sum = simd256<64>::add(partial_sum, _mm256_cvtepu8_epi64(avx_select_lo128(convert(spread))));
     56  bitblock256_t all_ones = simd256<1>::constant<1>();
     57  bitblock256_t gen = simd_and(x, y);
     58  bitblock256_t prop = simd_xor(x, y);
     59  bitblock256_t partial_sum = simd256<64>::add(x, y);
     60  bitblock256_t carry = simd_or(gen, simd_andc(prop, partial_sum));
     61  bitblock256_t bubble = simd256<64>::eq(partial_sum, all_ones);
     62  uint64_t carry_mask = hsimd256<64>::signmask(carry) * 2 + convert(carry_in);
     63  uint64_t bubble_mask = hsimd256<64>::signmask(bubble);
     64  uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) &~ bubble_mask;
     65  uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask);
     66  carry_out = convert(increments >> 4);
     67  uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001;
     68  sum = simd256<64>::add(partial_sum, _mm256_cvtepu8_epi64(avx_select_lo128(convert(spread))));
    6769}
    6870
     
    7476        difference = simd256<128>::sub(partial, b1);
    7577        borrow_out = simd_or(gen, simd_and(prop, difference));
    76 
    7778}
    7879
     
    8384        rslt = simd_or(simd256<64>::add(cursor, cursor), low_bits);
    8485}
    85 
    8686
    8787
     
    251251}
    252252
    253 IDISA_ALWAYS_INLINE uint64_t convert (bitblock256_t v)
     253IDISA_ALWAYS_INLINE uint64_t convert(bitblock256_t v)
    254254{
    255255  return (uint64_t) mvmd256<64>::extract<0>(v);
  • trunk/lib/idisa256.hpp

    r2275 r3441  
    99#define IDISA256_HPP
    1010
     11#if defined USE_AVX
    1112#include "idisa_cpp/idisa_avx.cpp"
     13#else
     14#include "idisa_cpp/idisa_avx2.cpp"
    1215#endif
     16               
     17#endif
  • trunk/lib/idisa_cpp/idisa_avx.cpp

    r2275 r3441  
    1616
    1717typedef __m256 bitblock256_t;
     18                       
    1819template <uint32_t fw>
    1920class simd256
     
    2526        static IDISA_ALWAYS_INLINE bitblock256_t umult(bitblock256_t arg1, bitblock256_t arg2);
    2627        static IDISA_ALWAYS_INLINE bitblock256_t ult(bitblock256_t arg1, bitblock256_t arg2);
    27         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
     28        static IDISA_ALWAYS_INLINE bitblock256_t all(bitblock256_t arg1);
     29        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
    2830        static IDISA_ALWAYS_INLINE bitblock256_t ctz(bitblock256_t arg1);
    2931        static IDISA_ALWAYS_INLINE bitblock256_t eq(bitblock256_t arg1, bitblock256_t arg2);
     
    3133        static IDISA_ALWAYS_INLINE bitblock256_t neg(bitblock256_t arg1);
    3234        static IDISA_ALWAYS_INLINE bitblock256_t himask();
    33         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
     35        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
    3436        static IDISA_ALWAYS_INLINE bitblock256_t ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    3537        static IDISA_ALWAYS_INLINE bitblock256_t sub(bitblock256_t arg1, bitblock256_t arg2);
     
    3739        static IDISA_ALWAYS_INLINE bitblock256_t lomask();
    3840        static IDISA_ALWAYS_INLINE bitblock256_t umin(bitblock256_t arg1, bitblock256_t arg2);
    39         template <uint64_t val> static IDISA_ALWAYS_INLINE bitblock256_t constant();
     41        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock256_t constant();
    4042        static IDISA_ALWAYS_INLINE bitblock256_t min(bitblock256_t arg1, bitblock256_t arg2);
    4143        static IDISA_ALWAYS_INLINE bitblock256_t umax(bitblock256_t arg1, bitblock256_t arg2);
    4244        static IDISA_ALWAYS_INLINE bitblock256_t abs(bitblock256_t arg1);
    4345        static IDISA_ALWAYS_INLINE bitblock256_t xor_hl(bitblock256_t arg1);
    44         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srai(bitblock256_t arg1);
     46        static IDISA_ALWAYS_INLINE bitblock256_t any(bitblock256_t arg1);
     47        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srai(bitblock256_t arg1);
    4548        static IDISA_ALWAYS_INLINE bitblock256_t lt(bitblock256_t arg1, bitblock256_t arg2);
    4649        static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2);
     
    5659        static IDISA_ALWAYS_INLINE bitblock256_t packss(bitblock256_t arg1, bitblock256_t arg2);
    5760        static IDISA_ALWAYS_INLINE bitblock256_t packh(bitblock256_t arg1, bitblock256_t arg2);
    58         static IDISA_ALWAYS_INLINE uint64_t signmask(bitblock256_t arg1);
     61        static IDISA_ALWAYS_INLINE typename FieldType<256/fw>::T signmask(bitblock256_t arg1);
    5962        static IDISA_ALWAYS_INLINE bitblock256_t packl(bitblock256_t arg1, bitblock256_t arg2);
    6063        static IDISA_ALWAYS_INLINE bitblock256_t min_hl(bitblock256_t arg1, bitblock256_t arg2);
     
    7881{
    7982public:
    80         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dsrli(bitblock256_t arg1, bitblock256_t arg2);
    81         static IDISA_ALWAYS_INLINE bitblock256_t fill(uint64_t val1);
    82         template <uint64_t pos> static IDISA_ALWAYS_INLINE uint64_t extract(bitblock256_t arg1);
    83         template <uint64_t pos> static IDISA_ALWAYS_INLINE bitblock256_t splat(bitblock256_t arg1);
    84         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
    85         static IDISA_ALWAYS_INLINE bitblock256_t fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    86         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
    87         static IDISA_ALWAYS_INLINE bitblock256_t fill2(uint64_t val1, uint64_t val2);
    88         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dslli(bitblock256_t arg1, bitblock256_t arg2);
    89         static IDISA_ALWAYS_INLINE bitblock256_t fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    90         static IDISA_ALWAYS_INLINE bitblock256_t fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
     83        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dsrli(bitblock256_t arg1, bitblock256_t arg2);
     84        static IDISA_ALWAYS_INLINE bitblock256_t fill(typename FieldType<fw>::T val1);
     85        template <uint8_t pos> static IDISA_ALWAYS_INLINE typename FieldType<fw>::T extract(bitblock256_t arg1);
     86        template <uint16_t pos> static IDISA_ALWAYS_INLINE bitblock256_t splat(bitblock256_t arg1);
     87        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
     88        static IDISA_ALWAYS_INLINE bitblock256_t fill4(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4);
     89        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
     90        static IDISA_ALWAYS_INLINE bitblock256_t fill2(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2);
     91        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dslli(bitblock256_t arg1, bitblock256_t arg2);
     92        static IDISA_ALWAYS_INLINE bitblock256_t fill8(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4, typename FieldType<fw>::T val5, typename FieldType<fw>::T val6, typename FieldType<fw>::T val7, typename FieldType<fw>::T val8);
     93        static IDISA_ALWAYS_INLINE bitblock256_t fill16(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4, typename FieldType<fw>::T val5, typename FieldType<fw>::T val6, typename FieldType<fw>::T val7, typename FieldType<fw>::T val8, typename FieldType<fw>::T val9, typename FieldType<fw>::T val10, typename FieldType<fw>::T val11, typename FieldType<fw>::T val12, typename FieldType<fw>::T val13, typename FieldType<fw>::T val14, typename FieldType<fw>::T val15, typename FieldType<fw>::T val16);
    9194};
    9295
     
    9598public:
    9699        static IDISA_ALWAYS_INLINE bitblock256_t load_unaligned(const bitblock256_t* arg1);
    97         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
     100        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
    98101        static IDISA_ALWAYS_INLINE void store_aligned(bitblock256_t arg1, bitblock256_t* arg2);
    99102        static IDISA_ALWAYS_INLINE bool all(bitblock256_t arg1);
    100103        static IDISA_ALWAYS_INLINE bool any(bitblock256_t arg1);
    101         static IDISA_ALWAYS_INLINE uint64_t popcount(bitblock256_t arg1);
    102         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
     104        static IDISA_ALWAYS_INLINE uint16_t popcount(bitblock256_t arg1);
     105        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
    103106        static IDISA_ALWAYS_INLINE bitblock256_t load_aligned(const bitblock256_t* arg1);
    104107        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock256_t arg1, bitblock256_t* arg2);
     
    110113IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2);
    111114IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2);
     115IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2);
    112116IDISA_ALWAYS_INLINE bitblock256_t simd_xor(bitblock256_t arg1, bitblock256_t arg2);
    113 IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2);
    114117template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::max(bitblock256_t arg1, bitblock256_t arg2);
    115118template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::max(bitblock256_t arg1, bitblock256_t arg2);
     
    156159template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ult(bitblock256_t arg1, bitblock256_t arg2);
    157160template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ult(bitblock256_t arg1, bitblock256_t arg2);
    158 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2);
    159 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2);
    160 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2);
    161 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2);
    162 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2);
    163 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2);
    164 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2);
    165 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2);
    166 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2);
    167 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srli(bitblock256_t arg1);
    168 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srli(bitblock256_t arg1);
    169 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srli(bitblock256_t arg1);
    170 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srli(bitblock256_t arg1);
    171 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srli(bitblock256_t arg1);
    172 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srli(bitblock256_t arg1);
    173 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srli(bitblock256_t arg1);
    174 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srli(bitblock256_t arg1);
     161template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::all(bitblock256_t arg1);
     162template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::all(bitblock256_t arg1);
     163template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::all(bitblock256_t arg1);
     164template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::all(bitblock256_t arg1);
     165template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::all(bitblock256_t arg1);
     166template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::all(bitblock256_t arg1);
     167template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::all(bitblock256_t arg1);
     168template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::all(bitblock256_t arg1);
     169template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srli(bitblock256_t arg1);
     170template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srli(bitblock256_t arg1);
     171template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srli(bitblock256_t arg1);
     172template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srli(bitblock256_t arg1);
     173template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srli(bitblock256_t arg1);
     174template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srli(bitblock256_t arg1);
     175template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srli(bitblock256_t arg1);
     176template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srli(bitblock256_t arg1);
    175177template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ctz(bitblock256_t arg1);
    176178template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ctz(bitblock256_t arg1);
     
    208210template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1);
    209211template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1);
     212template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::any(bitblock256_t arg1);
     213template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::any(bitblock256_t arg1);
     214template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::any(bitblock256_t arg1);
     215template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::any(bitblock256_t arg1);
     216template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::any(bitblock256_t arg1);
     217template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::any(bitblock256_t arg1);
     218template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::any(bitblock256_t arg1);
     219template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::any(bitblock256_t arg1);
    210220template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1);
    211221template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1);
     
    216226template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1);
    217227template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1);
    218 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1);
    219 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1);
    220 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1);
    221 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1);
    222 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1);
    223 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1);
    224 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1);
    225 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1);
     228template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1);
     229template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1);
     230template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1);
     231template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1);
     232template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1);
     233template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1);
     234template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1);
     235template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1);
    226236template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    227237template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
     
    250260template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1);
    251261template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1);
    252 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant();
    253 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant();
    254 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::constant();
    255 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::constant();
    256 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::constant();
    257 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::constant();
    258 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::constant();
    259 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::constant();
    260 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::constant();
     262template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant();
     263template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant();
     264template <> template <FieldType<4>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::constant();
     265template <> template <FieldType<8>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::constant();
     266template <> template <FieldType<16>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::constant();
     267template <> template <FieldType<32>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::constant();
     268template <> template <FieldType<64>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::constant();
     269template <> template <FieldType<128>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::constant();
     270template <> template <FieldType<256>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::constant();
    261271template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::min(bitblock256_t arg1, bitblock256_t arg2);
    262272template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::min(bitblock256_t arg1, bitblock256_t arg2);
     
    276286template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
    277287template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
     288template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::add(bitblock256_t arg1, bitblock256_t arg2);
     289template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add(bitblock256_t arg1, bitblock256_t arg2);
     290template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add(bitblock256_t arg1, bitblock256_t arg2);
     291template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add(bitblock256_t arg1, bitblock256_t arg2);
     292template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add(bitblock256_t arg1, bitblock256_t arg2);
     293template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add(bitblock256_t arg1, bitblock256_t arg2);
     294template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add(bitblock256_t arg1, bitblock256_t arg2);
     295template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2);
     296template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2);
    278297template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2);
    279298template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2);
     
    302321template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2);
    303322template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2);
    304 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
    305 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
    306 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
    307 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
    308 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
    309 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
    310 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
    311 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
     323template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
     324template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
     325template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
     326template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
     327template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
     328template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
     329template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
     330template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
    312331template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask();
    313332template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask();
     
    318337template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask();
    319338template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask();
    320 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::add(bitblock256_t arg1, bitblock256_t arg2);
    321 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add(bitblock256_t arg1, bitblock256_t arg2);
    322 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add(bitblock256_t arg1, bitblock256_t arg2);
    323 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add(bitblock256_t arg1, bitblock256_t arg2);
    324 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add(bitblock256_t arg1, bitblock256_t arg2);
    325 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add(bitblock256_t arg1, bitblock256_t arg2);
    326 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add(bitblock256_t arg1, bitblock256_t arg2);
    327 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2);
    328 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2);
     339template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2);
     340template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2);
     341template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2);
     342template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2);
     343template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2);
     344template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2);
     345template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2);
     346template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2);
     347template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2);
    329348template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
    330349template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
     
    360379template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::packss(bitblock256_t arg1, bitblock256_t arg2);
    361380template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::packss(bitblock256_t arg1, bitblock256_t arg2);
    362 template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<8>::signmask(bitblock256_t arg1);
    363 template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<16>::signmask(bitblock256_t arg1);
    364 template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<32>::signmask(bitblock256_t arg1);
    365 template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<64>::signmask(bitblock256_t arg1);
    366 template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<128>::signmask(bitblock256_t arg1);
    367 template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<256>::signmask(bitblock256_t arg1);
     381template <> IDISA_ALWAYS_INLINE FieldType<256/8>::T hsimd256<8>::signmask(bitblock256_t arg1);
     382template <> IDISA_ALWAYS_INLINE FieldType<256/16>::T hsimd256<16>::signmask(bitblock256_t arg1);
     383template <> IDISA_ALWAYS_INLINE FieldType<256/32>::T hsimd256<32>::signmask(bitblock256_t arg1);
     384template <> IDISA_ALWAYS_INLINE FieldType<256/64>::T hsimd256<64>::signmask(bitblock256_t arg1);
     385template <> IDISA_ALWAYS_INLINE FieldType<256/128>::T hsimd256<128>::signmask(bitblock256_t arg1);
     386template <> IDISA_ALWAYS_INLINE FieldType<256/256>::T hsimd256<256>::signmask(bitblock256_t arg1);
    368387template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::packl(bitblock256_t arg1, bitblock256_t arg2);
    369388template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::packl(bitblock256_t arg1, bitblock256_t arg2);
     
    446465template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::signextendl(bitblock256_t arg1);
    447466template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::signextendl(bitblock256_t arg1);
    448 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
    449 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
    450 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
    451 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
    452 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
    453 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
    454 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
    455 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
    456 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(uint64_t val1);
    457 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill(uint64_t val1);
    458 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill(uint64_t val1);
    459 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill(uint64_t val1);
    460 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill(uint64_t val1);
    461 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill(uint64_t val1);
    462 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill(uint64_t val1);
    463 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill(uint64_t val1);
    464 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::fill(uint64_t val1);
    465 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<1>::extract(bitblock256_t arg1);
    466 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<2>::extract(bitblock256_t arg1);
    467 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<4>::extract(bitblock256_t arg1);
    468 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<8>::extract(bitblock256_t arg1);
    469 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<16>::extract(bitblock256_t arg1);
    470 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<32>::extract(bitblock256_t arg1);
    471 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<64>::extract(bitblock256_t arg1);
    472 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1);
    473 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1);
    474 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::splat(bitblock256_t arg1);
    475 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::splat(bitblock256_t arg1);
    476 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::splat(bitblock256_t arg1);
    477 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::splat(bitblock256_t arg1);
    478 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::splat(bitblock256_t arg1);
    479 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1);
    480 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1);
    481 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    482 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    483 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    484 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    485 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    486 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    487 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    488 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    489 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    490 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    491 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    492 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    493 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::srli(bitblock256_t arg1);
    494 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::srli(bitblock256_t arg1);
    495 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::srli(bitblock256_t arg1);
    496 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::srli(bitblock256_t arg1);
    497 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::srli(bitblock256_t arg1);
    498 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1);
    499 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1);
    500 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1);
    501 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill2(uint64_t val1, uint64_t val2);
    502 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill2(uint64_t val1, uint64_t val2);
    503 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill2(uint64_t val1, uint64_t val2);
    504 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill2(uint64_t val1, uint64_t val2);
    505 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill2(uint64_t val1, uint64_t val2);
    506 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill2(uint64_t val1, uint64_t val2);
    507 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill2(uint64_t val1, uint64_t val2);
    508 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill2(uint64_t val1, uint64_t val2);
    509 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dslli(bitblock256_t arg1, bitblock256_t arg2);
    510 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dslli(bitblock256_t arg1, bitblock256_t arg2);
    511 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dslli(bitblock256_t arg1, bitblock256_t arg2);
    512 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dslli(bitblock256_t arg1, bitblock256_t arg2);
    513 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dslli(bitblock256_t arg1, bitblock256_t arg2);
    514 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2);
    515 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2);
    516 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2);
    517 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1);
    518 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::slli(bitblock256_t arg1);
    519 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::slli(bitblock256_t arg1);
    520 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::slli(bitblock256_t arg1);
    521 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::slli(bitblock256_t arg1);
    522 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1);
    523 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1);
    524 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1);
    525 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    526 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    527 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    528 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    529 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    530 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
     467template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
     468template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
     469template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
     470template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
     471template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
     472template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
     473template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
     474template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
     475template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(FieldType<1>::T val1);
     476template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill(FieldType<2>::T val1);
     477template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill(FieldType<4>::T val1);
     478template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill(FieldType<8>::T val1);
     479template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill(FieldType<16>::T val1);
     480template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill(FieldType<32>::T val1);
     481template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill(FieldType<64>::T val1);
     482template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill(FieldType<128>::T val1);
     483template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::fill(FieldType<256>::T val1);
     484template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1);
     485template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1);
     486template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1);
     487template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1);
     488template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1);
     489template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1);
     490template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1);
     491template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1);
     492template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1);
     493template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::splat(bitblock256_t arg1);
     494template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::splat(bitblock256_t arg1);
     495template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::splat(bitblock256_t arg1);
     496template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::splat(bitblock256_t arg1);
     497template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::splat(bitblock256_t arg1);
     498template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1);
     499template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1);
     500template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16);
     501template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16);
     502template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16);
     503template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16);
     504template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16);
     505template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4);
     506template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4);
     507template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill4(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4);
     508template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill4(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4);
     509template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill4(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4);
     510template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill4(FieldType<32>::T val1, FieldType<32>::T val2, FieldType<32>::T val3, FieldType<32>::T val4);
     511template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill4(FieldType<64>::T val1, FieldType<64>::T val2, FieldType<64>::T val3, FieldType<64>::T val4);
     512template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::srli(bitblock256_t arg1);
     513template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::srli(bitblock256_t arg1);
     514template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::srli(bitblock256_t arg1);
     515template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::srli(bitblock256_t arg1);
     516template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::srli(bitblock256_t arg1);
     517template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1);
     518template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1);
     519template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1);
     520template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill2(FieldType<1>::T val1, FieldType<1>::T val2);
     521template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill2(FieldType<2>::T val1, FieldType<2>::T val2);
     522template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill2(FieldType<4>::T val1, FieldType<4>::T val2);
     523template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill2(FieldType<8>::T val1, FieldType<8>::T val2);
     524template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill2(FieldType<16>::T val1, FieldType<16>::T val2);
     525template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill2(FieldType<32>::T val1, FieldType<32>::T val2);
     526template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill2(FieldType<64>::T val1, FieldType<64>::T val2);
     527template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill2(FieldType<128>::T val1, FieldType<128>::T val2);
     528template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dslli(bitblock256_t arg1, bitblock256_t arg2);
     529template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dslli(bitblock256_t arg1, bitblock256_t arg2);
     530template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dslli(bitblock256_t arg1, bitblock256_t arg2);
     531template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dslli(bitblock256_t arg1, bitblock256_t arg2);
     532template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dslli(bitblock256_t arg1, bitblock256_t arg2);
     533template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2);
     534template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2);
     535template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2);
     536template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1);
     537template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::slli(bitblock256_t arg1);
     538template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::slli(bitblock256_t arg1);
     539template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::slli(bitblock256_t arg1);
     540template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::slli(bitblock256_t arg1);
     541template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1);
     542template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1);
     543template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1);
     544template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill8(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8);
     545template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill8(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8);
     546template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill8(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8);
     547template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill8(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8);
     548template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill8(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8);
     549template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill8(FieldType<32>::T val1, FieldType<32>::T val2, FieldType<32>::T val3, FieldType<32>::T val4, FieldType<32>::T val5, FieldType<32>::T val6, FieldType<32>::T val7, FieldType<32>::T val8);
    531550
    532551//Implementation Part
     
    577596
    578597//The total number of operations is 1.0
     //Bitwise AND of two 256-bit blocks: forwards both arguments to _mm256_and_ps and returns its result.
     598IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2)
     599{
     600        return _mm256_and_ps(arg1, arg2);
     601}
     602
     603//The total number of operations is 1.0
    //Bitwise XOR of two 256-bit blocks: forwards both arguments to _mm256_xor_ps and returns its result.
    579604IDISA_ALWAYS_INLINE bitblock256_t simd_xor(bitblock256_t arg1, bitblock256_t arg2)
    580605{
    581606        return _mm256_xor_ps(arg1, arg2);
    582 }
    583 
    584 //The total number of operations is 1.0
    585 IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2)
    586 {
    587         return _mm256_and_ps(arg1, arg2);
    588607}
    589608
     
    962981}
    963982
     983//The total number of operations is 12.0
     //2-bit fields: f0 is arg1 ANDed with arg1 shifted right by one within each field;
     //the result ORs f0 with f0 shifted left by one (simd256<2>::slli<1>).
     //Presumably this yields an all-ones field exactly when both bits of the field are set -- confirm against slli.
     984template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::all(bitblock256_t arg1)
     985{
     986        bitblock256_t f0 = simd_and(arg1, simd256<2>::srli<1>(arg1));
     987        return simd_or(f0, simd256<2>::slli<1>(f0));
     988}
     989
     990//The total number of operations is 17.0
     //4-bit fields: returns simd256<4>::eq of arg1 against simd256<8>::constant<255>()
     //(presumably a block with every bit set -- confirm against constant<>).
     991template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::all(bitblock256_t arg1)
     992{
     993        return simd256<4>::eq(arg1, simd256<8>::constant<255>());
     994}
     995
     996//The total number of operations is 5.0
     //8-bit fields: returns simd256<8>::eq of arg1 against the 8-bit constant 255 in every field.
     997template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::all(bitblock256_t arg1)
     998{
     999        return simd256<8>::eq(arg1, simd256<8>::constant<255>());
     1000}
     1001
     1002//The total number of operations is 5.0
     //16-bit fields: returns simd256<16>::eq of arg1 against simd256<8>::constant<255>()
     //(presumably a block with every bit set -- confirm against constant<>).
     1003template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::all(bitblock256_t arg1)
     1004{
     1005        return simd256<16>::eq(arg1, simd256<8>::constant<255>());
     1006}
     1007
     1008//The total number of operations is 5.0
     //32-bit fields: returns simd256<32>::eq of arg1 against simd256<8>::constant<255>()
     //(presumably a block with every bit set -- confirm against constant<>).
     1009template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::all(bitblock256_t arg1)
     1010{
     1011        return simd256<32>::eq(arg1, simd256<8>::constant<255>());
     1012}
     1013
     1014//The total number of operations is 5.0
     //64-bit fields: returns simd256<64>::eq of arg1 against simd256<8>::constant<255>()
     //(presumably a block with every bit set -- confirm against constant<>).
     1015template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::all(bitblock256_t arg1)
     1016{
     1017        return simd256<64>::eq(arg1, simd256<8>::constant<255>());
     1018}
     1019
     1020//The total number of operations is 23.6666666667
     //128-bit fields: returns simd256<128>::eq of arg1 against simd256<8>::constant<255>()
     //(presumably a block with every bit set -- confirm against constant<>).
     1021template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::all(bitblock256_t arg1)
     1022{
     1023        return simd256<128>::eq(arg1, simd256<8>::constant<255>());
     1024}
     1025
     1026//The total number of operations is 2.0
     //Whole-block case: when bitblock256::all(arg1) is true, returns simd256<8>::constant<255>();
     //otherwise returns simd256<8>::constant<0>().
     1027template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::all(bitblock256_t arg1)
     1028{
     1029        return ((bitblock256::all(arg1)) ? simd256<8>::constant<255>() : simd256<8>::constant<0>());
     1030}
     1031
     1032//The total number of operations is 5.0
     //Right shift by the compile-time amount sh in 2-bit fields: performs the 32-bit field shift,
     //then ANDs with the per-field constant (3 >> sh) so bits that crossed in from the
     //neighbouring 2-bit field are cleared.
     1033template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srli(bitblock256_t arg1)
     1034{
     1035        return simd_and(simd256<32>::srli<sh>(arg1), simd256<2>::constant<((3)>>sh)>());
     1036}
     1037
     1038//The total number of operations is 5.0
     //Right shift by the compile-time amount sh in 4-bit fields: performs the 32-bit field shift,
     //then ANDs with the per-field constant (15 >> sh) so bits that crossed in from the
     //neighbouring 4-bit field are cleared.
     1039template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srli(bitblock256_t arg1)
     1040{
     1041        return simd_and(simd256<32>::srli<sh>(arg1), simd256<4>::constant<((15)>>sh)>());
     1042}
     1043
     1044//The total number of operations is 5.0
     //Right shift by the compile-time amount sh in 8-bit fields: performs the 32-bit field shift,
     //then ANDs with the per-field constant (255 >> sh) so bits that crossed in from the
     //neighbouring byte are cleared.
     1045template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srli(bitblock256_t arg1)
     1046{
     1047        return simd_and(simd256<32>::srli<sh>(arg1), simd256<8>::constant<((255)>>sh)>());
     1048}
     1049
     1050//The total number of operations is 4.0
     //Right shift by sh in 16-bit fields: applies the 128-bit intrinsic _mm_srli_epi16 separately to the
     //halves obtained from avx_select_hi128 and avx_select_lo128, then joins them with
     //avx_general_combine256 (high-half result passed first).
     1051template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srli(bitblock256_t arg1)
     1052{
     1053        return avx_general_combine256(_mm_srli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
     1054}
     1055
     1056//The total number of operations is 4.0
     //Right shift by sh in 32-bit fields: applies the 128-bit intrinsic _mm_srli_epi32 separately to the
     //halves obtained from avx_select_hi128 and avx_select_lo128, then joins them with
     //avx_general_combine256 (high-half result passed first).
     1057template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srli(bitblock256_t arg1)
     1058{
     1059        return avx_general_combine256(_mm_srli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
     1060}
     1061
     1062//The total number of operations is 4.0
     //Right shift by sh in 64-bit fields: applies the 128-bit intrinsic _mm_srli_epi64 separately to the
     //halves obtained from avx_select_hi128 and avx_select_lo128, then joins them with
     //avx_general_combine256 (high-half result passed first).
     1063template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srli(bitblock256_t arg1)
     1064{
     1065        return avx_general_combine256(_mm_srli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi64(avx_select_lo128(arg1), (int32_t)(sh)));
     1066}
     1067
     1068//The total number of operations is 8.33333333333
     //Right shift by sh in 128-bit fields, chosen by three cases on the compile-time shift:
     //  - sh is a multiple of 8: a pure byte shift, avx_byte_shift_right(arg1, sh/8);
     //  - sh >= 64: byte-shift right by 8 bytes, then 64-bit srli by (sh & 63);
     //  - otherwise: OR of the 64-bit srli by sh with the bits carried down from the upper
     //    64-bit half (64-bit slli by (128-sh)&63, then byte-shifted right by 8 bytes).
     //NOTE(review): presumably avx_byte_shift_right operates within each 128-bit lane -- confirm.
     1069template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srli(bitblock256_t arg1)
     1070{
     1071        return (((sh%8) == 0) ? avx_byte_shift_right(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::srli<(sh&63)>(avx_byte_shift_right(arg1, 8)) : simd_or(simd256<64>::srli<sh>(arg1), avx_byte_shift_right(simd256<64>::slli<((128-sh)&63)>(arg1), 8))));
     1072}
     1073
     1074//The total number of operations is 14.5
     //Right shift of the whole 256-bit block by the compile-time amount sh:
     //  - sh < 128: OR of the per-128-bit srli by sh with the high 128-bit half (widened via
     //    _mm256_castsi128_si256) shifted left by (128-sh) in 128-bit fields;
     //  - otherwise: the high half is moved to the low half (avx_move_hi128_to_lo128) and
     //    shifted right by (sh-128) in 128-bit fields.
     //NOTE(review): _mm256_castsi128_si256 leaves the upper 128 bits of its result undefined;
     //confirm that the upper half of the OR operand in the sh < 128 branch is well-defined.
     1075template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srli(bitblock256_t arg1)
     1076{
     1077        return ((sh < 128) ? simd_or(simd256<128>::srli<sh>(arg1), simd256<128>::slli<(128-sh)>(((bitblock256_t)(_mm256_castsi128_si256(avx_select_hi128(arg1)))))) : simd256<128>::srli<(sh-128)>(avx_move_hi128_to_lo128(arg1)));
     1078}
     1079
     1080//The total number of operations is 1.0
     //1-bit fields: returns simd_not(arg1). A single-bit field has one trailing zero when the bit is 0
     //and none when it is 1 (presumably simd_not is the bitwise complement).
     1081template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ctz(bitblock256_t arg1)
     1082{
     1083        return simd_not(arg1);
     1084}
     1085
     1086//The total number of operations is 27.0
     //2-bit fields: subtracts the constant 1 from every field, combines the difference with arg1 via
     //simd_andc, and returns the per-field popcount of that value.
     //Presumably simd_andc(a, b) is a & ~b, making this the usual popcount((x-1) & ~x) trailing-zero count -- confirm.
     1087template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ctz(bitblock256_t arg1)
     1088{
     1089        return simd256<2>::popcount(simd_andc(simd256<2>::sub(arg1, simd256<2>::constant<1>()), arg1));
     1090}
     1091
     1092//The total number of operations is 36.0
     //4-bit fields: subtracts the constant 1 from every field, combines the difference with arg1 via
     //simd_andc, and returns the per-field popcount of that value.
     //Presumably simd_andc(a, b) is a & ~b, making this the usual popcount((x-1) & ~x) trailing-zero count -- confirm.
     1093template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ctz(bitblock256_t arg1)
     1094{
     1095        return simd256<4>::popcount(simd_andc(simd256<4>::sub(arg1, simd256<4>::constant<1>()), arg1));
     1096}
     1097
     1098//The total number of operations is 38.0
     //8-bit fields: subtracts the constant 1 from every field, combines the difference with arg1 via
     //simd_andc, and returns the per-field popcount of that value.
     //Presumably simd_andc(a, b) is a & ~b, making this the usual popcount((x-1) & ~x) trailing-zero count -- confirm.
     1099template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ctz(bitblock256_t arg1)
     1100{
     1101        return simd256<8>::popcount(simd_andc(simd256<8>::sub(arg1, simd256<8>::constant<1>()), arg1));
     1102}
     1103
     1104//The total number of operations is 48.0
     //16-bit fields: subtracts the constant 1 from every field, combines the difference with arg1 via
     //simd_andc, and returns the per-field popcount of that value.
     //Presumably simd_andc(a, b) is a & ~b, making this the usual popcount((x-1) & ~x) trailing-zero count -- confirm.
     1105template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ctz(bitblock256_t arg1)
     1106{
     1107        return simd256<16>::popcount(simd_andc(simd256<16>::sub(arg1, simd256<16>::constant<1>()), arg1));
     1108}
     1109
     1110//The total number of operations is 58.0
     //32-bit fields: subtracts the constant 1 from every field, combines the difference with arg1 via
     //simd_andc, and returns the per-field popcount of that value.
     //Presumably simd_andc(a, b) is a & ~b, making this the usual popcount((x-1) & ~x) trailing-zero count -- confirm.
     1111template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ctz(bitblock256_t arg1)
     1112{
     1113        return simd256<32>::popcount(simd_andc(simd256<32>::sub(arg1, simd256<32>::constant<1>()), arg1));
     1114}
     1115
     1116//The total number of operations is 44.0
     //64-bit fields: subtracts the constant 1 from every field, combines the difference with arg1 via
     //simd_andc, and returns the per-field popcount of that value.
     //Presumably simd_andc(a, b) is a & ~b, making this the usual popcount((x-1) & ~x) trailing-zero count -- confirm.
     1117template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ctz(bitblock256_t arg1)
     1118{
     1119        return simd256<64>::popcount(simd_andc(simd256<64>::sub(arg1, simd256<64>::constant<1>()), arg1));
     1120}
     1121
     1122//The total number of operations is 101.0
     //128-bit fields: subtracts the constant 1 from every field, combines the difference with arg1 via
     //simd_andc, and returns the per-field popcount of that value.
     //Presumably simd_andc(a, b) is a & ~b, making this the usual popcount((x-1) & ~x) trailing-zero count -- confirm.
     1123template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ctz(bitblock256_t arg1)
     1124{
     1125        return simd256<128>::popcount(simd_andc(simd256<128>::sub(arg1, simd256<128>::constant<1>()), arg1));
     1126}
     1127
     1128//The total number of operations is 192.166666667
     //Whole-block case: subtracts the constant 1 from the 256-bit value, combines the difference with
     //arg1 via simd_andc, and returns the 256-bit popcount of that value.
     //Presumably simd_andc(a, b) is a & ~b, making this the usual popcount((x-1) & ~x) trailing-zero count -- confirm.
     1129template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ctz(bitblock256_t arg1)
     1130{
     1131        return simd256<256>::popcount(simd_andc(simd256<256>::sub(arg1, simd256<256>::constant<1>()), arg1));
     1132}
     1133
     1134//The total number of operations is 1.0
     //1-bit fields: returns simd_andc(arg1, arg2). For single bits, "greater than" holds only when
     //arg1 is 1 and arg2 is 0 (presumably simd_andc(a, b) is a & ~b -- confirm).
     1135template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1136{
     1137        return simd_andc(arg1, arg2);
     1138}
     1139
     1140//The total number of operations is 23.0
     //2-bit fields, built from the 1-bit comparison:
     //  tmpAns - per-bit ugt result;
     //  mask   - tmpAns ANDed with the per-bit eq result shifted right by one in each 2-bit field,
     //           then widened by ORing in its own left shift by one;
     //  result - arithmetic right shift of tmpAns by one (simd256<2>::srai<1>) ORed with mask.
     //Presumably: the field is greater when the high bit wins, or the high bits tie and the low bit wins -- confirm.
     1141template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1142{
     1143        bitblock256_t tmpAns = simd256<(1)>::ugt(arg1, arg2);
     1144        bitblock256_t mask = simd_and(tmpAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
     1145        mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
     1146        return simd_or(simd256<2>::srai<(1)>(tmpAns), mask);
     1147}
     1148
     1149//The total number of operations is 20.0
     //4-bit fields, built from two 8-bit comparisons merged by simd256<1>::ifh with simd256<8>::himask():
     //  - first candidate: 8-bit ugt of (arg1 AND himask) against the unmasked arg2;
     //  - second candidate: 8-bit ugt of arg1 and arg2 with the himask bits cleared from both.
     //Presumably himask marks the upper 4 bits of each byte and ifh takes the first candidate
     //where the mask is set -- confirm against their definitions.
     1150template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1151{
     1152        return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::ugt(simd_and(simd256<(8)>::himask(), arg1), arg2), simd256<(8)>::ugt(simd_andc(arg1, simd256<(8)>::himask()), simd_andc(arg2, simd256<(8)>::himask())));
     1153}
     1154
     1155//The total number of operations is 7.0
     1156template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1157{
     1158        bitblock256_t high_bit = simd256<8>::constant<(128)>();
     1159        return simd256<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1160}
     1161
     1162//The total number of operations is 7.0
     1163template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1164{
     1165        bitblock256_t high_bit = simd256<16>::constant<(32768)>();
     1166        return simd256<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1167}
     1168
     1169//The total number of operations is 7.0
     1170template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1171{
     1172        bitblock256_t high_bit = simd256<32>::constant<(2147483648ULL)>();
     1173        return simd256<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1174}
     1175
     1176//The total number of operations is 7.0
     1177template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1178{
     1179        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     1180        return simd256<64>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1181}
     1182
     1183//The total number of operations is 60.0
     1184template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1185{
     1186        bitblock256_t tmpAns = simd256<(64)>::ugt(arg1, arg2);
     1187        bitblock256_t mask = simd_and(tmpAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
     1188        mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
     1189        return simd_or(simd256<128>::srai<(64)>(tmpAns), mask);
     1190}
     1191
     1192//The total number of operations is 174.166666667
     1193template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1194{
     1195        bitblock256_t tmpAns = simd256<(128)>::ugt(arg1, arg2);
     1196        bitblock256_t mask = simd_and(tmpAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
     1197        mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
     1198        return simd_or(simd256<256>::srai<(128)>(tmpAns), mask);
     1199}
     1200
     1201//The total number of operations is 7.0
     1202template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::xor_hl(bitblock256_t arg1)
     1203{
     1204        return simd_xor(simd256<2>::srli<(1)>(arg1), simd_and(arg1, simd256<2>::lomask()));
     1205}
     1206
     1207//The total number of operations is 7.0
     1208template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::xor_hl(bitblock256_t arg1)
     1209{
     1210        return simd_xor(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));
     1211}
     1212
     1213//The total number of operations is 7.0
     1214template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::xor_hl(bitblock256_t arg1)
     1215{
     1216        return simd_xor(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));
     1217}
     1218
     1219//The total number of operations is 6.0
     1220template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::xor_hl(bitblock256_t arg1)
     1221{
     1222        return simd_xor(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));
     1223}
     1224
     1225//The total number of operations is 6.0
     1226template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::xor_hl(bitblock256_t arg1)
     1227{
     1228        return simd_xor(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));
     1229}
     1230
     1231//The total number of operations is 6.0
     1232template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::xor_hl(bitblock256_t arg1)
     1233{
     1234        return simd_xor(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));
     1235}
     1236
     1237//The total number of operations is 10.3333333333
     1238template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::xor_hl(bitblock256_t arg1)
     1239{
     1240        return simd_xor(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));
     1241}
     1242
     1243//The total number of operations is 16.5
     1244template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::xor_hl(bitblock256_t arg1)
     1245{
     1246        return simd_xor(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
     1247}
     1248
     1249//The total number of operations is 0
     1250template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::popcount(bitblock256_t arg1)
     1251{
     1252        return arg1;
     1253}
     1254
     1255//The total number of operations is 10.0
     1256template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::popcount(bitblock256_t arg1)
     1257{
     1258        return simd256<2>::add_hl(simd256<(1)>::popcount(arg1));
     1259}
     1260
     1261//The total number of operations is 21.0
     1262template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::popcount(bitblock256_t arg1)
     1263{
     1264        return simd256<4>::add_hl(simd256<(2)>::popcount(arg1));
     1265}
     1266
     1267//The total number of operations is 32.0
     1268template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::popcount(bitblock256_t arg1)
     1269{
     1270        return simd256<8>::add_hl(simd256<(4)>::popcount(arg1));
     1271}
     1272
     1273//The total number of operations is 42.0
     1274template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::popcount(bitblock256_t arg1)
     1275{
     1276        return simd256<16>::add_hl(simd256<(8)>::popcount(arg1));
     1277}
     1278
     1279//The total number of operations is 52.0
     1280template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::popcount(bitblock256_t arg1)
     1281{
     1282        return simd256<32>::add_hl(simd256<(16)>::popcount(arg1));
     1283}
     1284
     1285//The total number of operations is 38.0
     1286template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::popcount(bitblock256_t arg1)
     1287{
     1288        bitblock256_t tmpAns = simd256<8>::popcount(arg1);
     1289        return avx_general_combine256(_mm_sad_epu8(avx_select_hi128(tmpAns), _mm_set1_epi32((int32_t)(0))), _mm_sad_epu8(avx_select_lo128(tmpAns), _mm_set1_epi32((int32_t)(0))));
     1290}
     1291
     1292//The total number of operations is 73.6666666667
     1293template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1)
     1294{
     1295        return simd256<128>::add_hl(simd256<(64)>::popcount(arg1));
     1296}
     1297
     1298//The total number of operations is 115.5
     1299template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1)
     1300{
     1301        bitblock256_t tmpAns = simd256<(128)>::popcount(arg1);
     1302        return simd256<(128)>::add(simd_and(tmpAns, simd256<256>::lomask()), simd256<256>::srli<(128)>(tmpAns));
     1303}
     1304
     1305//The total number of operations is 14.0
     1306template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::any(bitblock256_t arg1)
     1307{
     1308        bitblock256_t t0 = simd256<2>::srli<1>(arg1);
     1309        bitblock256_t f0 = simd_or(t0, simd_and(arg1, simd_xor(t0, simd256<8>::constant<255>())));
     1310        return simd_or(f0, simd256<2>::slli<1>(f0));
     1311}
     1312
     1313//The total number of operations is 20.0
     1314template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::any(bitblock256_t arg1)
     1315{
     1316        return simd256<4>::ugt(arg1, simd256<8>::constant<0>());
     1317}
     1318
     1319//The total number of operations is 7.0
     1320template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::any(bitblock256_t arg1)
     1321{
     1322        return simd256<8>::ugt(arg1, simd256<8>::constant<0>());
     1323}
     1324
     1325//The total number of operations is 7.0
     1326template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::any(bitblock256_t arg1)
     1327{
     1328        return simd256<16>::ugt(arg1, simd256<8>::constant<0>());
     1329}
     1330
     1331//The total number of operations is 7.0
     1332template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::any(bitblock256_t arg1)
     1333{
     1334        return simd256<32>::ugt(arg1, simd256<8>::constant<0>());
     1335}
     1336
     1337//The total number of operations is 7.0
     1338template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::any(bitblock256_t arg1)
     1339{
     1340        return simd256<64>::ugt(arg1, simd256<8>::constant<0>());
     1341}
     1342
     1343//The total number of operations is 60.0
     1344template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::any(bitblock256_t arg1)
     1345{
     1346        return simd256<128>::ugt(arg1, simd256<8>::constant<0>());
     1347}
     1348
     1349//The total number of operations is 1.0
     1350template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::any(bitblock256_t arg1)
     1351{
     1352        return ((bitblock256::any(arg1)) ? simd256<8>::constant<255>() : simd256<8>::constant<0>());
     1353}
     1354
     1355//The total number of operations is 16.0
     1356template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1)
     1357{
     1358        return simd256<2>::sub(simd256<2>::constant<0>(), arg1);
     1359}
     1360
     1361//The total number of operations is 14.0
     1362template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1)
     1363{
     1364        return simd256<4>::sub(simd256<4>::constant<0>(), arg1);
     1365}
     1366
     1367//The total number of operations is 5.0
     1368template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::neg(bitblock256_t arg1)
     1369{
     1370        return simd256<8>::sub(simd256<8>::constant<0>(), arg1);
     1371}
     1372
     1373//The total number of operations is 5.0
     1374template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::neg(bitblock256_t arg1)
     1375{
     1376        return simd256<16>::sub(simd256<16>::constant<0>(), arg1);
     1377}
     1378
     1379//The total number of operations is 5.0
     1380template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::neg(bitblock256_t arg1)
     1381{
     1382        return simd256<32>::sub(simd256<32>::constant<0>(), arg1);
     1383}
     1384
     1385//The total number of operations is 5.0
     1386template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::neg(bitblock256_t arg1)
     1387{
     1388        return simd256<64>::sub(simd256<64>::constant<0>(), arg1);
     1389}
     1390
     1391//The total number of operations is 26.3333333333
     1392template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1)
     1393{
     1394        return simd256<128>::sub(simd256<128>::constant<0>(), arg1);
     1395}
     1396
     1397//The total number of operations is 75.6666666667
     1398template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1)
     1399{
     1400        return simd256<256>::sub(simd256<256>::constant<0>(), arg1);
     1401}
     1402
     1403//The total number of operations is 5.0
     1404template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1)
     1405{
     1406        return simd_and(simd256<32>::slli<sh>(arg1), simd256<2>::constant<(((3)<<sh)&(3))>());
     1407}
     1408
     1409//The total number of operations is 5.0
     1410template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1)
     1411{
     1412        return simd_and(simd256<32>::slli<sh>(arg1), simd256<4>::constant<(((15)<<sh)&(15))>());
     1413}
     1414
     1415//The total number of operations is 5.0
     1416template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1)
     1417{
     1418        return simd_and(simd256<32>::slli<sh>(arg1), simd256<8>::constant<(((255)<<sh)&(255))>());
     1419}
     1420
     1421//The total number of operations is 4.0
     1422template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1)
     1423{
     1424        return avx_general_combine256(_mm_slli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
     1425}
     1426
     1427//The total number of operations is 4.0
     1428template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1)
     1429{
     1430        return avx_general_combine256(_mm_slli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
     1431}
     1432
     1433//The total number of operations is 4.0
     1434template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1)
     1435{
     1436        return avx_general_combine256(_mm_slli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi64(avx_select_lo128(arg1), (int32_t)(sh)));
     1437}
     1438
     1439//The total number of operations is 8.33333333333
     1440template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1)
     1441{
     1442        return (((sh%8) == 0) ? avx_byte_shift_left(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::slli<(sh&63)>(avx_byte_shift_left(arg1, 8)) : simd_or(simd256<64>::slli<sh>(arg1), avx_byte_shift_left(simd256<64>::srli<((128-sh)&63)>(arg1), 8))));
     1443}
     1444
     1445//The total number of operations is 14.0
     1446template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1)
     1447{
     1448        return ((sh < 128) ? simd_or(simd256<128>::slli<sh>(arg1), avx_move_lo128_to_hi128(simd256<128>::srli<(128-sh)>(arg1))) : simd256<128>::slli<(sh-128)>(avx_move_lo128_to_hi128(arg1)));
     1449}
     1450
     1451//The total number of operations is 3.0
     1452template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1453{
     1454        return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
     1455}
     1456
     1457//The total number of operations is 11.0
     1458template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1459{
     1460        return simd256<(1)>::ifh(simd256<1>::ifh(simd256<2>::himask(), arg1, simd256<2>::srli<(1)>(arg1)), arg2, arg3);
     1461}
     1462
     1463//The total number of operations is 19.0
     1464template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1465{
     1466        return simd256<(2)>::ifh(simd256<1>::ifh(simd256<4>::himask(), arg1, simd256<4>::srli<(2)>(arg1)), arg2, arg3);
     1467}
     1468
     1469//The total number of operations is 8.0
     1470template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1471{
     1472        return simd256<1>::ifh(simd256<8>::gt(simd256<8>::constant<0>(), arg1), arg2, arg3);
     1473}
     1474
     1475//The total number of operations is 8.0
     1476template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1477{
     1478        return simd256<1>::ifh(simd256<16>::gt(simd256<16>::constant<0>(), arg1), arg2, arg3);
     1479}
     1480
     1481//The total number of operations is 8.0
     1482template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1483{
     1484        return simd256<1>::ifh(simd256<32>::gt(simd256<32>::constant<0>(), arg1), arg2, arg3);
     1485}
     1486
     1487//The total number of operations is 1.0
     1488template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1489{
     1490        return (bitblock256_t)_mm256_blendv_pd((__m256d)(arg3), (__m256d)(arg2), (__m256d)(arg1));
     1491}
     1492
     1493//The total number of operations is 12.3333333333
     1494template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1495{
     1496        return simd256<(64)>::ifh(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg1)), arg2, arg3);
     1497}
     1498
     1499//The total number of operations is 29.8333333333
     1500template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1501{
     1502        return simd256<(128)>::ifh(simd256<1>::ifh(simd256<256>::himask(), arg1, simd256<256>::srli<(128)>(arg1)), arg2, arg3);
     1503}
     1504
     1505//The total number of operations is 1.0
     1506template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2)
     1507{
     1508        return simd_xor(arg1, arg2);
     1509}
     1510
     1511//The total number of operations is 16.0
     1512template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::sub(bitblock256_t arg1, bitblock256_t arg2)
     1513{
     1514        bitblock256_t ans = simd256<(1)>::sub(arg1, arg2);
     1515        bitblock256_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_and(simd_not(simd_xor(arg1, arg2)), ans));
     1516        bitblock256_t loMask = simd256<2>::lomask();
     1517        bitblock256_t borrow = simd256<2>::slli<1>(simd_and(borrowMask, loMask));
     1518        return simd256<1>::ifh(loMask, ans, simd256<(1)>::sub(ans, borrow));
     1519}
     1520
     1521//The total number of operations is 14.0
     1522template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::sub(bitblock256_t arg1, bitblock256_t arg2)
     1523{
     1524        return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::sub(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::sub(arg1, arg2));
     1525}
     1526
     1527//The total number of operations is 5.0
     1528template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::sub(bitblock256_t arg1, bitblock256_t arg2)
     1529{
     1530        return avx_general_combine256(_mm_sub_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1531}
     1532
     1533//The total number of operations is 5.0
     1534template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sub(bitblock256_t arg1, bitblock256_t arg2)
     1535{
     1536        return avx_general_combine256(_mm_sub_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1537}
     1538
     1539//The total number of operations is 5.0
     1540template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sub(bitblock256_t arg1, bitblock256_t arg2)
     1541{
     1542        return avx_general_combine256(_mm_sub_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1543}
     1544
     1545//The total number of operations is 5.0
     1546template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sub(bitblock256_t arg1, bitblock256_t arg2)
     1547{
     1548        return avx_general_combine256(_mm_sub_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1549}
     1550
     1551//The total number of operations is 26.3333333333
     1552template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::sub(bitblock256_t arg1, bitblock256_t arg2)
     1553{
     1554        bitblock256_t partial = simd256<(64)>::sub(arg1, arg2);
     1555        bitblock256_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_andc(partial, simd_xor(arg1, arg2)));
     1556        bitblock256_t borrow = simd256<128>::slli<(64)>(simd256<(64)>::srli<(63)>(borrowMask));
     1557        return simd256<(64)>::sub(partial, borrow);
     1558}
     1559
     1560//The total number of operations is 75.6666666667
     1561template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sub(bitblock256_t arg1, bitblock256_t arg2)
     1562{
     1563        bitblock256_t ans = simd256<(128)>::sub(arg1, arg2);
     1564        bitblock256_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_and(simd_not(simd_xor(arg1, arg2)), ans));
     1565        bitblock256_t loMask = simd256<256>::lomask();
     1566        bitblock256_t borrow = simd256<256>::slli<1>(simd_and(borrowMask, loMask));
     1567        return simd256<1>::ifh(loMask, ans, simd256<(128)>::sub(ans, borrow));
     1568}
     1569
     1570//The total number of operations is 10.0
     1571template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1)
     1572{
     1573        return simd256<16>::sub(arg1, simd_and(simd256<2>::lomask(), simd256<16>::srli<1>(arg1)));
     1574}
     1575
     1576//The total number of operations is 11.0
     1577template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add_hl(bitblock256_t arg1)
     1578{
     1579        return simd256<(8)>::add(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));
     1580}
     1581
     1582//The total number of operations is 11.0
     1583template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add_hl(bitblock256_t arg1)
     1584{
     1585        return simd256<(16)>::add(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));
     1586}
     1587
     1588//The total number of operations is 10.0
     1589template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add_hl(bitblock256_t arg1)
     1590{
     1591        return simd256<(32)>::add(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));
     1592}
     1593
     1594//The total number of operations is 10.0
     1595template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add_hl(bitblock256_t arg1)
     1596{
     1597        return simd256<(64)>::add(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));
     1598}
     1599
     1600//The total number of operations is 10.0
     1601template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add_hl(bitblock256_t arg1)
     1602{
     1603        return simd256<64>::add(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));
     1604}
     1605
     1606//The total number of operations is 35.6666666667
     1607template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1)
     1608{
     1609        return simd256<128>::add(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));
     1610}
     1611
     1612//The total number of operations is 91.1666666667
     1613template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1)
     1614{
     1615        return simd256<256>::add(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
     1616}
     1617
     1618//The total number of operations is 0
     1619template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant()
     1620{
     1621        return simd256<32>::constant<(-1*val)>();
     1622}
     1623
     1624//The total number of operations is 0
     1625template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant()
     1626{
     1627        return ((val < 0) ? simd256<(4)>::constant<((val<<2)|(val^(-4)))>() : simd256<(4)>::constant<((val<<2)|val)>());
     1628}
     1629
     1630//The total number of operations is 0
     1631template <> template <FieldType<4>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::constant()
     1632{
     1633        return ((val < 0) ? simd256<(8)>::constant<((val<<4)|(val^(-16)))>() : simd256<(8)>::constant<((val<<4)|val)>());
     1634}
     1635
     1636//The total number of operations is 0
     1637template <> template <FieldType<8>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::constant()
     1638{
     1639        return (bitblock256_t)_mm256_set1_epi8((int32_t)(val));
     1640}
     1641
     1642//The total number of operations is 0
     1643template <> template <FieldType<16>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::constant()
     1644{
     1645        return (bitblock256_t)_mm256_set1_epi16((int32_t)(val));
     1646}
     1647
     1648//The total number of operations is 0
     1649template <> template <FieldType<32>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::constant()
     1650{
     1651        return (bitblock256_t)_mm256_set1_epi32((int32_t)(val));
     1652}
     1653
     1654//The total number of operations is 0
     1655template <> template <FieldType<64>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::constant()
     1656{
     1657        return ((bitblock256_t)(_mm256_set_epi32((int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val))));
     1658}
     1659
     1660//The total number of operations is 0
     1661template <> template <FieldType<128>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::constant()
     1662{
     1663        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val), (int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val))));
     1664}
     1665
     1666//The total number of operations is 0
     1667template <> template <FieldType<256>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::constant()
     1668{
     1669        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val))));
     1670}
     1671
     1672//The total number of operations is 1.0
     1673template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::min(bitblock256_t arg1, bitblock256_t arg2)
     1674{
     1675        return simd_or(arg1, arg2);
     1676}
     1677
     1678//The total number of operations is 25.0
     1679template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::min(bitblock256_t arg1, bitblock256_t arg2)
     1680{
     1681        bitblock256_t hiAns = simd256<(1)>::min(arg1, arg2);
     1682        bitblock256_t loAns = simd256<(1)>::umin(arg1, arg2);
     1683        bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(hiAns, arg1));
     1684        bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(hiAns, arg2));
     1685        return simd256<1>::ifh(simd256<2>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2));
     1686}
     1687
     1688//The total number of operations is 17.0
     1689template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::min(bitblock256_t arg1, bitblock256_t arg2)
     1690{
     1691        bitblock256_t high_bit = simd256<4>::constant<(8)>();
     1692        return simd_xor(simd256<4>::umin(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1693}
     1694
     1695//The total number of operations is 5.0
     1696template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::min(bitblock256_t arg1, bitblock256_t arg2)
     1697{
     1698        return avx_general_combine256(_mm_min_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1699}
     1700
     1701//The total number of operations is 5.0
     1702template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::min(bitblock256_t arg1, bitblock256_t arg2)
     1703{
     1704        return avx_general_combine256(_mm_min_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1705}
     1706
     1707//The total number of operations is 5.0
     1708template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::min(bitblock256_t arg1, bitblock256_t arg2)
     1709{
     1710        return avx_general_combine256(_mm_min_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1711}
     1712
     1713//The total number of operations is 8.0
     1714template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::min(bitblock256_t arg1, bitblock256_t arg2)
     1715{
     1716        return simd256<1>::ifh(simd256<64>::gt(arg1, arg2), arg2, arg1);
     1717}
     1718
     1719//The total number of operations is 54.6666666667
     1720template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::min(bitblock256_t arg1, bitblock256_t arg2)
     1721{
     1722        bitblock256_t hiAns = simd256<(64)>::min(arg1, arg2);
     1723        bitblock256_t loAns = simd256<(64)>::umin(arg1, arg2);
     1724        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(hiAns, arg1));
     1725        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(hiAns, arg2));
     1726        return simd256<1>::ifh(simd256<128>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2));
     1727}
     1728
     1729//The total number of operations is 186.666666667
     1730template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::min(bitblock256_t arg1, bitblock256_t arg2)
     1731{
     1732        bitblock256_t hiAns = simd256<(128)>::min(arg1, arg2);
     1733        bitblock256_t loAns = simd256<(128)>::umin(arg1, arg2);
     1734        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(hiAns, arg1));
     1735        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(hiAns, arg2));
     1736        return simd256<1>::ifh(simd256<256>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2));
     1737}
     1738
     1739//The total number of operations is 0
     1740template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
     1741{
     1742        return simd256<2>::constant<(1)>();
     1743}
     1744
     1745//The total number of operations is 0
     1746template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
     1747{
     1748        return simd256<4>::constant<(3)>();
     1749}
     1750
     1751//The total number of operations is 0
     1752template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
     1753{
     1754        return simd256<8>::constant<(15)>();
     1755}
     1756
     1757//The total number of operations is 0
     1758template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
     1759{
     1760        return simd256<16>::constant<(255)>();
     1761}
     1762
     1763//The total number of operations is 0
     1764template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
     1765{
     1766        return simd256<32>::constant<(65535)>();
     1767}
     1768
     1769//The total number of operations is 0
     1770template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
     1771{
     1772        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1))));
     1773}
     1774
     1775//The total number of operations is 0
         //Low-half mask for 128-bit fields. _mm256_set_epi32 lists its 32-bit elements from most to least
         //significant, so the (0, 0, -1, -1) pattern sets the low 64 bits of each 128-bit field.
     1776template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
     1777{
     1778        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1))));
     1779}
     1780
     1781//The total number of operations is 0
         //Low-half mask for the single 256-bit field. _mm256_set_epi32 lists its 32-bit elements from most to
         //least significant, so the four leading zeros and four trailing -1s set exactly the low 128 bits.
     1782template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
     1783{
     1784        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1))));
     1785}
     1786
     1787//The total number of operations is 1.0
         //1-bit add: addition modulo 2 of single bits is XOR, so this is simd_xor of the two operands.
     1788template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::add(bitblock256_t arg1, bitblock256_t arg2)
     1789{
     1790        return simd_xor(arg1, arg2);
     1791}
     1792
     1793//The total number of operations is 16.0
         //2-bit add built from the 1-bit add. ans is the carry-less per-bit sum; carryMask marks bit positions
         //that generate a carry; only the carry out of each field's low bit is kept (AND loMask) and moved up
         //one position with the 2-bit slli. The final ifh takes ans at the loMask positions and the
         //carry-adjusted sum elsewhere (assumes ifh(m, a, b) yields a where m is set, b otherwise - TODO confirm).
     1794template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add(bitblock256_t arg1, bitblock256_t arg2)
     1795{
     1796        bitblock256_t ans = simd256<(1)>::add(arg1, arg2);
     1797        bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_and(simd_xor(arg1, arg2), simd_not(ans)));
     1798        bitblock256_t loMask = simd256<2>::lomask();
     1799        bitblock256_t carry = simd256<2>::slli<1>(simd_and(carryMask, loMask));
     1800        return simd256<1>::ifh(loMask, ans, simd256<(1)>::add(ans, carry));
     1801}
     1802
     1803//The total number of operations is 14.0
         //4-bit add via two 8-bit adds. For the high nibble, arg2 is first ANDed with the 8-bit himask so its
         //low nibble is zero and no carry from the low nibbles can reach the high nibble. The low nibble comes
         //from the plain 8-bit sum. ifh with the 8-bit himask chooses which sum supplies each nibble
         //(assumes ifh(m, a, b) yields a where m is set, b otherwise - TODO confirm).
     1804template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add(bitblock256_t arg1, bitblock256_t arg2)
     1805{
     1806        return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::add(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::add(arg1, arg2));
     1807}
     1808
     1809//The total number of operations is 5.0
         //8-bit add: applies _mm_add_epi8 (packed 8-bit wrap-around add) separately to the halves returned by
         //avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
     1810template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add(bitblock256_t arg1, bitblock256_t arg2)
     1811{
     1812        return avx_general_combine256(_mm_add_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1813}
     1814
     1815//The total number of operations is 5.0
         //16-bit add: applies _mm_add_epi16 (packed 16-bit wrap-around add) separately to the halves returned
         //by avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
     1816template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add(bitblock256_t arg1, bitblock256_t arg2)
     1817{
     1818        return avx_general_combine256(_mm_add_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1819}
     1820
     1821//The total number of operations is 5.0
         //32-bit add: applies _mm_add_epi32 (packed 32-bit wrap-around add) separately to the halves returned
         //by avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
     1822template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add(bitblock256_t arg1, bitblock256_t arg2)
     1823{
     1824        return avx_general_combine256(_mm_add_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1825}
     1826
     1827//The total number of operations is 5.0
         //64-bit add: applies _mm_add_epi64 (packed 64-bit wrap-around add) separately to the halves returned
         //by avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
     1828template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add(bitblock256_t arg1, bitblock256_t arg2)
     1829{
     1830        return avx_general_combine256(_mm_add_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1831}
     1832
     1833//The total number of operations is 26.3333333333
         //128-bit add from 64-bit adds. partial holds the independent 64-bit sums. The top bit of each 64-bit
         //field of carryMask is the carry out of that field: (a & b) | ((a ^ b) & ~sum). srli<63> moves that bit
         //to bit 0 of its field; the 128-bit slli<64> then moves the low half's carry to the bottom of the high
         //half and shifts the high half's own carry out. A second 64-bit add applies the carry.
     1834template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2)
     1835{
     1836        bitblock256_t partial = simd256<(64)>::add(arg1, arg2);
     1837        bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
     1838        bitblock256_t carry = simd256<128>::slli<(64)>(simd256<(64)>::srli<(63)>(carryMask));
     1839        return simd256<(64)>::add(partial, carry);
     1840}
     1841
     1842//The total number of operations is 75.6666666667
         //256-bit add from 128-bit adds. ans holds the two independent 128-bit sums and carryMask the per-bit
         //carry-out positions. Carry bits of the low 128-bit half are kept (AND loMask) and shifted left by one
         //across the whole block, so the carry out of bit 127 lands on bit 128. The high half of the result is
         //taken from the 128-bit add of ans and carry, the low half from ans. Shifted bits that stay inside
         //the low half only affect the low 128-bit sum, which the final ifh does not use
         //(assumes ifh(m, a, b) yields a where m is set, b otherwise - TODO confirm).
     1843template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2)
     1844{
     1845        bitblock256_t ans = simd256<(128)>::add(arg1, arg2);
     1846        bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_and(simd_xor(arg1, arg2), simd_not(ans)));
     1847        bitblock256_t loMask = simd256<256>::lomask();
     1848        bitblock256_t carry = simd256<256>::slli<1>(simd_and(carryMask, loMask));
     1849        return simd256<1>::ifh(loMask, ans, simd256<(128)>::add(ans, carry));
     1850}
     1851
     1852//The total number of operations is 1.0
         //1-bit unsigned minimum: the smaller of two single bits is 1 only when both are 1, i.e. bitwise AND.
     1853template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1854{
     1855        return simd_and(arg1, arg2);
     1856}
     1857
     1858//The total number of operations is 24.0
         //2-bit unsigned minimum. tmpAns is the bitwise AND, whose high bit is already the minimum of the two
         //high bits. eqMask1/eqMask2 hold, in the low bit of each field, whether that high bit equals arg1's /
         //arg2's high bit (per-bit equality shifted down by one). The low bit is then chosen as: tmpAns when
         //both high bits match (tie), arg1 when only arg1 matches, otherwise arg2
         //(assumes ifh(m, a, b) yields a where m is set, b otherwise - TODO confirm).
     1859template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1860{
     1861        bitblock256_t tmpAns = simd256<(1)>::umin(arg1, arg2);
     1862        bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
     1863        bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
     1864        return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1865}
     1866
     1867//The total number of operations is 14.0
         //4-bit unsigned minimum via the 8-bit umin. The high nibble is taken from the 8-bit minimum of the
         //full bytes (byte order is decided by the high nibble first), masked with the 8-bit himask. The low
         //nibble is the 8-bit minimum of both operands with their high nibbles cleared. The parts are ORed.
     1868template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1869{
     1870        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umin(arg1, arg2)), simd256<(8)>::umin(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
     1871}
     1872
     1873//The total number of operations is 5.0
         //8-bit unsigned minimum: applies _mm_min_epu8 separately to the halves returned by avx_select_hi128
         //and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
     1874template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1875{
     1876        return avx_general_combine256(_mm_min_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1877}
     1878
     1879//The total number of operations is 5.0
         //16-bit unsigned minimum: applies _mm_min_epu16 separately to the halves returned by avx_select_hi128
         //and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
     1880template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1881{
     1882        return avx_general_combine256(_mm_min_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1883}
     1884
     1885//The total number of operations is 5.0
         //32-bit unsigned minimum: applies _mm_min_epu32 separately to the halves returned by avx_select_hi128
         //and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
     1886template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1887{
     1888        return avx_general_combine256(_mm_min_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1889}
     1890
     1891//The total number of operations is 11.0
         //64-bit unsigned minimum via simd256<64>::min (presumably the signed minimum - TODO confirm).
         //XOR-ing both operands with 2^63 flips the top bit of each field, which maps unsigned order onto
         //signed order; the same XOR on the result undoes the flip.
     1892template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1893{
     1894        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     1895        return simd_xor(simd256<64>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1896}
     1897
     1898//The total number of operations is 46.6666666667
         //128-bit unsigned minimum from 64-bit pieces. tmpAns holds the 64-bit minima, so its high half is
         //already the minimum of the high halves. eqMask1/eqMask2 carry, in the low 64 bits of each field,
         //whether that high half equals arg1's / arg2's high half. The low half is then chosen as: tmpAns on a
         //tie of the high halves, arg1 when only arg1 matches, otherwise arg2
         //(assumes ifh(m, a, b) yields a where m is set, b otherwise - TODO confirm).
     1899template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1900{
     1901        bitblock256_t tmpAns = simd256<(64)>::umin(arg1, arg2);
     1902        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
     1903        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
     1904        return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1905}
     1906
     1907//The total number of operations is 132.0
         //256-bit unsigned minimum from 128-bit pieces. tmpAns holds the 128-bit minima, so its high half is
         //already the minimum of the high halves. eqMask1/eqMask2 carry, in the low 128 bits, whether that
         //high half equals arg1's / arg2's high half. The low half is then chosen as: tmpAns on a tie of the
         //high halves, arg1 when only arg1 matches, otherwise arg2
         //(assumes ifh(m, a, b) yields a where m is set, b otherwise - TODO confirm).
     1908template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1909{
     1910        bitblock256_t tmpAns = simd256<(128)>::umin(arg1, arg2);
     1911        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
     1912        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
     1913        return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1914}
     1915
     1916//The total number of operations is 19.0
         //2-bit absolute value. The new high bit of each field is hi & ~lo: simd_not(arg1) shifted left by one
         //puts ~lo on the high-bit position. The low bit is copied from arg1. So 11 (-1) becomes 01 and
         //10 (-2) stays 10. The shift is a whole-block 256-bit shift; bits that cross a field boundary land on
         //low-bit positions, which the ifh takes from arg1
         //(assumes ifh(m, a, b) yields a where m is set, b otherwise - TODO confirm).
     1917template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)
     1918{
     1919        return simd256<1>::ifh(simd256<2>::himask(), simd_and(arg1, simd256<256>::slli<1>(simd_not(arg1))), arg1);
     1920}
     1921
     1922//The total number of operations is 39.0
         //4-bit absolute value. gtMask is the result of comparing arg1 > 0 per field. Fields selected by gtMask
         //keep arg1; in the remaining fields the result is sub(gtMask, arg1), i.e. 0 - arg1 provided gt yields
         //an all-zero field when the comparison is false (TODO confirm gt returns full-field masks).
     1923template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1)
     1924{
     1925        bitblock256_t gtMask = simd256<4>::gt(arg1, simd256<4>::constant<0>());
     1926        return simd256<1>::ifh(gtMask, arg1, simd256<4>::sub(gtMask, arg1));
     1927}
     1928
     1929//The total number of operations is 4.0
         //8-bit absolute value: applies _mm_abs_epi8 separately to the halves returned by avx_select_hi128 and
         //avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
     1930template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1)
     1931{
     1932        return avx_general_combine256(_mm_abs_epi8(avx_select_hi128(arg1)), _mm_abs_epi8(avx_select_lo128(arg1)));
     1933}
     1934
     1935//The total number of operations is 4.0
         //16-bit absolute value: applies _mm_abs_epi16 separately to the halves returned by avx_select_hi128
         //and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
     1936template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1)
     1937{
     1938        return avx_general_combine256(_mm_abs_epi16(avx_select_hi128(arg1)), _mm_abs_epi16(avx_select_lo128(arg1)));
     1939}
     1940
     1941//The total number of operations is 4.0
         //32-bit absolute value: applies _mm_abs_epi32 separately to the halves returned by avx_select_hi128
         //and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
     1942template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1)
     1943{
     1944        return avx_general_combine256(_mm_abs_epi32(avx_select_hi128(arg1)), _mm_abs_epi32(avx_select_lo128(arg1)));
     1945}
     1946
     1947//The total number of operations is 13.0
         //64-bit absolute value. gtMask is the result of comparing arg1 > 0 per field. Fields selected by
         //gtMask keep arg1; in the remaining fields the result is sub(gtMask, arg1), i.e. 0 - arg1 provided gt
         //yields an all-zero field when the comparison is false (TODO confirm gt returns full-field masks).
     1948template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1)
     1949{
     1950        bitblock256_t gtMask = simd256<64>::gt(arg1, simd256<64>::constant<0>());
     1951        return simd256<1>::ifh(gtMask, arg1, simd256<64>::sub(gtMask, arg1));
     1952}
     1953
     1954//The total number of operations is 69.0
     1955template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1)
     1956{
     1957        bitblock256_t eqMask = simd256<128>::eq(simd256<1>::ifh(simd256<128>::himask(), simd256<(64)>::abs(arg1), arg1), arg1);
     1958        return simd256<1>::ifh(eqMask, arg1, simd256<128>::sub(eqMask, arg1));
     1959}
     1960
     1961//The total number of operations is 204.833333333
     1962template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1)
     1963{
     1964        bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1);
     1965        return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1));
     1966}
     1967
     1968//The total number of operations is 2.0
         //1-bit equality: a result bit is 1 where the operand bits agree, i.e. NOT(XOR) of the operands.
     1969template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1970{
     1971        return simd_not(simd_xor(arg1, arg2));
     1972}
     1973
     1974//The total number of operations is 14.0
         //2-bit equality. tmpAns is the per-bit equality. ANDing it with itself shifted right by one (within
         //2-bit fields) leaves, in the low bit of each field, "both bits equal" and clears the high bit.
         //hiMask copies that bit up to the high position; the OR gives an all-ones field iff the fields match.
     1975template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1976{
     1977        bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);
     1978        bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));
     1979        bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);
     1980        return simd_or(loMask, hiMask);
     1981}
     1982
     1983//The total number of operations is 17.0
         //4-bit equality via the 8-bit eq. Each nibble is compared on its own: both operands are masked down
         //to the high (resp. low) nibble, compared as bytes, and the byte-wide result is masked back to that
         //nibble. The two nibble results are ORed together.
     1984template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1985{
     1986        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
     1987}
     1988
     1989//The total number of operations is 5.0
         //8-bit equality: applies _mm_cmpeq_epi8 (all-ones byte where equal, zero otherwise) separately to the
         //halves returned by avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to
         //avx_general_combine256.
     1990template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1991{
     1992        return avx_general_combine256(_mm_cmpeq_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1993}
     1994
     1995//The total number of operations is 5.0
         //16-bit equality: applies _mm_cmpeq_epi16 (all-ones field where equal, zero otherwise) separately to
         //the halves returned by avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to
         //avx_general_combine256.
     1996template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1997{
     1998        return avx_general_combine256(_mm_cmpeq_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1999}
     2000
     2001//The total number of operations is 5.0
         //32-bit equality: applies _mm_cmpeq_epi32 (all-ones field where equal, zero otherwise) separately to
         //the halves returned by avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to
         //avx_general_combine256.
     2002template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2003{
     2004        return avx_general_combine256(_mm_cmpeq_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2005}
     2006
     2007//The total number of operations is 5.0
         //64-bit equality: applies _mm_cmpeq_epi64 (all-ones field where equal, zero otherwise) separately to
         //the halves returned by avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to
         //avx_general_combine256.
     2008template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2009{
     2010        return avx_general_combine256(_mm_cmpeq_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2011}
     2012
     2013//The total number of operations is 23.6666666667
         //128-bit equality from the 64-bit eq. tmpAns holds the two 64-bit comparison masks of each field.
         //ANDing it with itself shifted right by 64 leaves, in the low half, "both halves equal" and clears
         //the high half. hiMask copies that result into the high half; the OR gives a full-field mask.
     2014template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2015{
     2016        bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
     2017        bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
     2018        bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
     2019        return simd_or(loMask, hiMask);
     2020}
     2021
     2022//The total number of operations is 54.1666666667
         //256-bit equality from the 128-bit eq. tmpAns holds the two 128-bit comparison masks. ANDing it with
         //itself shifted right by 128 leaves, in the low half, "both halves equal" and clears the high half.
         //hiMask copies that result into the high half; the OR gives a full 256-bit mask.
     2023template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2024{
     2025        bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
     2026        bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
     2027        bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
     2028        return simd_or(loMask, hiMask);
     2029}
     2030
     2031//The total number of operations is 7.0
         //2-bit arithmetic shift right by the compile-time amount sh. sh == 0 returns arg1 unchanged. Any other
         //sh keeps the high (sign) bit and ORs in the field shifted right by one, so both bits end up equal to
         //the sign bit - the code treats every non-zero sh like a shift by 1.
     2032template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
     2033{
     2034        return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
     2035}
     2036
     2037//The total number of operations is 17.5
         //4-bit arithmetic shift right composed from the 2-bit srai and the 4-bit srli.
         //Upper half of each field: the 2-bit srai of arg1 with the shift capped at 2, masked with himask.
         //Lower part: for sh <= 2 the plain 4-bit logical shift; otherwise a logical shift by 2 followed by a
         //2-bit arithmetic shift by the remaining sh-2. The two parts are ORed.
         //NOTE(review): both arms of the ?: are compiled for every sh, including srai<(sh-2)> when sh < 2 -
         //confirm the toolchain accepts that template argument.
     2038template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
     2039{
     2040        return simd_or(simd_and(simd256<4>::himask(), simd256<(2)>::srai<((sh < (2)) ? sh : (2))>(arg1)), ((sh <= (2)) ? simd256<4>::srli<sh>(arg1) : simd256<(2)>::srai<(sh-(2))>(simd256<4>::srli<(2)>(arg1))));
     2041}
     2042
     2043//The total number of operations is 12.0
         //8-bit arithmetic shift right. tmp is the logical shift by sh, with sh clamped to at most 7. After
         //that shift the original sign bit sits at bit (7 - sh); the constant 1 << (8 - sh - 1) isolates it.
         //Subtracting that isolated bit from zero sets that bit and every bit above it when the sign was 1,
         //and the OR writes this sign extension into tmp. sh is unsigned, so the (sh < 0) tests never fire.
     2044template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
     2045{
     2046        bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     2047        return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     2048}
     2049
     2050//The total number of operations is 4.0
         //16-bit arithmetic shift right by sh: applies _mm_srai_epi16 separately to the halves returned by
         //avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
     2051template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
     2052{
     2053        return avx_general_combine256(_mm_srai_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
     2054}
     2055
     2056//The total number of operations is 4.0
         //32-bit arithmetic shift right by sh: applies _mm_srai_epi32 separately to the halves returned by
         //avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
     2057template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
     2058{
     2059        return avx_general_combine256(_mm_srai_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
     2060}
     2061
     2062//The total number of operations is 12.0
         //64-bit arithmetic shift right composed from the 32-bit srai and the 64-bit srli.
         //Upper half of each field: the 32-bit srai of arg1 with the shift capped at 32, masked with himask.
         //Lower part: for sh <= 32 the plain 64-bit logical shift; otherwise a logical shift by 32 followed by
         //a 32-bit arithmetic shift by the remaining sh-32. The two parts are ORed.
     2063template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
     2064{
     2065        return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));
     2066}
     2067
     2068//The total number of operations is 28.3333333333
         //128-bit arithmetic shift right composed from the 64-bit srai and the 128-bit srli.
         //Upper half of each field: the 64-bit srai of arg1 with the shift capped at 64, masked with himask.
         //Lower part: for sh <= 64 the plain 128-bit logical shift; otherwise a logical shift by 64 followed
         //by a 64-bit arithmetic shift by the remaining sh-64. The two parts are ORed.
     2069template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
     2070{
     2071        return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));
     2072}
     2073
     2074//The total number of operations is 59.0
         //256-bit arithmetic shift right composed from the 128-bit srai and the 256-bit srli.
         //Upper half: the 128-bit srai of arg1 with the shift capped at 128, masked with himask.
         //Lower part: for sh <= 128 the plain 256-bit logical shift; otherwise a logical shift by 128 followed
         //by a 128-bit arithmetic shift by the remaining sh-128. The two parts are ORed.
     2075template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
     2076{
     2077        return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));
     2078}
     2079
     2080//The total number of operations is 0
         //High-half mask for 2-bit fields: returns simd256<2>::constant<2>(), i.e. binary 10 - only the high
         //bit of a field (the constant is presumably replicated into every field).
     2081template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask()
     2082{
     2083        return simd256<2>::constant<(2)>();
     2084}
     2085
     2086//The total number of operations is 0
         //High-half mask for 4-bit fields: returns simd256<4>::constant<12>(), i.e. binary 1100 - the high 2
         //bits of a field (the constant is presumably replicated into every field).
     2087template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask()
     2088{
     2089        return simd256<4>::constant<(12)>();
     2090}
     2091
     2092//The total number of operations is 0
         //High-half mask for 8-bit fields: returns simd256<8>::constant<240>(), i.e. 0xF0 - the high nibble of
         //a byte (the constant is presumably replicated into every field).
     2093template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask()
     2094{
     2095        return simd256<8>::constant<(240)>();
     2096}
     2097
     2098//The total number of operations is 0
         //High-half mask for 16-bit fields: returns simd256<16>::constant<65280>(), i.e. 0xFF00 - the high
         //byte of a field (the constant is presumably replicated into every field).
     2099template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask()
     2100{
     2101        return simd256<16>::constant<(65280)>();
     2102}
     2103
     2104//The total number of operations is 0
         //High-half mask for 32-bit fields: returns simd256<32>::constant<-65536>(); as a 32-bit pattern
         //-65536 is 0xFFFF0000, i.e. the high 16 bits of a field (presumably replicated into every field).
     2105template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask()
     2106{
     2107        return simd256<32>::constant<-65536>();
     2108}
     2109
     2110//The total number of operations is 0
         //High-half mask for 64-bit fields. _mm256_set_epi32 lists its 32-bit elements from most to least
         //significant, so the alternating (-1, 0) pattern sets the high 32 bits of every 64-bit field.
     2111template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask()
     2112{
     2113        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0))));
     2114}
     2115
     2116//The total number of operations is 0
         //High-half mask for 128-bit fields. _mm256_set_epi32 lists its 32-bit elements from most to least
         //significant, so the (-1, -1, 0, 0) pattern sets the high 64 bits of each 128-bit field.
     2117template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask()
     2118{
     2119        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0))));
     2120}
     2121
     2122//The total number of operations is 0
         //High-half mask for the single 256-bit field. _mm256_set_epi32 lists its 32-bit elements from most to
         //least significant, so the four leading -1s and four trailing zeros set exactly the high 128 bits.
     2123template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask()
     2124{
     2125        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));
     2126}
     2127
    9642128//The total number of operations is 1.0
    9652129template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2)
     
    10292193}
    10302194
    1031 //The total number of operations is 5.0
         //2-bit logical shift right by sh: shifts with the 32-bit srli, then ANDs with the 2-bit constant
         //(3 >> sh) to clear the bits that were shifted in from the neighbouring field.
    1032 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srli(bitblock256_t arg1)
    1033 {
    1034         return simd_and(simd256<32>::srli<sh>(arg1), simd256<2>::constant<((3)>>sh)>());
    1035 }
    1036 
    1037 //The total number of operations is 5.0
         //4-bit logical shift right by sh: shifts with the 32-bit srli, then ANDs with the 4-bit constant
         //(15 >> sh) to clear the bits that were shifted in from the neighbouring field.
    1038 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srli(bitblock256_t arg1)
    1039 {
    1040         return simd_and(simd256<32>::srli<sh>(arg1), simd256<4>::constant<((15)>>sh)>());
    1041 }
    1042 
    1043 //The total number of operations is 5.0
         //8-bit logical shift right by sh: shifts with the 32-bit srli, then ANDs with the 8-bit constant
         //(255 >> sh) to clear the bits that were shifted in from the neighbouring byte.
    1044 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srli(bitblock256_t arg1)
    1045 {
    1046         return simd_and(simd256<32>::srli<sh>(arg1), simd256<8>::constant<((255)>>sh)>());
    1047 }
    1048 
    1049 //The total number of operations is 4.0
         //16-bit logical shift right by sh: applies _mm_srli_epi16 separately to the halves returned by
         //avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
    1050 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srli(bitblock256_t arg1)
    1051 {
    1052         return avx_general_combine256(_mm_srli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
    1053 }
    1054 
    1055 //The total number of operations is 4.0
         //32-bit logical shift right by sh: applies _mm_srli_epi32 separately to the halves returned by
         //avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
    1056 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srli(bitblock256_t arg1)
    1057 {
    1058         return avx_general_combine256(_mm_srli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
    1059 }
    1060 
    1061 //The total number of operations is 4.0
         //64-bit logical shift right by sh: applies _mm_srli_epi64 separately to the halves returned by
         //avx_select_hi128 and avx_select_lo128, and passes both results (hi first) to avx_general_combine256.
    1062 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srli(bitblock256_t arg1)
    1063 {
    1064         return avx_general_combine256(_mm_srli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi64(avx_select_lo128(arg1), (int32_t)(sh)));
    1065 }
    1066 
    1067 //The total number of operations is 8.33333333333
         //128-bit logical shift right by sh, in three cases:
         // - sh is a multiple of 8: a single avx_byte_shift_right by sh/8 bytes;
         // - sh >= 64: byte-shift right by 8 bytes (64 bits), then a 64-bit srli by sh & 63;
         // - otherwise: the 64-bit srli by sh, ORed with the bits that must cross from the upper into the
         //   lower 64-bit half (64-bit slli by (128-sh)&63, which equals 64-sh here, then byte-shift right by 8).
         //NOTE(review): assumes avx_byte_shift_right shifts each 128-bit field independently - confirm.
    1068 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srli(bitblock256_t arg1)
    1069 {
    1070         return (((sh%8) == 0) ? avx_byte_shift_right(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::srli<(sh&63)>(avx_byte_shift_right(arg1, 8)) : simd_or(simd256<64>::srli<sh>(arg1), avx_byte_shift_right(simd256<64>::slli<((128-sh)&63)>(arg1), 8))));
    1071 }
    1072 
    1073 //The total number of operations is 14.5
         //256-bit logical shift right by the compile-time amount sh.
         // - sh == 0: returns arg1 unchanged. Without this guard the sh < 128 arm would OR the high 128 bits,
         //   shifted left by (256-0)&127 == 0, straight into the low half.
         // - 0 < sh < 128: each 128-bit half is shifted right by sh, and the bits leaving the high half are
         //   ORed into the top of the low half (high half placed in the low lane, shifted left by 128-sh).
         // - sh >= 128: the high half is moved to the low half and shifted right by sh & 127.
         //NOTE(review): _mm256_castsi128_si256 leaves the upper 128 bits undefined; this relies on them being
         //zero after avx_select_hi128 - confirm on the targeted compilers.
    1074 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srli(bitblock256_t arg1)
    1075 {
    1076         return ((sh == 0) ? arg1 : ((sh < 128) ? simd_or(simd256<128>::srli<sh>(arg1), simd256<128>::slli<((256-sh)&127)>(((bitblock256_t)(_mm256_castsi128_si256(avx_select_hi128(arg1)))))) : simd256<128>::srli<(sh&127)>(avx_move_hi128_to_lo128(arg1))));
    1077 }
    1078 
    1079 //The total number of operations is 1.0
         //1-bit count of trailing zeros: a single bit has one trailing zero exactly when it is 0, so the
         //result is the bitwise NOT of the input.
    1080 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ctz(bitblock256_t arg1)
    1081 {
    1082         return simd_not(arg1);
    1083 }
    1084 
    1085 //The total number of operations is 27.0
         //2-bit count of trailing zeros: (arg1 - 1) AND NOT arg1 sets exactly the bits below the lowest set
         //bit of each field (every bit when the field is zero); popcount then counts them.
    1086 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ctz(bitblock256_t arg1)
    1087 {
    1088         return simd256<2>::popcount(simd_andc(simd256<2>::sub(arg1, simd256<2>::constant<1>()), arg1));
    1089 }
    1090 
    1091 //The total number of operations is 36.0
         //4-bit count of trailing zeros: (arg1 - 1) AND NOT arg1 sets exactly the bits below the lowest set
         //bit of each field (every bit when the field is zero); popcount then counts them.
    1092 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ctz(bitblock256_t arg1)
    1093 {
    1094         return simd256<4>::popcount(simd_andc(simd256<4>::sub(arg1, simd256<4>::constant<1>()), arg1));
    1095 }
    1096 
    1097 //The total number of operations is 38.0
         //8-bit count of trailing zeros: (arg1 - 1) AND NOT arg1 sets exactly the bits below the lowest set
         //bit of each field (every bit when the field is zero); popcount then counts them.
    1098 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ctz(bitblock256_t arg1)
    1099 {
    1100         return simd256<8>::popcount(simd_andc(simd256<8>::sub(arg1, simd256<8>::constant<1>()), arg1));
    1101 }
    1102 
    1103 //The total number of operations is 48.0
         //16-bit count of trailing zeros: (arg1 - 1) AND NOT arg1 sets exactly the bits below the lowest set
         //bit of each field (every bit when the field is zero); popcount then counts them.
    1104 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ctz(bitblock256_t arg1)
    1105 {
    1106         return simd256<16>::popcount(simd_andc(simd256<16>::sub(arg1, simd256<16>::constant<1>()), arg1));
    1107 }
    1108 
    1109 //The total number of operations is 58.0
         //32-bit count of trailing zeros: (arg1 - 1) AND NOT arg1 sets exactly the bits below the lowest set
         //bit of each field (every bit when the field is zero); popcount then counts them.
    1110 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ctz(bitblock256_t arg1)
    1111 {
    1112         return simd256<32>::popcount(simd_andc(simd256<32>::sub(arg1, simd256<32>::constant<1>()), arg1));
    1113 }
    1114 
    1115 //The total number of operations is 44.0
         //64-bit count of trailing zeros: (arg1 - 1) AND NOT arg1 sets exactly the bits below the lowest set
         //bit of each field (every bit when the field is zero); popcount then counts them.
    1116 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ctz(bitblock256_t arg1)
    1117 {
    1118         return simd256<64>::popcount(simd_andc(simd256<64>::sub(arg1, simd256<64>::constant<1>()), arg1));
    1119 }
    1120 
    1121 //The total number of operations is 101.0
         //128-bit count of trailing zeros: (arg1 - 1) AND NOT arg1 sets exactly the bits below the lowest set
         //bit of each field (every bit when the field is zero); popcount then counts them.
    1122 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ctz(bitblock256_t arg1)
    1123 {
    1124         return simd256<128>::popcount(simd_andc(simd256<128>::sub(arg1, simd256<128>::constant<1>()), arg1));
    1125 }
    1126 
    1127 //The total number of operations is 192.166666667
         //256-bit count of trailing zeros: (arg1 - 1) AND NOT arg1 sets exactly the bits below the lowest set
         //bit of the block (every bit when the block is zero); popcount then counts them.
    1128 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ctz(bitblock256_t arg1)
    1129 {
    1130         return simd256<256>::popcount(simd_andc(simd256<256>::sub(arg1, simd256<256>::constant<1>()), arg1));
    1131 }
    1132 
    1133 //The total number of operations is 1.0
         //1-bit unsigned greater-than: a bit of arg1 exceeds the matching bit of arg2 only for 1 > 0, which is
         //computed with simd_andc(arg1, arg2) (presumably arg1 AND NOT arg2).
    1134 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1135 {
    1136         return simd_andc(arg1, arg2);
    1137 }
    1138 
    1139 //The total number of operations is 23.0
         //2-bit unsigned greater-than. tmpAns is the per-bit "greater" result. mask records, in the low bit of
         //each field, "low bit greater AND high bits equal" and is then copied to the high bit as well.
         //srai<1>(tmpAns) spreads the high-bit comparison over the whole field. The OR is an all-ones field
         //when the high bit decides, or when the high bits tie and the low bit decides.
    1140 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1141 {
    1142         bitblock256_t tmpAns = simd256<(1)>::ugt(arg1, arg2);
    1143         bitblock256_t mask = simd_and(tmpAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
    1144         mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
    1145         return simd_or(simd256<2>::srai<(1)>(tmpAns), mask);
    1146 }
    1147 
    1148 //The total number of operations is 20.0
         //4-bit unsigned greater-than via the 8-bit ugt. High nibble: arg1 with its low nibble cleared is
         //compared against the full arg2 byte, which is true exactly when arg1's high nibble is larger.
         //Low nibble: both operands are compared with their high nibbles cleared. ifh with the 8-bit himask
         //picks which comparison supplies each nibble
         //(assumes ifh(m, a, b) yields a where m is set, b otherwise - TODO confirm).
    1149 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1150 {
    1151         return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::ugt(simd_and(simd256<(8)>::himask(), arg1), arg2), simd256<(8)>::ugt(simd_andc(arg1, simd256<(8)>::himask()), simd_andc(arg2, simd256<(8)>::himask())));
    1152 }
    1153 
    1154 //The total number of operations is 7.0
         //8-bit unsigned greater-than via simd256<8>::gt (presumably the signed compare - TODO confirm).
         //XOR-ing both operands with 128 flips the top bit of each byte, which maps unsigned order onto
         //signed order.
    1155 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1156 {
    1157         bitblock256_t high_bit = simd256<8>::constant<(128)>();
    1158         return simd256<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1159 }
    1160 
    1160 //The total number of operations is 7.0
         //16-bit unsigned greater-than via simd256<16>::gt (presumably the signed compare - TODO confirm).
         //XOR-ing both operands with 32768 flips the top bit of each field, which maps unsigned order onto
         //signed order.
    1162 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1163 {
    1164         bitblock256_t high_bit = simd256<16>::constant<(32768)>();
    1165         return simd256<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1166 }
    1167 
    1167 //The total number of operations is 7.0
         //32-bit unsigned greater-than via simd256<32>::gt (presumably the signed compare - TODO confirm).
         //XOR-ing both operands with 2^31 flips the top bit of each field, which maps unsigned order onto
         //signed order.
    1169 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1170 {
    1171         bitblock256_t high_bit = simd256<32>::constant<(2147483648ULL)>();
    1172         return simd256<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1173 }
    1174 
    1174 //The total number of operations is 7.0
         //64-bit unsigned greater-than via simd256<64>::gt (presumably the signed compare - TODO confirm).
         //XOR-ing both operands with 2^63 flips the top bit of each field, which maps unsigned order onto
         //signed order.
    1176 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1177 {
    1178         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
    1179         return simd256<64>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1180 }
    1181 
    1182 //The total number of operations is 60.0
         //128-bit unsigned greater-than from 64-bit pieces. tmpAns holds the 64-bit comparison masks. mask
         //records, in the low half of each field, "low half greater AND high halves equal" and is then copied
         //into the high half. srai<64>(tmpAns) spreads the high-half comparison over the whole field. The OR
         //is all ones when the high half decides, or when the high halves tie and the low half decides.
    1183 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1184 {
    1185         bitblock256_t tmpAns = simd256<(64)>::ugt(arg1, arg2);
    1186         bitblock256_t mask = simd_and(tmpAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
    1187         mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
    1188         return simd_or(simd256<128>::srai<(64)>(tmpAns), mask);
    1189 }
    1190 
    1191 //The total number of operations is 174.166666667
         //256-bit unsigned greater-than from 128-bit pieces. tmpAns holds the 128-bit comparison masks. mask
         //records, in the low half, "low half greater AND high halves equal" and is then copied into the high
         //half. srai<128>(tmpAns) spreads the high-half comparison over the whole block. The OR is all ones
         //when the high half decides, or when the high halves tie and the low half decides.
    1192 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1193 {
    1194         bitblock256_t tmpAns = simd256<(128)>::ugt(arg1, arg2);
    1195         bitblock256_t mask = simd_and(tmpAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
    1196         mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
    1197         return simd_or(simd256<256>::srai<(128)>(tmpAns), mask);
    1198 }
    1199 
    1200 //The total number of operations is 7.0
         //XOR of the high and low halves of each 2-bit field: the field shifted right by 1 (high bit moved
         //down) is XOR-ed with the low bit of arg1 (arg1 AND lomask).
    1201 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::xor_hl(bitblock256_t arg1)
    1202 {
    1203         return simd_xor(simd256<2>::srli<(1)>(arg1), simd_and(arg1, simd256<2>::lomask()));
    1204 }
    1205 
    1206 //The total number of operations is 7.0
         //XOR of the high and low halves of each 4-bit field: the field shifted right by 2 (high half moved
         //down) is XOR-ed with the low half of arg1 (arg1 AND lomask).
    1207 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::xor_hl(bitblock256_t arg1)
    1208 {
    1209         return simd_xor(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));
    1210 }
    1211 
    1212 //The total number of operations is 7.0
         //XOR of the high and low halves of each 8-bit field: the field shifted right by 4 (high nibble moved
         //down) is XOR-ed with the low nibble of arg1 (arg1 AND lomask).
    1213 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::xor_hl(bitblock256_t arg1)
    1214 {
    1215         return simd_xor(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));
    1216 }
    1217 
    1218 //The total number of operations is 6.0
         //XOR of the high and low halves of each 16-bit field: the field shifted right by 8 (high byte moved
         //down) is XOR-ed with the low byte of arg1 (arg1 AND lomask).
    1219 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::xor_hl(bitblock256_t arg1)
    1220 {
    1221         return simd_xor(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));
    1222 }
    1223 
    1224 //The total number of operations is 6.0
         //XOR of the high and low halves of each 32-bit field: the field shifted right by 16 (high half moved
         //down) is XOR-ed with the low half of arg1 (arg1 AND lomask).
    1225 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::xor_hl(bitblock256_t arg1)
    1226 {
    1227         return simd_xor(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));
    1228 }
    1229 
    1230 //The total number of operations is 6.0
         //XOR of the high and low halves of each 64-bit field: the field shifted right by 32 (high half moved
         //down) is XOR-ed with the low half of arg1 (arg1 AND lomask).
    1231 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::xor_hl(bitblock256_t arg1)
    1232 {
    1233         return simd_xor(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));
    1234 }
    1235 
    1236 //The total number of operations is 10.3333333333
         //XOR of the high and low halves of each 128-bit field: the field shifted right by 64 (high half
         //moved down) is XOR-ed with the low half of arg1 (arg1 AND lomask).
    1237 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::xor_hl(bitblock256_t arg1)
    1238 {
    1239         return simd_xor(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));
    1240 }
    1241 
    1242 //The total number of operations is 16.5
         //XOR of the high and low halves of the 256-bit block: the block shifted right by 128 (high half
         //moved down) is XOR-ed with the low half of arg1 (arg1 AND lomask).
    1243 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::xor_hl(bitblock256_t arg1)
    1244 {
    1245         return simd_xor(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
    1246 }
    1247 
    1248 //The total number of operations is 0
         //1-bit population count: the number of set bits in a single bit is the bit itself, so the input is
         //returned unchanged.
    1249 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::popcount(bitblock256_t arg1)
    1250 {
    1251         return arg1;
    1252 }
    1253 
    1254 //The total number of operations is 10.0
         //2-bit population count: takes the 1-bit counts and combines them with simd256<2>::add_hl
         //(presumably the sum of the high and low halves of each field - TODO confirm).
    1255 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::popcount(bitblock256_t arg1)
    1256 {
    1257         return simd256<2>::add_hl(simd256<(1)>::popcount(arg1));
    1258 }
    1259 
    1260 //The total number of operations is 21.0
         //4-bit population count: takes the 2-bit counts and combines them with simd256<4>::add_hl
         //(presumably the sum of the high and low halves of each field - TODO confirm).
    1261 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::popcount(bitblock256_t arg1)
    1262 {
    1263         return simd256<4>::add_hl(simd256<(2)>::popcount(arg1));
    1264 }
    1265 
    1266 //popcount on 8-bit fields: simd256<8>::add_hl applied to the 4-bit popcount of arg1. The total number of operations is 32.0
    1267 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::popcount(bitblock256_t arg1)
    1268 {
    1269         return simd256<8>::add_hl(simd256<(4)>::popcount(arg1));
    1270 }
    1271 
    1272 //popcount on 16-bit fields: simd256<16>::add_hl applied to the 8-bit popcount of arg1. The total number of operations is 42.0
    1273 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::popcount(bitblock256_t arg1)
    1274 {
    1275         return simd256<16>::add_hl(simd256<(8)>::popcount(arg1));
    1276 }
    1277 
    1278 //popcount on 32-bit fields: simd256<32>::add_hl applied to the 16-bit popcount of arg1. The total number of operations is 52.0
    1279 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::popcount(bitblock256_t arg1)
    1280 {
    1281         return simd256<32>::add_hl(simd256<(16)>::popcount(arg1));
    1282 }
    1283 
    1284 //popcount on 64-bit fields: takes the per-byte popcounts, then runs _mm_sad_epu8 against an all-zero vector on each 128-bit half and recombines the halves. The total number of operations is 38.0
    1285 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::popcount(bitblock256_t arg1)
    1286 {
    1287         bitblock256_t byteCounts = simd256<8>::popcount(arg1);
    1288         return avx_general_combine256(_mm_sad_epu8(avx_select_hi128(byteCounts), _mm_set1_epi32((int32_t)(0))), _mm_sad_epu8(avx_select_lo128(byteCounts), _mm_set1_epi32((int32_t)(0))));
    1289 }
    1290 
    1291 //popcount on 128-bit fields: simd256<128>::add_hl applied to the 64-bit popcount of arg1. The total number of operations is 73.6666666667
    1292 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1)
    1293 {
    1294         return simd256<128>::add_hl(simd256<(64)>::popcount(arg1));
    1295 }
    1296 
    1297 //popcount on the 256-bit block: 128-bit add of the two 128-bit popcounts (simd256<256>::srli<128> of the counts, plus the counts masked with simd256<256>::lomask()). The total number of operations is 115.5
    1298 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1)
    1299 {
    1300         bitblock256_t halfCounts = simd256<(128)>::popcount(arg1);
    1301         return simd256<(128)>::add(simd256<256>::srli<(128)>(halfCounts), simd_and(simd256<256>::lomask(), halfCounts));
    1302 }
    1303 
    1304 //neg on 2-bit fields: subtracts arg1 from the zero constant with simd256<2>::sub. The total number of operations is 16.0
    1305 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1)
    1306 {
    1307         return simd256<2>::sub(simd256<2>::constant<0>(), arg1);
    1308 }
    1309 
    1310 //neg on 4-bit fields: subtracts arg1 from the zero constant with simd256<4>::sub. The total number of operations is 14.0
    1311 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1)
    1312 {
    1313         return simd256<4>::sub(simd256<4>::constant<0>(), arg1);
    1314 }
    1315 
    1316 //neg on 8-bit fields: subtracts arg1 from the zero constant with simd256<8>::sub. The total number of operations is 5.0
    1317 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::neg(bitblock256_t arg1)
    1318 {
    1319         return simd256<8>::sub(simd256<8>::constant<0>(), arg1);
    1320 }
    1321 
    1322 //neg on 16-bit fields: subtracts arg1 from the zero constant with simd256<16>::sub. The total number of operations is 5.0
    1323 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::neg(bitblock256_t arg1)
    1324 {
    1325         return simd256<16>::sub(simd256<16>::constant<0>(), arg1);
    1326 }
    1327 
    1328 //neg on 32-bit fields: subtracts arg1 from the zero constant with simd256<32>::sub. The total number of operations is 5.0
    1329 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::neg(bitblock256_t arg1)
    1330 {
    1331         return simd256<32>::sub(simd256<32>::constant<0>(), arg1);
    1332 }
    1333 
    1334 //neg on 64-bit fields: subtracts arg1 from the zero constant with simd256<64>::sub. The total number of operations is 5.0
    1335 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::neg(bitblock256_t arg1)
    1336 {
    1337         return simd256<64>::sub(simd256<64>::constant<0>(), arg1);
    1338 }
    1339 
    1340 //neg on 128-bit fields: subtracts arg1 from the zero constant with simd256<128>::sub. The total number of operations is 26.3333333333
    1341 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1)
    1342 {
    1343         return simd256<128>::sub(simd256<128>::constant<0>(), arg1);
    1344 }
    1345 
    1346 //neg on the 256-bit block: subtracts arg1 from the zero constant with simd256<256>::sub. The total number of operations is 75.6666666667
    1347 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1)
    1348 {
    1349         return simd256<256>::sub(simd256<256>::constant<0>(), arg1);
    1350 }
    1351 
    1352 //slli<sh> on 2-bit fields: shifts with simd256<32>::slli<sh>, then ANDs with the 2-bit constant ((3<<sh)&3), presumably to clear bits that crossed in from the neighbouring field. The total number of operations is 5.0
    1353 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1)
    1354 {
    1355         return simd_and(simd256<2>::constant<(((3)<<sh)&(3))>(), simd256<32>::slli<sh>(arg1));
    1356 }
    1357 
    1358 //slli<sh> on 4-bit fields: shifts with simd256<32>::slli<sh>, then ANDs with the 4-bit constant ((15<<sh)&15), presumably to clear bits that crossed in from the neighbouring field. The total number of operations is 5.0
    1359 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1)
    1360 {
    1361         return simd_and(simd256<4>::constant<(((15)<<sh)&(15))>(), simd256<32>::slli<sh>(arg1));
    1362 }
    1363 
    1364 //slli<sh> on 8-bit fields: shifts with simd256<32>::slli<sh>, then ANDs with the 8-bit constant ((255<<sh)&255), presumably to clear bits that crossed in from the neighbouring field. The total number of operations is 5.0
    1365 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1)
    1366 {
    1367         return simd_and(simd256<8>::constant<(((255)<<sh)&(255))>(), simd256<32>::slli<sh>(arg1));
    1368 }
    1369 
    1370 //slli<sh> on 16-bit fields: applies _mm_slli_epi16 with count sh to the hi and lo 128-bit halves separately and recombines them with avx_general_combine256. The total number of operations is 4.0
    1371 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1)
    1372 {
    1373         return avx_general_combine256(_mm_slli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
    1374 }
    1375 
    1376 //slli<sh> on 32-bit fields: applies _mm_slli_epi32 with count sh to the hi and lo 128-bit halves separately and recombines them with avx_general_combine256. The total number of operations is 4.0
    1377 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1)
    1378 {
    1379         return avx_general_combine256(_mm_slli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
    1380 }
    1381 
    1382 //slli<sh> on 64-bit fields: applies _mm_slli_epi64 with count sh to the hi and lo 128-bit halves separately and recombines them with avx_general_combine256. The total number of operations is 4.0
    1383 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1)
    1384 {
    1385         return avx_general_combine256(_mm_slli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi64(avx_select_lo128(arg1), (int32_t)(sh)));
    1386 }
    1387 
    1388 //slli<sh> on 128-bit fields, three cases: sh a multiple of 8 -> avx_byte_shift_left by sh/8 bytes; sh >= 64 -> byte shift by 8 followed by simd256<64>::slli<sh&63>; otherwise -> 64-bit slli<sh> ORed with the 8-byte-shifted result of simd256<64>::srli<(128-sh)&63> (the bits that cross the 64-bit boundary). The total number of operations is 8.33333333333
    1389 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1)
    1390 {
    1391         return (((sh%8) == 0) ? avx_byte_shift_left(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::slli<(sh&63)>(avx_byte_shift_left(arg1, 8)) : simd_or(simd256<64>::slli<sh>(arg1), avx_byte_shift_left(simd256<64>::srli<((128-sh)&63)>(arg1), 8))));
    1392 }
    1393 
    1394 //slli<sh> on the whole 256-bit block. sh == 0 returns arg1 directly: without this guard the sh < 128 path computes simd256<128>::srli<(256-0)&127 = 0>(arg1), moves it into the high half and ORs it in, corrupting the high 128 bits. 0 < sh < 128: 128-bit slli<sh> ORed with the bits carried out of the low half (avx_move_lo128_to_hi128 of 128-bit srli<256-sh>). sh >= 128: low half moved to the high half, then 128-bit slli<sh&127>. The total number of operations is 14.0 (generator's figure, not recomputed for the added guard)
    1395 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1)
    1396 {
    1397         return ((sh == 0) ? arg1 : ((sh < 128) ? simd_or(simd256<128>::slli<sh>(arg1), avx_move_lo128_to_hi128(simd256<128>::srli<((256-sh)&127)>(arg1))) : simd256<128>::slli<(sh&127)>(avx_move_lo128_to_hi128(arg1))));
    1398 }
    1399 
    1400 //ifh on 1-bit fields: bitwise select with arg1 as the selector - simd_andc(arg3, arg1) ORed with (arg1 AND arg2). The total number of operations is 3.0
    1401 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1402 {
    1403         return simd_or(simd_andc(arg3, arg1), simd_and(arg1, arg2));
    1404 }
    1405 
    1406 //ifh on 2-bit fields: builds a selector by taking arg1 at the simd256<2>::himask() positions and simd256<2>::srli<1>(arg1) elsewhere, then hands selector, arg2 and arg3 to simd256<1>::ifh. The total number of operations is 11.0
    1407 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1408 {
    1409         return simd256<(1)>::ifh(simd256<1>::ifh(simd256<2>::himask(), arg1, simd256<2>::srli<(1)>(arg1)), arg2, arg3);
    1410 }
    1411 
    1412 //ifh on 4-bit fields: builds a selector by taking arg1 at the simd256<4>::himask() positions and simd256<4>::srli<2>(arg1) elsewhere, then hands selector, arg2 and arg3 to simd256<2>::ifh. The total number of operations is 19.0
    1413 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1414 {
    1415         return simd256<(2)>::ifh(simd256<1>::ifh(simd256<4>::himask(), arg1, simd256<4>::srli<(2)>(arg1)), arg2, arg3);
    1416 }
    1417 
    1418 //ifh on 8-bit fields: the selector is simd256<8>::gt(0, arg1) (i.e. the comparison "zero greater than arg1"); arg2/arg3 are then chosen bitwise by simd256<1>::ifh. The total number of operations is 8.0
    1419 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1420 {
    1421         return simd256<1>::ifh(simd256<8>::gt(simd256<8>::constant<0>(), arg1), arg2, arg3);
    1422 }
    1423 
    1424 //ifh on 16-bit fields: the selector is simd256<16>::gt(0, arg1) (i.e. the comparison "zero greater than arg1"); arg2/arg3 are then chosen bitwise by simd256<1>::ifh. The total number of operations is 8.0
    1425 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1426 {
    1427         return simd256<1>::ifh(simd256<16>::gt(simd256<16>::constant<0>(), arg1), arg2, arg3);
    1428 }
    1429 
    1430 //ifh on 32-bit fields: the selector is simd256<32>::gt(0, arg1) (i.e. the comparison "zero greater than arg1"); arg2/arg3 are then chosen bitwise by simd256<1>::ifh. The total number of operations is 8.0
    1431 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1432 {
    1433         return simd256<1>::ifh(simd256<32>::gt(simd256<32>::constant<0>(), arg1), arg2, arg3);
    1434 }
    1435 
    1436 //ifh on 64-bit fields: a single _mm256_blendv_pd(arg3, arg2, arg1) with the operands cast to __m256d and the result cast back; arg1 acts as the blend mask. The total number of operations is 1.0
    1437 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1438 {
    1439         return (bitblock256_t)_mm256_blendv_pd((__m256d)(arg3), (__m256d)(arg2), (__m256d)(arg1));
    1440 }
    1441 
    1442 //ifh on 128-bit fields: builds a selector by taking arg1 at the simd256<128>::himask() positions and simd256<128>::srli<64>(arg1) elsewhere, then hands selector, arg2 and arg3 to simd256<64>::ifh. The total number of operations is 12.3333333333
    1443 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1444 {
    1445         return simd256<(64)>::ifh(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg1)), arg2, arg3);
    1446 }
    1447 
    1448 //ifh on the 256-bit block: builds a selector by taking arg1 at the simd256<256>::himask() positions and simd256<256>::srli<128>(arg1) elsewhere, then hands selector, arg2 and arg3 to simd256<128>::ifh. The total number of operations is 29.8333333333
    1449 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1450 {
    1451         return simd256<(128)>::ifh(simd256<1>::ifh(simd256<256>::himask(), arg1, simd256<256>::srli<(128)>(arg1)), arg2, arg3);
    1452 }
    1453 
    1454 //sub on 1-bit fields: implemented as simd_xor(arg1, arg2) (subtraction modulo 2). The total number of operations is 1.0
    1455 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2)
    1456 {
    1457         return simd_xor(arg1, arg2);
    1458 }
    1459 
    1460 //sub on 2-bit fields: per-bit difference first, then the borrow derived at the low bit positions is shifted up one bit (simd256<2>::slli<1>) and subtracted from the high bit only; low positions keep the per-bit difference. The total number of operations is 16.0
    1461 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::sub(bitblock256_t arg1, bitblock256_t arg2)
    1462 {
    1463         bitblock256_t bitDiff = simd256<(1)>::sub(arg1, arg2);
    1464         bitblock256_t lowBits = simd256<2>::lomask();
    1465         bitblock256_t borrowBits = simd_or(simd_andc(arg2, arg1), simd_and(simd_not(simd_xor(arg1, arg2)), bitDiff));
    1466         bitblock256_t borrowIn = simd256<2>::slli<1>(simd_and(borrowBits, lowBits));
    1467         return simd256<1>::ifh(lowBits, bitDiff, simd256<(1)>::sub(bitDiff, borrowIn));
    1468 }
    1469 
    1470 //sub on 4-bit fields via two 8-bit subtractions: at the simd256<8>::himask() positions the result comes from arg1 - (arg2 AND himask) (so the low nibble of arg2 cannot borrow from the high nibble); elsewhere it comes from the plain 8-bit arg1 - arg2. The total number of operations is 14.0
    1471 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::sub(bitblock256_t arg1, bitblock256_t arg2)
    1472 {
    1473         return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::sub(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::sub(arg1, arg2));
    1474 }
    1475 
    1476 //sub on 8-bit fields: applies _mm_sub_epi8 to the hi halves and to the lo halves of arg1/arg2 and recombines the two 128-bit results with avx_general_combine256. The total number of operations is 5.0
    1477 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::sub(bitblock256_t arg1, bitblock256_t arg2)
    1478 {
    1479         return avx_general_combine256(_mm_sub_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1480 }
    1481 
    1482 //sub on 16-bit fields: applies _mm_sub_epi16 to the hi halves and to the lo halves of arg1/arg2 and recombines the two 128-bit results with avx_general_combine256. The total number of operations is 5.0
    1483 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sub(bitblock256_t arg1, bitblock256_t arg2)
    1484 {
    1485         return avx_general_combine256(_mm_sub_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1486 }
    1487 
    1488 //sub on 32-bit fields: applies _mm_sub_epi32 to the hi halves and to the lo halves of arg1/arg2 and recombines the two 128-bit results with avx_general_combine256. The total number of operations is 5.0
    1489 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sub(bitblock256_t arg1, bitblock256_t arg2)
    1490 {
    1491         return avx_general_combine256(_mm_sub_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1492 }
    1493 
    1494 //sub on 64-bit fields: applies _mm_sub_epi64 to the hi halves and to the lo halves of arg1/arg2 and recombines the two 128-bit results with avx_general_combine256. The total number of operations is 5.0
    1495 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sub(bitblock256_t arg1, bitblock256_t arg2)
    1496 {
    1497         return avx_general_combine256(_mm_sub_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1498 }
    1499 
    1500 //sub on 128-bit fields: 64-bit subtraction first; the top bit of each 64-bit borrow word is extracted (64-bit srli<63>), moved into the upper 64-bit half of its 128-bit field (128-bit slli<64>) and subtracted there with a second 64-bit sub. The total number of operations is 26.3333333333
    1501 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::sub(bitblock256_t arg1, bitblock256_t arg2)
    1502 {
    1503         bitblock256_t diff64 = simd256<(64)>::sub(arg1, arg2);
    1504         bitblock256_t borrowBits = simd_or(simd_andc(diff64, simd_xor(arg1, arg2)), simd_andc(arg2, arg1));
    1505         bitblock256_t borrowIn = simd256<128>::slli<(64)>(simd256<(64)>::srli<(63)>(borrowBits));
    1506         return simd256<(64)>::sub(diff64, borrowIn);
    1507 }
    1508 
    1509 //sub on the 256-bit block: 128-bit subtraction first; the borrow word restricted to the low 128 bits is shifted left by one (256-bit slli<1>) so its top bit lands on bit 128, and is subtracted from the high half with a second 128-bit sub; the low half keeps the first result. The total number of operations is 75.6666666667
    1510 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sub(bitblock256_t arg1, bitblock256_t arg2)
    1511 {
    1512         bitblock256_t halfDiff = simd256<(128)>::sub(arg1, arg2);
    1513         bitblock256_t lowHalf = simd256<256>::lomask();
    1514         bitblock256_t borrowBits = simd_or(simd_andc(arg2, arg1), simd_and(simd_not(simd_xor(arg1, arg2)), halfDiff));
    1515         bitblock256_t borrowIn = simd256<256>::slli<1>(simd_and(borrowBits, lowHalf));
    1516         return simd256<1>::ifh(lowHalf, halfDiff, simd256<(128)>::sub(halfDiff, borrowIn));
    1517 }
    1518 
    1519 //add_hl on 2-bit fields: computed as a 16-bit subtraction, arg1 - (simd256<16>::srli<1>(arg1) AND simd256<2>::lomask()), rather than an explicit add of the two bits. The total number of operations is 10.0
    1520 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1)
    1521 {
    1522         return simd256<16>::sub(arg1, simd_and(simd256<2>::lomask(), simd256<16>::srli<1>(arg1)));
    1523 }
    1524 
    1525 //add_hl on 4-bit fields: 8-bit add of (arg1 masked with simd256<4>::lomask()) and simd256<4>::srli<2>(arg1). The total number of operations is 11.0
    1526 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add_hl(bitblock256_t arg1)
    1527 {
    1528         return simd256<(8)>::add(simd_and(simd256<4>::lomask(), arg1), simd256<4>::srli<(2)>(arg1));
    1529 }
    1530 
    1531 //add_hl on 8-bit fields: 16-bit add of (arg1 masked with simd256<8>::lomask()) and simd256<8>::srli<4>(arg1). The total number of operations is 11.0
    1532 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add_hl(bitblock256_t arg1)
    1533 {
    1534         return simd256<(16)>::add(simd_and(simd256<8>::lomask(), arg1), simd256<8>::srli<(4)>(arg1));
    1535 }
    1536 
    1537 //add_hl on 16-bit fields: 32-bit add of (arg1 masked with simd256<16>::lomask()) and simd256<16>::srli<8>(arg1). The total number of operations is 10.0
    1538 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add_hl(bitblock256_t arg1)
    1539 {
    1540         return simd256<(32)>::add(simd_and(simd256<16>::lomask(), arg1), simd256<16>::srli<(8)>(arg1));
    1541 }
    1542 
    1543 //add_hl on 32-bit fields: 64-bit add of (arg1 masked with simd256<32>::lomask()) and simd256<32>::srli<16>(arg1). The total number of operations is 10.0
    1544 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add_hl(bitblock256_t arg1)
    1545 {
    1546         return simd256<(64)>::add(simd_and(simd256<32>::lomask(), arg1), simd256<32>::srli<(16)>(arg1));
    1547 }
    1548 
    1549 //add_hl on 64-bit fields: 64-bit add of (arg1 masked with simd256<64>::lomask()) and simd256<64>::srli<32>(arg1). The total number of operations is 10.0
    1550 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add_hl(bitblock256_t arg1)
    1551 {
    1552         return simd256<64>::add(simd_and(simd256<64>::lomask(), arg1), simd256<64>::srli<(32)>(arg1));
    1553 }
    1554 
    1555 //add_hl on 128-bit fields: 128-bit add of (arg1 masked with simd256<128>::lomask()) and simd256<128>::srli<64>(arg1). The total number of operations is 35.6666666667
    1556 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1)
    1557 {
    1558         return simd256<128>::add(simd_and(simd256<128>::lomask(), arg1), simd256<128>::srli<(64)>(arg1));
    1559 }
    1560 
    1561 //add_hl on the 256-bit block: 256-bit add of (arg1 masked with simd256<256>::lomask()) and simd256<256>::srli<128>(arg1). The total number of operations is 91.1666666667
    1562 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1)
    1563 {
    1564         return simd256<256>::add(simd_and(simd256<256>::lomask(), arg1), simd256<256>::srli<(128)>(arg1));
    1565 }
    1566 
    1567 //constant<val> for 1-bit fields: forwards -1*val (unsigned arithmetic) to simd256<32>::constant, so val == 0 yields all zeros and val == 1 yields all ones. The total number of operations is 0
    1568 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant()
    1569 {
    1570         return simd256<32>::constant<(-1*val)>();
    1571 }
    1572 
    1573 //constant<val> for 2-bit fields: packs two copies of val into a 4-bit pattern ((val<<2)|(val&3)) and forwards it to simd256<4>::constant. The total number of operations is 0
    1574 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant()
    1575 {
    1576         return simd256<(4)>::constant<((val<<2)|(val&(3)))>();
    1577 }
    1578 
    1579 //constant<val> for 4-bit fields: packs two copies of val into an 8-bit pattern ((val<<4)|(val&15)) and forwards it to simd256<8>::constant. The total number of operations is 0
    1580 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::constant()
    1581 {
    1582         return simd256<(8)>::constant<((val<<4)|(val&(15)))>();
    1583 }
    1584 
    1585 //constant<val> for 8-bit fields: broadcasts val (cast to int32_t) to every byte with _mm256_set1_epi8. The total number of operations is 0
    1586 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::constant()
    1587 {
    1588         return (bitblock256_t)_mm256_set1_epi8((int32_t)(val));
    1589 }
    1590 
    1591 //constant<val> for 16-bit fields: broadcasts val (cast to int32_t) to every 16-bit element with _mm256_set1_epi16. The total number of operations is 0
    1592 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::constant()
    1593 {
    1594         return (bitblock256_t)_mm256_set1_epi16((int32_t)(val));
    1595 }
    1596 
    1597 //constant<val> for 32-bit fields: broadcasts val (cast to int32_t) to every 32-bit element with _mm256_set1_epi32. The total number of operations is 0
    1598 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::constant()
    1599 {
    1600         return (bitblock256_t)_mm256_set1_epi32((int32_t)(val));
    1601 }
    1602 
    1603 //constant<val> for 64-bit fields: _mm256_set_epi32 with the pair (val>>32, val) repeated four times, i.e. each 64-bit element is assembled from the upper and lower 32 bits of val. The total number of operations is 0
    1604 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::constant()
    1605 {
    1606         return ((bitblock256_t)(_mm256_set_epi32((int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val))));
    1607 }
    1608 
    1609 //constant<val> for 128-bit fields: _mm256_set_epi32 with (0, 0, val>>32, val) repeated twice; only 64 bits of value are representable because val is a uint64_t, the remaining 32-bit lanes of each field are zero. The total number of operations is 0
    1610 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::constant()
    1611 {
    1612         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val), (int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val))));
    1613 }
    1614 
    1615 //constant<val> for the 256-bit block: _mm256_set_epi32 with six zero lanes followed by (val>>32, val); only 64 bits of value are representable because val is a uint64_t. The total number of operations is 0
    1616 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::constant()
    1617 {
    1618         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val))));
    1619 }
    1620 
    1621 //min on 1-bit fields: implemented as simd_or(arg1, arg2) (a set bit presumably reads as -1 in signed 1-bit terms, so OR picks the smaller value). The total number of operations is 1.0
    1622 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::min(bitblock256_t arg1, bitblock256_t arg2)
    1623 {
    1624         return simd_or(arg1, arg2);
    1625 }
    1626 
    1627 //min on 2-bit fields: 1-bit min decides the high bit; the low bit comes from arg1 or arg2 depending on which one matches that high bit, or from the 1-bit umin when both match. The total number of operations is 25.0
    1628 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::min(bitblock256_t arg1, bitblock256_t arg2)
    1629 {
    1630         bitblock256_t lowPick = simd256<(1)>::umin(arg1, arg2);
    1631         bitblock256_t highPick = simd256<(1)>::min(arg1, arg2);
    1632         bitblock256_t matchesArg2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(highPick, arg2));
    1633         bitblock256_t matchesArg1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(highPick, arg1));
    1634         return simd256<1>::ifh(simd256<2>::himask(), highPick, simd256<1>::ifh(matchesArg1, simd256<1>::ifh(matchesArg2, lowPick, arg1), arg2));
    1635 }
    1636 
    1637 //min on 4-bit fields: XORs both operands with the constant 8 (the top bit of each 4-bit field), takes simd256<4>::umin, and XORs the constant back into the result. The total number of operations is 17.0
    1638 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::min(bitblock256_t arg1, bitblock256_t arg2)
    1639 {
    1640         bitblock256_t signFlip = simd256<4>::constant<(8)>();
    1641         return simd_xor(signFlip, simd256<4>::umin(simd_xor(signFlip, arg1), simd_xor(signFlip, arg2)));
    1642 }
    1643 
    1644 //min on 8-bit fields: applies _mm_min_epi8 to the hi halves and to the lo halves of arg1/arg2 and recombines the two 128-bit results with avx_general_combine256. The total number of operations is 5.0
    1645 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::min(bitblock256_t arg1, bitblock256_t arg2)
    1646 {
    1647         return avx_general_combine256(_mm_min_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1648 }
    1649 
    1650 //min on 16-bit fields: applies _mm_min_epi16 to the hi halves and to the lo halves of arg1/arg2 and recombines the two 128-bit results with avx_general_combine256. The total number of operations is 5.0
    1651 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::min(bitblock256_t arg1, bitblock256_t arg2)
    1652 {
    1653         return avx_general_combine256(_mm_min_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1654 }
    1655 
    1656 //min on 32-bit fields: applies _mm_min_epi32 to the hi halves and to the lo halves of arg1/arg2 and recombines the two 128-bit results with avx_general_combine256. The total number of operations is 5.0
    1657 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::min(bitblock256_t arg1, bitblock256_t arg2)
    1658 {
    1659         return avx_general_combine256(_mm_min_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1660 }
    1661 
    1662 //min on 64-bit fields: uses simd256<64>::gt(arg1, arg2) as a bitwise selector - where arg1 > arg2 the result is arg2, otherwise arg1. The total number of operations is 8.0
    1663 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::min(bitblock256_t arg1, bitblock256_t arg2)
    1664 {
    1665         return simd256<1>::ifh(simd256<64>::gt(arg1, arg2), arg2, arg1);
    1666 }
    1667 
    1668 //min on 128-bit fields: 64-bit signed min decides the high half; the low half comes from arg1 or arg2 depending on which one matches that high half, or from the 64-bit umin when both match. The total number of operations is 54.6666666667
    1669 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::min(bitblock256_t arg1, bitblock256_t arg2)
    1670 {
    1671         bitblock256_t lowPick = simd256<(64)>::umin(arg1, arg2);
    1672         bitblock256_t highPick = simd256<(64)>::min(arg1, arg2);
    1673         bitblock256_t matchesArg2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(highPick, arg2));
    1674         bitblock256_t matchesArg1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(highPick, arg1));
    1675         return simd256<1>::ifh(simd256<128>::himask(), highPick, simd256<1>::ifh(matchesArg1, simd256<1>::ifh(matchesArg2, lowPick, arg1), arg2));
    1676 }
    1677 
    1678 //min on the 256-bit block: 128-bit signed min decides the high half; the low half comes from arg1 or arg2 depending on which one matches that high half, or from the 128-bit umin when both match. The total number of operations is 186.666666667
    1679 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::min(bitblock256_t arg1, bitblock256_t arg2)
    1680 {
    1681         bitblock256_t lowPick = simd256<(128)>::umin(arg1, arg2);
    1682         bitblock256_t highPick = simd256<(128)>::min(arg1, arg2);
    1683         bitblock256_t matchesArg2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(highPick, arg2));
    1684         bitblock256_t matchesArg1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(highPick, arg1));
    1685         return simd256<1>::ifh(simd256<256>::himask(), highPick, simd256<1>::ifh(matchesArg1, simd256<1>::ifh(matchesArg2, lowPick, arg1), arg2));
    1686 }
    1687 
    1688 //lomask for 2-bit fields: returns simd256<2>::constant<1>(), i.e. the low bit of every 2-bit field. The total number of operations is 0
    1689 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
    1690 {
    1691         return simd256<2>::constant<(1)>();
    1692 }
    1693 
    1694 //lomask for 4-bit fields: returns simd256<4>::constant<3>(), i.e. the low two bits of every 4-bit field. The total number of operations is 0
    1695 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
    1696 {
    1697         return simd256<4>::constant<(3)>();
    1698 }
    1699 
    1700 //lomask for 8-bit fields: returns simd256<8>::constant<15>(), i.e. the low nibble of every byte. The total number of operations is 0
    1701 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
    1702 {
    1703         return simd256<8>::constant<(15)>();
    1704 }
    1705 
    1706 //lomask for 16-bit fields: returns simd256<16>::constant<255>(), i.e. the low byte of every 16-bit field. The total number of operations is 0
    1707 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
    1708 {
    1709         return simd256<16>::constant<(255)>();
    1710 }
    1711 
    1712 //lomask for 32-bit fields: returns simd256<32>::constant<65535>(), i.e. the low 16 bits of every 32-bit field. The total number of operations is 0
    1713 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
    1714 {
    1715         return simd256<32>::constant<(65535)>();
    1716 }
    1717 
    1718 //lomask for 64-bit fields: _mm256_set_epi32 with the lane pair (0, -1) repeated four times, i.e. all ones in the low 32 bits of every 64-bit field. The total number of operations is 0
    1719 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
    1720 {
    1721         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1))));
    1722 }
    1723 
    1724 //lomask for 128-bit fields: _mm256_set_epi32 with the lanes (0, 0, -1, -1) repeated twice, i.e. all ones in the low 64 bits of every 128-bit field. The total number of operations is 0
    1725 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
    1726 {
    1727         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1))));
    1728 }
    1729 
    1730 //lomask for the 256-bit block: _mm256_set_epi32 with four zero lanes followed by four -1 lanes, i.e. all ones in the low 128 bits. The total number of operations is 0
    1731 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
    1732 {
    1733         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1))));
    1734 }
    1735 
    1736 //umin on 1-bit fields: implemented as simd_and(arg1, arg2) (the unsigned minimum of two bits is their AND). The total number of operations is 1.0
    1737 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1738 {
    1739         return simd_and(arg1, arg2);
    1740 }
    1741 
    1742 //umin on 2-bit fields: 1-bit umin decides the high bit; the low bit comes from arg1 or arg2 depending on which one matches that high bit, or from the 1-bit umin itself when both match. The total number of operations is 24.0
    1743 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1744 {
    1745         bitblock256_t bitMin = simd256<(1)>::umin(arg1, arg2);
    1746         bitblock256_t matchesArg2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(bitMin, arg2));
    1747         bitblock256_t matchesArg1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(bitMin, arg1));
    1748         return simd256<1>::ifh(simd256<2>::himask(), bitMin, simd256<1>::ifh(matchesArg1, simd256<1>::ifh(matchesArg2, bitMin, arg1), arg2));
    1749 }
    1750 
    1751 //umin on 4-bit fields via two 8-bit umin calls: the low nibbles come from umin of the operands masked with simd256<8>::lomask(); the high nibbles come from the plain 8-bit umin masked with simd256<8>::himask(); the two parts are ORed. The total number of operations is 14.0
    1752 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1753 {
    1754         return simd_or(simd256<(8)>::umin(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)), simd_and(simd256<(8)>::himask(), simd256<(8)>::umin(arg1, arg2)));
    1755 }
    1756 
    1757 //umin on 8-bit fields: applies _mm_min_epu8 to the hi halves and to the lo halves of arg1/arg2 and recombines the two 128-bit results with avx_general_combine256. The total number of operations is 5.0
    1758 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1759 {
    1760         return avx_general_combine256(_mm_min_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1761 }
    1762 
    1763 //umin on 16-bit fields: applies _mm_min_epu16 to the hi halves and to the lo halves of arg1/arg2 and recombines the two 128-bit results with avx_general_combine256. The total number of operations is 5.0
    1764 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1765 {
    1766         return avx_general_combine256(_mm_min_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1767 }
    1768 
    1769 //umin on 32-bit fields: applies _mm_min_epu32 to the hi halves and to the lo halves of arg1/arg2 and recombines the two 128-bit results with avx_general_combine256. The total number of operations is 5.0
    1770 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1771 {
    1772         return avx_general_combine256(_mm_min_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1773 }
    1774 
    1775 //umin on 64-bit fields: XORs both operands with 2^63 (9223372036854775808, the top bit of each 64-bit field), takes the signed simd256<64>::min, and XORs the constant back into the result. The total number of operations is 11.0
    1776 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1777 {
    1778         bitblock256_t signFlip = simd256<64>::constant<(9223372036854775808ULL)>();
    1779         return simd_xor(signFlip, simd256<64>::min(simd_xor(signFlip, arg1), simd_xor(signFlip, arg2)));
    1780 }
    1781 
    1782 //umin on 128-bit fields: 64-bit umin decides the high half; the low half comes from arg1 or arg2 depending on which one matches that high half, or from the 64-bit umin itself when both match. The total number of operations is 46.6666666667
    1783 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1784 {
    1785         bitblock256_t halfMin = simd256<(64)>::umin(arg1, arg2);
    1786         bitblock256_t matchesArg2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(halfMin, arg2));
    1787         bitblock256_t matchesArg1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(halfMin, arg1));
    1788         return simd256<1>::ifh(simd256<128>::himask(), halfMin, simd256<1>::ifh(matchesArg1, simd256<1>::ifh(matchesArg2, halfMin, arg1), arg2));
    1789 }
    1790 
    1791 //umin on the 256-bit block: 128-bit umin decides the high half; the low half comes from arg1 or arg2 depending on which one matches that high half, or from the 128-bit umin itself when both match. The total number of operations is 132.0
    1792 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1793 {
    1794         bitblock256_t halfMin = simd256<(128)>::umin(arg1, arg2);
    1795         bitblock256_t matchesArg2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(halfMin, arg2));
    1796         bitblock256_t matchesArg1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(halfMin, arg1));
    1797         return simd256<1>::ifh(simd256<256>::himask(), halfMin, simd256<1>::ifh(matchesArg1, simd256<1>::ifh(matchesArg2, halfMin, arg1), arg2));
    1798 }
    1799 
    1800 //abs on 2-bit fields: at the simd256<2>::himask() positions the result is arg1 AND (NOT arg1 shifted left by one across the whole block), so the high bit survives only when the low bit is clear; the low bit is copied from arg1. The total number of operations is 19.0
    1801 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)
    1802 {
    1803         return simd256<1>::ifh(simd256<2>::himask(), simd_and(arg1, simd256<256>::slli<1>(simd_not(arg1))), arg1);
    1804 }
    1805 
    1806 //abs on 4-bit fields: where simd256<4>::gt(arg1, 0) holds the field is kept; elsewhere the comparison mask (zero there) minus arg1 is used, i.e. the negation. The total number of operations is 39.0
    1807 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1)
    1808 {
    1809         bitblock256_t positive = simd256<4>::gt(arg1, simd256<4>::constant<0>());
    1810         return simd256<1>::ifh(positive, arg1, simd256<4>::sub(positive, arg1));
    1811 }
    1812 
    1813 //abs on 8-bit fields: applies _mm_abs_epi8 to the hi and lo 128-bit halves of arg1 and recombines them with avx_general_combine256. The total number of operations is 4.0
    1814 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1)
    1815 {
    1816         return avx_general_combine256(_mm_abs_epi8(avx_select_hi128(arg1)), _mm_abs_epi8(avx_select_lo128(arg1)));
    1817 }
    1818 
    1819 //abs on 16-bit fields: applies _mm_abs_epi16 to the hi and lo 128-bit halves of arg1 and recombines them with avx_general_combine256. The total number of operations is 4.0
    1820 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1)
    1821 {
    1822         return avx_general_combine256(_mm_abs_epi16(avx_select_hi128(arg1)), _mm_abs_epi16(avx_select_lo128(arg1)));
    1823 }
    1824 
    1825 //abs on 32-bit fields: applies _mm_abs_epi32 to the hi and lo 128-bit halves of arg1 and recombines them with avx_general_combine256. The total number of operations is 4.0
    1826 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1)
    1827 {
    1828         return avx_general_combine256(_mm_abs_epi32(avx_select_hi128(arg1)), _mm_abs_epi32(avx_select_lo128(arg1)));
    1829 }
    1830 
    1831 //abs on 64-bit fields: where simd256<64>::gt(arg1, 0) holds the field is kept; elsewhere the comparison mask (zero there) minus arg1 is used, i.e. the negation. The total number of operations is 13.0
    1832 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1)
    1833 {
    1834         bitblock256_t positive = simd256<64>::gt(arg1, simd256<64>::constant<0>());
    1835         return simd256<1>::ifh(positive, arg1, simd256<64>::sub(positive, arg1));
    1836 }
    1837 
    1838 //abs on 128-bit fields: tests the sign bit directly (128-bit srli<127> compared with the constant 1 gives an all-ones mask for negative fields) and selects 0 - arg1 for those fields, arg1 otherwise. The previous form detected "non-negative" by checking that simd256<64>::abs left the high half unchanged, which is also true when the high 64 bits are INT64_MIN, so such negative values were returned un-negated. The generator's operation count (69.0) applied to the previous form and has not been recomputed
    1839 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1)
    1840 {
    1841         bitblock256_t negMask = simd256<128>::eq(simd256<128>::srli<(127)>(arg1), simd256<128>::constant<1>());
    1842         return simd256<1>::ifh(negMask, simd256<128>::sub(simd256<128>::constant<0>(), arg1), arg1);
    1843 }
    1844 
    1845 //abs on the 256-bit block: tests the sign bit directly (256-bit srli<255> compared with the constant 1 gives an all-ones mask when the value is negative) and selects 0 - arg1 in that case, arg1 otherwise. The previous form detected "non-negative" by checking that simd256<128>::abs left the high half unchanged, which is also true when the high half is the most negative 128-bit value, so such negative values were returned un-negated. The generator's operation count (204.833333333) applied to the previous form and has not been recomputed
    1846 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1)
    1847 {
    1848         bitblock256_t negMask = simd256<256>::eq(simd256<256>::srli<(255)>(arg1), simd256<256>::constant<1>());
    1849         return simd256<1>::ifh(negMask, simd256<256>::sub(simd256<256>::constant<0>(), arg1), arg1);
    1850 }
    1851 
    1852 //eq on 1-bit fields: NOT of the XOR of the operands, so a bit is set exactly where arg1 and arg2 agree. The total number of operations is 2.0
    1853 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1854 {
    1855         return simd_not(simd_xor(arg1, arg2));
    1856 }
    1857 
    1858 //eq on 2-bit fields: per-bit equality, ANDed with itself shifted right by one inside the field (low bit set only when both bits match), then that flag is copied to the high bit via slli<1> and the two are ORed. The total number of operations is 14.0
    1859 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1860 {
    1861         bitblock256_t bitEq = simd256<(1)>::eq(arg1, arg2);
    1862         bitblock256_t lowFlag = simd_and(simd256<2>::srli<(1)>(bitEq), bitEq);
    1863         bitblock256_t highFlag = simd256<2>::slli<(1)>(lowFlag);
    1864         return simd_or(highFlag, lowFlag);
    1865 }
    1866 
    1867 //The total number of operations is 17.0
    1868 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1869 {
    1870         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
    1871 }
    1872 
    1873 //The total number of operations is 5.0
    1874 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1875 {
    1876         return avx_general_combine256(_mm_cmpeq_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1877 }
    1878 
    1879 //The total number of operations is 5.0
    1880 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1881 {
    1882         return avx_general_combine256(_mm_cmpeq_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1883 }
    1884 
    1885 //The total number of operations is 5.0
    1886 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1887 {
    1888         return avx_general_combine256(_mm_cmpeq_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1889 }
    1890 
    1891 //The total number of operations is 5.0
    1892 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1893 {
    1894         return avx_general_combine256(_mm_cmpeq_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1895 }
    1896 
    1897 //The total number of operations is 23.6666666667
    1898 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1899 {
    1900         bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
    1901         bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
    1902         bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
    1903         return simd_or(loMask, hiMask);
    1904 }
    1905 
    1906 //The total number of operations is 54.1666666667
    1907 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1908 {
    1909         bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
    1910         bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
    1911         bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
    1912         return simd_or(loMask, hiMask);
    1913 }
    1914 
    1915 //The total number of operations is 7.0
    1916 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
    1917 {
    1918         return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
    1919 }
    1920 
    1921 //The total number of operations is 17.5
    1922 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
    1923 {
    1924         return simd_or(simd_and(simd256<4>::himask(), simd256<(2)>::srai<((sh < (2)) ? sh : (2))>(arg1)), ((sh <= (2)) ? simd256<4>::srli<sh>(arg1) : simd256<(2)>::srai<(sh-(2))>(simd256<4>::srli<(2)>(arg1))));
    1925 }
    1926 
    1927 //The total number of operations is 12.0
    1928 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
    1929 {
    1930         bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1931         return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1932 }
    1933 
    1934 //The total number of operations is 4.0
    1935 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
    1936 {
    1937         return avx_general_combine256(_mm_srai_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
    1938 }
    1939 
    1940 //The total number of operations is 4.0
    1941 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
    1942 {
    1943         return avx_general_combine256(_mm_srai_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
    1944 }
    1945 
    1946 //The total number of operations is 12.0
    1947 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
    1948 {
    1949         return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));
    1950 }
    1951 
    1952 //The total number of operations is 28.3333333333
    1953 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
    1954 {
    1955         return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));
    1956 }
    1957 
    1958 //The total number of operations is 59.0
    1959 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
    1960 {
    1961         return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));
    1962 }
    1963 
    1964 //The total number of operations is 0
    1965 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask()
    1966 {
    1967         return simd256<2>::constant<(2)>();
    1968 }
    1969 
    1970 //The total number of operations is 0
    1971 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask()
    1972 {
    1973         return simd256<4>::constant<(12)>();
    1974 }
    1975 
    1976 //The total number of operations is 0
    1977 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask()
    1978 {
    1979         return simd256<8>::constant<(240)>();
    1980 }
    1981 
    1982 //The total number of operations is 0
    1983 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask()
    1984 {
    1985         return simd256<16>::constant<(65280)>();
    1986 }
    1987 
    1988 //The total number of operations is 0
    1989 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask()
    1990 {
    1991         return simd256<32>::constant<-65536>();
    1992 }
    1993 
    1994 //The total number of operations is 0
    1995 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask()
    1996 {
    1997         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0))));
    1998 }
    1999 
    2000 //The total number of operations is 0
    2001 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask()
    2002 {
    2003         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0))));
    2004 }
    2005 
    2006 //The total number of operations is 0
    2007 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask()
    2008 {
    2009         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));
    2010 }
    2011 
    2012 //The total number of operations is 1.0
    2013 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::add(bitblock256_t arg1, bitblock256_t arg2)
    2014 {
    2015         return simd_xor(arg1, arg2);
    2016 }
    2017 
    2018 //The total number of operations is 16.0
    2019 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add(bitblock256_t arg1, bitblock256_t arg2)
    2020 {
    2021         bitblock256_t ans = simd256<(1)>::add(arg1, arg2);
    2022         bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_and(simd_xor(arg1, arg2), simd_not(ans)));
    2023         bitblock256_t loMask = simd256<2>::lomask();
    2024         bitblock256_t carry = simd256<2>::slli<1>(simd_and(carryMask, loMask));
    2025         return simd256<1>::ifh(loMask, ans, simd256<(1)>::add(ans, carry));
    2026 }
    2027 
    2028 //The total number of operations is 14.0
    2029 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add(bitblock256_t arg1, bitblock256_t arg2)
    2030 {
    2031         return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::add(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::add(arg1, arg2));
    2032 }
    2033 
    2034 //The total number of operations is 5.0
    2035 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add(bitblock256_t arg1, bitblock256_t arg2)
    2036 {
    2037         return avx_general_combine256(_mm_add_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2038 }
    2039 
    2040 //The total number of operations is 5.0
    2041 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add(bitblock256_t arg1, bitblock256_t arg2)
    2042 {
    2043         return avx_general_combine256(_mm_add_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2044 }
    2045 
    2046 //The total number of operations is 5.0
    2047 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add(bitblock256_t arg1, bitblock256_t arg2)
    2048 {
    2049         return avx_general_combine256(_mm_add_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2050 }
    2051 
    2052 //The total number of operations is 5.0
    2053 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add(bitblock256_t arg1, bitblock256_t arg2)
    2054 {
    2055         return avx_general_combine256(_mm_add_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2056 }
    2057 
    2058 //The total number of operations is 26.3333333333
    2059 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2)
    2060 {
    2061         bitblock256_t partial = simd256<(64)>::add(arg1, arg2);
    2062         bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
    2063         bitblock256_t carry = simd256<128>::slli<(64)>(simd256<(64)>::srli<(63)>(carryMask));
    2064         return simd256<(64)>::add(partial, carry);
    2065 }
    2066 
    2067 //The total number of operations is 75.6666666667
    2068 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2)
    2069 {
    2070         bitblock256_t ans = simd256<(128)>::add(arg1, arg2);
    2071         bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_and(simd_xor(arg1, arg2), simd_not(ans)));
    2072         bitblock256_t loMask = simd256<256>::lomask();
    2073         bitblock256_t carry = simd256<256>::slli<1>(simd_and(carryMask, loMask));
    2074         return simd256<1>::ifh(loMask, ans, simd256<(128)>::add(ans, carry));
    2075 }
    2076 
    20772195//The total number of operations is 1.0
    20782196template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
     
    22962414
    22972415//The total number of operations is 3.0
    2298 template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<8>::signmask(bitblock256_t arg1)
     2416template <> IDISA_ALWAYS_INLINE FieldType<256/8>::T hsimd256<8>::signmask(bitblock256_t arg1)
    22992417{
    23002418        return ((((uint64_t)(_mm_movemask_epi8(((__m128i)(avx_select_hi128(arg1))))))<<16)|((uint64_t)(_mm_movemask_epi8(((__m128i)(avx_select_lo128(arg1)))))));
     
    23022420
    23032421//The total number of operations is 8.0
    2304 template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<16>::signmask(bitblock256_t arg1)
     2422template <> IDISA_ALWAYS_INLINE FieldType<256/16>::T hsimd256<16>::signmask(bitblock256_t arg1)
    23052423{
    23062424        return hsimd256<(8)>::signmask(hsimd256<16>::packss(simd256<16>::constant<0>(), arg1));
     
    23082426
    23092427//The total number of operations is 13.0
    2310 template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<32>::signmask(bitblock256_t arg1)
     2428template <> IDISA_ALWAYS_INLINE FieldType<256/32>::T hsimd256<32>::signmask(bitblock256_t arg1)
    23112429{
    23122430        return hsimd256<(16)>::signmask(hsimd256<32>::packss(simd256<32>::constant<0>(), arg1));
     
    23142432
    23152433//The total number of operations is 104.0
    2316 template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<64>::signmask(bitblock256_t arg1)
     2434template <> IDISA_ALWAYS_INLINE FieldType<256/64>::T hsimd256<64>::signmask(bitblock256_t arg1)
    23172435{
    23182436        return hsimd256<(32)>::signmask(hsimd256<64>::packh(simd256<64>::constant<0>(), arg1));
     
    23202438
    23212439//The total number of operations is 248.666666667
    2322 template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<128>::signmask(bitblock256_t arg1)
     2440template <> IDISA_ALWAYS_INLINE FieldType<256/128>::T hsimd256<128>::signmask(bitblock256_t arg1)
    23232441{
    23242442        return hsimd256<(64)>::signmask(hsimd256<128>::packh(simd256<128>::constant<0>(), arg1));
     
    23262444
    23272445//The total number of operations is 266.166666667
    2328 template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<256>::signmask(bitblock256_t arg1)
     2446template <> IDISA_ALWAYS_INLINE FieldType<256/256>::T hsimd256<256>::signmask(bitblock256_t arg1)
    23292447{
    23302448        return hsimd256<(128)>::signmask(hsimd256<256>::packh(simd256<256>::constant<0>(), arg1));
     
    28492967
    28502968//The total number of operations is 29.5
    2851 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
     2969template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
    28522970{
    28532971        return simd_or(mvmd256<2>::srli<sh>(arg1), mvmd256<2>::slli<((128)-sh)>(arg2));
     
    28552973
    28562974//The total number of operations is 29.5
    2857 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
     2975template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
    28582976{
    28592977        return simd_or(mvmd256<4>::srli<sh>(arg1), mvmd256<4>::slli<((64)-sh)>(arg2));
     
    28612979
    28622980//The total number of operations is 29.5
    2863 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
     2981template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
    28642982{
    28652983        return simd_or(mvmd256<8>::srli<sh>(arg1), mvmd256<8>::slli<((32)-sh)>(arg2));
     
    28672985
    28682986//The total number of operations is 29.5
    2869 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
     2987template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
    28702988{
    28712989        return simd_or(mvmd256<16>::srli<sh>(arg1), mvmd256<16>::slli<((16)-sh)>(arg2));
     
    28732991
    28742992//The total number of operations is 29.5
    2875 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
     2993template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
    28762994{
    28772995        return simd_or(mvmd256<32>::srli<sh>(arg1), mvmd256<32>::slli<((8)-sh)>(arg2));
     
    28792997
    28802998//The total number of operations is 29.5
    2881 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
     2999template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
    28823000{
    28833001        return simd_or(mvmd256<64>::srli<sh>(arg1), mvmd256<64>::slli<((4)-sh)>(arg2));
     
    28853003
    28863004//The total number of operations is 29.5
    2887 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
     3005template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
    28883006{
    28893007        return simd_or(mvmd256<128>::srli<sh>(arg1), mvmd256<128>::slli<((2)-sh)>(arg2));
     
    28913009
    28923010//The total number of operations is 29.5
    2893 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
     3011template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
    28943012{
    28953013        return simd_or(mvmd256<256>::srli<sh>(arg1), mvmd256<256>::slli<((1)-sh)>(arg2));
     
    28973015
    28983016//The total number of operations is 1.0
    2899 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(uint64_t val1)
     3017template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(FieldType<1>::T val1)
    29003018{
    29013019        return mvmd256<32>::fill((-1*val1));
     
    29033021
    29043022//The total number of operations is 1.0
    2905 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill(uint64_t val1)
     3023template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill(FieldType<2>::T val1)
    29063024{
    29073025        return mvmd256<(4)>::fill(((val1<<2)|val1));
     
    29093027
    29103028//The total number of operations is 1.0
    2911 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill(uint64_t val1)
     3029template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill(FieldType<4>::T val1)
    29123030{
    29133031        return mvmd256<(8)>::fill(((val1<<4)|val1));
     
    29153033
    29163034//The total number of operations is 1.0
    2917 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill(uint64_t val1)
     3035template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill(FieldType<8>::T val1)
    29183036{
    29193037        return (bitblock256_t)_mm256_set1_epi8((int32_t)(val1));
     
    29213039
    29223040//The total number of operations is 1.0
    2923 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill(uint64_t val1)
     3041template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill(FieldType<16>::T val1)
    29243042{
    29253043        return (bitblock256_t)_mm256_set1_epi16((int32_t)(val1));
     
    29273045
    29283046//The total number of operations is 1.0
    2929 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill(uint64_t val1)
     3047template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill(FieldType<32>::T val1)
    29303048{
    29313049        return (bitblock256_t)_mm256_set1_epi32((int32_t)(val1));
     
    29333051
    29343052//The total number of operations is 5.0
    2935 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill(uint64_t val1)
     3053template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill(FieldType<64>::T val1)
    29363054{
    29373055        return mvmd256<(32)>::fill2((val1>>(32)), (val1&((4294967296ULL)-1)));
     
    29393057
    29403058//The total number of operations is 13.0
    2941 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill(uint64_t val1)
     3059template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill(FieldType<128>::T val1)
    29423060{
    29433061        return mvmd256<(64)>::fill2(0, val1);
     
    29453063
    29463064//The total number of operations is 29.0
    2947 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::fill(uint64_t val1)
     3065template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::fill(FieldType<256>::T val1)
    29483066{
    29493067        return mvmd256<(128)>::fill2(0, val1);
     
    29513069
    29523070//The total number of operations is 1.5
    2953 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<1>::extract(bitblock256_t arg1)
     3071template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1)
    29543072{
    29553073        return (((pos%2) == 0) ? (mvmd256<(2)>::extract<(pos/2)>(arg1)&(1)) : (mvmd256<(2)>::extract<(pos/2)>(arg1)>>1));
     
    29573075
    29583076//The total number of operations is 1.5
    2959 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<2>::extract(bitblock256_t arg1)
     3077template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1)
    29603078{
    29613079        return (((pos%2) == 0) ? (mvmd256<(4)>::extract<(pos/2)>(arg1)&(3)) : (mvmd256<(4)>::extract<(pos/2)>(arg1)>>2));
     
    29633081
    29643082//The total number of operations is 1.5
    2965 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<4>::extract(bitblock256_t arg1)
     3083template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1)
    29663084{
    29673085        return (((pos%2) == 0) ? (mvmd256<(8)>::extract<(pos/2)>(arg1)&(15)) : (mvmd256<(8)>::extract<(pos/2)>(arg1)>>4));
     
    29693087
    29703088//The total number of operations is 1.5
    2971 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<8>::extract(bitblock256_t arg1)
     3089template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1)
    29723090{
    29733091        return (((pos%2) == 0) ? (mvmd256<(16)>::extract<(pos/2)>(arg1)&(255)) : (mvmd256<(16)>::extract<(pos/2)>(arg1)>>8));
     
    29753093
    29763094//The total number of operations is 1.5
    2977 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<16>::extract(bitblock256_t arg1)
     3095template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1)
    29783096{
    29793097        return ((pos < 8) ? (65535&_mm_extract_epi16(avx_select_lo128(arg1), (int32_t)(pos))) : (65535&_mm_extract_epi16(avx_select_hi128(arg1), (int32_t)((pos-8)))));
     
    29813099
    29823100//The total number of operations is 1.5
    2983 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<32>::extract(bitblock256_t arg1)
     3101template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1)
    29843102{
    29853103        return ((pos < 4) ? (((uint64_t)(((4294967296ULL)-1)))&_mm_extract_epi32(avx_select_lo128(arg1), (int32_t)(pos))) : (((uint64_t)(((4294967296ULL)-1)))&_mm_extract_epi32(avx_select_hi128(arg1), (int32_t)((pos-4)))));
     
    29873105
    29883106//The total number of operations is 3.0
    2989 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<64>::extract(bitblock256_t arg1)
    2990 {
    2991         return ((mvmd256<(32)>::extract<((2*pos)+1)>(arg1)<<(32))|mvmd256<(32)>::extract<(2*pos)>(arg1));
     3107template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1)
     3108{
     3109        return ((((uint64_t)(mvmd256<(32)>::extract<((2*pos)+1)>(arg1)))<<(32))|mvmd256<(32)>::extract<(2*pos)>(arg1));
    29923110}
    29933111
    29943112//The total number of operations is 23.5
    2995 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1)
    2996 {
    2997         bitblock256_t tmpArg = (((pos%2) == 0) ? simd256<(2)>::slli<1>(arg1) : simd256<(2)>::srli<1>(arg1));
    2998         bitblock256_t arg11 = (((pos%2) == 0) ? simd_and(simd256<(2)>::lomask(), arg1) : simd_and(simd256<(2)>::himask(), arg1));
    2999         return mvmd256<(2)>::splat<(pos/2)>(simd_or(tmpArg, arg11));
     3113template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1)
     3114{
     3115        return mvmd256<(2)>::splat<(pos/2)>(simd_or((((pos%2) == 0) ? simd256<(2)>::slli<1>(arg1) : simd256<(2)>::srli<1>(arg1)), (((pos%2) == 0) ? simd_and(simd256<(2)>::lomask(), arg1) : simd_and(simd256<(2)>::himask(), arg1))));
    30003116}
    30013117
    30023118//The total number of operations is 16.5
    3003 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1)
    3004 {
    3005         bitblock256_t tmpArg = (((pos%2) == 0) ? simd256<(4)>::slli<2>(arg1) : simd256<(4)>::srli<2>(arg1));
    3006         bitblock256_t arg11 = (((pos%2) == 0) ? simd_and(simd256<(4)>::lomask(), arg1) : simd_and(simd256<(4)>::himask(), arg1));
    3007         return mvmd256<(4)>::splat<(pos/2)>(simd_or(tmpArg, arg11));
     3119template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1)
     3120{
     3121        return mvmd256<(4)>::splat<(pos/2)>(simd_or((((pos%2) == 0) ? simd256<(4)>::slli<2>(arg1) : simd256<(4)>::srli<2>(arg1)), (((pos%2) == 0) ? simd_and(simd256<(4)>::lomask(), arg1) : simd_and(simd256<(4)>::himask(), arg1))));
    30083122}
    30093123
    30103124//The total number of operations is 9.5
    3011 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::splat(bitblock256_t arg1)
    3012 {
    3013         bitblock256_t tmpArg = (((pos%2) == 0) ? simd256<(8)>::slli<4>(arg1) : simd256<(8)>::srli<4>(arg1));
    3014         bitblock256_t arg11 = (((pos%2) == 0) ? simd_and(simd256<(8)>::lomask(), arg1) : simd_and(simd256<(8)>::himask(), arg1));
    3015         return mvmd256<(8)>::splat<(pos/2)>(simd_or(tmpArg, arg11));
     3125template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::splat(bitblock256_t arg1)
     3126{
     3127        return mvmd256<(8)>::splat<(pos/2)>(simd_or((((pos%2) == 0) ? simd256<(8)>::slli<4>(arg1) : simd256<(8)>::srli<4>(arg1)), (((pos%2) == 0) ? simd_and(simd256<(8)>::lomask(), arg1) : simd_and(simd256<(8)>::himask(), arg1))));
    30163128}
    30173129
    30183130//The total number of operations is 2.5
    3019 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::splat(bitblock256_t arg1)
     3131template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::splat(bitblock256_t arg1)
    30203132{
    30213133        return ((pos < 16) ? mvmd256<8>::fill(_mm_extract_epi8(avx_select_lo128(arg1), (int32_t)(pos))) : mvmd256<8>::fill(_mm_extract_epi8(avx_select_hi128(arg1), (int32_t)((pos-16)))));
     
    30233135
    30243136//The total number of operations is 2.5
    3025 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::splat(bitblock256_t arg1)
     3137template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::splat(bitblock256_t arg1)
    30263138{
    30273139        return ((pos < 8) ? mvmd256<16>::fill(_mm_extract_epi16(avx_select_lo128(arg1), (int32_t)(pos))) : mvmd256<16>::fill(_mm_extract_epi16(avx_select_hi128(arg1), (int32_t)((pos-8)))));
     
    30293141
    30303142//The total number of operations is 2.5
    3031 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::splat(bitblock256_t arg1)
     3143template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::splat(bitblock256_t arg1)
    30323144{
    30333145        return ((pos < 4) ? mvmd256<32>::fill(_mm_extract_epi32(avx_select_lo128(arg1), (int32_t)(pos))) : mvmd256<32>::fill(_mm_extract_epi32(avx_select_hi128(arg1), (int32_t)((pos-4)))));
     
    30353147
    30363148//The total number of operations is 8.0
    3037 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::splat(bitblock256_t arg1)
     3149template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::splat(bitblock256_t arg1)
    30383150{
    30393151        return simd256<1>::ifh(simd256<64>::himask(), mvmd256<(32)>::splat<((2*pos)+1)>(arg1), mvmd256<(32)>::splat<(2*pos)>(arg1));
     
    30413153
    30423154//The total number of operations is 19.0
    3043 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1)
     3155template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1)
    30443156{
    30453157        return simd256<1>::ifh(simd256<128>::himask(), mvmd256<(64)>::splat<((2*pos)+1)>(arg1), mvmd256<(64)>::splat<(2*pos)>(arg1));
     
    30473159
    30483160//The total number of operations is 41.0
    3049 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1)
     3161template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1)
    30503162{
    30513163        return simd256<1>::ifh(simd256<256>::himask(), mvmd256<(128)>::splat<((2*pos)+1)>(arg1), mvmd256<(128)>::splat<(2*pos)>(arg1));
     
    30533165
    30543166//The total number of operations is 15.0
    3055 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
     3167template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16)
    30563168{
    30573169        return simd_or(mvmd256<(2)>::fill16((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd256<(2)>::fill16((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
     
    30593171
    30603172//The total number of operations is 7.0
    3061 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
     3173template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16)
    30623174{
    30633175        return simd_or(mvmd256<(4)>::fill16((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd256<(4)>::fill16((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
     
    30653177
    30663178//The total number of operations is 3.0
    3067 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
     3179template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16)
    30683180{
    30693181        return simd_or(mvmd256<(8)>::fill16((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd256<(8)>::fill16((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
     
    30713183
    30723184//The total number of operations is 1.0
    3073 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
     3185template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16)
    30743186{
    30753187        return (bitblock256_t)_mm256_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
     
    30773189
    30783190//The total number of operations is 5.0
    3079 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
     3191template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16)
    30803192{
    30813193        return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<16>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd256<16>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
     
    30833195
    30843196//The total number of operations is 5.0
    3085 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3197template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4)
    30863198{
    30873199        return simd256<1>::ifh(simd256<(4)>::himask(), mvmd256<1>::fill2(val1, val2), mvmd256<1>::fill2(val3, val4));
     
    30893201
    30903202//The total number of operations is 5.0
    3091 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3203template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4)
    30923204{
    30933205        return simd256<1>::ifh(simd256<(8)>::himask(), mvmd256<2>::fill2(val1, val2), mvmd256<2>::fill2(val3, val4));
     
    30953207
    30963208//The total number of operations is 5.0
    3097 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3209template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill4(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4)
    30983210{
    30993211        return simd256<1>::ifh(simd256<(16)>::himask(), mvmd256<4>::fill2(val1, val2), mvmd256<4>::fill2(val3, val4));
     
    31013213
    31023214//The total number of operations is 5.0
    3103 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3215template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill4(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4)
    31043216{
    31053217        return simd256<1>::ifh(simd256<(32)>::himask(), mvmd256<8>::fill2(val1, val2), mvmd256<8>::fill2(val3, val4));
     
    31073219
    31083220//The total number of operations is 3.0
    3109 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3221template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill4(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4)
    31103222{
    31113223        return simd_or(mvmd256<(32)>::fill4((val1<<16), (val3<<16), (val1<<16), (val3<<16)), mvmd256<(32)>::fill4((val2&(65535)), (val4&(65535)), (val2&(65535)), (val4&(65535))));
     
    31133225
    31143226//The total number of operations is 1.0
    3115 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3227template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill4(FieldType<32>::T val1, FieldType<32>::T val2, FieldType<32>::T val3, FieldType<32>::T val4)
    31163228{
    31173229        return (bitblock256_t)_mm256_set_epi32((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4));
     
    31193231
    31203232//The total number of operations is 29.0
    3121 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3233template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill4(FieldType<64>::T val1, FieldType<64>::T val2, FieldType<64>::T val3, FieldType<64>::T val4)
    31223234{
    31233235        return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<64>::fill2(val1, val2), mvmd256<64>::fill2(val3, val4));
     
    31253237
    31263238//The total number of operations is 14.5
    3127 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::srli(bitblock256_t arg1)
     3239template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::srli(bitblock256_t arg1)
    31283240{
    31293241        return simd256<256>::srli<(sh*2)>(arg1);
     
    31313243
    31323244//The total number of operations is 14.5
    3133 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::srli(bitblock256_t arg1)
     3245template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::srli(bitblock256_t arg1)
    31343246{
    31353247        return simd256<256>::srli<(sh*4)>(arg1);
     
    31373249
    31383250//The total number of operations is 14.5
    3139 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::srli(bitblock256_t arg1)
     3251template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::srli(bitblock256_t arg1)
    31403252{
    31413253        return simd256<256>::srli<(sh*8)>(arg1);
     
    31433255
    31443256//The total number of operations is 14.5
    3145 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::srli(bitblock256_t arg1)
     3257template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::srli(bitblock256_t arg1)
    31463258{
    31473259        return simd256<256>::srli<(sh*16)>(arg1);
     
    31493261
    31503262//The total number of operations is 14.5
    3151 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::srli(bitblock256_t arg1)
     3263template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::srli(bitblock256_t arg1)
    31523264{
    31533265        return simd256<256>::srli<(sh*32)>(arg1);
     
    31553267
    31563268//The total number of operations is 14.5
    3157 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1)
     3269template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1)
    31583270{
    31593271        return simd256<256>::srli<(sh*64)>(arg1);
     
    31613273
    31623274//The total number of operations is 14.5
    3163 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1)
     3275template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1)
    31643276{
    31653277        return simd256<256>::srli<(sh*128)>(arg1);
     
    31673279
    31683280//The total number of operations is 14.5
    3169 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1)
     3281template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1)
    31703282{
    31713283        return simd256<256>::srli<(sh*256)>(arg1);
     
    31733285
    31743286//The total number of operations is 1.0
    3175 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill2(uint64_t val1, uint64_t val2)
     3287template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill2(FieldType<1>::T val1, FieldType<1>::T val2)
    31763288{
    31773289        return mvmd256<(2)>::fill(((val1<<1)|(val2&(1))));
     
    31793291
    31803292//The total number of operations is 1.0
    3181 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill2(uint64_t val1, uint64_t val2)
     3293template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill2(FieldType<2>::T val1, FieldType<2>::T val2)
    31823294{
    31833295        return mvmd256<(4)>::fill(((val1<<2)|(val2&(3))));
     
    31853297
    31863298//The total number of operations is 1.0
    3187 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill2(uint64_t val1, uint64_t val2)
     3299template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill2(FieldType<4>::T val1, FieldType<4>::T val2)
    31883300{
    31893301        return mvmd256<(8)>::fill(((val1<<4)|(val2&(15))));
     
    31913303
    31923304//The total number of operations is 1.0
    3193 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill2(uint64_t val1, uint64_t val2)
     3305template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill2(FieldType<8>::T val1, FieldType<8>::T val2)
    31943306{
    31953307        return mvmd256<(16)>::fill(((val1<<8)|(val2&(255))));
     
    31973309
    31983310//The total number of operations is 1.0
    3199 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill2(uint64_t val1, uint64_t val2)
     3311template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill2(FieldType<16>::T val1, FieldType<16>::T val2)
    32003312{
    32013313        return mvmd256<(32)>::fill(((val1<<16)|(val2&(65535))));
     
    32033315
    32043316//The total number of operations is 5.0
    3205 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill2(uint64_t val1, uint64_t val2)
     3317template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill2(FieldType<32>::T val1, FieldType<32>::T val2)
    32063318{
    32073319        return simd256<1>::ifh(simd256<(64)>::himask(), mvmd256<32>::fill(val1), mvmd256<32>::fill(val2));
     
    32093321
    32103322//The total number of operations is 13.0
    3211 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill2(uint64_t val1, uint64_t val2)
     3323template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill2(FieldType<64>::T val1, FieldType<64>::T val2)
    32123324{
    32133325        return simd256<1>::ifh(simd256<(128)>::himask(), mvmd256<64>::fill(val1), mvmd256<64>::fill(val2));
     
    32153327
    32163328//The total number of operations is 29.0
    3217 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill2(uint64_t val1, uint64_t val2)
     3329template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill2(FieldType<128>::T val1, FieldType<128>::T val2)
    32183330{
    32193331        return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<128>::fill(val1), mvmd256<128>::fill(val2));
     
    32213333
    32223334//The total number of operations is 29.5
    3223 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3335template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    32243336{
    32253337        return simd_or(mvmd256<2>::slli<sh>(arg1), mvmd256<2>::srli<((128)-sh)>(arg2));
     
    32273339
    32283340//The total number of operations is 29.5
    3229 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3341template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    32303342{
    32313343        return simd_or(mvmd256<4>::slli<sh>(arg1), mvmd256<4>::srli<((64)-sh)>(arg2));
     
    32333345
    32343346//The total number of operations is 29.5
    3235 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3347template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    32363348{
    32373349        return simd_or(mvmd256<8>::slli<sh>(arg1), mvmd256<8>::srli<((32)-sh)>(arg2));
     
    32393351
    32403352//The total number of operations is 29.5
    3241 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3353template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    32423354{
    32433355        return simd_or(mvmd256<16>::slli<sh>(arg1), mvmd256<16>::srli<((16)-sh)>(arg2));
     
    32453357
    32463358//The total number of operations is 29.5
    3247 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3359template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    32483360{
    32493361        return simd_or(mvmd256<32>::slli<sh>(arg1), mvmd256<32>::srli<((8)-sh)>(arg2));
     
    32513363
    32523364//The total number of operations is 29.5
    3253 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3365template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    32543366{
    32553367        return simd_or(mvmd256<64>::slli<sh>(arg1), mvmd256<64>::srli<((4)-sh)>(arg2));
     
    32573369
    32583370//The total number of operations is 29.5
    3259 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3371template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    32603372{
    32613373        return simd_or(mvmd256<128>::slli<sh>(arg1), mvmd256<128>::srli<((2)-sh)>(arg2));
     
    32633375
    32643376//The total number of operations is 29.5
    3265 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3377template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    32663378{
    32673379        return simd_or(mvmd256<256>::slli<sh>(arg1), mvmd256<256>::srli<((1)-sh)>(arg2));
     
    32693381
    32703382//The total number of operations is 14.0
    3271 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1)
     3383template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1)
    32723384{
    32733385        return simd256<256>::slli<(sh*2)>(arg1);
     
    32753387
    32763388//The total number of operations is 14.0
    3277 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::slli(bitblock256_t arg1)
     3389template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::slli(bitblock256_t arg1)
    32783390{
    32793391        return simd256<256>::slli<(sh*4)>(arg1);
     
    32813393
    32823394//The total number of operations is 14.0
    3283 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::slli(bitblock256_t arg1)
     3395template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::slli(bitblock256_t arg1)
    32843396{
    32853397        return simd256<256>::slli<(sh*8)>(arg1);
     
    32873399
    32883400//The total number of operations is 14.0
    3289 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::slli(bitblock256_t arg1)
    3290 {
    3291         return mvmd256<(8)>::slli<(sh*2)>(arg1);
     3401template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::slli(bitblock256_t arg1)
     3402{
     3403        return simd256<256>::slli<(sh*16)>(arg1);
    32923404}
    32933405
    32943406//The total number of operations is 14.0
    3295 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::slli(bitblock256_t arg1)
     3407template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::slli(bitblock256_t arg1)
    32963408{
    32973409        return simd256<256>::slli<(sh*32)>(arg1);
     
    32993411
    33003412//The total number of operations is 14.0
    3301 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1)
     3413template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1)
    33023414{
    33033415        return simd256<256>::slli<(sh*64)>(arg1);
     
    33053417
    33063418//The total number of operations is 14.0
    3307 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1)
    3308 {
    3309         return mvmd256<(64)>::slli<(sh*2)>(arg1);
     3419template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1)
     3420{
     3421        return simd256<256>::slli<(sh*128)>(arg1);
    33103422}
    33113423
    33123424//The total number of operations is 14.0
    3313 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1)
     3425template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1)
    33143426{
    33153427        return simd256<256>::slli<(sh*256)>(arg1);
     
    33173429
    33183430//The total number of operations is 13.0
    3319 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     3431template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill8(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8)
    33203432{
    33213433        return simd256<1>::ifh(simd256<(8)>::himask(), mvmd256<1>::fill4(val1, val2, val3, val4), mvmd256<1>::fill4(val5, val6, val7, val8));
     
    33233435
    33243436//The total number of operations is 13.0
    3325 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     3437template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill8(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8)
    33263438{
    33273439        return simd256<1>::ifh(simd256<(16)>::himask(), mvmd256<2>::fill4(val1, val2, val3, val4), mvmd256<2>::fill4(val5, val6, val7, val8));
     
    33293441
    33303442//The total number of operations is 7.0
    3331 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     3443template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill8(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8)
    33323444{
    33333445        return simd_or(mvmd256<(8)>::fill8((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4)), mvmd256<(8)>::fill8((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15))));
     
    33353447
    33363448//The total number of operations is 3.0
    3337 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     3449template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill8(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8)
    33383450{
    33393451        return simd_or(mvmd256<(16)>::fill8((val1<<8), (val3<<8), (val5<<8), (val7<<8), (val1<<8), (val3<<8), (val5<<8), (val7<<8)), mvmd256<(16)>::fill8((val2&(255)), (val4&(255)), (val6&(255)), (val8&(255)), (val2&(255)), (val4&(255)), (val6&(255)), (val8&(255))));
     
    33413453
    33423454//The total number of operations is 1.0
    3343 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     3455template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill8(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8)
    33443456{
    33453457        return (bitblock256_t)_mm256_set_epi16((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8));
     
    33473459
    33483460//The total number of operations is 5.0
    3349 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     3461template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill8(FieldType<32>::T val1, FieldType<32>::T val2, FieldType<32>::T val3, FieldType<32>::T val4, FieldType<32>::T val5, FieldType<32>::T val6, FieldType<32>::T val7, FieldType<32>::T val8)
    33503462{
    33513463        return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<32>::fill4(val1, val2, val3, val4), mvmd256<32>::fill4(val5, val6, val7, val8));
     
    33593471
    33603472//The total number of operations is 14.5
    3361 template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::srli(bitblock256_t arg1)
     3473template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::srli(bitblock256_t arg1)
    33623474{
    33633475        return simd256<256>::srli<sh>(arg1);
     
    33713483
    33723484//The total number of operations is 118.5
    3373 IDISA_ALWAYS_INLINE uint64_t bitblock256::popcount(bitblock256_t arg1)
     3485IDISA_ALWAYS_INLINE uint16_t bitblock256::popcount(bitblock256_t arg1)
    33743486{
    33753487        return mvmd256<64>::extract<0>(simd256<256>::popcount(arg1));
     
    33833495
    33843496//The total number of operations is 14.0
    3385 template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::slli(bitblock256_t arg1)
     3497template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::slli(bitblock256_t arg1)
    33863498{
    33873499        return simd256<256>::slli<sh>(arg1);
  • trunk/lib/idisa_cpp/idisa_sse2.cpp

    r3063 r3441  
    1515typedef __m128i bitblock128_t;
    1616
     17#ifndef FIELD_TYPE
     18#define FIELD_TYPE
    1719template <uint32_t fw> struct FieldType {
    1820   typedef int T;  //default for FieldType::T is int
     
    2729template <> struct FieldType<64> {typedef uint64_t T;};
    2830template <> struct FieldType<128> {typedef uint64_t T;};
     31#endif
    2932
    3033typedef FieldType<1>::T fw1_t;
Note: See TracChangeset for help on using the changeset viewer.