Changeset 3441
- Timestamp:
- Sep 7, 2013, 3:05:51 PM (5 years ago)
- Location:
- trunk/lib
- Files:
-
- 1 added
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/lib/bitblock256.hpp
r3439 r3441 18 18 union ubitblock { 19 19 bitblock256_t _256; 20 bitblock 256_t _128[sizeof(bitblock256_t)/sizeof(bitblock256_t)];20 bitblock128_t _128[sizeof(bitblock256_t)/sizeof(bitblock256_t)]; 21 21 uint64_t _64[sizeof(bitblock256_t)/sizeof(uint64_t)]; 22 22 uint32_t _32[sizeof(bitblock256_t)/sizeof(uint32_t)]; … … 28 28 typedef bitblock256_t carry_t; 29 29 30 static IDISA_ALWAYS_INLINE void add_ci_co(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum); 31 static IDISA_ALWAYS_INLINE void sub_bi_bo(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference); 32 static IDISA_ALWAYS_INLINE void adv_ci_co(bitblock256_t cursor, bitblock256_t carry_in, bitblock256_t & carry_out, bitblock256_t & rslt); 33 34 35 36 30 37 static IDISA_ALWAYS_INLINE bitblock256_t carry2bitblock(carry_t carry); 31 38 static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry); … … 46 53 static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry) { return carry;} 47 54 48 static inline void add_ci_co(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum);49 static inline void sub_bi_bo(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference);50 static IDISA_ALWAYS_INLINE void adv_ci_co(bitblock256_t cursor, bitblock256_t carry_in, bitblock256_t & carry_out, bitblock256_t & rslt);51 52 53 55 static inline void add_ci_co(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum) { 54 bitblock256_t all_ones = simd256<1>::constant<1>();55 bitblock256_t gen = simd_and(x, y);56 bitblock256_t prop = simd_xor(x, y);57 bitblock256_t partial_sum = simd256<64>::add(x, y);58 bitblock256_t carry = simd_or(gen, simd_andc(prop, partial_sum));59 bitblock256_t bubble = simd256<64>::eq(partial_sum, all_ones);60 uint64_t carry_mask = hsimd256<64>::signmask(carry) * 2 + convert(carry_in);61 uint64_t bubble_mask = hsimd256<64>::signmask(bubble);62 uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) &~ bubble_mask;63 uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask);64 carry_out = convert(increments >> 4);65 uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001;66 sum = simd256<64>::add(partial_sum, _mm256_cvtepu8_epi64(avx_select_lo128(convert(spread))));56 bitblock256_t all_ones = simd256<1>::constant<1>(); 57 bitblock256_t gen = simd_and(x, y); 58 bitblock256_t prop = simd_xor(x, y); 59 bitblock256_t partial_sum = simd256<64>::add(x, y); 60 bitblock256_t carry = simd_or(gen, simd_andc(prop, partial_sum)); 61 bitblock256_t bubble = simd256<64>::eq(partial_sum, all_ones); 62 uint64_t carry_mask = hsimd256<64>::signmask(carry) * 2 + convert(carry_in); 63 uint64_t bubble_mask = hsimd256<64>::signmask(bubble); 64 uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) &~ bubble_mask; 65 uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask); 66 carry_out = convert(increments >> 4); 67 uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001; 68 sum = simd256<64>::add(partial_sum, _mm256_cvtepu8_epi64(avx_select_lo128(convert(spread)))); 67 69 } 68 70 … … 74 76 difference = simd256<128>::sub(partial, b1); 75 77 borrow_out = simd_or(gen, simd_and(prop, difference)); 76 77 78 } 78 79 … … 83 84 rslt = simd_or(simd256<64>::add(cursor, cursor), low_bits); 84 85 } 85 86 86 87 87 … … 251 251 } 252 252 253 IDISA_ALWAYS_INLINE uint64_t convert 253 IDISA_ALWAYS_INLINE uint64_t convert(bitblock256_t v) 254 254 { 255 255 return (uint64_t) mvmd256<64>::extract<0>(v); -
trunk/lib/idisa256.hpp
r2275 r3441 9 9 #define IDISA256_HPP 10 10 11 #if defined USE_AVX 11 12 #include "idisa_cpp/idisa_avx.cpp" 13 #else 14 #include "idisa_cpp/idisa_avx2.cpp" 12 15 #endif 16 17 #endif -
trunk/lib/idisa_cpp/idisa_avx.cpp
r2275 r3441 16 16 17 17 typedef __m256 bitblock256_t; 18 18 19 template <uint32_t fw> 19 20 class simd256 … … 25 26 static IDISA_ALWAYS_INLINE bitblock256_t umult(bitblock256_t arg1, bitblock256_t arg2); 26 27 static IDISA_ALWAYS_INLINE bitblock256_t ult(bitblock256_t arg1, bitblock256_t arg2); 27 template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1); 28 static IDISA_ALWAYS_INLINE bitblock256_t all(bitblock256_t arg1); 29 template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1); 28 30 static IDISA_ALWAYS_INLINE bitblock256_t ctz(bitblock256_t arg1); 29 31 static IDISA_ALWAYS_INLINE bitblock256_t eq(bitblock256_t arg1, bitblock256_t arg2); … … 31 33 static IDISA_ALWAYS_INLINE bitblock256_t neg(bitblock256_t arg1); 32 34 static IDISA_ALWAYS_INLINE bitblock256_t himask(); 33 template <uint 64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);35 template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1); 34 36 static IDISA_ALWAYS_INLINE bitblock256_t ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3); 35 37 static IDISA_ALWAYS_INLINE bitblock256_t sub(bitblock256_t arg1, bitblock256_t arg2); … … 37 39 static IDISA_ALWAYS_INLINE bitblock256_t lomask(); 38 40 static IDISA_ALWAYS_INLINE bitblock256_t umin(bitblock256_t arg1, bitblock256_t arg2); 39 template < uint64_tval> static IDISA_ALWAYS_INLINE bitblock256_t constant();41 template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock256_t constant(); 40 42 static IDISA_ALWAYS_INLINE bitblock256_t min(bitblock256_t arg1, bitblock256_t arg2); 41 43 static IDISA_ALWAYS_INLINE bitblock256_t umax(bitblock256_t arg1, bitblock256_t arg2); 42 44 static IDISA_ALWAYS_INLINE bitblock256_t abs(bitblock256_t arg1); 43 45 static IDISA_ALWAYS_INLINE bitblock256_t xor_hl(bitblock256_t arg1); 44 template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srai(bitblock256_t arg1); 46 static IDISA_ALWAYS_INLINE bitblock256_t any(bitblock256_t arg1); 47 template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srai(bitblock256_t arg1); 45 48 static IDISA_ALWAYS_INLINE bitblock256_t lt(bitblock256_t arg1, bitblock256_t arg2); 46 49 static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2); … … 56 59 static IDISA_ALWAYS_INLINE bitblock256_t packss(bitblock256_t arg1, bitblock256_t arg2); 57 60 static IDISA_ALWAYS_INLINE bitblock256_t packh(bitblock256_t arg1, bitblock256_t arg2); 58 static IDISA_ALWAYS_INLINE uint64_tsignmask(bitblock256_t arg1);61 static IDISA_ALWAYS_INLINE typename FieldType<256/fw>::T signmask(bitblock256_t arg1); 59 62 static IDISA_ALWAYS_INLINE bitblock256_t packl(bitblock256_t arg1, bitblock256_t arg2); 60 63 static IDISA_ALWAYS_INLINE bitblock256_t min_hl(bitblock256_t arg1, bitblock256_t arg2); … … 78 81 { 79 82 public: 80 template <uint 64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dsrli(bitblock256_t arg1, bitblock256_t arg2);81 static IDISA_ALWAYS_INLINE bitblock256_t fill( uint64_tval1);82 template <uint 64_t pos> static IDISA_ALWAYS_INLINE uint64_textract(bitblock256_t arg1);83 template <uint 64_t pos> static IDISA_ALWAYS_INLINE bitblock256_t splat(bitblock256_t arg1);84 template <uint 64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);85 static IDISA_ALWAYS_INLINE bitblock256_t fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4);86 template <uint 64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);87 static IDISA_ALWAYS_INLINE bitblock256_t fill2( uint64_t val1, uint64_tval2);88 template <uint 64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dslli(bitblock256_t arg1, bitblock256_t arg2);89 static IDISA_ALWAYS_INLINE bitblock256_t fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8);90 static IDISA_ALWAYS_INLINE bitblock256_t fill16( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_tval16);83 template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dsrli(bitblock256_t arg1, bitblock256_t arg2); 84 static IDISA_ALWAYS_INLINE bitblock256_t fill(typename FieldType<fw>::T val1); 85 template <uint8_t pos> static IDISA_ALWAYS_INLINE typename FieldType<fw>::T extract(bitblock256_t arg1); 86 template <uint16_t pos> static IDISA_ALWAYS_INLINE bitblock256_t splat(bitblock256_t arg1); 87 template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1); 88 static IDISA_ALWAYS_INLINE bitblock256_t fill4(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4); 89 template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1); 90 static IDISA_ALWAYS_INLINE bitblock256_t fill2(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2); 91 template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dslli(bitblock256_t arg1, bitblock256_t arg2); 92 static IDISA_ALWAYS_INLINE bitblock256_t fill8(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4, typename FieldType<fw>::T val5, typename FieldType<fw>::T val6, typename FieldType<fw>::T val7, typename FieldType<fw>::T val8); 93 static IDISA_ALWAYS_INLINE bitblock256_t fill16(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4, typename FieldType<fw>::T val5, typename FieldType<fw>::T val6, typename FieldType<fw>::T val7, typename FieldType<fw>::T val8, typename FieldType<fw>::T val9, typename FieldType<fw>::T val10, typename FieldType<fw>::T val11, typename FieldType<fw>::T val12, typename FieldType<fw>::T val13, typename FieldType<fw>::T val14, typename FieldType<fw>::T val15, typename FieldType<fw>::T val16); 91 94 }; 92 95 … … 95 98 public: 96 99 static IDISA_ALWAYS_INLINE bitblock256_t load_unaligned(const bitblock256_t* arg1); 97 template <uint 64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);100 template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1); 98 101 static IDISA_ALWAYS_INLINE void store_aligned(bitblock256_t arg1, bitblock256_t* arg2); 99 102 static IDISA_ALWAYS_INLINE bool all(bitblock256_t arg1); 100 103 static IDISA_ALWAYS_INLINE bool any(bitblock256_t arg1); 101 static IDISA_ALWAYS_INLINE uint 64_t popcount(bitblock256_t arg1);102 template <uint 64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);104 static IDISA_ALWAYS_INLINE uint16_t popcount(bitblock256_t arg1); 105 template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1); 103 106 static IDISA_ALWAYS_INLINE bitblock256_t load_aligned(const bitblock256_t* arg1); 104 107 static IDISA_ALWAYS_INLINE void store_unaligned(bitblock256_t arg1, bitblock256_t* arg2); … … 110 113 IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2); 111 114 IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2); 115 IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2); 112 116 IDISA_ALWAYS_INLINE bitblock256_t simd_xor(bitblock256_t arg1, bitblock256_t arg2); 113 IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2);114 117 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::max(bitblock256_t arg1, bitblock256_t arg2); 115 118 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::max(bitblock256_t arg1, bitblock256_t arg2); … … 156 159 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ult(bitblock256_t arg1, bitblock256_t arg2); 157 160 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ult(bitblock256_t arg1, bitblock256_t arg2); 158 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2); 159 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2); 160 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2); 161 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2); 162 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2); 163 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2); 164 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2); 165 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2); 166 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2); 167 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srli(bitblock256_t arg1); 168 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srli(bitblock256_t arg1); 169 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srli(bitblock256_t arg1); 170 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srli(bitblock256_t arg1); 171 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srli(bitblock256_t arg1); 172 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srli(bitblock256_t arg1); 173 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srli(bitblock256_t arg1); 174 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srli(bitblock256_t arg1); 161 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::all(bitblock256_t arg1); 162 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::all(bitblock256_t arg1); 163 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::all(bitblock256_t arg1); 164 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::all(bitblock256_t arg1); 165 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::all(bitblock256_t arg1); 166 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::all(bitblock256_t arg1); 167 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::all(bitblock256_t arg1); 168 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::all(bitblock256_t arg1); 169 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srli(bitblock256_t arg1); 170 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srli(bitblock256_t arg1); 171 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srli(bitblock256_t arg1); 172 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srli(bitblock256_t arg1); 173 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srli(bitblock256_t arg1); 174 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srli(bitblock256_t arg1); 175 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srli(bitblock256_t arg1); 176 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srli(bitblock256_t arg1); 175 177 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ctz(bitblock256_t arg1); 176 178 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ctz(bitblock256_t arg1); … … 208 210 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1); 209 211 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1); 212 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::any(bitblock256_t arg1); 213 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::any(bitblock256_t arg1); 214 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::any(bitblock256_t arg1); 215 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::any(bitblock256_t arg1); 216 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::any(bitblock256_t arg1); 217 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::any(bitblock256_t arg1); 218 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::any(bitblock256_t arg1); 219 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::any(bitblock256_t arg1); 210 220 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1); 211 221 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1); … … 216 226 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1); 217 227 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1); 218 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1);219 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1);220 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1);221 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1);222 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1);223 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1);224 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1);225 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1);228 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1); 229 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1); 230 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1); 231 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1); 232 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1); 233 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1); 234 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1); 235 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1); 226 236 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3); 227 237 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3); … … 250 260 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1); 251 261 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1); 252 template <> template < uint64_tval> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant();253 template <> template < uint64_tval> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant();254 template <> template < uint64_tval> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::constant();255 template <> template < uint64_tval> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::constant();256 template <> template < uint64_tval> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::constant();257 template <> template < uint64_tval> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::constant();258 template <> template < uint64_tval> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::constant();259 template <> template < uint64_tval> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::constant();260 template <> template < uint64_tval> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::constant();262 template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant(); 263 template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant(); 264 template <> template <FieldType<4>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::constant(); 265 template <> template <FieldType<8>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::constant(); 266 template <> template <FieldType<16>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::constant(); 267 template <> template <FieldType<32>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::constant(); 268 template <> template <FieldType<64>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::constant(); 269 template <> template <FieldType<128>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::constant(); 270 template <> template <FieldType<256>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::constant(); 261 271 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::min(bitblock256_t arg1, bitblock256_t arg2); 262 272 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::min(bitblock256_t arg1, bitblock256_t arg2); … … 276 286 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask(); 277 287 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask(); 288 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::add(bitblock256_t arg1, bitblock256_t arg2); 289 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add(bitblock256_t arg1, bitblock256_t arg2); 290 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add(bitblock256_t arg1, bitblock256_t arg2); 291 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add(bitblock256_t arg1, bitblock256_t arg2); 292 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add(bitblock256_t arg1, bitblock256_t arg2); 293 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add(bitblock256_t arg1, bitblock256_t arg2); 294 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add(bitblock256_t arg1, bitblock256_t arg2); 295 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2); 296 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2); 278 297 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2); 279 298 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2); … … 302 321 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2); 303 322 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2); 304 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);305 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);306 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);307 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);308 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);309 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);310 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);311 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);323 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1); 324 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1); 325 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1); 326 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1); 327 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1); 328 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1); 329 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1); 330 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1); 312 331 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask(); 313 332 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask(); … … 318 337 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask(); 319 338 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask(); 320 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>:: add(bitblock256_t arg1, bitblock256_t arg2);321 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>:: add(bitblock256_t arg1, bitblock256_t arg2);322 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>:: add(bitblock256_t arg1, bitblock256_t arg2);323 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>:: add(bitblock256_t arg1, bitblock256_t arg2);324 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>:: add(bitblock256_t arg1, bitblock256_t arg2);325 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>:: add(bitblock256_t arg1, bitblock256_t arg2);326 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>:: add(bitblock256_t arg1, bitblock256_t arg2);327 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>:: add(bitblock256_t arg1, bitblock256_t arg2);328 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>:: add(bitblock256_t arg1, bitblock256_t arg2);339 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2); 340 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2); 341 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2); 342 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2); 343 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2); 344 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2); 345 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2); 346 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2); 347 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2); 329 348 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2); 330 349 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2); … … 360 379 template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::packss(bitblock256_t arg1, bitblock256_t arg2); 361 380 template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::packss(bitblock256_t arg1, bitblock256_t arg2); 362 template <> IDISA_ALWAYS_INLINE uint64_thsimd256<8>::signmask(bitblock256_t arg1);363 template <> IDISA_ALWAYS_INLINE uint64_thsimd256<16>::signmask(bitblock256_t arg1);364 template <> IDISA_ALWAYS_INLINE uint64_thsimd256<32>::signmask(bitblock256_t arg1);365 template <> IDISA_ALWAYS_INLINE uint64_thsimd256<64>::signmask(bitblock256_t arg1);366 template <> IDISA_ALWAYS_INLINE uint64_thsimd256<128>::signmask(bitblock256_t arg1);367 template <> IDISA_ALWAYS_INLINE uint64_thsimd256<256>::signmask(bitblock256_t arg1);381 template <> IDISA_ALWAYS_INLINE FieldType<256/8>::T hsimd256<8>::signmask(bitblock256_t arg1); 382 template <> IDISA_ALWAYS_INLINE FieldType<256/16>::T hsimd256<16>::signmask(bitblock256_t arg1); 383 template <> IDISA_ALWAYS_INLINE FieldType<256/32>::T hsimd256<32>::signmask(bitblock256_t arg1); 384 template <> IDISA_ALWAYS_INLINE FieldType<256/64>::T hsimd256<64>::signmask(bitblock256_t arg1); 385 template <> IDISA_ALWAYS_INLINE FieldType<256/128>::T hsimd256<128>::signmask(bitblock256_t arg1); 386 template <> IDISA_ALWAYS_INLINE FieldType<256/256>::T hsimd256<256>::signmask(bitblock256_t arg1); 368 387 template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::packl(bitblock256_t arg1, bitblock256_t arg2); 369 388 template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::packl(bitblock256_t arg1, bitblock256_t arg2); … … 446 465 template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::signextendl(bitblock256_t arg1); 447 466 template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::signextendl(bitblock256_t arg1); 448 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dsrli(bitblock256_t arg1, bitblock256_t arg2);449 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dsrli(bitblock256_t arg1, bitblock256_t arg2);450 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dsrli(bitblock256_t arg1, bitblock256_t arg2);451 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dsrli(bitblock256_t arg1, bitblock256_t arg2);452 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dsrli(bitblock256_t arg1, bitblock256_t arg2);453 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dsrli(bitblock256_t arg1, bitblock256_t arg2);454 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2);455 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2);456 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill( uint64_tval1);457 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill( uint64_tval1);458 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill( uint64_tval1);459 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill( uint64_tval1);460 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill( uint64_tval1);461 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill( uint64_tval1);462 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill( uint64_tval1);463 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill( uint64_tval1);464 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::fill( uint64_tval1);465 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<1>::extract(bitblock256_t arg1);466 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<2>::extract(bitblock256_t arg1);467 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<4>::extract(bitblock256_t arg1);468 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<8>::extract(bitblock256_t arg1);469 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<16>::extract(bitblock256_t arg1);470 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<32>::extract(bitblock256_t arg1);471 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<64>::extract(bitblock256_t arg1);472 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1);473 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1);474 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::splat(bitblock256_t arg1);475 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::splat(bitblock256_t arg1);476 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::splat(bitblock256_t arg1);477 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::splat(bitblock256_t arg1);478 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::splat(bitblock256_t arg1);479 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1);480 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1);481 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_tval16);482 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_tval16);483 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_tval16);484 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_tval16);485 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_tval16);486 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4);487 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4);488 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4);489 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4);490 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4);491 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4);492 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4);493 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::srli(bitblock256_t arg1);494 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::srli(bitblock256_t arg1);495 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::srli(bitblock256_t arg1);496 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::srli(bitblock256_t arg1);497 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::srli(bitblock256_t arg1);498 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1);499 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1);500 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1);501 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill2( uint64_t val1, uint64_tval2);502 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill2( uint64_t val1, uint64_tval2);503 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill2( uint64_t val1, uint64_tval2);504 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill2( uint64_t val1, uint64_tval2);505 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill2( uint64_t val1, uint64_tval2);506 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill2( uint64_t val1, uint64_tval2);507 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill2( uint64_t val1, uint64_tval2);508 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill2( uint64_t val1, uint64_tval2);509 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dslli(bitblock256_t arg1, bitblock256_t arg2);510 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dslli(bitblock256_t arg1, bitblock256_t arg2);511 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dslli(bitblock256_t arg1, bitblock256_t arg2);512 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dslli(bitblock256_t arg1, bitblock256_t arg2);513 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dslli(bitblock256_t arg1, bitblock256_t arg2);514 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2);515 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2);516 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2);517 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1);518 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::slli(bitblock256_t arg1);519 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::slli(bitblock256_t arg1);520 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::slli(bitblock256_t arg1);521 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::slli(bitblock256_t arg1);522 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1);523 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1);524 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1);525 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8);526 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8);527 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8);528 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8);529 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8);530 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8);467 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dsrli(bitblock256_t arg1, bitblock256_t arg2); 468 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dsrli(bitblock256_t arg1, bitblock256_t arg2); 469 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dsrli(bitblock256_t arg1, bitblock256_t arg2); 470 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dsrli(bitblock256_t arg1, bitblock256_t arg2); 471 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dsrli(bitblock256_t arg1, bitblock256_t arg2); 472 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dsrli(bitblock256_t arg1, bitblock256_t arg2); 473 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2); 474 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2); 475 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(FieldType<1>::T val1); 476 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill(FieldType<2>::T val1); 477 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill(FieldType<4>::T val1); 478 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill(FieldType<8>::T val1); 479 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill(FieldType<16>::T val1); 480 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill(FieldType<32>::T val1); 481 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill(FieldType<64>::T val1); 482 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill(FieldType<128>::T val1); 483 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::fill(FieldType<256>::T val1); 484 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1); 485 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1); 486 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1); 487 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1); 488 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1); 489 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1); 490 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1); 491 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1); 492 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1); 493 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::splat(bitblock256_t arg1); 494 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::splat(bitblock256_t arg1); 495 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::splat(bitblock256_t arg1); 496 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::splat(bitblock256_t arg1); 497 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::splat(bitblock256_t arg1); 498 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1); 499 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1); 500 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16); 501 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16); 502 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16); 503 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16); 504 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16); 505 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4); 506 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4); 507 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill4(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4); 508 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill4(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4); 509 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill4(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4); 510 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill4(FieldType<32>::T val1, FieldType<32>::T val2, FieldType<32>::T val3, FieldType<32>::T val4); 511 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill4(FieldType<64>::T val1, FieldType<64>::T val2, FieldType<64>::T val3, FieldType<64>::T val4); 512 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::srli(bitblock256_t arg1); 513 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::srli(bitblock256_t arg1); 514 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::srli(bitblock256_t arg1); 515 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::srli(bitblock256_t arg1); 516 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::srli(bitblock256_t arg1); 517 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1); 518 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1); 519 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1); 520 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill2(FieldType<1>::T val1, FieldType<1>::T val2); 521 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill2(FieldType<2>::T val1, FieldType<2>::T val2); 522 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill2(FieldType<4>::T val1, FieldType<4>::T val2); 523 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill2(FieldType<8>::T val1, FieldType<8>::T val2); 524 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill2(FieldType<16>::T val1, FieldType<16>::T val2); 525 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill2(FieldType<32>::T val1, FieldType<32>::T val2); 526 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill2(FieldType<64>::T val1, FieldType<64>::T val2); 527 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill2(FieldType<128>::T val1, FieldType<128>::T val2); 528 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dslli(bitblock256_t arg1, bitblock256_t arg2); 529 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dslli(bitblock256_t arg1, bitblock256_t arg2); 530 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dslli(bitblock256_t arg1, bitblock256_t arg2); 531 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dslli(bitblock256_t arg1, bitblock256_t arg2); 532 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dslli(bitblock256_t arg1, bitblock256_t arg2); 533 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2); 534 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2); 535 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2); 536 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1); 537 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::slli(bitblock256_t arg1); 538 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::slli(bitblock256_t arg1); 539 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::slli(bitblock256_t arg1); 540 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::slli(bitblock256_t arg1); 541 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1); 542 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1); 543 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1); 544 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill8(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8); 545 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill8(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8); 546 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill8(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8); 547 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill8(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8); 548 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill8(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8); 549 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill8(FieldType<32>::T val1, FieldType<32>::T val2, FieldType<32>::T val3, FieldType<32>::T val4, FieldType<32>::T val5, FieldType<32>::T val6, FieldType<32>::T val7, FieldType<32>::T val8); 531 550 532 551 //Implementation Part … … 577 596 578 597 //The total number of operations is 1.0 598 IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2) 599 { 600 return _mm256_and_ps(arg1, arg2); 601 } 602 603 //The total number of operations is 1.0 579 604 IDISA_ALWAYS_INLINE bitblock256_t simd_xor(bitblock256_t arg1, bitblock256_t arg2) 580 605 { 581 606 return _mm256_xor_ps(arg1, arg2); 582 }583 584 //The total number of operations is 1.0585 IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2)586 {587 return _mm256_and_ps(arg1, arg2);588 607 } 589 608 … … 962 981 } 963 982 983 //The total number of operations is 12.0 984 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::all(bitblock256_t arg1) 985 { 986 bitblock256_t f0 = simd_and(arg1, simd256<2>::srli<1>(arg1)); 987 return simd_or(f0, simd256<2>::slli<1>(f0)); 988 } 989 990 //The total number of operations is 17.0 991 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::all(bitblock256_t arg1) 992 { 993 return simd256<4>::eq(arg1, simd256<8>::constant<255>()); 994 } 995 996 //The total number of operations is 5.0 997 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::all(bitblock256_t arg1) 998 { 999 return simd256<8>::eq(arg1, simd256<8>::constant<255>()); 1000 } 1001 1002 //The total number of operations is 5.0 1003 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::all(bitblock256_t arg1) 1004 { 1005 return simd256<16>::eq(arg1, simd256<8>::constant<255>()); 1006 } 1007 1008 //The total number of operations is 5.0 1009 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::all(bitblock256_t arg1) 1010 { 1011 return simd256<32>::eq(arg1, simd256<8>::constant<255>()); 1012 } 1013 1014 //The total number of operations is 5.0 1015 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::all(bitblock256_t arg1) 1016 { 1017 return simd256<64>::eq(arg1, simd256<8>::constant<255>()); 1018 } 1019 1020 //The total number of operations is 23.6666666667 1021 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::all(bitblock256_t arg1) 1022 { 1023 return simd256<128>::eq(arg1, simd256<8>::constant<255>()); 1024 } 1025 1026 //The total number of operations is 2.0 1027 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::all(bitblock256_t arg1) 1028 { 1029 return ((bitblock256::all(arg1)) ? simd256<8>::constant<255>() : simd256<8>::constant<0>()); 1030 } 1031 1032 //The total number of operations is 5.0 1033 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srli(bitblock256_t arg1) 1034 { 1035 return simd_and(simd256<32>::srli<sh>(arg1), simd256<2>::constant<((3)>>sh)>()); 1036 } 1037 1038 //The total number of operations is 5.0 1039 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srli(bitblock256_t arg1) 1040 { 1041 return simd_and(simd256<32>::srli<sh>(arg1), simd256<4>::constant<((15)>>sh)>()); 1042 } 1043 1044 //The total number of operations is 5.0 1045 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srli(bitblock256_t arg1) 1046 { 1047 return simd_and(simd256<32>::srli<sh>(arg1), simd256<8>::constant<((255)>>sh)>()); 1048 } 1049 1050 //The total number of operations is 4.0 1051 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srli(bitblock256_t arg1) 1052 { 1053 return avx_general_combine256(_mm_srli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi16(avx_select_lo128(arg1), (int32_t)(sh))); 1054 } 1055 1056 //The total number of operations is 4.0 1057 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srli(bitblock256_t arg1) 1058 { 1059 return avx_general_combine256(_mm_srli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi32(avx_select_lo128(arg1), (int32_t)(sh))); 1060 } 1061 1062 //The total number of operations is 4.0 1063 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srli(bitblock256_t arg1) 1064 { 1065 return avx_general_combine256(_mm_srli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi64(avx_select_lo128(arg1), (int32_t)(sh))); 1066 } 1067 1068 //The total number of operations is 8.33333333333 1069 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srli(bitblock256_t arg1) 1070 { 1071 return (((sh%8) == 0) ? avx_byte_shift_right(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::srli<(sh&63)>(avx_byte_shift_right(arg1, 8)) : simd_or(simd256<64>::srli<sh>(arg1), avx_byte_shift_right(simd256<64>::slli<((128-sh)&63)>(arg1), 8)))); 1072 } 1073 1074 //The total number of operations is 14.5 1075 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srli(bitblock256_t arg1) 1076 { 1077 return ((sh < 128) ? simd_or(simd256<128>::srli<sh>(arg1), simd256<128>::slli<(128-sh)>(((bitblock256_t)(_mm256_castsi128_si256(avx_select_hi128(arg1)))))) : simd256<128>::srli<(sh-128)>(avx_move_hi128_to_lo128(arg1))); 1078 } 1079 1080 //The total number of operations is 1.0 1081 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ctz(bitblock256_t arg1) 1082 { 1083 return simd_not(arg1); 1084 } 1085 1086 //The total number of operations is 27.0 1087 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ctz(bitblock256_t arg1) 1088 { 1089 return simd256<2>::popcount(simd_andc(simd256<2>::sub(arg1, simd256<2>::constant<1>()), arg1)); 1090 } 1091 1092 //The total number of operations is 36.0 1093 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ctz(bitblock256_t arg1) 1094 { 1095 return simd256<4>::popcount(simd_andc(simd256<4>::sub(arg1, simd256<4>::constant<1>()), arg1)); 1096 } 1097 1098 //The total number of operations is 38.0 1099 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ctz(bitblock256_t arg1) 1100 { 1101 return simd256<8>::popcount(simd_andc(simd256<8>::sub(arg1, simd256<8>::constant<1>()), arg1)); 1102 } 1103 1104 //The total number of operations is 48.0 1105 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ctz(bitblock256_t arg1) 1106 { 1107 return simd256<16>::popcount(simd_andc(simd256<16>::sub(arg1, simd256<16>::constant<1>()), arg1)); 1108 } 1109 1110 //The total number of operations is 58.0 1111 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ctz(bitblock256_t arg1) 1112 { 1113 return simd256<32>::popcount(simd_andc(simd256<32>::sub(arg1, simd256<32>::constant<1>()), arg1)); 1114 } 1115 1116 //The total number of operations is 44.0 1117 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ctz(bitblock256_t arg1) 1118 { 1119 return simd256<64>::popcount(simd_andc(simd256<64>::sub(arg1, simd256<64>::constant<1>()), arg1)); 1120 } 1121 1122 //The total number of operations is 101.0 1123 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ctz(bitblock256_t arg1) 1124 { 1125 return simd256<128>::popcount(simd_andc(simd256<128>::sub(arg1, simd256<128>::constant<1>()), arg1)); 1126 } 1127 1128 //The total number of operations is 192.166666667 1129 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ctz(bitblock256_t arg1) 1130 { 1131 return simd256<256>::popcount(simd_andc(simd256<256>::sub(arg1, simd256<256>::constant<1>()), arg1)); 1132 } 1133 1134 //The total number of operations is 1.0 1135 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2) 1136 { 1137 return simd_andc(arg1, arg2); 1138 } 1139 1140 //The total number of operations is 23.0 1141 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2) 1142 { 1143 bitblock256_t tmpAns = simd256<(1)>::ugt(arg1, arg2); 1144 bitblock256_t mask = simd_and(tmpAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2))); 1145 mask = simd_or(mask, simd256<2>::slli<(1)>(mask)); 1146 return simd_or(simd256<2>::srai<(1)>(tmpAns), mask); 1147 } 1148 1149 //The total number of operations is 20.0 1150 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ugt(bitblock256_t arg1, bitblock256_t arg2) 1151 { 1152 return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::ugt(simd_and(simd256<(8)>::himask(), arg1), arg2), simd256<(8)>::ugt(simd_andc(arg1, simd256<(8)>::himask()), simd_andc(arg2, simd256<(8)>::himask()))); 1153 } 1154 1155 //The total number of operations is 7.0 1156 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ugt(bitblock256_t arg1, bitblock256_t arg2) 1157 { 1158 bitblock256_t high_bit = simd256<8>::constant<(128)>(); 1159 return simd256<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)); 1160 } 1161 1162 //The total number of operations is 7.0 1163 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ugt(bitblock256_t arg1, bitblock256_t arg2) 1164 { 1165 bitblock256_t high_bit = simd256<16>::constant<(32768)>(); 1166 return simd256<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)); 1167 } 1168 1169 //The total number of operations is 7.0 1170 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ugt(bitblock256_t arg1, bitblock256_t arg2) 1171 { 1172 bitblock256_t high_bit = simd256<32>::constant<(2147483648ULL)>(); 1173 return simd256<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)); 1174 } 1175 1176 //The total number of operations is 7.0 1177 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ugt(bitblock256_t arg1, bitblock256_t arg2) 1178 { 1179 bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>(); 1180 return simd256<64>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)); 1181 } 1182 1183 //The total number of operations is 60.0 1184 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ugt(bitblock256_t arg1, bitblock256_t arg2) 1185 { 1186 bitblock256_t tmpAns = simd256<(64)>::ugt(arg1, arg2); 1187 bitblock256_t mask = simd_and(tmpAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2))); 1188 mask = simd_or(mask, simd256<128>::slli<(64)>(mask)); 1189 return simd_or(simd256<128>::srai<(64)>(tmpAns), mask); 1190 } 1191 1192 //The total number of operations is 174.166666667 1193 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ugt(bitblock256_t arg1, bitblock256_t arg2) 1194 { 1195 bitblock256_t tmpAns = simd256<(128)>::ugt(arg1, arg2); 1196 bitblock256_t mask = simd_and(tmpAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2))); 1197 mask = simd_or(mask, simd256<256>::slli<(128)>(mask)); 1198 return simd_or(simd256<256>::srai<(128)>(tmpAns), mask); 1199 } 1200 1201 //The total number of operations is 7.0 1202 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::xor_hl(bitblock256_t arg1) 1203 { 1204 return simd_xor(simd256<2>::srli<(1)>(arg1), simd_and(arg1, simd256<2>::lomask())); 1205 } 1206 1207 //The total number of operations is 7.0 1208 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::xor_hl(bitblock256_t arg1) 1209 { 1210 return simd_xor(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask())); 1211 } 1212 1213 //The total number of operations is 7.0 1214 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::xor_hl(bitblock256_t arg1) 1215 { 1216 return simd_xor(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask())); 1217 } 1218 1219 //The total number of operations is 6.0 1220 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::xor_hl(bitblock256_t arg1) 1221 { 1222 return simd_xor(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask())); 1223 } 1224 1225 //The total number of operations is 6.0 1226 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::xor_hl(bitblock256_t arg1) 1227 { 1228 return simd_xor(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask())); 1229 } 1230 1231 //The total number of operations is 6.0 1232 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::xor_hl(bitblock256_t arg1) 1233 { 1234 return simd_xor(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask())); 1235 } 1236 1237 //The total number of operations is 10.3333333333 1238 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::xor_hl(bitblock256_t arg1) 1239 { 1240 return simd_xor(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask())); 1241 } 1242 1243 //The total number of operations is 16.5 1244 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::xor_hl(bitblock256_t arg1) 1245 { 1246 return simd_xor(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask())); 1247 } 1248 1249 //The total number of operations is 0 1250 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::popcount(bitblock256_t arg1) 1251 { 1252 return arg1; 1253 } 1254 1255 //The total number of operations is 10.0 1256 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::popcount(bitblock256_t arg1) 1257 { 1258 return simd256<2>::add_hl(simd256<(1)>::popcount(arg1)); 1259 } 1260 1261 //The total number of operations is 21.0 1262 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::popcount(bitblock256_t arg1) 1263 { 1264 return simd256<4>::add_hl(simd256<(2)>::popcount(arg1)); 1265 } 1266 1267 //The total number of operations is 32.0 1268 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::popcount(bitblock256_t arg1) 1269 { 1270 return simd256<8>::add_hl(simd256<(4)>::popcount(arg1)); 1271 } 1272 1273 //The total number of operations is 42.0 1274 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::popcount(bitblock256_t arg1) 1275 { 1276 return simd256<16>::add_hl(simd256<(8)>::popcount(arg1)); 1277 } 1278 1279 //The total number of operations is 52.0 1280 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::popcount(bitblock256_t arg1) 1281 { 1282 return simd256<32>::add_hl(simd256<(16)>::popcount(arg1)); 1283 } 1284 1285 //The total number of operations is 38.0 1286 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::popcount(bitblock256_t arg1) 1287 { 1288 bitblock256_t tmpAns = simd256<8>::popcount(arg1); 1289 return avx_general_combine256(_mm_sad_epu8(avx_select_hi128(tmpAns), _mm_set1_epi32((int32_t)(0))), _mm_sad_epu8(avx_select_lo128(tmpAns), _mm_set1_epi32((int32_t)(0)))); 1290 } 1291 1292 //The total number of operations is 73.6666666667 1293 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1) 1294 { 1295 return simd256<128>::add_hl(simd256<(64)>::popcount(arg1)); 1296 } 1297 1298 //The total number of operations is 115.5 1299 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1) 1300 { 1301 bitblock256_t tmpAns = simd256<(128)>::popcount(arg1); 1302 return simd256<(128)>::add(simd_and(tmpAns, simd256<256>::lomask()), simd256<256>::srli<(128)>(tmpAns)); 1303 } 1304 1305 //The total number of operations is 14.0 1306 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::any(bitblock256_t arg1) 1307 { 1308 bitblock256_t t0 = simd256<2>::srli<1>(arg1); 1309 bitblock256_t f0 = simd_or(t0, simd_and(arg1, simd_xor(t0, simd256<8>::constant<255>()))); 1310 return simd_or(f0, simd256<2>::slli<1>(f0)); 1311 } 1312 1313 //The total number of operations is 20.0 1314 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::any(bitblock256_t arg1) 1315 { 1316 return simd256<4>::ugt(arg1, simd256<8>::constant<0>()); 1317 } 1318 1319 //The total number of operations is 7.0 1320 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::any(bitblock256_t arg1) 1321 { 1322 return simd256<8>::ugt(arg1, simd256<8>::constant<0>()); 1323 } 1324 1325 //The total number of operations is 7.0 1326 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::any(bitblock256_t arg1) 1327 { 1328 return simd256<16>::ugt(arg1, simd256<8>::constant<0>()); 1329 } 1330 1331 //The total number of operations is 7.0 1332 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::any(bitblock256_t arg1) 1333 { 1334 return simd256<32>::ugt(arg1, simd256<8>::constant<0>()); 1335 } 1336 1337 //The total number of operations is 7.0 1338 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::any(bitblock256_t arg1) 1339 { 1340 return simd256<64>::ugt(arg1, simd256<8>::constant<0>()); 1341 } 1342 1343 //The total number of operations is 60.0 1344 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::any(bitblock256_t arg1) 1345 { 1346 return simd256<128>::ugt(arg1, simd256<8>::constant<0>()); 1347 } 1348 1349 //The total number of operations is 1.0 1350 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::any(bitblock256_t arg1) 1351 { 1352 return ((bitblock256::any(arg1)) ? simd256<8>::constant<255>() : simd256<8>::constant<0>()); 1353 } 1354 1355 //The total number of operations is 16.0 1356 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1) 1357 { 1358 return simd256<2>::sub(simd256<2>::constant<0>(), arg1); 1359 } 1360 1361 //The total number of operations is 14.0 1362 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1) 1363 { 1364 return simd256<4>::sub(simd256<4>::constant<0>(), arg1); 1365 } 1366 1367 //The total number of operations is 5.0 1368 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::neg(bitblock256_t arg1) 1369 { 1370 return simd256<8>::sub(simd256<8>::constant<0>(), arg1); 1371 } 1372 1373 //The total number of operations is 5.0 1374 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::neg(bitblock256_t arg1) 1375 { 1376 return simd256<16>::sub(simd256<16>::constant<0>(), arg1); 1377 } 1378 1379 //The total number of operations is 5.0 1380 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::neg(bitblock256_t arg1) 1381 { 1382 return simd256<32>::sub(simd256<32>::constant<0>(), arg1); 1383 } 1384 1385 //The total number of operations is 5.0 1386 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::neg(bitblock256_t arg1) 1387 { 1388 return simd256<64>::sub(simd256<64>::constant<0>(), arg1); 1389 } 1390 1391 //The total number of operations is 26.3333333333 1392 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1) 1393 { 1394 return simd256<128>::sub(simd256<128>::constant<0>(), arg1); 1395 } 1396 1397 //The total number of operations is 75.6666666667 1398 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1) 1399 { 1400 return simd256<256>::sub(simd256<256>::constant<0>(), arg1); 1401 } 1402 1403 //The total number of operations is 5.0 1404 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1) 1405 { 1406 return simd_and(simd256<32>::slli<sh>(arg1), simd256<2>::constant<(((3)<<sh)&(3))>()); 1407 } 1408 1409 //The total number of operations is 5.0 1410 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1) 1411 { 1412 return simd_and(simd256<32>::slli<sh>(arg1), simd256<4>::constant<(((15)<<sh)&(15))>()); 1413 } 1414 1415 //The total number of operations is 5.0 1416 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1) 1417 { 1418 return simd_and(simd256<32>::slli<sh>(arg1), simd256<8>::constant<(((255)<<sh)&(255))>()); 1419 } 1420 1421 //The total number of operations is 4.0 1422 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1) 1423 { 1424 return avx_general_combine256(_mm_slli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi16(avx_select_lo128(arg1), (int32_t)(sh))); 1425 } 1426 1427 //The total number of operations is 4.0 1428 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1) 1429 { 1430 return avx_general_combine256(_mm_slli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi32(avx_select_lo128(arg1), (int32_t)(sh))); 1431 } 1432 1433 //The total number of operations is 4.0 1434 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1) 1435 { 1436 return avx_general_combine256(_mm_slli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi64(avx_select_lo128(arg1), (int32_t)(sh))); 1437 } 1438 1439 //The total number of operations is 8.33333333333 1440 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1) 1441 { 1442 return (((sh%8) == 0) ? avx_byte_shift_left(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::slli<(sh&63)>(avx_byte_shift_left(arg1, 8)) : simd_or(simd256<64>::slli<sh>(arg1), avx_byte_shift_left(simd256<64>::srli<((128-sh)&63)>(arg1), 8)))); 1443 } 1444 1445 //The total number of operations is 14.0 1446 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1) 1447 { 1448 return ((sh < 128) ? simd_or(simd256<128>::slli<sh>(arg1), avx_move_lo128_to_hi128(simd256<128>::srli<(128-sh)>(arg1))) : simd256<128>::slli<(sh-128)>(avx_move_lo128_to_hi128(arg1))); 1449 } 1450 1451 //The total number of operations is 3.0 1452 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3) 1453 { 1454 return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1)); 1455 } 1456 1457 //The total number of operations is 11.0 1458 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3) 1459 { 1460 return simd256<(1)>::ifh(simd256<1>::ifh(simd256<2>::himask(), arg1, simd256<2>::srli<(1)>(arg1)), arg2, arg3); 1461 } 1462 1463 //The total number of operations is 19.0 1464 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3) 1465 { 1466 return simd256<(2)>::ifh(simd256<1>::ifh(simd256<4>::himask(), arg1, simd256<4>::srli<(2)>(arg1)), arg2, arg3); 1467 } 1468 1469 //The total number of operations is 8.0 1470 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3) 1471 { 1472 return simd256<1>::ifh(simd256<8>::gt(simd256<8>::constant<0>(), arg1), arg2, arg3); 1473 } 1474 1475 //The total number of operations is 8.0 1476 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3) 1477 { 1478 return simd256<1>::ifh(simd256<16>::gt(simd256<16>::constant<0>(), arg1), arg2, arg3); 1479 } 1480 1481 //The total number of operations is 8.0 1482 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3) 1483 { 1484 return simd256<1>::ifh(simd256<32>::gt(simd256<32>::constant<0>(), arg1), arg2, arg3); 1485 } 1486 1487 //The total number of operations is 1.0 1488 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3) 1489 { 1490 return (bitblock256_t)_mm256_blendv_pd((__m256d)(arg3), (__m256d)(arg2), (__m256d)(arg1)); 1491 } 1492 1493 //The total number of operations is 12.3333333333 1494 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3) 1495 { 1496 return simd256<(64)>::ifh(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg1)), arg2, arg3); 1497 } 1498 1499 //The total number of operations is 29.8333333333 1500 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3) 1501 { 1502 return simd256<(128)>::ifh(simd256<1>::ifh(simd256<256>::himask(), arg1, simd256<256>::srli<(128)>(arg1)), arg2, arg3); 1503 } 1504 1505 //The total number of operations is 1.0 1506 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2) 1507 { 1508 return simd_xor(arg1, arg2); 1509 } 1510 1511 //The total number of operations is 16.0 1512 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::sub(bitblock256_t arg1, bitblock256_t arg2) 1513 { 1514 bitblock256_t ans = simd256<(1)>::sub(arg1, arg2); 1515 bitblock256_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_and(simd_not(simd_xor(arg1, arg2)), ans)); 1516 bitblock256_t loMask = simd256<2>::lomask(); 1517 bitblock256_t borrow = simd256<2>::slli<1>(simd_and(borrowMask, loMask)); 1518 return simd256<1>::ifh(loMask, ans, simd256<(1)>::sub(ans, borrow)); 1519 } 1520 1521 //The total number of operations is 14.0 1522 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::sub(bitblock256_t arg1, bitblock256_t arg2) 1523 { 1524 return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::sub(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::sub(arg1, arg2)); 1525 } 1526 1527 //The total number of operations is 5.0 1528 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::sub(bitblock256_t arg1, bitblock256_t arg2) 1529 { 1530 return avx_general_combine256(_mm_sub_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1531 } 1532 1533 //The total number of operations is 5.0 1534 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sub(bitblock256_t arg1, bitblock256_t arg2) 1535 { 1536 return avx_general_combine256(_mm_sub_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1537 } 1538 1539 //The total number of operations is 5.0 1540 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sub(bitblock256_t arg1, bitblock256_t arg2) 1541 { 1542 return avx_general_combine256(_mm_sub_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1543 } 1544 1545 //The total number of operations is 5.0 1546 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sub(bitblock256_t arg1, bitblock256_t arg2) 1547 { 1548 return avx_general_combine256(_mm_sub_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1549 } 1550 1551 //The total number of operations is 26.3333333333 1552 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::sub(bitblock256_t arg1, bitblock256_t arg2) 1553 { 1554 bitblock256_t partial = simd256<(64)>::sub(arg1, arg2); 1555 bitblock256_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_andc(partial, simd_xor(arg1, arg2))); 1556 bitblock256_t borrow = simd256<128>::slli<(64)>(simd256<(64)>::srli<(63)>(borrowMask)); 1557 return simd256<(64)>::sub(partial, borrow); 1558 } 1559 1560 //The total number of operations is 75.6666666667 1561 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sub(bitblock256_t arg1, bitblock256_t arg2) 1562 { 1563 bitblock256_t ans = simd256<(128)>::sub(arg1, arg2); 1564 bitblock256_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_and(simd_not(simd_xor(arg1, arg2)), ans)); 1565 bitblock256_t loMask = simd256<256>::lomask(); 1566 bitblock256_t borrow = simd256<256>::slli<1>(simd_and(borrowMask, loMask)); 1567 return simd256<1>::ifh(loMask, ans, simd256<(128)>::sub(ans, borrow)); 1568 } 1569 1570 //The total number of operations is 10.0 1571 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1) 1572 { 1573 return simd256<16>::sub(arg1, simd_and(simd256<2>::lomask(), simd256<16>::srli<1>(arg1))); 1574 } 1575 1576 //The total number of operations is 11.0 1577 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add_hl(bitblock256_t arg1) 1578 { 1579 return simd256<(8)>::add(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask())); 1580 } 1581 1582 //The total number of operations is 11.0 1583 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add_hl(bitblock256_t arg1) 1584 { 1585 return simd256<(16)>::add(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask())); 1586 } 1587 1588 //The total number of operations is 10.0 1589 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add_hl(bitblock256_t arg1) 1590 { 1591 return simd256<(32)>::add(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask())); 1592 } 1593 1594 //The total number of operations is 10.0 1595 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add_hl(bitblock256_t arg1) 1596 { 1597 return simd256<(64)>::add(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask())); 1598 } 1599 1600 //The total number of operations is 10.0 1601 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add_hl(bitblock256_t arg1) 1602 { 1603 return simd256<64>::add(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask())); 1604 } 1605 1606 //The total number of operations is 35.6666666667 1607 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1) 1608 { 1609 return simd256<128>::add(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask())); 1610 } 1611 1612 //The total number of operations is 91.1666666667 1613 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1) 1614 { 1615 return simd256<256>::add(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask())); 1616 } 1617 1618 //The total number of operations is 0 1619 template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant() 1620 { 1621 return simd256<32>::constant<(-1*val)>(); 1622 } 1623 1624 //The total number of operations is 0 1625 template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant() 1626 { 1627 return ((val < 0) ? simd256<(4)>::constant<((val<<2)|(val^(-4)))>() : simd256<(4)>::constant<((val<<2)|val)>()); 1628 } 1629 1630 //The total number of operations is 0 1631 template <> template <FieldType<4>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::constant() 1632 { 1633 return ((val < 0) ? simd256<(8)>::constant<((val<<4)|(val^(-16)))>() : simd256<(8)>::constant<((val<<4)|val)>()); 1634 } 1635 1636 //The total number of operations is 0 1637 template <> template <FieldType<8>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::constant() 1638 { 1639 return (bitblock256_t)_mm256_set1_epi8((int32_t)(val)); 1640 } 1641 1642 //The total number of operations is 0 1643 template <> template <FieldType<16>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::constant() 1644 { 1645 return (bitblock256_t)_mm256_set1_epi16((int32_t)(val)); 1646 } 1647 1648 //The total number of operations is 0 1649 template <> template <FieldType<32>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::constant() 1650 { 1651 return (bitblock256_t)_mm256_set1_epi32((int32_t)(val)); 1652 } 1653 1654 //The total number of operations is 0 1655 template <> template <FieldType<64>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::constant() 1656 { 1657 return ((bitblock256_t)(_mm256_set_epi32((int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val)))); 1658 } 1659 1660 //The total number of operations is 0 1661 template <> template <FieldType<128>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::constant() 1662 { 1663 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val), (int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val)))); 1664 } 1665 1666 //The total number of operations is 0 1667 template <> template <FieldType<256>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::constant() 1668 { 1669 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val)))); 1670 } 1671 1672 //The total number of operations is 1.0 1673 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::min(bitblock256_t arg1, bitblock256_t arg2) 1674 { 1675 return simd_or(arg1, arg2); 1676 } 1677 1678 //The total number of operations is 25.0 1679 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::min(bitblock256_t arg1, bitblock256_t arg2) 1680 { 1681 bitblock256_t hiAns = simd256<(1)>::min(arg1, arg2); 1682 bitblock256_t loAns = simd256<(1)>::umin(arg1, arg2); 1683 bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(hiAns, arg1)); 1684 bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(hiAns, arg2)); 1685 return simd256<1>::ifh(simd256<2>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2)); 1686 } 1687 1688 //The total number of operations is 17.0 1689 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::min(bitblock256_t arg1, bitblock256_t arg2) 1690 { 1691 bitblock256_t high_bit = simd256<4>::constant<(8)>(); 1692 return simd_xor(simd256<4>::umin(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit); 1693 } 1694 1695 //The total number of operations is 5.0 1696 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::min(bitblock256_t arg1, bitblock256_t arg2) 1697 { 1698 return avx_general_combine256(_mm_min_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1699 } 1700 1701 //The total number of operations is 5.0 1702 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::min(bitblock256_t arg1, bitblock256_t arg2) 1703 { 1704 return avx_general_combine256(_mm_min_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1705 } 1706 1707 //The total number of operations is 5.0 1708 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::min(bitblock256_t arg1, bitblock256_t arg2) 1709 { 1710 return avx_general_combine256(_mm_min_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1711 } 1712 1713 //The total number of operations is 8.0 1714 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::min(bitblock256_t arg1, bitblock256_t arg2) 1715 { 1716 return simd256<1>::ifh(simd256<64>::gt(arg1, arg2), arg2, arg1); 1717 } 1718 1719 //The total number of operations is 54.6666666667 1720 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::min(bitblock256_t arg1, bitblock256_t arg2) 1721 { 1722 bitblock256_t hiAns = simd256<(64)>::min(arg1, arg2); 1723 bitblock256_t loAns = simd256<(64)>::umin(arg1, arg2); 1724 bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(hiAns, arg1)); 1725 bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(hiAns, arg2)); 1726 return simd256<1>::ifh(simd256<128>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2)); 1727 } 1728 1729 //The total number of operations is 186.666666667 1730 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::min(bitblock256_t arg1, bitblock256_t arg2) 1731 { 1732 bitblock256_t hiAns = simd256<(128)>::min(arg1, arg2); 1733 bitblock256_t loAns = simd256<(128)>::umin(arg1, arg2); 1734 bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(hiAns, arg1)); 1735 bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(hiAns, arg2)); 1736 return simd256<1>::ifh(simd256<256>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2)); 1737 } 1738 1739 //The total number of operations is 0 1740 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask() 1741 { 1742 return simd256<2>::constant<(1)>(); 1743 } 1744 1745 //The total number of operations is 0 1746 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask() 1747 { 1748 return simd256<4>::constant<(3)>(); 1749 } 1750 1751 //The total number of operations is 0 1752 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask() 1753 { 1754 return simd256<8>::constant<(15)>(); 1755 } 1756 1757 //The total number of operations is 0 1758 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask() 1759 { 1760 return simd256<16>::constant<(255)>(); 1761 } 1762 1763 //The total number of operations is 0 1764 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask() 1765 { 1766 return simd256<32>::constant<(65535)>(); 1767 } 1768 1769 //The total number of operations is 0 1770 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask() 1771 { 1772 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1)))); 1773 } 1774 1775 //The total number of operations is 0 1776 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask() 1777 { 1778 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1)))); 1779 } 1780 1781 //The total number of operations is 0 1782 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask() 1783 { 1784 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1)))); 1785 } 1786 1787 //The total number of operations is 1.0 1788 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::add(bitblock256_t arg1, bitblock256_t arg2) 1789 { 1790 return simd_xor(arg1, arg2); 1791 } 1792 1793 //The total number of operations is 16.0 1794 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add(bitblock256_t arg1, bitblock256_t arg2) 1795 { 1796 bitblock256_t ans = simd256<(1)>::add(arg1, arg2); 1797 bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_and(simd_xor(arg1, arg2), simd_not(ans))); 1798 bitblock256_t loMask = simd256<2>::lomask(); 1799 bitblock256_t carry = simd256<2>::slli<1>(simd_and(carryMask, loMask)); 1800 return simd256<1>::ifh(loMask, ans, simd256<(1)>::add(ans, carry)); 1801 } 1802 1803 //The total number of operations is 14.0 1804 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add(bitblock256_t arg1, bitblock256_t arg2) 1805 { 1806 return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::add(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::add(arg1, arg2)); 1807 } 1808 1809 //The total number of operations is 5.0 1810 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add(bitblock256_t arg1, bitblock256_t arg2) 1811 { 1812 return avx_general_combine256(_mm_add_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1813 } 1814 1815 //The total number of operations is 5.0 1816 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add(bitblock256_t arg1, bitblock256_t arg2) 1817 { 1818 return avx_general_combine256(_mm_add_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1819 } 1820 1821 //The total number of operations is 5.0 1822 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add(bitblock256_t arg1, bitblock256_t arg2) 1823 { 1824 return avx_general_combine256(_mm_add_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1825 } 1826 1827 //The total number of operations is 5.0 1828 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add(bitblock256_t arg1, bitblock256_t arg2) 1829 { 1830 return avx_general_combine256(_mm_add_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1831 } 1832 1833 //The total number of operations is 26.3333333333 1834 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2) 1835 { 1836 bitblock256_t partial = simd256<(64)>::add(arg1, arg2); 1837 bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial)); 1838 bitblock256_t carry = simd256<128>::slli<(64)>(simd256<(64)>::srli<(63)>(carryMask)); 1839 return simd256<(64)>::add(partial, carry); 1840 } 1841 1842 //The total number of operations is 75.6666666667 1843 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2) 1844 { 1845 bitblock256_t ans = simd256<(128)>::add(arg1, arg2); 1846 bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_and(simd_xor(arg1, arg2), simd_not(ans))); 1847 bitblock256_t loMask = simd256<256>::lomask(); 1848 bitblock256_t carry = simd256<256>::slli<1>(simd_and(carryMask, loMask)); 1849 return simd256<1>::ifh(loMask, ans, simd256<(128)>::add(ans, carry)); 1850 } 1851 1852 //The total number of operations is 1.0 1853 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2) 1854 { 1855 return simd_and(arg1, arg2); 1856 } 1857 1858 //The total number of operations is 24.0 1859 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2) 1860 { 1861 bitblock256_t tmpAns = simd256<(1)>::umin(arg1, arg2); 1862 bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1)); 1863 bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2)); 1864 return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2)); 1865 } 1866 1867 //The total number of operations is 14.0 1868 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2) 1869 { 1870 return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umin(arg1, arg2)), simd256<(8)>::umin(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))); 1871 } 1872 1873 //The total number of operations is 5.0 1874 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2) 1875 { 1876 return avx_general_combine256(_mm_min_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1877 } 1878 1879 //The total number of operations is 5.0 1880 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2) 1881 { 1882 return avx_general_combine256(_mm_min_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1883 } 1884 1885 //The total number of operations is 5.0 1886 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2) 1887 { 1888 return avx_general_combine256(_mm_min_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1889 } 1890 1891 //The total number of operations is 11.0 1892 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2) 1893 { 1894 bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>(); 1895 return simd_xor(simd256<64>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit); 1896 } 1897 1898 //The total number of operations is 46.6666666667 1899 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2) 1900 { 1901 bitblock256_t tmpAns = simd256<(64)>::umin(arg1, arg2); 1902 bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1)); 1903 bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2)); 1904 return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2)); 1905 } 1906 1907 //The total number of operations is 132.0 1908 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2) 1909 { 1910 bitblock256_t tmpAns = simd256<(128)>::umin(arg1, arg2); 1911 bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1)); 1912 bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2)); 1913 return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2)); 1914 } 1915 1916 //The total number of operations is 19.0 1917 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1) 1918 { 1919 return simd256<1>::ifh(simd256<2>::himask(), simd_and(arg1, simd256<256>::slli<1>(simd_not(arg1))), arg1); 1920 } 1921 1922 //The total number of operations is 39.0 1923 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1) 1924 { 1925 bitblock256_t gtMask = simd256<4>::gt(arg1, simd256<4>::constant<0>()); 1926 return simd256<1>::ifh(gtMask, arg1, simd256<4>::sub(gtMask, arg1)); 1927 } 1928 1929 //The total number of operations is 4.0 1930 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1) 1931 { 1932 return avx_general_combine256(_mm_abs_epi8(avx_select_hi128(arg1)), _mm_abs_epi8(avx_select_lo128(arg1))); 1933 } 1934 1935 //The total number of operations is 4.0 1936 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1) 1937 { 1938 return avx_general_combine256(_mm_abs_epi16(avx_select_hi128(arg1)), _mm_abs_epi16(avx_select_lo128(arg1))); 1939 } 1940 1941 //The total number of operations is 4.0 1942 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1) 1943 { 1944 return avx_general_combine256(_mm_abs_epi32(avx_select_hi128(arg1)), _mm_abs_epi32(avx_select_lo128(arg1))); 1945 } 1946 1947 //The total number of operations is 13.0 1948 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1) 1949 { 1950 bitblock256_t gtMask = simd256<64>::gt(arg1, simd256<64>::constant<0>()); 1951 return simd256<1>::ifh(gtMask, arg1, simd256<64>::sub(gtMask, arg1)); 1952 } 1953 1954 //The total number of operations is 69.0 1955 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1) 1956 { 1957 bitblock256_t eqMask = simd256<128>::eq(simd256<1>::ifh(simd256<128>::himask(), simd256<(64)>::abs(arg1), arg1), arg1); 1958 return simd256<1>::ifh(eqMask, arg1, simd256<128>::sub(eqMask, arg1)); 1959 } 1960 1961 //The total number of operations is 204.833333333 1962 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1) 1963 { 1964 bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1); 1965 return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1)); 1966 } 1967 1968 //The total number of operations is 2.0 1969 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2) 1970 { 1971 return simd_not(simd_xor(arg1, arg2)); 1972 } 1973 1974 //The total number of operations is 14.0 1975 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2) 1976 { 1977 bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2); 1978 bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns)); 1979 bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask); 1980 return simd_or(loMask, hiMask); 1981 } 1982 1983 //The total number of operations is 17.0 1984 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2) 1985 { 1986 return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)))); 1987 } 1988 1989 //The total number of operations is 5.0 1990 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2) 1991 { 1992 return avx_general_combine256(_mm_cmpeq_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1993 } 1994 1995 //The total number of operations is 5.0 1996 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2) 1997 { 1998 return avx_general_combine256(_mm_cmpeq_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2))); 1999 } 2000 2001 //The total number of operations is 5.0 2002 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2) 2003 { 2004 return avx_general_combine256(_mm_cmpeq_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2))); 2005 } 2006 2007 //The total number of operations is 5.0 2008 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2) 2009 { 2010 return avx_general_combine256(_mm_cmpeq_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2))); 2011 } 2012 2013 //The total number of operations is 23.6666666667 2014 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2) 2015 { 2016 bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2); 2017 bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns)); 2018 bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask); 2019 return simd_or(loMask, hiMask); 2020 } 2021 2022 //The total number of operations is 54.1666666667 2023 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2) 2024 { 2025 bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2); 2026 bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns)); 2027 bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask); 2028 return simd_or(loMask, hiMask); 2029 } 2030 2031 //The total number of operations is 7.0 2032 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1) 2033 { 2034 return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1))); 2035 } 2036 2037 //The total number of operations is 17.5 2038 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1) 2039 { 2040 return simd_or(simd_and(simd256<4>::himask(), simd256<(2)>::srai<((sh < (2)) ? sh : (2))>(arg1)), ((sh <= (2)) ? simd256<4>::srli<sh>(arg1) : simd256<(2)>::srai<(sh-(2))>(simd256<4>::srli<(2)>(arg1)))); 2041 } 2042 2043 //The total number of operations is 12.0 2044 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1) 2045 { 2046 bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1); 2047 return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp))); 2048 } 2049 2050 //The total number of operations is 4.0 2051 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1) 2052 { 2053 return avx_general_combine256(_mm_srai_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi16(avx_select_lo128(arg1), (int32_t)(sh))); 2054 } 2055 2056 //The total number of operations is 4.0 2057 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1) 2058 { 2059 return avx_general_combine256(_mm_srai_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi32(avx_select_lo128(arg1), (int32_t)(sh))); 2060 } 2061 2062 //The total number of operations is 12.0 2063 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1) 2064 { 2065 return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1)))); 2066 } 2067 2068 //The total number of operations is 28.3333333333 2069 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1) 2070 { 2071 return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1)))); 2072 } 2073 2074 //The total number of operations is 59.0 2075 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1) 2076 { 2077 return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1)))); 2078 } 2079 2080 //The total number of operations is 0 2081 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask() 2082 { 2083 return simd256<2>::constant<(2)>(); 2084 } 2085 2086 //The total number of operations is 0 2087 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask() 2088 { 2089 return simd256<4>::constant<(12)>(); 2090 } 2091 2092 //The total number of operations is 0 2093 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask() 2094 { 2095 return simd256<8>::constant<(240)>(); 2096 } 2097 2098 //The total number of operations is 0 2099 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask() 2100 { 2101 return simd256<16>::constant<(65280)>(); 2102 } 2103 2104 //The total number of operations is 0 2105 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask() 2106 { 2107 return simd256<32>::constant<-65536>(); 2108 } 2109 2110 //The total number of operations is 0 2111 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask() 2112 { 2113 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0)))); 2114 } 2115 2116 //The total number of operations is 0 2117 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask() 2118 { 2119 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0)))); 2120 } 2121 2122 //The total number of operations is 0 2123 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask() 2124 { 2125 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0)))); 2126 } 2127 964 2128 //The total number of operations is 1.0 965 2129 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2) … … 1029 2193 } 1030 2194 1031 //The total number of operations is 5.01032 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srli(bitblock256_t arg1)1033 {1034 return simd_and(simd256<32>::srli<sh>(arg1), simd256<2>::constant<((3)>>sh)>());1035 }1036 1037 //The total number of operations is 5.01038 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srli(bitblock256_t arg1)1039 {1040 return simd_and(simd256<32>::srli<sh>(arg1), simd256<4>::constant<((15)>>sh)>());1041 }1042 1043 //The total number of operations is 5.01044 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srli(bitblock256_t arg1)1045 {1046 return simd_and(simd256<32>::srli<sh>(arg1), simd256<8>::constant<((255)>>sh)>());1047 }1048 1049 //The total number of operations is 4.01050 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srli(bitblock256_t arg1)1051 {1052 return avx_general_combine256(_mm_srli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi16(avx_select_lo128(arg1), (int32_t)(sh)));1053 }1054 1055 //The total number of operations is 4.01056 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srli(bitblock256_t arg1)1057 {1058 return avx_general_combine256(_mm_srli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi32(avx_select_lo128(arg1), (int32_t)(sh)));1059 }1060 1061 //The total number of operations is 4.01062 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srli(bitblock256_t arg1)1063 {1064 return avx_general_combine256(_mm_srli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi64(avx_select_lo128(arg1), (int32_t)(sh)));1065 }1066 1067 //The total number of operations is 8.333333333331068 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srli(bitblock256_t arg1)1069 {1070 return (((sh%8) == 0) ? avx_byte_shift_right(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::srli<(sh&63)>(avx_byte_shift_right(arg1, 8)) : simd_or(simd256<64>::srli<sh>(arg1), avx_byte_shift_right(simd256<64>::slli<((128-sh)&63)>(arg1), 8))));1071 }1072 1073 //The total number of operations is 14.51074 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srli(bitblock256_t arg1)1075 {1076 return ((sh < 128) ? simd_or(simd256<128>::srli<sh>(arg1), simd256<128>::slli<((256-sh)&127)>(((bitblock256_t)(_mm256_castsi128_si256(avx_select_hi128(arg1)))))) : simd256<128>::srli<(sh&127)>(avx_move_hi128_to_lo128(arg1)));1077 }1078 1079 //The total number of operations is 1.01080 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ctz(bitblock256_t arg1)1081 {1082 return simd_not(arg1);1083 }1084 1085 //The total number of operations is 27.01086 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ctz(bitblock256_t arg1)1087 {1088 return simd256<2>::popcount(simd_andc(simd256<2>::sub(arg1, simd256<2>::constant<1>()), arg1));1089 }1090 1091 //The total number of operations is 36.01092 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ctz(bitblock256_t arg1)1093 {1094 return simd256<4>::popcount(simd_andc(simd256<4>::sub(arg1, simd256<4>::constant<1>()), arg1));1095 }1096 1097 //The total number of operations is 38.01098 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ctz(bitblock256_t arg1)1099 {1100 return simd256<8>::popcount(simd_andc(simd256<8>::sub(arg1, simd256<8>::constant<1>()), arg1));1101 }1102 1103 //The total number of operations is 48.01104 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ctz(bitblock256_t arg1)1105 {1106 return simd256<16>::popcount(simd_andc(simd256<16>::sub(arg1, simd256<16>::constant<1>()), arg1));1107 }1108 1109 //The total number of operations is 58.01110 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ctz(bitblock256_t arg1)1111 {1112 return simd256<32>::popcount(simd_andc(simd256<32>::sub(arg1, simd256<32>::constant<1>()), arg1));1113 }1114 1115 //The total number of operations is 44.01116 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ctz(bitblock256_t arg1)1117 {1118 return simd256<64>::popcount(simd_andc(simd256<64>::sub(arg1, simd256<64>::constant<1>()), arg1));1119 }1120 1121 //The total number of operations is 101.01122 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ctz(bitblock256_t arg1)1123 {1124 return simd256<128>::popcount(simd_andc(simd256<128>::sub(arg1, simd256<128>::constant<1>()), arg1));1125 }1126 1127 //The total number of operations is 192.1666666671128 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ctz(bitblock256_t arg1)1129 {1130 return simd256<256>::popcount(simd_andc(simd256<256>::sub(arg1, simd256<256>::constant<1>()), arg1));1131 }1132 1133 //The total number of operations is 1.01134 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2)1135 {1136 return simd_andc(arg1, arg2);1137 }1138 1139 //The total number of operations is 23.01140 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2)1141 {1142 bitblock256_t tmpAns = simd256<(1)>::ugt(arg1, arg2);1143 bitblock256_t mask = simd_and(tmpAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));1144 mask = simd_or(mask, simd256<2>::slli<(1)>(mask));1145 return simd_or(simd256<2>::srai<(1)>(tmpAns), mask);1146 }1147 1148 //The total number of operations is 20.01149 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ugt(bitblock256_t arg1, bitblock256_t arg2)1150 {1151 return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::ugt(simd_and(simd256<(8)>::himask(), arg1), arg2), simd256<(8)>::ugt(simd_andc(arg1, simd256<(8)>::himask()), simd_andc(arg2, simd256<(8)>::himask())));1152 }1153 1154 //The total number of operations is 7.01155 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ugt(bitblock256_t arg1, bitblock256_t arg2)1156 {1157 bitblock256_t high_bit = simd256<8>::constant<(128)>();1158 return simd256<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));1159 }1160 1161 //The total number of operations is 7.01162 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ugt(bitblock256_t arg1, bitblock256_t arg2)1163 {1164 bitblock256_t high_bit = simd256<16>::constant<(32768)>();1165 return simd256<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));1166 }1167 1168 //The total number of operations is 7.01169 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ugt(bitblock256_t arg1, bitblock256_t arg2)1170 {1171 bitblock256_t high_bit = simd256<32>::constant<(2147483648ULL)>();1172 return simd256<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));1173 }1174 1175 //The total number of operations is 7.01176 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ugt(bitblock256_t arg1, bitblock256_t arg2)1177 {1178 bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();1179 return simd256<64>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));1180 }1181 1182 //The total number of operations is 60.01183 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ugt(bitblock256_t arg1, bitblock256_t arg2)1184 {1185 bitblock256_t tmpAns = simd256<(64)>::ugt(arg1, arg2);1186 bitblock256_t mask = simd_and(tmpAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));1187 mask = simd_or(mask, simd256<128>::slli<(64)>(mask));1188 return simd_or(simd256<128>::srai<(64)>(tmpAns), mask);1189 }1190 1191 //The total number of operations is 174.1666666671192 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ugt(bitblock256_t arg1, bitblock256_t arg2)1193 {1194 bitblock256_t tmpAns = simd256<(128)>::ugt(arg1, arg2);1195 bitblock256_t mask = simd_and(tmpAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));1196 mask = simd_or(mask, simd256<256>::slli<(128)>(mask));1197 return simd_or(simd256<256>::srai<(128)>(tmpAns), mask);1198 }1199 1200 //The total number of operations is 7.01201 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::xor_hl(bitblock256_t arg1)1202 {1203 return simd_xor(simd256<2>::srli<(1)>(arg1), simd_and(arg1, simd256<2>::lomask()));1204 }1205 1206 //The total number of operations is 7.01207 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::xor_hl(bitblock256_t arg1)1208 {1209 return simd_xor(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));1210 }1211 1212 //The total number of operations is 7.01213 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::xor_hl(bitblock256_t arg1)1214 {1215 return simd_xor(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));1216 }1217 1218 //The total number of operations is 6.01219 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::xor_hl(bitblock256_t arg1)1220 {1221 return simd_xor(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));1222 }1223 1224 //The total number of operations is 6.01225 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::xor_hl(bitblock256_t arg1)1226 {1227 return simd_xor(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));1228 }1229 1230 //The total number of operations is 6.01231 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::xor_hl(bitblock256_t arg1)1232 {1233 return simd_xor(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));1234 }1235 1236 //The total number of operations is 10.33333333331237 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::xor_hl(bitblock256_t arg1)1238 {1239 return simd_xor(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));1240 }1241 1242 //The total number of operations is 16.51243 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::xor_hl(bitblock256_t arg1)1244 {1245 return simd_xor(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));1246 }1247 1248 //The total number of operations is 01249 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::popcount(bitblock256_t arg1)1250 {1251 return arg1;1252 }1253 1254 //The total number of operations is 10.01255 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::popcount(bitblock256_t arg1)1256 {1257 return simd256<2>::add_hl(simd256<(1)>::popcount(arg1));1258 }1259 1260 //The total number of operations is 21.01261 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::popcount(bitblock256_t arg1)1262 {1263 return simd256<4>::add_hl(simd256<(2)>::popcount(arg1));1264 }1265 1266 //The total number of operations is 32.01267 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::popcount(bitblock256_t arg1)1268 {1269 return simd256<8>::add_hl(simd256<(4)>::popcount(arg1));1270 }1271 1272 //The total number of operations is 42.01273 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::popcount(bitblock256_t arg1)1274 {1275 return simd256<16>::add_hl(simd256<(8)>::popcount(arg1));1276 }1277 1278 //The total number of operations is 52.01279 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::popcount(bitblock256_t arg1)1280 {1281 return simd256<32>::add_hl(simd256<(16)>::popcount(arg1));1282 }1283 1284 //The total number of operations is 38.01285 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::popcount(bitblock256_t arg1)1286 {1287 bitblock256_t tmpAns = simd256<8>::popcount(arg1);1288 return avx_general_combine256(_mm_sad_epu8(avx_select_hi128(tmpAns), _mm_set1_epi32((int32_t)(0))), _mm_sad_epu8(avx_select_lo128(tmpAns), _mm_set1_epi32((int32_t)(0))));1289 }1290 1291 //The total number of operations is 73.66666666671292 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1)1293 {1294 return simd256<128>::add_hl(simd256<(64)>::popcount(arg1));1295 }1296 1297 //The total number of operations is 115.51298 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1)1299 {1300 bitblock256_t tmpAns = simd256<(128)>::popcount(arg1);1301 return simd256<(128)>::add(simd_and(tmpAns, simd256<256>::lomask()), simd256<256>::srli<(128)>(tmpAns));1302 }1303 1304 //The total number of operations is 16.01305 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1)1306 {1307 return simd256<2>::sub(simd256<2>::constant<0>(), arg1);1308 }1309 1310 //The total number of operations is 14.01311 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1)1312 {1313 return simd256<4>::sub(simd256<4>::constant<0>(), arg1);1314 }1315 1316 //The total number of operations is 5.01317 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::neg(bitblock256_t arg1)1318 {1319 return simd256<8>::sub(simd256<8>::constant<0>(), arg1);1320 }1321 1322 //The total number of operations is 5.01323 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::neg(bitblock256_t arg1)1324 {1325 return simd256<16>::sub(simd256<16>::constant<0>(), arg1);1326 }1327 1328 //The total number of operations is 5.01329 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::neg(bitblock256_t arg1)1330 {1331 return simd256<32>::sub(simd256<32>::constant<0>(), arg1);1332 }1333 1334 //The total number of operations is 5.01335 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::neg(bitblock256_t arg1)1336 {1337 return simd256<64>::sub(simd256<64>::constant<0>(), arg1);1338 }1339 1340 //The total number of operations is 26.33333333331341 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1)1342 {1343 return simd256<128>::sub(simd256<128>::constant<0>(), arg1);1344 }1345 1346 //The total number of operations is 75.66666666671347 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1)1348 {1349 return simd256<256>::sub(simd256<256>::constant<0>(), arg1);1350 }1351 1352 //The total number of operations is 5.01353 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1)1354 {1355 return simd_and(simd256<32>::slli<sh>(arg1), simd256<2>::constant<(((3)<<sh)&(3))>());1356 }1357 1358 //The total number of operations is 5.01359 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1)1360 {1361 return simd_and(simd256<32>::slli<sh>(arg1), simd256<4>::constant<(((15)<<sh)&(15))>());1362 }1363 1364 //The total number of operations is 5.01365 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1)1366 {1367 return simd_and(simd256<32>::slli<sh>(arg1), simd256<8>::constant<(((255)<<sh)&(255))>());1368 }1369 1370 //The total number of operations is 4.01371 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1)1372 {1373 return avx_general_combine256(_mm_slli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi16(avx_select_lo128(arg1), (int32_t)(sh)));1374 }1375 1376 //The total number of operations is 4.01377 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1)1378 {1379 return avx_general_combine256(_mm_slli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi32(avx_select_lo128(arg1), (int32_t)(sh)));1380 }1381 1382 //The total number of operations is 4.01383 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1)1384 {1385 return avx_general_combine256(_mm_slli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi64(avx_select_lo128(arg1), (int32_t)(sh)));1386 }1387 1388 //The total number of operations is 8.333333333331389 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1)1390 {1391 return (((sh%8) == 0) ? avx_byte_shift_left(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::slli<(sh&63)>(avx_byte_shift_left(arg1, 8)) : simd_or(simd256<64>::slli<sh>(arg1), avx_byte_shift_left(simd256<64>::srli<((128-sh)&63)>(arg1), 8))));1392 }1393 1394 //The total number of operations is 14.01395 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1)1396 {1397 return ((sh < 128) ? simd_or(simd256<128>::slli<sh>(arg1), avx_move_lo128_to_hi128(simd256<128>::srli<((256-sh)&127)>(arg1))) : simd256<128>::slli<(sh&127)>(avx_move_lo128_to_hi128(arg1)));1398 }1399 1400 //The total number of operations is 3.01401 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)1402 {1403 return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));1404 }1405 1406 //The total number of operations is 11.01407 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)1408 {1409 return simd256<(1)>::ifh(simd256<1>::ifh(simd256<2>::himask(), arg1, simd256<2>::srli<(1)>(arg1)), arg2, arg3);1410 }1411 1412 //The total number of operations is 19.01413 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)1414 {1415 return simd256<(2)>::ifh(simd256<1>::ifh(simd256<4>::himask(), arg1, simd256<4>::srli<(2)>(arg1)), arg2, arg3);1416 }1417 1418 //The total number of operations is 8.01419 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)1420 {1421 return simd256<1>::ifh(simd256<8>::gt(simd256<8>::constant<0>(), arg1), arg2, arg3);1422 }1423 1424 //The total number of operations is 8.01425 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)1426 {1427 return simd256<1>::ifh(simd256<16>::gt(simd256<16>::constant<0>(), arg1), arg2, arg3);1428 }1429 1430 //The total number of operations is 8.01431 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)1432 {1433 return simd256<1>::ifh(simd256<32>::gt(simd256<32>::constant<0>(), arg1), arg2, arg3);1434 }1435 1436 //The total number of operations is 1.01437 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)1438 {1439 return (bitblock256_t)_mm256_blendv_pd((__m256d)(arg3), (__m256d)(arg2), (__m256d)(arg1));1440 }1441 1442 //The total number of operations is 12.33333333331443 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)1444 {1445 return simd256<(64)>::ifh(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg1)), arg2, arg3);1446 }1447 1448 //The total number of operations is 29.83333333331449 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)1450 {1451 return simd256<(128)>::ifh(simd256<1>::ifh(simd256<256>::himask(), arg1, simd256<256>::srli<(128)>(arg1)), arg2, arg3);1452 }1453 1454 //The total number of operations is 1.01455 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2)1456 {1457 return simd_xor(arg1, arg2);1458 }1459 1460 //The total number of operations is 16.01461 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::sub(bitblock256_t arg1, bitblock256_t arg2)1462 {1463 bitblock256_t ans = simd256<(1)>::sub(arg1, arg2);1464 bitblock256_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_and(simd_not(simd_xor(arg1, arg2)), ans));1465 bitblock256_t loMask = simd256<2>::lomask();1466 bitblock256_t borrow = simd256<2>::slli<1>(simd_and(borrowMask, loMask));1467 return simd256<1>::ifh(loMask, ans, simd256<(1)>::sub(ans, borrow));1468 }1469 1470 //The total number of operations is 14.01471 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::sub(bitblock256_t arg1, bitblock256_t arg2)1472 {1473 return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::sub(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::sub(arg1, arg2));1474 }1475 1476 //The total number of operations is 5.01477 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::sub(bitblock256_t arg1, bitblock256_t arg2)1478 {1479 return avx_general_combine256(_mm_sub_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));1480 }1481 1482 //The total number of operations is 5.01483 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sub(bitblock256_t arg1, bitblock256_t arg2)1484 {1485 return avx_general_combine256(_mm_sub_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));1486 }1487 1488 //The total number of operations is 5.01489 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sub(bitblock256_t arg1, bitblock256_t arg2)1490 {1491 return avx_general_combine256(_mm_sub_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));1492 }1493 1494 //The total number of operations is 5.01495 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sub(bitblock256_t arg1, bitblock256_t arg2)1496 {1497 return avx_general_combine256(_mm_sub_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));1498 }1499 1500 //The total number of operations is 26.33333333331501 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::sub(bitblock256_t arg1, bitblock256_t arg2)1502 {1503 bitblock256_t partial = simd256<(64)>::sub(arg1, arg2);1504 bitblock256_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_andc(partial, simd_xor(arg1, arg2)));1505 bitblock256_t borrow = simd256<128>::slli<(64)>(simd256<(64)>::srli<(63)>(borrowMask));1506 return simd256<(64)>::sub(partial, borrow);1507 }1508 1509 //The total number of operations is 75.66666666671510 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sub(bitblock256_t arg1, bitblock256_t arg2)1511 {1512 bitblock256_t ans = simd256<(128)>::sub(arg1, arg2);1513 bitblock256_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_and(simd_not(simd_xor(arg1, arg2)), ans));1514 bitblock256_t loMask = simd256<256>::lomask();1515 bitblock256_t borrow = simd256<256>::slli<1>(simd_and(borrowMask, loMask));1516 return simd256<1>::ifh(loMask, ans, simd256<(128)>::sub(ans, borrow));1517 }1518 1519 //The total number of operations is 10.01520 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1)1521 {1522 return simd256<16>::sub(arg1, simd_and(simd256<2>::lomask(), simd256<16>::srli<1>(arg1)));1523 }1524 1525 //The total number of operations is 11.01526 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add_hl(bitblock256_t arg1)1527 {1528 return simd256<(8)>::add(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));1529 }1530 1531 //The total number of operations is 11.01532 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add_hl(bitblock256_t arg1)1533 {1534 return simd256<(16)>::add(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));1535 }1536 1537 //The total number of operations is 10.01538 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add_hl(bitblock256_t arg1)1539 {1540 return simd256<(32)>::add(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));1541 }1542 1543 //The total number of operations is 10.01544 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add_hl(bitblock256_t arg1)1545 {1546 return simd256<(64)>::add(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));1547 }1548 1549 //The total number of operations is 10.01550 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add_hl(bitblock256_t arg1)1551 {1552 return simd256<64>::add(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));1553 }1554 1555 //The total number of operations is 35.66666666671556 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1)1557 {1558 return simd256<128>::add(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));1559 }1560 1561 //The total number of operations is 91.16666666671562 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1)1563 {1564 return simd256<256>::add(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));1565 }1566 1567 //The total number of operations is 01568 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant()1569 {1570 return simd256<32>::constant<(-1*val)>();1571 }1572 1573 //The total number of operations is 01574 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant()1575 {1576 return simd256<(4)>::constant<((val<<2)|(val&(3)))>();1577 }1578 1579 //The total number of operations is 01580 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::constant()1581 {1582 return simd256<(8)>::constant<((val<<4)|(val&(15)))>();1583 }1584 1585 //The total number of operations is 01586 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::constant()1587 {1588 return (bitblock256_t)_mm256_set1_epi8((int32_t)(val));1589 }1590 1591 //The total number of operations is 01592 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::constant()1593 {1594 return (bitblock256_t)_mm256_set1_epi16((int32_t)(val));1595 }1596 1597 //The total number of operations is 01598 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::constant()1599 {1600 return (bitblock256_t)_mm256_set1_epi32((int32_t)(val));1601 }1602 1603 //The total number of operations is 01604 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::constant()1605 {1606 return ((bitblock256_t)(_mm256_set_epi32((int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val))));1607 }1608 1609 //The total number of operations is 01610 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::constant()1611 {1612 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val), (int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val))));1613 }1614 1615 //The total number of operations is 01616 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::constant()1617 {1618 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val))));1619 }1620 1621 //The total number of operations is 1.01622 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::min(bitblock256_t arg1, bitblock256_t arg2)1623 {1624 return simd_or(arg1, arg2);1625 }1626 1627 //The total number of operations is 25.01628 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::min(bitblock256_t arg1, bitblock256_t arg2)1629 {1630 bitblock256_t hiAns = simd256<(1)>::min(arg1, arg2);1631 bitblock256_t loAns = simd256<(1)>::umin(arg1, arg2);1632 bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(hiAns, arg1));1633 bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(hiAns, arg2));1634 return simd256<1>::ifh(simd256<2>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2));1635 }1636 1637 //The total number of operations is 17.01638 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::min(bitblock256_t arg1, bitblock256_t arg2)1639 {1640 bitblock256_t high_bit = simd256<4>::constant<(8)>();1641 return simd_xor(simd256<4>::umin(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);1642 }1643 1644 //The total number of operations is 5.01645 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::min(bitblock256_t arg1, bitblock256_t arg2)1646 {1647 return avx_general_combine256(_mm_min_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));1648 }1649 1650 //The total number of operations is 5.01651 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::min(bitblock256_t arg1, bitblock256_t arg2)1652 {1653 return avx_general_combine256(_mm_min_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));1654 }1655 1656 //The total number of operations is 5.01657 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::min(bitblock256_t arg1, bitblock256_t arg2)1658 {1659 return avx_general_combine256(_mm_min_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));1660 }1661 1662 //The total number of operations is 8.01663 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::min(bitblock256_t arg1, bitblock256_t arg2)1664 {1665 return simd256<1>::ifh(simd256<64>::gt(arg1, arg2), arg2, arg1);1666 }1667 1668 //The total number of operations is 54.66666666671669 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::min(bitblock256_t arg1, bitblock256_t arg2)1670 {1671 bitblock256_t hiAns = simd256<(64)>::min(arg1, arg2);1672 bitblock256_t loAns = simd256<(64)>::umin(arg1, arg2);1673 bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(hiAns, arg1));1674 bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(hiAns, arg2));1675 return simd256<1>::ifh(simd256<128>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2));1676 }1677 1678 //The total number of operations is 186.6666666671679 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::min(bitblock256_t arg1, bitblock256_t arg2)1680 {1681 bitblock256_t hiAns = simd256<(128)>::min(arg1, arg2);1682 bitblock256_t loAns = simd256<(128)>::umin(arg1, arg2);1683 bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(hiAns, arg1));1684 bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(hiAns, arg2));1685 return simd256<1>::ifh(simd256<256>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2));1686 }1687 1688 //The total number of operations is 01689 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()1690 {1691 return simd256<2>::constant<(1)>();1692 }1693 1694 //The total number of operations is 01695 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()1696 {1697 return simd256<4>::constant<(3)>();1698 }1699 1700 //The total number of operations is 01701 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()1702 {1703 return simd256<8>::constant<(15)>();1704 }1705 1706 //The total number of operations is 01707 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()1708 {1709 return simd256<16>::constant<(255)>();1710 }1711 1712 //The total number of operations is 01713 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()1714 {1715 return simd256<32>::constant<(65535)>();1716 }1717 1718 //The total number of operations is 01719 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()1720 {1721 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1))));1722 }1723 1724 //The total number of operations is 01725 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()1726 {1727 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1))));1728 }1729 1730 //The total number of operations is 01731 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()1732 {1733 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1))));1734 }1735 1736 //The total number of operations is 1.01737 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)1738 {1739 return simd_and(arg1, arg2);1740 }1741 1742 //The total number of operations is 24.01743 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2)1744 {1745 bitblock256_t tmpAns = simd256<(1)>::umin(arg1, arg2);1746 bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));1747 bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));1748 return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));1749 }1750 1751 //The total number of operations is 14.01752 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2)1753 {1754 return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umin(arg1, arg2)), simd256<(8)>::umin(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));1755 }1756 1757 //The total number of operations is 5.01758 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2)1759 {1760 return avx_general_combine256(_mm_min_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));1761 }1762 1763 //The total number of operations is 5.01764 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2)1765 {1766 return avx_general_combine256(_mm_min_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));1767 }1768 1769 //The total number of operations is 5.01770 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2)1771 {1772 return avx_general_combine256(_mm_min_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));1773 }1774 1775 //The total number of operations is 11.01776 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2)1777 {1778 bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();1779 return simd_xor(simd256<64>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);1780 }1781 1782 //The total number of operations is 46.66666666671783 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2)1784 {1785 bitblock256_t tmpAns = simd256<(64)>::umin(arg1, arg2);1786 bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));1787 bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));1788 return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));1789 }1790 1791 //The total number of operations is 132.01792 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2)1793 {1794 bitblock256_t tmpAns = simd256<(128)>::umin(arg1, arg2);1795 bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));1796 bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));1797 return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));1798 }1799 1800 //The total number of operations is 19.01801 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)1802 {1803 return simd256<1>::ifh(simd256<2>::himask(), simd_and(arg1, simd256<256>::slli<1>(simd_not(arg1))), arg1);1804 }1805 1806 //The total number of operations is 39.01807 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1)1808 {1809 bitblock256_t gtMask = simd256<4>::gt(arg1, simd256<4>::constant<0>());1810 return simd256<1>::ifh(gtMask, arg1, simd256<4>::sub(gtMask, arg1));1811 }1812 1813 //The total number of operations is 4.01814 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1)1815 {1816 return avx_general_combine256(_mm_abs_epi8(avx_select_hi128(arg1)), _mm_abs_epi8(avx_select_lo128(arg1)));1817 }1818 1819 //The total number of operations is 4.01820 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1)1821 {1822 return avx_general_combine256(_mm_abs_epi16(avx_select_hi128(arg1)), _mm_abs_epi16(avx_select_lo128(arg1)));1823 }1824 1825 //The total number of operations is 4.01826 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1)1827 {1828 return avx_general_combine256(_mm_abs_epi32(avx_select_hi128(arg1)), _mm_abs_epi32(avx_select_lo128(arg1)));1829 }1830 1831 //The total number of operations is 13.01832 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1)1833 {1834 bitblock256_t gtMask = simd256<64>::gt(arg1, simd256<64>::constant<0>());1835 return simd256<1>::ifh(gtMask, arg1, simd256<64>::sub(gtMask, arg1));1836 }1837 1838 //The total number of operations is 69.01839 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1)1840 {1841 bitblock256_t eqMask = simd256<128>::eq(simd256<1>::ifh(simd256<128>::himask(), simd256<(64)>::abs(arg1), arg1), arg1);1842 return simd256<1>::ifh(eqMask, arg1, simd256<128>::sub(eqMask, arg1));1843 }1844 1845 //The total number of operations is 204.8333333331846 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1)1847 {1848 bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1);1849 return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1));1850 }1851 1852 //The total number of operations is 2.01853 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)1854 {1855 return simd_not(simd_xor(arg1, arg2));1856 }1857 1858 //The total number of operations is 14.01859 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)1860 {1861 bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);1862 bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));1863 bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);1864 return simd_or(loMask, hiMask);1865 }1866 1867 //The total number of operations is 17.01868 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)1869 {1870 return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));1871 }1872 1873 //The total number of operations is 5.01874 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)1875 {1876 return avx_general_combine256(_mm_cmpeq_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));1877 }1878 1879 //The total number of operations is 5.01880 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)1881 {1882 return avx_general_combine256(_mm_cmpeq_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));1883 }1884 1885 //The total number of operations is 5.01886 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)1887 {1888 return avx_general_combine256(_mm_cmpeq_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));1889 }1890 1891 //The total number of operations is 5.01892 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)1893 {1894 return avx_general_combine256(_mm_cmpeq_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));1895 }1896 1897 //The total number of operations is 23.66666666671898 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)1899 {1900 bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);1901 bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));1902 bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);1903 return simd_or(loMask, hiMask);1904 }1905 1906 //The total number of operations is 54.16666666671907 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)1908 {1909 bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);1910 bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));1911 bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);1912 return simd_or(loMask, hiMask);1913 }1914 1915 //The total number of operations is 7.01916 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)1917 {1918 return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));1919 }1920 1921 //The total number of operations is 17.51922 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)1923 {1924 return simd_or(simd_and(simd256<4>::himask(), simd256<(2)>::srai<((sh < (2)) ? sh : (2))>(arg1)), ((sh <= (2)) ? simd256<4>::srli<sh>(arg1) : simd256<(2)>::srai<(sh-(2))>(simd256<4>::srli<(2)>(arg1))));1925 }1926 1927 //The total number of operations is 12.01928 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)1929 {1930 bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);1931 return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));1932 }1933 1934 //The total number of operations is 4.01935 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)1936 {1937 return avx_general_combine256(_mm_srai_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi16(avx_select_lo128(arg1), (int32_t)(sh)));1938 }1939 1940 //The total number of operations is 4.01941 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)1942 {1943 return avx_general_combine256(_mm_srai_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi32(avx_select_lo128(arg1), (int32_t)(sh)));1944 }1945 1946 //The total number of operations is 12.01947 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)1948 {1949 return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));1950 }1951 1952 //The total number of operations is 28.33333333331953 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)1954 {1955 return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));1956 }1957 1958 //The total number of operations is 59.01959 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)1960 {1961 return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));1962 }1963 1964 //The total number of operations is 01965 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask()1966 {1967 return simd256<2>::constant<(2)>();1968 }1969 1970 //The total number of operations is 01971 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask()1972 {1973 return simd256<4>::constant<(12)>();1974 }1975 1976 //The total number of operations is 01977 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask()1978 {1979 return simd256<8>::constant<(240)>();1980 }1981 1982 //The total number of operations is 01983 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask()1984 {1985 return simd256<16>::constant<(65280)>();1986 }1987 1988 //The total number of operations is 01989 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask()1990 {1991 return simd256<32>::constant<-65536>();1992 }1993 1994 //The total number of operations is 01995 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask()1996 {1997 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0))));1998 }1999 2000 //The total number of operations is 02001 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask()2002 {2003 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0))));2004 }2005 2006 //The total number of operations is 02007 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask()2008 {2009 return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));2010 }2011 2012 //The total number of operations is 1.02013 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::add(bitblock256_t arg1, bitblock256_t arg2)2014 {2015 return simd_xor(arg1, arg2);2016 }2017 2018 //The total number of operations is 16.02019 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add(bitblock256_t arg1, bitblock256_t arg2)2020 {2021 bitblock256_t ans = simd256<(1)>::add(arg1, arg2);2022 bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_and(simd_xor(arg1, arg2), simd_not(ans)));2023 bitblock256_t loMask = simd256<2>::lomask();2024 bitblock256_t carry = simd256<2>::slli<1>(simd_and(carryMask, loMask));2025 return simd256<1>::ifh(loMask, ans, simd256<(1)>::add(ans, carry));2026 }2027 2028 //The total number of operations is 14.02029 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add(bitblock256_t arg1, bitblock256_t arg2)2030 {2031 return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::add(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::add(arg1, arg2));2032 }2033 2034 //The total number of operations is 5.02035 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add(bitblock256_t arg1, bitblock256_t arg2)2036 {2037 return avx_general_combine256(_mm_add_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));2038 }2039 2040 //The total number of operations is 5.02041 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add(bitblock256_t arg1, bitblock256_t arg2)2042 {2043 return avx_general_combine256(_mm_add_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));2044 }2045 2046 //The total number of operations is 5.02047 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add(bitblock256_t arg1, bitblock256_t arg2)2048 {2049 return avx_general_combine256(_mm_add_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));2050 }2051 2052 //The total number of operations is 5.02053 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add(bitblock256_t arg1, bitblock256_t arg2)2054 {2055 return avx_general_combine256(_mm_add_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));2056 }2057 2058 //The total number of operations is 26.33333333332059 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2)2060 {2061 bitblock256_t partial = simd256<(64)>::add(arg1, arg2);2062 bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));2063 bitblock256_t carry = simd256<128>::slli<(64)>(simd256<(64)>::srli<(63)>(carryMask));2064 return simd256<(64)>::add(partial, carry);2065 }2066 2067 //The total number of operations is 75.66666666672068 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2)2069 {2070 bitblock256_t ans = simd256<(128)>::add(arg1, arg2);2071 bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_and(simd_xor(arg1, arg2), simd_not(ans)));2072 bitblock256_t loMask = simd256<256>::lomask();2073 bitblock256_t carry = simd256<256>::slli<1>(simd_and(carryMask, loMask));2074 return simd256<1>::ifh(loMask, ans, simd256<(128)>::add(ans, carry));2075 }2076 2077 2195 //The total number of operations is 1.0 2078 2196 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2) … … 2296 2414 2297 2415 //The total number of operations is 3.0 2298 template <> IDISA_ALWAYS_INLINE uint64_thsimd256<8>::signmask(bitblock256_t arg1)2416 template <> IDISA_ALWAYS_INLINE FieldType<256/8>::T hsimd256<8>::signmask(bitblock256_t arg1) 2299 2417 { 2300 2418 return ((((uint64_t)(_mm_movemask_epi8(((__m128i)(avx_select_hi128(arg1))))))<<16)|((uint64_t)(_mm_movemask_epi8(((__m128i)(avx_select_lo128(arg1))))))); … … 2302 2420 2303 2421 //The total number of operations is 8.0 2304 template <> IDISA_ALWAYS_INLINE uint64_thsimd256<16>::signmask(bitblock256_t arg1)2422 template <> IDISA_ALWAYS_INLINE FieldType<256/16>::T hsimd256<16>::signmask(bitblock256_t arg1) 2305 2423 { 2306 2424 return hsimd256<(8)>::signmask(hsimd256<16>::packss(simd256<16>::constant<0>(), arg1)); … … 2308 2426 2309 2427 //The total number of operations is 13.0 2310 template <> IDISA_ALWAYS_INLINE uint64_thsimd256<32>::signmask(bitblock256_t arg1)2428 template <> IDISA_ALWAYS_INLINE FieldType<256/32>::T hsimd256<32>::signmask(bitblock256_t arg1) 2311 2429 { 2312 2430 return hsimd256<(16)>::signmask(hsimd256<32>::packss(simd256<32>::constant<0>(), arg1)); … … 2314 2432 2315 2433 //The total number of operations is 104.0 2316 template <> IDISA_ALWAYS_INLINE uint64_thsimd256<64>::signmask(bitblock256_t arg1)2434 template <> IDISA_ALWAYS_INLINE FieldType<256/64>::T hsimd256<64>::signmask(bitblock256_t arg1) 2317 2435 { 2318 2436 return hsimd256<(32)>::signmask(hsimd256<64>::packh(simd256<64>::constant<0>(), arg1)); … … 2320 2438 2321 2439 //The total number of operations is 248.666666667 2322 template <> IDISA_ALWAYS_INLINE uint64_thsimd256<128>::signmask(bitblock256_t arg1)2440 template <> IDISA_ALWAYS_INLINE FieldType<256/128>::T hsimd256<128>::signmask(bitblock256_t arg1) 2323 2441 { 2324 2442 return hsimd256<(64)>::signmask(hsimd256<128>::packh(simd256<128>::constant<0>(), arg1)); … … 2326 2444 2327 2445 //The total number of operations is 266.166666667 2328 template <> IDISA_ALWAYS_INLINE uint64_thsimd256<256>::signmask(bitblock256_t arg1)2446 template <> IDISA_ALWAYS_INLINE FieldType<256/256>::T hsimd256<256>::signmask(bitblock256_t arg1) 2329 2447 { 2330 2448 return hsimd256<(128)>::signmask(hsimd256<256>::packh(simd256<256>::constant<0>(), arg1)); … … 2849 2967 2850 2968 //The total number of operations is 29.5 2851 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dsrli(bitblock256_t arg1, bitblock256_t arg2)2969 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dsrli(bitblock256_t arg1, bitblock256_t arg2) 2852 2970 { 2853 2971 return simd_or(mvmd256<2>::srli<sh>(arg1), mvmd256<2>::slli<((128)-sh)>(arg2)); … … 2855 2973 2856 2974 //The total number of operations is 29.5 2857 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dsrli(bitblock256_t arg1, bitblock256_t arg2)2975 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dsrli(bitblock256_t arg1, bitblock256_t arg2) 2858 2976 { 2859 2977 return simd_or(mvmd256<4>::srli<sh>(arg1), mvmd256<4>::slli<((64)-sh)>(arg2)); … … 2861 2979 2862 2980 //The total number of operations is 29.5 2863 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dsrli(bitblock256_t arg1, bitblock256_t arg2)2981 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dsrli(bitblock256_t arg1, bitblock256_t arg2) 2864 2982 { 2865 2983 return simd_or(mvmd256<8>::srli<sh>(arg1), mvmd256<8>::slli<((32)-sh)>(arg2)); … … 2867 2985 2868 2986 //The total number of operations is 29.5 2869 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dsrli(bitblock256_t arg1, bitblock256_t arg2)2987 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dsrli(bitblock256_t arg1, bitblock256_t arg2) 2870 2988 { 2871 2989 return simd_or(mvmd256<16>::srli<sh>(arg1), mvmd256<16>::slli<((16)-sh)>(arg2)); … … 2873 2991 2874 2992 //The total number of operations is 29.5 2875 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dsrli(bitblock256_t arg1, bitblock256_t arg2)2993 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dsrli(bitblock256_t arg1, bitblock256_t arg2) 2876 2994 { 2877 2995 return simd_or(mvmd256<32>::srli<sh>(arg1), mvmd256<32>::slli<((8)-sh)>(arg2)); … … 2879 2997 2880 2998 //The total number of operations is 29.5 2881 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dsrli(bitblock256_t arg1, bitblock256_t arg2)2999 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dsrli(bitblock256_t arg1, bitblock256_t arg2) 2882 3000 { 2883 3001 return simd_or(mvmd256<64>::srli<sh>(arg1), mvmd256<64>::slli<((4)-sh)>(arg2)); … … 2885 3003 2886 3004 //The total number of operations is 29.5 2887 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2)3005 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2) 2888 3006 { 2889 3007 return simd_or(mvmd256<128>::srli<sh>(arg1), mvmd256<128>::slli<((2)-sh)>(arg2)); … … 2891 3009 2892 3010 //The total number of operations is 29.5 2893 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2)3011 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2) 2894 3012 { 2895 3013 return simd_or(mvmd256<256>::srli<sh>(arg1), mvmd256<256>::slli<((1)-sh)>(arg2)); … … 2897 3015 2898 3016 //The total number of operations is 1.0 2899 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill( uint64_tval1)3017 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(FieldType<1>::T val1) 2900 3018 { 2901 3019 return mvmd256<32>::fill((-1*val1)); … … 2903 3021 2904 3022 //The total number of operations is 1.0 2905 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill( uint64_tval1)3023 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill(FieldType<2>::T val1) 2906 3024 { 2907 3025 return mvmd256<(4)>::fill(((val1<<2)|val1)); … … 2909 3027 2910 3028 //The total number of operations is 1.0 2911 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill( uint64_tval1)3029 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill(FieldType<4>::T val1) 2912 3030 { 2913 3031 return mvmd256<(8)>::fill(((val1<<4)|val1)); … … 2915 3033 2916 3034 //The total number of operations is 1.0 2917 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill( uint64_tval1)3035 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill(FieldType<8>::T val1) 2918 3036 { 2919 3037 return (bitblock256_t)_mm256_set1_epi8((int32_t)(val1)); … … 2921 3039 2922 3040 //The total number of operations is 1.0 2923 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill( uint64_tval1)3041 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill(FieldType<16>::T val1) 2924 3042 { 2925 3043 return (bitblock256_t)_mm256_set1_epi16((int32_t)(val1)); … … 2927 3045 2928 3046 //The total number of operations is 1.0 2929 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill( uint64_tval1)3047 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill(FieldType<32>::T val1) 2930 3048 { 2931 3049 return (bitblock256_t)_mm256_set1_epi32((int32_t)(val1)); … … 2933 3051 2934 3052 //The total number of operations is 5.0 2935 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill( uint64_tval1)3053 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill(FieldType<64>::T val1) 2936 3054 { 2937 3055 return mvmd256<(32)>::fill2((val1>>(32)), (val1&((4294967296ULL)-1))); … … 2939 3057 2940 3058 //The total number of operations is 13.0 2941 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill( uint64_tval1)3059 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill(FieldType<128>::T val1) 2942 3060 { 2943 3061 return mvmd256<(64)>::fill2(0, val1); … … 2945 3063 2946 3064 //The total number of operations is 29.0 2947 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::fill( uint64_tval1)3065 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::fill(FieldType<256>::T val1) 2948 3066 { 2949 3067 return mvmd256<(128)>::fill2(0, val1); … … 2951 3069 2952 3070 //The total number of operations is 1.5 2953 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<1>::extract(bitblock256_t arg1)3071 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1) 2954 3072 { 2955 3073 return (((pos%2) == 0) ? (mvmd256<(2)>::extract<(pos/2)>(arg1)&(1)) : (mvmd256<(2)>::extract<(pos/2)>(arg1)>>1)); … … 2957 3075 2958 3076 //The total number of operations is 1.5 2959 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<2>::extract(bitblock256_t arg1)3077 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1) 2960 3078 { 2961 3079 return (((pos%2) == 0) ? (mvmd256<(4)>::extract<(pos/2)>(arg1)&(3)) : (mvmd256<(4)>::extract<(pos/2)>(arg1)>>2)); … … 2963 3081 2964 3082 //The total number of operations is 1.5 2965 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<4>::extract(bitblock256_t arg1)3083 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1) 2966 3084 { 2967 3085 return (((pos%2) == 0) ? (mvmd256<(8)>::extract<(pos/2)>(arg1)&(15)) : (mvmd256<(8)>::extract<(pos/2)>(arg1)>>4)); … … 2969 3087 2970 3088 //The total number of operations is 1.5 2971 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<8>::extract(bitblock256_t arg1)3089 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1) 2972 3090 { 2973 3091 return (((pos%2) == 0) ? (mvmd256<(16)>::extract<(pos/2)>(arg1)&(255)) : (mvmd256<(16)>::extract<(pos/2)>(arg1)>>8)); … … 2975 3093 2976 3094 //The total number of operations is 1.5 2977 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<16>::extract(bitblock256_t arg1)3095 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1) 2978 3096 { 2979 3097 return ((pos < 8) ? (65535&_mm_extract_epi16(avx_select_lo128(arg1), (int32_t)(pos))) : (65535&_mm_extract_epi16(avx_select_hi128(arg1), (int32_t)((pos-8))))); … … 2981 3099 2982 3100 //The total number of operations is 1.5 2983 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<32>::extract(bitblock256_t arg1)3101 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1) 2984 3102 { 2985 3103 return ((pos < 4) ? (((uint64_t)(((4294967296ULL)-1)))&_mm_extract_epi32(avx_select_lo128(arg1), (int32_t)(pos))) : (((uint64_t)(((4294967296ULL)-1)))&_mm_extract_epi32(avx_select_hi128(arg1), (int32_t)((pos-4))))); … … 2987 3105 2988 3106 //The total number of operations is 3.0 2989 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE uint64_tmvmd256<64>::extract(bitblock256_t arg1)2990 { 2991 return (( mvmd256<(32)>::extract<((2*pos)+1)>(arg1)<<(32))|mvmd256<(32)>::extract<(2*pos)>(arg1));3107 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1) 3108 { 3109 return ((((uint64_t)(mvmd256<(32)>::extract<((2*pos)+1)>(arg1)))<<(32))|mvmd256<(32)>::extract<(2*pos)>(arg1)); 2992 3110 } 2993 3111 2994 3112 //The total number of operations is 23.5 2995 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1) 2996 { 2997 bitblock256_t tmpArg = (((pos%2) == 0) ? simd256<(2)>::slli<1>(arg1) : simd256<(2)>::srli<1>(arg1)); 2998 bitblock256_t arg11 = (((pos%2) == 0) ? simd_and(simd256<(2)>::lomask(), arg1) : simd_and(simd256<(2)>::himask(), arg1)); 2999 return mvmd256<(2)>::splat<(pos/2)>(simd_or(tmpArg, arg11)); 3113 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1) 3114 { 3115 return mvmd256<(2)>::splat<(pos/2)>(simd_or((((pos%2) == 0) ? simd256<(2)>::slli<1>(arg1) : simd256<(2)>::srli<1>(arg1)), (((pos%2) == 0) ? simd_and(simd256<(2)>::lomask(), arg1) : simd_and(simd256<(2)>::himask(), arg1)))); 3000 3116 } 3001 3117 3002 3118 //The total number of operations is 16.5 3003 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1) 3004 { 3005 bitblock256_t tmpArg = (((pos%2) == 0) ? simd256<(4)>::slli<2>(arg1) : simd256<(4)>::srli<2>(arg1)); 3006 bitblock256_t arg11 = (((pos%2) == 0) ? simd_and(simd256<(4)>::lomask(), arg1) : simd_and(simd256<(4)>::himask(), arg1)); 3007 return mvmd256<(4)>::splat<(pos/2)>(simd_or(tmpArg, arg11)); 3119 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1) 3120 { 3121 return mvmd256<(4)>::splat<(pos/2)>(simd_or((((pos%2) == 0) ? simd256<(4)>::slli<2>(arg1) : simd256<(4)>::srli<2>(arg1)), (((pos%2) == 0) ? simd_and(simd256<(4)>::lomask(), arg1) : simd_and(simd256<(4)>::himask(), arg1)))); 3008 3122 } 3009 3123 3010 3124 //The total number of operations is 9.5 3011 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::splat(bitblock256_t arg1) 3012 { 3013 bitblock256_t tmpArg = (((pos%2) == 0) ? simd256<(8)>::slli<4>(arg1) : simd256<(8)>::srli<4>(arg1)); 3014 bitblock256_t arg11 = (((pos%2) == 0) ? simd_and(simd256<(8)>::lomask(), arg1) : simd_and(simd256<(8)>::himask(), arg1)); 3015 return mvmd256<(8)>::splat<(pos/2)>(simd_or(tmpArg, arg11)); 3125 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::splat(bitblock256_t arg1) 3126 { 3127 return mvmd256<(8)>::splat<(pos/2)>(simd_or((((pos%2) == 0) ? simd256<(8)>::slli<4>(arg1) : simd256<(8)>::srli<4>(arg1)), (((pos%2) == 0) ? simd_and(simd256<(8)>::lomask(), arg1) : simd_and(simd256<(8)>::himask(), arg1)))); 3016 3128 } 3017 3129 3018 3130 //The total number of operations is 2.5 3019 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::splat(bitblock256_t arg1)3131 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::splat(bitblock256_t arg1) 3020 3132 { 3021 3133 return ((pos < 16) ? mvmd256<8>::fill(_mm_extract_epi8(avx_select_lo128(arg1), (int32_t)(pos))) : mvmd256<8>::fill(_mm_extract_epi8(avx_select_hi128(arg1), (int32_t)((pos-16))))); … … 3023 3135 3024 3136 //The total number of operations is 2.5 3025 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::splat(bitblock256_t arg1)3137 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::splat(bitblock256_t arg1) 3026 3138 { 3027 3139 return ((pos < 8) ? mvmd256<16>::fill(_mm_extract_epi16(avx_select_lo128(arg1), (int32_t)(pos))) : mvmd256<16>::fill(_mm_extract_epi16(avx_select_hi128(arg1), (int32_t)((pos-8))))); … … 3029 3141 3030 3142 //The total number of operations is 2.5 3031 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::splat(bitblock256_t arg1)3143 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::splat(bitblock256_t arg1) 3032 3144 { 3033 3145 return ((pos < 4) ? mvmd256<32>::fill(_mm_extract_epi32(avx_select_lo128(arg1), (int32_t)(pos))) : mvmd256<32>::fill(_mm_extract_epi32(avx_select_hi128(arg1), (int32_t)((pos-4))))); … … 3035 3147 3036 3148 //The total number of operations is 8.0 3037 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::splat(bitblock256_t arg1)3149 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::splat(bitblock256_t arg1) 3038 3150 { 3039 3151 return simd256<1>::ifh(simd256<64>::himask(), mvmd256<(32)>::splat<((2*pos)+1)>(arg1), mvmd256<(32)>::splat<(2*pos)>(arg1)); … … 3041 3153 3042 3154 //The total number of operations is 19.0 3043 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1)3155 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1) 3044 3156 { 3045 3157 return simd256<1>::ifh(simd256<128>::himask(), mvmd256<(64)>::splat<((2*pos)+1)>(arg1), mvmd256<(64)>::splat<(2*pos)>(arg1)); … … 3047 3159 3048 3160 //The total number of operations is 41.0 3049 template <> template <uint 64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1)3161 template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1) 3050 3162 { 3051 3163 return simd256<1>::ifh(simd256<256>::himask(), mvmd256<(128)>::splat<((2*pos)+1)>(arg1), mvmd256<(128)>::splat<(2*pos)>(arg1)); … … 3053 3165 3054 3166 //The total number of operations is 15.0 3055 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_tval16)3167 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16) 3056 3168 { 3057 3169 return simd_or(mvmd256<(2)>::fill16((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd256<(2)>::fill16((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)))); … … 3059 3171 3060 3172 //The total number of operations is 7.0 3061 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_tval16)3173 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16) 3062 3174 { 3063 3175 return simd_or(mvmd256<(4)>::fill16((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd256<(4)>::fill16((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)))); … … 3065 3177 3066 3178 //The total number of operations is 3.0 3067 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_tval16)3179 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16) 3068 3180 { 3069 3181 return simd_or(mvmd256<(8)>::fill16((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd256<(8)>::fill16((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)))); … … 3071 3183 3072 3184 //The total number of operations is 1.0 3073 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_tval16)3185 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16) 3074 3186 { 3075 3187 return (bitblock256_t)_mm256_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16)); … … 3077 3189 3078 3190 //The total number of operations is 5.0 3079 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_tval16)3191 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16) 3080 3192 { 3081 3193 return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<16>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd256<16>::fill8(val9, val10, val11, val12, val13, val14, val15, val16)); … … 3083 3195 3084 3196 //The total number of operations is 5.0 3085 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4)3197 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4) 3086 3198 { 3087 3199 return simd256<1>::ifh(simd256<(4)>::himask(), mvmd256<1>::fill2(val1, val2), mvmd256<1>::fill2(val3, val4)); … … 3089 3201 3090 3202 //The total number of operations is 5.0 3091 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4)3203 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4) 3092 3204 { 3093 3205 return simd256<1>::ifh(simd256<(8)>::himask(), mvmd256<2>::fill2(val1, val2), mvmd256<2>::fill2(val3, val4)); … … 3095 3207 3096 3208 //The total number of operations is 5.0 3097 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4)3209 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill4(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4) 3098 3210 { 3099 3211 return simd256<1>::ifh(simd256<(16)>::himask(), mvmd256<4>::fill2(val1, val2), mvmd256<4>::fill2(val3, val4)); … … 3101 3213 3102 3214 //The total number of operations is 5.0 3103 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4)3215 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill4(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4) 3104 3216 { 3105 3217 return simd256<1>::ifh(simd256<(32)>::himask(), mvmd256<8>::fill2(val1, val2), mvmd256<8>::fill2(val3, val4)); … … 3107 3219 3108 3220 //The total number of operations is 3.0 3109 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4)3221 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill4(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4) 3110 3222 { 3111 3223 return simd_or(mvmd256<(32)>::fill4((val1<<16), (val3<<16), (val1<<16), (val3<<16)), mvmd256<(32)>::fill4((val2&(65535)), (val4&(65535)), (val2&(65535)), (val4&(65535)))); … … 3113 3225 3114 3226 //The total number of operations is 1.0 3115 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4)3227 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill4(FieldType<32>::T val1, FieldType<32>::T val2, FieldType<32>::T val3, FieldType<32>::T val4) 3116 3228 { 3117 3229 return (bitblock256_t)_mm256_set_epi32((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4)); … … 3119 3231 3120 3232 //The total number of operations is 29.0 3121 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill4( uint64_t val1, uint64_t val2, uint64_t val3, uint64_tval4)3233 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill4(FieldType<64>::T val1, FieldType<64>::T val2, FieldType<64>::T val3, FieldType<64>::T val4) 3122 3234 { 3123 3235 return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<64>::fill2(val1, val2), mvmd256<64>::fill2(val3, val4)); … … 3125 3237 3126 3238 //The total number of operations is 14.5 3127 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::srli(bitblock256_t arg1)3239 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::srli(bitblock256_t arg1) 3128 3240 { 3129 3241 return simd256<256>::srli<(sh*2)>(arg1); … … 3131 3243 3132 3244 //The total number of operations is 14.5 3133 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::srli(bitblock256_t arg1)3245 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::srli(bitblock256_t arg1) 3134 3246 { 3135 3247 return simd256<256>::srli<(sh*4)>(arg1); … … 3137 3249 3138 3250 //The total number of operations is 14.5 3139 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::srli(bitblock256_t arg1)3251 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::srli(bitblock256_t arg1) 3140 3252 { 3141 3253 return simd256<256>::srli<(sh*8)>(arg1); … … 3143 3255 3144 3256 //The total number of operations is 14.5 3145 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::srli(bitblock256_t arg1)3257 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::srli(bitblock256_t arg1) 3146 3258 { 3147 3259 return simd256<256>::srli<(sh*16)>(arg1); … … 3149 3261 3150 3262 //The total number of operations is 14.5 3151 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::srli(bitblock256_t arg1)3263 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::srli(bitblock256_t arg1) 3152 3264 { 3153 3265 return simd256<256>::srli<(sh*32)>(arg1); … … 3155 3267 3156 3268 //The total number of operations is 14.5 3157 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1)3269 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1) 3158 3270 { 3159 3271 return simd256<256>::srli<(sh*64)>(arg1); … … 3161 3273 3162 3274 //The total number of operations is 14.5 3163 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1)3275 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1) 3164 3276 { 3165 3277 return simd256<256>::srli<(sh*128)>(arg1); … … 3167 3279 3168 3280 //The total number of operations is 14.5 3169 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1)3281 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1) 3170 3282 { 3171 3283 return simd256<256>::srli<(sh*256)>(arg1); … … 3173 3285 3174 3286 //The total number of operations is 1.0 3175 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill2( uint64_t val1, uint64_tval2)3287 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill2(FieldType<1>::T val1, FieldType<1>::T val2) 3176 3288 { 3177 3289 return mvmd256<(2)>::fill(((val1<<1)|(val2&(1)))); … … 3179 3291 3180 3292 //The total number of operations is 1.0 3181 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill2( uint64_t val1, uint64_tval2)3293 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill2(FieldType<2>::T val1, FieldType<2>::T val2) 3182 3294 { 3183 3295 return mvmd256<(4)>::fill(((val1<<2)|(val2&(3)))); … … 3185 3297 3186 3298 //The total number of operations is 1.0 3187 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill2( uint64_t val1, uint64_tval2)3299 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill2(FieldType<4>::T val1, FieldType<4>::T val2) 3188 3300 { 3189 3301 return mvmd256<(8)>::fill(((val1<<4)|(val2&(15)))); … … 3191 3303 3192 3304 //The total number of operations is 1.0 3193 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill2( uint64_t val1, uint64_tval2)3305 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill2(FieldType<8>::T val1, FieldType<8>::T val2) 3194 3306 { 3195 3307 return mvmd256<(16)>::fill(((val1<<8)|(val2&(255)))); … … 3197 3309 3198 3310 //The total number of operations is 1.0 3199 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill2( uint64_t val1, uint64_tval2)3311 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill2(FieldType<16>::T val1, FieldType<16>::T val2) 3200 3312 { 3201 3313 return mvmd256<(32)>::fill(((val1<<16)|(val2&(65535)))); … … 3203 3315 3204 3316 //The total number of operations is 5.0 3205 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill2( uint64_t val1, uint64_tval2)3317 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill2(FieldType<32>::T val1, FieldType<32>::T val2) 3206 3318 { 3207 3319 return simd256<1>::ifh(simd256<(64)>::himask(), mvmd256<32>::fill(val1), mvmd256<32>::fill(val2)); … … 3209 3321 3210 3322 //The total number of operations is 13.0 3211 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill2( uint64_t val1, uint64_tval2)3323 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::fill2(FieldType<64>::T val1, FieldType<64>::T val2) 3212 3324 { 3213 3325 return simd256<1>::ifh(simd256<(128)>::himask(), mvmd256<64>::fill(val1), mvmd256<64>::fill(val2)); … … 3215 3327 3216 3328 //The total number of operations is 29.0 3217 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill2( uint64_t val1, uint64_tval2)3329 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill2(FieldType<128>::T val1, FieldType<128>::T val2) 3218 3330 { 3219 3331 return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<128>::fill(val1), mvmd256<128>::fill(val2)); … … 3221 3333 3222 3334 //The total number of operations is 29.5 3223 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dslli(bitblock256_t arg1, bitblock256_t arg2)3335 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dslli(bitblock256_t arg1, bitblock256_t arg2) 3224 3336 { 3225 3337 return simd_or(mvmd256<2>::slli<sh>(arg1), mvmd256<2>::srli<((128)-sh)>(arg2)); … … 3227 3339 3228 3340 //The total number of operations is 29.5 3229 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dslli(bitblock256_t arg1, bitblock256_t arg2)3341 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dslli(bitblock256_t arg1, bitblock256_t arg2) 3230 3342 { 3231 3343 return simd_or(mvmd256<4>::slli<sh>(arg1), mvmd256<4>::srli<((64)-sh)>(arg2)); … … 3233 3345 3234 3346 //The total number of operations is 29.5 3235 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dslli(bitblock256_t arg1, bitblock256_t arg2)3347 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dslli(bitblock256_t arg1, bitblock256_t arg2) 3236 3348 { 3237 3349 return simd_or(mvmd256<8>::slli<sh>(arg1), mvmd256<8>::srli<((32)-sh)>(arg2)); … … 3239 3351 3240 3352 //The total number of operations is 29.5 3241 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dslli(bitblock256_t arg1, bitblock256_t arg2)3353 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dslli(bitblock256_t arg1, bitblock256_t arg2) 3242 3354 { 3243 3355 return simd_or(mvmd256<16>::slli<sh>(arg1), mvmd256<16>::srli<((16)-sh)>(arg2)); … … 3245 3357 3246 3358 //The total number of operations is 29.5 3247 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dslli(bitblock256_t arg1, bitblock256_t arg2)3359 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dslli(bitblock256_t arg1, bitblock256_t arg2) 3248 3360 { 3249 3361 return simd_or(mvmd256<32>::slli<sh>(arg1), mvmd256<32>::srli<((8)-sh)>(arg2)); … … 3251 3363 3252 3364 //The total number of operations is 29.5 3253 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2)3365 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2) 3254 3366 { 3255 3367 return simd_or(mvmd256<64>::slli<sh>(arg1), mvmd256<64>::srli<((4)-sh)>(arg2)); … … 3257 3369 3258 3370 //The total number of operations is 29.5 3259 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2)3371 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2) 3260 3372 { 3261 3373 return simd_or(mvmd256<128>::slli<sh>(arg1), mvmd256<128>::srli<((2)-sh)>(arg2)); … … 3263 3375 3264 3376 //The total number of operations is 29.5 3265 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2)3377 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2) 3266 3378 { 3267 3379 return simd_or(mvmd256<256>::slli<sh>(arg1), mvmd256<256>::srli<((1)-sh)>(arg2)); … … 3269 3381 3270 3382 //The total number of operations is 14.0 3271 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1)3383 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1) 3272 3384 { 3273 3385 return simd256<256>::slli<(sh*2)>(arg1); … … 3275 3387 3276 3388 //The total number of operations is 14.0 3277 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::slli(bitblock256_t arg1)3389 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::slli(bitblock256_t arg1) 3278 3390 { 3279 3391 return simd256<256>::slli<(sh*4)>(arg1); … … 3281 3393 3282 3394 //The total number of operations is 14.0 3283 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::slli(bitblock256_t arg1)3395 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::slli(bitblock256_t arg1) 3284 3396 { 3285 3397 return simd256<256>::slli<(sh*8)>(arg1); … … 3287 3399 3288 3400 //The total number of operations is 14.0 3289 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::slli(bitblock256_t arg1)3290 { 3291 return mvmd256<(8)>::slli<(sh*2)>(arg1);3401 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::slli(bitblock256_t arg1) 3402 { 3403 return simd256<256>::slli<(sh*16)>(arg1); 3292 3404 } 3293 3405 3294 3406 //The total number of operations is 14.0 3295 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::slli(bitblock256_t arg1)3407 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::slli(bitblock256_t arg1) 3296 3408 { 3297 3409 return simd256<256>::slli<(sh*32)>(arg1); … … 3299 3411 3300 3412 //The total number of operations is 14.0 3301 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1)3413 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1) 3302 3414 { 3303 3415 return simd256<256>::slli<(sh*64)>(arg1); … … 3305 3417 3306 3418 //The total number of operations is 14.0 3307 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1)3308 { 3309 return mvmd256<(64)>::slli<(sh*2)>(arg1);3419 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1) 3420 { 3421 return simd256<256>::slli<(sh*128)>(arg1); 3310 3422 } 3311 3423 3312 3424 //The total number of operations is 14.0 3313 template <> template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1)3425 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1) 3314 3426 { 3315 3427 return simd256<256>::slli<(sh*256)>(arg1); … … 3317 3429 3318 3430 //The total number of operations is 13.0 3319 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8)3431 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill8(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8) 3320 3432 { 3321 3433 return simd256<1>::ifh(simd256<(8)>::himask(), mvmd256<1>::fill4(val1, val2, val3, val4), mvmd256<1>::fill4(val5, val6, val7, val8)); … … 3323 3435 3324 3436 //The total number of operations is 13.0 3325 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8)3437 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill8(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8) 3326 3438 { 3327 3439 return simd256<1>::ifh(simd256<(16)>::himask(), mvmd256<2>::fill4(val1, val2, val3, val4), mvmd256<2>::fill4(val5, val6, val7, val8)); … … 3329 3441 3330 3442 //The total number of operations is 7.0 3331 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8)3443 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill8(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8) 3332 3444 { 3333 3445 return simd_or(mvmd256<(8)>::fill8((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4)), mvmd256<(8)>::fill8((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)))); … … 3335 3447 3336 3448 //The total number of operations is 3.0 3337 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8)3449 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill8(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8) 3338 3450 { 3339 3451 return simd_or(mvmd256<(16)>::fill8((val1<<8), (val3<<8), (val5<<8), (val7<<8), (val1<<8), (val3<<8), (val5<<8), (val7<<8)), mvmd256<(16)>::fill8((val2&(255)), (val4&(255)), (val6&(255)), (val8&(255)), (val2&(255)), (val4&(255)), (val6&(255)), (val8&(255)))); … … 3341 3453 3342 3454 //The total number of operations is 1.0 3343 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8)3455 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill8(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8) 3344 3456 { 3345 3457 return (bitblock256_t)_mm256_set_epi16((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8)); … … 3347 3459 3348 3460 //The total number of operations is 5.0 3349 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill8( uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_tval8)3461 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill8(FieldType<32>::T val1, FieldType<32>::T val2, FieldType<32>::T val3, FieldType<32>::T val4, FieldType<32>::T val5, FieldType<32>::T val6, FieldType<32>::T val7, FieldType<32>::T val8) 3350 3462 { 3351 3463 return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<32>::fill4(val1, val2, val3, val4), mvmd256<32>::fill4(val5, val6, val7, val8)); … … 3359 3471 3360 3472 //The total number of operations is 14.5 3361 template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::srli(bitblock256_t arg1)3473 template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::srli(bitblock256_t arg1) 3362 3474 { 3363 3475 return simd256<256>::srli<sh>(arg1); … … 3371 3483 3372 3484 //The total number of operations is 118.5 3373 IDISA_ALWAYS_INLINE uint 64_t bitblock256::popcount(bitblock256_t arg1)3485 IDISA_ALWAYS_INLINE uint16_t bitblock256::popcount(bitblock256_t arg1) 3374 3486 { 3375 3487 return mvmd256<64>::extract<0>(simd256<256>::popcount(arg1)); … … 3383 3495 3384 3496 //The total number of operations is 14.0 3385 template <uint 64_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::slli(bitblock256_t arg1)3497 template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::slli(bitblock256_t arg1) 3386 3498 { 3387 3499 return simd256<256>::slli<sh>(arg1); -
trunk/lib/idisa_cpp/idisa_sse2.cpp
r3063 r3441 15 15 typedef __m128i bitblock128_t; 16 16 17 #ifndef FIELD_TYPE 18 #define FIELD_TYPE 17 19 template <uint32_t fw> struct FieldType { 18 20 typedef int T; //default for FieldType::T is int … … 27 29 template <> struct FieldType<64> {typedef uint64_t T;}; 28 30 template <> struct FieldType<128> {typedef uint64_t T;}; 31 #endif 29 32 30 33 typedef FieldType<1>::T fw1_t;
Note: See TracChangeset
for help on using the changeset viewer.