Ignore:
Timestamp:
Sep 22, 2013, 3:09:25 PM (6 years ago)
Author:
linmengl
Message:

Add mvmd_insert to SSE and AVX. Update all SSE libraries. The hand-modified SSE2 version is saved as idisa_sse2_hand.cpp.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/idisa_cpp/idisa_avx.cpp

    r3441 r3525  
    1616
    1717typedef __m256 bitblock256_t;
    18                        
     18               
     19#ifndef FIELD_TYPE
     20#define FIELD_TYPE     
     21template <uint32_t fw> struct FieldType {
     22   typedef int T;  //default for FieldType::T is int
     23};
     24
     25template <> struct FieldType<1> {typedef uint8_t T;};
     26template <> struct FieldType<2> {typedef uint8_t T;};
     27template <> struct FieldType<4> {typedef uint8_t T;};
     28template <> struct FieldType<8> {typedef uint8_t T;};
     29template <> struct FieldType<16> {typedef uint16_t T;};
     30template <> struct FieldType<32> {typedef uint32_t T;};
     31template <> struct FieldType<64> {typedef uint64_t T;};
     32template <> struct FieldType<128> {typedef uint64_t T;};
     33template <> struct FieldType<256> {typedef uint64_t T;};
     34#endif
     35
    1936template <uint32_t fw>
    2037class simd256
     
    2845        static IDISA_ALWAYS_INLINE bitblock256_t all(bitblock256_t arg1);
    2946        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
     47        static IDISA_ALWAYS_INLINE bitblock256_t ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    3048        static IDISA_ALWAYS_INLINE bitblock256_t ctz(bitblock256_t arg1);
    3149        static IDISA_ALWAYS_INLINE bitblock256_t eq(bitblock256_t arg1, bitblock256_t arg2);
     
    3452        static IDISA_ALWAYS_INLINE bitblock256_t himask();
    3553        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
    36         static IDISA_ALWAYS_INLINE bitblock256_t ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    3754        static IDISA_ALWAYS_INLINE bitblock256_t sub(bitblock256_t arg1, bitblock256_t arg2);
    3855        static IDISA_ALWAYS_INLINE bitblock256_t add_hl(bitblock256_t arg1);
     
    4158        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock256_t constant();
    4259        static IDISA_ALWAYS_INLINE bitblock256_t min(bitblock256_t arg1, bitblock256_t arg2);
     60        static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2);
    4361        static IDISA_ALWAYS_INLINE bitblock256_t umax(bitblock256_t arg1, bitblock256_t arg2);
    4462        static IDISA_ALWAYS_INLINE bitblock256_t abs(bitblock256_t arg1);
     
    4765        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srai(bitblock256_t arg1);
    4866        static IDISA_ALWAYS_INLINE bitblock256_t lt(bitblock256_t arg1, bitblock256_t arg2);
    49         static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2);
    5067        static IDISA_ALWAYS_INLINE bitblock256_t ugt(bitblock256_t arg1, bitblock256_t arg2);
    5168};
     
    83100        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dsrli(bitblock256_t arg1, bitblock256_t arg2);
    84101        static IDISA_ALWAYS_INLINE bitblock256_t fill(typename FieldType<fw>::T val1);
    85         template <uint8_t pos> static IDISA_ALWAYS_INLINE typename FieldType<fw>::T extract(bitblock256_t arg1);
     102        template <uint16_t pos> static IDISA_ALWAYS_INLINE typename FieldType<fw>::T extract(bitblock256_t arg1);
    86103        template <uint16_t pos> static IDISA_ALWAYS_INLINE bitblock256_t splat(bitblock256_t arg1);
    87104        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
     
    99116        static IDISA_ALWAYS_INLINE bitblock256_t load_unaligned(const bitblock256_t* arg1);
    100117        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
    101         static IDISA_ALWAYS_INLINE void store_aligned(bitblock256_t arg1, bitblock256_t* arg2);
    102118        static IDISA_ALWAYS_INLINE bool all(bitblock256_t arg1);
    103119        static IDISA_ALWAYS_INLINE bool any(bitblock256_t arg1);
     
    105121        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
    106122        static IDISA_ALWAYS_INLINE bitblock256_t load_aligned(const bitblock256_t* arg1);
     123        static IDISA_ALWAYS_INLINE void store_aligned(bitblock256_t arg1, bitblock256_t* arg2);
    107124        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock256_t arg1, bitblock256_t* arg2);
    108125};
     
    111128IDISA_ALWAYS_INLINE bitblock256_t simd_nor(bitblock256_t arg1, bitblock256_t arg2);
    112129IDISA_ALWAYS_INLINE bitblock256_t simd_not(bitblock256_t arg1);
     130IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2);
    113131IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2);
    114 IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2);
    115132IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2);
    116133IDISA_ALWAYS_INLINE bitblock256_t simd_xor(bitblock256_t arg1, bitblock256_t arg2);
     
    184201template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ctz(bitblock256_t arg1);
    185202template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ctz(bitblock256_t arg1);
     203template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2);
     204template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::sub(bitblock256_t arg1, bitblock256_t arg2);
     205template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::sub(bitblock256_t arg1, bitblock256_t arg2);
     206template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::sub(bitblock256_t arg1, bitblock256_t arg2);
     207template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sub(bitblock256_t arg1, bitblock256_t arg2);
     208template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sub(bitblock256_t arg1, bitblock256_t arg2);
     209template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sub(bitblock256_t arg1, bitblock256_t arg2);
     210template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::sub(bitblock256_t arg1, bitblock256_t arg2);
     211template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sub(bitblock256_t arg1, bitblock256_t arg2);
    186212template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2);
    187213template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2);
     
    243269template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    244270template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    245 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2);
    246 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::sub(bitblock256_t arg1, bitblock256_t arg2);
    247 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::sub(bitblock256_t arg1, bitblock256_t arg2);
    248 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::sub(bitblock256_t arg1, bitblock256_t arg2);
    249 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sub(bitblock256_t arg1, bitblock256_t arg2);
    250 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sub(bitblock256_t arg1, bitblock256_t arg2);
    251 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sub(bitblock256_t arg1, bitblock256_t arg2);
    252 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::sub(bitblock256_t arg1, bitblock256_t arg2);
    253 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sub(bitblock256_t arg1, bitblock256_t arg2);
     271template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
     272template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
     273template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
     274template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
     275template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
     276template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
     277template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
     278template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
    254279template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1);
    255280template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add_hl(bitblock256_t arg1);
     
    260285template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1);
    261286template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1);
     287template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
     288template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
     289template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
     290template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
     291template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
     292template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
     293template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
     294template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
    262295template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant();
    263296template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant();
     
    278311template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::min(bitblock256_t arg1, bitblock256_t arg2);
    279312template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::min(bitblock256_t arg1, bitblock256_t arg2);
    280 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
    281 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
    282 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
    283 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
    284 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
    285 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
    286 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
    287 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
     313template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2);
     314template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2);
     315template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2);
     316template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2);
     317template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2);
     318template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2);
     319template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2);
     320template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2);
     321template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2);
     322template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
     323template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
     324template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
     325template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
     326template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
     327template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
     328template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
     329template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
     330template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
     331template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2);
     332template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2);
     333template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2);
     334template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2);
     335template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2);
     336template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2);
     337template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2);
     338template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2);
     339template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2);
     340template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2);
     341template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2);
     342template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2);
     343template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2);
     344template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2);
     345template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2);
     346template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2);
     347template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2);
     348template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2);
     349template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask();
     350template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask();
     351template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask();
     352template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask();
     353template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask();
     354template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask();
     355template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask();
     356template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask();
    288357template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::add(bitblock256_t arg1, bitblock256_t arg2);
    289358template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add(bitblock256_t arg1, bitblock256_t arg2);
     
    295364template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2);
    296365template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2);
    297 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2);
    298 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2);
    299 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2);
    300 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2);
    301 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2);
    302 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2);
    303 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2);
    304 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2);
    305 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2);
    306366template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1);
    307367template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1);
     
    312372template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1);
    313373template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1);
    314 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2);
    315 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2);
    316 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2);
    317 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2);
    318 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2);
    319 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2);
    320 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2);
    321 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2);
    322 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2);
    323 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
    324 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
    325 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
    326 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
    327 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
    328 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
    329 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
    330 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
    331 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask();
    332 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask();
    333 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask();
    334 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask();
    335 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask();
    336 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask();
    337 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask();
    338 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask();
    339 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2);
    340 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2);
    341 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2);
    342 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2);
    343 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2);
    344 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2);
    345 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2);
    346 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2);
    347 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2);
    348 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
    349 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
    350 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
    351 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
    352 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
    353 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
    354 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
    355 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
    356 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
    357374template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
    358375template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
     
    473490template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
    474491template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
     492template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16);
     493template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16);
     494template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16);
     495template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16);
     496template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16);
    475497template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(FieldType<1>::T val1);
    476498template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill(FieldType<2>::T val1);
     
    482504template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill(FieldType<128>::T val1);
    483505template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::fill(FieldType<256>::T val1);
    484 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1);
    485 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1);
    486 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1);
    487 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1);
    488 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1);
    489 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1);
    490 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1);
     506template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1);
     507template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1);
     508template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1);
     509template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1);
     510template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1);
     511template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1);
     512template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1);
    491513template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1);
    492514template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1);
     
    498520template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1);
    499521template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1);
    500 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16);
    501 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16);
    502 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16);
    503 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16);
    504 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16);
    505522template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4);
    506523template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4);
     
    584601
    585602//The total number of operations is 1.0
     603IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2)
     604{
     605        return _mm256_or_ps(arg1, arg2);
     606}
     607
     608//The total number of operations is 1.0
    586609IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2)
    587610{
    588611        return _mm256_andnot_ps(arg2, arg1);
    589 }
    590 
    591 //The total number of operations is 1.0
    592 IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2)
    593 {
    594         return _mm256_or_ps(arg1, arg2);
    595612}
    596613
     
    11331150
    11341151//The total number of operations is 1.0
    1135 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1136 {
    1137         return simd_andc(arg1, arg2);
    1138 }
    1139 
    1140 //The total number of operations is 23.0
    1141 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1142 {
    1143         bitblock256_t tmpAns = simd256<(1)>::ugt(arg1, arg2);
    1144         bitblock256_t mask = simd_and(tmpAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
    1145         mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
    1146         return simd_or(simd256<2>::srai<(1)>(tmpAns), mask);
    1147 }
    1148 
    1149 //The total number of operations is 20.0
    1150 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1151 {
    1152         return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::ugt(simd_and(simd256<(8)>::himask(), arg1), arg2), simd256<(8)>::ugt(simd_andc(arg1, simd256<(8)>::himask()), simd_andc(arg2, simd256<(8)>::himask())));
    1153 }
    1154 
    1155 //The total number of operations is 7.0
    1156 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1157 {
    1158         bitblock256_t high_bit = simd256<8>::constant<(128)>();
    1159         return simd256<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1160 }
    1161 
    1162 //The total number of operations is 7.0
    1163 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1164 {
    1165         bitblock256_t high_bit = simd256<16>::constant<(32768)>();
    1166         return simd256<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1167 }
    1168 
    1169 //The total number of operations is 7.0
    1170 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1171 {
    1172         bitblock256_t high_bit = simd256<32>::constant<(2147483648ULL)>();
    1173         return simd256<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1174 }
    1175 
    1176 //The total number of operations is 7.0
    1177 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1178 {
    1179         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
    1180         return simd256<64>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1181 }
    1182 
    1183 //The total number of operations is 60.0
    1184 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1185 {
    1186         bitblock256_t tmpAns = simd256<(64)>::ugt(arg1, arg2);
    1187         bitblock256_t mask = simd_and(tmpAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
    1188         mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
    1189         return simd_or(simd256<128>::srai<(64)>(tmpAns), mask);
    1190 }
    1191 
    1192 //The total number of operations is 174.166666667
    1193 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1194 {
    1195         bitblock256_t tmpAns = simd256<(128)>::ugt(arg1, arg2);
    1196         bitblock256_t mask = simd_and(tmpAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
    1197         mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
    1198         return simd_or(simd256<256>::srai<(128)>(tmpAns), mask);
    1199 }
    1200 
    1201 //The total number of operations is 7.0
    1202 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::xor_hl(bitblock256_t arg1)
    1203 {
    1204         return simd_xor(simd256<2>::srli<(1)>(arg1), simd_and(arg1, simd256<2>::lomask()));
    1205 }
    1206 
    1207 //The total number of operations is 7.0
    1208 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::xor_hl(bitblock256_t arg1)
    1209 {
    1210         return simd_xor(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));
    1211 }
    1212 
    1213 //The total number of operations is 7.0
    1214 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::xor_hl(bitblock256_t arg1)
    1215 {
    1216         return simd_xor(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));
    1217 }
    1218 
    1219 //The total number of operations is 6.0
    1220 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::xor_hl(bitblock256_t arg1)
    1221 {
    1222         return simd_xor(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));
    1223 }
    1224 
    1225 //The total number of operations is 6.0
    1226 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::xor_hl(bitblock256_t arg1)
    1227 {
    1228         return simd_xor(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));
    1229 }
    1230 
    1231 //The total number of operations is 6.0
    1232 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::xor_hl(bitblock256_t arg1)
    1233 {
    1234         return simd_xor(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));
    1235 }
    1236 
    1237 //The total number of operations is 10.3333333333
    1238 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::xor_hl(bitblock256_t arg1)
    1239 {
    1240         return simd_xor(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));
    1241 }
    1242 
    1243 //The total number of operations is 16.5
    1244 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::xor_hl(bitblock256_t arg1)
    1245 {
    1246         return simd_xor(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
    1247 }
    1248 
    1249 //The total number of operations is 0
    1250 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::popcount(bitblock256_t arg1)
    1251 {
    1252         return arg1;
    1253 }
    1254 
    1255 //The total number of operations is 10.0
    1256 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::popcount(bitblock256_t arg1)
    1257 {
    1258         return simd256<2>::add_hl(simd256<(1)>::popcount(arg1));
    1259 }
    1260 
    1261 //The total number of operations is 21.0
    1262 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::popcount(bitblock256_t arg1)
    1263 {
    1264         return simd256<4>::add_hl(simd256<(2)>::popcount(arg1));
    1265 }
    1266 
    1267 //The total number of operations is 32.0
    1268 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::popcount(bitblock256_t arg1)
    1269 {
    1270         return simd256<8>::add_hl(simd256<(4)>::popcount(arg1));
    1271 }
    1272 
    1273 //The total number of operations is 42.0
    1274 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::popcount(bitblock256_t arg1)
    1275 {
    1276         return simd256<16>::add_hl(simd256<(8)>::popcount(arg1));
    1277 }
    1278 
    1279 //The total number of operations is 52.0
    1280 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::popcount(bitblock256_t arg1)
    1281 {
    1282         return simd256<32>::add_hl(simd256<(16)>::popcount(arg1));
    1283 }
    1284 
    1285 //The total number of operations is 38.0
    1286 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::popcount(bitblock256_t arg1)
    1287 {
    1288         bitblock256_t tmpAns = simd256<8>::popcount(arg1);
    1289         return avx_general_combine256(_mm_sad_epu8(avx_select_hi128(tmpAns), _mm_set1_epi32((int32_t)(0))), _mm_sad_epu8(avx_select_lo128(tmpAns), _mm_set1_epi32((int32_t)(0))));
    1290 }
    1291 
    1292 //The total number of operations is 73.6666666667
    1293 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1)
    1294 {
    1295         return simd256<128>::add_hl(simd256<(64)>::popcount(arg1));
    1296 }
    1297 
    1298 //The total number of operations is 115.5
    1299 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1)
    1300 {
    1301         bitblock256_t tmpAns = simd256<(128)>::popcount(arg1);
    1302         return simd256<(128)>::add(simd_and(tmpAns, simd256<256>::lomask()), simd256<256>::srli<(128)>(tmpAns));
    1303 }
    1304 
    1305 //The total number of operations is 14.0
    1306 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::any(bitblock256_t arg1)
    1307 {
    1308         bitblock256_t t0 = simd256<2>::srli<1>(arg1);
    1309         bitblock256_t f0 = simd_or(t0, simd_and(arg1, simd_xor(t0, simd256<8>::constant<255>())));
    1310         return simd_or(f0, simd256<2>::slli<1>(f0));
    1311 }
    1312 
    1313 //The total number of operations is 20.0
    1314 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::any(bitblock256_t arg1)
    1315 {
    1316         return simd256<4>::ugt(arg1, simd256<8>::constant<0>());
    1317 }
    1318 
    1319 //The total number of operations is 7.0
    1320 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::any(bitblock256_t arg1)
    1321 {
    1322         return simd256<8>::ugt(arg1, simd256<8>::constant<0>());
    1323 }
    1324 
    1325 //The total number of operations is 7.0
    1326 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::any(bitblock256_t arg1)
    1327 {
    1328         return simd256<16>::ugt(arg1, simd256<8>::constant<0>());
    1329 }
    1330 
    1331 //The total number of operations is 7.0
    1332 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::any(bitblock256_t arg1)
    1333 {
    1334         return simd256<32>::ugt(arg1, simd256<8>::constant<0>());
    1335 }
    1336 
    1337 //The total number of operations is 7.0
    1338 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::any(bitblock256_t arg1)
    1339 {
    1340         return simd256<64>::ugt(arg1, simd256<8>::constant<0>());
    1341 }
    1342 
    1343 //The total number of operations is 60.0
    1344 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::any(bitblock256_t arg1)
    1345 {
    1346         return simd256<128>::ugt(arg1, simd256<8>::constant<0>());
    1347 }
    1348 
    1349 //The total number of operations is 1.0
    1350 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::any(bitblock256_t arg1)
    1351 {
    1352         return ((bitblock256::any(arg1)) ? simd256<8>::constant<255>() : simd256<8>::constant<0>());
    1353 }
    1354 
    1355 //The total number of operations is 16.0
    1356 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1)
    1357 {
    1358         return simd256<2>::sub(simd256<2>::constant<0>(), arg1);
    1359 }
    1360 
    1361 //The total number of operations is 14.0
    1362 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1)
    1363 {
    1364         return simd256<4>::sub(simd256<4>::constant<0>(), arg1);
    1365 }
    1366 
    1367 //The total number of operations is 5.0
    1368 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::neg(bitblock256_t arg1)
    1369 {
    1370         return simd256<8>::sub(simd256<8>::constant<0>(), arg1);
    1371 }
    1372 
    1373 //The total number of operations is 5.0
    1374 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::neg(bitblock256_t arg1)
    1375 {
    1376         return simd256<16>::sub(simd256<16>::constant<0>(), arg1);
    1377 }
    1378 
    1379 //The total number of operations is 5.0
    1380 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::neg(bitblock256_t arg1)
    1381 {
    1382         return simd256<32>::sub(simd256<32>::constant<0>(), arg1);
    1383 }
    1384 
    1385 //The total number of operations is 5.0
    1386 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::neg(bitblock256_t arg1)
    1387 {
    1388         return simd256<64>::sub(simd256<64>::constant<0>(), arg1);
    1389 }
    1390 
    1391 //The total number of operations is 26.3333333333
    1392 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1)
    1393 {
    1394         return simd256<128>::sub(simd256<128>::constant<0>(), arg1);
    1395 }
    1396 
    1397 //The total number of operations is 75.6666666667
    1398 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1)
    1399 {
    1400         return simd256<256>::sub(simd256<256>::constant<0>(), arg1);
    1401 }
    1402 
    1403 //The total number of operations is 5.0
    1404 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1)
    1405 {
    1406         return simd_and(simd256<32>::slli<sh>(arg1), simd256<2>::constant<(((3)<<sh)&(3))>());
    1407 }
    1408 
    1409 //The total number of operations is 5.0
    1410 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1)
    1411 {
    1412         return simd_and(simd256<32>::slli<sh>(arg1), simd256<4>::constant<(((15)<<sh)&(15))>());
    1413 }
    1414 
    1415 //The total number of operations is 5.0
    1416 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1)
    1417 {
    1418         return simd_and(simd256<32>::slli<sh>(arg1), simd256<8>::constant<(((255)<<sh)&(255))>());
    1419 }
    1420 
    1421 //The total number of operations is 4.0
    1422 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1)
    1423 {
    1424         return avx_general_combine256(_mm_slli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
    1425 }
    1426 
    1427 //The total number of operations is 4.0
    1428 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1)
    1429 {
    1430         return avx_general_combine256(_mm_slli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
    1431 }
    1432 
    1433 //The total number of operations is 4.0
    1434 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1)
    1435 {
    1436         return avx_general_combine256(_mm_slli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi64(avx_select_lo128(arg1), (int32_t)(sh)));
    1437 }
    1438 
    1439 //The total number of operations is 8.33333333333
    1440 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1)
    1441 {
    1442         return (((sh%8) == 0) ? avx_byte_shift_left(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::slli<(sh&63)>(avx_byte_shift_left(arg1, 8)) : simd_or(simd256<64>::slli<sh>(arg1), avx_byte_shift_left(simd256<64>::srli<((128-sh)&63)>(arg1), 8))));
    1443 }
    1444 
    1445 //The total number of operations is 14.0
    1446 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1)
    1447 {
    1448         return ((sh < 128) ? simd_or(simd256<128>::slli<sh>(arg1), avx_move_lo128_to_hi128(simd256<128>::srli<(128-sh)>(arg1))) : simd256<128>::slli<(sh-128)>(avx_move_lo128_to_hi128(arg1)));
    1449 }
    1450 
    1451 //The total number of operations is 3.0
    1452 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1453 {
    1454         return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
    1455 }
    1456 
    1457 //The total number of operations is 11.0
    1458 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1459 {
    1460         return simd256<(1)>::ifh(simd256<1>::ifh(simd256<2>::himask(), arg1, simd256<2>::srli<(1)>(arg1)), arg2, arg3);
    1461 }
    1462 
    1463 //The total number of operations is 19.0
    1464 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1465 {
    1466         return simd256<(2)>::ifh(simd256<1>::ifh(simd256<4>::himask(), arg1, simd256<4>::srli<(2)>(arg1)), arg2, arg3);
    1467 }
    1468 
    1469 //The total number of operations is 8.0
    1470 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1471 {
    1472         return simd256<1>::ifh(simd256<8>::gt(simd256<8>::constant<0>(), arg1), arg2, arg3);
    1473 }
    1474 
    1475 //The total number of operations is 8.0
    1476 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1477 {
    1478         return simd256<1>::ifh(simd256<16>::gt(simd256<16>::constant<0>(), arg1), arg2, arg3);
    1479 }
    1480 
    1481 //The total number of operations is 8.0
    1482 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1483 {
    1484         return simd256<1>::ifh(simd256<32>::gt(simd256<32>::constant<0>(), arg1), arg2, arg3);
    1485 }
    1486 
    1487 //The total number of operations is 1.0
    1488 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1489 {
    1490         return (bitblock256_t)_mm256_blendv_pd((__m256d)(arg3), (__m256d)(arg2), (__m256d)(arg1));
    1491 }
    1492 
    1493 //The total number of operations is 12.3333333333
    1494 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1495 {
    1496         return simd256<(64)>::ifh(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg1)), arg2, arg3);
    1497 }
    1498 
    1499 //The total number of operations is 29.8333333333
    1500 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1501 {
    1502         return simd256<(128)>::ifh(simd256<1>::ifh(simd256<256>::himask(), arg1, simd256<256>::srli<(128)>(arg1)), arg2, arg3);
    1503 }
    1504 
    1505 //The total number of operations is 1.0
    15061152template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2)
    15071153{
     
    15681214}
    15691215
     1216//The total number of operations is 1.0
     1217template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1218{
     1219        return simd_andc(arg1, arg2);
     1220}
     1221
     1222//The total number of operations is 23.0
     1223template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1224{
     1225        bitblock256_t tmpAns = simd256<(1)>::ugt(arg1, arg2);
     1226        bitblock256_t mask = simd_and(tmpAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
     1227        mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
     1228        return simd_or(simd256<2>::srai<(1)>(tmpAns), mask);
     1229}
     1230
     1231//The total number of operations is 20.0
     1232template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1233{
     1234        return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::ugt(simd_and(simd256<(8)>::himask(), arg1), arg2), simd256<(8)>::ugt(simd_andc(arg1, simd256<(8)>::himask()), simd_andc(arg2, simd256<(8)>::himask())));
     1235}
     1236
     1237//The total number of operations is 7.0
     1238template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1239{
     1240        bitblock256_t high_bit = simd256<8>::constant<(128)>();
     1241        return simd256<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1242}
     1243
     1244//The total number of operations is 7.0
     1245template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1246{
     1247        bitblock256_t high_bit = simd256<16>::constant<(32768)>();
     1248        return simd256<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1249}
     1250
     1251//The total number of operations is 7.0
     1252template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1253{
     1254        bitblock256_t high_bit = simd256<32>::constant<(2147483648ULL)>();
     1255        return simd256<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1256}
     1257
     1258//The total number of operations is 7.0
     1259template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1260{
     1261        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     1262        return simd256<64>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1263}
     1264
     1265//The total number of operations is 60.0
     1266template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1267{
     1268        bitblock256_t tmpAns = simd256<(64)>::ugt(arg1, arg2);
     1269        bitblock256_t mask = simd_and(tmpAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
     1270        mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
     1271        return simd_or(simd256<128>::srai<(64)>(tmpAns), mask);
     1272}
     1273
     1274//The total number of operations is 174.166666667
     1275template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1276{
     1277        bitblock256_t tmpAns = simd256<(128)>::ugt(arg1, arg2);
     1278        bitblock256_t mask = simd_and(tmpAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
     1279        mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
     1280        return simd_or(simd256<256>::srai<(128)>(tmpAns), mask);
     1281}
     1282
     1283//The total number of operations is 7.0
     1284template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::xor_hl(bitblock256_t arg1)
     1285{
     1286        return simd_xor(simd256<2>::srli<(1)>(arg1), simd_and(arg1, simd256<2>::lomask()));
     1287}
     1288
     1289//The total number of operations is 7.0
     1290template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::xor_hl(bitblock256_t arg1)
     1291{
     1292        return simd_xor(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));
     1293}
     1294
     1295//The total number of operations is 7.0
     1296template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::xor_hl(bitblock256_t arg1)
     1297{
     1298        return simd_xor(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));
     1299}
     1300
     1301//The total number of operations is 6.0
     1302template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::xor_hl(bitblock256_t arg1)
     1303{
     1304        return simd_xor(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));
     1305}
     1306
     1307//The total number of operations is 6.0
     1308template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::xor_hl(bitblock256_t arg1)
     1309{
     1310        return simd_xor(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));
     1311}
     1312
     1313//The total number of operations is 6.0
     1314template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::xor_hl(bitblock256_t arg1)
     1315{
     1316        return simd_xor(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));
     1317}
     1318
     1319//The total number of operations is 10.3333333333
     1320template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::xor_hl(bitblock256_t arg1)
     1321{
     1322        return simd_xor(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));
     1323}
     1324
     1325//The total number of operations is 16.5
     1326template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::xor_hl(bitblock256_t arg1)
     1327{
     1328        return simd_xor(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
     1329}
     1330
     1331//The total number of operations is 0
     1332template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::popcount(bitblock256_t arg1)
     1333{
     1334        return arg1;
     1335}
     1336
     1337//The total number of operations is 10.0
     1338template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::popcount(bitblock256_t arg1)
     1339{
     1340        return simd256<2>::add_hl(simd256<(1)>::popcount(arg1));
     1341}
     1342
     1343//The total number of operations is 21.0
     1344template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::popcount(bitblock256_t arg1)
     1345{
     1346        return simd256<4>::add_hl(simd256<(2)>::popcount(arg1));
     1347}
     1348
     1349//The total number of operations is 32.0
     1350template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::popcount(bitblock256_t arg1)
     1351{
     1352        return simd256<8>::add_hl(simd256<(4)>::popcount(arg1));
     1353}
     1354
     1355//The total number of operations is 42.0
     1356template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::popcount(bitblock256_t arg1)
     1357{
     1358        return simd256<16>::add_hl(simd256<(8)>::popcount(arg1));
     1359}
     1360
     1361//The total number of operations is 52.0
     1362template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::popcount(bitblock256_t arg1)
     1363{
     1364        return simd256<32>::add_hl(simd256<(16)>::popcount(arg1));
     1365}
     1366
     1367//The total number of operations is 38.0
     1368template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::popcount(bitblock256_t arg1)
     1369{
     1370        bitblock256_t tmpAns = simd256<8>::popcount(arg1);
     1371        return avx_general_combine256(_mm_sad_epu8(avx_select_hi128(tmpAns), _mm_set1_epi32((int32_t)(0))), _mm_sad_epu8(avx_select_lo128(tmpAns), _mm_set1_epi32((int32_t)(0))));
     1372}
     1373
     1374//The total number of operations is 73.6666666667
     1375template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1)
     1376{
     1377        return simd256<128>::add_hl(simd256<(64)>::popcount(arg1));
     1378}
     1379
     1380//The total number of operations is 115.5
     1381template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1)
     1382{
     1383        bitblock256_t tmpAns = simd256<(128)>::popcount(arg1);
     1384        return simd256<(128)>::add(simd_and(tmpAns, simd256<256>::lomask()), simd256<256>::srli<(128)>(tmpAns));
     1385}
     1386
     1387//The total number of operations is 14.0
     1388template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::any(bitblock256_t arg1)
     1389{
     1390        bitblock256_t t0 = simd256<2>::srli<1>(arg1);
     1391        bitblock256_t f0 = simd_or(t0, simd_and(arg1, simd_xor(t0, simd256<8>::constant<255>())));
     1392        return simd_or(f0, simd256<2>::slli<1>(f0));
     1393}
     1394
     1395//The total number of operations is 20.0
     1396template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::any(bitblock256_t arg1)
     1397{
     1398        return simd256<4>::ugt(arg1, simd256<8>::constant<0>());
     1399}
     1400
     1401//The total number of operations is 7.0
     1402template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::any(bitblock256_t arg1)
     1403{
     1404        return simd256<8>::ugt(arg1, simd256<8>::constant<0>());
     1405}
     1406
     1407//The total number of operations is 7.0
     1408template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::any(bitblock256_t arg1)
     1409{
     1410        return simd256<16>::ugt(arg1, simd256<8>::constant<0>());
     1411}
     1412
     1413//The total number of operations is 7.0
     1414template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::any(bitblock256_t arg1)
     1415{
     1416        return simd256<32>::ugt(arg1, simd256<8>::constant<0>());
     1417}
     1418
     1419//The total number of operations is 7.0
     1420template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::any(bitblock256_t arg1)
     1421{
     1422        return simd256<64>::ugt(arg1, simd256<8>::constant<0>());
     1423}
     1424
     1425//The total number of operations is 60.0
     1426template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::any(bitblock256_t arg1)
     1427{
     1428        return simd256<128>::ugt(arg1, simd256<8>::constant<0>());
     1429}
     1430
     1431//The total number of operations is 1.0
     1432template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::any(bitblock256_t arg1)
     1433{
     1434        return ((bitblock256::any(arg1)) ? simd256<8>::constant<255>() : simd256<8>::constant<0>());
     1435}
     1436
     1437//The total number of operations is 16.0
     1438template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1)
     1439{
     1440        return simd256<2>::sub(simd256<2>::constant<0>(), arg1);
     1441}
     1442
     1443//The total number of operations is 14.0
     1444template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1)
     1445{
     1446        return simd256<4>::sub(simd256<4>::constant<0>(), arg1);
     1447}
     1448
     1449//The total number of operations is 5.0
     1450template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::neg(bitblock256_t arg1)
     1451{
     1452        return simd256<8>::sub(simd256<8>::constant<0>(), arg1);
     1453}
     1454
     1455//The total number of operations is 5.0
     1456template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::neg(bitblock256_t arg1)
     1457{
     1458        return simd256<16>::sub(simd256<16>::constant<0>(), arg1);
     1459}
     1460
     1461//The total number of operations is 5.0
     1462template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::neg(bitblock256_t arg1)
     1463{
     1464        return simd256<32>::sub(simd256<32>::constant<0>(), arg1);
     1465}
     1466
     1467//The total number of operations is 5.0
     1468template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::neg(bitblock256_t arg1)
     1469{
     1470        return simd256<64>::sub(simd256<64>::constant<0>(), arg1);
     1471}
     1472
     1473//The total number of operations is 26.3333333333
     1474template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1)
     1475{
     1476        return simd256<128>::sub(simd256<128>::constant<0>(), arg1);
     1477}
     1478
     1479//The total number of operations is 75.6666666667
     1480template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1)
     1481{
     1482        return simd256<256>::sub(simd256<256>::constant<0>(), arg1);
     1483}
     1484
     1485//The total number of operations is 5.0
     1486template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1)
     1487{
     1488        return simd_and(simd256<32>::slli<sh>(arg1), simd256<2>::constant<(((3)<<sh)&(3))>());
     1489}
     1490
     1491//The total number of operations is 5.0
     1492template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1)
     1493{
     1494        return simd_and(simd256<32>::slli<sh>(arg1), simd256<4>::constant<(((15)<<sh)&(15))>());
     1495}
     1496
     1497//The total number of operations is 5.0
     1498template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1)
     1499{
     1500        return simd_and(simd256<32>::slli<sh>(arg1), simd256<8>::constant<(((255)<<sh)&(255))>());
     1501}
     1502
     1503//The total number of operations is 4.0
     1504template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1)
     1505{
     1506        return avx_general_combine256(_mm_slli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
     1507}
     1508
     1509//The total number of operations is 4.0
     1510template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1)
     1511{
     1512        return avx_general_combine256(_mm_slli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
     1513}
     1514
     1515//The total number of operations is 4.0
     1516template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1)
     1517{
     1518        return avx_general_combine256(_mm_slli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi64(avx_select_lo128(arg1), (int32_t)(sh)));
     1519}
     1520
     1521//The total number of operations is 8.33333333333
     1522template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1)
     1523{
     1524        return (((sh%8) == 0) ? avx_byte_shift_left(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::slli<(sh&63)>(avx_byte_shift_left(arg1, 8)) : simd_or(simd256<64>::slli<sh>(arg1), avx_byte_shift_left(simd256<64>::srli<((128-sh)&63)>(arg1), 8))));
     1525}
     1526
     1527//The total number of operations is 14.0
     1528template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1)
     1529{
     1530        return ((sh < 128) ? simd_or(simd256<128>::slli<sh>(arg1), avx_move_lo128_to_hi128(simd256<128>::srli<(128-sh)>(arg1))) : simd256<128>::slli<(sh-128)>(avx_move_lo128_to_hi128(arg1)));
     1531}
     1532
     1533//The total number of operations is 3.0
     1534template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1535{
     1536        return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
     1537}
     1538
     1539//The total number of operations is 11.0
     1540template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1541{
     1542        return simd256<(1)>::ifh(simd256<1>::ifh(simd256<2>::himask(), arg1, simd256<2>::srli<(1)>(arg1)), arg2, arg3);
     1543}
     1544
     1545//The total number of operations is 19.0
     1546template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1547{
     1548        return simd256<(2)>::ifh(simd256<1>::ifh(simd256<4>::himask(), arg1, simd256<4>::srli<(2)>(arg1)), arg2, arg3);
     1549}
     1550
     1551//The total number of operations is 8.0
     1552template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1553{
     1554        return simd256<1>::ifh(simd256<8>::gt(simd256<8>::constant<0>(), arg1), arg2, arg3);
     1555}
     1556
     1557//The total number of operations is 8.0
     1558template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1559{
     1560        return simd256<1>::ifh(simd256<16>::gt(simd256<16>::constant<0>(), arg1), arg2, arg3);
     1561}
     1562
     1563//The total number of operations is 8.0
     1564template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1565{
     1566        return simd256<1>::ifh(simd256<32>::gt(simd256<32>::constant<0>(), arg1), arg2, arg3);
     1567}
     1568
     1569//The total number of operations is 1.0
     1570template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1571{
     1572        return (bitblock256_t)_mm256_blendv_pd(arg3, arg2, arg1);
     1573}
     1574
     1575//The total number of operations is 12.3333333333
     1576template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1577{
     1578        return simd256<(64)>::ifh(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg1)), arg2, arg3);
     1579}
     1580
     1581//The total number of operations is 29.8333333333
     1582template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1583{
     1584        return simd256<(128)>::ifh(simd256<1>::ifh(simd256<256>::himask(), arg1, simd256<256>::srli<(128)>(arg1)), arg2, arg3);
     1585}
     1586
     1587//The total number of operations is 7.0
     1588template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
     1589{
     1590        return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
     1591}
     1592
     1593//The total number of operations is 17.5
     1594template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
     1595{
     1596        return simd_or(simd_and(simd256<4>::himask(), simd256<(2)>::srai<((sh < (2)) ? sh : (2))>(arg1)), ((sh <= (2)) ? simd256<4>::srli<sh>(arg1) : simd256<(2)>::srai<(sh-(2))>(simd256<4>::srli<(2)>(arg1))));
     1597}
     1598
     1599//The total number of operations is 12.0
     1600template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
     1601{
     1602        bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1603        return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1604}
     1605
     1606//The total number of operations is 4.0
     1607template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
     1608{
     1609        return avx_general_combine256(_mm_srai_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
     1610}
     1611
     1612//The total number of operations is 4.0
     1613template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
     1614{
     1615        return avx_general_combine256(_mm_srai_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
     1616}
     1617
     1618//The total number of operations is 12.0
     1619template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
     1620{
     1621        return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));
     1622}
     1623
     1624//The total number of operations is 28.3333333333
     1625template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
     1626{
     1627        return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));
     1628}
     1629
     1630//The total number of operations is 59.0
     1631template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
     1632{
     1633        return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));
     1634}
     1635
    15701636//The total number of operations is 10.0
    15711637template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1)
     
    16141680{
    16151681        return simd256<256>::add(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
     1682}
     1683
     1684//The total number of operations is 0
     1685template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
     1686{
     1687        return simd256<2>::constant<(1)>();
     1688}
     1689
     1690//The total number of operations is 0
     1691template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
     1692{
     1693        return simd256<4>::constant<(3)>();
     1694}
     1695
     1696//The total number of operations is 0
     1697template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
     1698{
     1699        return simd256<8>::constant<(15)>();
     1700}
     1701
     1702//The total number of operations is 0
     1703template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
     1704{
     1705        return simd256<16>::constant<(255)>();
     1706}
     1707
     1708//The total number of operations is 0
     1709template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
     1710{
     1711        return simd256<32>::constant<(65535)>();
     1712}
     1713
     1714//The total number of operations is 0
     1715template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
     1716{
     1717        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1))));
     1718}
     1719
     1720//The total number of operations is 0
     1721template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
     1722{
     1723        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1))));
     1724}
     1725
     1726//The total number of operations is 0
     1727template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
     1728{
     1729        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1))));
    16161730}
    16171731
     
    17371851}
    17381852
     1853//The total number of operations is 1.0
     1854template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1855{
     1856        return simd_and(arg1, arg2);
     1857}
     1858
     1859//The total number of operations is 24.0
     1860template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1861{
     1862        bitblock256_t tmpAns = simd256<(1)>::umin(arg1, arg2);
     1863        bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
     1864        bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
     1865        return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1866}
     1867
     1868//The total number of operations is 14.0
     1869template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1870{
     1871        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umin(arg1, arg2)), simd256<(8)>::umin(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
     1872}
     1873
     1874//The total number of operations is 5.0
     1875template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1876{
     1877        return avx_general_combine256(_mm_min_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1878}
     1879
     1880//The total number of operations is 5.0
     1881template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1882{
     1883        return avx_general_combine256(_mm_min_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1884}
     1885
     1886//The total number of operations is 5.0
     1887template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1888{
     1889        return avx_general_combine256(_mm_min_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1890}
     1891
     1892//The total number of operations is 11.0
     1893template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1894{
     1895        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     1896        return simd_xor(simd256<64>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1897}
     1898
     1899//The total number of operations is 46.6666666667
     1900template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1901{
     1902        bitblock256_t tmpAns = simd256<(64)>::umin(arg1, arg2);
     1903        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
     1904        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
     1905        return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1906}
     1907
     1908//The total number of operations is 132.0
     1909template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1910{
     1911        bitblock256_t tmpAns = simd256<(128)>::umin(arg1, arg2);
     1912        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
     1913        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
     1914        return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1915}
     1916
     1917//The total number of operations is 1.0
     1918template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1919{
     1920        return simd_or(arg1, arg2);
     1921}
     1922
     1923//The total number of operations is 24.0
     1924template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1925{
     1926        bitblock256_t tmpAns = simd256<(1)>::umax(arg1, arg2);
     1927        bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
     1928        bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
     1929        return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1930}
     1931
     1932//The total number of operations is 14.0
     1933template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1934{
     1935        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
     1936}
     1937
     1938//The total number of operations is 5.0
     1939template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1940{
     1941        return avx_general_combine256(_mm_max_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1942}
     1943
     1944//The total number of operations is 5.0
     1945template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1946{
     1947        return avx_general_combine256(_mm_max_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1948}
     1949
     1950//The total number of operations is 5.0
     1951template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1952{
     1953        return avx_general_combine256(_mm_max_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1954}
     1955
     1956//The total number of operations is 11.0
     1957template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1958{
     1959        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     1960        return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1961}
     1962
     1963//The total number of operations is 46.6666666667
     1964template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1965{
     1966        bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
     1967        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
     1968        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
     1969        return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1970}
     1971
     1972//The total number of operations is 132.0
     1973template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1974{
     1975        bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
     1976        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
     1977        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
     1978        return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1979}
     1980
     1981//The total number of operations is 1.0
     1982template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2)
     1983{
     1984        return simd_andc(arg1, arg2);
     1985}
     1986
     1987//The total number of operations is 24.0
     1988template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2)
     1989{
     1990        bitblock256_t hiAns = simd256<(1)>::lt(arg1, arg2);
     1991        bitblock256_t loAns = simd256<(1)>::ult(arg1, arg2);
     1992        bitblock256_t mask = simd_and(loAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
     1993        mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
     1994        return simd_or(simd256<2>::srai<(1)>(hiAns), mask);
     1995}
     1996
     1997//The total number of operations is 38.0
     1998template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2)
     1999{
     2000        bitblock256_t high_bit = simd256<4>::constant<(8)>();
     2001        return simd256<4>::ult(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     2002}
     2003
     2004//The total number of operations is 13.0
     2005template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2006{
     2007        return simd_and(simd_not(simd256<8>::gt(arg1, arg2)), simd_not(simd256<8>::eq(arg1, arg2)));
     2008}
     2009
     2010//The total number of operations is 13.0
     2011template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2012{
     2013        return simd_and(simd_not(simd256<16>::gt(arg1, arg2)), simd_not(simd256<16>::eq(arg1, arg2)));
     2014}
     2015
     2016//The total number of operations is 13.0
     2017template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2018{
     2019        return simd_and(simd_not(simd256<32>::gt(arg1, arg2)), simd_not(simd256<32>::eq(arg1, arg2)));
     2020}
     2021
     2022//The total number of operations is 13.0
     2023template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2024{
     2025        return simd_and(simd_not(simd256<64>::gt(arg1, arg2)), simd_not(simd256<64>::eq(arg1, arg2)));
     2026}
     2027
     2028//The total number of operations is 81.0
     2029template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2030{
     2031        bitblock256_t hiAns = simd256<(64)>::lt(arg1, arg2);
     2032        bitblock256_t loAns = simd256<(64)>::ult(arg1, arg2);
     2033        bitblock256_t mask = simd_and(loAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
     2034        mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
     2035        return simd_or(simd256<128>::srai<(64)>(hiAns), mask);
     2036}
     2037
     2038//The total number of operations is 263.166666667
     2039template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2040{
     2041        bitblock256_t hiAns = simd256<(128)>::lt(arg1, arg2);
     2042        bitblock256_t loAns = simd256<(128)>::ult(arg1, arg2);
     2043        bitblock256_t mask = simd_and(loAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
     2044        mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
     2045        return simd_or(simd256<256>::srai<(128)>(hiAns), mask);
     2046}
     2047
     2048//The total number of operations is 2.0
     2049template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2050{
     2051        return simd_not(simd_xor(arg1, arg2));
     2052}
     2053
     2054//The total number of operations is 14.0
     2055template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2056{
     2057        bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);
     2058        bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));
     2059        bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);
     2060        return simd_or(loMask, hiMask);
     2061}
     2062
     2063//The total number of operations is 17.0
     2064template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2065{
     2066        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
     2067}
     2068
     2069//The total number of operations is 5.0
     2070template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2071{
     2072        return avx_general_combine256(_mm_cmpeq_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2073}
     2074
     2075//The total number of operations is 5.0
     2076template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2077{
     2078        return avx_general_combine256(_mm_cmpeq_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2079}
     2080
     2081//The total number of operations is 5.0
     2082template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2083{
     2084        return avx_general_combine256(_mm_cmpeq_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2085}
     2086
     2087//The total number of operations is 5.0
     2088template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2089{
     2090        return avx_general_combine256(_mm_cmpeq_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2091}
     2092
     2093//The total number of operations is 23.6666666667
     2094template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2095{
     2096        bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
     2097        bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
     2098        bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
     2099        return simd_or(loMask, hiMask);
     2100}
     2101
     2102//The total number of operations is 54.1666666667
     2103template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2104{
     2105        bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
     2106        bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
     2107        bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
     2108        return simd_or(loMask, hiMask);
     2109}
     2110
    17392111//The total number of operations is 0
    1740 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
    1741 {
    1742         return simd256<2>::constant<(1)>();
     2112template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask()
     2113{
     2114        return simd256<2>::constant<(2)>();
    17432115}
    17442116
    17452117//The total number of operations is 0
    1746 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
    1747 {
    1748         return simd256<4>::constant<(3)>();
     2118template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask()
     2119{
     2120        return simd256<4>::constant<(12)>();
    17492121}
    17502122
    17512123//The total number of operations is 0
    1752 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
    1753 {
    1754         return simd256<8>::constant<(15)>();
     2124template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask()
     2125{
     2126        return simd256<8>::constant<(240)>();
    17552127}
    17562128
    17572129//The total number of operations is 0
    1758 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
    1759 {
    1760         return simd256<16>::constant<(255)>();
     2130template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask()
     2131{
     2132        return simd256<16>::constant<(65280)>();
    17612133}
    17622134
    17632135//The total number of operations is 0
    1764 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
    1765 {
    1766         return simd256<32>::constant<(65535)>();
     2136template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask()
     2137{
     2138        return simd256<32>::constant<-65536>();
    17672139}
    17682140
    17692141//The total number of operations is 0
    1770 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
    1771 {
    1772         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1))));
     2142template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask()
     2143{
     2144        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0))));
    17732145}
    17742146
    17752147//The total number of operations is 0
    1776 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
    1777 {
    1778         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1))));
     2148template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask()
     2149{
     2150        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0))));
    17792151}
    17802152
    17812153//The total number of operations is 0
    1782 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
    1783 {
    1784         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1))));
     2154template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask()
     2155{
     2156        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));
    17852157}
    17862158
     
    18502222}
    18512223
    1852 //The total number of operations is 1.0
    1853 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1854 {
    1855         return simd_and(arg1, arg2);
    1856 }
    1857 
    1858 //The total number of operations is 24.0
    1859 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1860 {
    1861         bitblock256_t tmpAns = simd256<(1)>::umin(arg1, arg2);
    1862         bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
    1863         bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
    1864         return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1865 }
    1866 
    1867 //The total number of operations is 14.0
    1868 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1869 {
    1870         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umin(arg1, arg2)), simd256<(8)>::umin(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
    1871 }
    1872 
    1873 //The total number of operations is 5.0
    1874 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1875 {
    1876         return avx_general_combine256(_mm_min_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1877 }
    1878 
    1879 //The total number of operations is 5.0
    1880 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1881 {
    1882         return avx_general_combine256(_mm_min_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1883 }
    1884 
    1885 //The total number of operations is 5.0
    1886 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1887 {
    1888         return avx_general_combine256(_mm_min_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1889 }
    1890 
    1891 //The total number of operations is 11.0
    1892 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1893 {
    1894         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
    1895         return simd_xor(simd256<64>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1896 }
    1897 
    1898 //The total number of operations is 46.6666666667
    1899 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1900 {
    1901         bitblock256_t tmpAns = simd256<(64)>::umin(arg1, arg2);
    1902         bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
    1903         bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
    1904         return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1905 }
    1906 
    1907 //The total number of operations is 132.0
    1908 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1909 {
    1910         bitblock256_t tmpAns = simd256<(128)>::umin(arg1, arg2);
    1911         bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
    1912         bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
    1913         return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1914 }
    1915 
    19162224//The total number of operations is 19.0
    19172225template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)
     
    19642272        bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1);
    19652273        return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1));
    1966 }
    1967 
    1968 //The total number of operations is 2.0
    1969 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1970 {
    1971         return simd_not(simd_xor(arg1, arg2));
    1972 }
    1973 
    1974 //The total number of operations is 14.0
    1975 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1976 {
    1977         bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);
    1978         bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));
    1979         bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);
    1980         return simd_or(loMask, hiMask);
    1981 }
    1982 
    1983 //The total number of operations is 17.0
    1984 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1985 {
    1986         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
    1987 }
    1988 
    1989 //The total number of operations is 5.0
    1990 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1991 {
    1992         return avx_general_combine256(_mm_cmpeq_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1993 }
    1994 
    1995 //The total number of operations is 5.0
    1996 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1997 {
    1998         return avx_general_combine256(_mm_cmpeq_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1999 }
    2000 
    2001 //The total number of operations is 5.0
    2002 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2003 {
    2004         return avx_general_combine256(_mm_cmpeq_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2005 }
    2006 
    2007 //The total number of operations is 5.0
    2008 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2009 {
    2010         return avx_general_combine256(_mm_cmpeq_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2011 }
    2012 
    2013 //The total number of operations is 23.6666666667
    2014 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2015 {
    2016         bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
    2017         bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
    2018         bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
    2019         return simd_or(loMask, hiMask);
    2020 }
    2021 
    2022 //The total number of operations is 54.1666666667
    2023 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2024 {
    2025         bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
    2026         bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
    2027         bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
    2028         return simd_or(loMask, hiMask);
    2029 }
    2030 
    2031 //The total number of operations is 7.0
    2032 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
    2033 {
    2034         return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
    2035 }
    2036 
    2037 //The total number of operations is 17.5
    2038 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
    2039 {
    2040         return simd_or(simd_and(simd256<4>::himask(), simd256<(2)>::srai<((sh < (2)) ? sh : (2))>(arg1)), ((sh <= (2)) ? simd256<4>::srli<sh>(arg1) : simd256<(2)>::srai<(sh-(2))>(simd256<4>::srli<(2)>(arg1))));
    2041 }
    2042 
    2043 //The total number of operations is 12.0
    2044 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
    2045 {
    2046         bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    2047         return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    2048 }
    2049 
    2050 //The total number of operations is 4.0
    2051 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
    2052 {
    2053         return avx_general_combine256(_mm_srai_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
    2054 }
    2055 
    2056 //The total number of operations is 4.0
    2057 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
    2058 {
    2059         return avx_general_combine256(_mm_srai_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
    2060 }
    2061 
    2062 //The total number of operations is 12.0
    2063 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
    2064 {
    2065         return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));
    2066 }
    2067 
    2068 //The total number of operations is 28.3333333333
    2069 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
    2070 {
    2071         return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));
    2072 }
    2073 
    2074 //The total number of operations is 59.0
    2075 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
    2076 {
    2077         return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));
    2078 }
    2079 
    2080 //The total number of operations is 0
    2081 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask()
    2082 {
    2083         return simd256<2>::constant<(2)>();
    2084 }
    2085 
    2086 //The total number of operations is 0
    2087 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask()
    2088 {
    2089         return simd256<4>::constant<(12)>();
    2090 }
    2091 
    2092 //The total number of operations is 0
    2093 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask()
    2094 {
    2095         return simd256<8>::constant<(240)>();
    2096 }
    2097 
    2098 //The total number of operations is 0
    2099 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask()
    2100 {
    2101         return simd256<16>::constant<(65280)>();
    2102 }
    2103 
    2104 //The total number of operations is 0
    2105 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask()
    2106 {
    2107         return simd256<32>::constant<-65536>();
    2108 }
    2109 
    2110 //The total number of operations is 0
    2111 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask()
    2112 {
    2113         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0))));
    2114 }
    2115 
    2116 //The total number of operations is 0
    2117 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask()
    2118 {
    2119         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0))));
    2120 }
    2121 
    2122 //The total number of operations is 0
    2123 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask()
    2124 {
    2125         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));
    2126 }
    2127 
    2128 //The total number of operations is 1.0
    2129 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2130 {
    2131         return simd_andc(arg1, arg2);
    2132 }
    2133 
    2134 //The total number of operations is 24.0
    2135 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2136 {
    2137         bitblock256_t hiAns = simd256<(1)>::lt(arg1, arg2);
    2138         bitblock256_t loAns = simd256<(1)>::ult(arg1, arg2);
    2139         bitblock256_t mask = simd_and(loAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
    2140         mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
    2141         return simd_or(simd256<2>::srai<(1)>(hiAns), mask);
    2142 }
    2143 
    2144 //The total number of operations is 38.0
    2145 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2146 {
    2147         bitblock256_t high_bit = simd256<4>::constant<(8)>();
    2148         return simd256<4>::ult(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    2149 }
    2150 
    2151 //The total number of operations is 13.0
    2152 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2153 {
    2154         return simd_and(simd_not(simd256<8>::gt(arg1, arg2)), simd_not(simd256<8>::eq(arg1, arg2)));
    2155 }
    2156 
    2157 //The total number of operations is 13.0
    2158 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2159 {
    2160         return simd_and(simd_not(simd256<16>::gt(arg1, arg2)), simd_not(simd256<16>::eq(arg1, arg2)));
    2161 }
    2162 
    2163 //The total number of operations is 13.0
    2164 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2165 {
    2166         return simd_and(simd_not(simd256<32>::gt(arg1, arg2)), simd_not(simd256<32>::eq(arg1, arg2)));
    2167 }
    2168 
    2169 //The total number of operations is 13.0
    2170 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2171 {
    2172         return simd_and(simd_not(simd256<64>::gt(arg1, arg2)), simd_not(simd256<64>::eq(arg1, arg2)));
    2173 }
    2174 
    2175 //The total number of operations is 81.0
    2176 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2177 {
    2178         bitblock256_t hiAns = simd256<(64)>::lt(arg1, arg2);
    2179         bitblock256_t loAns = simd256<(64)>::ult(arg1, arg2);
    2180         bitblock256_t mask = simd_and(loAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
    2181         mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
    2182         return simd_or(simd256<128>::srai<(64)>(hiAns), mask);
    2183 }
    2184 
    2185 //The total number of operations is 263.166666667
    2186 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2187 {
    2188         bitblock256_t hiAns = simd256<(128)>::lt(arg1, arg2);
    2189         bitblock256_t loAns = simd256<(128)>::ult(arg1, arg2);
    2190         bitblock256_t mask = simd_and(loAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
    2191         mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
    2192         return simd_or(simd256<256>::srai<(128)>(hiAns), mask);
    2193 }
    2194 
    2195 //The total number of operations is 1.0
    2196 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2197 {
    2198         return simd_or(arg1, arg2);
    2199 }
    2200 
    2201 //The total number of operations is 24.0
    2202 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2203 {
    2204         bitblock256_t tmpAns = simd256<(1)>::umax(arg1, arg2);
    2205         bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
    2206         bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
    2207         return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    2208 }
    2209 
    2210 //The total number of operations is 14.0
    2211 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2212 {
    2213         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
    2214 }
    2215 
    2216 //The total number of operations is 5.0
    2217 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2218 {
    2219         return avx_general_combine256(_mm_max_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2220 }
    2221 
    2222 //The total number of operations is 5.0
    2223 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2224 {
    2225         return avx_general_combine256(_mm_max_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2226 }
    2227 
    2228 //The total number of operations is 5.0
    2229 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2230 {
    2231         return avx_general_combine256(_mm_max_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2232 }
    2233 
    2234 //The total number of operations is 11.0
    2235 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2236 {
    2237         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
    2238         return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    2239 }
    2240 
    2241 //The total number of operations is 46.6666666667
    2242 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2243 {
    2244         bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
    2245         bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
    2246         bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
    2247         return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    2248 }
    2249 
    2250 //The total number of operations is 132.0
    2251 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2252 {
    2253         bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
    2254         bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
    2255         bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
    2256         return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    22572274}
    22582275
     
    30143031}
    30153032
     3033//The total number of operations is 15.0
     3034template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16)
     3035{
     3036        return simd_or(mvmd256<(2)>::fill16((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd256<(2)>::fill16((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
     3037}
     3038
     3039//The total number of operations is 7.0
     3040template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16)
     3041{
     3042        return simd_or(mvmd256<(4)>::fill16((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd256<(4)>::fill16((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
     3043}
     3044
     3045//The total number of operations is 3.0
     3046template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16)
     3047{
     3048        return simd_or(mvmd256<(8)>::fill16((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd256<(8)>::fill16((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
     3049}
     3050
     3051//The total number of operations is 1.0
     3052template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16)
     3053{
     3054        return (bitblock256_t)_mm256_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
     3055}
     3056
     3057//The total number of operations is 5.0
     3058template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16)
     3059{
     3060        return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<16>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd256<16>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
     3061}
     3062
    30163063//The total number of operations is 1.0
    30173064template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(FieldType<1>::T val1)
     
    30693116
    30703117//The total number of operations is 1.5
    3071 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1)
     3118template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1)
    30723119{
    30733120        return (((pos%2) == 0) ? (mvmd256<(2)>::extract<(pos/2)>(arg1)&(1)) : (mvmd256<(2)>::extract<(pos/2)>(arg1)>>1));
     
    30753122
    30763123//The total number of operations is 1.5
    3077 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1)
     3124template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1)
    30783125{
    30793126        return (((pos%2) == 0) ? (mvmd256<(4)>::extract<(pos/2)>(arg1)&(3)) : (mvmd256<(4)>::extract<(pos/2)>(arg1)>>2));
     
    30813128
    30823129//The total number of operations is 1.5
    3083 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1)
     3130template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1)
    30843131{
    30853132        return (((pos%2) == 0) ? (mvmd256<(8)>::extract<(pos/2)>(arg1)&(15)) : (mvmd256<(8)>::extract<(pos/2)>(arg1)>>4));
     
    30873134
    30883135//The total number of operations is 1.5
    3089 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1)
     3136template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1)
    30903137{
    30913138        return (((pos%2) == 0) ? (mvmd256<(16)>::extract<(pos/2)>(arg1)&(255)) : (mvmd256<(16)>::extract<(pos/2)>(arg1)>>8));
     
    30933140
    30943141//The total number of operations is 1.5
    3095 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1)
     3142template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1)
    30963143{
    30973144        return ((pos < 8) ? (65535&_mm_extract_epi16(avx_select_lo128(arg1), (int32_t)(pos))) : (65535&_mm_extract_epi16(avx_select_hi128(arg1), (int32_t)((pos-8)))));
     
    30993146
    31003147//The total number of operations is 1.5
    3101 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1)
     3148template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1)
    31023149{
    31033150        return ((pos < 4) ? (((uint64_t)(((4294967296ULL)-1)))&_mm_extract_epi32(avx_select_lo128(arg1), (int32_t)(pos))) : (((uint64_t)(((4294967296ULL)-1)))&_mm_extract_epi32(avx_select_hi128(arg1), (int32_t)((pos-4)))));
     
    31053152
    31063153//The total number of operations is 3.0
    3107 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1)
     3154template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1)
    31083155{
    31093156        return ((((uint64_t)(mvmd256<(32)>::extract<((2*pos)+1)>(arg1)))<<(32))|mvmd256<(32)>::extract<(2*pos)>(arg1));
     
    31643211}
    31653212
    3166 //The total number of operations is 15.0
    3167 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16)
    3168 {
    3169         return simd_or(mvmd256<(2)>::fill16((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd256<(2)>::fill16((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
    3170 }
    3171 
    3172 //The total number of operations is 7.0
    3173 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16)
    3174 {
    3175         return simd_or(mvmd256<(4)>::fill16((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd256<(4)>::fill16((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
    3176 }
    3177 
    3178 //The total number of operations is 3.0
    3179 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16)
    3180 {
    3181         return simd_or(mvmd256<(8)>::fill16((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd256<(8)>::fill16((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
    3182 }
    3183 
    3184 //The total number of operations is 1.0
    3185 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16)
    3186 {
    3187         return (bitblock256_t)_mm256_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
    3188 }
    3189 
    3190 //The total number of operations is 5.0
    3191 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16)
    3192 {
    3193         return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<16>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd256<16>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
    3194 }
    3195 
    31963213//The total number of operations is 5.0
    31973214template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4)
     
    34763493}
    34773494
    3478 //The total number of operations is 1.0
    3479 IDISA_ALWAYS_INLINE void bitblock256::store_aligned(bitblock256_t arg1, bitblock256_t* arg2)
    3480 {
    3481         _mm256_store_ps((float*)(arg2), arg1);
    3482 }
    3483 
    34843495//The total number of operations is 118.5
    34853496IDISA_ALWAYS_INLINE uint16_t bitblock256::popcount(bitblock256_t arg1)
     
    34943505}
    34953506
     3507//The total number of operations is 1.0
     3508IDISA_ALWAYS_INLINE bool bitblock256::any(bitblock256_t arg1)
     3509{
     3510        return _mm256_testz_si256(((__m256i)(arg1)), ((__m256i)(arg1))) == 0;
     3511}
     3512
     3513//The total number of operations is 1.0
     3514IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_aligned(const bitblock256_t* arg1)
     3515{
     3516        return _mm256_load_ps((float*)(arg1));
     3517}
     3518
     3519//The total number of operations is 1.0
     3520IDISA_ALWAYS_INLINE void bitblock256::store_unaligned(bitblock256_t arg1, bitblock256_t* arg2)
     3521{
     3522        _mm256_storeu_ps((float*)(arg2), arg1);
     3523}
     3524
    34963525//The total number of operations is 14.0
    34973526template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::slli(bitblock256_t arg1)
     
    35013530
    35023531//The total number of operations is 1.0
    3503 IDISA_ALWAYS_INLINE bool bitblock256::any(bitblock256_t arg1)
    3504 {
    3505         return _mm256_testz_si256(((__m256i)(arg1)), ((__m256i)(arg1))) == 0;
    3506 }
    3507 
    3508 //The total number of operations is 1.0
    3509 IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_aligned(const bitblock256_t* arg1)
    3510 {
    3511         return _mm256_load_ps((float*)(arg1));
    3512 }
    3513 
    3514 //The total number of operations is 1.0
    3515 IDISA_ALWAYS_INLINE void bitblock256::store_unaligned(bitblock256_t arg1, bitblock256_t* arg2)
    3516 {
    3517         _mm256_storeu_ps((float*)(arg2), arg1);
     3532IDISA_ALWAYS_INLINE void bitblock256::store_aligned(bitblock256_t arg1, bitblock256_t* arg2)
     3533{
     3534        _mm256_store_ps((float*)(arg2), arg1);
    35183535}
    35193536
Note: See TracChangeset for help on using the changeset viewer.