Changeset 3525 for trunk/lib


Ignore:
Timestamp:
Sep 22, 2013, 3:09:25 PM (6 years ago)
Author:
linmengl
Message:

Add mvmd_insert to SSE and AVX. Update all SSE libraries. The hand-modified SSE2 version is saved as idisa_sse2_hand.cpp.

Location:
trunk/lib/idisa_cpp
Files:
1 added
8 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/idisa_cpp/idisa_avx.cpp

    r3441 r3525  
    1616
    1717typedef __m256 bitblock256_t;
    18                        
     18               
     19#ifndef FIELD_TYPE
     20#define FIELD_TYPE     
     21template <uint32_t fw> struct FieldType {
     22   typedef int T;  //default for FieldType::T is int
     23};
     24
     25template <> struct FieldType<1> {typedef uint8_t T;};
     26template <> struct FieldType<2> {typedef uint8_t T;};
     27template <> struct FieldType<4> {typedef uint8_t T;};
     28template <> struct FieldType<8> {typedef uint8_t T;};
     29template <> struct FieldType<16> {typedef uint16_t T;};
     30template <> struct FieldType<32> {typedef uint32_t T;};
     31template <> struct FieldType<64> {typedef uint64_t T;};
     32template <> struct FieldType<128> {typedef uint64_t T;};
     33template <> struct FieldType<256> {typedef uint64_t T;};
     34#endif
     35
    1936template <uint32_t fw>
    2037class simd256
     
    2845        static IDISA_ALWAYS_INLINE bitblock256_t all(bitblock256_t arg1);
    2946        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
     47        static IDISA_ALWAYS_INLINE bitblock256_t ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    3048        static IDISA_ALWAYS_INLINE bitblock256_t ctz(bitblock256_t arg1);
    3149        static IDISA_ALWAYS_INLINE bitblock256_t eq(bitblock256_t arg1, bitblock256_t arg2);
     
    3452        static IDISA_ALWAYS_INLINE bitblock256_t himask();
    3553        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
    36         static IDISA_ALWAYS_INLINE bitblock256_t ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    3754        static IDISA_ALWAYS_INLINE bitblock256_t sub(bitblock256_t arg1, bitblock256_t arg2);
    3855        static IDISA_ALWAYS_INLINE bitblock256_t add_hl(bitblock256_t arg1);
     
    4158        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock256_t constant();
    4259        static IDISA_ALWAYS_INLINE bitblock256_t min(bitblock256_t arg1, bitblock256_t arg2);
     60        static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2);
    4361        static IDISA_ALWAYS_INLINE bitblock256_t umax(bitblock256_t arg1, bitblock256_t arg2);
    4462        static IDISA_ALWAYS_INLINE bitblock256_t abs(bitblock256_t arg1);
     
    4765        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srai(bitblock256_t arg1);
    4866        static IDISA_ALWAYS_INLINE bitblock256_t lt(bitblock256_t arg1, bitblock256_t arg2);
    49         static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2);
    5067        static IDISA_ALWAYS_INLINE bitblock256_t ugt(bitblock256_t arg1, bitblock256_t arg2);
    5168};
     
    83100        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dsrli(bitblock256_t arg1, bitblock256_t arg2);
    84101        static IDISA_ALWAYS_INLINE bitblock256_t fill(typename FieldType<fw>::T val1);
    85         template <uint8_t pos> static IDISA_ALWAYS_INLINE typename FieldType<fw>::T extract(bitblock256_t arg1);
     102        template <uint16_t pos> static IDISA_ALWAYS_INLINE typename FieldType<fw>::T extract(bitblock256_t arg1);
    86103        template <uint16_t pos> static IDISA_ALWAYS_INLINE bitblock256_t splat(bitblock256_t arg1);
    87104        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
     
    99116        static IDISA_ALWAYS_INLINE bitblock256_t load_unaligned(const bitblock256_t* arg1);
    100117        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
    101         static IDISA_ALWAYS_INLINE void store_aligned(bitblock256_t arg1, bitblock256_t* arg2);
    102118        static IDISA_ALWAYS_INLINE bool all(bitblock256_t arg1);
    103119        static IDISA_ALWAYS_INLINE bool any(bitblock256_t arg1);
     
    105121        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
    106122        static IDISA_ALWAYS_INLINE bitblock256_t load_aligned(const bitblock256_t* arg1);
     123        static IDISA_ALWAYS_INLINE void store_aligned(bitblock256_t arg1, bitblock256_t* arg2);
    107124        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock256_t arg1, bitblock256_t* arg2);
    108125};
     
    111128IDISA_ALWAYS_INLINE bitblock256_t simd_nor(bitblock256_t arg1, bitblock256_t arg2);
    112129IDISA_ALWAYS_INLINE bitblock256_t simd_not(bitblock256_t arg1);
     130IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2);
    113131IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2);
    114 IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2);
    115132IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2);
    116133IDISA_ALWAYS_INLINE bitblock256_t simd_xor(bitblock256_t arg1, bitblock256_t arg2);
     
    184201template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ctz(bitblock256_t arg1);
    185202template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ctz(bitblock256_t arg1);
     203template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2);
     204template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::sub(bitblock256_t arg1, bitblock256_t arg2);
     205template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::sub(bitblock256_t arg1, bitblock256_t arg2);
     206template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::sub(bitblock256_t arg1, bitblock256_t arg2);
     207template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sub(bitblock256_t arg1, bitblock256_t arg2);
     208template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sub(bitblock256_t arg1, bitblock256_t arg2);
     209template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sub(bitblock256_t arg1, bitblock256_t arg2);
     210template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::sub(bitblock256_t arg1, bitblock256_t arg2);
     211template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sub(bitblock256_t arg1, bitblock256_t arg2);
    186212template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2);
    187213template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2);
     
    243269template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    244270template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    245 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2);
    246 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::sub(bitblock256_t arg1, bitblock256_t arg2);
    247 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::sub(bitblock256_t arg1, bitblock256_t arg2);
    248 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::sub(bitblock256_t arg1, bitblock256_t arg2);
    249 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sub(bitblock256_t arg1, bitblock256_t arg2);
    250 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sub(bitblock256_t arg1, bitblock256_t arg2);
    251 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sub(bitblock256_t arg1, bitblock256_t arg2);
    252 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::sub(bitblock256_t arg1, bitblock256_t arg2);
    253 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sub(bitblock256_t arg1, bitblock256_t arg2);
     271template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
     272template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
     273template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
     274template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
     275template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
     276template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
     277template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
     278template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
    254279template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1);
    255280template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add_hl(bitblock256_t arg1);
     
    260285template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1);
    261286template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1);
     287template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
     288template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
     289template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
     290template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
     291template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
     292template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
     293template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
     294template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
    262295template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant();
    263296template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant();
     
    278311template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::min(bitblock256_t arg1, bitblock256_t arg2);
    279312template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::min(bitblock256_t arg1, bitblock256_t arg2);
    280 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
    281 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
    282 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
    283 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
    284 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
    285 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
    286 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
    287 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
     313template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2);
     314template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2);
     315template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2);
     316template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2);
     317template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2);
     318template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2);
     319template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2);
     320template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2);
     321template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2);
     322template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
     323template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
     324template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
     325template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
     326template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
     327template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
     328template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
     329template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
     330template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
     331template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2);
     332template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2);
     333template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2);
     334template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2);
     335template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2);
     336template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2);
     337template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2);
     338template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2);
     339template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2);
     340template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2);
     341template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2);
     342template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2);
     343template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2);
     344template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2);
     345template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2);
     346template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2);
     347template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2);
     348template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2);
     349template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask();
     350template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask();
     351template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask();
     352template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask();
     353template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask();
     354template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask();
     355template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask();
     356template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask();
    288357template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::add(bitblock256_t arg1, bitblock256_t arg2);
    289358template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add(bitblock256_t arg1, bitblock256_t arg2);
     
    295364template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2);
    296365template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2);
    297 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2);
    298 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2);
    299 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2);
    300 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2);
    301 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2);
    302 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2);
    303 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2);
    304 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2);
    305 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2);
    306366template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1);
    307367template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1);
     
    312372template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1);
    313373template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1);
    314 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2);
    315 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2);
    316 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2);
    317 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2);
    318 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2);
    319 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2);
    320 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2);
    321 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2);
    322 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2);
    323 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
    324 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
    325 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
    326 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
    327 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
    328 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
    329 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
    330 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
    331 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask();
    332 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask();
    333 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask();
    334 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask();
    335 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask();
    336 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask();
    337 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask();
    338 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask();
    339 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2);
    340 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2);
    341 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2);
    342 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2);
    343 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2);
    344 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2);
    345 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2);
    346 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2);
    347 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2);
    348 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
    349 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
    350 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
    351 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
    352 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
    353 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
    354 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
    355 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
    356 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
    357374template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
    358375template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
     
    473490template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
    474491template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
     492template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16);
     493template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16);
     494template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16);
     495template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16);
     496template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16);
    475497template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(FieldType<1>::T val1);
    476498template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill(FieldType<2>::T val1);
     
    482504template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill(FieldType<128>::T val1);
    483505template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::fill(FieldType<256>::T val1);
    484 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1);
    485 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1);
    486 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1);
    487 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1);
    488 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1);
    489 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1);
    490 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1);
     506template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1);
     507template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1);
     508template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1);
     509template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1);
     510template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1);
     511template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1);
     512template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1);
    491513template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1);
    492514template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1);
     
    498520template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1);
    499521template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1);
    500 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16);
    501 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16);
    502 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16);
    503 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16);
    504 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16);
    505522template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4);
    506523template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4);
     
    584601
    585602//The total number of operations is 1.0
     603IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2)
     604{
     605        return _mm256_or_ps(arg1, arg2);
     606}
     607
     608//The total number of operations is 1.0
    586609IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2)
    587610{
    588611        return _mm256_andnot_ps(arg2, arg1);
    589 }
    590 
    591 //The total number of operations is 1.0
    592 IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2)
    593 {
    594         return _mm256_or_ps(arg1, arg2);
    595612}
    596613
     
    11331150
    11341151//The total number of operations is 1.0
    1135 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1136 {
    1137         return simd_andc(arg1, arg2);
    1138 }
    1139 
    1140 //The total number of operations is 23.0
    1141 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1142 {
    1143         bitblock256_t tmpAns = simd256<(1)>::ugt(arg1, arg2);
    1144         bitblock256_t mask = simd_and(tmpAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
    1145         mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
    1146         return simd_or(simd256<2>::srai<(1)>(tmpAns), mask);
    1147 }
    1148 
    1149 //The total number of operations is 20.0
    1150 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1151 {
    1152         return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::ugt(simd_and(simd256<(8)>::himask(), arg1), arg2), simd256<(8)>::ugt(simd_andc(arg1, simd256<(8)>::himask()), simd_andc(arg2, simd256<(8)>::himask())));
    1153 }
    1154 
    1155 //The total number of operations is 7.0
    1156 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1157 {
    1158         bitblock256_t high_bit = simd256<8>::constant<(128)>();
    1159         return simd256<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1160 }
    1161 
    1162 //The total number of operations is 7.0
    1163 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1164 {
    1165         bitblock256_t high_bit = simd256<16>::constant<(32768)>();
    1166         return simd256<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1167 }
    1168 
    1169 //The total number of operations is 7.0
    1170 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1171 {
    1172         bitblock256_t high_bit = simd256<32>::constant<(2147483648ULL)>();
    1173         return simd256<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1174 }
    1175 
    1176 //The total number of operations is 7.0
    1177 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1178 {
    1179         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
    1180         return simd256<64>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    1181 }
    1182 
    1183 //The total number of operations is 60.0
    1184 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1185 {
    1186         bitblock256_t tmpAns = simd256<(64)>::ugt(arg1, arg2);
    1187         bitblock256_t mask = simd_and(tmpAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
    1188         mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
    1189         return simd_or(simd256<128>::srai<(64)>(tmpAns), mask);
    1190 }
    1191 
    1192 //The total number of operations is 174.166666667
    1193 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ugt(bitblock256_t arg1, bitblock256_t arg2)
    1194 {
    1195         bitblock256_t tmpAns = simd256<(128)>::ugt(arg1, arg2);
    1196         bitblock256_t mask = simd_and(tmpAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
    1197         mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
    1198         return simd_or(simd256<256>::srai<(128)>(tmpAns), mask);
    1199 }
    1200 
    1201 //The total number of operations is 7.0
    1202 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::xor_hl(bitblock256_t arg1)
    1203 {
    1204         return simd_xor(simd256<2>::srli<(1)>(arg1), simd_and(arg1, simd256<2>::lomask()));
    1205 }
    1206 
    1207 //The total number of operations is 7.0
    1208 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::xor_hl(bitblock256_t arg1)
    1209 {
    1210         return simd_xor(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));
    1211 }
    1212 
    1213 //The total number of operations is 7.0
    1214 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::xor_hl(bitblock256_t arg1)
    1215 {
    1216         return simd_xor(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));
    1217 }
    1218 
    1219 //The total number of operations is 6.0
    1220 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::xor_hl(bitblock256_t arg1)
    1221 {
    1222         return simd_xor(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));
    1223 }
    1224 
    1225 //The total number of operations is 6.0
    1226 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::xor_hl(bitblock256_t arg1)
    1227 {
    1228         return simd_xor(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));
    1229 }
    1230 
    1231 //The total number of operations is 6.0
    1232 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::xor_hl(bitblock256_t arg1)
    1233 {
    1234         return simd_xor(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));
    1235 }
    1236 
    1237 //The total number of operations is 10.3333333333
    1238 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::xor_hl(bitblock256_t arg1)
    1239 {
    1240         return simd_xor(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));
    1241 }
    1242 
    1243 //The total number of operations is 16.5
    1244 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::xor_hl(bitblock256_t arg1)
    1245 {
    1246         return simd_xor(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
    1247 }
    1248 
    1249 //The total number of operations is 0
    1250 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::popcount(bitblock256_t arg1)
    1251 {
    1252         return arg1;
    1253 }
    1254 
    1255 //The total number of operations is 10.0
    1256 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::popcount(bitblock256_t arg1)
    1257 {
    1258         return simd256<2>::add_hl(simd256<(1)>::popcount(arg1));
    1259 }
    1260 
    1261 //The total number of operations is 21.0
    1262 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::popcount(bitblock256_t arg1)
    1263 {
    1264         return simd256<4>::add_hl(simd256<(2)>::popcount(arg1));
    1265 }
    1266 
    1267 //The total number of operations is 32.0
    1268 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::popcount(bitblock256_t arg1)
    1269 {
    1270         return simd256<8>::add_hl(simd256<(4)>::popcount(arg1));
    1271 }
    1272 
    1273 //The total number of operations is 42.0
    1274 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::popcount(bitblock256_t arg1)
    1275 {
    1276         return simd256<16>::add_hl(simd256<(8)>::popcount(arg1));
    1277 }
    1278 
    1279 //The total number of operations is 52.0
    1280 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::popcount(bitblock256_t arg1)
    1281 {
    1282         return simd256<32>::add_hl(simd256<(16)>::popcount(arg1));
    1283 }
    1284 
    1285 //The total number of operations is 38.0
    1286 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::popcount(bitblock256_t arg1)
    1287 {
    1288         bitblock256_t tmpAns = simd256<8>::popcount(arg1);
    1289         return avx_general_combine256(_mm_sad_epu8(avx_select_hi128(tmpAns), _mm_set1_epi32((int32_t)(0))), _mm_sad_epu8(avx_select_lo128(tmpAns), _mm_set1_epi32((int32_t)(0))));
    1290 }
    1291 
    1292 //The total number of operations is 73.6666666667
    1293 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1)
    1294 {
    1295         return simd256<128>::add_hl(simd256<(64)>::popcount(arg1));
    1296 }
    1297 
    1298 //The total number of operations is 115.5
    1299 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1)
    1300 {
    1301         bitblock256_t tmpAns = simd256<(128)>::popcount(arg1);
    1302         return simd256<(128)>::add(simd_and(tmpAns, simd256<256>::lomask()), simd256<256>::srli<(128)>(tmpAns));
    1303 }
    1304 
    1305 //The total number of operations is 14.0
    1306 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::any(bitblock256_t arg1)
    1307 {
    1308         bitblock256_t t0 = simd256<2>::srli<1>(arg1);
    1309         bitblock256_t f0 = simd_or(t0, simd_and(arg1, simd_xor(t0, simd256<8>::constant<255>())));
    1310         return simd_or(f0, simd256<2>::slli<1>(f0));
    1311 }
    1312 
    1313 //The total number of operations is 20.0
    1314 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::any(bitblock256_t arg1)
    1315 {
    1316         return simd256<4>::ugt(arg1, simd256<8>::constant<0>());
    1317 }
    1318 
    1319 //The total number of operations is 7.0
    1320 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::any(bitblock256_t arg1)
    1321 {
    1322         return simd256<8>::ugt(arg1, simd256<8>::constant<0>());
    1323 }
    1324 
    1325 //The total number of operations is 7.0
    1326 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::any(bitblock256_t arg1)
    1327 {
    1328         return simd256<16>::ugt(arg1, simd256<8>::constant<0>());
    1329 }
    1330 
    1331 //The total number of operations is 7.0
    1332 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::any(bitblock256_t arg1)
    1333 {
    1334         return simd256<32>::ugt(arg1, simd256<8>::constant<0>());
    1335 }
    1336 
    1337 //The total number of operations is 7.0
    1338 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::any(bitblock256_t arg1)
    1339 {
    1340         return simd256<64>::ugt(arg1, simd256<8>::constant<0>());
    1341 }
    1342 
    1343 //The total number of operations is 60.0
    1344 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::any(bitblock256_t arg1)
    1345 {
    1346         return simd256<128>::ugt(arg1, simd256<8>::constant<0>());
    1347 }
    1348 
    1349 //The total number of operations is 1.0
    1350 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::any(bitblock256_t arg1)
    1351 {
    1352         return ((bitblock256::any(arg1)) ? simd256<8>::constant<255>() : simd256<8>::constant<0>());
    1353 }
    1354 
    1355 //The total number of operations is 16.0
    1356 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1)
    1357 {
    1358         return simd256<2>::sub(simd256<2>::constant<0>(), arg1);
    1359 }
    1360 
    1361 //The total number of operations is 14.0
    1362 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1)
    1363 {
    1364         return simd256<4>::sub(simd256<4>::constant<0>(), arg1);
    1365 }
    1366 
    1367 //The total number of operations is 5.0
    1368 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::neg(bitblock256_t arg1)
    1369 {
    1370         return simd256<8>::sub(simd256<8>::constant<0>(), arg1);
    1371 }
    1372 
    1373 //The total number of operations is 5.0
    1374 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::neg(bitblock256_t arg1)
    1375 {
    1376         return simd256<16>::sub(simd256<16>::constant<0>(), arg1);
    1377 }
    1378 
    1379 //The total number of operations is 5.0
    1380 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::neg(bitblock256_t arg1)
    1381 {
    1382         return simd256<32>::sub(simd256<32>::constant<0>(), arg1);
    1383 }
    1384 
    1385 //The total number of operations is 5.0
    1386 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::neg(bitblock256_t arg1)
    1387 {
    1388         return simd256<64>::sub(simd256<64>::constant<0>(), arg1);
    1389 }
    1390 
    1391 //The total number of operations is 26.3333333333
    1392 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1)
    1393 {
    1394         return simd256<128>::sub(simd256<128>::constant<0>(), arg1);
    1395 }
    1396 
    1397 //The total number of operations is 75.6666666667
    1398 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1)
    1399 {
    1400         return simd256<256>::sub(simd256<256>::constant<0>(), arg1);
    1401 }
    1402 
    1403 //The total number of operations is 5.0
    1404 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1)
    1405 {
    1406         return simd_and(simd256<32>::slli<sh>(arg1), simd256<2>::constant<(((3)<<sh)&(3))>());
    1407 }
    1408 
    1409 //The total number of operations is 5.0
    1410 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1)
    1411 {
    1412         return simd_and(simd256<32>::slli<sh>(arg1), simd256<4>::constant<(((15)<<sh)&(15))>());
    1413 }
    1414 
    1415 //The total number of operations is 5.0
    1416 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1)
    1417 {
    1418         return simd_and(simd256<32>::slli<sh>(arg1), simd256<8>::constant<(((255)<<sh)&(255))>());
    1419 }
    1420 
    1421 //The total number of operations is 4.0
    1422 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1)
    1423 {
    1424         return avx_general_combine256(_mm_slli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
    1425 }
    1426 
    1427 //The total number of operations is 4.0
    1428 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1)
    1429 {
    1430         return avx_general_combine256(_mm_slli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
    1431 }
    1432 
    1433 //The total number of operations is 4.0
    1434 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1)
    1435 {
    1436         return avx_general_combine256(_mm_slli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi64(avx_select_lo128(arg1), (int32_t)(sh)));
    1437 }
    1438 
    1439 //The total number of operations is 8.33333333333
    1440 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1)
    1441 {
    1442         return (((sh%8) == 0) ? avx_byte_shift_left(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::slli<(sh&63)>(avx_byte_shift_left(arg1, 8)) : simd_or(simd256<64>::slli<sh>(arg1), avx_byte_shift_left(simd256<64>::srli<((128-sh)&63)>(arg1), 8))));
    1443 }
    1444 
    1445 //The total number of operations is 14.0
    1446 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1)
    1447 {
    1448         return ((sh < 128) ? simd_or(simd256<128>::slli<sh>(arg1), avx_move_lo128_to_hi128(simd256<128>::srli<(128-sh)>(arg1))) : simd256<128>::slli<(sh-128)>(avx_move_lo128_to_hi128(arg1)));
    1449 }
    1450 
    1451 //The total number of operations is 3.0
    1452 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1453 {
    1454         return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
    1455 }
    1456 
    1457 //The total number of operations is 11.0
    1458 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1459 {
    1460         return simd256<(1)>::ifh(simd256<1>::ifh(simd256<2>::himask(), arg1, simd256<2>::srli<(1)>(arg1)), arg2, arg3);
    1461 }
    1462 
    1463 //The total number of operations is 19.0
    1464 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1465 {
    1466         return simd256<(2)>::ifh(simd256<1>::ifh(simd256<4>::himask(), arg1, simd256<4>::srli<(2)>(arg1)), arg2, arg3);
    1467 }
    1468 
    1469 //The total number of operations is 8.0
    1470 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1471 {
    1472         return simd256<1>::ifh(simd256<8>::gt(simd256<8>::constant<0>(), arg1), arg2, arg3);
    1473 }
    1474 
    1475 //The total number of operations is 8.0
    1476 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1477 {
    1478         return simd256<1>::ifh(simd256<16>::gt(simd256<16>::constant<0>(), arg1), arg2, arg3);
    1479 }
    1480 
    1481 //The total number of operations is 8.0
    1482 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1483 {
    1484         return simd256<1>::ifh(simd256<32>::gt(simd256<32>::constant<0>(), arg1), arg2, arg3);
    1485 }
    1486 
    1487 //The total number of operations is 1.0
    1488 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1489 {
    1490         return (bitblock256_t)_mm256_blendv_pd((__m256d)(arg3), (__m256d)(arg2), (__m256d)(arg1));
    1491 }
    1492 
    1493 //The total number of operations is 12.3333333333
    1494 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1495 {
    1496         return simd256<(64)>::ifh(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg1)), arg2, arg3);
    1497 }
    1498 
    1499 //The total number of operations is 29.8333333333
    1500 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
    1501 {
    1502         return simd256<(128)>::ifh(simd256<1>::ifh(simd256<256>::himask(), arg1, simd256<256>::srli<(128)>(arg1)), arg2, arg3);
    1503 }
    1504 
    1505 //The total number of operations is 1.0
    15061152template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2)
    15071153{
     
    15681214}
    15691215
     1216//The total number of operations is 1.0
     1217template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1218{
     1219        return simd_andc(arg1, arg2);
     1220}
     1221
     1222//The total number of operations is 23.0
     1223template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1224{
     1225        bitblock256_t tmpAns = simd256<(1)>::ugt(arg1, arg2);
     1226        bitblock256_t mask = simd_and(tmpAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
     1227        mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
     1228        return simd_or(simd256<2>::srai<(1)>(tmpAns), mask);
     1229}
     1230
     1231//The total number of operations is 20.0
     1232template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1233{
     1234        return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::ugt(simd_and(simd256<(8)>::himask(), arg1), arg2), simd256<(8)>::ugt(simd_andc(arg1, simd256<(8)>::himask()), simd_andc(arg2, simd256<(8)>::himask())));
     1235}
     1236
     1237//The total number of operations is 7.0
     1238template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1239{
     1240        bitblock256_t high_bit = simd256<8>::constant<(128)>();
     1241        return simd256<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1242}
     1243
     1244//The total number of operations is 7.0
     1245template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1246{
     1247        bitblock256_t high_bit = simd256<16>::constant<(32768)>();
     1248        return simd256<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1249}
     1250
     1251//The total number of operations is 7.0
     1252template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1253{
     1254        bitblock256_t high_bit = simd256<32>::constant<(2147483648ULL)>();
     1255        return simd256<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1256}
     1257
     1258//The total number of operations is 7.0
     1259template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1260{
     1261        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     1262        return simd256<64>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     1263}
     1264
     1265//The total number of operations is 60.0
     1266template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1267{
     1268        bitblock256_t tmpAns = simd256<(64)>::ugt(arg1, arg2);
     1269        bitblock256_t mask = simd_and(tmpAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
     1270        mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
     1271        return simd_or(simd256<128>::srai<(64)>(tmpAns), mask);
     1272}
     1273
     1274//The total number of operations is 174.166666667
     1275template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ugt(bitblock256_t arg1, bitblock256_t arg2)
     1276{
     1277        bitblock256_t tmpAns = simd256<(128)>::ugt(arg1, arg2);
     1278        bitblock256_t mask = simd_and(tmpAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
     1279        mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
     1280        return simd_or(simd256<256>::srai<(128)>(tmpAns), mask);
     1281}
     1282
     1283//The total number of operations is 7.0
     1284template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::xor_hl(bitblock256_t arg1)
     1285{
     1286        return simd_xor(simd256<2>::srli<(1)>(arg1), simd_and(arg1, simd256<2>::lomask()));
     1287}
     1288
     1289//The total number of operations is 7.0
     1290template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::xor_hl(bitblock256_t arg1)
     1291{
     1292        return simd_xor(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));
     1293}
     1294
     1295//The total number of operations is 7.0
     1296template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::xor_hl(bitblock256_t arg1)
     1297{
     1298        return simd_xor(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));
     1299}
     1300
     1301//The total number of operations is 6.0
     1302template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::xor_hl(bitblock256_t arg1)
     1303{
     1304        return simd_xor(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));
     1305}
     1306
     1307//The total number of operations is 6.0
     1308template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::xor_hl(bitblock256_t arg1)
     1309{
     1310        return simd_xor(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));
     1311}
     1312
     1313//The total number of operations is 6.0
     1314template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::xor_hl(bitblock256_t arg1)
     1315{
     1316        return simd_xor(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));
     1317}
     1318
     1319//The total number of operations is 10.3333333333
     1320template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::xor_hl(bitblock256_t arg1)
     1321{
     1322        return simd_xor(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));
     1323}
     1324
     1325//The total number of operations is 16.5
     1326template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::xor_hl(bitblock256_t arg1)
     1327{
     1328        return simd_xor(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
     1329}
     1330
     1331//The total number of operations is 0
     1332template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::popcount(bitblock256_t arg1)
     1333{
     1334        return arg1;
     1335}
     1336
     1337//The total number of operations is 10.0
     1338template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::popcount(bitblock256_t arg1)
     1339{
     1340        return simd256<2>::add_hl(simd256<(1)>::popcount(arg1));
     1341}
     1342
     1343//The total number of operations is 21.0
     1344template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::popcount(bitblock256_t arg1)
     1345{
     1346        return simd256<4>::add_hl(simd256<(2)>::popcount(arg1));
     1347}
     1348
     1349//The total number of operations is 32.0
     1350template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::popcount(bitblock256_t arg1)
     1351{
     1352        return simd256<8>::add_hl(simd256<(4)>::popcount(arg1));
     1353}
     1354
     1355//The total number of operations is 42.0
     1356template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::popcount(bitblock256_t arg1)
     1357{
     1358        return simd256<16>::add_hl(simd256<(8)>::popcount(arg1));
     1359}
     1360
     1361//The total number of operations is 52.0
     1362template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::popcount(bitblock256_t arg1)
     1363{
     1364        return simd256<32>::add_hl(simd256<(16)>::popcount(arg1));
     1365}
     1366
     1367//The total number of operations is 38.0
     1368template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::popcount(bitblock256_t arg1)
     1369{
     1370        bitblock256_t tmpAns = simd256<8>::popcount(arg1);
     1371        return avx_general_combine256(_mm_sad_epu8(avx_select_hi128(tmpAns), _mm_set1_epi32((int32_t)(0))), _mm_sad_epu8(avx_select_lo128(tmpAns), _mm_set1_epi32((int32_t)(0))));
     1372}
     1373
     1374//The total number of operations is 73.6666666667
     1375template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1)
     1376{
     1377        return simd256<128>::add_hl(simd256<(64)>::popcount(arg1));
     1378}
     1379
     1380//The total number of operations is 115.5
     1381template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1)
     1382{
     1383        bitblock256_t tmpAns = simd256<(128)>::popcount(arg1);
     1384        return simd256<(128)>::add(simd_and(tmpAns, simd256<256>::lomask()), simd256<256>::srli<(128)>(tmpAns));
     1385}
     1386
     1387//The total number of operations is 14.0
     1388template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::any(bitblock256_t arg1)
     1389{
     1390        bitblock256_t t0 = simd256<2>::srli<1>(arg1);
     1391        bitblock256_t f0 = simd_or(t0, simd_and(arg1, simd_xor(t0, simd256<8>::constant<255>())));
     1392        return simd_or(f0, simd256<2>::slli<1>(f0));
     1393}
     1394
     1395//The total number of operations is 20.0
     1396template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::any(bitblock256_t arg1)
     1397{
     1398        return simd256<4>::ugt(arg1, simd256<8>::constant<0>());
     1399}
     1400
     1401//The total number of operations is 7.0
     1402template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::any(bitblock256_t arg1)
     1403{
     1404        return simd256<8>::ugt(arg1, simd256<8>::constant<0>());
     1405}
     1406
     1407//The total number of operations is 7.0
     1408template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::any(bitblock256_t arg1)
     1409{
     1410        return simd256<16>::ugt(arg1, simd256<8>::constant<0>());
     1411}
     1412
     1413//The total number of operations is 7.0
     1414template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::any(bitblock256_t arg1)
     1415{
     1416        return simd256<32>::ugt(arg1, simd256<8>::constant<0>());
     1417}
     1418
     1419//The total number of operations is 7.0
     1420template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::any(bitblock256_t arg1)
     1421{
     1422        return simd256<64>::ugt(arg1, simd256<8>::constant<0>());
     1423}
     1424
     1425//The total number of operations is 60.0
     1426template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::any(bitblock256_t arg1)
     1427{
     1428        return simd256<128>::ugt(arg1, simd256<8>::constant<0>());
     1429}
     1430
     1431//The total number of operations is 1.0
     1432template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::any(bitblock256_t arg1)
     1433{
     1434        return ((bitblock256::any(arg1)) ? simd256<8>::constant<255>() : simd256<8>::constant<0>());
     1435}
     1436
     1437//The total number of operations is 16.0
     1438template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1)
     1439{
     1440        return simd256<2>::sub(simd256<2>::constant<0>(), arg1);
     1441}
     1442
     1443//The total number of operations is 14.0
     1444template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1)
     1445{
     1446        return simd256<4>::sub(simd256<4>::constant<0>(), arg1);
     1447}
     1448
     1449//The total number of operations is 5.0
     1450template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::neg(bitblock256_t arg1)
     1451{
     1452        return simd256<8>::sub(simd256<8>::constant<0>(), arg1);
     1453}
     1454
     1455//The total number of operations is 5.0
     1456template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::neg(bitblock256_t arg1)
     1457{
     1458        return simd256<16>::sub(simd256<16>::constant<0>(), arg1);
     1459}
     1460
     1461//The total number of operations is 5.0
     1462template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::neg(bitblock256_t arg1)
     1463{
     1464        return simd256<32>::sub(simd256<32>::constant<0>(), arg1);
     1465}
     1466
     1467//The total number of operations is 5.0
     1468template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::neg(bitblock256_t arg1)
     1469{
     1470        return simd256<64>::sub(simd256<64>::constant<0>(), arg1);
     1471}
     1472
     1473//The total number of operations is 26.3333333333
     1474template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1)
     1475{
     1476        return simd256<128>::sub(simd256<128>::constant<0>(), arg1);
     1477}
     1478
     1479//The total number of operations is 75.6666666667
     1480template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1)
     1481{
     1482        return simd256<256>::sub(simd256<256>::constant<0>(), arg1);
     1483}
     1484
     1485//The total number of operations is 5.0
     1486template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1)
     1487{
     1488        return simd_and(simd256<32>::slli<sh>(arg1), simd256<2>::constant<(((3)<<sh)&(3))>());
     1489}
     1490
     1491//The total number of operations is 5.0
     1492template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1)
     1493{
     1494        return simd_and(simd256<32>::slli<sh>(arg1), simd256<4>::constant<(((15)<<sh)&(15))>());
     1495}
     1496
     1497//The total number of operations is 5.0
     1498template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1)
     1499{
     1500        return simd_and(simd256<32>::slli<sh>(arg1), simd256<8>::constant<(((255)<<sh)&(255))>());
     1501}
     1502
     1503//The total number of operations is 4.0
     1504template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1)
     1505{
     1506        return avx_general_combine256(_mm_slli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
     1507}
     1508
     1509//The total number of operations is 4.0
     1510template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1)
     1511{
     1512        return avx_general_combine256(_mm_slli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
     1513}
     1514
     1515//The total number of operations is 4.0
     1516template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1)
     1517{
     1518        return avx_general_combine256(_mm_slli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi64(avx_select_lo128(arg1), (int32_t)(sh)));
     1519}
     1520
     1521//The total number of operations is 8.33333333333
         //Left shift of every 128-bit field by sh, selected from three forms:
         //  sh%8 == 0 : avx_byte_shift_left(arg1, sh/8);
         //  sh >= 64  : avx_byte_shift_left by 8, followed by a 64-bit-field shift by sh&63;
         //  otherwise : 64-bit-field shift by sh, ORed with the 64-bit-field right shift by
         //              (128-sh)&63 moved up with avx_byte_shift_left(..., 8) -- presumably the
         //              bits carried from the low into the high 64 bits (avx_byte_shift_left is
         //              defined elsewhere; confirm its per-128-bit behaviour).
         //The choice is an ordinary ?: expression, so all three arms are instantiated for every sh.
     1522template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1)
     1523{
     1524        return (((sh%8) == 0) ? avx_byte_shift_left(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::slli<(sh&63)>(avx_byte_shift_left(arg1, 8)) : simd_or(simd256<64>::slli<sh>(arg1), avx_byte_shift_left(simd256<64>::srli<((128-sh)&63)>(arg1), 8))));
     1525}
     1526
     1527//The total number of operations is 14.0
         //Left shift of the whole 256-bit block by sh:
         //  sh < 128 : 128-bit-field shift by sh, ORed with avx_move_lo128_to_hi128 applied to the
         //             128-bit-field right shift by (128-sh);
         //  otherwise: 128-bit-field shift by (sh-128) of avx_move_lo128_to_hi128(arg1).
         //Both arms of the ?: are instantiated for every sh.
     1528template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1)
     1529{
     1530        return ((sh < 128) ? simd_or(simd256<128>::slli<sh>(arg1), avx_move_lo128_to_hi128(simd256<128>::srli<(128-sh)>(arg1))) : simd256<128>::slli<(sh-128)>(avx_move_lo128_to_hi128(arg1)));
     1531}
     1532
     1533//The total number of operations is 3.0
         //Bitwise select: ORs (arg2 AND arg1) with simd_andc(arg3, arg1). With simd_andc(x, y)
         //presumably meaning x AND NOT y (defined elsewhere), each result bit is arg2's bit where
         //arg1 has a 1 and arg3's bit where arg1 has a 0.
     1534template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1535{
     1536        return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
     1537}
     1538
     1539//The total number of operations is 11.0
         //Select between arg2 and arg3 per 2-bit field of arg1. The inner simd256<1>::ifh builds a
         //bit mask that takes arg1 at the himask positions and arg1 shifted right by 1 elsewhere
         //(so the high bit of each field appears in both bit positions); that mask then drives a
         //bitwise simd256<1>::ifh over arg2/arg3.
     1540template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1541{
     1542        return simd256<(1)>::ifh(simd256<1>::ifh(simd256<2>::himask(), arg1, simd256<2>::srli<(1)>(arg1)), arg2, arg3);
     1543}
     1544
     1545//The total number of operations is 19.0
         //Select between arg2 and arg3 per 4-bit field of arg1. The inner simd256<1>::ifh copies the
         //high 2 bits of each field into its low 2 bits (arg1 at himask positions, arg1 >> 2
         //elsewhere); the result is handed to simd256<2>::ifh as the selector.
     1546template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1547{
     1548        return simd256<(2)>::ifh(simd256<1>::ifh(simd256<4>::himask(), arg1, simd256<4>::srli<(2)>(arg1)), arg2, arg3);
     1549}
     1550
     1551//The total number of operations is 8.0
         //Select between arg2 and arg3 per 8-bit field: the selector mask is
         //simd256<8>::gt(constant<0>, arg1), and simd256<1>::ifh then picks bits bitwise.
         //NOTE(review): this relies on gt being a signed compare that yields all-ones fields
         //(so the mask is set exactly where a field's top bit is 1) -- confirm against gt.
     1552template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1553{
     1554        return simd256<1>::ifh(simd256<8>::gt(simd256<8>::constant<0>(), arg1), arg2, arg3);
     1555}
     1556
     1557//The total number of operations is 8.0
         //Select between arg2 and arg3 per 16-bit field: the selector mask is
         //simd256<16>::gt(constant<0>, arg1), and simd256<1>::ifh then picks bits bitwise.
         //NOTE(review): relies on gt being a signed compare yielding all-ones fields -- confirm.
     1558template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1559{
     1560        return simd256<1>::ifh(simd256<16>::gt(simd256<16>::constant<0>(), arg1), arg2, arg3);
     1561}
     1562
     1563//The total number of operations is 8.0
         //Select between arg2 and arg3 per 32-bit field: the selector mask is
         //simd256<32>::gt(constant<0>, arg1), and simd256<1>::ifh then picks bits bitwise.
         //NOTE(review): relies on gt being a signed compare yielding all-ones fields -- confirm.
     1564template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1565{
     1566        return simd256<1>::ifh(simd256<32>::gt(simd256<32>::constant<0>(), arg1), arg2, arg3);
     1567}
     1568
     1569//The total number of operations is 1.0
     1570template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1571{
     1572        return (bitblock256_t)_mm256_blendv_pd(arg3, arg2, arg1);
     1573}
     1574
     1575//The total number of operations is 12.3333333333
         //Select between arg2 and arg3 per 128-bit field of arg1. The inner simd256<1>::ifh copies
         //the high 64 bits of each field into its low 64 bits (arg1 at himask positions, arg1 >> 64
         //elsewhere); the result is handed to simd256<64>::ifh as the selector.
     1576template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1577{
     1578        return simd256<(64)>::ifh(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg1)), arg2, arg3);
     1579}
     1580
     1581//The total number of operations is 29.8333333333
         //Select between arg2 and arg3 for the whole 256-bit block. The inner simd256<1>::ifh copies
         //the high 128 bits of arg1 into its low 128 bits (arg1 at himask positions, arg1 >> 128
         //elsewhere); the result is handed to simd256<128>::ifh as the selector.
     1582template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
     1583{
     1584        return simd256<(128)>::ifh(simd256<1>::ifh(simd256<256>::himask(), arg1, simd256<256>::srli<(128)>(arg1)), arg2, arg3);
     1585}
     1586
     1587//The total number of operations is 7.0
         //Arithmetic right shift of every 2-bit field. sh == 0 returns arg1 unchanged; any other sh
         //produces the same value: the high bit kept in place (AND with himask) ORed with
         //simd256<2>::srli<1>(arg1), i.e. the high bit copied into the low bit.
     1588template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
     1589{
     1590        return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
     1591}
     1592
     1593//The total number of operations is 17.5
         //Arithmetic right shift of every 4-bit field, composed from the 2-bit version:
         //  high half: simd256<2>::srai<min(sh,2)>(arg1) restricted to simd256<4>::himask();
         //  ORed with: simd256<4>::srli<sh>(arg1) when sh <= 2, otherwise simd256<2>::srai<sh-2>
         //             applied to simd256<4>::srli<2>(arg1).
         //Both arms of the ?: are instantiated for every sh.
     1594template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
     1595{
     1596        return simd_or(simd_and(simd256<4>::himask(), simd256<(2)>::srai<((sh < (2)) ? sh : (2))>(arg1)), ((sh <= (2)) ? simd256<4>::srli<sh>(arg1) : simd256<(2)>::srai<(sh-(2))>(simd256<4>::srli<(2)>(arg1))));
     1597}
     1598
     1599//The total number of operations is 12.0
     1600template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
     1601{
     1602        bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1603        return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1604}
     1605
     1606//The total number of operations is 4.0
         //Arithmetic right shift of every 16-bit field by sh: _mm_srai_epi16 is applied separately
         //to the halves returned by avx_select_hi128 and avx_select_lo128, and
         //avx_general_combine256 rejoins the two results (the hi-half result is passed first).
     1607template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
     1608{
     1609        return avx_general_combine256(_mm_srai_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
     1610}
     1611
     1612//The total number of operations is 4.0
         //Arithmetic right shift of every 32-bit field by sh: _mm_srai_epi32 is applied separately
         //to the halves returned by avx_select_hi128 and avx_select_lo128, and
         //avx_general_combine256 rejoins the two results (the hi-half result is passed first).
     1613template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
     1614{
     1615        return avx_general_combine256(_mm_srai_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
     1616}
     1617
     1618//The total number of operations is 12.0
         //Arithmetic right shift of every 64-bit field, composed from the 32-bit version:
         //  high half: simd256<32>::srai<min(sh,32)>(arg1) restricted to simd256<64>::himask();
         //  ORed with: simd256<64>::srli<sh>(arg1) when sh <= 32, otherwise simd256<32>::srai<sh-32>
         //             applied to simd256<64>::srli<32>(arg1).
         //Both arms of the ?: are instantiated for every sh.
     1619template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
     1620{
     1621        return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));
     1622}
     1623
     1624//The total number of operations is 28.3333333333
         //Arithmetic right shift of every 128-bit field, composed from the 64-bit version:
         //  high half: simd256<64>::srai<min(sh,64)>(arg1) restricted to simd256<128>::himask();
         //  ORed with: simd256<128>::srli<sh>(arg1) when sh <= 64, otherwise
         //             simd256<64>::srai<sh-64> applied to simd256<128>::srli<64>(arg1).
         //Both arms of the ?: are instantiated for every sh.
     1625template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
     1626{
     1627        return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));
     1628}
     1629
     1630//The total number of operations is 59.0
         //Arithmetic right shift of the whole 256-bit block, composed from the 128-bit version:
         //  high half: simd256<128>::srai<min(sh,128)>(arg1) restricted to simd256<256>::himask();
         //  ORed with: simd256<256>::srli<sh>(arg1) when sh <= 128, otherwise
         //             simd256<128>::srai<sh-128> applied to simd256<256>::srli<128>(arg1).
         //Both arms of the ?: are instantiated for every sh.
     1631template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
     1632{
     1633        return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));
     1634}
     1635
    15701636//The total number of operations is 10.0
    15711637template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1)
     
    16141680{
    16151681        return simd256<256>::add(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
     1682}
     1683
     1684//The total number of operations is 0
         //Mask selecting the low half of every 2-bit field: the value 1 (binary 01) in each field.
     1685template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
     1686{
     1687        return simd256<2>::constant<(1)>();
     1688}
     1689
     1690//The total number of operations is 0
         //Mask selecting the low half of every 4-bit field: the value 3 (binary 0011) in each field.
     1691template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
     1692{
     1693        return simd256<4>::constant<(3)>();
     1694}
     1695
     1696//The total number of operations is 0
         //Mask selecting the low half of every 8-bit field: the value 15 (0x0F) in each byte.
     1697template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
     1698{
     1699        return simd256<8>::constant<(15)>();
     1700}
     1701
     1702//The total number of operations is 0
         //Mask selecting the low half of every 16-bit field: the value 255 (0x00FF) in each field.
     1703template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
     1704{
     1705        return simd256<16>::constant<(255)>();
     1706}
     1707
     1708//The total number of operations is 0
         //Mask selecting the low half of every 32-bit field: the value 65535 (0x0000FFFF) in each
         //field.
     1709template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
     1710{
     1711        return simd256<32>::constant<(65535)>();
     1712}
     1713
     1714//The total number of operations is 0
         //Mask selecting the low half of every 64-bit field. _mm256_set_epi32 lists its 32-bit
         //elements from most to least significant, so the alternating 0, -1 arguments give each
         //64-bit field zeros in its upper 32 bits and ones in its lower 32 bits.
     1715template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
     1716{
     1717        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1))));
     1718}
     1719
     1720//The total number of operations is 0
         //Mask selecting the low half of every 128-bit field. _mm256_set_epi32 lists its 32-bit
         //elements from most to least significant, so the 0, 0, -1, -1 pattern gives each 128-bit
         //field zeros in its upper 64 bits and ones in its lower 64 bits.
     1721template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
     1722{
     1723        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1))));
     1724}
     1725
     1726//The total number of operations is 0
         //Mask selecting the low half of the 256-bit block. _mm256_set_epi32 lists its 32-bit
         //elements from most to least significant, so four 0 arguments followed by four -1
         //arguments leave the upper 128 bits clear and the lower 128 bits set.
     1727template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
     1728{
     1729        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1))));
    16161730}
    16171731
     
    17371851}
    17381852
     1853//The total number of operations is 1.0
         //Unsigned minimum of 1-bit fields: the bitwise AND of the two operands.
     1854template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1855{
     1856        return simd_and(arg1, arg2);
     1857}
     1858
     1859//The total number of operations is 24.0
         //Unsigned minimum of 2-bit fields, built from the 1-bit version.
         //tmpAns holds the 1-bit minimum at every bit position. eqMask1/eqMask2 carry, at each
         //field's low-bit position, whether the high bit of tmpAns equals the high bit of
         //arg1/arg2. The result takes tmpAns at the high-bit positions; at the low-bit positions
         //it takes tmpAns when both masks are set, arg1 when only eqMask1 is set, and arg2 when
         //eqMask1 is clear.
     1860template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1861{
     1862        bitblock256_t tmpAns = simd256<(1)>::umin(arg1, arg2);
     1863        bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
     1864        bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
     1865        return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1866}
     1867
     1868//The total number of operations is 14.0
         //Unsigned minimum of 4-bit fields via two 8-bit minimums. The high nibble of each byte
         //comes from simd256<8>::umin(arg1, arg2) restricted to simd256<8>::himask(); the low
         //nibble comes from simd256<8>::umin of the operands with only their low nibbles kept
         //(simd256<8>::lomask()).
     1869template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1870{
     1871        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umin(arg1, arg2)), simd256<(8)>::umin(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
     1872}
     1873
     1874//The total number of operations is 5.0
         //Unsigned minimum of 8-bit fields: _mm_min_epu8 is applied to the matching 128-bit halves
         //of arg1 and arg2 (avx_select_hi128 / avx_select_lo128) and avx_general_combine256 rejoins
         //the two results (the hi-half result is passed first).
     1875template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1876{
     1877        return avx_general_combine256(_mm_min_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1878}
     1879
     1880//The total number of operations is 5.0
         //Unsigned minimum of 16-bit fields: _mm_min_epu16 is applied to the matching 128-bit halves
         //of arg1 and arg2 (avx_select_hi128 / avx_select_lo128) and avx_general_combine256 rejoins
         //the two results (the hi-half result is passed first).
     1881template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1882{
     1883        return avx_general_combine256(_mm_min_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1884}
     1885
     1886//The total number of operations is 5.0
         //Unsigned minimum of 32-bit fields: _mm_min_epu32 is applied to the matching 128-bit halves
         //of arg1 and arg2 (avx_select_hi128 / avx_select_lo128) and avx_general_combine256 rejoins
         //the two results (the hi-half result is passed first).
     1887template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1888{
     1889        return avx_general_combine256(_mm_min_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1890}
     1891
     1892//The total number of operations is 11.0
         //Unsigned minimum of 64-bit fields through simd256<64>::min (defined elsewhere, presumably
         //the signed minimum -- confirm). Both operands have the top bit of every 64-bit field
         //flipped (high_bit is 2^63 per field), the minimum is taken, and the top bit is flipped
         //back in the result.
     1893template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1894{
     1895        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     1896        return simd_xor(simd256<64>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1897}
     1898
     1899//The total number of operations is 46.6666666667
         //Unsigned minimum of 128-bit fields, built from the 64-bit version.
         //tmpAns holds the 64-bit minimum of every half. eqMask1/eqMask2 carry, in each field's low
         //half, whether the high half of tmpAns equals the high half of arg1/arg2. The result takes
         //tmpAns in the high half; in the low half it takes tmpAns when both masks are set, arg1
         //when only eqMask1 is set, and arg2 when eqMask1 is clear.
     1900template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1901{
     1902        bitblock256_t tmpAns = simd256<(64)>::umin(arg1, arg2);
     1903        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
     1904        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
     1905        return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1906}
     1907
     1908//The total number of operations is 132.0
         //Unsigned minimum of the whole 256-bit block, built from the 128-bit version.
         //tmpAns holds the 128-bit minimum of each half. eqMask1/eqMask2 carry, in the low half,
         //whether the high half of tmpAns equals the high half of arg1/arg2. The result takes
         //tmpAns in the high half; in the low half it takes tmpAns when both masks are set, arg1
         //when only eqMask1 is set, and arg2 when eqMask1 is clear.
     1909template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2)
     1910{
     1911        bitblock256_t tmpAns = simd256<(128)>::umin(arg1, arg2);
     1912        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
     1913        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
     1914        return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1915}
     1916
     1917//The total number of operations is 1.0
         //Unsigned maximum of 1-bit fields: the bitwise OR of the two operands.
     1918template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1919{
     1920        return simd_or(arg1, arg2);
     1921}
     1922
     1923//The total number of operations is 24.0
         //Unsigned maximum of 2-bit fields, built from the 1-bit version.
         //tmpAns holds the 1-bit maximum at every bit position. eqMask1/eqMask2 carry, at each
         //field's low-bit position, whether the high bit of tmpAns equals the high bit of
         //arg1/arg2. The result takes tmpAns at the high-bit positions; at the low-bit positions
         //it takes tmpAns when both masks are set, arg1 when only eqMask1 is set, and arg2 when
         //eqMask1 is clear.
     1924template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1925{
     1926        bitblock256_t tmpAns = simd256<(1)>::umax(arg1, arg2);
     1927        bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
     1928        bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
     1929        return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1930}
     1931
     1932//The total number of operations is 14.0
         //Unsigned maximum of 4-bit fields via two 8-bit maximums. The high nibble of each byte
         //comes from simd256<8>::umax(arg1, arg2) restricted to simd256<8>::himask(); the low
         //nibble comes from simd256<8>::umax of the operands with only their low nibbles kept
         //(simd256<8>::lomask()).
     1933template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1934{
     1935        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
     1936}
     1937
     1938//The total number of operations is 5.0
         //Unsigned maximum of 8-bit fields: _mm_max_epu8 is applied to the matching 128-bit halves
         //of arg1 and arg2 (avx_select_hi128 / avx_select_lo128) and avx_general_combine256 rejoins
         //the two results (the hi-half result is passed first).
     1939template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1940{
     1941        return avx_general_combine256(_mm_max_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1942}
     1943
     1944//The total number of operations is 5.0
         //Unsigned maximum of 16-bit fields: _mm_max_epu16 is applied to the matching 128-bit halves
         //of arg1 and arg2 (avx_select_hi128 / avx_select_lo128) and avx_general_combine256 rejoins
         //the two results (the hi-half result is passed first).
     1945template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1946{
     1947        return avx_general_combine256(_mm_max_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1948}
     1949
     1950//The total number of operations is 5.0
         //Unsigned maximum of 32-bit fields: _mm_max_epu32 is applied to the matching 128-bit halves
         //of arg1 and arg2 (avx_select_hi128 / avx_select_lo128) and avx_general_combine256 rejoins
         //the two results (the hi-half result is passed first).
     1951template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1952{
     1953        return avx_general_combine256(_mm_max_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1954}
     1955
     1956//The total number of operations is 11.0
         //Unsigned maximum of 64-bit fields through simd256<64>::max (defined elsewhere, presumably
         //the signed maximum -- confirm). Both operands have the top bit of every 64-bit field
         //flipped (high_bit is 2^63 per field), the maximum is taken, and the top bit is flipped
         //back in the result.
     1957template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1958{
     1959        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     1960        return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1961}
     1962
     1963//The total number of operations is 46.6666666667
         //Unsigned maximum of 128-bit fields, built from the 64-bit version.
         //tmpAns holds the 64-bit maximum of every half. eqMask1/eqMask2 carry, in each field's low
         //half, whether the high half of tmpAns equals the high half of arg1/arg2. The result takes
         //tmpAns in the high half; in the low half it takes tmpAns when both masks are set, arg1
         //when only eqMask1 is set, and arg2 when eqMask1 is clear.
     1964template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1965{
     1966        bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
     1967        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
     1968        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
     1969        return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1970}
     1971
     1972//The total number of operations is 132.0
         //Unsigned maximum of the whole 256-bit block, built from the 128-bit version.
         //tmpAns holds the 128-bit maximum of each half. eqMask1/eqMask2 carry, in the low half,
         //whether the high half of tmpAns equals the high half of arg1/arg2. The result takes
         //tmpAns in the high half; in the low half it takes tmpAns when both masks are set, arg1
         //when only eqMask1 is set, and arg2 when eqMask1 is clear.
     1973template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
     1974{
     1975        bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
     1976        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
     1977        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
     1978        return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1979}
     1980
     1981//The total number of operations is 1.0
         //Less-than on 1-bit fields: simd_andc(arg1, arg2). With simd_andc(x, y) presumably meaning
         //x AND NOT y (defined elsewhere), a result bit is set where arg1 is 1 and arg2 is 0.
     1982template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2)
     1983{
     1984        return simd_andc(arg1, arg2);
     1985}
     1986
     1987//The total number of operations is 24.0
         //Less-than on 2-bit fields, built from the 1-bit comparisons.
         //hiAns is the 1-bit lt and loAns the 1-bit ult at every bit position. mask is set at a
         //field's low-bit position when the low bits compare ult and the high bits are equal, and
         //is then copied into the high-bit position as well. The result ORs that with
         //simd256<2>::srai<1>(hiAns), which spreads the high-bit verdict over the whole field.
     1988template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2)
     1989{
     1990        bitblock256_t hiAns = simd256<(1)>::lt(arg1, arg2);
     1991        bitblock256_t loAns = simd256<(1)>::ult(arg1, arg2);
     1992        bitblock256_t mask = simd_and(loAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
     1993        mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
     1994        return simd_or(simd256<2>::srai<(1)>(hiAns), mask);
     1995}
     1996
     1997//The total number of operations is 38.0
         //Less-than on 4-bit fields expressed through simd256<4>::ult: both operands have the top
         //bit of every 4-bit field flipped (constant 8) before the unsigned comparison.
     1998template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2)
     1999{
     2000        bitblock256_t high_bit = simd256<4>::constant<(8)>();
     2001        return simd256<4>::ult(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     2002}
     2003
     2004//The total number of operations is 13.0
         //Less-than on 8-bit fields derived from the other two comparisons: the result is
         //NOT gt(arg1, arg2) AND NOT eq(arg1, arg2).
     2005template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2006{
     2007        return simd_and(simd_not(simd256<8>::gt(arg1, arg2)), simd_not(simd256<8>::eq(arg1, arg2)));
     2008}
     2009
     2010//The total number of operations is 13.0
         //Less-than on 16-bit fields derived from the other two comparisons: the result is
         //NOT gt(arg1, arg2) AND NOT eq(arg1, arg2).
     2011template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2012{
     2013        return simd_and(simd_not(simd256<16>::gt(arg1, arg2)), simd_not(simd256<16>::eq(arg1, arg2)));
     2014}
     2015
     2016//The total number of operations is 13.0
         //Less-than on 32-bit fields derived from the other two comparisons: the result is
         //NOT gt(arg1, arg2) AND NOT eq(arg1, arg2).
     2017template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2018{
     2019        return simd_and(simd_not(simd256<32>::gt(arg1, arg2)), simd_not(simd256<32>::eq(arg1, arg2)));
     2020}
     2021
     2022//The total number of operations is 13.0
         //Less-than on 64-bit fields derived from the other two comparisons: the result is
         //NOT gt(arg1, arg2) AND NOT eq(arg1, arg2).
     2023template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2024{
     2025        return simd_and(simd_not(simd256<64>::gt(arg1, arg2)), simd_not(simd256<64>::eq(arg1, arg2)));
     2026}
     2027
     2028//The total number of operations is 81.0
         //Less-than on 128-bit fields, built from the 64-bit comparisons.
         //hiAns is the 64-bit lt and loAns the 64-bit ult of every half. mask is set in a field's
         //low half when the low halves compare ult and the high halves are equal, and is then
         //copied into the high half as well. The result ORs that with
         //simd256<128>::srai<64>(hiAns), which spreads the high-half verdict over the whole field.
     2029template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2030{
     2031        bitblock256_t hiAns = simd256<(64)>::lt(arg1, arg2);
     2032        bitblock256_t loAns = simd256<(64)>::ult(arg1, arg2);
     2033        bitblock256_t mask = simd_and(loAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
     2034        mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
     2035        return simd_or(simd256<128>::srai<(64)>(hiAns), mask);
     2036}
     2037
     2038//The total number of operations is 263.166666667
         //Less-than on the whole 256-bit block, built from the 128-bit comparisons.
         //hiAns is the 128-bit lt and loAns the 128-bit ult of each half. mask is set in the low
         //half when the low halves compare ult and the high halves are equal, and is then copied
         //into the high half as well. The result ORs that with simd256<256>::srai<128>(hiAns),
         //which spreads the high-half verdict over the whole block.
     2039template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2)
     2040{
     2041        bitblock256_t hiAns = simd256<(128)>::lt(arg1, arg2);
     2042        bitblock256_t loAns = simd256<(128)>::ult(arg1, arg2);
     2043        bitblock256_t mask = simd_and(loAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
     2044        mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
     2045        return simd_or(simd256<256>::srai<(128)>(hiAns), mask);
     2046}
     2047
     2048//The total number of operations is 2.0
         //Equality on 1-bit fields: the complement of the XOR, so a result bit is set where the two
         //operands have the same bit.
     2049template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2050{
     2051        return simd_not(simd_xor(arg1, arg2));
     2052}
     2053
     2054//The total number of operations is 14.0
         //Equality on 2-bit fields, built from the 1-bit version.
         //tmpAns marks equal bits. loMask ANDs tmpAns with itself shifted right by 1 within each
         //field, leaving the low bit set only when both bits matched; hiMask is that low bit moved
         //to the high position, and the two are ORed to fill the whole field.
     2055template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2056{
     2057        bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);
     2058        bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));
     2059        bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);
     2060        return simd_or(loMask, hiMask);
     2061}
     2062
     2063//The total number of operations is 17.0
         //Equality on 4-bit fields via two 8-bit comparisons. The high nibbles are compared by
         //masking both operands with simd256<8>::himask() before simd256<8>::eq and keeping only
         //the high nibble of the result; the low nibbles are handled the same way with
         //simd256<8>::lomask(). The two partial results are ORed together.
     2064template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2065{
     2066        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
     2067}
     2068
     2069//The total number of operations is 5.0
         //Equality on 8-bit fields: _mm_cmpeq_epi8 is applied to the matching 128-bit halves of
         //arg1 and arg2 (avx_select_hi128 / avx_select_lo128) and avx_general_combine256 rejoins
         //the two results (the hi-half result is passed first).
     2070template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2071{
     2072        return avx_general_combine256(_mm_cmpeq_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2073}
     2074
     2075//The total number of operations is 5.0
         //Equality on 16-bit fields: _mm_cmpeq_epi16 is applied to the matching 128-bit halves of
         //arg1 and arg2 (avx_select_hi128 / avx_select_lo128) and avx_general_combine256 rejoins
         //the two results (the hi-half result is passed first).
     2076template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2077{
     2078        return avx_general_combine256(_mm_cmpeq_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2079}
     2080
     2081//The total number of operations is 5.0
         //Equality on 32-bit fields: _mm_cmpeq_epi32 is applied to the matching 128-bit halves of
         //arg1 and arg2 (avx_select_hi128 / avx_select_lo128) and avx_general_combine256 rejoins
         //the two results (the hi-half result is passed first).
     2082template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2083{
     2084        return avx_general_combine256(_mm_cmpeq_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2085}
     2086
     2087//The total number of operations is 5.0
         //Equality on 64-bit fields: _mm_cmpeq_epi64 is applied to the matching 128-bit halves of
         //arg1 and arg2 (avx_select_hi128 / avx_select_lo128) and avx_general_combine256 rejoins
         //the two results (the hi-half result is passed first).
     2088template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2089{
     2090        return avx_general_combine256(_mm_cmpeq_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2091}
     2092
     2093//The total number of operations is 23.6666666667
         //Equality on 128-bit fields, built from the 64-bit version.
         //tmpAns marks equal 64-bit halves. loMask ANDs tmpAns with itself shifted right by 64
         //within each field, leaving the low half set only when both halves matched; hiMask is that
         //low half moved to the high half, and the two are ORed to fill the whole field.
     2094template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2095{
     2096        bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
     2097        bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
     2098        bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
     2099        return simd_or(loMask, hiMask);
     2100}
     2101
     2102//The total number of operations is 54.1666666667
         //Equality on the whole 256-bit block, built from the 128-bit version.
         //tmpAns marks equal 128-bit halves. loMask ANDs tmpAns with itself shifted right by 128,
         //leaving the low half set only when both halves matched; hiMask is that low half moved to
         //the high half, and the two are ORed to fill the whole block.
     2103template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2104{
     2105        bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
     2106        bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
     2107        bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
     2108        return simd_or(loMask, hiMask);
     2109}
     2110
    17392111//The total number of operations is 0
    1740 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
    1741 {
    1742         return simd256<2>::constant<(1)>();
         //himask: mask selecting the high half of every 2-bit field, the value 2 (binary 10).
     2112template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask()
     2113{
     2114        return simd256<2>::constant<(2)>();
    17432115}
    17442116
    17452117//The total number of operations is 0
    1746 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
    1747 {
    1748         return simd256<4>::constant<(3)>();
         //himask: mask selecting the high half of every 4-bit field, the value 12 (binary 1100).
     2118template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask()
     2119{
     2120        return simd256<4>::constant<(12)>();
    17492121}
    17502122
    17512123//The total number of operations is 0
    1752 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
    1753 {
    1754         return simd256<8>::constant<(15)>();
         //himask: mask selecting the high half of every 8-bit field, the value 240 (0xF0).
     2124template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask()
     2125{
     2126        return simd256<8>::constant<(240)>();
    17552127}
    17562128
    17572129//The total number of operations is 0
    1758 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
    1759 {
    1760         return simd256<16>::constant<(255)>();
         //himask: mask selecting the high half of every 16-bit field, the value 65280 (0xFF00).
     2130template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask()
     2131{
     2132        return simd256<16>::constant<(65280)>();
    17612133}
    17622134
    17632135//The total number of operations is 0
    1764 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
    1765 {
    1766         return simd256<32>::constant<(65535)>();
         //himask: mask selecting the high half of every 32-bit field. -65536 is 0xFFFF0000 as a
         //32-bit two's-complement value.
     2136template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask()
     2137{
     2138        return simd256<32>::constant<-65536>();
    17672139}
    17682140
    17692141//The total number of operations is 0
    1770 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
    1771 {
    1772         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1))));
         //himask: mask selecting the high half of every 64-bit field. _mm256_set_epi32 lists its
         //32-bit elements from most to least significant, so the alternating -1, 0 arguments set
         //the upper 32 bits of each 64-bit field and clear the lower 32.
     2142template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask()
     2143{
     2144        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0))));
    17732145}
    17742146
    17752147//The total number of operations is 0
    1776 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
    1777 {
    1778         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1))));
         //himask: mask selecting the high half of every 128-bit field. _mm256_set_epi32 lists its
         //32-bit elements from most to least significant, so the -1, -1, 0, 0 pattern sets the
         //upper 64 bits of each 128-bit field and clears the lower 64.
     2148template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask()
     2149{
     2150        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0))));
    17792151}
    17802152
    17812153//The total number of operations is 0
    1782 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
    1783 {
    1784         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1))));
         //himask: mask selecting the high half of the 256-bit block. _mm256_set_epi32 lists its
         //32-bit elements from most to least significant, so four -1 arguments followed by four 0
         //arguments set the upper 128 bits and clear the lower 128.
     2154template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask()
     2155{
     2156        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));
    17852157}
    17862158
     
    18502222}
    18512223
    1852 //The total number of operations is 1.0
    1853 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1854 {
    1855         return simd_and(arg1, arg2);
    1856 }
    1857 
    1858 //The total number of operations is 24.0
    1859 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1860 {
    1861         bitblock256_t tmpAns = simd256<(1)>::umin(arg1, arg2);
    1862         bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
    1863         bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
    1864         return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1865 }
    1866 
    1867 //The total number of operations is 14.0
    1868 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1869 {
    1870         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umin(arg1, arg2)), simd256<(8)>::umin(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
    1871 }
    1872 
    1873 //The total number of operations is 5.0
    1874 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1875 {
    1876         return avx_general_combine256(_mm_min_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1877 }
    1878 
    1879 //The total number of operations is 5.0
    1880 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1881 {
    1882         return avx_general_combine256(_mm_min_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1883 }
    1884 
    1885 //The total number of operations is 5.0
    1886 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1887 {
    1888         return avx_general_combine256(_mm_min_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1889 }
    1890 
    1891 //The total number of operations is 11.0
    1892 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1893 {
    1894         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
    1895         return simd_xor(simd256<64>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1896 }
    1897 
    1898 //The total number of operations is 46.6666666667
    1899 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1900 {
    1901         bitblock256_t tmpAns = simd256<(64)>::umin(arg1, arg2);
    1902         bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
    1903         bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
    1904         return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1905 }
    1906 
    1907 //The total number of operations is 132.0
    // Same construction as the 128-bit umin, one level up: starts from the 128-bit
    // umin and uses 128-bit equality masks shifted right by 128 within the 256-bit
    // field to pick the low half.
    // NOTE(review): assumes ifh(m, a, b) picks a where m is set - confirm.
    1908 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2)
    1909 {
    1910         bitblock256_t tmpAns = simd256<(128)>::umin(arg1, arg2);
    1911         bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
    1912         bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
    1913         return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1914 }
    1915 
    19162224//The total number of operations is 19.0
    19172225template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)
     
    19642272        bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1);
    19652273        return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1));
    1966 }
    1967 
    1968 //The total number of operations is 2.0
    // Bitwise equality: NOT (arg1 XOR arg2).
    1969 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1970 {
    1971         return simd_not(simd_xor(arg1, arg2));
    1972 }
    1973 
    1974 //The total number of operations is 14.0
    // tmpAns is the bit-by-bit equality. loMask ANDs each bit with the bit above it
    // (2-bit logical shift right by 1), leaving the low bit set only when both bits
    // matched; hiMask copies that low bit into the high bit so the field is uniform.
    1975 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1976 {
    1977         bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);
    1978         bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));
    1979         bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);
    1980         return simd_or(loMask, hiMask);
    1981 }
    1982 
    1983 //The total number of operations is 17.0
    // Runs the 8-bit eq twice: once with only the high nibbles kept (8-bit himask) and
    // once with only the low nibbles kept (8-bit lomask). Each result is masked back
    // to its own nibble and the two are ORed together.
    1984 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1985 {
    1986         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
    1987 }
    1988 
    1989 //The total number of operations is 5.0
    // Applies _mm_cmpeq_epi8 to the high and to the low 128-bit halves of the
    // arguments and recombines the two results into a 256-bit block.
    1990 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1991 {
    1992         return avx_general_combine256(_mm_cmpeq_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1993 }
    1994 
    1995 //The total number of operations is 5.0
    // Applies _mm_cmpeq_epi16 to the high and to the low 128-bit halves of the
    // arguments and recombines the two results into a 256-bit block.
    1996 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
    1997 {
    1998         return avx_general_combine256(_mm_cmpeq_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1999 }
    2000 
    2001 //The total number of operations is 5.0
    // Applies _mm_cmpeq_epi32 to the high and to the low 128-bit halves of the
    // arguments and recombines the two results into a 256-bit block.
    2002 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2003 {
    2004         return avx_general_combine256(_mm_cmpeq_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2005 }
    2006 
    2007 //The total number of operations is 5.0
    // Applies _mm_cmpeq_epi64 to the high and to the low 128-bit halves of the
    // arguments and recombines the two results into a 256-bit block.
    2008 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2009 {
    2010         return avx_general_combine256(_mm_cmpeq_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2011 }
    2012 
    2013 //The total number of operations is 23.6666666667
    // Starts from the 64-bit equality. loMask ANDs each 64-bit half with the half
    // above it (128-bit logical shift right by 64), so the low half stays set only
    // when both halves matched; hiMask copies that result into the high half.
    2014 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2015 {
    2016         bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
    2017         bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
    2018         bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
    2019         return simd_or(loMask, hiMask);
    2020 }
    2021 
    2022 //The total number of operations is 54.1666666667
    // Same construction as the 128-bit eq, one level up: combines the two 128-bit
    // equality results by shifting within the 256-bit field by 128 and ANDing, then
    // replicates the low-half result into the high half.
    2023 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2024 {
    2025         bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
    2026         bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
    2027         bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
    2028         return simd_or(loMask, hiMask);
    2029 }
    2030 
    2031 //The total number of operations is 7.0
    // Arithmetic shift right on 2-bit fields. A shift of 0 returns arg1 unchanged;
    // every other shift amount is handled as a shift by 1: the high bit is kept in
    // place (AND with himask) and ORed with the 2-bit logical shift right by 1.
    2032 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
    2033 {
    2034         return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
    2035 }
    2036 
    2037 //The total number of operations is 17.5
    // Composed from the 2-bit srai. The high 2 bits come from simd256<2>::srai with
    // the shift capped at 2. The other operand of the OR is simd256<4>::srli<sh> when
    // sh <= 2, otherwise a 2-bit srai by (sh-2) of the value logically shifted right
    // by 2 in 4-bit fields.
    2038 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
    2039 {
    2040         return simd_or(simd_and(simd256<4>::himask(), simd256<(2)>::srai<((sh < (2)) ? sh : (2))>(arg1)), ((sh <= (2)) ? simd256<4>::srli<sh>(arg1) : simd256<(2)>::srai<(sh-(2))>(simd256<4>::srli<(2)>(arg1))));
    2041 }
    2042 
    2043 //The total number of operations is 12.0
    // The shift amount is clamped to at most 7 (the sh < 0 test can never be true for
    // an unsigned sh). tmp is the logical shift; the sign is then extended by
    // isolating the bit where the original sign bit landed (bit 8-s-1) and
    // subtracting it from zero, which sets that bit and every bit above it in the
    // 8-bit field when the sign was 1. The result is ORed into tmp.
    2044 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
    2045 {
    2046         bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    2047         return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    2048 }
    2049 
    2050 //The total number of operations is 4.0
    // Applies _mm_srai_epi16 with shift count sh to the high and to the low 128-bit
    // halves of arg1 and recombines the results.
    2051 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
    2052 {
    2053         return avx_general_combine256(_mm_srai_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
    2054 }
    2055 
    2056 //The total number of operations is 4.0
    // Applies _mm_srai_epi32 with shift count sh to the high and to the low 128-bit
    // halves of arg1 and recombines the results.
    2057 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
    2058 {
    2059         return avx_general_combine256(_mm_srai_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
    2060 }
    2061 
    2062 //The total number of operations is 12.0
    // Composed from the 32-bit srai. The high 32 bits of each field come from
    // simd256<32>::srai with the shift capped at 32. The other operand of the OR is
    // simd256<64>::srli<sh> when sh <= 32, otherwise a 32-bit srai by (sh-32) of the
    // value logically shifted right by 32 in 64-bit fields.
    2063 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
    2064 {
    2065         return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));
    2066 }
    2067 
    2068 //The total number of operations is 28.3333333333
    // Composed from the 64-bit srai. The high 64 bits of each field come from
    // simd256<64>::srai with the shift capped at 64. The other operand of the OR is
    // simd256<128>::srli<sh> when sh <= 64, otherwise a 64-bit srai by (sh-64) of the
    // value logically shifted right by 64 in 128-bit fields.
    2069 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
    2070 {
    2071         return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));
    2072 }
    2073 
    2074 //The total number of operations is 59.0
    // Composed from the 128-bit srai. The high 128 bits come from simd256<128>::srai
    // with the shift capped at 128. The other operand of the OR is
    // simd256<256>::srli<sh> when sh <= 128, otherwise a 128-bit srai by (sh-128) of
    // the value logically shifted right by 128.
    2075 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
    2076 {
    2077         return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));
    2078 }
    2079 
    2080 //The total number of operations is 0
    // Every 2-bit field holds the constant 2 (binary 10): only the high bit is set.
    2081 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask()
    2082 {
    2083         return simd256<2>::constant<(2)>();
    2084 }
    2085 
    2086 //The total number of operations is 0
    // Every 4-bit field holds the constant 12 (binary 1100): the high 2 bits are set.
    2087 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask()
    2088 {
    2089         return simd256<4>::constant<(12)>();
    2090 }
    2091 
    2092 //The total number of operations is 0
    // Every 8-bit field holds the constant 240 (0xF0): the high nibble is set.
    2093 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask()
    2094 {
    2095         return simd256<8>::constant<(240)>();
    2096 }
    2097 
    2098 //The total number of operations is 0
    // Every 16-bit field holds the constant 65280 (0xFF00): the high byte is set.
    2099 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask()
    2100 {
    2101         return simd256<16>::constant<(65280)>();
    2102 }
    2103 
    2104 //The total number of operations is 0
    // Every 32-bit field holds the constant -65536, i.e. the two's-complement bit
    // pattern 0xFFFF0000: the high 16 bits are set.
    2105 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask()
    2106 {
    2107         return simd256<32>::constant<-65536>();
    2108 }
    2109 
    2110 //The total number of operations is 0
    // Built directly with _mm256_set_epi32, whose arguments run from the most
    // significant 32-bit element to the least: the alternating -1, 0 pattern sets the
    // upper 32 bits of every 64-bit field.
    2111 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask()
    2112 {
    2113         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0))));
    2114 }
    2115 
    2116 //The total number of operations is 0
    // Built directly with _mm256_set_epi32 (arguments listed most significant first):
    // the -1, -1, 0, 0 pattern sets the upper 64 bits of each 128-bit field.
    2117 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask()
    2118 {
    2119         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0))));
    2120 }
    2121 
    2122 //The total number of operations is 0
    // Built directly with _mm256_set_epi32 (arguments listed most significant first):
    // the four leading -1 values set the upper 128 bits of the block, the four
    // trailing zeros clear the lower 128 bits.
    2123 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask()
    2124 {
    2125         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));
    2126 }
    2127 
    2128 //The total number of operations is 1.0
    // Returns simd_andc(arg1, arg2).
    // NOTE(review): presumably simd_andc is arg1 AND NOT arg2, i.e. the result bit is
    // set where arg1 is 1 and arg2 is 0 - confirm against simd_andc.
    2129 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2130 {
    2131         return simd_andc(arg1, arg2);
    2132 }
    2133 
    2134 //The total number of operations is 24.0
    // hiAns is the 1-bit lt and loAns the 1-bit ult of the operands. mask keeps loAns
    // in the low bit only where the 1-bit equality of the high bits (moved down by a
    // 2-bit logical shift right by 1) is set, and is then copied into both bits of the
    // field. The result ORs that with hiAns arithmetically shifted right by 1.
    2135 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2136 {
    2137         bitblock256_t hiAns = simd256<(1)>::lt(arg1, arg2);
    2138         bitblock256_t loAns = simd256<(1)>::ult(arg1, arg2);
    2139         bitblock256_t mask = simd_and(loAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
    2140         mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
    2141         return simd_or(simd256<2>::srai<(1)>(hiAns), mask);
    2142 }
    2143 
    2144 //The total number of operations is 38.0
    // XORs both operands with 8 (the top bit of each 4-bit field) and compares the
    // biased values with simd256<4>::ult.
    2145 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2146 {
    2147         bitblock256_t high_bit = simd256<4>::constant<(8)>();
    2148         return simd256<4>::ult(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
    2149 }
    2150 
    2151 //The total number of operations is 13.0
    // Derived from the other comparisons: (NOT gt) AND (NOT eq) on 8-bit fields.
    2152 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2153 {
    2154         return simd_and(simd_not(simd256<8>::gt(arg1, arg2)), simd_not(simd256<8>::eq(arg1, arg2)));
    2155 }
    2156 
    2157 //The total number of operations is 13.0
    // Derived from the other comparisons: (NOT gt) AND (NOT eq) on 16-bit fields.
    2158 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2159 {
    2160         return simd_and(simd_not(simd256<16>::gt(arg1, arg2)), simd_not(simd256<16>::eq(arg1, arg2)));
    2161 }
    2162 
    2163 //The total number of operations is 13.0
    // Derived from the other comparisons: (NOT gt) AND (NOT eq) on 32-bit fields.
    2164 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2165 {
    2166         return simd_and(simd_not(simd256<32>::gt(arg1, arg2)), simd_not(simd256<32>::eq(arg1, arg2)));
    2167 }
    2168 
    2169 //The total number of operations is 13.0
    // Derived from the other comparisons: (NOT gt) AND (NOT eq) on 64-bit fields.
    2170 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2171 {
    2172         return simd_and(simd_not(simd256<64>::gt(arg1, arg2)), simd_not(simd256<64>::eq(arg1, arg2)));
    2173 }
    2174 
    2175 //The total number of operations is 81.0
    // Same construction as the 2-bit lt with 64-bit halves: hiAns is the 64-bit lt,
    // loAns the 64-bit ult. mask keeps loAns in the low half only where the high
    // halves compared equal, and is then copied to both halves. The result ORs that
    // with hiAns arithmetically shifted right by 64 in 128-bit fields.
    2176 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2177 {
    2178         bitblock256_t hiAns = simd256<(64)>::lt(arg1, arg2);
    2179         bitblock256_t loAns = simd256<(64)>::ult(arg1, arg2);
    2180         bitblock256_t mask = simd_and(loAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
    2181         mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
    2182         return simd_or(simd256<128>::srai<(64)>(hiAns), mask);
    2183 }
    2184 
    2185 //The total number of operations is 263.166666667
    // Same construction as the 128-bit lt, one level up: combines the 128-bit lt of
    // the operands (hiAns) with the 128-bit ult (loAns), the latter gated by equality
    // of the high halves, using shifts by 128 within the 256-bit field.
    2186 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2)
    2187 {
    2188         bitblock256_t hiAns = simd256<(128)>::lt(arg1, arg2);
    2189         bitblock256_t loAns = simd256<(128)>::ult(arg1, arg2);
    2190         bitblock256_t mask = simd_and(loAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
    2191         mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
    2192         return simd_or(simd256<256>::srai<(128)>(hiAns), mask);
    2193 }
    2194 
    2195 //The total number of operations is 1.0
    // For single bits the unsigned maximum is the bitwise OR of the operands.
    2196 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2197 {
    2198         return simd_or(arg1, arg2);
    2199 }
    2200 
    2201 //The total number of operations is 24.0
    // Built on the 1-bit umax. tmpAns holds the per-bit maxima; eqMask1/eqMask2 are
    // the 1-bit equality of tmpAns with arg1/arg2, shifted right by 1 inside each
    // 2-bit field so the high-bit result lands on the low bit.
    // NOTE(review): assuming ifh(m, a, b) picks a where m is set, the high bit comes
    // from tmpAns and the low bit is chosen among tmpAns, arg1 and arg2 - confirm.
    2202 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2203 {
    2204         bitblock256_t tmpAns = simd256<(1)>::umax(arg1, arg2);
    2205         bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
    2206         bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
    2207         return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    2208 }
    2209 
    2210 //The total number of operations is 14.0
    // Uses the 8-bit umax twice. The high nibble of each byte is taken from the umax
    // of the full bytes (masked with the 8-bit himask); the low nibble comes from the
    // umax of the operands with their high nibbles cleared (8-bit lomask).
    2211 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2212 {
    2213         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
    2214 }
    2215 
    2216 //The total number of operations is 5.0
    // Applies _mm_max_epu8 (unsigned 8-bit maximum) to the high and to the low
    // 128-bit halves of the arguments and recombines the results.
    2217 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2218 {
    2219         return avx_general_combine256(_mm_max_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2220 }
    2221 
    2222 //The total number of operations is 5.0
    // Applies _mm_max_epu16 (unsigned 16-bit maximum) to the high and to the low
    // 128-bit halves of the arguments and recombines the results.
    2223 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2224 {
    2225         return avx_general_combine256(_mm_max_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2226 }
    2227 
    2228 //The total number of operations is 5.0
    // Applies _mm_max_epu32 (unsigned 32-bit maximum) to the high and to the low
    // 128-bit halves of the arguments and recombines the results.
    2229 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2230 {
    2231         return avx_general_combine256(_mm_max_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2232 }
    2233 
    2234 //The total number of operations is 11.0
    // XORs both operands with 2^63 in every 64-bit field (flipping the top bit), takes
    // simd256<64>::max of the biased values, and flips the top bit of the result back.
    // NOTE(review): presumably simd256<64>::max is the signed maximum, so the bias
    // turns it into an unsigned comparison - confirm against its definition.
    2235 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2236 {
    2237         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
    2238         return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    2239 }
    2240 
    2241 //The total number of operations is 46.6666666667
    // Built on the 64-bit umax. tmpAns holds the per-64-bit maxima; eqMask1/eqMask2
    // are the 64-bit equality of tmpAns with arg1/arg2, shifted right by 64 inside
    // each 128-bit field so the high-half result lands on the low half.
    // NOTE(review): assuming ifh(m, a, b) picks a where m is set, the high half comes
    // from tmpAns and the low half is chosen among tmpAns, arg1 and arg2 - confirm.
    2242 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2243 {
    2244         bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
    2245         bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
    2246         bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
    2247         return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    2248 }
    2249 
    2250 //The total number of operations is 132.0
    // Same construction as the 128-bit umax, one level up: starts from the 128-bit
    // umax and uses 128-bit equality masks shifted right by 128 within the 256-bit
    // field to pick the low half.
    // NOTE(review): assumes ifh(m, a, b) picks a where m is set - confirm.
    2251 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2252 {
    2253         bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
    2254         bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
    2255         bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
    2256         return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    22572274}
    22582275
     
    30143031}
    30153032
     3033//The total number of operations is 15.0
     // Packs sixteen 1-bit values by pairing them into 2-bit fields: the odd-numbered
     // values (val1, val3, ...) are shifted left by 1, the even-numbered values are
     // masked to 1 bit, and each list of eight is passed twice to fill the sixteen
     // parameters of the 2-bit fill16. The two fills are ORed together.
     3034template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16)
     3035{
     3036        return simd_or(mvmd256<(2)>::fill16((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd256<(2)>::fill16((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
     3037}
     3038
     3039//The total number of operations is 7.0
     // Packs sixteen 2-bit values by pairing them into 4-bit fields: the odd-numbered
     // values are shifted left by 2, the even-numbered values are masked to 2 bits
     // (& 3), and each list of eight is passed twice to the 4-bit fill16. The two
     // fills are ORed together.
     3040template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16)
     3041{
     3042        return simd_or(mvmd256<(4)>::fill16((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd256<(4)>::fill16((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
     3043}
     3044
     3045//The total number of operations is 3.0
     // Packs sixteen 4-bit values by pairing them into bytes: the odd-numbered values
     // are shifted left by 4, the even-numbered values are masked to 4 bits (& 15),
     // and each list of eight is passed twice to the 8-bit fill16. The two fills are
     // ORed together.
     3046template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16)
     3047{
     3048        return simd_or(mvmd256<(8)>::fill16((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd256<(8)>::fill16((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
     3049}
     3050
     3051//The total number of operations is 1.0
     // Builds the block with a single _mm256_set_epi8 call: the sixteen byte values
     // are passed twice (val1..val16, then val1..val16 again) to supply all 32 bytes.
     3052template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16)
     3053{
     3054        return (bitblock256_t)_mm256_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
     3055}
     3056
     3057//The total number of operations is 5.0
     // Combines two 16-bit fill8 results: one built from val1..val8 and one from
     // val9..val16, merged through ifh with the 256-bit himask.
     // NOTE(review): assuming ifh(m, a, b) picks a where m is set, val1..val8 end up
     // in the upper 128 bits and val9..val16 in the lower 128 bits - confirm.
     3058template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16)
     3059{
     3060        return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<16>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd256<16>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
     3061}
     3062
    30163063//The total number of operations is 1.0
    30173064template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(FieldType<1>::T val1)
     
    30693116
    30703117//The total number of operations is 1.5
    3071 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1)
     3118template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1)
    30723119{
    30733120        return (((pos%2) == 0) ? (mvmd256<(2)>::extract<(pos/2)>(arg1)&(1)) : (mvmd256<(2)>::extract<(pos/2)>(arg1)>>1));
     
    30753122
    30763123//The total number of operations is 1.5
    3077 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1)
     3124template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1)
    30783125{
    30793126        return (((pos%2) == 0) ? (mvmd256<(4)>::extract<(pos/2)>(arg1)&(3)) : (mvmd256<(4)>::extract<(pos/2)>(arg1)>>2));
     
    30813128
    30823129//The total number of operations is 1.5
    3083 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1)
     3130template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1)
    30843131{
    30853132        return (((pos%2) == 0) ? (mvmd256<(8)>::extract<(pos/2)>(arg1)&(15)) : (mvmd256<(8)>::extract<(pos/2)>(arg1)>>4));
     
    30873134
    30883135//The total number of operations is 1.5
    3089 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1)
     3136template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1)
    30903137{
    30913138        return (((pos%2) == 0) ? (mvmd256<(16)>::extract<(pos/2)>(arg1)&(255)) : (mvmd256<(16)>::extract<(pos/2)>(arg1)>>8));
     
    30933140
    30943141//The total number of operations is 1.5
    3095 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1)
     3142template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1)
    30963143{
    30973144        return ((pos < 8) ? (65535&_mm_extract_epi16(avx_select_lo128(arg1), (int32_t)(pos))) : (65535&_mm_extract_epi16(avx_select_hi128(arg1), (int32_t)((pos-8)))));
     
    30993146
    31003147//The total number of operations is 1.5
    3101 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1)
     3148template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1)
    31023149{
    31033150        return ((pos < 4) ? (((uint64_t)(((4294967296ULL)-1)))&_mm_extract_epi32(avx_select_lo128(arg1), (int32_t)(pos))) : (((uint64_t)(((4294967296ULL)-1)))&_mm_extract_epi32(avx_select_hi128(arg1), (int32_t)((pos-4)))));
     
    31053152
    31063153//The total number of operations is 3.0
    3107 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1)
     3154template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1)
    31083155{
    31093156        return ((((uint64_t)(mvmd256<(32)>::extract<((2*pos)+1)>(arg1)))<<(32))|mvmd256<(32)>::extract<(2*pos)>(arg1));
     
    31643211}
    31653212
    3166 //The total number of operations is 15.0
    // Packs sixteen 1-bit values by pairing them into 2-bit fields: the odd-numbered
    // values (val1, val3, ...) are shifted left by 1, the even-numbered values are
    // masked to 1 bit, and each list of eight is passed twice to fill the sixteen
    // parameters of the 2-bit fill16. The two fills are ORed together.
    3167 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16)
    3168 {
    3169         return simd_or(mvmd256<(2)>::fill16((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd256<(2)>::fill16((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
    3170 }
    3171 
    3172 //The total number of operations is 7.0
    // Packs sixteen 2-bit values by pairing them into 4-bit fields: the odd-numbered
    // values are shifted left by 2, the even-numbered values are masked to 2 bits
    // (& 3), and each list of eight is passed twice to the 4-bit fill16. The two
    // fills are ORed together.
    3173 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16)
    3174 {
    3175         return simd_or(mvmd256<(4)>::fill16((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd256<(4)>::fill16((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
    3176 }
    3177 
    3178 //The total number of operations is 3.0
    // Packs sixteen 4-bit values by pairing them into bytes: the odd-numbered values
    // are shifted left by 4, the even-numbered values are masked to 4 bits (& 15),
    // and each list of eight is passed twice to the 8-bit fill16. The two fills are
    // ORed together.
    3179 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16)
    3180 {
    3181         return simd_or(mvmd256<(8)>::fill16((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd256<(8)>::fill16((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
    3182 }
    3183 
    3184 //The total number of operations is 1.0
    // Builds the block with a single _mm256_set_epi8 call: the sixteen byte values
    // are passed twice (val1..val16, then val1..val16 again) to supply all 32 bytes.
    3185 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16)
    3186 {
    3187         return (bitblock256_t)_mm256_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
    3188 }
    3189 
    3190 //The total number of operations is 5.0
    // Combines two 16-bit fill8 results: one built from val1..val8 and one from
    // val9..val16, merged through ifh with the 256-bit himask.
    // NOTE(review): assuming ifh(m, a, b) picks a where m is set, val1..val8 end up
    // in the upper 128 bits and val9..val16 in the lower 128 bits - confirm.
    3191 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8, FieldType<16>::T val9, FieldType<16>::T val10, FieldType<16>::T val11, FieldType<16>::T val12, FieldType<16>::T val13, FieldType<16>::T val14, FieldType<16>::T val15, FieldType<16>::T val16)
    3192 {
    3193         return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<16>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd256<16>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
    3194 }
    3195 
    31963213//The total number of operations is 5.0
    31973214template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4)
     
    34763493}
    34773494
    3478 //The total number of operations is 1.0
    // Stores the 256-bit block arg1 to the address arg2 with _mm256_store_ps, viewing
    // the destination as float*. Per the intrinsic's documentation the address must
    // be 32-byte aligned.
    3479 IDISA_ALWAYS_INLINE void bitblock256::store_aligned(bitblock256_t arg1, bitblock256_t* arg2)
    3480 {
    3481         _mm256_store_ps((float*)(arg2), arg1);
    3482 }
    3483 
    34843495//The total number of operations is 118.5
    34853496IDISA_ALWAYS_INLINE uint16_t bitblock256::popcount(bitblock256_t arg1)
     
    34943505}
    34953506
     3507//The total number of operations is 1.0
     // _mm256_testz_si256(x, x) returns 1 only when x AND x is all zero, so comparing
     // the result against 0 yields true when at least one bit of arg1 is set.
     3508IDISA_ALWAYS_INLINE bool bitblock256::any(bitblock256_t arg1)
     3509{
     3510        return _mm256_testz_si256(((__m256i)(arg1)), ((__m256i)(arg1))) == 0;
     3511}
     3512
     3513//The total number of operations is 1.0
     // Loads a 256-bit block from arg1 with _mm256_load_ps, viewing the source as
     // float*. Per the intrinsic's documentation the address must be 32-byte aligned.
     // NOTE(review): the C-style cast to float* also drops the const qualifier of arg1.
     3514IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_aligned(const bitblock256_t* arg1)
     3515{
     3516        return _mm256_load_ps((float*)(arg1));
     3517}
     3518
     3519//The total number of operations is 1.0
     // Stores the 256-bit block arg1 to the address arg2 with _mm256_storeu_ps (the
     // unaligned store), viewing the destination as float*.
     3520IDISA_ALWAYS_INLINE void bitblock256::store_unaligned(bitblock256_t arg1, bitblock256_t* arg2)
     3521{
     3522        _mm256_storeu_ps((float*)(arg2), arg1);
     3523}
     3524
    34963525//The total number of operations is 14.0
    34973526template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::slli(bitblock256_t arg1)
     
    35013530
    35023531//The total number of operations is 1.0
    3503 IDISA_ALWAYS_INLINE bool bitblock256::any(bitblock256_t arg1)
    // _mm256_testz_si256(x, x) returns 1 only when x AND x is all zero, so comparing
    // the result against 0 yields true when at least one bit of arg1 is set.
    3504 {
    3505         return _mm256_testz_si256(((__m256i)(arg1)), ((__m256i)(arg1))) == 0;
    3506 }
    3507 
    3508 //The total number of operations is 1.0
    // Loads a 256-bit block from the memory at arg1 with an aligned load.
    // _mm256_load_ps requires arg1 to be 32-byte aligned.
    // The C-style cast to float* also drops the const qualifier of arg1.
    3509 IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_aligned(const bitblock256_t* arg1)
    3510 {
    3511         return _mm256_load_ps((float*)(arg1));
    3512 }
    3513 
    3514 //The total number of operations is 1.0
    3515 IDISA_ALWAYS_INLINE void bitblock256::store_unaligned(bitblock256_t arg1, bitblock256_t* arg2)
    3516 {
    3517         _mm256_storeu_ps((float*)(arg2), arg1);
     3532IDISA_ALWAYS_INLINE void bitblock256::store_aligned(bitblock256_t arg1, bitblock256_t* arg2)
     3533{
     3534        _mm256_store_ps((float*)(arg2), arg1);
    35183535}
    35193536
  • trunk/lib/idisa_cpp/idisa_avx2.cpp

    r3462 r3525  
    103103        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dsrli(bitblock256_t arg1, bitblock256_t arg2);
    104104        static IDISA_ALWAYS_INLINE bitblock256_t fill(typename FieldType<fw>::T val1);
    105         template <uint8_t pos> static IDISA_ALWAYS_INLINE typename FieldType<fw>::T extract(bitblock256_t arg1);
     105        template <uint16_t pos> static IDISA_ALWAYS_INLINE typename FieldType<fw>::T extract(bitblock256_t arg1);
    106106        template <uint16_t pos> static IDISA_ALWAYS_INLINE bitblock256_t splat(bitblock256_t arg1);
    107107        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
     
    112112        static IDISA_ALWAYS_INLINE bitblock256_t fill8(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4, typename FieldType<fw>::T val5, typename FieldType<fw>::T val6, typename FieldType<fw>::T val7, typename FieldType<fw>::T val8);
    113113        static IDISA_ALWAYS_INLINE bitblock256_t fill16(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4, typename FieldType<fw>::T val5, typename FieldType<fw>::T val6, typename FieldType<fw>::T val7, typename FieldType<fw>::T val8, typename FieldType<fw>::T val9, typename FieldType<fw>::T val10, typename FieldType<fw>::T val11, typename FieldType<fw>::T val12, typename FieldType<fw>::T val13, typename FieldType<fw>::T val14, typename FieldType<fw>::T val15, typename FieldType<fw>::T val16);
     114        template <uint16_t pos> static IDISA_ALWAYS_INLINE bitblock256_t insert(bitblock256_t arg1, typename FieldType<fw>::T arg2);
    114115};
    115116
     
    126127        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
    127128        static IDISA_ALWAYS_INLINE bitblock256_t load_aligned(const bitblock256_t* arg1);
     129        static IDISA_ALWAYS_INLINE void store_aligned(bitblock256_t arg1, bitblock256_t* arg2);
    128130        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock256_t arg1, bitblock256_t* arg2);
    129         static IDISA_ALWAYS_INLINE void store_aligned(bitblock256_t arg1, bitblock256_t* arg2);
    130131};
    131132
     
    519520template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::fill(FieldType<128>::T val1);
    520521template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::fill(FieldType<256>::T val1);
    521 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1);
    522 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1);
    523 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1);
    524 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1);
    525 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1);
    526 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1);
    527 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1);
     522template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1);
     523template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1);
     524template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1);
     525template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1);
     526template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1);
     527template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1);
     528template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1);
    528529template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1);
    529530template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1);
     
    535536template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1);
    536537template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1);
     538template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::insert(bitblock256_t arg1, FieldType<2>::T arg2);
     539template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::insert(bitblock256_t arg1, FieldType<4>::T arg2);
     540template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::insert(bitblock256_t arg1, FieldType<8>::T arg2);
     541template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::insert(bitblock256_t arg1, FieldType<16>::T arg2);
     542template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::insert(bitblock256_t arg1, FieldType<32>::T arg2);
     543template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::insert(bitblock256_t arg1, FieldType<64>::T arg2);
    537544template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4);
    538545template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4);
     
    31933200
    31943201//The total number of operations is 1.5
    3195 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1)
     3202template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd256<1>::extract(bitblock256_t arg1)
    31963203{
    31973204        return (((pos%2) == 0) ? (mvmd256<(2)>::extract<(pos/2)>(arg1)&(1)) : (mvmd256<(2)>::extract<(pos/2)>(arg1)>>1));
     
    31993206
    32003207//The total number of operations is 1.5
    3201 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1)
     3208template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd256<2>::extract(bitblock256_t arg1)
    32023209{
    32033210        return (((pos%2) == 0) ? (mvmd256<(4)>::extract<(pos/2)>(arg1)&(3)) : (mvmd256<(4)>::extract<(pos/2)>(arg1)>>2));
     
    32053212
    32063213//The total number of operations is 1.5
    3207 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1)
     3214template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd256<4>::extract(bitblock256_t arg1)
    32083215{
    32093216        return (((pos%2) == 0) ? (mvmd256<(8)>::extract<(pos/2)>(arg1)&(15)) : (mvmd256<(8)>::extract<(pos/2)>(arg1)>>4));
     
    32113218
    32123219//The total number of operations is 1.5
    3213 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1)
     3220template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd256<8>::extract(bitblock256_t arg1)
    32143221{
    32153222        return (((pos%2) == 0) ? (mvmd256<(16)>::extract<(pos/2)>(arg1)&(255)) : (mvmd256<(16)>::extract<(pos/2)>(arg1)>>8));
     
    32173224
    32183225//The total number of operations is 1.5
    3219 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1)
     3226template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd256<16>::extract(bitblock256_t arg1)
    32203227{
    32213228        return ((pos < 8) ? (65535&_mm_extract_epi16(avx_select_lo128(arg1), (int32_t)(pos))) : (65535&_mm_extract_epi16(avx_select_hi128(arg1), (int32_t)((pos-8)))));
     
    32233230
    32243231//The total number of operations is 1.5
    3225 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1)
     3232template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd256<32>::extract(bitblock256_t arg1)
    32263233{
    32273234        return ((pos < 4) ? (((uint64_t)(((4294967296ULL)-1)))&_mm_extract_epi32(avx_select_lo128(arg1), (int32_t)(pos))) : (((uint64_t)(((4294967296ULL)-1)))&_mm_extract_epi32(avx_select_hi128(arg1), (int32_t)((pos-4)))));
     
    32293236
    32303237//The total number of operations is 3.0
    3231 template <> template <uint8_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1)
     3238template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd256<64>::extract(bitblock256_t arg1)
    32323239{
    32333240        return ((((uint64_t)(mvmd256<(32)>::extract<((2*pos)+1)>(arg1)))<<(32))|mvmd256<(32)>::extract<(2*pos)>(arg1));
     
    32883295}
    32893296
     3297//The total number of operations is 7.5
     // Returns arg1 with the 2-bit field at index pos replaced by the low two bits of arg2.
     // Implemented by read-modify-write of the enclosing 4-bit field (index pos/2):
     // even pos replaces its low two bits, odd pos replaces its high two bits.
     3298template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::insert(bitblock256_t arg1, FieldType<2>::T arg2)
     3299{
     3300        uint32_t v = (arg2&(3));
     3301        uint64_t doublev = mvmd256<(4)>::extract<(pos/2)>(arg1);
     3302        return mvmd256<(4)>::insert<(pos/2)>(arg1, (((pos&1) == 0) ? (((doublev>>2)<<2)|v) : ((doublev&(3))|(v<<2))));
     3303}
     3304
     3305//The total number of operations is 6.0
     // Returns arg1 with the 4-bit field at index pos replaced by the low four bits of arg2.
     // Implemented by read-modify-write of the enclosing 8-bit field (index pos/2):
     // even pos replaces its low nibble, odd pos replaces its high nibble.
     3306template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::insert(bitblock256_t arg1, FieldType<4>::T arg2)
     3307{
     3308        uint32_t v = (arg2&(15));
     3309        uint64_t doublev = mvmd256<(8)>::extract<(pos/2)>(arg1);
     3310        return mvmd256<(8)>::insert<(pos/2)>(arg1, (((pos&1) == 0) ? (((doublev>>4)<<4)|v) : ((doublev&(15))|(v<<4))));
     3311}
     3312
     3313//The total number of operations is 4.5
     // Returns arg1 with the 8-bit field at index pos replaced by arg2.
     // Implemented by read-modify-write of the enclosing 16-bit field (index pos/2):
     // even pos replaces its low byte, odd pos replaces its high byte.
     3314template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::insert(bitblock256_t arg1, FieldType<8>::T arg2)
     3315{
     3316        uint32_t v = (arg2&(255));
     3317        uint64_t doublev = mvmd256<(16)>::extract<(pos/2)>(arg1);
     3318        return mvmd256<(16)>::insert<(pos/2)>(arg1, (((pos&1) == 0) ? (((doublev>>8)<<8)|v) : ((doublev&(255))|(v<<8))));
     3319}
     3320
     3321//The total number of operations is 3.0
     // Returns arg1 with the 16-bit field at index pos replaced by arg2.
     // The block is split into 128-bit halves; _mm_insert_epi16 updates the half that
     // holds the field (low half for pos < 8, otherwise high half at index pos-8) and
     // avx_general_combine256 rejoins them, called here as (high half, low half).
     // NOTE(review): the selection is a runtime-style ?: rather than a compile-time
     // branch, so both _mm_insert_epi16 calls are instantiated for every pos, including
     // the one whose index (pos or pos-8) falls outside 0..7 - confirm the target
     // compilers accept that immediate.
     3322template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::insert(bitblock256_t arg1, FieldType<16>::T arg2)
     3323{
     3324        return ((pos < 8) ? avx_general_combine256(avx_select_hi128(arg1), _mm_insert_epi16(avx_select_lo128(arg1), (int32_t)(arg2), (int32_t)(pos))) : avx_general_combine256(_mm_insert_epi16(avx_select_hi128(arg1), (int32_t)(arg2), (int32_t)((pos-8))), avx_select_lo128(arg1)));
     3325}
     3326
     3327//The total number of operations is 6.0
     // Returns arg1 with the 32-bit field at index pos replaced by arg2.
     // Done as two 16-bit inserts: the upper 16 bits of arg2 go to 16-bit field
     // 2*pos+1 first, then the lower 16 bits go to 16-bit field 2*pos.
     3328template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::insert(bitblock256_t arg1, FieldType<32>::T arg2)
     3329{
     3330        return mvmd256<(16)>::insert<(2*pos)>(mvmd256<(16)>::insert<((2*pos)+1)>(arg1, (arg2>>(16))), (arg2&(65535)));
     3331}
     3332
     3333//The total number of operations is 12.0
     // Returns arg1 with the 64-bit field at index pos replaced by arg2.
     // Done as two 32-bit inserts: the upper 32 bits of arg2 go to 32-bit field
     // 2*pos+1 first, then the lower 32 bits (masked with 2^32-1) go to field 2*pos.
     3334template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::insert(bitblock256_t arg1, FieldType<64>::T arg2)
     3335{
     3336        return mvmd256<(32)>::insert<(2*pos)>(mvmd256<(32)>::insert<((2*pos)+1)>(arg1, (arg2>>(32))), (arg2&((4294967296ULL)-1)));
     3337}
     3338
    32903339//The total number of operations is 5.0
    32913340template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4)
     
    36073656
    36083657//The total number of operations is 1.0
    // Stores the 256-bit block arg1 to the memory at arg2 with an aligned store.
    // _mm256_store_si256 requires arg2 to be 32-byte aligned.
    // NOTE(review): the cast to bitblock256_t* is a no-op on the declared parameter
    // type; presumably bitblock256_t is __m256i in this AVX2 file - confirm.
    3609 IDISA_ALWAYS_INLINE void bitblock256::store_aligned(bitblock256_t arg1, bitblock256_t* arg2)
    3610 {
    3611         _mm256_store_si256((bitblock256_t*)(arg2), arg1);
    3612 }
    3613 
    3614 //The total number of operations is 1.0
    36153658IDISA_ALWAYS_INLINE void bitblock256::store_unaligned(bitblock256_t arg1, bitblock256_t* arg2)
    36163659{
     
    36243667}
    36253668
     3669//The total number of operations is 1.0
     // Stores the 256-bit block arg1 to the memory at arg2 with an aligned store.
     // _mm256_store_si256 requires arg2 to be 32-byte aligned; use store_unaligned otherwise.
     // NOTE(review): the cast to bitblock256_t* is a no-op on the declared parameter
     // type; presumably bitblock256_t is __m256i in this AVX2 file - confirm.
     3670IDISA_ALWAYS_INLINE void bitblock256::store_aligned(bitblock256_t arg1, bitblock256_t* arg2)
     3671{
     3672        _mm256_store_si256((bitblock256_t*)(arg2), arg1);
     3673}
     3674
    36263675#endif
  • trunk/lib/idisa_cpp/idisa_neon.cpp

    r2275 r3525  
    1414
    1515typedef uint64x2_t bitblock128_t;
     16               
     // FieldType<fw>::T is the scalar type used to pass or return one field of width
     // fw bits. The FIELD_TYPE guard keeps the trait from being defined twice when
     // more than one idisa_*.cpp file (each carries this same block) is included.
     17#ifndef FIELD_TYPE
     18#define FIELD_TYPE     
     // Primary template: any width without a specialization below falls back to int.
     19template <uint32_t fw> struct FieldType {
     20   typedef int T;  //default for FieldType::T is int
     21};
     22
     // Widths up to 8 bits share uint8_t; 16, 32 and 64 map to the same-sized unsigned
     // type; 128 and 256 are capped at uint64_t, the widest scalar used here.
     23template <> struct FieldType<1> {typedef uint8_t T;};
     24template <> struct FieldType<2> {typedef uint8_t T;};
     25template <> struct FieldType<4> {typedef uint8_t T;};
     26template <> struct FieldType<8> {typedef uint8_t T;};
     27template <> struct FieldType<16> {typedef uint16_t T;};
     28template <> struct FieldType<32> {typedef uint32_t T;};
     29template <> struct FieldType<64> {typedef uint64_t T;};
     30template <> struct FieldType<128> {typedef uint64_t T;};
     31template <> struct FieldType<256> {typedef uint64_t T;};
     32#endif
     33
    1634template <uint32_t fw>
    1735class simd128
     
    2240        static IDISA_ALWAYS_INLINE bitblock128_t gt(bitblock128_t arg1, bitblock128_t arg2);
    2341        static IDISA_ALWAYS_INLINE bitblock128_t ult(bitblock128_t arg1, bitblock128_t arg2);
    24         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     42        static IDISA_ALWAYS_INLINE bitblock128_t all(bitblock128_t arg1);
     43        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     44        static IDISA_ALWAYS_INLINE bitblock128_t ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    2545        static IDISA_ALWAYS_INLINE bitblock128_t ctz(bitblock128_t arg1);
    2646        static IDISA_ALWAYS_INLINE bitblock128_t eq(bitblock128_t arg1, bitblock128_t arg2);
     
    2848        static IDISA_ALWAYS_INLINE bitblock128_t neg(bitblock128_t arg1);
    2949        static IDISA_ALWAYS_INLINE bitblock128_t himask();
    30         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    31         static IDISA_ALWAYS_INLINE bitblock128_t ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
     50        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    3251        static IDISA_ALWAYS_INLINE bitblock128_t sub(bitblock128_t arg1, bitblock128_t arg2);
    3352        static IDISA_ALWAYS_INLINE bitblock128_t add_hl(bitblock128_t arg1);
    3453        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
    3554        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    36         template <uint64_t val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
     55        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    3756        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
     57        static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    3858        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    3959        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
    4060        static IDISA_ALWAYS_INLINE bitblock128_t xor_hl(bitblock128_t arg1);
    41         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srai(bitblock128_t arg1);
     61        static IDISA_ALWAYS_INLINE bitblock128_t any(bitblock128_t arg1);
     62        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srai(bitblock128_t arg1);
    4263        static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
    43         static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    4464        static IDISA_ALWAYS_INLINE bitblock128_t ugt(bitblock128_t arg1, bitblock128_t arg2);
    4565};
     
    5373        static IDISA_ALWAYS_INLINE bitblock128_t packss(bitblock128_t arg1, bitblock128_t arg2);
    5474        static IDISA_ALWAYS_INLINE bitblock128_t packh(bitblock128_t arg1, bitblock128_t arg2);
    55         static IDISA_ALWAYS_INLINE uint64_t signmask(bitblock128_t arg1);
     75        static IDISA_ALWAYS_INLINE typename FieldType<128/fw>::T signmask(bitblock128_t arg1);
    5676        static IDISA_ALWAYS_INLINE bitblock128_t packl(bitblock128_t arg1, bitblock128_t arg2);
    5777        static IDISA_ALWAYS_INLINE bitblock128_t min_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    7595{
    7696public:
    77         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t dsrli(bitblock128_t arg1, bitblock128_t arg2);
    78         static IDISA_ALWAYS_INLINE bitblock128_t fill(uint64_t val1);
    79         template <uint64_t pos> static IDISA_ALWAYS_INLINE uint64_t extract(bitblock128_t arg1);
    80         template <uint64_t pos> static IDISA_ALWAYS_INLINE bitblock128_t splat(bitblock128_t arg1);
    81         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    82         static IDISA_ALWAYS_INLINE bitblock128_t fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    83         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
    84         static IDISA_ALWAYS_INLINE bitblock128_t fill2(uint64_t val1, uint64_t val2);
    85         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t dslli(bitblock128_t arg1, bitblock128_t arg2);
    86         static IDISA_ALWAYS_INLINE bitblock128_t fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    87         static IDISA_ALWAYS_INLINE bitblock128_t fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
     97        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t dsrli(bitblock128_t arg1, bitblock128_t arg2);
     98        static IDISA_ALWAYS_INLINE bitblock128_t fill(typename FieldType<fw>::T val1);
     99        template <uint16_t pos> static IDISA_ALWAYS_INLINE typename FieldType<fw>::T extract(bitblock128_t arg1);
     100        template <uint16_t pos> static IDISA_ALWAYS_INLINE bitblock128_t splat(bitblock128_t arg1);
     101        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
     102        static IDISA_ALWAYS_INLINE bitblock128_t fill4(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4);
     103        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     104        static IDISA_ALWAYS_INLINE bitblock128_t fill2(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2);
     105        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t dslli(bitblock128_t arg1, bitblock128_t arg2);
     106        static IDISA_ALWAYS_INLINE bitblock128_t fill8(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4, typename FieldType<fw>::T val5, typename FieldType<fw>::T val6, typename FieldType<fw>::T val7, typename FieldType<fw>::T val8);
     107        static IDISA_ALWAYS_INLINE bitblock128_t fill16(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4, typename FieldType<fw>::T val5, typename FieldType<fw>::T val6, typename FieldType<fw>::T val7, typename FieldType<fw>::T val8, typename FieldType<fw>::T val9, typename FieldType<fw>::T val10, typename FieldType<fw>::T val11, typename FieldType<fw>::T val12, typename FieldType<fw>::T val13, typename FieldType<fw>::T val14, typename FieldType<fw>::T val15, typename FieldType<fw>::T val16);
    88108};
    89109
     
    92112public:
    93113        static IDISA_ALWAYS_INLINE bitblock128_t load_unaligned(const uint64_t const* arg1);
    94         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
    95         static IDISA_ALWAYS_INLINE void store_aligned(bitblock128_t arg1, uint64_t* arg2);
     114        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
    96115        static IDISA_ALWAYS_INLINE bool all(bitblock128_t arg1);
    97116        static IDISA_ALWAYS_INLINE bool any(bitblock128_t arg1);
    98         static IDISA_ALWAYS_INLINE uint64_t popcount(bitblock128_t arg1);
    99         template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
     117        static IDISA_ALWAYS_INLINE uint16_t popcount(bitblock128_t arg1);
     118        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    100119        static IDISA_ALWAYS_INLINE bitblock128_t load_aligned(const uint64_t const* arg1);
     120        static IDISA_ALWAYS_INLINE void store_aligned(bitblock128_t arg1, uint64_t* arg2);
    101121        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock128_t arg1, uint64_t* arg2);
    102122};
     
    105125IDISA_ALWAYS_INLINE bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
    106126IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1);
     127IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
    107128IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    108 IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
     129IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
    109130IDISA_ALWAYS_INLINE bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
    110 IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
    111131template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::max(bitblock128_t arg1, bitblock128_t arg2);
    112132template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::max(bitblock128_t arg1, bitblock128_t arg2);
     
    139159template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ult(bitblock128_t arg1, bitblock128_t arg2);
    140160template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ult(bitblock128_t arg1, bitblock128_t arg2);
    141 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2);
    142 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lt(bitblock128_t arg1, bitblock128_t arg2);
    143 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lt(bitblock128_t arg1, bitblock128_t arg2);
    144 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lt(bitblock128_t arg1, bitblock128_t arg2);
    145 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lt(bitblock128_t arg1, bitblock128_t arg2);
    146 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lt(bitblock128_t arg1, bitblock128_t arg2);
    147 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lt(bitblock128_t arg1, bitblock128_t arg2);
    148 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lt(bitblock128_t arg1, bitblock128_t arg2);
    149 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srli(bitblock128_t arg1);
    150 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srli(bitblock128_t arg1);
    151 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srli(bitblock128_t arg1);
    152 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srli(bitblock128_t arg1);
    153 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srli(bitblock128_t arg1);
    154 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srli(bitblock128_t arg1);
    155 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srli(bitblock128_t arg1);
     161template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::all(bitblock128_t arg1);
     162template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::all(bitblock128_t arg1);
     163template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::all(bitblock128_t arg1);
     164template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::all(bitblock128_t arg1);
     165template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::all(bitblock128_t arg1);
     166template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::all(bitblock128_t arg1);
     167template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::all(bitblock128_t arg1);
     168template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srli(bitblock128_t arg1);
     169template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srli(bitblock128_t arg1);
     170template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srli(bitblock128_t arg1);
     171template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srli(bitblock128_t arg1);
     172template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srli(bitblock128_t arg1);
     173template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srli(bitblock128_t arg1);
     174template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srli(bitblock128_t arg1);
    156175template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ctz(bitblock128_t arg1);
    157176template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ctz(bitblock128_t arg1);
     
    162181template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ctz(bitblock128_t arg1);
    163182template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ctz(bitblock128_t arg1);
     183template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::sub(bitblock128_t arg1, bitblock128_t arg2);
     184template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::sub(bitblock128_t arg1, bitblock128_t arg2);
     185template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::sub(bitblock128_t arg1, bitblock128_t arg2);
     186template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::sub(bitblock128_t arg1, bitblock128_t arg2);
     187template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::sub(bitblock128_t arg1, bitblock128_t arg2);
     188template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::sub(bitblock128_t arg1, bitblock128_t arg2);
     189template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sub(bitblock128_t arg1, bitblock128_t arg2);
     190template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sub(bitblock128_t arg1, bitblock128_t arg2);
    164191template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ugt(bitblock128_t arg1, bitblock128_t arg2);
    165192template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ugt(bitblock128_t arg1, bitblock128_t arg2);
     
    185212template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::popcount(bitblock128_t arg1);
    186213template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::popcount(bitblock128_t arg1);
     214template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::any(bitblock128_t arg1);
     215template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::any(bitblock128_t arg1);
     216template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::any(bitblock128_t arg1);
     217template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::any(bitblock128_t arg1);
     218template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::any(bitblock128_t arg1);
     219template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::any(bitblock128_t arg1);
     220template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::any(bitblock128_t arg1);
    187221template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::neg(bitblock128_t arg1);
    188222template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::neg(bitblock128_t arg1);
     
    192226template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::neg(bitblock128_t arg1);
    193227template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::neg(bitblock128_t arg1);
    194 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::slli(bitblock128_t arg1);
    195 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::slli(bitblock128_t arg1);
    196 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::slli(bitblock128_t arg1);
    197 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::slli(bitblock128_t arg1);
    198 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::slli(bitblock128_t arg1);
    199 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::slli(bitblock128_t arg1);
    200 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::slli(bitblock128_t arg1);
     228template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::slli(bitblock128_t arg1);
     229template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::slli(bitblock128_t arg1);
     230template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::slli(bitblock128_t arg1);
     231template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::slli(bitblock128_t arg1);
     232template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::slli(bitblock128_t arg1);
     233template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::slli(bitblock128_t arg1);
     234template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::slli(bitblock128_t arg1);
    201235template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    202236template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
     
    207241template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    208242template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    209 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::sub(bitblock128_t arg1, bitblock128_t arg2);
    210 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::sub(bitblock128_t arg1, bitblock128_t arg2);
    211 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::sub(bitblock128_t arg1, bitblock128_t arg2);
    212 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::sub(bitblock128_t arg1, bitblock128_t arg2);
    213 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::sub(bitblock128_t arg1, bitblock128_t arg2);
    214 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::sub(bitblock128_t arg1, bitblock128_t arg2);
    215 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sub(bitblock128_t arg1, bitblock128_t arg2);
    216 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sub(bitblock128_t arg1, bitblock128_t arg2);
     243template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
     244template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
     245template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
     246template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
     247template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
     248template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
     249template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    217250template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add_hl(bitblock128_t arg1);
    218251template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add_hl(bitblock128_t arg1);
     
    222255template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add_hl(bitblock128_t arg1);
    223256template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1);
    224 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
    225 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
    226 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::constant();
    227 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::constant();
    228 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::constant();
    229 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::constant();
    230 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::constant();
    231 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::constant();
     257template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     258template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     259template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     260template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     261template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     262template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     263template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
     264template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
     265template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
     266template <> template <FieldType<4>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::constant();
     267template <> template <FieldType<8>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::constant();
     268template <> template <FieldType<16>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::constant();
     269template <> template <FieldType<32>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::constant();
     270template <> template <FieldType<64>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::constant();
     271template <> template <FieldType<128>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::constant();
    232272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::min(bitblock128_t arg1, bitblock128_t arg2);
    233273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::min(bitblock128_t arg1, bitblock128_t arg2);
     
    238278template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    239279template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
    240 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    241 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    242 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    243 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    244 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    245 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    246 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    247280template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    248281template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    253286template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    254287template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    255 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    256 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    257 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    258 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    259 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    260 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    261 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     288template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     289template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     290template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     291template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     292template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     293template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     294template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     295template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     296template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2);
     297template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lt(bitblock128_t arg1, bitblock128_t arg2);
     298template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lt(bitblock128_t arg1, bitblock128_t arg2);
     299template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lt(bitblock128_t arg1, bitblock128_t arg2);
     300template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lt(bitblock128_t arg1, bitblock128_t arg2);
     301template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lt(bitblock128_t arg1, bitblock128_t arg2);
     302template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lt(bitblock128_t arg1, bitblock128_t arg2);
     303template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lt(bitblock128_t arg1, bitblock128_t arg2);
    262304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    263305template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     
    268310template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
    269311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
    270 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
    271 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
    272 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
    273 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
    274 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
    275 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
    276 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    277312template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask();
    278313template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask();
     
    290325template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    291326template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    292 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    293 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    294 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    295 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    296 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    297 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    298 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    299 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     327template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     328template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     329template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     330template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     331template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     332template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     333template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
    300334template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    301335template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    319353template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<64>::packss(bitblock128_t arg1, bitblock128_t arg2);
    320354template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<128>::packss(bitblock128_t arg1, bitblock128_t arg2);
    321 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<4>::signmask(bitblock128_t arg1);
    322 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<8>::signmask(bitblock128_t arg1);
    323 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<16>::signmask(bitblock128_t arg1);
    324 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<32>::signmask(bitblock128_t arg1);
    325 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<64>::signmask(bitblock128_t arg1);
    326 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<128>::signmask(bitblock128_t arg1);
     355template <> IDISA_ALWAYS_INLINE FieldType<128/4>::T hsimd128<4>::signmask(bitblock128_t arg1);
     356template <> IDISA_ALWAYS_INLINE FieldType<128/8>::T hsimd128<8>::signmask(bitblock128_t arg1);
     357template <> IDISA_ALWAYS_INLINE FieldType<128/16>::T hsimd128<16>::signmask(bitblock128_t arg1);
     358template <> IDISA_ALWAYS_INLINE FieldType<128/32>::T hsimd128<32>::signmask(bitblock128_t arg1);
     359template <> IDISA_ALWAYS_INLINE FieldType<128/64>::T hsimd128<64>::signmask(bitblock128_t arg1);
     360template <> IDISA_ALWAYS_INLINE FieldType<128/128>::T hsimd128<128>::signmask(bitblock128_t arg1);
    327361template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::packl(bitblock128_t arg1, bitblock128_t arg2);
    328362template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::packl(bitblock128_t arg1, bitblock128_t arg2);
     
    395429template <> IDISA_ALWAYS_INLINE bitblock128_t esimd128<32>::signextendl(bitblock128_t arg1);
    396430template <> IDISA_ALWAYS_INLINE bitblock128_t esimd128<64>::signextendl(bitblock128_t arg1);
    397 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    398 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    399 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    400 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    401 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    402 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    403 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
    404 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill(uint64_t val1);
    405 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill(uint64_t val1);
    406 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill(uint64_t val1);
    407 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill(uint64_t val1);
    408 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill(uint64_t val1);
    409 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill(uint64_t val1);
    410 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill(uint64_t val1);
    411 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::fill(uint64_t val1);
    412 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<1>::extract(bitblock128_t arg1);
    413 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<2>::extract(bitblock128_t arg1);
    414 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<4>::extract(bitblock128_t arg1);
    415 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<8>::extract(bitblock128_t arg1);
    416 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<16>::extract(bitblock128_t arg1);
    417 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<32>::extract(bitblock128_t arg1);
    418 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<64>::extract(bitblock128_t arg1);
    419 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::splat(bitblock128_t arg1);
    420 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::splat(bitblock128_t arg1);
    421 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::splat(bitblock128_t arg1);
    422 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::splat(bitblock128_t arg1);
    423 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::splat(bitblock128_t arg1);
    424 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::splat(bitblock128_t arg1);
    425 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::splat(bitblock128_t arg1);
    426 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::splat(bitblock128_t arg1);
    427 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    428 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    429 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    430 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    431 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    432 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    433 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    434 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    435 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    436 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    437 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::srli(bitblock128_t arg1);
    438 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::srli(bitblock128_t arg1);
    439 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::srli(bitblock128_t arg1);
    440 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::srli(bitblock128_t arg1);
    441 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::srli(bitblock128_t arg1);
    442 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::srli(bitblock128_t arg1);
    443 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::srli(bitblock128_t arg1);
    444 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill2(uint64_t val1, uint64_t val2);
    445 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill2(uint64_t val1, uint64_t val2);
    446 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill2(uint64_t val1, uint64_t val2);
    447 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill2(uint64_t val1, uint64_t val2);
    448 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill2(uint64_t val1, uint64_t val2);
    449 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill2(uint64_t val1, uint64_t val2);
    450 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill2(uint64_t val1, uint64_t val2);
    451 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    452 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    453 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    454 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    455 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    456 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    457 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    458 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
    459 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
    460 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
    461 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
    462 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
    463 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
    464 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
    465 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    466 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    467 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    468 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    469 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
     431template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     432template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     433template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     434template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     435template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     436template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     437template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dsrli(bitblock128_t arg1, bitblock128_t arg2);
     438template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16);
     439template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16);
     440template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16);
     441template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16);
     442template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill(FieldType<1>::T val1);
     443template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill(FieldType<2>::T val1);
     444template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill(FieldType<4>::T val1);
     445template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill(FieldType<8>::T val1);
     446template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill(FieldType<16>::T val1);
     447template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill(FieldType<32>::T val1);
     448template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill(FieldType<64>::T val1);
     449template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::fill(FieldType<128>::T val1);
     450template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd128<1>::extract(bitblock128_t arg1);
     451template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd128<2>::extract(bitblock128_t arg1);
     452template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd128<4>::extract(bitblock128_t arg1);
     453template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd128<8>::extract(bitblock128_t arg1);
     454template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd128<16>::extract(bitblock128_t arg1);
     455template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd128<32>::extract(bitblock128_t arg1);
     456template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd128<64>::extract(bitblock128_t arg1);
     457template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::splat(bitblock128_t arg1);
     458template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::splat(bitblock128_t arg1);
     459template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::splat(bitblock128_t arg1);
     460template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::splat(bitblock128_t arg1);
     461template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::splat(bitblock128_t arg1);
     462template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::splat(bitblock128_t arg1);
     463template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::splat(bitblock128_t arg1);
     464template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::splat(bitblock128_t arg1);
     465template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4);
     466template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4);
     467template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill4(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4);
     468template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill4(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4);
     469template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill4(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4);
     470template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill4(FieldType<32>::T val1, FieldType<32>::T val2, FieldType<32>::T val3, FieldType<32>::T val4);
     471template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::srli(bitblock128_t arg1);
     472template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::srli(bitblock128_t arg1);
     473template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::srli(bitblock128_t arg1);
     474template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::srli(bitblock128_t arg1);
     475template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::srli(bitblock128_t arg1);
     476template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::srli(bitblock128_t arg1);
     477template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::srli(bitblock128_t arg1);
     478template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill2(FieldType<1>::T val1, FieldType<1>::T val2);
     479template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill2(FieldType<2>::T val1, FieldType<2>::T val2);
     480template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill2(FieldType<4>::T val1, FieldType<4>::T val2);
     481template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill2(FieldType<8>::T val1, FieldType<8>::T val2);
     482template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill2(FieldType<16>::T val1, FieldType<16>::T val2);
     483template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill2(FieldType<32>::T val1, FieldType<32>::T val2);
     484template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill2(FieldType<64>::T val1, FieldType<64>::T val2);
     485template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     486template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     487template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     488template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     489template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     490template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     491template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     492template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
     493template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
     494template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
     495template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
     496template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
     497template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
     498template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
     499template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8);
     500template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8);
     501template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill8(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8);
     502template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill8(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8);
     503template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill8(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8);
    470504
    471505//Implementation Part
     
    489523
    490524//The total number of operations is 1.0
     525IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2)
     526{
     527        return vorrq_u64(arg1, arg2);
     528}
     529
     530//The total number of operations is 1.0
    491531IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
    492532{
     
    495535
    496536//The total number of operations is 1.0
    497 IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2)
    498 {
    499         return vorrq_u64(arg1, arg2);
     537IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2)
     538{
     539        return vandq_u64(arg1, arg2);
    500540}
    501541
     
    504544{
    505545        return veorq_u64(arg1, arg2);
    506 }
    507 
    508 //The total number of operations is 1.0
    509 IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2)
    510 {
    511         return vandq_u64(arg1, arg2);
    512546}
    513547
     
    723757}
    724758
     759//The total number of operations is 5.5
     760template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::all(bitblock128_t arg1)
     761{
     762        bitblock128_t f0 = simd_and(arg1, simd128<2>::srli<1>(arg1));
     763        return simd_or(f0, simd128<2>::slli<1>(f0));
     764}
     765
     766//The total number of operations is 9.0
     767template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::all(bitblock128_t arg1)
     768{
     769        return simd128<4>::eq(arg1, simd128<8>::constant<255>());
     770}
     771
     772//The total number of operations is 1.0
     773template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::all(bitblock128_t arg1)
     774{
     775        return simd128<8>::eq(arg1, simd128<8>::constant<255>());
     776}
     777
     778//The total number of operations is 1.0
     779template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::all(bitblock128_t arg1)
     780{
     781        return simd128<16>::eq(arg1, simd128<8>::constant<255>());
     782}
     783
     784//The total number of operations is 1.0
     785template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::all(bitblock128_t arg1)
     786{
     787        return simd128<32>::eq(arg1, simd128<8>::constant<255>());
     788}
     789
     790//The total number of operations is 4.5
     791template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::all(bitblock128_t arg1)
     792{
     793        return simd128<64>::eq(arg1, simd128<8>::constant<255>());
     794}
     795
     796//The total number of operations is 5.0
     797template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::all(bitblock128_t arg1)
     798{
     799        return ((bitblock128::all(arg1)) ? simd128<8>::constant<255>() : simd128<8>::constant<0>());
     800}
     801
     802//The total number of operations is 2.0
     803template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srli(bitblock128_t arg1)
     804{
     805        return simd_and(simd128<32>::srli<sh>(arg1), simd128<2>::constant<((3)>>sh)>());
     806}
     807
     808//The total number of operations is 2.0
     809template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srli(bitblock128_t arg1)
     810{
     811        return simd_and(simd128<32>::srli<sh>(arg1), simd128<4>::constant<((15)>>sh)>());
     812}
     813
     814//The total number of operations is 1.0
     815template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srli(bitblock128_t arg1)
     816{
     817        return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_u8((uint8x16_t)(arg1), (int32_t)(sh)))));
     818}
     819
     820//The total number of operations is 1.0
     821template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srli(bitblock128_t arg1)
     822{
     823        return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_u16((uint16x8_t)(arg1), (int32_t)(sh)))));
     824}
     825
     826//The total number of operations is 1.0
     827template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srli(bitblock128_t arg1)
     828{
     829        return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_u32((uint32x4_t)(arg1), (int32_t)(sh)))));
     830}
     831
     832//The total number of operations is 1.0
     833template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srli(bitblock128_t arg1)
     834{
     835        return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_u64((uint64x2_t)(arg1), (int32_t)(sh)))));
     836}
     837
     838//The total number of operations is 3.16666666667
     839template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srli(bitblock128_t arg1)
     840{
     841        return ((sh == 64) ? neon_shift_right_64_bits(arg1) : ((sh > 64) ? simd128<64>::srli<(sh&63)>(neon_shift_right_64_bits(arg1)) : simd_or(neon_shift_right_64_bits(simd128<64>::slli<(64-sh)>(arg1)), simd128<64>::srli<sh>(arg1))));
     842}
     843
     844//The total number of operations is 1.0
     845template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ctz(bitblock128_t arg1)
     846{
     847        return simd_not(arg1);
     848}
     849
     850//The total number of operations is 9.5
     851template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ctz(bitblock128_t arg1)
     852{
     853        bitblock128_t tmp = simd_not(arg1);
     854        return simd128<1>::ifh(simd128<2>::himask(), simd_and(tmp, simd128<128>::slli<1>(tmp)), simd_and(simd128<128>::srli<1>(arg1), tmp));
     855}
     856
     857//The total number of operations is 12.0
     858template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::ctz(bitblock128_t arg1)
     859{
     860        return simd128<4>::popcount(simd_andc(simd128<4>::sub(arg1, simd128<4>::constant<1>()), arg1));
     861}
     862
     863//The total number of operations is 3.0
     864template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::ctz(bitblock128_t arg1)
     865{
     866        return simd128<8>::popcount(simd_andc(simd128<8>::sub(arg1, simd128<8>::constant<1>()), arg1));
     867}
     868
     869//The total number of operations is 6.0
     870template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::ctz(bitblock128_t arg1)
     871{
     872        return simd128<16>::popcount(simd_andc(simd128<16>::sub(arg1, simd128<16>::constant<1>()), arg1));
     873}
     874
     875//The total number of operations is 9.0
     876template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::ctz(bitblock128_t arg1)
     877{
     878        return simd128<32>::popcount(simd_andc(simd128<32>::sub(arg1, simd128<32>::constant<1>()), arg1));
     879}
     880
     881//The total number of operations is 12.0
     882template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ctz(bitblock128_t arg1)
     883{
     884        return simd128<64>::popcount(simd_andc(simd128<64>::sub(arg1, simd128<64>::constant<1>()), arg1));
     885}
     886
     887//The total number of operations is 25.5
     888template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ctz(bitblock128_t arg1)
     889{
     890        return simd128<128>::popcount(simd_andc(simd128<128>::sub(arg1, simd128<128>::constant<1>()), arg1));
     891}
     892
     893//The total number of operations is 1.0
     894template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::sub(bitblock128_t arg1, bitblock128_t arg2)
     895{
     896        return simd_xor(arg1, arg2);
     897}
     898
     899//The total number of operations is 7.33333333333
     900template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::sub(bitblock128_t arg1, bitblock128_t arg2)
     901{
     902        bitblock128_t tmp = simd_xor(arg1, arg2);
     903        return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(simd_not(arg1), arg2))), tmp);
     904}
     905
     906//The total number of operations is 4.0
     907template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::sub(bitblock128_t arg1, bitblock128_t arg2)
     908{
     909        return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::sub(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::sub(arg1, arg2));
     910}
     911
     912//The total number of operations is 1.0
     913template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::sub(bitblock128_t arg1, bitblock128_t arg2)
     914{
     915        return (bitblock128_t)vsubq_u8((uint8x16_t)(arg1), (uint8x16_t)(arg2));
     916}
     917
     918//The total number of operations is 1.0
     919template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::sub(bitblock128_t arg1, bitblock128_t arg2)
     920{
     921        return (bitblock128_t)vsubq_u16((uint16x8_t)(arg1), (uint16x8_t)(arg2));
     922}
     923
     924//The total number of operations is 1.0
     925template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::sub(bitblock128_t arg1, bitblock128_t arg2)
     926{
     927        return (bitblock128_t)vsubq_u32((uint32x4_t)(arg1), (uint32x4_t)(arg2));
     928}
     929
     930//The total number of operations is 1.0
     931template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sub(bitblock128_t arg1, bitblock128_t arg2)
     932{
     933        return (bitblock128_t)vsubq_u64((uint64x2_t)(arg1), (uint64x2_t)(arg2));
     934}
     935
     936//The total number of operations is 9.33333333333
     937template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sub(bitblock128_t arg1, bitblock128_t arg2)
     938{
     939        bitblock128_t partial = simd128<(64)>::sub(arg1, arg2);
     940        bitblock128_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_andc(partial, simd_xor(arg1, arg2)));
     941        bitblock128_t borrow = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(borrowMask));
     942        return simd128<(64)>::sub(partial, borrow);
     943}
     944
     945//The total number of operations is 1.0
     946template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     947{
     948        return simd_andc(arg1, arg2);
     949}
     950
     951//The total number of operations is 12.5
     952template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     953{
     954        bitblock128_t tmp = simd_not(arg2);
     955        bitblock128_t tmpAns = simd_or(simd_and(arg1, tmp), simd_and(simd128<128>::slli<1>(simd_and(arg1, tmp)), simd_or(arg1, tmp)));
     956        return simd128<1>::ifh(simd128<2>::himask(), tmpAns, simd128<128>::srli<1>(tmpAns));
     957}
     958
     959//The total number of operations is 7.0
     960template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     961{
     962        bitblock128_t high_bit = simd128<4>::constant<(8)>();
     963        return simd128<4>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     964}
     965
     966//The total number of operations is 3.0
     967template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     968{
     969        bitblock128_t high_bit = simd128<8>::constant<(128)>();
     970        return simd128<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     971}
     972
     973//The total number of operations is 3.0
     974template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     975{
     976        bitblock128_t high_bit = simd128<16>::constant<(32768)>();
     977        return simd128<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     978}
     979
     980//The total number of operations is 3.0
     981template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     982{
     983        bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
     984        return simd128<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
     985}
     986
     987//The total number of operations is 9.5
     988template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     989{
     990        bitblock128_t tmpAns = simd128<(32)>::ugt(arg1, arg2);
     991        bitblock128_t mask = simd_and(tmpAns, simd128<64>::srli<(32)>(simd128<(32)>::eq(arg1, arg2)));
     992        mask = simd_or(mask, simd128<64>::slli<(32)>(mask));
     993        return simd_or(simd128<64>::srai<(32)>(tmpAns), mask);
     994}
     995
     996//The total number of operations is 29.1666666667
     997template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ugt(bitblock128_t arg1, bitblock128_t arg2)
     998{
     999        bitblock128_t tmpAns = simd128<(64)>::ugt(arg1, arg2);
     1000        bitblock128_t mask = simd_and(tmpAns, simd128<128>::srli<(64)>(simd128<(64)>::eq(arg1, arg2)));
     1001        mask = simd_or(mask, simd128<128>::slli<(64)>(mask));
     1002        return simd_or(simd128<128>::srai<(64)>(tmpAns), mask);
     1003}
     1004
     1005//The total number of operations is 4.0
     1006template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::xor_hl(bitblock128_t arg1)
     1007{
     1008        return simd_xor(simd128<2>::srli<(1)>(arg1), simd_and(arg1, simd128<2>::lomask()));
     1009}
     1010
     1011//The total number of operations is 4.0
     1012template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::xor_hl(bitblock128_t arg1)
     1013{
     1014        return simd_xor(simd128<4>::srli<(2)>(arg1), simd_and(arg1, simd128<4>::lomask()));
     1015}
     1016
     1017//The total number of operations is 3.0
     1018template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::xor_hl(bitblock128_t arg1)
     1019{
     1020        return simd_xor(simd128<8>::srli<(4)>(arg1), simd_and(arg1, simd128<8>::lomask()));
     1021}
     1022
     1023//The total number of operations is 3.0
     1024template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::xor_hl(bitblock128_t arg1)
     1025{
     1026        return simd_xor(simd128<16>::srli<(8)>(arg1), simd_and(arg1, simd128<16>::lomask()));
     1027}
     1028
     1029//The total number of operations is 3.0
     1030template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::xor_hl(bitblock128_t arg1)
     1031{
     1032        return simd_xor(simd128<32>::srli<(16)>(arg1), simd_and(arg1, simd128<32>::lomask()));
     1033}
     1034
     1035//The total number of operations is 3.0
     1036template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::xor_hl(bitblock128_t arg1)
     1037{
     1038        return simd_xor(simd128<64>::srli<(32)>(arg1), simd_and(arg1, simd128<64>::lomask()));
     1039}
     1040
     1041//The total number of operations is 5.16666666667
     1042template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::xor_hl(bitblock128_t arg1)
     1043{
     1044        return simd_xor(simd128<128>::srli<(64)>(arg1), simd_and(arg1, simd128<128>::lomask()));
     1045}
     1046
     1047//The total number of operations is 0
     1048template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::popcount(bitblock128_t arg1)
     1049{
     1050        return arg1;
     1051}
     1052
     1053//The total number of operations is 3.0
     1054template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::popcount(bitblock128_t arg1)
     1055{
     1056        return simd128<2>::add_hl(simd128<(1)>::popcount(arg1));
     1057}
     1058
     1059//The total number of operations is 7.0
     1060template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::popcount(bitblock128_t arg1)
     1061{
     1062        return simd128<4>::add_hl(simd128<(2)>::popcount(arg1));
     1063}
     1064
     1065//The total number of operations is 1.0
     1066template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::popcount(bitblock128_t arg1)
     1067{
     1068        return (bitblock128_t)vcntq_u8((uint8x16_t)(arg1));
     1069}
     1070
     1071//The total number of operations is 4.0
     1072template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::popcount(bitblock128_t arg1)
     1073{
     1074        return simd128<16>::add_hl(simd128<(8)>::popcount(arg1));
     1075}
     1076
     1077//The total number of operations is 7.0
     1078template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::popcount(bitblock128_t arg1)
     1079{
     1080        return simd128<32>::add_hl(simd128<(16)>::popcount(arg1));
     1081}
     1082
     1083//The total number of operations is 10.0
     1084template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::popcount(bitblock128_t arg1)
     1085{
     1086        return simd128<64>::add_hl(simd128<(32)>::popcount(arg1));
     1087}
     1088
     1089//The total number of operations is 15.1666666667
     1090template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::popcount(bitblock128_t arg1)
     1091{
     1092        bitblock128_t tmpAns = simd128<(64)>::popcount(arg1);
     1093        return simd128<(64)>::add(simd_and(tmpAns, simd128<128>::lomask()), simd128<128>::srli<(64)>(tmpAns));
     1094}
     1095
     1096//The total number of operations is 7.5
     1097template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::any(bitblock128_t arg1)
     1098{
     1099        bitblock128_t t0 = simd128<2>::srli<1>(arg1);
     1100        bitblock128_t f0 = simd_or(t0, simd_and(arg1, simd_xor(t0, simd128<8>::constant<255>())));
     1101        return simd_or(f0, simd128<2>::slli<1>(f0));
     1102}
     1103
     1104//The total number of operations is 7.0
     1105template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::any(bitblock128_t arg1)
     1106{
     1107        return simd128<4>::ugt(arg1, simd128<8>::constant<0>());
     1108}
     1109
     1110//The total number of operations is 3.0
     1111template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::any(bitblock128_t arg1)
     1112{
     1113        return simd128<8>::ugt(arg1, simd128<8>::constant<0>());
     1114}
     1115
     1116//The total number of operations is 3.0
     1117template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::any(bitblock128_t arg1)
     1118{
     1119        return simd128<16>::ugt(arg1, simd128<8>::constant<0>());
     1120}
     1121
     1122//The total number of operations is 3.0
     1123template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::any(bitblock128_t arg1)
     1124{
     1125        return simd128<32>::ugt(arg1, simd128<8>::constant<0>());
     1126}
     1127
     1128//The total number of operations is 9.5
     1129template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::any(bitblock128_t arg1)
     1130{
     1131        return simd128<64>::ugt(arg1, simd128<8>::constant<0>());
     1132}
     1133
     1134//The total number of operations is 5.0
     1135template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::any(bitblock128_t arg1)
     1136{
     1137        return ((bitblock128::any(arg1)) ? simd128<8>::constant<255>() : simd128<8>::constant<0>());
     1138}
     1139
     1140//The total number of operations is 4.33333333333
     1141template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::neg(bitblock128_t arg1)
     1142{
     1143        return simd128<1>::ifh(simd128<2>::himask(), simd_xor(arg1, simd128<128>::slli<1>(arg1)), arg1);
     1144}
     1145
     1146//The total number of operations is 4.0
     1147template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::neg(bitblock128_t arg1)
     1148{
     1149        return simd128<4>::sub(simd128<4>::constant<0>(), arg1);
     1150}
     1151
     1152//The total number of operations is 1.0
     1153template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::neg(bitblock128_t arg1)
     1154{
     1155        return (bitblock128_t)vnegq_s8((int8x16_t)(arg1));
     1156}
     1157
     1158//The total number of operations is 1.0
     1159template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::neg(bitblock128_t arg1)
     1160{
     1161        return (bitblock128_t)vnegq_s16((int16x8_t)(arg1));
     1162}
     1163
     1164//The total number of operations is 1.0
     1165template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::neg(bitblock128_t arg1)
     1166{
     1167        return (bitblock128_t)vnegq_s32((int32x4_t)(arg1));
     1168}
     1169
     1170//The total number of operations is 1.0
     1171template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::neg(bitblock128_t arg1)
     1172{
     1173        return simd128<64>::sub(simd128<64>::constant<0>(), arg1);
     1174}
     1175
     1176//The total number of operations is 9.33333333333
     1177template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::neg(bitblock128_t arg1)
     1178{
     1179        return simd128<128>::sub(simd128<128>::constant<0>(), arg1);
     1180}
     1181
     1182//The total number of operations is 1.5
     1183template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::slli(bitblock128_t arg1)
     1184{
     1185        return simd_and(simd128<32>::slli<sh>(arg1), simd128<2>::constant<(((3)<<sh)&(3))>());
     1186}
     1187
     1188//The total number of operations is 1.5
     1189template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::slli(bitblock128_t arg1)
     1190{
     1191        return simd_and(simd128<32>::slli<sh>(arg1), simd128<4>::constant<(((15)<<sh)&(15))>());
     1192}
     1193
     1194//The total number of operations is 0.5
     1195template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::slli(bitblock128_t arg1)
     1196{
     1197        return ((sh == 8) ? simd128<32>::constant<0>() : ((bitblock128_t)(vshlq_n_u8((uint8x16_t)(arg1), (int32_t)(sh)))));
     1198}
     1199
     1200//The total number of operations is 0.5
     1201template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::slli(bitblock128_t arg1)
     1202{
     1203        return ((sh == 16) ? simd128<32>::constant<0>() : ((bitblock128_t)(vshlq_n_u16((uint16x8_t)(arg1), (int32_t)(sh)))));
     1204}
     1205
     1206//The total number of operations is 0.5
     1207template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::slli(bitblock128_t arg1)
     1208{
     1209        return ((sh == 32) ? simd128<32>::constant<0>() : ((bitblock128_t)(vshlq_n_u32((uint32x4_t)(arg1), (int32_t)(sh)))));
     1210}
     1211
     1212//The total number of operations is 0.5
     1213template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::slli(bitblock128_t arg1)
     1214{
     1215        return ((sh == 64) ? simd128<32>::constant<0>() : ((bitblock128_t)(vshlq_n_u64((uint64x2_t)(arg1), (int32_t)(sh)))));
     1216}
     1217
     1218//The total number of operations is 2.33333333333
     1219template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::slli(bitblock128_t arg1)
     1220{
     1221        return ((sh == 128) ? simd128<32>::constant<0>() : ((sh >= 64) ? simd128<64>::slli<(sh&63)>(neon_shift_left_64_bits(arg1)) : simd_or(neon_shift_left_64_bits(simd128<64>::srli<(64-sh)>(arg1)), simd128<64>::slli<sh>(arg1))));
     1222}
     1223
     1224//The total number of operations is 1.0
     1225template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1226{
     1227        return vbslq_u64(arg1, arg2, arg3);
     1228}
     1229
     1230//The total number of operations is 4.0
     1231template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1232{
     1233        return simd128<(1)>::ifh(simd128<1>::ifh(simd128<2>::himask(), arg1, simd128<2>::srli<(1)>(arg1)), arg2, arg3);
     1234}
     1235
     1236//The total number of operations is 6.0
     1237template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1238{
     1239        return simd128<1>::ifh(simd128<4>::gt(simd128<4>::constant<0>(), arg1), arg2, arg3);
     1240}
     1241
     1242//The total number of operations is 2.0
     1243template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1244{
     1245        return simd128<1>::ifh(simd128<8>::gt(simd128<8>::constant<0>(), arg1), arg2, arg3);
     1246}
     1247
     1248//The total number of operations is 2.0
     1249template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1250{
     1251        return simd128<1>::ifh(simd128<16>::gt(simd128<16>::constant<0>(), arg1), arg2, arg3);
     1252}
     1253
     1254//The total number of operations is 2.0
     1255template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1256{
     1257        return simd128<1>::ifh(simd128<32>::gt(simd128<32>::constant<0>(), arg1), arg2, arg3);
     1258}
     1259
     1260//The total number of operations is 4.0
     1261template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1262{
     1263        return simd128<(32)>::ifh(simd128<1>::ifh(simd128<64>::himask(), arg1, simd128<64>::srli<(32)>(arg1)), arg2, arg3);
     1264}
     1265
     1266//The total number of operations is 8.16666666667
     1267template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
     1268{
     1269        return simd128<(64)>::ifh(simd128<1>::ifh(simd128<128>::himask(), arg1, simd128<128>::srli<(64)>(arg1)), arg2, arg3);
     1270}
     1271
     1272//The total number of operations is 4.0
     1273template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
     1274{
     1275        return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
     1276}
     1277
     1278//The total number of operations is 8.0
     1279template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
     1280{
     1281        bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1282        return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1283}
     1284
     1285//The total number of operations is 1.0
     1286template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
     1287{
     1288        return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s8((int8x16_t)(arg1), (int32_t)(sh)))));
     1289}
     1290
     1291//The total number of operations is 1.0
     1292template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
     1293{
     1294        return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s16((int16x8_t)(arg1), (int32_t)(sh)))));
     1295}
     1296
     1297//The total number of operations is 1.0
     1298template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
     1299{
     1300        return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s32((int32x4_t)(arg1), (int32_t)(sh)))));
     1301}
     1302
     1303//The total number of operations is 1.0
     1304template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
     1305{
     1306        return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s64((int64x2_t)(arg1), (int32_t)(sh)))));
     1307}
     1308
     1309//The total number of operations is 6.66666666667
     1310template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
     1311{
     1312        return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
     1313}
     1314
     1315//The total number of operations is 3.0
     1316template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add_hl(bitblock128_t arg1)
     1317{
     1318        return simd128<16>::sub(arg1, simd_and(simd128<2>::lomask(), simd128<16>::srli<1>(arg1)));
     1319}
     1320
     1321//The total number of operations is 4.0
     1322template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add_hl(bitblock128_t arg1)
     1323{
     1324        return simd128<(8)>::add(simd128<4>::srli<(2)>(arg1), simd_and(arg1, simd128<4>::lomask()));
     1325}
     1326
     1327//The total number of operations is 3.0
     1328template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add_hl(bitblock128_t arg1)
     1329{
     1330        return simd128<(16)>::add(simd128<8>::srli<(4)>(arg1), simd_and(arg1, simd128<8>::lomask()));
     1331}
     1332
     1333//The total number of operations is 3.0
     1334template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add_hl(bitblock128_t arg1)
     1335{
     1336        return simd128<(32)>::add(simd128<16>::srli<(8)>(arg1), simd_and(arg1, simd128<16>::lomask()));
     1337}
     1338
     1339//The total number of operations is 3.0
     1340template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add_hl(bitblock128_t arg1)
     1341{
     1342        return simd128<(64)>::add(simd128<32>::srli<(16)>(arg1), simd_and(arg1, simd128<32>::lomask()));
     1343}
     1344
     1345//The total number of operations is 3.0
     1346template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add_hl(bitblock128_t arg1)
     1347{
     1348        return simd128<64>::add(simd128<64>::srli<(32)>(arg1), simd_and(arg1, simd128<64>::lomask()));
     1349}
     1350
     1351//The total number of operations is 13.5
     1352template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1)
     1353{
     1354        return simd128<128>::add(simd128<128>::srli<(64)>(arg1), simd_and(arg1, simd128<128>::lomask()));
     1355}
     1356
     1357//The total number of operations is 0
     1358template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
     1359{
     1360        return simd128<2>::constant<(1)>();
     1361}
     1362
     1363//The total number of operations is 0
     1364template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
     1365{
     1366        return simd128<4>::constant<(3)>();
     1367}
     1368
     1369//The total number of operations is 0
     1370template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
     1371{
     1372        return simd128<8>::constant<(15)>();
     1373}
     1374
     1375//The total number of operations is 0
     1376template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
     1377{
     1378        return simd128<16>::constant<(255)>();
     1379}
     1380
     1381//The total number of operations is 0
     1382template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
     1383{
     1384        return simd128<32>::constant<(65535)>();
     1385}
     1386
     1387//The total number of operations is 0
     1388template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
     1389{
     1390        return simd128<64>::constant<4294967295ULL>();
     1391}
     1392
     1393//The total number of operations is 0
     1394template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
     1395{
     1396        return vsetq_lane_u64((uint64_t)(-1), simd128<64>::constant<0>(), (int32_t)(0));
     1397}
     1398
     1399//The total number of operations is 0
     1400template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
     1401{
     1402        return simd128<32>::constant<(-1*val)>();
     1403}
     1404
     1405//The total number of operations is 0
     1406template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant()
     1407{
     1408        return ((val < 0) ? simd128<(4)>::constant<((val<<2)|(val^(-4)))>() : simd128<(4)>::constant<((val<<2)|val)>());
     1409}
     1410
     1411//The total number of operations is 0
     1412template <> template <FieldType<4>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::constant()
     1413{
     1414        return ((val < 0) ? simd128<(8)>::constant<((val<<4)|(val^(-16)))>() : simd128<(8)>::constant<((val<<4)|val)>());
     1415}
     1416
     1417//The total number of operations is 0
     1418template <> template <FieldType<8>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::constant()
     1419{
     1420        return (bitblock128_t)vdupq_n_u8((uint8_t)(val));
     1421}
     1422
     1423//The total number of operations is 0
     1424template <> template <FieldType<16>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::constant()
     1425{
     1426        return (bitblock128_t)vdupq_n_u16((uint16_t)(val));
     1427}
     1428
     1429//The total number of operations is 0
     1430template <> template <FieldType<32>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::constant()
     1431{
     1432        return (bitblock128_t)vdupq_n_u32((uint32_t)(val));
     1433}
     1434
     1435//The total number of operations is 0
     1436template <> template <FieldType<64>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::constant()
     1437{
     1438        return (bitblock128_t)vdupq_n_u64((uint64_t)(val));
     1439}
     1440
     1441//The total number of operations is 0
     1442template <> template <FieldType<128>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::constant()
     1443{
     1444        return vsetq_lane_u64((uint64_t)(0), simd128<64>::constant<val>(), (int32_t)(1));
     1445}
     1446
     1447//The total number of operations is 1.0
     1448template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::min(bitblock128_t arg1, bitblock128_t arg2)
     1449{
     1450        return simd_or(arg1, arg2);
     1451}
     1452
     1453//The total number of operations is 13.0
     1454template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::min(bitblock128_t arg1, bitblock128_t arg2)
     1455{
     1456        bitblock128_t hiAns = simd128<(1)>::min(arg1, arg2);
     1457        bitblock128_t loAns = simd128<(1)>::umin(arg1, arg2);
     1458        bitblock128_t eqMask1 = simd128<2>::srli<(1)>(simd128<(1)>::eq(hiAns, arg1));
     1459        bitblock128_t eqMask2 = simd128<2>::srli<(1)>(simd128<(1)>::eq(hiAns, arg2));
     1460        return simd128<1>::ifh(simd128<2>::himask(), hiAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, loAns, arg1), arg2));
     1461}
     1462
     1463//The total number of operations is 6.0. Uses simd128<4>::lt(arg1, arg2) as the selector of the 1-bit ifh, picking arg1 where it is set and arg2 elsewhere.
     1464template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::min(bitblock128_t arg1, bitblock128_t arg2)
     1465{
     1466        return simd128<1>::ifh(simd128<4>::lt(arg1, arg2), arg1, arg2);
     1467}
     1468
     1469//The total number of operations is 1.0
     1470template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::min(bitblock128_t arg1, bitblock128_t arg2)
     1471{
     1472        return (bitblock128_t)vminq_s8((int8x16_t)(arg1), (int8x16_t)(arg2));
     1473}
     1474
     1475//The total number of operations is 1.0
     1476template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::min(bitblock128_t arg1, bitblock128_t arg2)
     1477{
     1478        return (bitblock128_t)vminq_s16((int16x8_t)(arg1), (int16x8_t)(arg2));
     1479}
     1480
     1481//The total number of operations is 1.0
     1482template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::min(bitblock128_t arg1, bitblock128_t arg2)
     1483{
     1484        return (bitblock128_t)vminq_s32((int32x4_t)(arg1), (int32x4_t)(arg2));
     1485}
     1486
     1487//The total number of operations is 11.5. Uses simd128<64>::gt(arg1, arg2) as the selector of the 1-bit ifh, picking arg2 where it is set and arg1 elsewhere.
     1488template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2)
     1489{
     1490        return simd128<1>::ifh(simd128<64>::gt(arg1, arg2), arg2, arg1);
     1491}
     1492
     1493//The total number of operations is 40.6666666667. Uses simd128<128>::lt(arg1, arg2) as the selector of the 1-bit ifh, picking arg1 where it is set and arg2 elsewhere.
     1494template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2)
     1495{
     1496        return simd128<1>::ifh(simd128<128>::lt(arg1, arg2), arg1, arg2);
     1497}
     1498
     1499//The total number of operations is 1.0
     1500template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1501{
     1502        return simd_and(arg1, arg2);
     1503}
     1504
     1505//The total number of operations is 12.0
     1506template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1507{
     1508        bitblock128_t tmpAns = simd128<(1)>::umin(arg1, arg2);
     1509        bitblock128_t eqMask1 = simd128<2>::srli<(1)>(simd128<(1)>::eq(tmpAns, arg1));
     1510        bitblock128_t eqMask2 = simd128<2>::srli<(1)>(simd128<(1)>::eq(tmpAns, arg2));
     1511        return simd128<1>::ifh(simd128<2>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1512}
     1513
     1514//The total number of operations is 9.0
     1515template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1516{
     1517        bitblock128_t high_bit = simd128<4>::constant<(8)>();
     1518        return simd_xor(simd128<4>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1519}
     1520
     1521//The total number of operations is 4.0
     1522template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1523{
     1524        bitblock128_t high_bit = simd128<8>::constant<(128)>();
     1525        return simd_xor(simd128<8>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1526}
     1527
     1528//The total number of operations is 4.0
     1529template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1530{
     1531        bitblock128_t high_bit = simd128<16>::constant<(32768)>();
     1532        return simd_xor(simd128<16>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1533}
     1534
     1535//The total number of operations is 4.0
     1536template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1537{
     1538        bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
     1539        return simd_xor(simd128<32>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1540}
     1541
     1542//The total number of operations is 11.0
     1543template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1544{
     1545        bitblock128_t tmpAns = simd128<(32)>::umin(arg1, arg2);
     1546        bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
     1547        bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
     1548        return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1549}
     1550
     1551//The total number of operations is 29.3333333333
     1552template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2)
     1553{
     1554        bitblock128_t tmpAns = simd128<(64)>::umin(arg1, arg2);
     1555        bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
     1556        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
     1557        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1558}
     1559
     1560//The total number of operations is 1.0
     1561template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1562{
     1563        return simd_or(arg1, arg2);
     1564}
     1565
     1566//The total number of operations is 12.0
     1567template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1568{
     1569        bitblock128_t tmpAns = simd128<(1)>::umax(arg1, arg2);
     1570        bitblock128_t eqMask1 = simd128<2>::srli<(1)>(simd128<(1)>::eq(tmpAns, arg1));
     1571        bitblock128_t eqMask2 = simd128<2>::srli<(1)>(simd128<(1)>::eq(tmpAns, arg2));
     1572        return simd128<1>::ifh(simd128<2>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1573}
     1574
     1575//The total number of operations is 9.0
     1576template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1577{
     1578        bitblock128_t high_bit = simd128<4>::constant<(8)>();
     1579        return simd_xor(simd128<4>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1580}
     1581
     1582//The total number of operations is 4.0
     1583template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1584{
     1585        bitblock128_t high_bit = simd128<8>::constant<(128)>();
     1586        return simd_xor(simd128<8>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1587}
     1588
     1589//The total number of operations is 4.0
     1590template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1591{
     1592        bitblock128_t high_bit = simd128<16>::constant<(32768)>();
     1593        return simd_xor(simd128<16>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1594}
     1595
     1596//The total number of operations is 4.0
     1597template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1598{
     1599        bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
     1600        return simd_xor(simd128<32>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1601}
     1602
     1603//The total number of operations is 11.0
     1604template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1605{
     1606        bitblock128_t tmpAns = simd128<(32)>::umax(arg1, arg2);
     1607        bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
     1608        bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
     1609        return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1610}
     1611
     1612//The total number of operations is 29.3333333333
     1613template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1614{
     1615        bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
     1616        bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
     1617        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
     1618        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1619}
     1620
    7251621//The total number of operations is 1.0
    7261622template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2)
     
    7821678
    7831679//Operation count: 2.0. Shifts as 32-bit fields, then masks every 2-bit field with 3 >> sh to clear bits that crossed over from the neighbouring field.
    784 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srli(bitblock128_t arg1)
    785 {
    786         return simd_and(simd128<2>::constant<(3 >> sh)>(), simd128<32>::srli<sh>(arg1));
    787 }
    788 
    789 //Operation count: 2.0. Shifts as 32-bit fields, then masks every 4-bit field with 15 >> sh to clear bits that crossed over from the neighbouring field.
    790 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srli(bitblock128_t arg1)
    791 {
    792         return simd_and(simd128<4>::constant<(15 >> sh)>(), simd128<32>::srli<sh>(arg1));
    793 }
    794 
    795 //Operation count: 1.0. Logical right shift of 8-bit lanes via vshrq_n_u8; sh == 0 returns arg1 without calling the intrinsic.
    796 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srli(bitblock128_t arg1)
    797 {
    798         return ((sh != 0) ? ((bitblock128_t)(vshrq_n_u8((uint8x16_t)arg1, (int32_t)sh))) : arg1);
    799 }
    800 
    801 //Operation count: 1.0. Logical right shift of 16-bit lanes via vshrq_n_u16; sh == 0 returns arg1 without calling the intrinsic.
    802 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srli(bitblock128_t arg1)
    803 {
    804         return ((sh != 0) ? ((bitblock128_t)(vshrq_n_u16((uint16x8_t)arg1, (int32_t)sh))) : arg1);
    805 }
    806 
    807 //Operation count: 1.0. Logical right shift of 32-bit lanes via vshrq_n_u32; sh == 0 returns arg1 without calling the intrinsic.
    808 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srli(bitblock128_t arg1)
    809 {
    810         return ((sh != 0) ? ((bitblock128_t)(vshrq_n_u32((uint32x4_t)arg1, (int32_t)sh))) : arg1);
    811 }
    812 
    813 //Operation count: 1.0. Logical right shift of 64-bit lanes via vshrq_n_u64; sh == 0 returns arg1 without calling the intrinsic.
    814 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srli(bitblock128_t arg1)
    815 {
    816         return ((sh != 0) ? ((bitblock128_t)(vshrq_n_u64((uint64x2_t)arg1, (int32_t)sh))) : arg1);
    817 }
    818 
    819 //The total number of operations is 3.16666666667. Logical right shift of the whole 128-bit block. sh == 128 now yields an all-zero block, mirroring the explicit guard in simd128<128>::slli; previously sh & 63 == 0 made this case return arg1 shifted by only 64. sh == 64 moves the upper lane down, sh > 64 shifts that result by a further sh & 63, and smaller amounts OR the carried-over upper bits into the per-lane shift.
    820 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srli(bitblock128_t arg1)
    821 {
    822         return ((sh == 128) ? simd128<32>::constant<0>() : ((sh == 64) ? neon_shift_right_64_bits(arg1) : ((sh > 64) ? simd128<64>::srli<(sh&63)>(neon_shift_right_64_bits(arg1)) : simd_or(neon_shift_right_64_bits(simd128<64>::slli<(64-sh)>(arg1)), simd128<64>::srli<sh>(arg1)))));
    823 }
    824 
    825 //The total number of operations is 1.0. For 1-bit fields the trailing-zero count is the complement of the bit (simd_not).
    826 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ctz(bitblock128_t arg1)
    827 {
    828         return simd_not(arg1);
    829 }
    830 
    831 //Operation count: 9.5. 2-bit trailing-zero count: the high result bit is set when both bits of the field are clear, the low result bit when only the field's high bit is set.
    832 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ctz(bitblock128_t arg1)
    833 {
    834         bitblock128_t inverted = simd_not(arg1);
    835         return simd128<1>::ifh(simd128<2>::himask(), simd_and(simd128<128>::slli<1>(inverted), inverted), simd_and(inverted, simd128<128>::srli<1>(arg1)));
    836 }
    837 
    838 //The total number of operations is 12.0. Per 4-bit field: popcount of simd_andc(arg1 - 1, arg1).
    839 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::ctz(bitblock128_t arg1)
    840 {
    841         return simd128<4>::popcount(simd_andc(simd128<4>::sub(arg1, simd128<4>::constant<1>()), arg1));
    842 }
    843 
    844 //The total number of operations is 3.0. Per 8-bit field: popcount of simd_andc(arg1 - 1, arg1).
    845 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::ctz(bitblock128_t arg1)
    846 {
    847         return simd128<8>::popcount(simd_andc(simd128<8>::sub(arg1, simd128<8>::constant<1>()), arg1));
    848 }
    849 
    850 //The total number of operations is 6.0. Per 16-bit field: popcount of simd_andc(arg1 - 1, arg1).
    851 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::ctz(bitblock128_t arg1)
    852 {
    853         return simd128<16>::popcount(simd_andc(simd128<16>::sub(arg1, simd128<16>::constant<1>()), arg1));
    854 }
    855 
    856 //The total number of operations is 9.0. Per 32-bit field: popcount of simd_andc(arg1 - 1, arg1).
    857 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::ctz(bitblock128_t arg1)
    858 {
    859         return simd128<32>::popcount(simd_andc(simd128<32>::sub(arg1, simd128<32>::constant<1>()), arg1));
    860 }
    861 
    862 //The total number of operations is 12.0. Per 64-bit field: popcount of simd_andc(arg1 - 1, arg1).
    863 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ctz(bitblock128_t arg1)
    864 {
    865         return simd128<64>::popcount(simd_andc(simd128<64>::sub(arg1, simd128<64>::constant<1>()), arg1));
    866 }
    867 
    868 //The total number of operations is 25.5. For the 128-bit field: popcount of simd_andc(arg1 - 1, arg1).
    869 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ctz(bitblock128_t arg1)
    870 {
    871         return simd128<128>::popcount(simd_andc(simd128<128>::sub(arg1, simd128<128>::constant<1>()), arg1));
    872 }
    873 
    874 //The total number of operations is 1.0. 1-bit unsigned greater-than is computed as simd_andc(arg1, arg2).
    875 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    876 {
    877         return simd_andc(arg1, arg2);
    878 }
    879 
    880 //Operation count: 12.5. 2-bit unsigned greater-than: the verdict is formed in each field's high bit (high bit wins, or low bit wins while the high bits do not contradict) and then copied into the low bit.
    881 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    882 {
    883         bitblock128_t notArg2 = simd_not(arg2);
    884         bitblock128_t verdict = simd_or(simd_and(simd_or(arg1, notArg2), simd128<128>::slli<1>(simd_and(arg1, notArg2))), simd_and(arg1, notArg2));
    885         return simd128<1>::ifh(simd128<2>::himask(), verdict, simd128<128>::srli<1>(verdict));
    886 }
    887 
    888 //Operation count: 7.0. Unsigned 4-bit greater-than: XORs both operands with the field's top bit (8) and compares them with the signed 4-bit gt.
    889 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    890 {
    891         bitblock128_t signFlip = simd128<4>::constant<8>();
    892         return simd128<4>::gt(simd_xor(signFlip, arg1), simd_xor(signFlip, arg2));
    893 }
    894 
    895 //Operation count: 1.0. Unsigned greater-than on 8-bit lanes taken directly with vcgtq_u8 (all-ones lane where arg1 > arg2, zero otherwise), replacing the top-bit flip around the signed compare.
    896 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    897 {
    898         uint8x16_t const isGreater = vcgtq_u8((uint8x16_t)(arg1), (uint8x16_t)(arg2));
    899         return (bitblock128_t)(isGreater);
    900 }
    901 
    902 //Operation count: 1.0. Unsigned greater-than on 16-bit lanes taken directly with vcgtq_u16 (all-ones lane where arg1 > arg2, zero otherwise), replacing the top-bit flip around the signed compare.
    903 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    904 {
    905         uint16x8_t const isGreater = vcgtq_u16((uint16x8_t)(arg1), (uint16x8_t)(arg2));
    906         return (bitblock128_t)(isGreater);
    907 }
    908 
    909 //Operation count: 1.0. Unsigned greater-than on 32-bit lanes taken directly with vcgtq_u32 (all-ones lane where arg1 > arg2, zero otherwise), replacing the top-bit flip around the signed compare.
    910 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    911 {
    912         uint32x4_t const isGreater = vcgtq_u32((uint32x4_t)(arg1), (uint32x4_t)(arg2));
    913         return (bitblock128_t)(isGreater);
    914 }
    915 
    916 //Operation count: 9.5. 64-bit unsigned greater-than from the 32-bit ugt: the upper-half verdict is spread over the field with srai, and OR-ed with the lower-half verdict wherever the upper halves are equal.
    917 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    918 {
    919         bitblock128_t halfGt = simd128<32>::ugt(arg1, arg2);
    920         bitblock128_t lowDecides = simd_and(simd128<64>::srli<32>(simd128<32>::eq(arg1, arg2)), halfGt);
    921         lowDecides = simd_or(simd128<64>::slli<32>(lowDecides), lowDecides);
    922         return simd_or(lowDecides, simd128<64>::srai<32>(halfGt));
    923 }
    924 
    925 //Operation count: 29.1666666667. 128-bit unsigned greater-than from the 64-bit ugt: the upper-half verdict is spread over the block with srai, and OR-ed with the lower-half verdict when the upper halves are equal.
    926 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ugt(bitblock128_t arg1, bitblock128_t arg2)
    927 {
    928         bitblock128_t halfGt = simd128<64>::ugt(arg1, arg2);
    929         bitblock128_t lowDecides = simd_and(simd128<128>::srli<64>(simd128<64>::eq(arg1, arg2)), halfGt);
    930         lowDecides = simd_or(simd128<128>::slli<64>(lowDecides), lowDecides);
    931         return simd_or(lowDecides, simd128<128>::srai<64>(halfGt));
    932 }
    933 
    934 //Operation count: 4.0. XORs the lomask-ed low bit of every 2-bit field with its high bit (srli by 1).
    935 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::xor_hl(bitblock128_t arg1)
    936 {
    937         return simd_xor(simd_and(simd128<2>::lomask(), arg1), simd128<2>::srli<1>(arg1));
    938 }
    939 
    940 //Operation count: 4.0. XORs the lomask-ed low half of every 4-bit field with its upper half (srli by 2).
    941 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::xor_hl(bitblock128_t arg1)
    942 {
    943         return simd_xor(simd_and(simd128<4>::lomask(), arg1), simd128<4>::srli<2>(arg1));
    944 }
    945 
    946 //Operation count: 3.0. XORs the lomask-ed low nibble of every 8-bit field with its upper nibble (srli by 4).
    947 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::xor_hl(bitblock128_t arg1)
    948 {
    949         return simd_xor(simd_and(simd128<8>::lomask(), arg1), simd128<8>::srli<4>(arg1));
    950 }
    951 
    952 //Operation count: 3.0. XORs the lomask-ed low byte of every 16-bit field with its upper byte (srli by 8).
    953 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::xor_hl(bitblock128_t arg1)
    954 {
    955         return simd_xor(simd_and(simd128<16>::lomask(), arg1), simd128<16>::srli<8>(arg1));
    956 }
    957 
    958 //Operation count: 3.0. XORs the lomask-ed low 16 bits of every 32-bit field with its upper 16 bits (srli by 16).
    959 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::xor_hl(bitblock128_t arg1)
    960 {
    961         return simd_xor(simd_and(simd128<32>::lomask(), arg1), simd128<32>::srli<16>(arg1));
    962 }
    963 
    964 //Operation count: 3.0. XORs the lomask-ed low 32 bits of every 64-bit field with its upper 32 bits (srli by 32).
    965 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::xor_hl(bitblock128_t arg1)
    966 {
    967         return simd_xor(simd_and(simd128<64>::lomask(), arg1), simd128<64>::srli<32>(arg1));
    968 }
    969 
    970 //Operation count: 5.16666666667. XORs the lomask-ed low 64 bits of the block with its upper 64 bits (srli by 64).
    971 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::xor_hl(bitblock128_t arg1)
    972 {
    973         return simd_xor(simd_and(simd128<128>::lomask(), arg1), simd128<128>::srli<64>(arg1));
    974 }
    975 
    976 //The total number of operations is 0. A 1-bit field is its own population count, so arg1 is returned unchanged.
    977 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::popcount(bitblock128_t arg1)
    978 {
    979         return arg1;
    980 }
    981 
    982 //Operation count: 3.0. 2-bit population count: add_hl over the 1-bit counts.
    983 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::popcount(bitblock128_t arg1)
    984 {
    985         return simd128<2>::add_hl(simd128<1>::popcount(arg1));
    986 }
    987 
    988 //Operation count: 7.0. 4-bit population count: add_hl over the 2-bit counts.
    989 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::popcount(bitblock128_t arg1)
    990 {
    991         return simd128<4>::add_hl(simd128<2>::popcount(arg1));
    992 }
    993 
    994 //Operation count: 1.0. Population count of 8-bit lanes via vcntq_u8.
    995 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::popcount(bitblock128_t arg1)
    996 {
    997         return (bitblock128_t)(vcntq_u8((uint8x16_t)arg1));
    998 }
    999 
    1000 //Operation count: 4.0. 16-bit population count: add_hl over the 8-bit counts.
    1001 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::popcount(bitblock128_t arg1)
    1002 {
    1003         return simd128<16>::add_hl(simd128<8>::popcount(arg1));
    1004 }
    1005 
    1006 //Operation count: 7.0. 32-bit population count: add_hl over the 16-bit counts.
    1007 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::popcount(bitblock128_t arg1)
    1008 {
    1009         return simd128<32>::add_hl(simd128<16>::popcount(arg1));
    1010 }
    1011 
    1012 //Operation count: 10.0. 64-bit population count: add_hl over the 32-bit counts.
    1013 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::popcount(bitblock128_t arg1)
    1014 {
    1015         return simd128<64>::add_hl(simd128<32>::popcount(arg1));
    1016 }
    1017 
    1018 //Operation count: 15.1666666667. 128-bit population count: adds the upper 64-bit count (brought down with srli by 64) to the lomask-ed lower 64-bit count, using a 64-bit add.
    1019 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::popcount(bitblock128_t arg1)
    1020 {
    1021         bitblock128_t halfCounts = simd128<64>::popcount(arg1);
    1022         return simd128<64>::add(simd128<128>::srli<64>(halfCounts), simd_and(simd128<128>::lomask(), halfCounts));
    1023 }
    1024 
    1025 //Operation count: 4.33333333333. 2-bit negation: the low bit is kept, and the high bit becomes high XOR low (the low bit is brought up with a 1-bit left shift).
    1026 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::neg(bitblock128_t arg1)
    1027 {
    1028         return simd128<1>::ifh(simd128<2>::himask(), simd_xor(simd128<128>::slli<1>(arg1), arg1), arg1);
    1029 }
    1030 
    1031 //The total number of operations is 4.0. Negates each 4-bit field by subtracting arg1 from a zero constant.
    1032 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::neg(bitblock128_t arg1)
    1033 {
    1034         return simd128<4>::sub(simd128<4>::constant<0>(), arg1);
    1035 }
    1036 
    1037 //Operation count: 1.0. Negates 8-bit lanes via vnegq_s8.
    1038 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::neg(bitblock128_t arg1)
    1039 {
    1040         return (bitblock128_t)(vnegq_s8((int8x16_t)arg1));
    1041 }
    1042 
    1043 //Operation count: 1.0. Negates 16-bit lanes via vnegq_s16.
    1044 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::neg(bitblock128_t arg1)
    1045 {
    1046         return (bitblock128_t)(vnegq_s16((int16x8_t)arg1));
    1047 }
    1048 
    1049 //Operation count: 1.0. Negates 32-bit lanes via vnegq_s32.
    1050 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::neg(bitblock128_t arg1)
    1051 {
    1052         return (bitblock128_t)(vnegq_s32((int32x4_t)arg1));
    1053 }
    1054 
    1055 //The total number of operations is 1.0. Negates each 64-bit field by subtracting arg1 from a zero constant.
    1056 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::neg(bitblock128_t arg1)
    1057 {
    1058         return simd128<64>::sub(simd128<64>::constant<0>(), arg1);
    1059 }
    1060 
    1061 //The total number of operations is 9.33333333333. Negates the 128-bit field by subtracting arg1 from a zero constant.
    1062 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::neg(bitblock128_t arg1)
    1063 {
    1064         return simd128<128>::sub(simd128<128>::constant<0>(), arg1);
    1065 }
    1066 
    1067 //Operation count: 1.5. Shifts as 32-bit fields, then masks every 2-bit field with (3 << sh) & 3 to clear bits that crossed over from the neighbouring field.
    1068 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::slli(bitblock128_t arg1)
    1069 {
    1070         return simd_and(simd128<2>::constant<((3 << sh) & 3)>(), simd128<32>::slli<sh>(arg1));
    1071 }
    1072 
    1073 //Operation count: 1.5. Shifts as 32-bit fields, then masks every 4-bit field with (15 << sh) & 15 to clear bits that crossed over from the neighbouring field.
    1074 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::slli(bitblock128_t arg1)
    1075 {
    1076         return simd_and(simd128<4>::constant<((15 << sh) & 15)>(), simd128<32>::slli<sh>(arg1));
    1077 }
    1078 
    1079 //Operation count: 0.5. Left shift of 8-bit lanes via vshlq_n_u8; a full-width shift (sh == 8) returns the zero constant instead.
    1080 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::slli(bitblock128_t arg1)
    1081 {
    1082         return ((sh != 8) ? ((bitblock128_t)(vshlq_n_u8((uint8x16_t)arg1, (int32_t)sh))) : simd128<32>::constant<0>());
    1083 }
    1084 
    1085 //Operation count: 0.5. Left shift of 16-bit lanes via vshlq_n_u16; a full-width shift (sh == 16) returns the zero constant instead.
    1086 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::slli(bitblock128_t arg1)
    1087 {
    1088         return ((sh != 16) ? ((bitblock128_t)(vshlq_n_u16((uint16x8_t)arg1, (int32_t)sh))) : simd128<32>::constant<0>());
    1089 }
    1090 
    1091 //Operation count: 0.5. Left shift of 32-bit lanes via vshlq_n_u32; a full-width shift (sh == 32) returns the zero constant instead.
    1092 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::slli(bitblock128_t arg1)
    1093 {
    1094         return ((sh != 32) ? ((bitblock128_t)(vshlq_n_u32((uint32x4_t)arg1, (int32_t)sh))) : simd128<32>::constant<0>());
    1095 }
    1096 
    1097 //Operation count: 0.5. Left shift of 64-bit lanes via vshlq_n_u64; a full-width shift (sh == 64) returns the zero constant instead.
    1098 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::slli(bitblock128_t arg1)
    1099 {
    1100         return ((sh != 64) ? ((bitblock128_t)(vshlq_n_u64((uint64x2_t)arg1, (int32_t)sh))) : simd128<32>::constant<0>());
    1101 }
    1102 
    1103 //Operation count: 2.33333333333. Left shift of the whole 128-bit block: sh == 128 gives zero, sh >= 64 shifts the block moved up by 64 bits a further sh & 63, and smaller amounts OR the per-lane shift with the bits carried up from the lower lane.
    1104 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::slli(bitblock128_t arg1)
    1105 {
    1106         return ((sh == 128) ? simd128<32>::constant<0>() : ((sh >= 64) ? simd128<64>::slli<(sh&63)>(neon_shift_left_64_bits(arg1)) : simd_or(simd128<64>::slli<sh>(arg1), neon_shift_left_64_bits(simd128<64>::srli<(64-sh)>(arg1)))));
    1107 }
    1108 
    1109 //The total number of operations is 1.0. Bitwise select through vbslq_u64 with arg1 as the selector and arg2/arg3 as the two sources.
    1110 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1111 {
    1112         return vbslq_u64(arg1, arg2, arg3);
    1113 }
    1114 
    1115 //Operation count: 4.0. Copies the high bit of each 2-bit selector field into its low bit, then defers to the 1-bit ifh.
    1116 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1117 {
    1118         return simd128<1>::ifh(simd128<1>::ifh(simd128<2>::himask(), arg1, simd128<2>::srli<1>(arg1)), arg2, arg3);
    1119 }
    1120 
    1121 //The total number of operations is 6.0. The selector is simd128<4>::gt(0, arg1), i.e. the fields of arg1 that the 4-bit gt ranks below zero; it is handed to the 1-bit ifh with arg2 and arg3.
    1122 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1123 {
    1124         return simd128<1>::ifh(simd128<4>::gt(simd128<4>::constant<0>(), arg1), arg2, arg3);
    1125 }
    1126 
    1127 //The total number of operations is 2.0. The selector is simd128<8>::gt(0, arg1), i.e. the fields of arg1 that the 8-bit gt ranks below zero; it is handed to the 1-bit ifh with arg2 and arg3.
    1128 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1129 {
    1130         return simd128<1>::ifh(simd128<8>::gt(simd128<8>::constant<0>(), arg1), arg2, arg3);
    1131 }
    1132 
    1133 //The total number of operations is 2.0. The selector is simd128<16>::gt(0, arg1), i.e. the fields of arg1 that the 16-bit gt ranks below zero; it is handed to the 1-bit ifh with arg2 and arg3.
    1134 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1135 {
    1136         return simd128<1>::ifh(simd128<16>::gt(simd128<16>::constant<0>(), arg1), arg2, arg3);
    1137 }
    1138 
    1139 //The total number of operations is 2.0. The selector is simd128<32>::gt(0, arg1), i.e. the fields of arg1 that the 32-bit gt ranks below zero; it is handed to the 1-bit ifh with arg2 and arg3.
    1140 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1141 {
    1142         return simd128<1>::ifh(simd128<32>::gt(simd128<32>::constant<0>(), arg1), arg2, arg3);
    1143 }
    1144 
    1145 //Operation count: 4.0. Copies the upper 32 bits of each 64-bit selector field into its lower 32 bits, then defers to the 32-bit ifh.
    1146 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1147 {
    1148         return simd128<32>::ifh(simd128<1>::ifh(simd128<64>::himask(), arg1, simd128<64>::srli<32>(arg1)), arg2, arg3);
    1149 }
    1150 
    1151 //The total number of operations is 8.16666666667
    1152 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
    1153 {
    1154         return simd128<(64)>::ifh(simd128<1>::ifh(simd128<128>::himask(), arg1, simd128<128>::srli<(64)>(arg1)), arg2, arg3);
    1155 }
    1156 
    1157 //The total number of operations is 1.0
    1158 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1159 {
    1160         return simd_xor(arg1, arg2);
    1161 }
    1162 
    1163 //The total number of operations is 7.33333333333
    1164 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1165 {
    1166         bitblock128_t tmp = simd_xor(arg1, arg2);
    1167         return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(simd_not(arg1), arg2))), tmp);
    1168 }
    1169 
    1170 //The total number of operations is 4.0
    1171 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1172 {
    1173         return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::sub(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::sub(arg1, arg2));
    1174 }
    1175 
    1176 //The total number of operations is 1.0
    1177 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1178 {
    1179         return (bitblock128_t)vsubq_u8((uint8x16_t)(arg1), (uint8x16_t)(arg2));
    1180 }
    1181 
    1182 //The total number of operations is 1.0
    1183 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1184 {
    1185         return (bitblock128_t)vsubq_u16((uint16x8_t)(arg1), (uint16x8_t)(arg2));
    1186 }
    1187 
    1188 //The total number of operations is 1.0
    1189 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1190 {
    1191         return (bitblock128_t)vsubq_u32((uint32x4_t)(arg1), (uint32x4_t)(arg2));
    1192 }
    1193 
    1194 //The total number of operations is 1.0
    1195 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1196 {
    1197         return (bitblock128_t)vsubq_u64((uint64x2_t)(arg1), (uint64x2_t)(arg2));
    1198 }
    1199 
    1200 //The total number of operations is 9.33333333333
    1201 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sub(bitblock128_t arg1, bitblock128_t arg2)
    1202 {
    1203         bitblock128_t partial = simd128<(64)>::sub(arg1, arg2);
    1204         bitblock128_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_andc(partial, simd_xor(arg1, arg2)));
    1205         bitblock128_t borrow = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(borrowMask));
    1206         return simd128<(64)>::sub(partial, borrow);
    1207 }
    1208 
    1209 //The total number of operations is 3.0
    1210 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add_hl(bitblock128_t arg1)
    1211 {
    1212         return simd128<16>::sub(arg1, simd_and(simd128<2>::lomask(), simd128<16>::srli<1>(arg1)));
    1213 }
    1214 
    1215 //The total number of operations is 4.0
    1216 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add_hl(bitblock128_t arg1)
    1217 {
    1218         return simd128<(8)>::add(simd128<4>::srli<(2)>(arg1), simd_and(arg1, simd128<4>::lomask()));
    1219 }
    1220 
    1221 //The total number of operations is 3.0
    1222 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add_hl(bitblock128_t arg1)
    1223 {
    1224         return simd128<(16)>::add(simd128<8>::srli<(4)>(arg1), simd_and(arg1, simd128<8>::lomask()));
    1225 }
    1226 
    1227 //The total number of operations is 3.0
    1228 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add_hl(bitblock128_t arg1)
    1229 {
    1230         return simd128<(32)>::add(simd128<16>::srli<(8)>(arg1), simd_and(arg1, simd128<16>::lomask()));
    1231 }
    1232 
    1233 //The total number of operations is 3.0
    1234 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add_hl(bitblock128_t arg1)
    1235 {
    1236         return simd128<(64)>::add(simd128<32>::srli<(16)>(arg1), simd_and(arg1, simd128<32>::lomask()));
    1237 }
    1238 
    1239 //The total number of operations is 3.0
    1240 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add_hl(bitblock128_t arg1)
    1241 {
    1242         return simd128<64>::add(simd128<64>::srli<(32)>(arg1), simd_and(arg1, simd128<64>::lomask()));
    1243 }
    1244 
    1245 //The total number of operations is 13.5
    1246 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1)
    1247 {
    1248         return simd128<128>::add(simd128<128>::srli<(64)>(arg1), simd_and(arg1, simd128<128>::lomask()));
    1249 }
    1250 
    1251 //The total number of operations is 0
    1252 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
    1253 {
    1254         return simd128<32>::constant<(-1*val)>();
    1255 }
    1256 
    1257 //The total number of operations is 0
    1258 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant()
    1259 {
    1260         return simd128<(4)>::constant<((val<<2)|(val&(3)))>();
    1261 }
    1262 
    1263 //The total number of operations is 0
    1264 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::constant()
    1265 {
    1266         return simd128<(8)>::constant<((val<<4)|(val&(15)))>();
    1267 }
    1268 
    1269 //The total number of operations is 0
    1270 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::constant()
    1271 {
    1272         return (bitblock128_t)vdupq_n_u8((uint8_t)(val));
    1273 }
    1274 
    1275 //The total number of operations is 0
    1276 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::constant()
    1277 {
    1278         return (bitblock128_t)vdupq_n_u16((uint16_t)(val));
    1279 }
    1280 
    1281 //The total number of operations is 0
    1282 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::constant()
    1283 {
    1284         return (bitblock128_t)vdupq_n_u32((uint32_t)(val));
    1285 }
    1286 
    1287 //The total number of operations is 0
    1288 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::constant()
    1289 {
    1290         return (bitblock128_t)vdupq_n_u64((uint64_t)(val));
    1291 }
    1292 
    1293 //The total number of operations is 0
    1294 template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::constant()
    1295 {
    1296         return vsetq_lane_u64((uint64_t)(0), simd128<64>::constant<val>(), (int32_t)(1));
    1297 }
    1298 
    1299 //The total number of operations is 1.0
    1300 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::min(bitblock128_t arg1, bitblock128_t arg2)
    1301 {
    1302         return simd_or(arg1, arg2);
    1303 }
    1304 
    1305 //The total number of operations is 13.0
    1306 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::min(bitblock128_t arg1, bitblock128_t arg2)
    1307 {
    1308         bitblock128_t hiAns = simd128<(1)>::min(arg1, arg2);
    1309         bitblock128_t loAns = simd128<(1)>::umin(arg1, arg2);
    1310         bitblock128_t eqMask1 = simd128<2>::srli<(1)>(simd128<(1)>::eq(hiAns, arg1));
    1311         bitblock128_t eqMask2 = simd128<2>::srli<(1)>(simd128<(1)>::eq(hiAns, arg2));
    1312         return simd128<1>::ifh(simd128<2>::himask(), hiAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, loAns, arg1), arg2));
    1313 }
    1314 
    1315 //The total number of operations is 6.0
    1316 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::min(bitblock128_t arg1, bitblock128_t arg2)
    1317 {
    1318         return simd128<1>::ifh(simd128<4>::gt(arg1, arg2), arg2, arg1);
    1319 }
    1320 
    1321 //The total number of operations is 1.0
    1322 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::min(bitblock128_t arg1, bitblock128_t arg2)
    1323 {
    1324         return (bitblock128_t)vminq_s8((int8x16_t)(arg1), (int8x16_t)(arg2));
    1325 }
    1326 
    1327 //The total number of operations is 1.0
    1328 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::min(bitblock128_t arg1, bitblock128_t arg2)
    1329 {
    1330         return (bitblock128_t)vminq_s16((int16x8_t)(arg1), (int16x8_t)(arg2));
    1331 }
    1332 
    1333 //The total number of operations is 1.0
    1334 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::min(bitblock128_t arg1, bitblock128_t arg2)
    1335 {
    1336         return (bitblock128_t)vminq_s32((int32x4_t)(arg1), (int32x4_t)(arg2));
    1337 }
    1338 
    1339 //The total number of operations is 11.5
    1340 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2)
    1341 {
    1342         return simd128<1>::ifh(simd128<64>::gt(arg1, arg2), arg2, arg1);
    1343 }
    1344 
    1345 //The total number of operations is 40.6666666667
    1346 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2)
    1347 {
    1348         return simd128<1>::ifh(simd128<128>::gt(arg1, arg2), arg2, arg1);
    1349 }
    1350 
    1351 //The total number of operations is 0
    1352 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
    1353 {
    1354         return simd128<2>::constant<(1)>();
    1355 }
    1356 
    1357 //The total number of operations is 0
    1358 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
    1359 {
    1360         return simd128<4>::constant<(3)>();
    1361 }
    1362 
    1363 //The total number of operations is 0
    1364 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
    1365 {
    1366         return simd128<8>::constant<(15)>();
    1367 }
    1368 
    1369 //The total number of operations is 0
    1370 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
    1371 {
    1372         return simd128<16>::constant<(255)>();
    1373 }
    1374 
    1375 //The total number of operations is 0
    1376 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
    1377 {
    1378         return simd128<32>::constant<(65535)>();
    1379 }
    1380 
    1381 //The total number of operations is 0
    1382 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
    1383 {
    1384         return simd128<64>::constant<4294967295ULL>();
    1385 }
    1386 
    1387 //The total number of operations is 0
    1388 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
    1389 {
    1390         return vsetq_lane_u64((uint64_t)(-1), simd128<64>::constant<0>(), (int32_t)(0));
    1391 }
    1392 
    1393 //The total number of operations is 1.0
    1394 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1395 {
    1396         return simd_and(arg1, arg2);
    1397 }
    1398 
    1399 //The total number of operations is 12.0
    1400 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1401 {
    1402         bitblock128_t tmpAns = simd128<(1)>::umin(arg1, arg2);
    1403         bitblock128_t eqMask1 = simd128<2>::srli<(1)>(simd128<(1)>::eq(tmpAns, arg1));
    1404         bitblock128_t eqMask2 = simd128<2>::srli<(1)>(simd128<(1)>::eq(tmpAns, arg2));
    1405         return simd128<1>::ifh(simd128<2>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1406 }
    1407 
    1408 //The total number of operations is 9.0
    1409 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1410 {
    1411         bitblock128_t high_bit = simd128<4>::constant<(8)>();
    1412         return simd_xor(simd128<4>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1413 }
    1414 
    1415 //The total number of operations is 4.0
    1416 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1417 {
    1418         bitblock128_t high_bit = simd128<8>::constant<(128)>();
    1419         return simd_xor(simd128<8>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1420 }
    1421 
    1422 //The total number of operations is 4.0
    1423 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1424 {
    1425         bitblock128_t high_bit = simd128<16>::constant<(32768)>();
    1426         return simd_xor(simd128<16>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1427 }
    1428 
    1429 //The total number of operations is 4.0
    1430 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1431 {
    1432         bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
    1433         return simd_xor(simd128<32>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1434 }
    1435 
    1436 //The total number of operations is 11.0
    1437 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1438 {
    1439         bitblock128_t tmpAns = simd128<(32)>::umin(arg1, arg2);
    1440         bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
    1441         bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
    1442         return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1443 }
    1444 
    1445 //The total number of operations is 29.3333333333
    1446 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2)
    1447 {
    1448         bitblock128_t tmpAns = simd128<(64)>::umin(arg1, arg2);
    1449         bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
    1450         bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    1451         return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1452 }
    1453 
    1454 //The total number of operations is 5.33333333333
    1455 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
    1456 {
    1457         return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
    1458 }
    1459 
    1460 //The total number of operations is 10.0
    1461 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
    1462 {
    1463         bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
    1464         return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
    1465 }
    1466 
    1467 //The total number of operations is 1.0
    1468 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
    1469 {
    1470         return (bitblock128_t)vabsq_s8((int8x16_t)(arg1));
    1471 }
    1472 
    1473 //The total number of operations is 1.0
    1474 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
    1475 {
    1476         return (bitblock128_t)vabsq_s16((int16x8_t)(arg1));
    1477 }
    1478 
    1479 //The total number of operations is 1.0
    1480 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
    1481 {
    1482         return (bitblock128_t)vabsq_s32((int32x4_t)(arg1));
    1483 }
    1484 
    1485 //The total number of operations is 8.5
    1486 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
    1487 {
    1488         bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
    1489         return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
    1490 }
    1491 
    1492 //The total number of operations is 31.8333333333
    1493 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
    1494 {
    1495         bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
    1496         return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
    1497 }
    1498 
    1499 //The total number of operations is 2.0
    15001680template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
    15011681{
     
    15541734}
    15551735
    1556 //The total number of operations is 4.0
    1557 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1558 {
    1559         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1560 }
    1561 
    1562 //The total number of operations is 8.0
    1563 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1564 {
    1565         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1566         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1567 }
    1568 
    1569 //The total number of operations is 1.0
    1570 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1571 {
    1572         return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s8((int8x16_t)(arg1), (int32_t)(sh)))));
    1573 }
    1574 
    1575 //The total number of operations is 1.0
    1576 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1577 {
    1578         return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s16((int16x8_t)(arg1), (int32_t)(sh)))));
    1579 }
    1580 
    1581 //The total number of operations is 1.0
    1582 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1583 {
    1584         return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s32((int32x4_t)(arg1), (int32_t)(sh)))));
    1585 }
    1586 
    1587 //The total number of operations is 1.0
    1588 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1589 {
    1590         return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s64((int64x2_t)(arg1), (int32_t)(sh)))));
    1591 }
    1592 
    1593 //The total number of operations is 6.66666666667
    1594 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1595 {
    1596         return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    1597 }
    1598 
    15991736//The total number of operations is 0
    16001737template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
     
    16911828}
    16921829
    1693 //The total number of operations is 1.0
    1694 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1695 {
    1696         return simd_or(arg1, arg2);
    1697 }
    1698 
    1699 //The total number of operations is 12.0
    1700 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1701 {
    1702         bitblock128_t tmpAns = simd128<(1)>::umax(arg1, arg2);
    1703         bitblock128_t eqMask1 = simd128<2>::srli<(1)>(simd128<(1)>::eq(tmpAns, arg1));
    1704         bitblock128_t eqMask2 = simd128<2>::srli<(1)>(simd128<(1)>::eq(tmpAns, arg2));
    1705         return simd128<1>::ifh(simd128<2>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1706 }
    1707 
    1708 //The total number of operations is 9.0
    1709 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1710 {
    1711         bitblock128_t high_bit = simd128<4>::constant<(8)>();
    1712         return simd_xor(simd128<4>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1713 }
    1714 
    1715 //The total number of operations is 4.0
    1716 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1717 {
    1718         bitblock128_t high_bit = simd128<8>::constant<(128)>();
    1719         return simd_xor(simd128<8>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1720 }
    1721 
    1722 //The total number of operations is 4.0
    1723 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1724 {
    1725         bitblock128_t high_bit = simd128<16>::constant<(32768)>();
    1726         return simd_xor(simd128<16>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1727 }
    1728 
    1729 //The total number of operations is 4.0
    1730 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1731 {
    1732         bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
    1733         return simd_xor(simd128<32>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1734 }
    1735 
    1736 //The total number of operations is 11.0
    1737 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1738 {
    1739         bitblock128_t tmpAns = simd128<(32)>::umax(arg1, arg2);
    1740         bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
    1741         bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
    1742         return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1743 }
    1744 
    1745 //The total number of operations is 29.3333333333
    1746 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1747 {
    1748         bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
    1749         bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
    1750         bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    1751         return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1830//The total number of operations is 5.33333333333
     1831template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
     1832{
     1833        return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
     1834}
     1835
     1836//The total number of operations is 10.0
     1837template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
     1838{
     1839        bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
     1840        return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
     1841}
     1842
     1843//The total number of operations is 1.0
     1844template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
     1845{
     1846        return (bitblock128_t)vabsq_s8((int8x16_t)(arg1));
     1847}
     1848
     1849//The total number of operations is 1.0
     1850template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
     1851{
     1852        return (bitblock128_t)vabsq_s16((int16x8_t)(arg1));
     1853}
     1854
     1855//The total number of operations is 1.0
     1856template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
     1857{
     1858        return (bitblock128_t)vabsq_s32((int32x4_t)(arg1));
     1859}
     1860
     1861//The total number of operations is 8.5
     1862template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
     1863{
     1864        bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
     1865        return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
     1866}
     1867
     1868//The total number of operations is 31.8333333333
     1869template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
     1870{
     1871        bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
     1872        return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
    17521873}
    17531874
     
    18932014
    18942015//The total number of operations is 45.0
    1895 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<4>::signmask(bitblock128_t arg1)
     2016template <> IDISA_ALWAYS_INLINE FieldType<128/4>::T hsimd128<4>::signmask(bitblock128_t arg1)
    18962017{
    18972018        uint64_t tmpAns1 = hsimd128<(8)>::signmask(esimd128<4>::mergeh(arg1, simd128<4>::constant<0>()));
     
    19012022
    19022023//The total number of operations is 18.0
    1903 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<8>::signmask(bitblock128_t arg1)
     2024template <> IDISA_ALWAYS_INLINE FieldType<128/8>::T hsimd128<8>::signmask(bitblock128_t arg1)
    19042025{
    19052026        uint64_t tmpAns1 = hsimd128<(16)>::signmask(esimd128<8>::mergeh(arg1, simd128<8>::constant<0>()));
     
    19092030
    19102031//The total number of operations is 8.0
    1911 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<16>::signmask(bitblock128_t arg1)
     2032template <> IDISA_ALWAYS_INLINE FieldType<128/16>::T hsimd128<16>::signmask(bitblock128_t arg1)
    19122033{
    19132034        return (((((((((mvmd128<16>::extract<7>(arg1)>>8)&128)|((mvmd128<16>::extract<6>(arg1)>>9)&64))|((mvmd128<16>::extract<5>(arg1)>>10)&32))|((mvmd128<16>::extract<4>(arg1)>>11)&16))|((mvmd128<16>::extract<3>(arg1)>>12)&8))|((mvmd128<16>::extract<2>(arg1)>>13)&4))|((mvmd128<16>::extract<1>(arg1)>>14)&2))|(mvmd128<16>::extract<0>(arg1)>>15));
     
    19152036
    19162037//The total number of operations is 4.0
    1917 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<32>::signmask(bitblock128_t arg1)
     2038template <> IDISA_ALWAYS_INLINE FieldType<128/32>::T hsimd128<32>::signmask(bitblock128_t arg1)
    19182039{
    19192040        return (((((mvmd128<32>::extract<3>(arg1)>>28)&8)|((mvmd128<32>::extract<2>(arg1)>>29)&4))|((mvmd128<32>::extract<1>(arg1)>>30)&2))|(mvmd128<32>::extract<0>(arg1)>>31));
     
    19212042
    19222043//The total number of operations is 2.0
    1923 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<64>::signmask(bitblock128_t arg1)
     2044template <> IDISA_ALWAYS_INLINE FieldType<128/64>::T hsimd128<64>::signmask(bitblock128_t arg1)
    19242045{
    19252046        return (((mvmd128<64>::extract<1>(arg1)>>62)&2)|(mvmd128<64>::extract<0>(arg1)>>63));
     
    19272048
    19282049//The total number of operations is 6.16666666667
    1929 template <> IDISA_ALWAYS_INLINE uint64_t hsimd128<128>::signmask(bitblock128_t arg1)
     2050template <> IDISA_ALWAYS_INLINE FieldType<128/128>::T hsimd128<128>::signmask(bitblock128_t arg1)
    19302051{
    19312052        return hsimd128<(64)>::signmask(hsimd128<128>::packh(simd128<128>::constant<0>(), arg1));
     
    23692490
    23702491//The total number of operations is 6.5
    2371 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2492template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    23722493{
    23732494        return simd_or(mvmd128<2>::srli<sh>(arg1), mvmd128<2>::slli<((64)-sh)>(arg2));
     
    23752496
    23762497//The total number of operations is 6.5
    2377 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2498template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    23782499{
    23792500        return simd_or(mvmd128<4>::srli<sh>(arg1), mvmd128<4>::slli<((32)-sh)>(arg2));
     
    23812502
    23822503//The total number of operations is 6.5
    2383 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2504template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    23842505{
    23852506        return simd_or(mvmd128<8>::srli<sh>(arg1), mvmd128<8>::slli<((16)-sh)>(arg2));
     
    23872508
    23882509//The total number of operations is 6.5
    2389 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2510template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    23902511{
    23912512        return simd_or(mvmd128<16>::srli<sh>(arg1), mvmd128<16>::slli<((8)-sh)>(arg2));
     
    23932514
    23942515//The total number of operations is 6.5
    2395 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2516template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    23962517{
    23972518        return simd_or(mvmd128<32>::srli<sh>(arg1), mvmd128<32>::slli<((4)-sh)>(arg2));
     
    23992520
    24002521//The total number of operations is 6.5
    2401 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2522template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    24022523{
    24032524        return simd_or(mvmd128<64>::srli<sh>(arg1), mvmd128<64>::slli<((2)-sh)>(arg2));
     
    24052526
    24062527//The total number of operations is 6.5
    2407 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
     2528template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dsrli(bitblock128_t arg1, bitblock128_t arg2)
    24082529{
    24092530        return simd_or(mvmd128<128>::srli<sh>(arg1), mvmd128<128>::slli<((1)-sh)>(arg2));
    24102531}
    24112532
    2412 //The total number of operations is 1.0
    2413 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill(uint64_t val1)
     2533//The total number of operations is 15.0
     2534template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill16(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8, FieldType<1>::T val9, FieldType<1>::T val10, FieldType<1>::T val11, FieldType<1>::T val12, FieldType<1>::T val13, FieldType<1>::T val14, FieldType<1>::T val15, FieldType<1>::T val16)
     2535{
     2536        return simd128<1>::ifh(simd128<(16)>::himask(), mvmd128<1>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd128<1>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
     2537}
     2538
     2539//The total number of operations is 15.0
     2540template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill16(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8, FieldType<2>::T val9, FieldType<2>::T val10, FieldType<2>::T val11, FieldType<2>::T val12, FieldType<2>::T val13, FieldType<2>::T val14, FieldType<2>::T val15, FieldType<2>::T val16)
     2541{
     2542        return simd128<1>::ifh(simd128<(32)>::himask(), mvmd128<2>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd128<2>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
     2543}
     2544
     2545//The total number of operations is 15.0
     2546template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8, FieldType<4>::T val9, FieldType<4>::T val10, FieldType<4>::T val11, FieldType<4>::T val12, FieldType<4>::T val13, FieldType<4>::T val14, FieldType<4>::T val15, FieldType<4>::T val16)
     2547{
     2548        return simd128<1>::ifh(simd128<(64)>::himask(), mvmd128<4>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd128<4>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
     2549}
     2550
     2551//The total number of operations is 15.0
     2552template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8, FieldType<8>::T val9, FieldType<8>::T val10, FieldType<8>::T val11, FieldType<8>::T val12, FieldType<8>::T val13, FieldType<8>::T val14, FieldType<8>::T val15, FieldType<8>::T val16)
     2553{
     2554        return simd128<1>::ifh(simd128<(128)>::himask(), mvmd128<8>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd128<8>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
     2555}
     2556
     2557//The total number of operations is 1.0
     2558template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill(FieldType<1>::T val1)
    24142559{
    24152560        return mvmd128<32>::fill((-1*val1));
     
    24172562
    24182563//The total number of operations is 1.0
    2419 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill(uint64_t val1)
     2564template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill(FieldType<2>::T val1)
    24202565{
    24212566        return mvmd128<(4)>::fill(((val1<<2)|val1));
     
    24232568
    24242569//The total number of operations is 1.0
    2425 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill(uint64_t val1)
     2570template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill(FieldType<4>::T val1)
    24262571{
    24272572        return mvmd128<(8)>::fill(((val1<<4)|val1));
     
    24292574
    24302575//The total number of operations is 1.0
    2431 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill(uint64_t val1)
     2576template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill(FieldType<8>::T val1)
    24322577{
    24332578        return (bitblock128_t)vdupq_n_u8((uint8_t)(val1));
     
    24352580
    24362581//The total number of operations is 1.0
    2437 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill(uint64_t val1)
     2582template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill(FieldType<16>::T val1)
    24382583{
    24392584        return (bitblock128_t)vdupq_n_u16((uint16_t)(val1));
     
    24412586
    24422587//The total number of operations is 1.0
    2443 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill(uint64_t val1)
     2588template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill(FieldType<32>::T val1)
    24442589{
    24452590        return (bitblock128_t)vdupq_n_u32((uint32_t)(val1));
     
    24472592
    24482593//The total number of operations is 1.0
    2449 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill(uint64_t val1)
     2594template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill(FieldType<64>::T val1)
    24502595{
    24512596        return (bitblock128_t)vdupq_n_u64((uint64_t)(val1));
     
    24532598
    24542599//The total number of operations is 3.0
    2455 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::fill(uint64_t val1)
     2600template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::fill(FieldType<128>::T val1)
    24562601{
    24572602        return mvmd128<(64)>::fill2(0, val1);
     
    24592604
    24602605//The total number of operations is 1.0
    2461 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<1>::extract(bitblock128_t arg1)
     2606template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<1>::T mvmd128<1>::extract(bitblock128_t arg1)
    24622607{
    24632608        return (((pos%2) == 0) ? (mvmd128<(2)>::extract<(pos/2)>(arg1)&(1)) : (mvmd128<(2)>::extract<(pos/2)>(arg1)>>1));
     
    24652610
    24662611//The total number of operations is 1.0
    2467 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<2>::extract(bitblock128_t arg1)
     2612template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<2>::T mvmd128<2>::extract(bitblock128_t arg1)
    24682613{
    24692614        return (((pos%2) == 0) ? (mvmd128<(4)>::extract<(pos/2)>(arg1)&(3)) : (mvmd128<(4)>::extract<(pos/2)>(arg1)>>2));
     
    24712616
    24722617//The total number of operations is 1.0
    2473 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<4>::extract(bitblock128_t arg1)
     2618template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<4>::T mvmd128<4>::extract(bitblock128_t arg1)
    24742619{
    24752620        return (((pos%2) == 0) ? (mvmd128<(8)>::extract<(pos/2)>(arg1)&(15)) : (mvmd128<(8)>::extract<(pos/2)>(arg1)>>4));
     
    24772622
    24782623//The total number of operations is 1.0
    2479 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<8>::extract(bitblock128_t arg1)
     2624template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<8>::T mvmd128<8>::extract(bitblock128_t arg1)
    24802625{
    24812626        return vgetq_lane_u8((uint8x16_t)(arg1), (int32_t)(pos));
     
    24832628
    24842629//The total number of operations is 1.0
    2485 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<16>::extract(bitblock128_t arg1)
     2630template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<16>::T mvmd128<16>::extract(bitblock128_t arg1)
    24862631{
    24872632        return vgetq_lane_u16((uint16x8_t)(arg1), (int32_t)(pos));
     
    24892634
    24902635//The total number of operations is 1.0
    2491 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<32>::extract(bitblock128_t arg1)
     2636template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<32>::T mvmd128<32>::extract(bitblock128_t arg1)
    24922637{
    24932638        return vgetq_lane_u32((uint32x4_t)(arg1), (int32_t)(pos));
     
    24952640
    24962641//The total number of operations is 1.0
    2497 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd128<64>::extract(bitblock128_t arg1)
     2642template <> template <uint16_t pos> IDISA_ALWAYS_INLINE FieldType<64>::T mvmd128<64>::extract(bitblock128_t arg1)
    24982643{
    24992644        return vgetq_lane_u64((uint64x2_t)(arg1), (int32_t)(pos));
     
    25012646
    25022647//The total number of operations is 2.0
    2503 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::splat(bitblock128_t arg1)
     2648template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::splat(bitblock128_t arg1)
    25042649{
    25052650        return mvmd128<1>::fill(mvmd128<1>::extract<pos>(arg1));
     
    25072652
    25082653//The total number of operations is 2.0
    2509 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::splat(bitblock128_t arg1)
     2654template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::splat(bitblock128_t arg1)
    25102655{
    25112656        return mvmd128<2>::fill(mvmd128<2>::extract<pos>(arg1));
     
    25132658
    25142659//The total number of operations is 2.0
    2515 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::splat(bitblock128_t arg1)
     2660template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::splat(bitblock128_t arg1)
    25162661{
    25172662        return mvmd128<4>::fill(mvmd128<4>::extract<pos>(arg1));
     
    25192664
    25202665//The total number of operations is 2.0
    2521 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::splat(bitblock128_t arg1)
     2666template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::splat(bitblock128_t arg1)
    25222667{
    25232668        return mvmd128<8>::fill(mvmd128<8>::extract<pos>(arg1));
     
    25252670
    25262671//The total number of operations is 2.0
    2527 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::splat(bitblock128_t arg1)
     2672template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::splat(bitblock128_t arg1)
    25282673{
    25292674        return mvmd128<16>::fill(mvmd128<16>::extract<pos>(arg1));
     
    25312676
    25322677//The total number of operations is 2.0
    2533 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::splat(bitblock128_t arg1)
     2678template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::splat(bitblock128_t arg1)
    25342679{
    25352680        return mvmd128<32>::fill(mvmd128<32>::extract<pos>(arg1));
     
    25372682
    25382683//The total number of operations is 2.0
    2539 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::splat(bitblock128_t arg1)
     2684template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::splat(bitblock128_t arg1)
    25402685{
    25412686        return mvmd128<64>::fill(mvmd128<64>::extract<pos>(arg1));
     
    25432688
    25442689//The total number of operations is 5.0
    2545 template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::splat(bitblock128_t arg1)
     2690template <> template <uint16_t pos> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::splat(bitblock128_t arg1)
    25462691{
    25472692        return simd128<1>::ifh(simd128<128>::himask(), mvmd128<(64)>::splat<((2*pos)+1)>(arg1), mvmd128<(64)>::splat<(2*pos)>(arg1));
    25482693}
    25492694
    2550 //The total number of operations is 15.0
    2551 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
    2552 {
    2553         return simd128<1>::ifh(simd128<(16)>::himask(), mvmd128<1>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd128<1>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
    2554 }
    2555 
    2556 //The total number of operations is 15.0
    2557 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
    2558 {
    2559         return simd128<1>::ifh(simd128<(32)>::himask(), mvmd128<2>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd128<2>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
    2560 }
    2561 
    2562 //The total number of operations is 15.0
    2563 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
    2564 {
    2565         return simd128<1>::ifh(simd128<(64)>::himask(), mvmd128<4>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd128<4>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
    2566 }
    2567 
    2568 //The total number of operations is 15.0
    2569 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
    2570 {
    2571         return simd128<1>::ifh(simd128<(128)>::himask(), mvmd128<8>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd128<8>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
    2572 }
    2573 
    2574 //The total number of operations is 3.0
    2575 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     2695//The total number of operations is 3.0
     2696template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4)
    25762697{
    25772698        return simd128<1>::ifh(simd128<(4)>::himask(), mvmd128<1>::fill2(val1, val2), mvmd128<1>::fill2(val3, val4));
     
    25792700
    25802701//The total number of operations is 3.0
    2581 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     2702template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4)
    25822703{
    25832704        return simd128<1>::ifh(simd128<(8)>::himask(), mvmd128<2>::fill2(val1, val2), mvmd128<2>::fill2(val3, val4));
     
    25852706
    25862707//The total number of operations is 3.0
    2587 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     2708template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill4(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4)
    25882709{
    25892710        return simd128<1>::ifh(simd128<(16)>::himask(), mvmd128<4>::fill2(val1, val2), mvmd128<4>::fill2(val3, val4));
     
    25912712
    25922713//The total number of operations is 3.0
    2593 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     2714template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill4(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4)
    25942715{
    25952716        return simd128<1>::ifh(simd128<(32)>::himask(), mvmd128<8>::fill2(val1, val2), mvmd128<8>::fill2(val3, val4));
     
    25972718
    25982719//The total number of operations is 3.0
    2599 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     2720template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill4(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4)
    26002721{
    26012722        return simd128<1>::ifh(simd128<(64)>::himask(), mvmd128<16>::fill2(val1, val2), mvmd128<16>::fill2(val3, val4));
     
    26032724
    26042725//The total number of operations is 7.0
    2605 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     2726template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill4(FieldType<32>::T val1, FieldType<32>::T val2, FieldType<32>::T val3, FieldType<32>::T val4)
    26062727{
    26072728        return simd128<1>::ifh(simd128<(128)>::himask(), mvmd128<32>::fill2(val1, val2), mvmd128<32>::fill2(val3, val4));
     
    26092730
    26102731//The total number of operations is 3.16666666667
    2611 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::srli(bitblock128_t arg1)
     2732template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::srli(bitblock128_t arg1)
    26122733{
    26132734        return simd128<128>::srli<(sh*2)>(arg1);
     
    26152736
    26162737//The total number of operations is 3.16666666667
    2617 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::srli(bitblock128_t arg1)
     2738template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::srli(bitblock128_t arg1)
    26182739{
    26192740        return simd128<128>::srli<(sh*4)>(arg1);
     
    26212742
    26222743//The total number of operations is 3.16666666667
    2623 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::srli(bitblock128_t arg1)
     2744template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::srli(bitblock128_t arg1)
    26242745{
    26252746        return simd128<128>::srli<(sh*8)>(arg1);
     
    26272748
    26282749//The total number of operations is 3.16666666667
    2629 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::srli(bitblock128_t arg1)
     2750template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::srli(bitblock128_t arg1)
    26302751{
    26312752        return simd128<128>::srli<(sh*16)>(arg1);
     
    26332754
    26342755//The total number of operations is 3.16666666667
    2635 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::srli(bitblock128_t arg1)
     2756template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::srli(bitblock128_t arg1)
    26362757{
    26372758        return simd128<128>::srli<(sh*32)>(arg1);
     
    26392760
    26402761//The total number of operations is 3.16666666667
    2641 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::srli(bitblock128_t arg1)
     2762template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::srli(bitblock128_t arg1)
    26422763{
    26432764        return simd128<128>::srli<(sh*64)>(arg1);
     
    26452766
    26462767//The total number of operations is 3.16666666667
    2647 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::srli(bitblock128_t arg1)
     2768template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::srli(bitblock128_t arg1)
    26482769{
    26492770        return simd128<128>::srli<(sh*128)>(arg1);
     
    26512772
    26522773//The total number of operations is 1.0
    2653 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill2(uint64_t val1, uint64_t val2)
     2774template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill2(FieldType<1>::T val1, FieldType<1>::T val2)
    26542775{
    26552776        return mvmd128<(2)>::fill(((val1<<1)|(val2&(1))));
     
    26572778
    26582779//The total number of operations is 1.0
    2659 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill2(uint64_t val1, uint64_t val2)
     2780template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill2(FieldType<2>::T val1, FieldType<2>::T val2)
    26602781{
    26612782        return mvmd128<(4)>::fill(((val1<<2)|(val2&(3))));
     
    26632784
    26642785//The total number of operations is 1.0
    2665 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill2(uint64_t val1, uint64_t val2)
     2786template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill2(FieldType<4>::T val1, FieldType<4>::T val2)
    26662787{
    26672788        return mvmd128<(8)>::fill(((val1<<4)|(val2&(15))));
     
    26692790
    26702791//The total number of operations is 1.0
    2671 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill2(uint64_t val1, uint64_t val2)
     2792template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill2(FieldType<8>::T val1, FieldType<8>::T val2)
    26722793{
    26732794        return mvmd128<(16)>::fill(((val1<<8)|(val2&(255))));
     
    26752796
    26762797//The total number of operations is 1.0
    2677 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill2(uint64_t val1, uint64_t val2)
     2798template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill2(FieldType<16>::T val1, FieldType<16>::T val2)
    26782799{
    26792800        return mvmd128<(32)>::fill(((val1<<16)|(val2&(65535))));
     
    26812802
    26822803//The total number of operations is 3.0
    2683 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill2(uint64_t val1, uint64_t val2)
     2804template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::fill2(FieldType<32>::T val1, FieldType<32>::T val2)
    26842805{
    26852806        return simd128<1>::ifh(simd128<(64)>::himask(), mvmd128<32>::fill(val1), mvmd128<32>::fill(val2));
     
    26872808
    26882809//The total number of operations is 3.0
    2689 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill2(uint64_t val1, uint64_t val2)
     2810template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::fill2(FieldType<64>::T val1, FieldType<64>::T val2)
    26902811{
    26912812        return simd128<1>::ifh(simd128<(128)>::himask(), mvmd128<64>::fill(val1), mvmd128<64>::fill(val2));
     
    26932814
    26942815//The total number of operations is 6.5
    2695 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     2816template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    26962817{
    26972818        return simd_or(mvmd128<2>::slli<sh>(arg1), mvmd128<2>::srli<((64)-sh)>(arg2));
     
    26992820
    27002821//The total number of operations is 6.5
    2701 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     2822template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    27022823{
    27032824        return simd_or(mvmd128<4>::slli<sh>(arg1), mvmd128<4>::srli<((32)-sh)>(arg2));
     
    27052826
    27062827//The total number of operations is 6.5
    2707 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     2828template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    27082829{
    27092830        return simd_or(mvmd128<8>::slli<sh>(arg1), mvmd128<8>::srli<((16)-sh)>(arg2));
     
    27112832
    27122833//The total number of operations is 6.5
    2713 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     2834template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    27142835{
    27152836        return simd_or(mvmd128<16>::slli<sh>(arg1), mvmd128<16>::srli<((8)-sh)>(arg2));
     
    27172838
    27182839//The total number of operations is 6.5
    2719 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     2840template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    27202841{
    27212842        return simd_or(mvmd128<32>::slli<sh>(arg1), mvmd128<32>::srli<((4)-sh)>(arg2));
     
    27232844
    27242845//The total number of operations is 6.5
    2725 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     2846template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    27262847{
    27272848        return simd_or(mvmd128<64>::slli<sh>(arg1), mvmd128<64>::srli<((2)-sh)>(arg2));
     
    27292850
    27302851//The total number of operations is 6.5
    2731 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2)
     2852template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2)
    27322853{
    27332854        return simd_or(mvmd128<128>::slli<sh>(arg1), mvmd128<128>::srli<((1)-sh)>(arg2));
     
    27352856
    27362857//The total number of operations is 2.33333333333
    2737 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
     2858template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
    27382859{
    27392860        return simd128<128>::slli<(sh*2)>(arg1);
     
    27412862
    27422863//The total number of operations is 2.33333333333
    2743 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
    2744 {
    2745         return mvmd128<(2)>::slli<(sh*2)>(arg1);
     2864template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
     2865{
     2866        return simd128<128>::slli<(sh*4)>(arg1);
    27462867}
    27472868
    27482869//The total number of operations is 2.33333333333
    2749 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
     2870template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
    27502871{
    27512872        return simd128<128>::slli<(sh*8)>(arg1);
     
    27532874
    27542875//The total number of operations is 2.33333333333
    2755 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
    2756 {
    2757         return mvmd128<(8)>::slli<(sh*2)>(arg1);
     2876template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
     2877{
     2878        return simd128<128>::slli<(sh*16)>(arg1);
    27582879}
    27592880
    27602881//The total number of operations is 2.33333333333
    2761 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
     2882template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
    27622883{
    27632884        return simd128<128>::slli<(sh*32)>(arg1);
     
    27652886
    27662887//The total number of operations is 2.33333333333
    2767 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
    2768 {
    2769         return mvmd128<(32)>::slli<(sh*2)>(arg1);
     2888template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
     2889{
     2890        return simd128<128>::slli<(sh*64)>(arg1);
    27702891}
    27712892
    27722893//The total number of operations is 2.33333333333
    2773 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
     2894template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
    27742895{
    27752896        return simd128<128>::slli<(sh*128)>(arg1);
     
    27772898
    27782899//The total number of operations is 7.0
    2779 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     2900template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(FieldType<1>::T val1, FieldType<1>::T val2, FieldType<1>::T val3, FieldType<1>::T val4, FieldType<1>::T val5, FieldType<1>::T val6, FieldType<1>::T val7, FieldType<1>::T val8)
    27802901{
    27812902        return simd128<1>::ifh(simd128<(8)>::himask(), mvmd128<1>::fill4(val1, val2, val3, val4), mvmd128<1>::fill4(val5, val6, val7, val8));
     
    27832904
    27842905//The total number of operations is 7.0
    2785 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     2906template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(FieldType<2>::T val1, FieldType<2>::T val2, FieldType<2>::T val3, FieldType<2>::T val4, FieldType<2>::T val5, FieldType<2>::T val6, FieldType<2>::T val7, FieldType<2>::T val8)
    27862907{
    27872908        return simd128<1>::ifh(simd128<(16)>::himask(), mvmd128<2>::fill4(val1, val2, val3, val4), mvmd128<2>::fill4(val5, val6, val7, val8));
     
    27892910
    27902911//The total number of operations is 7.0
    2791 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     2912template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill8(FieldType<4>::T val1, FieldType<4>::T val2, FieldType<4>::T val3, FieldType<4>::T val4, FieldType<4>::T val5, FieldType<4>::T val6, FieldType<4>::T val7, FieldType<4>::T val8)
    27922913{
    27932914        return simd128<1>::ifh(simd128<(32)>::himask(), mvmd128<4>::fill4(val1, val2, val3, val4), mvmd128<4>::fill4(val5, val6, val7, val8));
     
    27952916
    27962917//The total number of operations is 7.0
    2797 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     2918template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill8(FieldType<8>::T val1, FieldType<8>::T val2, FieldType<8>::T val3, FieldType<8>::T val4, FieldType<8>::T val5, FieldType<8>::T val6, FieldType<8>::T val7, FieldType<8>::T val8)
    27982919{
    27992920        return simd128<1>::ifh(simd128<(64)>::himask(), mvmd128<8>::fill4(val1, val2, val3, val4), mvmd128<8>::fill4(val5, val6, val7, val8));
     
    28012922
    28022923//The total number of operations is 7.0
    2803 template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     2924template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::fill8(FieldType<16>::T val1, FieldType<16>::T val2, FieldType<16>::T val3, FieldType<16>::T val4, FieldType<16>::T val5, FieldType<16>::T val6, FieldType<16>::T val7, FieldType<16>::T val8)
    28042925{
    28052926        return simd128<1>::ifh(simd128<(128)>::himask(), mvmd128<16>::fill4(val1, val2, val3, val4), mvmd128<16>::fill4(val5, val6, val7, val8));
     
    28132934
    28142935//The total number of operations is 3.16666666667
    2815 template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srli(bitblock128_t arg1)
     2936template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srli(bitblock128_t arg1)
    28162937{
    28172938        return simd128<128>::srli<sh>(arg1);
    28182939}
    28192940
    2820 //The total number of operations is 1.0
    2821 IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, uint64_t* arg2)
    2822 {
    2823         vst1q_u64((uint64_t*)(arg1), arg2);
    2824 }
    2825 
    28262941//The total number of operations is 16.1666666667
    2827 IDISA_ALWAYS_INLINE uint64_t bitblock128::popcount(bitblock128_t arg1)
     2942IDISA_ALWAYS_INLINE uint16_t bitblock128::popcount(bitblock128_t arg1)
    28282943{
    28292944        return mvmd128<64>::extract<0>(simd128<128>::popcount(arg1));
     
    28362951}
    28372952
    2838 //The total number of operations is 2.33333333333
    2839 template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
    2840 {
    2841         return simd128<128>::slli<sh>(arg1);
    2842 }
    2843 
    28442953//The total number of operations is 5.0
    28452954IDISA_ALWAYS_INLINE bool bitblock128::any(bitblock128_t arg1)
     
    28602969}
    28612970
     2971//The total number of operations is 2.33333333333
     2972template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
     2973{
     2974        return simd128<128>::slli<sh>(arg1);
     2975}
     2976
     2977//The total number of operations is 1.0
     2978IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, uint64_t* arg2)
     2979{
     2980        vst1q_u64((uint64_t*)(arg1), arg2);
     2981}
     2982
    28622983#endif
  • trunk/lib/idisa_cpp/idisa_sse2.cpp

    r3441 r3525  
    1414
    1515typedef __m128i bitblock128_t;
    16 
     16               
    1717#ifndef FIELD_TYPE
    18 #define FIELD_TYPE
     18#define FIELD_TYPE     
    1919template <uint32_t fw> struct FieldType {
    2020   typedef int T;  //default for FieldType::T is int
     
    2929template <> struct FieldType<64> {typedef uint64_t T;};
    3030template <> struct FieldType<128> {typedef uint64_t T;};
     31template <> struct FieldType<256> {typedef uint64_t T;};
    3132#endif
    32 
    33 typedef FieldType<1>::T fw1_t;
    34 typedef FieldType<2>::T fw2_t;
    35 typedef FieldType<4>::T fw4_t;
    36 typedef FieldType<8>::T fw8_t;
    37 typedef FieldType<16>::T fw16_t;
    38 typedef FieldType<32>::T fw32_t;
    39 typedef FieldType<64>::T fw64_t;
    40 typedef FieldType<128>::T fw128_t;
    4133
    4234template <uint32_t fw>
     
    4941        static IDISA_ALWAYS_INLINE bitblock128_t umult(bitblock128_t arg1, bitblock128_t arg2);
    5042        static IDISA_ALWAYS_INLINE bitblock128_t ult(bitblock128_t arg1, bitblock128_t arg2);
    51         template <uint8_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     43        static IDISA_ALWAYS_INLINE bitblock128_t all(bitblock128_t arg1);
     44        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     45        static IDISA_ALWAYS_INLINE bitblock128_t ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    5246        static IDISA_ALWAYS_INLINE bitblock128_t ctz(bitblock128_t arg1);
    5347        static IDISA_ALWAYS_INLINE bitblock128_t sll(bitblock128_t arg1, bitblock128_t shift_mask);
     48        static IDISA_ALWAYS_INLINE bitblock128_t vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
    5449        static IDISA_ALWAYS_INLINE bitblock128_t eq(bitblock128_t arg1, bitblock128_t arg2);
    5550        static IDISA_ALWAYS_INLINE bitblock128_t popcount(bitblock128_t arg1);
    5651        static IDISA_ALWAYS_INLINE bitblock128_t neg(bitblock128_t arg1);
    5752        static IDISA_ALWAYS_INLINE bitblock128_t himask();
    58         template <uint8_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    59         static IDISA_ALWAYS_INLINE bitblock128_t ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
     53        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    6054        static IDISA_ALWAYS_INLINE bitblock128_t sub(bitblock128_t arg1, bitblock128_t arg2);
    6155        static IDISA_ALWAYS_INLINE bitblock128_t add_hl(bitblock128_t arg1);
    6256        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t shift_mask);
    6357        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
     58        static IDISA_ALWAYS_INLINE bitblock128_t vsll(bitblock128_t arg1, bitblock128_t shift_mask);
    6459        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    65 //      template <typename FieldType<fw>::T> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    66         template <uint64_t> static IDISA_ALWAYS_INLINE bitblock128_t constant();
     60        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    6761        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
     62        static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    6863        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    6964        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
    7065        static IDISA_ALWAYS_INLINE bitblock128_t xor_hl(bitblock128_t arg1);
    71         template <uint8_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srai(bitblock128_t arg1);
     66        static IDISA_ALWAYS_INLINE bitblock128_t any(bitblock128_t arg1);
     67        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srai(bitblock128_t arg1);
    7268        static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
    73         static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    7469        static IDISA_ALWAYS_INLINE bitblock128_t ugt(bitblock128_t arg1, bitblock128_t arg2);
    75         //
    76         // Hand-coded floating point routines.
    77         //
    78         static IDISA_ALWAYS_INLINE bitblock128_t i2f(bitblock128_t arg1);
    79         static IDISA_ALWAYS_INLINE bitblock128_t f2i(bitblock128_t arg1);
    80         static IDISA_ALWAYS_INLINE bitblock128_t fdiv(bitblock128_t arg1, bitblock128_t arg2);
    81         static IDISA_ALWAYS_INLINE bitblock128_t fmul(bitblock128_t arg1, bitblock128_t arg2);
    82         static IDISA_ALWAYS_INLINE bitblock128_t fadd(bitblock128_t arg1, bitblock128_t arg2);
    83         static IDISA_ALWAYS_INLINE bitblock128_t fsub(bitblock128_t arg1, bitblock128_t arg2);
    84         static IDISA_ALWAYS_INLINE bitblock128_t feq(bitblock128_t arg1, bitblock128_t arg2);
    85         static IDISA_ALWAYS_INLINE bitblock128_t flt(bitblock128_t arg1, bitblock128_t arg2);
    86         static IDISA_ALWAYS_INLINE bitblock128_t fle(bitblock128_t arg1, bitblock128_t arg2);
    87         static IDISA_ALWAYS_INLINE bitblock128_t fsqrt(bitblock128_t arg1);
    8870};
    8971
     
    119101public:
    120102        template <uint64_t msk> static IDISA_ALWAYS_INLINE bitblock128_t shufflei(bitblock128_t arg1);
    121         template <uint8_t sh> static IDISA_ALWAYS_INLINE bitblock128_t dsrli(bitblock128_t arg1, bitblock128_t arg2);
     103        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t dsrli(bitblock128_t arg1, bitblock128_t arg2);
    122104        static IDISA_ALWAYS_INLINE bitblock128_t fill(typename FieldType<fw>::T val1);
    123         template <uint8_t pos> static IDISA_ALWAYS_INLINE typename FieldType<fw>::T extract(bitblock128_t arg1);
    124         template <uint8_t pos> static IDISA_ALWAYS_INLINE bitblock128_t splat(bitblock128_t arg1);
    125         template <uint8_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
     105        template <uint16_t pos> static IDISA_ALWAYS_INLINE typename FieldType<fw>::T extract(bitblock128_t arg1);
     106        template <uint16_t pos> static IDISA_ALWAYS_INLINE bitblock128_t splat(bitblock128_t arg1);
     107        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    126108        static IDISA_ALWAYS_INLINE bitblock128_t fill4(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4);
    127         template <uint8_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     109        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
    128110        static IDISA_ALWAYS_INLINE bitblock128_t fill2(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2);
    129         template <uint8_t sh> static IDISA_ALWAYS_INLINE bitblock128_t dslli(bitblock128_t arg1, bitblock128_t arg2);
     111        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t dslli(bitblock128_t arg1, bitblock128_t arg2);
    130112        static IDISA_ALWAYS_INLINE bitblock128_t fill8(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4, typename FieldType<fw>::T val5, typename FieldType<fw>::T val6, typename FieldType<fw>::T val7, typename FieldType<fw>::T val8);
    131113        static IDISA_ALWAYS_INLINE bitblock128_t fill16(typename FieldType<fw>::T val1, typename FieldType<fw>::T val2, typename FieldType<fw>::T val3, typename FieldType<fw>::T val4, typename FieldType<fw>::T val5, typename FieldType<fw>::T val6, typename FieldType<fw>::T val7, typename FieldType<fw>::T val8, typename FieldType<fw>::T val9, typename FieldType<fw>::T val10, typename FieldType<fw>::T val11, typename FieldType<fw>::T val12, typename FieldType<fw>::T val13, typename FieldType<fw>::T val14, typename FieldType<fw>::T val15, typename FieldType<fw>::T val16);
     114        template <uint16_t pos> static IDISA_ALWAYS_INLINE bitblock128_t insert(bitblock128_t arg1, typename FieldType<fw>::T arg2);
    132115};
    133116
     
    137120        static IDISA_ALWAYS_INLINE bitblock128_t sll(bitblock128_t arg1, bitblock128_t arg2);
    138121        static IDISA_ALWAYS_INLINE bitblock128_t load_unaligned(const bitblock128_t* arg1);
    139         template <uint8_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     122        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
    140123        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t arg2);
    141         static IDISA_ALWAYS_INLINE void store_aligned(bitblock128_t arg1, bitblock128_t* arg2);
    142124        static IDISA_ALWAYS_INLINE bool all(bitblock128_t arg1);
    143125        static IDISA_ALWAYS_INLINE bool any(bitblock128_t arg1);
    144         static IDISA_ALWAYS_INLINE uint8_t popcount(bitblock128_t arg1);
    145         template <uint8_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
     126        static IDISA_ALWAYS_INLINE uint16_t popcount(bitblock128_t arg1);
     127        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    146128        static IDISA_ALWAYS_INLINE bitblock128_t load_aligned(const bitblock128_t* arg1);
     129        static IDISA_ALWAYS_INLINE void store_aligned(bitblock128_t arg1, bitblock128_t* arg2);
    147130        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock128_t arg1, bitblock128_t* arg2);
    148131};
     
    151134IDISA_ALWAYS_INLINE bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
    152135IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1);
     136IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
    153137IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    154 IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
     138IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
    155139IDISA_ALWAYS_INLINE bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
    156 IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
    157140template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::max(bitblock128_t arg1, bitblock128_t arg2);
    158141template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::max(bitblock128_t arg1, bitblock128_t arg2);
     
    194177template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ult(bitblock128_t arg1, bitblock128_t arg2);
    195178template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ult(bitblock128_t arg1, bitblock128_t arg2);
    196 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2);
    197 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lt(bitblock128_t arg1, bitblock128_t arg2);
    198 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lt(bitblock128_t arg1, bitblock128_t arg2);
    199 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lt(bitblock128_t arg1, bitblock128_t arg2);
    200 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lt(bitblock128_t arg1, bitblock128_t arg2);
    201 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lt(bitblock128_t arg1, bitblock128_t arg2);
    202 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lt(bitblock128_t arg1, bitblock128_t arg2);
    203 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lt(bitblock128_t arg1, bitblock128_t arg2);
    204 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srli(bitblock128_t arg1);
    205 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srli(bitblock128_t arg1);
    206 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srli(bitblock128_t arg1);
    207 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srli(bitblock128_t arg1);
    208 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srli(bitblock128_t arg1);
    209 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srli(bitblock128_t arg1);
    210 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srli(bitblock128_t arg1);
     179template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::all(bitblock128_t arg1);
     180template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::all(bitblock128_t arg1);
     181template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::all(bitblock128_t arg1);
     182template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::all(bitblock128_t arg1);
     183template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::all(bitblock128_t arg1);
     184template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::all(bitblock128_t arg1);
     185template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::all(bitblock128_t arg1);
     186template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srli(bitblock128_t arg1);
     187template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srli(bitblock128_t arg1);
     188template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srli(bitblock128_t arg1);
     189template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srli(bitblock128_t arg1);
     190template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srli(bitblock128_t arg1);
     191template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srli(bitblock128_t arg1);
     192template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srli(bitblock128_t arg1);
    211193template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ctz(bitblock128_t arg1);
    212194template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ctz(bitblock128_t arg1);
     
    217199template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ctz(bitblock128_t arg1);
    218200template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ctz(bitblock128_t arg1);
    219 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sll(bitblock128_t arg1, bitblock128_t shift_mask);
    220201template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sll(bitblock128_t arg1, bitblock128_t shift_mask);
     202template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::sub(bitblock128_t arg1, bitblock128_t arg2);
     203template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::sub(bitblock128_t arg1, bitblock128_t arg2);
     204template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::sub(bitblock128_t arg1, bitblock128_t arg2);
     205template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::sub(bitblock128_t arg1, bitblock128_t arg2);
     206template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::sub(bitblock128_t arg1, bitblock128_t arg2);
     207template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::sub(bitblock128_t arg1, bitblock128_t arg2);
     208template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sub(bitblock128_t arg1, bitblock128_t arg2);
     209template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sub(bitblock128_t arg1, bitblock128_t arg2);
    221210template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ugt(bitblock128_t arg1, bitblock128_t arg2);
    222211template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ugt(bitblock128_t arg1, bitblock128_t arg2);
     
    242231template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::popcount(bitblock128_t arg1);
    243232template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::popcount(bitblock128_t arg1);
     233template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::any(bitblock128_t arg1);
     234template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::any(bitblock128_t arg1);
     235template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::any(bitblock128_t arg1);
     236template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::any(bitblock128_t arg1);
     237template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::any(bitblock128_t arg1);
     238template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::any(bitblock128_t arg1);
     239template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::any(bitblock128_t arg1);
    244240template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::neg(bitblock128_t arg1);
    245241template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::neg(bitblock128_t arg1);
     
    249245template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::neg(bitblock128_t arg1);
    250246template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::neg(bitblock128_t arg1);
    251 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::slli(bitblock128_t arg1);
    252 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::slli(bitblock128_t arg1);
    253 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::slli(bitblock128_t arg1);
    254 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::slli(bitblock128_t arg1);
    255 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::slli(bitblock128_t arg1);
    256 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::slli(bitblock128_t arg1);
    257 template <> template <uint8_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::slli(bitblock128_t arg1);
     247template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::slli(bitblock128_t arg1);
     248template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::slli(bitblock128_t arg1);
     249template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::slli(bitblock128_t arg1);
     250template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::slli(bitblock128_t arg1);
     251template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::slli(bitblock128_t arg1);
     252template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::slli(bitblock128_t arg1);
     253template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::slli(bitblock128_t arg1);
    258254template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    259255template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
     
    264260template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    265261template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    266 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::sub(bitblock128_t arg1, bitblock128_t arg2);
    267 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::sub(bitblock128_t arg1, bitblock128_t arg2);
    268 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::sub(bitblock128_t arg1, bitblock128_t arg2);
    269 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::sub(bitblock128_t arg1, bitblock128_t arg2);
    270 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::sub(bitblock128_t arg1, bitblock128_t arg2);
    271 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::sub(bitblock128_t arg1, bitblock128_t arg2);
    272 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::sub(bitblock128_t arg1, bitblock128_t arg2);
    273 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::sub(bitblock128_t arg1, bitblock128_t arg2);
     262template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
     263template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
     264template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
     265template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
     266template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
     267template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
     268template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
     269template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
     270template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
    274271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add_hl(bitblock128_t arg1);
    275272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add_hl(bitblock128_t arg1);
     
    279276template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add_hl(bitblock128_t arg1);
    280277template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1);
    281 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    282278template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    283 
    284 template <> template <uint64_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
    285 template <> template <uint64_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
    286 template <> template <uint64_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::constant();
    287 template <> template <uint64_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::constant();
    288 template <> template <uint64_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::constant();
    289 template <> template <uint64_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::constant();
    290 template <> template <uint64_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::constant();
    291 template <> template <uint64_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::constant();
    292 #if 0
    293 template <> template <fw1_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
    294 template <> template <fw2_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
    295 template <> template <fw4_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::constant();
    296 template <> template <fw8_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::constant();
    297 template <> template <fw16_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::constant();
    298 template <> template <fw32_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::constant();
    299 template <> template <fw64_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::constant();
    300 template <> template <fw128_t> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::constant();
    301 #endif
     279template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     280template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     281template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     282template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     283template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     284template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     285template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
     286template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
     287template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
     288template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
     289template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
     290template <> template <FieldType<4>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::constant();
     291template <> template <FieldType<8>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::constant();
     292template <> template <FieldType<16>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::constant();
     293template <> template <FieldType<32>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::constant();
     294template <> template <FieldType<64>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::constant();
     295template <> template <FieldType<128>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::constant();
    302296