Changeset 3576 for trunk


Ignore:
Timestamp:
Nov 29, 2013, 3:03:33 PM (4 years ago)
Author:
linmengl
Message:

Regenerate libraries, with negative-number constants eliminated.

Location:
trunk/lib/idisa_cpp
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/idisa_cpp/idisa_avx.cpp

    r3526 r3576  
    5555        static IDISA_ALWAYS_INLINE bitblock256_t add_hl(bitblock256_t arg1);
    5656        static IDISA_ALWAYS_INLINE bitblock256_t lomask();
     57        static IDISA_ALWAYS_INLINE bitblock256_t lt(bitblock256_t arg1, bitblock256_t arg2);
    5758        static IDISA_ALWAYS_INLINE bitblock256_t umin(bitblock256_t arg1, bitblock256_t arg2);
    5859        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock256_t constant();
    5960        static IDISA_ALWAYS_INLINE bitblock256_t min(bitblock256_t arg1, bitblock256_t arg2);
    60         static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2);
    6161        static IDISA_ALWAYS_INLINE bitblock256_t umax(bitblock256_t arg1, bitblock256_t arg2);
    6262        static IDISA_ALWAYS_INLINE bitblock256_t abs(bitblock256_t arg1);
     
    6464        static IDISA_ALWAYS_INLINE bitblock256_t any(bitblock256_t arg1);
    6565        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srai(bitblock256_t arg1);
    66         static IDISA_ALWAYS_INLINE bitblock256_t lt(bitblock256_t arg1, bitblock256_t arg2);
     66        static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2);
    6767        static IDISA_ALWAYS_INLINE bitblock256_t ugt(bitblock256_t arg1, bitblock256_t arg2);
    6868};
     
    128128IDISA_ALWAYS_INLINE bitblock256_t simd_nor(bitblock256_t arg1, bitblock256_t arg2);
    129129IDISA_ALWAYS_INLINE bitblock256_t simd_not(bitblock256_t arg1);
     130IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2);
    130131IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2);
    131 IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2);
    132132IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2);
    133133IDISA_ALWAYS_INLINE bitblock256_t simd_xor(bitblock256_t arg1, bitblock256_t arg2);
     
    269269template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    270270template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    271 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
    272 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
    273 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
    274 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
    275 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
    276 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
    277 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
    278 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
    279271template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1);
    280272template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add_hl(bitblock256_t arg1);
     
    285277template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1);
    286278template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1);
    287 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
    288 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
    289 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
    290 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
    291 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
    292 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
    293 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
    294 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
    295279template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant();
    296280template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant();
     
    311295template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::min(bitblock256_t arg1, bitblock256_t arg2);
    312296template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::min(bitblock256_t arg1, bitblock256_t arg2);
     297template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
     298template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
     299template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
     300template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
     301template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
     302template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
     303template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
     304template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
    313305template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2);
    314306template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2);
     
    320312template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2);
    321313template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2);
    322 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
    323 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
    324 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
    325 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
    326 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
    327 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
    328 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
    329 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
    330 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
     314template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1);
     315template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1);
     316template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1);
     317template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1);
     318template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1);
     319template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1);
     320template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1);
     321template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1);
     322template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2);
     323template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2);
     324template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2);
     325template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2);
     326template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2);
     327template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2);
     328template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2);
     329template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2);
     330template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2);
     331template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
     332template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
     333template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
     334template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
     335template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
     336template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
     337template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
     338template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
    331339template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2);
    332340template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2);
     
    338346template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2);
    339347template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2);
    340 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2);
    341 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2);
    342 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2);
    343 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2);
    344 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2);
    345 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2);
    346 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2);
    347 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2);
    348 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2);
    349348template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask();
    350349template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask();
     
    364363template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2);
    365364template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2);
    366 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1);
    367 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1);
    368 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1);
    369 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1);
    370 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1);
    371 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1);
    372 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1);
    373 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1);
     365template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
     366template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
     367template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
     368template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
     369template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
     370template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
     371template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
     372template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
     373template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
    374374template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
    375375template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
     
    597597IDISA_ALWAYS_INLINE bitblock256_t simd_not(bitblock256_t arg1)
    598598{
    599         return simd_xor(arg1, simd256<32>::constant<-1>());
     599        return simd_xor(arg1, simd256<32>::constant<4294967295ULL>());
     600}
     601
     602//The total number of operations is 1.0
     603IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2)
     604{
     605        return _mm256_andnot_ps(arg2, arg1);
    600606}
    601607
     
    604610{
    605611        return _mm256_or_ps(arg1, arg2);
    606 }
    607 
    608 //The total number of operations is 1.0
    609 IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2)
    610 {
    611         return _mm256_andnot_ps(arg2, arg1);
    612612}
    613613
     
    15851585}
    15861586
    1587 //The total number of operations is 7.0
    1588 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
    1589 {
    1590         return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
    1591 }
    1592 
    1593 //The total number of operations is 17.5
    1594 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
    1595 {
    1596         return simd_or(simd_and(simd256<4>::himask(), simd256<(2)>::srai<((sh < (2)) ? sh : (2))>(arg1)), ((sh <= (2)) ? simd256<4>::srli<sh>(arg1) : simd256<(2)>::srai<(sh-(2))>(simd256<4>::srli<(2)>(arg1))));
    1597 }
    1598 
    1599 //The total number of operations is 12.0
    1600 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
    1601 {
    1602         bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1603         return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1604 }
    1605 
    1606 //The total number of operations is 4.0
    1607 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
    1608 {
    1609         return avx_general_combine256(_mm_srai_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
    1610 }
    1611 
    1612 //The total number of operations is 4.0
    1613 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
    1614 {
    1615         return avx_general_combine256(_mm_srai_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
    1616 }
    1617 
    1618 //The total number of operations is 12.0
    1619 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
    1620 {
    1621         return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));
    1622 }
    1623 
    1624 //The total number of operations is 28.3333333333
    1625 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
    1626 {
    1627         return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));
    1628 }
    1629 
    1630 //The total number of operations is 58.5
    1631 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
    1632 {
    1633         return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));
    1634 }
    1635 
    16361587//The total number of operations is 10.0
    16371588template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1)
     
    16831634
    16841635//The total number of operations is 0
    1685 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
    1686 {
    1687         return simd256<2>::constant<(1)>();
    1688 }
    1689 
    1690 //The total number of operations is 0
    1691 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
    1692 {
    1693         return simd256<4>::constant<(3)>();
    1694 }
    1695 
    1696 //The total number of operations is 0
    1697 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
    1698 {
    1699         return simd256<8>::constant<(15)>();
    1700 }
    1701 
    1702 //The total number of operations is 0
    1703 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
    1704 {
    1705         return simd256<16>::constant<(255)>();
    1706 }
    1707 
    1708 //The total number of operations is 0
    1709 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
    1710 {
    1711         return simd256<32>::constant<(65535)>();
    1712 }
    1713 
    1714 //The total number of operations is 0
    1715 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
    1716 {
    1717         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1))));
    1718 }
    1719 
    1720 //The total number of operations is 0
    1721 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
    1722 {
    1723         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1))));
    1724 }
    1725 
    1726 //The total number of operations is 0
    1727 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
    1728 {
    1729         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1))));
    1730 }
    1731 
    1732 //The total number of operations is 0
    17331636template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant()
    17341637{
    1735         return simd256<32>::constant<(-1*val)>();
     1638        return simd256<2>::constant<((val+val)+val)>();
    17361639}
    17371640
     
    18511754}
    18521755
     1756//The total number of operations is 0
     1757template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
     1758{
     1759        return simd256<2>::constant<(1)>();
     1760}
     1761
     1762//The total number of operations is 0
     1763template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
     1764{
     1765        return simd256<4>::constant<(3)>();
     1766}
     1767
     1768//The total number of operations is 0
     1769template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
     1770{
     1771        return simd256<8>::constant<(15)>();
     1772}
     1773
     1774//The total number of operations is 0
     1775template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
     1776{
     1777        return simd256<16>::constant<(255)>();
     1778}
     1779
     1780//The total number of operations is 0
     1781template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
     1782{
     1783        return simd256<32>::constant<(65535)>();
     1784}
     1785
     1786//The total number of operations is 0
     1787template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
     1788{
     1789        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)))));
     1790}
     1791
     1792//The total number of operations is 0
     1793template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
     1794{
     1795        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)))));
     1796}
     1797
     1798//The total number of operations is 0
     1799template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
     1800{
     1801        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)))));
     1802}
     1803
    18531804//The total number of operations is 1.0
    18541805template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)
     
    19151866}
    19161867
    1917 //The total number of operations is 1.0
    1918 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1919 {
    1920         return simd_or(arg1, arg2);
    1921 }
    1922 
    1923 //The total number of operations is 24.0
    1924 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1925 {
    1926         bitblock256_t tmpAns = simd256<(1)>::umax(arg1, arg2);
    1927         bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
    1928         bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
    1929         return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1868//The total number of operations is 19.0
     1869template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)
     1870{
     1871        return simd256<1>::ifh(simd256<2>::himask(), simd_and(arg1, simd256<256>::slli<1>(simd_not(arg1))), arg1);
     1872}
     1873
     1874//The total number of operations is 39.0
     1875template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1)
     1876{
     1877        bitblock256_t gtMask = simd256<4>::gt(arg1, simd256<4>::constant<0>());
     1878        return simd256<1>::ifh(gtMask, arg1, simd256<4>::sub(gtMask, arg1));
     1879}
     1880
     1881//The total number of operations is 4.0
     1882template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1)
     1883{
     1884        return avx_general_combine256(_mm_abs_epi8(avx_select_hi128(arg1)), _mm_abs_epi8(avx_select_lo128(arg1)));
     1885}
     1886
     1887//The total number of operations is 4.0
     1888template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1)
     1889{
     1890        return avx_general_combine256(_mm_abs_epi16(avx_select_hi128(arg1)), _mm_abs_epi16(avx_select_lo128(arg1)));
     1891}
     1892
     1893//The total number of operations is 4.0
     1894template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1)
     1895{
     1896        return avx_general_combine256(_mm_abs_epi32(avx_select_hi128(arg1)), _mm_abs_epi32(avx_select_lo128(arg1)));
     1897}
     1898
     1899//The total number of operations is 13.0
     1900template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1)
     1901{
     1902        bitblock256_t gtMask = simd256<64>::gt(arg1, simd256<64>::constant<0>());
     1903        return simd256<1>::ifh(gtMask, arg1, simd256<64>::sub(gtMask, arg1));
     1904}
     1905
     1906//The total number of operations is 69.0
     1907template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1)
     1908{
     1909        bitblock256_t eqMask = simd256<128>::eq(simd256<1>::ifh(simd256<128>::himask(), simd256<(64)>::abs(arg1), arg1), arg1);
     1910        return simd256<1>::ifh(eqMask, arg1, simd256<128>::sub(eqMask, arg1));
     1911}
     1912
     1913//The total number of operations is 204.333333333
     1914template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1)
     1915{
     1916        bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1);
     1917        return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1));
     1918}
     1919
     1920//The total number of operations is 2.0
     1921template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1922{
     1923        return simd_not(simd_xor(arg1, arg2));
    19301924}
    19311925
    19321926//The total number of operations is 14.0
    1933 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1934 {
    1935         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
    1936 }
    1937 
    1938 //The total number of operations is 5.0
    1939 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1940 {
    1941         return avx_general_combine256(_mm_max_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1942 }
    1943 
    1944 //The total number of operations is 5.0
    1945 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1946 {
    1947         return avx_general_combine256(_mm_max_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1948 }
    1949 
    1950 //The total number of operations is 5.0
    1951 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1952 {
    1953         return avx_general_combine256(_mm_max_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1954 }
    1955 
    1956 //The total number of operations is 11.0
    1957 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1958 {
    1959         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
    1960         return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1961 }
    1962 
    1963 //The total number of operations is 46.6666666667
    1964 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1965 {
    1966         bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
    1967         bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
    1968         bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
    1969         return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1970 }
    1971 
    1972 //The total number of operations is 131.0
    1973 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1974 {
    1975         bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
    1976         bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
    1977         bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
    1978         return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1927template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1928{
     1929        bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);
     1930        bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));
     1931        bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);
     1932        return simd_or(loMask, hiMask);
     1933}
     1934
     1935//The total number of operations is 17.0
     1936template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1937{
     1938        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
     1939}
     1940
     1941//The total number of operations is 5.0
     1942template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1943{
     1944        return avx_general_combine256(_mm_cmpeq_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1945}
     1946
     1947//The total number of operations is 5.0
     1948template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1949{
     1950        return avx_general_combine256(_mm_cmpeq_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1951}
     1952
     1953//The total number of operations is 5.0
     1954template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1955{
     1956        return avx_general_combine256(_mm_cmpeq_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1957}
     1958
     1959//The total number of operations is 5.0
     1960template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1961{
     1962        return avx_general_combine256(_mm_cmpeq_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     1963}
     1964
     1965//The total number of operations is 23.6666666667
     // Equality on 128-bit fields, derived from the 64-bit eq:
     //   tmpAns - per-64-bit comparison result;
     //   loMask - tmpAns AND-ed with itself shifted right by 64 inside each 128-bit
     //            field, so the low half is set only if both halves compared equal;
     //   hiMask - loMask shifted left by 64 to copy that verdict into the high half.
     // The OR of loMask and hiMask is returned.
     1966template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1967{
     1968        bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
     1969        bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
     1970        bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
     1971        return simd_or(loMask, hiMask);
     1972}
     1973
     1974//The total number of operations is 53.6666666667
     // Equality on the whole 256-bit block, derived from the 128-bit eq:
     //   tmpAns - per-128-bit comparison result;
     //   loMask - tmpAns AND-ed with itself shifted right by 128, so the low half is
     //            set only if both 128-bit halves compared equal;
     //   hiMask - loMask shifted left by 128 to copy that verdict into the high half.
     // The OR of loMask and hiMask is returned.
     1975template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1976{
     1977        bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
     1978        bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
     1979        bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
     1980        return simd_or(loMask, hiMask);
     1981}
     1982
     1983//The total number of operations is 7.0
     // Arithmetic right shift on 2-bit fields. A shift of 0 returns arg1 unchanged.
     // Every non-zero sh is handled as a shift by one: the high bit of each field is
     // kept (himask) and OR-ed with the field shifted right by 1, which copies the
     // high bit into the low bit.
     1984template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
     1985{
     1986        return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
     1987}
     1988
     1989//The total number of operations is 17.5
     // Arithmetic right shift on 4-bit fields, composed from 2-bit operations:
     //   - high half of each field: 2-bit srai of arg1 by min(sh, 2), masked with
     //     simd256<4>::himask;
     //   - remainder: for sh <= 2 the 4-bit srli by sh; otherwise the 4-bit srli by
     //     2 followed by a 2-bit srai by sh-2.
     // The two parts are OR-ed together.
     // NOTE(review): both arms of the ?: are instantiated, so srai<(sh-(2))> is also
     // named when sh < 2 -- confirm the resulting template argument is acceptable
     // for a uint16_t parameter on the compilers in use.
     1990template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
     1991{
     1992        return simd_or(simd_and(simd256<4>::himask(), simd256<(2)>::srai<((sh < (2)) ? sh : (2))>(arg1)), ((sh <= (2)) ? simd256<4>::srli<sh>(arg1) : simd256<(2)>::srai<(sh-(2))>(simd256<4>::srli<(2)>(arg1))));
     1993}
     1994
     1995//The total number of operations is 12.0
     // Arithmetic right shift on 8-bit fields, emulated with a logical shift.
     // The shift count is clamped to 7 when sh >= 8; sh is a uint16_t, so the
     // (sh < 0) test can never be true and the count is otherwise sh itself.
     //   tmp    - srli of arg1 by the clamped count;
     //   result - tmp OR-ed with (0 - (tmp & bit)), where bit is the single-bit
     //            constant 1 << (8 - count - 1), i.e. the position the original top
     //            bit lands on. When that bit is set, the 8-bit subtraction from
     //            zero produces ones from that position upward (sign extension).
     1996template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
     1997{
     1998        bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1999        return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     2000}
     2001
     2002//The total number of operations is 4.0
     // Arithmetic right shift on 16-bit fields: _mm_srai_epi16 with count sh is
     // applied to each part selected by avx_select_hi128 / avx_select_lo128
     // (presumably the 128-bit halves) and the results are rejoined with
     // avx_general_combine256.
     2003template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
     2004{
     2005        return avx_general_combine256(_mm_srai_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
     2006}
     2007
     2008//The total number of operations is 4.0
     // Arithmetic right shift on 32-bit fields: _mm_srai_epi32 with count sh is
     // applied to each part selected by avx_select_hi128 / avx_select_lo128
     // (presumably the 128-bit halves) and the results are rejoined with
     // avx_general_combine256.
     2009template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
     2010{
     2011        return avx_general_combine256(_mm_srai_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
     2012}
     2013
     2014//The total number of operations is 12.0
     // Arithmetic right shift on 64-bit fields, composed from 32-bit operations:
     //   - high half of each field: 32-bit srai of arg1 by min(sh, 32), masked with
     //     simd256<64>::himask;
     //   - remainder: for sh <= 32 the 64-bit srli by sh; otherwise the 64-bit srli
     //     by 32 followed by a 32-bit srai by sh-32.
     // The two parts are OR-ed together.
     // NOTE(review): srai<(sh-(32))> is also named when sh < 32 because both arms
     // of the ?: are instantiated -- confirm this is acceptable for uint16_t sh.
     2015template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
     2016{
     2017        return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));
     2018}
     2019
     2020//The total number of operations is 28.3333333333
     // Arithmetic right shift on 128-bit fields, composed from 64-bit operations:
     //   - high half of each field: 64-bit srai of arg1 by min(sh, 64), masked with
     //     simd256<128>::himask;
     //   - remainder: for sh <= 64 the 128-bit srli by sh; otherwise the 128-bit
     //     srli by 64 followed by a 64-bit srai by sh-64.
     // The two parts are OR-ed together.
     // NOTE(review): srai<(sh-(64))> is also named when sh < 64 because both arms
     // of the ?: are instantiated -- confirm this is acceptable for uint16_t sh.
     2021template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
     2022{
     2023        return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));
     2024}
     2025
     2026//The total number of operations is 58.5
     // Arithmetic right shift on the whole 256-bit block, composed from 128-bit
     // operations:
     //   - high half: 128-bit srai of arg1 by min(sh, 128), masked with
     //     simd256<256>::himask;
     //   - remainder: for sh <= 128 the 256-bit srli by sh; otherwise the 256-bit
     //     srli by 128 followed by a 128-bit srai by sh-128.
     // The two parts are OR-ed together.
     // NOTE(review): srai<(sh-(128))> is also named when sh < 128 because both arms
     // of the ?: are instantiated -- confirm this is acceptable for uint16_t sh.
     2027template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
     2028{
     2029        return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));
    19792030}
    19802031
     
    20462097}
    20472098
    2048 //The total number of operations is 2.0
    // Bitwise equality: the complement of the XOR, so a result bit is 1 exactly
    // where arg1 and arg2 hold the same bit.
    2049 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2050 {
    2051         return simd_not(simd_xor(arg1, arg2));
    2052 }
    2053 
    2054 //The total number of operations is 14.0
    // Equality on 2-bit fields, derived from the 1-bit eq:
    //   tmpAns - per-bit comparison result;
    //   loMask - tmpAns AND-ed with itself shifted right by 1 inside each 2-bit
    //            field, so the low bit is set only if both bits compared equal;
    //   hiMask - loMask shifted left by 1 to copy that verdict into the high bit.
    // The OR of loMask and hiMask is returned.
    2055 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2056 {
    2057         bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);
    2058         bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));
    2059         bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);
    2060         return simd_or(loMask, hiMask);
    2061 }
    2062 
    2063 //The total number of operations is 17.0
    // Equality on 4-bit fields, built from the 8-bit eq: the operands are compared
    // once with only the high nibbles kept (simd256<8>::himask) and once with only
    // the low nibbles kept (simd256<8>::lomask); each 8-bit result is masked back
    // to its own nibble and the two masked results are OR-ed together.
    2064 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2065 {
    2066         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
    2067 }
    2068 
    2069 //The total number of operations is 5.0
    // Equality on 8-bit fields: _mm_cmpeq_epi8 on each part selected by
    // avx_select_hi128 / avx_select_lo128 (presumably the 128-bit halves), rejoined
    // with avx_general_combine256.
    2070 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2071 {
    2072         return avx_general_combine256(_mm_cmpeq_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2073 }
    2074 
    2075 //The total number of operations is 5.0
    // Equality on 16-bit fields: _mm_cmpeq_epi16 on each part selected by
    // avx_select_hi128 / avx_select_lo128 (presumably the 128-bit halves), rejoined
    // with avx_general_combine256.
    2076 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2077 {
    2078         return avx_general_combine256(_mm_cmpeq_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2079 }
    2080 
    2081 //The total number of operations is 5.0
    // Equality on 32-bit fields: _mm_cmpeq_epi32 on each part selected by
    // avx_select_hi128 / avx_select_lo128 (presumably the 128-bit halves), rejoined
    // with avx_general_combine256.
    2082 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2083 {
    2084         return avx_general_combine256(_mm_cmpeq_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2085 }
    2086 
    2087 //The total number of operations is 5.0
    // Equality on 64-bit fields: _mm_cmpeq_epi64 on each part selected by
    // avx_select_hi128 / avx_select_lo128 (presumably the 128-bit halves), rejoined
    // with avx_general_combine256.
    2088 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2089 {
    2090         return avx_general_combine256(_mm_cmpeq_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    2091 }
    2092 
    2093 //The total number of operations is 23.6666666667
    // Equality on 128-bit fields, derived from the 64-bit eq:
    //   tmpAns - per-64-bit comparison result;
    //   loMask - tmpAns AND-ed with itself shifted right by 64 inside each 128-bit
    //            field, so the low half is set only if both halves compared equal;
    //   hiMask - loMask shifted left by 64 to copy that verdict into the high half.
    // The OR of loMask and hiMask is returned.
    2094 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2095 {
    2096         bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
    2097         bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
    2098         bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
    2099         return simd_or(loMask, hiMask);
    2100 }
    2101 
    2102 //The total number of operations is 53.6666666667
    // Equality on the whole 256-bit block, derived from the 128-bit eq:
    //   tmpAns - per-128-bit comparison result;
    //   loMask - tmpAns AND-ed with itself shifted right by 128, so the low half is
    //            set only if both 128-bit halves compared equal;
    //   hiMask - loMask shifted left by 128 to copy that verdict into the high half.
    // The OR of loMask and hiMask is returned.
    2103 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2104 {
    2105         bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
    2106         bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
    2107         bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
    2108         return simd_or(loMask, hiMask);
    2109 }
    2110 
    21112099//The total number of operations is 0
    21122100template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask()
     
    // High-half mask for 32-bit fields, produced as a per-field constant.
    // The two return lines are the old and new sides of this diff hunk: -65536 and
    // 4294901760 (0xFFFF0000) are the same 32-bit pattern, upper 16 bits set; the
    // new form avoids a negative template argument.
    21362124template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask()
    21372125{
    2138         return simd256<32>::constant<-65536>();
     2126        return simd256<32>::constant<4294901760ULL>();
    21392127}
    21402128
     
    // High-half mask for 64-bit fields, built with _mm256_set_epi32, whose
    // arguments run from the highest 32-bit element to the lowest. The alternating
    // (all-ones, 0) pattern sets the upper 32 bits of every 64-bit field.
    // The two return lines are the old and new sides of this diff hunk: the new
    // side writes the all-ones word as (4294967296ULL)-1 (0xFFFFFFFF) cast to
    // int32_t instead of the literal -1.
    21422130template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask()
    21432131{
    2144         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0))));
     2132        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0))));
    21452133}
    21462134
     
    // High-half mask for 128-bit fields, built with _mm256_set_epi32, whose
    // arguments run from the highest 32-bit element to the lowest. The pattern
    // (ones, ones, 0, 0) repeated twice sets the upper 64 bits of each 128-bit
    // field.
    // The two return lines are the old and new sides of this diff hunk: the new
    // side writes the all-ones word as (4294967296ULL)-1 (0xFFFFFFFF) cast to
    // int32_t instead of the literal -1.
    21482136template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask()
    21492137{
    2150         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0))));
     2138        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(0))));
    21512139}
    21522140
     
    // High-half mask for the whole 256-bit block, built with _mm256_set_epi32,
    // whose arguments run from the highest 32-bit element to the lowest. The four
    // highest elements are all-ones and the four lowest are 0, so the upper 128
    // bits are set.
    // The two return lines are the old and new sides of this diff hunk: the new
    // side writes the all-ones word as (4294967296ULL)-1 (0xFFFFFFFF) cast to
    // int32_t instead of the literal -1.
    21542142template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask()
    21552143{
    2156         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));
     2144        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));
    21572145}
    21582146
     
    22222210}
    22232211
    2224 //The total number of operations is 19.0
    // Absolute value on 2-bit fields, written as a bit-level select (ifh
    // presumably takes the second argument where the mask bit is set and the third
    // elsewhere -- confirm against simd256<1>::ifh):
    //   - high bit of each field: arg1 AND (NOT arg1 shifted left by 1), i.e. the
    //     high bit stays set only when the low bit is clear;
    //   - low bit of each field: taken from arg1 unchanged.
    2225 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)
    2226 {
    2227         return simd256<1>::ifh(simd256<2>::himask(), simd_and(arg1, simd256<256>::slli<1>(simd_not(arg1))), arg1);
    2228 }
    2229 
    2230 //The total number of operations is 39.0
    // Absolute value on 4-bit fields. gtMask is the result of gt(arg1, 0); the
    // select keeps arg1 under gtMask and otherwise uses sub(gtMask, arg1). Assuming
    // gt yields all-zero fields where the test fails, that second value is
    // 0 - arg1, the negation (TODO confirm gt's mask convention).
    2231 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1)
    2232 {
    2233         bitblock256_t gtMask = simd256<4>::gt(arg1, simd256<4>::constant<0>());
    2234         return simd256<1>::ifh(gtMask, arg1, simd256<4>::sub(gtMask, arg1));
    2235 }
    2236 
    2237 //The total number of operations is 4.0
    // Absolute value on 8-bit fields: _mm_abs_epi8 on each part selected by
    // avx_select_hi128 / avx_select_lo128 (presumably the 128-bit halves), rejoined
    // with avx_general_combine256.
    2238 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1)
    2239 {
    2240         return avx_general_combine256(_mm_abs_epi8(avx_select_hi128(arg1)), _mm_abs_epi8(avx_select_lo128(arg1)));
    2241 }
    2242 
    2243 //The total number of operations is 4.0
    // Absolute value on 16-bit fields: _mm_abs_epi16 on each part selected by
    // avx_select_hi128 / avx_select_lo128 (presumably the 128-bit halves), rejoined
    // with avx_general_combine256.
    2244 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1)
    2245 {
    2246         return avx_general_combine256(_mm_abs_epi16(avx_select_hi128(arg1)), _mm_abs_epi16(avx_select_lo128(arg1)));
    2247 }
    2248 
    2249 //The total number of operations is 4.0
    // Absolute value on 32-bit fields: _mm_abs_epi32 on each part selected by
    // avx_select_hi128 / avx_select_lo128 (presumably the 128-bit halves), rejoined
    // with avx_general_combine256.
    2250 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1)
    2251 {
    2252         return avx_general_combine256(_mm_abs_epi32(avx_select_hi128(arg1)), _mm_abs_epi32(avx_select_lo128(arg1)));
    2253 }
    2254 
    2255 //The total number of operations is 13.0
    // Absolute value on 64-bit fields. gtMask is the result of gt(arg1, 0); the
    // select keeps arg1 under gtMask and otherwise uses sub(gtMask, arg1). Assuming
    // gt yields all-zero fields where the test fails, that second value is
    // 0 - arg1, the negation (TODO confirm gt's mask convention).
    2256 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1)
    2257 {
    2258         bitblock256_t gtMask = simd256<64>::gt(arg1, simd256<64>::constant<0>());
    2259         return simd256<1>::ifh(gtMask, arg1, simd256<64>::sub(gtMask, arg1));
    2260 }
    2261 
    2262 //The total number of operations is 69.0
    // Absolute value on 128-bit fields, using the 64-bit abs as a sign probe.
    // A candidate is formed by replacing the high 64 bits of each field with their
    // 64-bit abs (low 64 bits kept from arg1); eqMask records, per 128-bit field,
    // whether that candidate equals arg1. The select keeps arg1 under eqMask and
    // otherwise uses sub(eqMask, arg1), which is 0 - arg1 where eqMask is zero.
    2263 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1)
    2264 {
    2265         bitblock256_t eqMask = simd256<128>::eq(simd256<1>::ifh(simd256<128>::himask(), simd256<(64)>::abs(arg1), arg1), arg1);
    2266         return simd256<1>::ifh(eqMask, arg1, simd256<128>::sub(eqMask, arg1));
    2267 }
    2268 
    2269 //The total number of operations is 204.333333333
    2270 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1)
    2271 {
    2272         bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1);
    2273         return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1));
     2212//The total number of operations is 1.0
     // Unsigned maximum on 1-bit fields: the larger of two bits is their OR.
     2213template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2214{
     2215        return simd_or(arg1, arg2);
     2216}
     2217
     2218//The total number of operations is 24.0
     // Unsigned maximum on 2-bit fields, derived from the 1-bit umax:
     //   tmpAns  - per-bit umax of the operands;
     //   eqMask1 - per-bit eq(tmpAns, arg1) shifted right by 1 inside each field,
     //             bringing the high-bit verdict down to the low-bit position;
     //   eqMask2 - the same for arg2.
     // The high bit of the result comes from tmpAns. The low bit comes from arg2
     // unless eqMask1 is set, in which case it comes from tmpAns when eqMask2 is
     // also set and from arg1 otherwise.
     2219template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2220{
     2221        bitblock256_t tmpAns = simd256<(1)>::umax(arg1, arg2);
     2222        bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
     2223        bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
     2224        return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     2225}
     2226
     2227//The total number of operations is 14.0
     // Unsigned maximum on 4-bit fields, built from the 8-bit umax:
     //   - high nibbles: 8-bit umax of the full bytes, masked with
     //     simd256<8>::himask;
     //   - low nibbles: 8-bit umax of the operands after both are masked with
     //     simd256<8>::lomask.
     // The two parts are OR-ed together.
     2228template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2229{
     2230        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
     2231}
     2232
     2233//The total number of operations is 5.0
     // Unsigned maximum on 8-bit fields: _mm_max_epu8 on each part selected by
     // avx_select_hi128 / avx_select_lo128 (presumably the 128-bit halves), rejoined
     // with avx_general_combine256.
     2234template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2235{
     2236        return avx_general_combine256(_mm_max_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2237}
     2238
     2239//The total number of operations is 5.0
     // Unsigned maximum on 16-bit fields: _mm_max_epu16 on each part selected by
     // avx_select_hi128 / avx_select_lo128 (presumably the 128-bit halves), rejoined
     // with avx_general_combine256.
     2240template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2241{
     2242        return avx_general_combine256(_mm_max_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2243}
     2244
     2245//The total number of operations is 5.0
     // Unsigned maximum on 32-bit fields: _mm_max_epu32 on each part selected by
     // avx_select_hi128 / avx_select_lo128 (presumably the 128-bit halves), rejoined
     // with avx_general_combine256.
     2246template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2247{
     2248        return avx_general_combine256(_mm_max_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2249}
     2250
     2251//The total number of operations is 11.0
     // Unsigned maximum on 64-bit fields, obtained from simd256<64>::max
     // (presumably the signed max): both operands have their top bit flipped by
     // XOR with 9223372036854775808 (0x8000000000000000), the max is taken, and the
     // top bit is flipped back in the result.
     2252template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2253{
     2254        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     2255        return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     2256}
     2257
     2258//The total number of operations is 46.6666666667
     // Unsigned maximum on 128-bit fields, derived from the 64-bit umax:
     //   tmpAns  - per-64-bit umax of the operands;
     //   eqMask1 - 64-bit eq(tmpAns, arg1) shifted right by 64 inside each field,
     //             bringing the high-half verdict down over the low half;
     //   eqMask2 - the same for arg2.
     // The high half of the result comes from tmpAns. The low half comes from arg2
     // unless eqMask1 is set, in which case it comes from tmpAns when eqMask2 is
     // also set and from arg1 otherwise.
     2259template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2260{
     2261        bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
     2262        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
     2263        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
     2264        return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     2265}
     2266
     2267//The total number of operations is 131.0
     // Unsigned maximum on the whole 256-bit block, derived from the 128-bit umax:
     //   tmpAns  - per-128-bit umax of the operands;
     //   eqMask1 - 128-bit eq(tmpAns, arg1) shifted right by 128, bringing the
     //             high-half verdict down over the low half;
     //   eqMask2 - the same for arg2.
     // The high half of the result comes from tmpAns. The low half comes from arg2
     // unless eqMask1 is set, in which case it comes from tmpAns when eqMask2 is
     // also set and from arg1 otherwise.
     2268template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2269{
     2270        bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
     2271        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
     2272        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
     2273        return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    22742274}
    22752275
     
    // Returns true when every bit of arg1 is set: the complement of arg1 is tested
    // against an all-ones block, and _mm256_testz_si256 yields 1 only when the AND
    // of its two operands is entirely zero. (Old side of the diff hunk; the all-ones
    // block is spelled constant<-1> here.)
    35023502IDISA_ALWAYS_INLINE bool bitblock256::all(bitblock256_t arg1)
    35033503{
    3504         return _mm256_testz_si256(((__m256i)(simd_not(arg1))), ((__m256i)(simd256<8>::constant<-1>()))) == 1;
    3505 }
    3506 
    3507 //The total number of operations is 1.0
    // Returns true when at least one bit of arg1 is set: _mm256_testz_si256(arg1,
    // arg1) yields 1 only for an all-zero block, so a result of 0 means non-zero.
    3508 IDISA_ALWAYS_INLINE bool bitblock256::any(bitblock256_t arg1)
    3509 {
    3510         return _mm256_testz_si256(((__m256i)(arg1)), ((__m256i)(arg1))) == 0;
    3511 }
    3512 
    3513 //The total number of operations is 1.0
    // Loads one 256-bit block from arg1 via _mm256_load_ps, the aligned load
    // (the intrinsic requires a 32-byte-aligned address).
    3514 IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_aligned(const bitblock256_t* arg1)
    3515 {
    3516         return _mm256_load_ps((float*)(arg1));
    3517 }
    3518 
    3519 //The total number of operations is 1.0
    3520 IDISA_ALWAYS_INLINE void bitblock256::store_unaligned(bitblock256_t arg1, bitblock256_t* arg2)
    3521 {
    3522         _mm256_storeu_ps((float*)(arg2), arg1);
     3504        return _mm256_testz_si256(((__m256i)(simd_not(arg1))), ((__m256i)(simd256<8>::constant<255>()))) == 1;
    35233505}
    35243506
     
    35303512
    35313513//The total number of operations is 1.0
     // Returns true when at least one bit of arg1 is set: _mm256_testz_si256(arg1,
     // arg1) yields 1 only for an all-zero block, so a result of 0 means non-zero.
     3514IDISA_ALWAYS_INLINE bool bitblock256::any(bitblock256_t arg1)
     3515{
     3516        return _mm256_testz_si256(((__m256i)(arg1)), ((__m256i)(arg1))) == 0;
     3517}
     3518
     3519//The total number of operations is 1.0
     // Loads one 256-bit block from arg1 via _mm256_load_ps, the aligned load
     // (the intrinsic requires a 32-byte-aligned address).
     3520IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_aligned(const bitblock256_t* arg1)
     3521{
     3522        return _mm256_load_ps((float*)(arg1));
     3523}
     3524
     3525//The total number of operations is 1.0
    35323526IDISA_ALWAYS_INLINE void bitblock256::store_aligned(bitblock256_t arg1, bitblock256_t* arg2)
    35333527{
     
    35353529}
    35363530
     3531//The total number of operations is 1.0
     // Stores the 256-bit block arg1 to the address arg2 via _mm256_storeu_ps, the
     // store variant that places no alignment requirement on the destination.
     3532IDISA_ALWAYS_INLINE void bitblock256::store_unaligned(bitblock256_t arg1, bitblock256_t* arg2)
     3533{
     3534        _mm256_storeu_ps((float*)(arg2), arg1);
     3535}
     3536
    35373537#endif
  • trunk/lib/idisa_cpp/idisa_avx2.cpp

    r3526 r3576  
    5858        static IDISA_ALWAYS_INLINE bitblock256_t srl(bitblock256_t arg1, bitblock256_t shift_mask);
    5959        static IDISA_ALWAYS_INLINE bitblock256_t lomask();
     60        static IDISA_ALWAYS_INLINE bitblock256_t lt(bitblock256_t arg1, bitblock256_t arg2);
    6061        static IDISA_ALWAYS_INLINE bitblock256_t umin(bitblock256_t arg1, bitblock256_t arg2);
    6162        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock256_t constant();
    6263        static IDISA_ALWAYS_INLINE bitblock256_t min(bitblock256_t arg1, bitblock256_t arg2);
    63         static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2);
    6464        static IDISA_ALWAYS_INLINE bitblock256_t umax(bitblock256_t arg1, bitblock256_t arg2);
    6565        static IDISA_ALWAYS_INLINE bitblock256_t abs(bitblock256_t arg1);
     
    6767        static IDISA_ALWAYS_INLINE bitblock256_t any(bitblock256_t arg1);
    6868        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srai(bitblock256_t arg1);
    69         static IDISA_ALWAYS_INLINE bitblock256_t lt(bitblock256_t arg1, bitblock256_t arg2);
     69        static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2);
    7070        static IDISA_ALWAYS_INLINE bitblock256_t ugt(bitblock256_t arg1, bitblock256_t arg2);
    7171};
     
    134134IDISA_ALWAYS_INLINE bitblock256_t simd_nor(bitblock256_t arg1, bitblock256_t arg2);
    135135IDISA_ALWAYS_INLINE bitblock256_t simd_not(bitblock256_t arg1);
     136IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2);
    136137IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2);
    137 IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2);
    138138IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2);
    139139IDISA_ALWAYS_INLINE bitblock256_t simd_xor(bitblock256_t arg1, bitblock256_t arg2);
     
    281281template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    282282template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
    283 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
    284 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
    285 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
    286 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
    287 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
    288 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
    289 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
    290 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
    291283template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1);
    292284template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add_hl(bitblock256_t arg1);
     
    301293template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srl(bitblock256_t arg1, bitblock256_t shift_mask);
    302294template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srl(bitblock256_t arg1, bitblock256_t shift_mask);
    303 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
    304 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
    305 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
    306 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
    307 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
    308 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
    309 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
    310 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
    311295template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant();
    312296template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant();
     
    327311template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::min(bitblock256_t arg1, bitblock256_t arg2);
    328312template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::min(bitblock256_t arg1, bitblock256_t arg2);
     313template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
     314template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
     315template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
     316template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
     317template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
     318template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
     319template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
     320template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
    329321template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2);
    330322template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2);
     
    336328template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2);
    337329template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2);
    338 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
    339 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
    340 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
    341 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
    342 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
    343 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
    344 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
    345 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
    346 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
     330template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1);
     331template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1);
     332template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1);
     333template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1);
     334template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1);
     335template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1);
     336template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1);
     337template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1);
     338template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2);
     339template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2);
     340template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2);
     341template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2);
     342template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2);
     343template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2);
     344template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2);
     345template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2);
     346template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2);
     347template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
     348template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
     349template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
     350template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
     351template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
     352template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
     353template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
     354template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
    347355template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2);
    348356template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2);
     
    354362template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2);
    355363template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2);
    356 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2);
    357 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2);
    358 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2);
    359 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2);
    360 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2);
    361 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2);
    362 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2);
    363 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2);
    364 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2);
    365364template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask();
    366365template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask();
     
    380379template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2);
    381380template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2);
    382 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1);
    383 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1);
    384 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1);
    385 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1);
    386 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1);
    387 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1);
    388 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1);
    389 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1);
     381template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
     382template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
     383template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
     384template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
     385template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
     386template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
     387template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
     388template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
     389template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
    390390template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
    391391template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
     
    // Bitwise complement of arg1, computed as XOR with a block whose 32-bit fields
    // are all ones. The two return lines are the old and new sides of this diff
    // hunk: -1 and 4294967295 (0xFFFFFFFF) are the same 32-bit pattern; the new form
    // avoids a negative template argument.
    619619IDISA_ALWAYS_INLINE bitblock256_t simd_not(bitblock256_t arg1)
    620620{
    621         return simd_xor(arg1, simd256<32>::constant<-1>());
     621        return simd_xor(arg1, simd256<32>::constant<4294967295ULL>());
     622}
     623
     624//The total number of operations is 1.0
     625IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2)
     626{
     627        return _mm256_andnot_si256(arg2, arg1);
    622628}
    623629
     
    626632{
    627633        return _mm256_or_si256(arg1, arg2);
    628 }
    629 
    630 //The total number of operations is 1.0
    631 IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2)
    632 {
    633         return _mm256_andnot_si256(arg2, arg1);
    634634}
    635635
     
    16401640}
    16411641
    1642 //The total number of operations is 4.0
    1643 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
    1644 {
    1645         return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
    1646 }
    1647 
    1648 //The total number of operations is 10.0
    1649 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
    1650 {
    1651         bitblock256_t tmp = simd256<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1652         return simd_or(tmp, simd256<4>::sub(simd256<4>::constant<0>(), simd_and(simd256<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1653 }
    1654 
    1655 //The total number of operations is 5.0
    1656 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
    1657 {
    1658         bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1659         return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1660 }
    1661 
    1662 //The total number of operations is 1.0
    1663 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
    1664 {
    1665         return _mm256_srai_epi16(arg1, (int32_t)(sh));
    1666 }
    1667 
    1668 //The total number of operations is 1.0
    1669 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
    1670 {
    1671         return _mm256_srai_epi32(arg1, (int32_t)(sh));
    1672 }
    1673 
    1674 //The total number of operations is 4.5
    1675 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
    1676 {
    1677         return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));
    1678 }
    1679 
    1680 //The total number of operations is 14.0833333333
    1681 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
    1682 {
    1683         return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));
    1684 }
    1685 
    1686 //The total number of operations is 32.625
    1687 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
    1688 {
    1689         return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));
    1690 }
    1691 
    16921642//The total number of operations is 3.0
    16931643template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1)
     
    17661716
    17671717//The total number of operations is 0
    1768 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
    1769 {
    1770         return simd256<2>::constant<(1)>();
    1771 }
    1772 
    1773 //The total number of operations is 0
    1774 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
    1775 {
    1776         return simd256<4>::constant<(3)>();
    1777 }
    1778 
    1779 //The total number of operations is 0
    1780 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
    1781 {
    1782         return simd256<8>::constant<(15)>();
    1783 }
    1784 
    1785 //The total number of operations is 0
    1786 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
    1787 {
    1788         return simd256<16>::constant<(255)>();
    1789 }
    1790 
    1791 //The total number of operations is 0
    1792 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
    1793 {
    1794         return simd256<32>::constant<(65535)>();
    1795 }
    1796 
    1797 //The total number of operations is 0
    1798 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
    1799 {
    1800         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1))));
    1801 }
    1802 
    1803 //The total number of operations is 0
    1804 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
    1805 {
    1806         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1))));
    1807 }
    1808 
    1809 //The total number of operations is 0
    1810 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
    1811 {
    1812         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1))));
    1813 }
    1814 
    1815 //The total number of operations is 0
    18161718template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant()
    18171719{
    1818         return simd256<32>::constant<(-1*val)>();
     1720        return simd256<2>::constant<((val+val)+val)>();
    18191721}
    18201722
     
    19301832}
    19311833
     1834//The total number of operations is 0
     1835template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
     1836{
     1837        return simd256<2>::constant<(1)>();
     1838}
     1839
     1840//The total number of operations is 0
     1841template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
     1842{
     1843        return simd256<4>::constant<(3)>();
     1844}
     1845
     1846//The total number of operations is 0
     1847template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
     1848{
     1849        return simd256<8>::constant<(15)>();
     1850}
     1851
     1852//The total number of operations is 0
     1853template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
     1854{
     1855        return simd256<16>::constant<(255)>();
     1856}
     1857
     1858//The total number of operations is 0
     1859template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
     1860{
     1861        return simd256<32>::constant<(65535)>();
     1862}
     1863
     1864//The total number of operations is 0
     1865template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
     1866{
     1867        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)))));
     1868}
     1869
     1870//The total number of operations is 0
     1871template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
     1872{
     1873        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)))));
     1874}
     1875
     1876//The total number of operations is 0
     1877template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
     1878{
     1879        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)))));
     1880}
     1881
    19321882//The total number of operations is 1.0
    19331883template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)
     
    19911941}
    19921942
    1993 //The total number of operations is 1.0
    1994 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1995 {
    1996         return simd_or(arg1, arg2);
    1997 }
    1998 
    1999 //The total number of operations is 16.0
    2000 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2001 {
    2002         return simd_or(simd_and(simd256<(4)>::himask(), simd256<(4)>::umax(arg1, arg2)), simd256<(4)>::umax(simd_and(simd256<(4)>::lomask(), arg1), simd_and(simd256<(4)>::lomask(), arg2)));
    2003 }
    2004 
    2005 //The total number of operations is 6.0
    2006 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2007 {
    2008         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
    2009 }
    2010 
    2011 //The total number of operations is 1.0
    2012 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2013 {
    2014         return _mm256_max_epu8(arg1, arg2);
    2015 }
    2016 
    2017 //The total number of operations is 1.0
    2018 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2019 {
    2020         return _mm256_max_epu16(arg1, arg2);
    2021 }
    2022 
    2023 //The total number of operations is 1.0
    2024 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2025 {
    2026         return _mm256_max_epu32(arg1, arg2);
    2027 }
    2028 
    2029 //The total number of operations is 7.0
    2030 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2031 {
    2032         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
    2033         return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    2034 }
    2035 
    2036 //The total number of operations is 28.6666666667
    2037 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2038 {
    2039         bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
    2040         bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
    2041         bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
    2042         return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    2043 }
    2044 
    2045 //The total number of operations is 84.0
    2046 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
    2047 {
    2048         bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
    2049         bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
    2050         bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
    2051         return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1943//The total number of operations is 14.5
     1944template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)
     1945{
     1946        return simd256<1>::ifh(simd256<2>::himask(), simd_and(arg1, simd256<256>::slli<1>(simd_not(arg1))), arg1);
     1947}
     1948
     1949//The total number of operations is 19.0
     1950template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1)
     1951{
     1952        bitblock256_t gtMask = simd256<4>::gt(arg1, simd256<4>::constant<0>());
     1953        return simd256<1>::ifh(gtMask, arg1, simd256<4>::sub(gtMask, arg1));
     1954}
     1955
     1956//The total number of operations is 1.0
     1957template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1)
     1958{
     1959        return _mm256_abs_epi8(arg1);
     1960}
     1961
     1962//The total number of operations is 1.0
     1963template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1)
     1964{
     1965        return _mm256_abs_epi16(arg1);
     1966}
     1967
     1968//The total number of operations is 1.0
     1969template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1)
     1970{
     1971        return _mm256_abs_epi32(arg1);
     1972}
     1973
     1974//The total number of operations is 5.0
     1975template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1)
     1976{
     1977        bitblock256_t gtMask = simd256<64>::gt(arg1, simd256<64>::constant<0>());
     1978        return simd256<1>::ifh(gtMask, arg1, simd256<64>::sub(gtMask, arg1));
     1979}
     1980
     1981//The total number of operations is 37.0
     1982template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1)
     1983{
     1984        bitblock256_t eqMask = simd256<128>::eq(simd256<1>::ifh(simd256<128>::himask(), simd256<(64)>::abs(arg1), arg1), arg1);
     1985        return simd256<1>::ifh(eqMask, arg1, simd256<128>::sub(eqMask, arg1));
     1986}
     1987
     1988//The total number of operations is 120.833333333
     1989template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1)
     1990{
     1991        bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1);
     1992        return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1));
     1993}
     1994
     1995//The total number of operations is 2.0
     1996template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
     1997{
     1998        return simd_not(simd_xor(arg1, arg2));
     1999}
     2000
     2001//The total number of operations is 8.0
     2002template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2003{
     2004        bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);
     2005        bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));
     2006        bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);
     2007        return simd_or(loMask, hiMask);
     2008}
     2009
     2010//The total number of operations is 9.0
     2011template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2012{
     2013        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
     2014}
     2015
     2016//The total number of operations is 1.0
     2017template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2018{
     2019        return _mm256_cmpeq_epi8(arg1, arg2);
     2020}
     2021
     2022//The total number of operations is 1.0
     2023template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2024{
     2025        return _mm256_cmpeq_epi16(arg1, arg2);
     2026}
     2027
     2028//The total number of operations is 1.0
     2029template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2030{
     2031        return _mm256_cmpeq_epi32(arg1, arg2);
     2032}
     2033
     2034//The total number of operations is 1.0
     2035template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2036{
     2037        return _mm256_cmpeq_epi64(arg1, arg2);
     2038}
     2039
     2040//The total number of operations is 13.6666666667
     2041template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2042{
     2043        bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
     2044        bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
     2045        bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
     2046        return simd_or(loMask, hiMask);
     2047}
     2048
     2049//The total number of operations is 34.6666666667
     2050template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
     2051{
     2052        bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
     2053        bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
     2054        bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
     2055        return simd_or(loMask, hiMask);
     2056}
     2057
     2058//The total number of operations is 4.0
     2059template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
     2060{
     2061        return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
     2062}
     2063
     2064//The total number of operations is 10.0
     2065template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
     2066{
     2067        bitblock256_t tmp = simd256<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     2068        return simd_or(tmp, simd256<4>::sub(simd256<4>::constant<0>(), simd_and(simd256<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     2069}
     2070
     2071//The total number of operations is 5.0
     2072template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
     2073{
     2074        bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     2075        return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     2076}
     2077
     2078//The total number of operations is 1.0
     2079template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
     2080{
     2081        return _mm256_srai_epi16(arg1, (int32_t)(sh));
     2082}
     2083
     2084//The total number of operations is 1.0
     2085template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
     2086{
     2087        return _mm256_srai_epi32(arg1, (int32_t)(sh));
     2088}
     2089
     2090//The total number of operations is 4.5
     2091template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
     2092{
     2093        return simd_or(simd_and(simd256<64>::himask(), simd256<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd256<64>::srli<sh>(arg1) : simd256<(32)>::srai<(sh-(32))>(simd256<64>::srli<(32)>(arg1))));
     2094}
     2095
     2096//The total number of operations is 14.0833333333
     2097template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
     2098{
     2099        return simd_or(simd_and(simd256<128>::himask(), simd256<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd256<128>::srli<sh>(arg1) : simd256<(64)>::srai<(sh-(64))>(simd256<128>::srli<(64)>(arg1))));
     2100}
     2101
     2102//The total number of operations is 32.625
     2103template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
     2104{
     2105        return simd_or(simd_and(simd256<256>::himask(), simd256<(128)>::srai<((sh < (128)) ? sh : (128))>(arg1)), ((sh <= (128)) ? simd256<256>::srli<sh>(arg1) : simd256<(128)>::srai<(sh-(128))>(simd256<256>::srli<(128)>(arg1))));
    20522106}
    20532107
     
    21182172}
    21192173
    2120 //The total number of operations is 2.0
    2121 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2122 {
    2123         return simd_not(simd_xor(arg1, arg2));
    2124 }
    2125 
    2126 //The total number of operations is 8.0
    2127 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2128 {
    2129         bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);
    2130         bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));
    2131         bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);
    2132         return simd_or(loMask, hiMask);
    2133 }
    2134 
    2135 //The total number of operations is 9.0
    2136 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2137 {
    2138         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
    2139 }
    2140 
    2141 //The total number of operations is 1.0
    2142 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2143 {
    2144         return _mm256_cmpeq_epi8(arg1, arg2);
    2145 }
    2146 
    2147 //The total number of operations is 1.0
    2148 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2149 {
    2150         return _mm256_cmpeq_epi16(arg1, arg2);
    2151 }
    2152 
    2153 //The total number of operations is 1.0
    2154 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2155 {
    2156         return _mm256_cmpeq_epi32(arg1, arg2);
    2157 }
    2158 
    2159 //The total number of operations is 1.0
    2160 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2161 {
    2162         return _mm256_cmpeq_epi64(arg1, arg2);
    2163 }
    2164 
    2165 //The total number of operations is 13.6666666667
    2166 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2167 {
    2168         bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
    2169         bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
    2170         bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
    2171         return simd_or(loMask, hiMask);
    2172 }
    2173 
    2174 //The total number of operations is 34.6666666667
    2175 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
    2176 {
    2177         bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
    2178         bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
    2179         bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
    2180         return simd_or(loMask, hiMask);
    2181 }
    2182 
    21832174//The total number of operations is 0
    21842175template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask()
     
    22082199template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask()
    22092200{
    2210         return simd256<32>::constant<-65536>();
     2201        return simd256<32>::constant<4294901760ULL>();
    22112202}
    22122203
     
    22142205template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask()
    22152206{
    2216         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0))));
     2207        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0))));
    22172208}
    22182209
     
    22202211template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask()
    22212212{
    2222         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0))));
     2213        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(0))));
    22232214}
    22242215
     
    22262217template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask()
    22272218{
    2228         return ((bitblock256_t)(_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));
     2219        return ((bitblock256_t)(_mm256_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0))));
    22292220}
    22302221
     
    22942285}
    22952286
    2296 //The total number of operations is 14.5
    2297 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)
    2298 {
    2299         return simd256<1>::ifh(simd256<2>::himask(), simd_and(arg1, simd256<256>::slli<1>(simd_not(arg1))), arg1);
    2300 }
    2301 
    2302 //The total number of operations is 19.0
    2303 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1)
    2304 {
    2305         bitblock256_t gtMask = simd256<4>::gt(arg1, simd256<4>::constant<0>());
    2306         return simd256<1>::ifh(gtMask, arg1, simd256<4>::sub(gtMask, arg1));
    2307 }
    2308 
    2309 //The total number of operations is 1.0
    2310 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1)
    2311 {
    2312         return _mm256_abs_epi8(arg1);
    2313 }
    2314 
    2315 //The total number of operations is 1.0
    2316 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1)
    2317 {
    2318         return _mm256_abs_epi16(arg1);
    2319 }
    2320 
    2321 //The total number of operations is 1.0
    2322 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1)
    2323 {
    2324         return _mm256_abs_epi32(arg1);
    2325 }
    2326 
    2327 //The total number of operations is 5.0
    2328 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1)
    2329 {
    2330         bitblock256_t gtMask = simd256<64>::gt(arg1, simd256<64>::constant<0>());
    2331         return simd256<1>::ifh(gtMask, arg1, simd256<64>::sub(gtMask, arg1));
    2332 }
    2333 
    2334 //The total number of operations is 37.0
    2335 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1)
    2336 {
    2337         bitblock256_t eqMask = simd256<128>::eq(simd256<1>::ifh(simd256<128>::himask(), simd256<(64)>::abs(arg1), arg1), arg1);
    2338         return simd256<1>::ifh(eqMask, arg1, simd256<128>::sub(eqMask, arg1));
    2339 }
    2340 
    2341 //The total number of operations is 120.833333333
    2342 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1)
    2343 {
    2344         bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1);
    2345         return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1));
     2287//The total number of operations is 1.0
     2288template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2289{
     2290        return simd_or(arg1, arg2);
     2291}
     2292
     2293//The total number of operations is 16.0
     2294template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2295{
     2296        return simd_or(simd_and(simd256<(4)>::himask(), simd256<(4)>::umax(arg1, arg2)), simd256<(4)>::umax(simd_and(simd256<(4)>::lomask(), arg1), simd_and(simd256<(4)>::lomask(), arg2)));
     2297}
     2298
     2299//The total number of operations is 6.0
     2300template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2301{
     2302        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
     2303}
     2304
     2305//The total number of operations is 1.0
     2306template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2307{
     2308        return _mm256_max_epu8(arg1, arg2);
     2309}
     2310
     2311//The total number of operations is 1.0
     2312template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2313{
     2314        return _mm256_max_epu16(arg1, arg2);
     2315}
     2316
     2317//The total number of operations is 1.0
     2318template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2319{
     2320        return _mm256_max_epu32(arg1, arg2);
     2321}
     2322
     2323//The total number of operations is 7.0
     2324template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2325{
     2326        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808ULL)>();
     2327        return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     2328}
     2329
     2330//The total number of operations is 28.6666666667
     2331template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2332{
     2333        bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
     2334        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
     2335        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
     2336        return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     2337}
     2338
     2339//The total number of operations is 84.0
     //Unsigned maximum of the whole 256-bit block, composed from 128-bit operations.
     //tmpAns holds the 128-bit-wise maxima. eqMask1/eqMask2 are the 128-bit equality
     //masks of tmpAns against arg1/arg2, shifted right by 128 so that the verdict for
     //the upper half lands on the lower half. The upper half of the result is always
     //tmpAns; the lower half is taken from whichever operand owns the larger upper
     //half (tmpAns when both upper halves tie).
     //NOTE(review): this reads simd256<1>::ifh(m, a, b) as a bitwise select (a where
     //m is set, b elsewhere); its definition is not shown here - confirm.
     2340template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2341{
     2342        bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
     2343        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
     2344        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
     2345        return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    23462346}
    23472347
     
    //Shifts the block right by sh whole 64-bit fields. For sh = 1..3 the
    //_mm256_permute4x64_epi64 call moves the surviving fields towards field 0 and
    //the AND with the _mm256_set_epi64x mask zeroes the vacated upper fields;
    //sh == 0 returns arg1 unchanged and any other sh yields an all-zero block.
    //The two return lines are the old and new revisions of the same statement:
    //the newer one spells the all-ones field as 18446744073709551615ULL instead of
    //-1 (presumably the same bit pattern after the (int64_t) cast on
    //two's-complement targets).
    34123412template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1)
    34133413{
    3414         return ((sh == 3) ? simd_and(_mm256_set_epi64x((int64_t)(0), (int64_t)(0), (int64_t)(0), (int64_t)(-1)), _mm256_permute4x64_epi64(arg1, (int32_t)(3))) : ((sh == 2) ? simd_and(_mm256_set_epi64x((int64_t)(0), (int64_t)(0), (int64_t)(-1), (int64_t)(-1)), _mm256_permute4x64_epi64(arg1, (int32_t)(14))) : ((sh == 1) ? simd_and(_mm256_set_epi64x((int64_t)(0), (int64_t)(-1), (int64_t)(-1), (int64_t)(-1)), _mm256_permute4x64_epi64(arg1, (int32_t)(57))) : ((sh == 0) ? arg1 : simd256<32>::constant<0>()))));
     3414        return ((sh == 3) ? simd_and(_mm256_set_epi64x((int64_t)(0), (int64_t)(0), (int64_t)(0), (int64_t)(18446744073709551615ULL)), _mm256_permute4x64_epi64(arg1, (int32_t)(3))) : ((sh == 2) ? simd_and(_mm256_set_epi64x((int64_t)(0), (int64_t)(0), (int64_t)(18446744073709551615ULL), (int64_t)(18446744073709551615ULL)), _mm256_permute4x64_epi64(arg1, (int32_t)(14))) : ((sh == 1) ? simd_and(_mm256_set_epi64x((int64_t)(0), (int64_t)(18446744073709551615ULL), (int64_t)(18446744073709551615ULL), (int64_t)(18446744073709551615ULL)), _mm256_permute4x64_epi64(arg1, (int32_t)(57))) : ((sh == 0) ? arg1 : simd256<32>::constant<0>()))));
    34153415}
    34163416
     
    //Shifts the block left by sh whole 64-bit fields. For sh = 1..3 the
    //_mm256_permute4x64_epi64 call moves the surviving fields towards field 3 and
    //the AND with the _mm256_set_epi64x mask zeroes the vacated lower fields;
    //sh == 0 returns arg1 unchanged and any other sh yields an all-zero block.
    //The two return lines are the old and new revisions of the same statement:
    //the newer one spells the all-ones field as 18446744073709551615ULL instead of
    //-1 (presumably the same bit pattern after the (int64_t) cast on
    //two's-complement targets).
    35563556template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1)
    35573557{
    3558         return ((sh == 1) ? simd_and(_mm256_set_epi64x((int64_t)(-1), (int64_t)(-1), (int64_t)(-1), (int64_t)(0)), _mm256_permute4x64_epi64(arg1, (int32_t)((144)))) : ((sh == 2) ? simd_and(_mm256_set_epi64x((int64_t)(-1), (int64_t)(-1), (int64_t)(0), (int64_t)(0)), _mm256_permute4x64_epi64(arg1, (int32_t)(64))) : ((sh == 3) ? simd_and(_mm256_set_epi64x((int64_t)(-1), (int64_t)(0), (int64_t)(0), (int64_t)(0)), _mm256_permute4x64_epi64(arg1, (int32_t)(0))) : ((sh == 0) ? arg1 : simd256<32>::constant<0>()))));
     3558        return ((sh == 1) ? simd_and(_mm256_set_epi64x((int64_t)(18446744073709551615ULL), (int64_t)(18446744073709551615ULL), (int64_t)(18446744073709551615ULL), (int64_t)(0)), _mm256_permute4x64_epi64(arg1, (int32_t)((144)))) : ((sh == 2) ? simd_and(_mm256_set_epi64x((int64_t)(18446744073709551615ULL), (int64_t)(18446744073709551615ULL), (int64_t)(0), (int64_t)(0)), _mm256_permute4x64_epi64(arg1, (int32_t)(64))) : ((sh == 3) ? simd_and(_mm256_set_epi64x((int64_t)(18446744073709551615ULL), (int64_t)(0), (int64_t)(0), (int64_t)(0)), _mm256_permute4x64_epi64(arg1, (int32_t)(0))) : ((sh == 0) ? arg1 : simd256<32>::constant<0>()))));
    35593559}
    35603560
     
    //Returns true when every bit of arg1 is set. _mm256_testz_si256 yields 1 iff
    //the AND of its two operands is zero, and (NOT arg1) AND all-ones is zero only
    //for an all-ones arg1. The two return lines are the old and new revisions of
    //the same statement: the newer one writes the all-ones byte constant as 255
    //instead of -1.
    36343634IDISA_ALWAYS_INLINE bool bitblock256::all(bitblock256_t arg1)
    36353635{
    3636         return _mm256_testz_si256(((__m256i)(simd_not(arg1))), ((__m256i)(simd256<8>::constant<-1>()))) == 1;
     3636        return _mm256_testz_si256(((__m256i)(simd_not(arg1))), ((__m256i)(simd256<8>::constant<255>()))) == 1;
    36373637}
    36383638
     
    36433643}
    36443644
    3645 //The total number of operations is 1.0
    //Returns true when at least one bit of arg1 is set: _mm256_testz_si256(x, x)
    //is 0 exactly when x AND x, i.e. x itself, is non-zero.
    3646 IDISA_ALWAYS_INLINE bool bitblock256::any(bitblock256_t arg1)
    3647 {
    3648         return _mm256_testz_si256(((__m256i)(arg1)), ((__m256i)(arg1))) == 0;
    3649 }
    3650 
    3651 //The total number of operations is 1.0
    //Loads one 256-bit block from the address arg1 with the aligned-load intrinsic
    //_mm256_load_si256. The C-style cast drops the const qualifier of arg1 before
    //the pointer is handed to the intrinsic.
    3652 IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_aligned(const bitblock256_t* arg1)
    3653 {
    3654         return _mm256_load_si256((bitblock256_t*)(arg1));
    3655 }
    3656 
    3657 //The total number of operations is 1.0
    //Stores the block arg1 to the address arg2 with the unaligned-store intrinsic
    //_mm256_storeu_si256. Note the argument order: value first, destination second.
    3658 IDISA_ALWAYS_INLINE void bitblock256::store_unaligned(bitblock256_t arg1, bitblock256_t* arg2)
    3659 {
    3660         _mm256_storeu_si256((bitblock256_t*)(arg2), arg1);
    3661 }
    3662 
    36633645//The total number of operations is 9.5
    36643646template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::slli(bitblock256_t arg1)
     
    36683650
    36693651//The total number of operations is 1.0
    //Returns true when at least one bit of arg1 is set: _mm256_testz_si256(x, x)
    //is 0 exactly when x AND x, i.e. x itself, is non-zero.
     3652IDISA_ALWAYS_INLINE bool bitblock256::any(bitblock256_t arg1)
     3653{
     3654        return _mm256_testz_si256(((__m256i)(arg1)), ((__m256i)(arg1))) == 0;
     3655}
     3656
     3657//The total number of operations is 1.0
     //Loads one 256-bit block from the address arg1 with the aligned-load intrinsic
     //_mm256_load_si256. The C-style cast drops the const qualifier of arg1 before
     //the pointer is handed to the intrinsic.
     3658IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_aligned(const bitblock256_t* arg1)
     3659{
     3660        return _mm256_load_si256((bitblock256_t*)(arg1));
     3661}
     3662
     3663//The total number of operations is 1.0
    36703664IDISA_ALWAYS_INLINE void bitblock256::store_aligned(bitblock256_t arg1, bitblock256_t* arg2)
    36713665{
     
    36733667}
    36743668
     3669//The total number of operations is 1.0
     //Stores the block arg1 to the address arg2 with the unaligned-store intrinsic
     //_mm256_storeu_si256. Note the argument order: value first, destination second.
     3670IDISA_ALWAYS_INLINE void bitblock256::store_unaligned(bitblock256_t arg1, bitblock256_t* arg2)
     3671{
     3672        _mm256_storeu_si256((bitblock256_t*)(arg2), arg1);
     3673}
     3674
    36753675#endif
  • trunk/lib/idisa_cpp/idisa_neon.cpp

    r3526 r3576  
    5252        static IDISA_ALWAYS_INLINE bitblock128_t add_hl(bitblock128_t arg1);
    5353        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
     54        static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
    5455        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    5556        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    5657        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
    57         static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    5858        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    5959        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
     
    6161        static IDISA_ALWAYS_INLINE bitblock128_t any(bitblock128_t arg1);
    6262        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srai(bitblock128_t arg1);
    63         static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
     63        static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    6464        static IDISA_ALWAYS_INLINE bitblock128_t ugt(bitblock128_t arg1, bitblock128_t arg2);
    6565};
     
    125125IDISA_ALWAYS_INLINE bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
    126126IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1);
     127IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    127128IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
    128 IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    129129IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
    130130IDISA_ALWAYS_INLINE bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
     
    241241template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    242242template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    243 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
    244 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
    245 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
    246 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
    247 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
    248 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
    249 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    250243template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add_hl(bitblock128_t arg1);
    251244template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add_hl(bitblock128_t arg1);
     
    255248template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add_hl(bitblock128_t arg1);
    256249template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1);
    257 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    258 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    259 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    260 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    261 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    262 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    263 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    264250template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
    265251template <> template <FieldType<2>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
     
    278264template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    279265template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
     266template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     267template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     268template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     269template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     270template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    280273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    281274template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    286279template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    287280template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    288 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    289 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    290 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    291 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    292 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    293 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    294 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    295 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     281template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     282template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     283template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     284template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     285template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     286template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     287template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     288template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
     289template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     290template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2);
     291template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2);
     292template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2);
     293template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2);
     294template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
     295template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
     296template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
     297template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
     298template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
     299template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
     300template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
     301template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
     302template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    296303template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2);
    297304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lt(bitblock128_t arg1, bitblock128_t arg2);
     
    302309template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lt(bitblock128_t arg1, bitblock128_t arg2);
    303310template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lt(bitblock128_t arg1, bitblock128_t arg2);
    304 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    305 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
    306 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2);
    307 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2);
    308 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2);
    309 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2);
    310 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
    311 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
    312311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask();
    313312template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask();
     
    325324template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    326325template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    327 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    328 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    329 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    330 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    331 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    332 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    333 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     326template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     327template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     328template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     329template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     330template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     331template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     332template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     333template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
    334334template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    335335template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    523523
    524524//The total number of operations is 1.0
     //Bitwise "and with complement": returns arg1 AND (NOT arg2), implemented with
     //the NEON bit-clear intrinsic vbicq_u64.
     525IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
     526{
     527        return vbicq_u64(arg1, arg2);
     528}
     529
     530//The total number of operations is 1.0
    525531IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2)
    526532{
    527533        return vorrq_u64(arg1, arg2);
    528 }
    529 
    530 //The total number of operations is 1.0
    531 IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
    532 {
    533         return vbicq_u64(arg1, arg2);
    534534}
    535535
     
    12701270}
    12711271
    1272 //The total number of operations is 4.0
    //Arithmetic right shift of 2-bit fields. sh == 0 returns arg1; for every other
    //sh the high bit of each field is kept (AND with himask) and ORed with the
    //field logically shifted right by one, so the result is the same for all
    //non-zero shift amounts.
    1273 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1274 {
    1275         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1276 }
    1277 
    1278 //The total number of operations is 8.0
    //Arithmetic right shift of 4-bit fields. The shift amount is clamped to at
    //most 3 (the sh < 0 arm can never fire for an unsigned sh). tmp is the logical
    //shift; the constant 1 << ((4 - shift) - 1) isolates the bit where the original
    //sign bit landed, and subtracting that bit from zero yields ones from that
    //position upwards, which are ORed in as the sign fill.
    //NOTE(review): relies on simd128<4>::sub wrapping within each 4-bit field - confirm.
    1279 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1280 {
    1281         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1282         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1283 }
    1284 
    1285 //The total number of operations is 1.0
    //Arithmetic right shift of 8-bit fields via the signed NEON shift vshrq_n_s8.
    //sh == 0 is answered with arg1 directly, presumably because the intrinsic's
    //immediate shift count must be at least 1.
    1286 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1287 {
    1288         return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s8((int8x16_t)(arg1), (int32_t)(sh)))));
    1289 }
    1290 
    1291 //The total number of operations is 1.0
    //Arithmetic right shift of 16-bit fields via the signed NEON shift
    //vshrq_n_s16. sh == 0 is answered with arg1 directly, presumably because the
    //intrinsic's immediate shift count must be at least 1.
    1292 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1293 {
    1294         return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s16((int16x8_t)(arg1), (int32_t)(sh)))));
    1295 }
    1296 
    1297 //The total number of operations is 1.0
    //Arithmetic right shift of 32-bit fields via the signed NEON shift
    //vshrq_n_s32. sh == 0 is answered with arg1 directly, presumably because the
    //intrinsic's immediate shift count must be at least 1.
    1298 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1299 {
    1300         return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s32((int32x4_t)(arg1), (int32_t)(sh)))));
    1301 }
    1302 
    1303 //The total number of operations is 1.0
    //Arithmetic right shift of 64-bit fields via the signed NEON shift
    //vshrq_n_s64. sh == 0 is answered with arg1 directly, presumably because the
    //intrinsic's immediate shift count must be at least 1.
    1304 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1305 {
    1306         return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s64((int64x2_t)(arg1), (int32_t)(sh)))));
    1307 }
    1308 
    1309 //The total number of operations is 6.66666666667
    //Arithmetic right shift of the whole 128-bit block, composed from 64-bit
    //pieces. The first operand of the OR supplies the upper half: the 64-bit
    //arithmetic shift (amount capped at 64) masked with the 128-bit himask. The
    //second operand supplies the bits that cross into the lower half: a plain
    //128-bit logical shift when sh <= 64, otherwise the upper half moved down by
    //64 and then arithmetically shifted by the remaining sh - 64.
    1310 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1311 {
    1312         return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    1313 }
    1314 
    13151272//The total number of operations is 3.0
    13161273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add_hl(bitblock128_t arg1)
     
    13561313
    13571314//The total number of operations is 0
    //Mask selecting the low half (1 bit) of every 2-bit field: each field holds 1.
    1358 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
    1359 {
    1360         return simd128<2>::constant<(1)>();
    1361 }
    1362 
    1363 //The total number of operations is 0
    //Mask selecting the low half (2 bits) of every 4-bit field: each field holds 3.
    1364 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
    1365 {
    1366         return simd128<4>::constant<(3)>();
    1367 }
    1368 
    1369 //The total number of operations is 0
    //Mask selecting the low half (4 bits) of every 8-bit field: each field holds 15.
    1370 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
    1371 {
    1372         return simd128<8>::constant<(15)>();
    1373 }
    1374 
    1375 //The total number of operations is 0
    //Mask selecting the low half (8 bits) of every 16-bit field: each field holds 255.
    1376 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
    1377 {
    1378         return simd128<16>::constant<(255)>();
    1379 }
    1380 
    1381 //The total number of operations is 0
    //Mask selecting the low half (16 bits) of every 32-bit field: each field holds 65535.
    1382 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
    1383 {
    1384         return simd128<32>::constant<(65535)>();
    1385 }
    1386 
    1387 //The total number of operations is 0
    //Mask selecting the low half (32 bits) of every 64-bit field: each field holds
    //4294967295 (2^32 - 1).
    1388 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
    1389 {
    1390         return simd128<64>::constant<4294967295ULL>();
    1391 }
    1392 
    1393 //The total number of operations is 0
    //Mask for the low 64 bits of the 128-bit block: starts from an all-zero vector
    //and overwrites 64-bit lane 0 with all ones ((uint64_t)(-1)).
    //NOTE(review): assumes lane 0 is the low-order half of the block - confirm.
    1394 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
    1395 {
    1396         return vsetq_lane_u64((uint64_t)(-1), simd128<64>::constant<0>(), (int32_t)(0));
    1397 }
    1398 
    1399 //The total number of operations is 0
    //Broadcasts the 1-bit value val to every bit of the block by delegating to a
    //wider constant. The two return lines are the old and new revisions of the
    //same statement: the older one uses a 32-bit constant of -1*val (0 or -1, i.e.
    //all zeros or all ones), the newer one avoids the negative value by using a
    //2-bit constant of val+val+val (0 or 3, i.e. both bits clear or both set).
    14001315template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
    14011316{
    1402         return simd128<32>::constant<(-1*val)>();
     1317        return simd128<2>::constant<((val+val)+val)>();
    14031318}
    14041319
     
    14971412}
    14981413
     1414//The total number of operations is 0
     //Mask selecting the low half (1 bit) of every 2-bit field: each field holds 1.
     1415template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
     1416{
     1417        return simd128<2>::constant<(1)>();
     1418}
     1419
     1420//The total number of operations is 0
     //Mask selecting the low half (2 bits) of every 4-bit field: each field holds 3.
     1421template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
     1422{
     1423        return simd128<4>::constant<(3)>();
     1424}
     1425
     1426//The total number of operations is 0
     //Mask selecting the low half (4 bits) of every 8-bit field: each field holds 15.
     1427template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
     1428{
     1429        return simd128<8>::constant<(15)>();
     1430}
     1431
     1432//The total number of operations is 0
     //Mask selecting the low half (8 bits) of every 16-bit field: each field holds 255.
     1433template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
     1434{
     1435        return simd128<16>::constant<(255)>();
     1436}
     1437
     1438//The total number of operations is 0
     //Mask selecting the low half (16 bits) of every 32-bit field: each field holds 65535.
     1439template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
     1440{
     1441        return simd128<32>::constant<(65535)>();
     1442}
     1443
     1444//The total number of operations is 0
     //Mask selecting the low half (32 bits) of every 64-bit field: each field holds
     //4294967295 (2^32 - 1).
     1445template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
     1446{
     1447        return simd128<64>::constant<4294967295ULL>();
     1448}
     1449
     1450//The total number of operations is 0
     //Mask for the low 64 bits of the 128-bit block: starts from an all-zero vector
     //and overwrites 64-bit lane 0 with all ones ((uint64_t)(-1)).
     //NOTE(review): assumes lane 0 is the low-order half of the block - confirm.
     1451template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
     1452{
     1453        return vsetq_lane_u64((uint64_t)(-1), simd128<64>::constant<0>(), (int32_t)(0));
     1454}
     1455
    14991456//The total number of operations is 1.0
    15001457template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2)
     
    15581515}
    15591516
    1560 //The total number of operations is 1.0
    1561 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1562 {
    1563         return simd_or(arg1, arg2);
     1517//The total number of operations is 5.33333333333
     //Absolute value of signed 2-bit fields. The low bit of each field is taken
     //from arg1 unchanged; the high bit becomes (high AND NOT low), because the
     //inverted block shifted left by one puts NOT low under each high position.
     //Hence 11 (-1) turns into 01 (1) while 10 (-2) stays 10.
     //NOTE(review): this reads simd128<1>::ifh(m, a, b) as a bitwise select (a where
     //m is set, b elsewhere); its definition is not shown here - confirm.
     1518template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
     1519{
     1520        return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
     1521}
     1522
     1523//The total number of operations is 10.0
     //Absolute value of signed 4-bit fields. gtMask marks the fields that compare
     //greater than zero; those keep arg1. In every other field gtMask is zero, so
     //sub(gtMask, arg1) is 0 - arg1, the negation.
     //NOTE(review): this reads simd128<1>::ifh(m, a, b) as a bitwise select (a where
     //m is set, b elsewhere); its definition is not shown here - confirm.
     1524template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
     1525{
     1526        bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
     1527        return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
     1528}
     1529
     1530//The total number of operations is 1.0
     //Absolute value of signed 8-bit fields; a direct wrapper around vabsq_s8.
     1531template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
     1532{
     1533        return (bitblock128_t)vabsq_s8((int8x16_t)(arg1));
     1534}
     1535
     1536//The total number of operations is 1.0
     //Absolute value of signed 16-bit fields; a direct wrapper around vabsq_s16.
     1537template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
     1538{
     1539        return (bitblock128_t)vabsq_s16((int16x8_t)(arg1));
     1540}
     1541
     1542//The total number of operations is 1.0
     //Absolute value of signed 32-bit fields; a direct wrapper around vabsq_s32.
     1543template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
     1544{
     1545        return (bitblock128_t)vabsq_s32((int32x4_t)(arg1));
     1546}
     1547
     1548//The total number of operations is 8.5
     //Absolute value of signed 64-bit fields, composed from the 32-bit abs. The
     //upper 32 bits of each field are replaced by their 32-bit absolute value and
     //the result is compared with arg1: eqMask is all ones for fields the 32-bit
     //abs left unchanged, i.e. fields treated as non-negative. Those keep arg1; in
     //the others eqMask is zero, so sub(eqMask, arg1) is 0 - arg1, the negation.
     //NOTE(review): vabsq_s32 leaves 0x80000000 unchanged, so a negative field
     //whose upper 32 bits are exactly 0x80000000 also compares equal and appears
     //to be returned un-negated - confirm whether this is intended.
     //NOTE(review): this reads simd128<1>::ifh(m, a, b) as a bitwise select (a where
     //m is set, b elsewhere); its definition is not shown here - confirm.
     1549template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
     1550{
     1551        bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
     1552        return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
     1553}
     1554
     1555//The total number of operations is 31.8333333333
     //Absolute value of the signed 128-bit block, composed from the 64-bit abs. The
     //upper 64 bits are replaced by their 64-bit absolute value and the result is
     //compared with arg1: eqMask is all ones when the 64-bit abs left the upper
     //half unchanged, i.e. the block is treated as non-negative. Then arg1 is
     //kept; otherwise eqMask is zero and sub(eqMask, arg1) is 0 - arg1.
     //NOTE(review): if the 64-bit abs can leave a negative upper half unchanged
     //(e.g. its minimum value), such a block would be returned un-negated - confirm.
     //NOTE(review): this reads simd128<1>::ifh(m, a, b) as a bitwise select (a where
     //m is set, b elsewhere); its definition is not shown here - confirm.
     1556template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
     1557{
     1558        bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
     1559        return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1560}
     1561
     1562//The total number of operations is 2.0
     //Bit-wise equality: a result bit is 1 where the corresponding bits of arg1 and
     //arg2 agree (NOT of XOR).
     1563template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1564{
     1565        return simd_not(simd_xor(arg1, arg2));
     1566}
     1567
     1568//The total number of operations is 7.5
     //Equality of 2-bit fields, composed from the 1-bit comparison. tmpAns holds
     //the per-bit verdicts; ANDing it with itself shifted right by one leaves, in
     //the low bit of each field, whether both bits matched (loMask). Shifting
     //loMask left by one copies that verdict to the high bit (hiMask), and the OR
     //fills the whole field with ones or zeros.
     1569template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1570{
     1571        bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
     1572        bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
     1573        bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
     1574        return simd_or(loMask, hiMask);
     1575}
     1576
     1577//The total number of operations is 9.0
     //Equality of 4-bit fields, built from two 8-bit comparisons: one on the high
     //nibbles (both operands masked with the 8-bit himask) and one on the low
     //nibbles (masked with the 8-bit lomask). Each byte-wide verdict is masked
     //back to its own nibble and the two halves are ORed together.
     1578template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1579{
     1580        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
     1581}
     1582
     1583//The total number of operations is 1.0
     //Equality of 8-bit fields; a direct wrapper around the NEON compare vceqq_u8.
     1584template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1585{
     1586        return (bitblock128_t)vceqq_u8((uint8x16_t)(arg1), (uint8x16_t)(arg2));
     1587}
     1588
     1589//The total number of operations is 1.0
     //Equality of 16-bit fields; a direct wrapper around the NEON compare vceqq_u16.
     1590template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1591{
     1592        return (bitblock128_t)vceqq_u16((uint16x8_t)(arg1), (uint16x8_t)(arg2));
     1593}
     1594
     1595//The total number of operations is 1.0
     //Equality of 32-bit fields; a direct wrapper around the NEON compare vceqq_u32.
     1596template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1597{
     1598        return (bitblock128_t)vceqq_u32((uint32x4_t)(arg1), (uint32x4_t)(arg2));
     1599}
     1600
     1601//The total number of operations is 4.5
     //Equality of 64-bit fields, composed from the 32-bit comparison. tmpAns holds
     //the verdicts per 32-bit half; ANDing it with itself shifted right by 32
     //leaves, in the low half of each field, whether both halves matched (loMask).
     //Shifting loMask left by 32 copies that verdict to the high half (hiMask), and
     //the OR fills the whole field with ones or zeros.
     1602template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1603{
     1604        bitblock128_t tmpAns = simd128<(32)>::eq(arg1, arg2);
     1605        bitblock128_t loMask = simd_and(tmpAns, simd128<64>::srli<(32)>(tmpAns));
     1606        bitblock128_t hiMask = simd128<64>::slli<(32)>(loMask);
     1607        return simd_or(loMask, hiMask);
    15641608}
    15651609
    15661610//The total number of operations is 12.0
    //Unsigned maximum over 2-bit fields, composed from 1-bit operations. tmpAns
    //holds the bit-wise maxima. eqMask1/eqMask2 are the bit-wise equality masks of
    //tmpAns against arg1/arg2, shifted right by one so that the verdict for the
    //high bit lands on the low bit. The high bit of the result is always tmpAns;
    //the low bit is taken from whichever operand owns the larger high bit (tmpAns
    //when both high bits tie).
    //NOTE(review): this reads simd128<1>::ifh(m, a, b) as a bitwise select (a where
    //m is set, b elsewhere); its definition is not shown here - confirm.
    1567 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1568 {
    1569         bitblock128_t tmpAns = simd128<(1)>::umax(arg1, arg2);
    1570         bitblock128_t eqMask1 = simd128<2>::srli<(1)>(simd128<(1)>::eq(tmpAns, arg1));
    1571         bitblock128_t eqMask2 = simd128<2>::srli<(1)>(simd128<(1)>::eq(tmpAns, arg2));
    1572         return simd128<1>::ifh(simd128<2>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1573 }
    1574 
    1575 //The total number of operations is 9.0
    1576 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1577 {
    1578         bitblock128_t high_bit = simd128<4>::constant<(8)>();
    1579         return simd_xor(simd128<4>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     //Equality of the whole 128-bit block, composed from the 64-bit comparison.
     //tmpAns holds the verdicts per 64-bit half; ANDing it with itself shifted
     //right by 64 leaves, in the low half, whether both halves matched (loMask).
     //Shifting loMask left by 64 copies that verdict to the high half (hiMask), and
     //the OR fills the whole block with ones or zeros.
     1611template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1612{
     1613        bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
     1614        bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
     1615        bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
     1616        return simd_or(loMask, hiMask);
    15801617}
    15811618
    15821619//The total number of operations is 4.0
    //Unsigned maximum over 8-bit fields, built on the signed simd128<8>::max.
    //XORing both operands with 128 flips the top bit of each field so that
    //unsigned order coincides with signed order; XORing the result with the same
    //constant removes the bias again.
    1583 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1584 {
    1585         bitblock128_t high_bit = simd128<8>::constant<(128)>();
    1586         return simd_xor(simd128<8>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1587 }
    1588 
    1589 //The total number of operations is 4.0
    //Unsigned maximum over 16-bit fields, built on the signed simd128<16>::max.
    //XORing both operands with 32768 flips the top bit of each field so that
    //unsigned order coincides with signed order; XORing the result with the same
    //constant removes the bias again.
    1590 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1591 {
    1592         bitblock128_t high_bit = simd128<16>::constant<(32768)>();
    1593         return simd_xor(simd128<16>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1594 }
    1595 
    1596 //The total number of operations is 4.0
    //Unsigned maximum over 32-bit fields, built on the signed simd128<32>::max.
    //XORing both operands with 2147483648 (2^31) flips the top bit of each field
    //so that unsigned order coincides with signed order; XORing the result with
    //the same constant removes the bias again.
    1597 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1598 {
    1599         bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
    1600         return simd_xor(simd128<32>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1601 }
    1602 
    1603 //The total number of operations is 11.0
    //Unsigned maximum over 64-bit fields, composed from 32-bit operations. tmpAns
    //holds the 32-bit-wise maxima. eqMask1/eqMask2 are the 32-bit equality masks
    //of tmpAns against arg1/arg2, shifted right by 32 so that the verdict for the
    //upper half lands on the lower half. The upper half of the result is always
    //tmpAns; the lower half is taken from whichever operand owns the larger upper
    //half (tmpAns when both upper halves tie).
    //NOTE(review): this reads simd128<1>::ifh(m, a, b) as a bitwise select (a where
    //m is set, b elsewhere); its definition is not shown here - confirm.
    1604 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1605 {
    1606         bitblock128_t tmpAns = simd128<(32)>::umax(arg1, arg2);
    1607         bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
    1608         bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
    1609         return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1610 }
    1611 
    1612 //The total number of operations is 29.3333333333
    1613 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1614 {
    1615         bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
    1616         bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
    1617         bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    1618         return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1620template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
     1621{
     1622        return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
     1623}
     1624
     1625//The total number of operations is 8.0
     1626template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
     1627{
     1628        bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1629        return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1630}
     1631
     1632//The total number of operations is 1.0
     1633template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
     1634{
     1635        return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s8((int8x16_t)(arg1), (int32_t)(sh)))));
     1636}
     1637
     1638//The total number of operations is 1.0
     1639template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
     1640{
     1641        return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s16((int16x8_t)(arg1), (int32_t)(sh)))));
     1642}
     1643
     1644//The total number of operations is 1.0
     1645template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
     1646{
     1647        return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s32((int32x4_t)(arg1), (int32_t)(sh)))));
     1648}
     1649
     1650//The total number of operations is 1.0
     1651template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
     1652{
     1653        return ((sh == 0) ? arg1 : ((bitblock128_t)(vshrq_n_s64((int64x2_t)(arg1), (int32_t)(sh)))));
     1654}
     1655
     1656//The total number of operations is 6.66666666667
     1657template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
     1658{
     1659        return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    16191660}
    16201661
     
    16771718}
    16781719
    1679 //The total number of operations is 2.0
    1680 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1681 {
    1682         return simd_not(simd_xor(arg1, arg2));
    1683 }
    1684 
    1685 //The total number of operations is 7.5
    1686 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1687 {
    1688         bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
    1689         bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
    1690         bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
    1691         return simd_or(loMask, hiMask);
    1692 }
    1693 
    1694 //The total number of operations is 9.0
    1695 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1696 {
    1697         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
    1698 }
    1699 
    1700 //The total number of operations is 1.0
    1701 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1702 {
    1703         return (bitblock128_t)vceqq_u8((uint8x16_t)(arg1), (uint8x16_t)(arg2));
    1704 }
    1705 
    1706 //The total number of operations is 1.0
    1707 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1708 {
    1709         return (bitblock128_t)vceqq_u16((uint16x8_t)(arg1), (uint16x8_t)(arg2));
    1710 }
    1711 
    1712 //The total number of operations is 1.0
    1713 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1714 {
    1715         return (bitblock128_t)vceqq_u32((uint32x4_t)(arg1), (uint32x4_t)(arg2));
    1716 }
    1717 
    1718 //The total number of operations is 4.5
    1719 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1720 {
    1721         bitblock128_t tmpAns = simd128<(32)>::eq(arg1, arg2);
    1722         bitblock128_t loMask = simd_and(tmpAns, simd128<64>::srli<(32)>(tmpAns));
    1723         bitblock128_t hiMask = simd128<64>::slli<(32)>(loMask);
    1724         return simd_or(loMask, hiMask);
    1725 }
    1726 
    1727 //The total number of operations is 12.0
    1728 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1729 {
    1730         bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
    1731         bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
    1732         bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
    1733         return simd_or(loMask, hiMask);
    1734 }
    1735 
    17361720//The total number of operations is 0
    17371721template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
     
    17611745template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
    17621746{
    1763         return simd128<32>::constant<-65536>();
     1747        return simd128<32>::constant<4294901760ULL>();
    17641748}
    17651749
     
    18281812}
    18291813
    1830 //The total number of operations is 5.33333333333
    1831 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
    1832 {
    1833         return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
    1834 }
    1835 
    1836 //The total number of operations is 10.0
    1837 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
    1838 {
    1839         bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
    1840         return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
    1841 }
    1842 
    1843 //The total number of operations is 1.0
    1844 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
    1845 {
    1846         return (bitblock128_t)vabsq_s8((int8x16_t)(arg1));
    1847 }
    1848 
    1849 //The total number of operations is 1.0
    1850 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
    1851 {
    1852         return (bitblock128_t)vabsq_s16((int16x8_t)(arg1));
    1853 }
    1854 
    1855 //The total number of operations is 1.0
    1856 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
    1857 {
    1858         return (bitblock128_t)vabsq_s32((int32x4_t)(arg1));
    1859 }
    1860 
    1861 //The total number of operations is 8.5
    1862 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
    1863 {
    1864         bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
    1865         return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
    1866 }
    1867 
    1868 //The total number of operations is 31.8333333333
    1869 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
    1870 {
    1871         bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
    1872         return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1814//The total number of operations is 1.0
     1815template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1816{
     1817        return simd_or(arg1, arg2);
     1818}
     1819
     1820//The total number of operations is 12.0
     1821template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1822{
     1823        bitblock128_t tmpAns = simd128<(1)>::umax(arg1, arg2);
     1824        bitblock128_t eqMask1 = simd128<2>::srli<(1)>(simd128<(1)>::eq(tmpAns, arg1));
     1825        bitblock128_t eqMask2 = simd128<2>::srli<(1)>(simd128<(1)>::eq(tmpAns, arg2));
     1826        return simd128<1>::ifh(simd128<2>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1827}
     1828
     1829//The total number of operations is 9.0
     1830template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1831{
     1832        bitblock128_t high_bit = simd128<4>::constant<(8)>();
     1833        return simd_xor(simd128<4>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1834}
     1835
     1836//The total number of operations is 4.0
     1837template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1838{
     1839        bitblock128_t high_bit = simd128<8>::constant<(128)>();
     1840        return simd_xor(simd128<8>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1841}
     1842
     1843//The total number of operations is 4.0
     1844template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1845{
     1846        bitblock128_t high_bit = simd128<16>::constant<(32768)>();
     1847        return simd_xor(simd128<16>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1848}
     1849
     1850//The total number of operations is 4.0
     1851template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1852{
     1853        bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
     1854        return simd_xor(simd128<32>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     1855}
     1856
     1857//The total number of operations is 11.0
     1858template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1859{
     1860        bitblock128_t tmpAns = simd128<(32)>::umax(arg1, arg2);
     1861        bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
     1862        bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
     1863        return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1864}
     1865
     1866//The total number of operations is 29.3333333333
     1867template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1868{
     1869        bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
     1870        bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
     1871        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
     1872        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    18731873}
    18741874
     
    29482948IDISA_ALWAYS_INLINE bool bitblock128::all(bitblock128_t arg1)
    29492949{
    2950         return hsimd128<32>::signmask(simd128<32>::eq(arg1, simd128<32>::constant<-1>())) == 15;
     2950        return hsimd128<32>::signmask(simd128<32>::eq(arg1, simd128<32>::constant<4294967295ULL>())) == 15;
     2951}
     2952
     2953//The total number of operations is 2.33333333333
     2954template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
     2955{
     2956        return simd128<128>::slli<sh>(arg1);
    29512957}
    29522958
     
    29642970
    29652971//The total number of operations is 1.0
     2972IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, uint64_t* arg2)
     2973{
     2974        vst1q_u64((uint64_t*)(arg1), arg2);
     2975}
     2976
     2977//The total number of operations is 1.0
    29662978IDISA_ALWAYS_INLINE void bitblock128::store_unaligned(bitblock128_t arg1, uint64_t* arg2)
    29672979{
     
    29692981}
    29702982
    2971 //The total number of operations is 2.33333333333
    2972 template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
    2973 {
    2974         return simd128<128>::slli<sh>(arg1);
    2975 }
    2976 
    2977 //The total number of operations is 1.0
    2978 IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, uint64_t* arg2)
    2979 {
    2980         vst1q_u64((uint64_t*)(arg1), arg2);
    2981 }
    2982 
    29832983#endif
  • trunk/lib/idisa_cpp/idisa_sse2.cpp

    r3573 r3576  
    5656        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t shift_mask);
    5757        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
     58        static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
    5859        static IDISA_ALWAYS_INLINE bitblock128_t vsll(bitblock128_t arg1, bitblock128_t shift_mask);
    5960        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    6061        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    6162        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
    62         static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    6363        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    6464        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
     
    6666        static IDISA_ALWAYS_INLINE bitblock128_t any(bitblock128_t arg1);
    6767        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srai(bitblock128_t arg1);
    68         static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
     68        static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    6969        static IDISA_ALWAYS_INLINE bitblock128_t ugt(bitblock128_t arg1, bitblock128_t arg2);
    7070};
     
    134134IDISA_ALWAYS_INLINE bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
    135135IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1);
     136IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    136137IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
    137 IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    138138IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
    139139IDISA_ALWAYS_INLINE bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
     
    260260template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    261261template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    262 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
    263 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
    264 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
    265 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
    266 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
    267 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
    268 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    269262template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
    270263template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
     
    277270template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1);
    278271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    279 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    280 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    281 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    282 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    283 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    284 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    285 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    286272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
    287273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
     
    302288template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    303289template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
     290template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     291template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     292template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     293template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     294template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     295template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     296template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    304297template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    305298template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    310303template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    311304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    312 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    313 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    314 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    315 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    316 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    317 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    318 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    319 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     305template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     306template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     307template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     308template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     309template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     310template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     312template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
     313template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     314template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2);
     315template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2);
     316template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2);
     317template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2);
     318template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
     319template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
     320template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
     321template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
     322template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
     323template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
     324template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
     325template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
     326template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    320327template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2);
    321328template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lt(bitblock128_t arg1, bitblock128_t arg2);
     
    326333template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lt(bitblock128_t arg1, bitblock128_t arg2);
    327334template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lt(bitblock128_t arg1, bitblock128_t arg2);
    328 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    329 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
    330 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2);
    331 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2);
    332 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2);
    333 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2);
    334 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
    335 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
    336335template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask();
    337336template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask();
     
    349348template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    350349template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    351 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    352 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    353 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    354 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    355 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    356 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    357 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     350template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     351template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     352template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     353template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     354template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     355template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     356template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     357template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
    358358template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    359359template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    555555IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1)
    556556{
    557         return simd_xor(arg1, simd128<32>::constant<4294967295>());
     557        return simd_xor(arg1, simd128<32>::constant<4294967295ULL>());
     558}
     559
     560//The total number of operations is 1.0
     561IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
     562{
     563        return _mm_andnot_si128(arg2, arg1);
    558564}
    559565
     
    562568{
    563569        return _mm_or_si128(arg1, arg2);
    564 }
    565 
    566 //The total number of operations is 1.0
    567 IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
    568 {
    569         return _mm_andnot_si128(arg2, arg1);
    570570}
    571571
     
    755755
    756756//The total number of operations is 51.75
    757 /*template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::gt(bitblock128_t arg1, bitblock128_t arg2)
     757template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::gt(bitblock128_t arg1, bitblock128_t arg2)
    758758{
    759759        bitblock128_t hiAns = simd128<(64)>::gt(arg1, arg2);
     
    762762        mask = simd_or(mask, simd128<128>::slli<(64)>(mask));
    763763        return simd_or(simd128<128>::srai<(64)>(hiAns), mask);
    764 }*/
     764}
    765765
    766766//The total number of operations is 289.0
     
    14081408}
    14091409
    1410 //The total number of operations is 4.0
    1411 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1412 {
    1413         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1414 }
    1415 
    1416 //The total number of operations is 10.0
    1417 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1418 {
    1419         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1420         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1421 }
    1422 
    1423 //The total number of operations is 5.0
    1424 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1425 {
    1426         bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1427         return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1428 }
    1429 
    1430 //The total number of operations is 1.0
    1431 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1432 {
    1433         return _mm_srai_epi16(arg1, (int32_t)(sh));
    1434 }
    1435 
    1436 //The total number of operations is 1.0
    1437 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1438 {
    1439         return _mm_srai_epi32(arg1, (int32_t)(sh));
    1440 }
    1441 
    1442 //The total number of operations is 4.5
    1443 /*template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1444 {
    1445         return simd_or(simd_and(simd128<64>::himask(), simd128<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd128<64>::srli<sh>(arg1) : simd128<(32)>::srai<(sh-(32))>(simd128<64>::srli<(32)>(arg1))));
    1446 }*/
    1447 
    1448 //The total number of operations is 11.0833333333
    1449 /*template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1450 {
    1451         return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    1452 }*/
    1453 
    14541410//The total number of operations is 10.0
    14551411template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask)
     
    15141470}
    15151471
    1516 //The total number of operations is 0
    1517 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
    1518 {
    1519         return simd128<2>::constant<(1)>();
    1520 }
    1521 
    1522 //The total number of operations is 0
    1523 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
    1524 {
    1525         return simd128<4>::constant<(3)>();
    1526 }
    1527 
    1528 //The total number of operations is 0
    1529 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
    1530 {
    1531         return simd128<8>::constant<(15)>();
    1532 }
    1533 
    1534 //The total number of operations is 0
    1535 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
    1536 {
    1537         return simd128<16>::constant<(255)>();
    1538 }
    1539 
    1540 //The total number of operations is 0
    1541 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
    1542 {
    1543         return simd128<32>::constant<(65535)>();
    1544 }
    1545 
    1546 //The total number of operations is 0
    1547 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
    1548 {
    1549         return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
    1550 }
    1551 
    1552 //The total number of operations is 0
    1553 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
    1554 {
    1555         return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
    1556 }
    1557 
    15581472//The total number of operations is 10.0
    15591473template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsll(bitblock128_t arg1, bitblock128_t shift_mask)
     
    15721486template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
    15731487{
    1574         return simd128<2>::constant<(val+val+val)>();
     1488        return simd128<2>::constant<((val+val)+val)>();
    15751489}
    15761490
     
    16671581{
    16681582        return simd128<1>::ifh(simd128<128>::gt(arg1, arg2), arg2, arg1);
     1583}
     1584
     1585//The total number of operations is 0
     1586template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
     1587{
     1588        return simd128<2>::constant<(1)>();
     1589}
     1590
     1591//The total number of operations is 0
     1592template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
     1593{
     1594        return simd128<4>::constant<(3)>();
     1595}
     1596
     1597//The total number of operations is 0
     1598template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
     1599{
     1600        return simd128<8>::constant<(15)>();
     1601}
     1602
     1603//The total number of operations is 0
     1604template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
     1605{
     1606        return simd128<16>::constant<(255)>();
     1607}
     1608
     1609//The total number of operations is 0
     1610template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
     1611{
     1612        return simd128<32>::constant<(65535)>();
     1613}
     1614
     1615//The total number of operations is 0
     1616template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
     1617{
     1618        return _mm_set_epi32((int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)));
     1619}
     1620
     1621//The total number of operations is 0
     1622template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
     1623{
     1624        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)));
    16691625}
    16701626
     
    17251681}
    17261682
    1727 //The total number of operations is 1.0
    1728 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1729 {
    1730         return simd_or(arg1, arg2);
    1731 }
    1732 
    1733 //The total number of operations is 15.6666666667
    1734 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1735 {
    1736         return simd128<1>::ifh(simd128<2>::himask(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd128<128>::srli<1>(simd_or(simd_not(arg1), arg2))), simd_and(arg1, simd128<128>::srli<1>(simd_or(arg1, simd_not(arg2))))));
    1737 }
    1738 
    1739 //The total number of operations is 6.0
    1740 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1741 {
    1742         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::umax(arg1, arg2)), simd128<(8)>::umax(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2)));
    1743 }
    1744 
    1745 //The total number of operations is 1.0
    1746 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1747 {
    1748         return _mm_max_epu8(arg1, arg2);
     1683//The total number of operations is 7.33333333333
     1684template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
     1685{
     1686        return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
     1687}
     1688
     1689//The total number of operations is 19.0
     1690template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
     1691{
     1692        bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
     1693        return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
     1694}
     1695
     1696//The total number of operations is 5.0
     1697template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
     1698{
     1699        bitblock128_t gtMask = simd128<8>::gt(arg1, simd128<8>::constant<0>());
     1700        return simd128<1>::ifh(gtMask, arg1, simd128<8>::sub(gtMask, arg1));
     1701}
     1702
     1703//The total number of operations is 5.0
     1704template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
     1705{
     1706        bitblock128_t gtMask = simd128<16>::gt(arg1, simd128<16>::constant<0>());
     1707        return simd128<1>::ifh(gtMask, arg1, simd128<16>::sub(gtMask, arg1));
     1708}
     1709
     1710//The total number of operations is 5.0
     1711template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
     1712{
     1713        bitblock128_t gtMask = simd128<32>::gt(arg1, simd128<32>::constant<0>());
     1714        return simd128<1>::ifh(gtMask, arg1, simd128<32>::sub(gtMask, arg1));
     1715}
     1716
     1717//The total number of operations is 17.0
     1718template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
     1719{
     1720        bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
     1721        return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
     1722}
     1723
     1724//The total number of operations is 44.0
     1725template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
     1726{
     1727        bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
     1728        return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1729}
     1730
     1731//The total number of operations is 2.0
     1732template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1733{
     1734        return simd_not(simd_xor(arg1, arg2));
     1735}
     1736
     1737//The total number of operations is 8.0
     1738template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1739{
     1740        bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
     1741        bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
     1742        bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
     1743        return simd_or(loMask, hiMask);
     1744}
     1745
     1746//The total number of operations is 9.0
     1747template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1748{
     1749        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
     1750}
     1751
     1752//The total number of operations is 1.0
     1753template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1754{
     1755        return _mm_cmpeq_epi8(arg1, arg2);
     1756}
     1757
     1758//The total number of operations is 1.0
     1759template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1760{
     1761        return _mm_cmpeq_epi16(arg1, arg2);
     1762}
     1763
     1764//The total number of operations is 1.0
     1765template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1766{
     1767        return _mm_cmpeq_epi32(arg1, arg2);
     1768}
     1769
     1770//The total number of operations is 5.0
     1771template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1772{
     1773        bitblock128_t tmpAns = simd128<(32)>::eq(arg1, arg2);
     1774        bitblock128_t loMask = simd_and(tmpAns, simd128<64>::srli<(32)>(tmpAns));
     1775        bitblock128_t hiMask = simd128<64>::slli<(32)>(loMask);
     1776        return simd_or(loMask, hiMask);
     1777}
     1778
     1779//The total number of operations is 11.6666666667
     1780template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1781{
     1782        bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
     1783        bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
     1784        bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
     1785        return simd_or(loMask, hiMask);
    17491786}
    17501787
    17511788//The total number of operations is 4.0
    1752 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1753 {
    1754         bitblock128_t high_bit = simd128<16>::constant<(32768)>();
    1755         return simd_xor(simd128<16>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1756 }
    1757 
    1758 //The total number of operations is 7.0
    1759 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1760 {
    1761         bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
    1762         return simd_xor(simd128<32>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1763 }
    1764 
    1765 //The total number of operations is 20.0
    1766 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1767 {
    1768         bitblock128_t tmpAns = simd128<(32)>::umax(arg1, arg2);
    1769         bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
    1770         bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
    1771         return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1772 }
    1773 
    1774 //The total number of operations is 43.6666666667
    1775 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1776 {
    1777         bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
    1778         bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
    1779         bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    1780         return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1789template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
     1790{
     1791        return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
     1792}
     1793
     1794//The total number of operations is 10.0
     1795template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
     1796{
     1797        bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1798        return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1799}
     1800
     1801//The total number of operations is 5.0
     1802template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
     1803{
     1804        bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1805        return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1806}
     1807
     1808//The total number of operations is 1.0
     1809template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
     1810{
     1811        return _mm_srai_epi16(arg1, (int32_t)(sh));
     1812}
     1813
     1814//The total number of operations is 1.0
     1815template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
     1816{
     1817        return _mm_srai_epi32(arg1, (int32_t)(sh));
     1818}
     1819
     1820//The total number of operations is 4.5
     1821template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
     1822{
     1823        return simd_or(simd_and(simd128<64>::himask(), simd128<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd128<64>::srli<sh>(arg1) : simd128<(32)>::srai<(sh-(32))>(simd128<64>::srli<(32)>(arg1))));
     1824}
     1825
     1826//The total number of operations is 11.0833333333
     1827template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
     1828{
     1829        return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    17811830}
    17821831
     
    18361885}
    18371886
    1838 //The total number of operations is 2.0
    1839 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1840 {
    1841         return simd_not(simd_xor(arg1, arg2));
    1842 }
    1843 
    1844 //The total number of operations is 8.0
    1845 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1846 {
    1847         bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
    1848         bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
    1849         bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
    1850         return simd_or(loMask, hiMask);
    1851 }
    1852 
    1853 //The total number of operations is 9.0
    1854 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1855 {
    1856         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
    1857 }
    1858 
    1859 //The total number of operations is 1.0
    1860 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1861 {
    1862         return _mm_cmpeq_epi8(arg1, arg2);
    1863 }
    1864 
    1865 //The total number of operations is 1.0
    1866 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1867 {
    1868         return _mm_cmpeq_epi16(arg1, arg2);
    1869 }
    1870 
    1871 //The total number of operations is 1.0
    1872 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1873 {
    1874         return _mm_cmpeq_epi32(arg1, arg2);
    1875 }
    1876 
    1877 //The total number of operations is 5.0
    1878 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1879 {
    1880         bitblock128_t tmpAns = simd128<(32)>::eq(arg1, arg2);
    1881         bitblock128_t loMask = simd_and(tmpAns, simd128<64>::srli<(32)>(tmpAns));
    1882         bitblock128_t hiMask = simd128<64>::slli<(32)>(loMask);
    1883         return simd_or(loMask, hiMask);
    1884 }
    1885 
    1886 //The total number of operations is 11.6666666667
    1887 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1888 {
    1889         bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
    1890         bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
    1891         bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
    1892         return simd_or(loMask, hiMask);
    1893 }
    1894 
    18951887//The total number of operations is 0
    18961888template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
     
    19201912template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
    19211913{
    1922         return simd128<32>::constant<4294901760>();
     1914        return simd128<32>::constant<4294901760ULL>();
    19231915}
    19241916
     
    19261918template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
    19271919{
    1928         return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
     1920        return _mm_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0));
    19291921}
    19301922
     
    19321924template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
    19331925{
    1934         return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
     1926        return _mm_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(0));
    19351927}
    19361928
     
    19871979}
    19881980
    1989 //The total number of operations is 7.33333333333
    1990 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
    1991 {
    1992         return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
    1993 }
    1994 
    1995 //The total number of operations is 19.0
    1996 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
    1997 {
    1998         bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
    1999         return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
    2000 }
    2001 
    2002 //The total number of operations is 5.0
    2003 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
    2004 {
    2005         bitblock128_t gtMask = simd128<8>::gt(arg1, simd128<8>::constant<0>());
    2006         return simd128<1>::ifh(gtMask, arg1, simd128<8>::sub(gtMask, arg1));
    2007 }
    2008 
    2009 //The total number of operations is 5.0
    2010 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
    2011 {
    2012         bitblock128_t gtMask = simd128<16>::gt(arg1, simd128<16>::constant<0>());
    2013         return simd128<1>::ifh(gtMask, arg1, simd128<16>::sub(gtMask, arg1));
    2014 }
    2015 
    2016 //The total number of operations is 5.0
    2017 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
    2018 {
    2019         bitblock128_t gtMask = simd128<32>::gt(arg1, simd128<32>::constant<0>());
    2020         return simd128<1>::ifh(gtMask, arg1, simd128<32>::sub(gtMask, arg1));
    2021 }
    2022 
    2023 //The total number of operations is 17.0
    2024 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
    2025 {
    2026         bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
    2027         return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
    2028 }
    2029 
    2030 //The total number of operations is 44.0
    2031 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
    2032 {
    2033         bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
    2034         return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1981//The total number of operations is 1.0
     1982template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1983{
     1984        return simd_or(arg1, arg2);
     1985}
     1986
     1987//The total number of operations is 15.6666666667
     1988template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1989{
     1990        return simd128<1>::ifh(simd128<2>::himask(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd128<128>::srli<1>(simd_or(simd_not(arg1), arg2))), simd_and(arg1, simd128<128>::srli<1>(simd_or(arg1, simd_not(arg2))))));
     1991}
     1992
     1993//The total number of operations is 6.0
     1994template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1995{
     1996        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::umax(arg1, arg2)), simd128<(8)>::umax(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2)));
     1997}
     1998
     1999//The total number of operations is 1.0
     2000template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2001{
     2002        return _mm_max_epu8(arg1, arg2);
     2003}
     2004
     2005//The total number of operations is 4.0
     2006template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2007{
     2008        bitblock128_t high_bit = simd128<16>::constant<(32768)>();
     2009        return simd_xor(simd128<16>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     2010}
     2011
     2012//The total number of operations is 7.0
     2013template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2014{
     2015        bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
     2016        return simd_xor(simd128<32>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     2017}
     2018
     2019//The total number of operations is 20.0
     2020template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2021{
     2022        bitblock128_t tmpAns = simd128<(32)>::umax(arg1, arg2);
     2023        bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
     2024        bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
     2025        return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     2026}
     2027
     2028//The total number of operations is 43.6666666667
     2029template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2030{
     2031        bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
     2032        bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
     2033        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
     2034        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    20352035}
    20362036
     
    31783178}
    31793179
     3180//The total number of operations is 2.33333333333
     3181template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
     3182{
     3183        return simd128<128>::slli<sh>(arg1);
     3184}
     3185
    31803186//The total number of operations is 2.0
    31813187IDISA_ALWAYS_INLINE bool bitblock128::any(bitblock128_t arg1)
     
    31913197
    31923198//The total number of operations is 1.0
     3199IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
     3200{
     3201        _mm_store_si128((bitblock128_t*)(arg2), arg1);
     3202}
     3203
     3204//The total number of operations is 1.0
    31933205IDISA_ALWAYS_INLINE void bitblock128::store_unaligned(bitblock128_t arg1, bitblock128_t* arg2)
    31943206{
     
    31963208}
    31973209
    3198 //The total number of operations is 2.33333333333
    3199 template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
    3200 {
    3201         return simd128<128>::slli<sh>(arg1);
    3202 }
    3203 
    3204 //The total number of operations is 1.0
    3205 IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
    3206 {
    3207         _mm_store_si128((bitblock128_t*)(arg2), arg1);
    3208 }
    3209 
    32103210#endif
  • trunk/lib/idisa_cpp/idisa_sse3.cpp

    r3526 r3576  
    5656        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t shift_mask);
    5757        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
     58        static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
    5859        static IDISA_ALWAYS_INLINE bitblock128_t vsll(bitblock128_t arg1, bitblock128_t shift_mask);
    5960        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    6061        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    6162        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
    62         static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    6363        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    6464        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
     
    6666        static IDISA_ALWAYS_INLINE bitblock128_t any(bitblock128_t arg1);
    6767        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srai(bitblock128_t arg1);
    68         static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
     68        static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    6969        static IDISA_ALWAYS_INLINE bitblock128_t ugt(bitblock128_t arg1, bitblock128_t arg2);
    7070};
     
    134134IDISA_ALWAYS_INLINE bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
    135135IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1);
     136IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    136137IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
    137 IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    138138IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
    139139IDISA_ALWAYS_INLINE bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
     
    260260template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    261261template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    262 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
    263 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
    264 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
    265 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
    266 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
    267 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
    268 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    269262template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
    270263template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
     
    277270template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1);
    278271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    279 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    280 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    281 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    282 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    283 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    284 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    285 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    286272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
    287273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
     
    302288template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    303289template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
     290template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     291template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     292template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     293template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     294template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     295template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     296template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    304297template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    305298template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    310303template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    311304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    312 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    313 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    314 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    315 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    316 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    317 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    318 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    319 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     305template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     306template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     307template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     308template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     309template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     310template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     312template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
     313template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     314template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2);
     315template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2);
     316template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2);
     317template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2);
     318template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
     319template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
     320template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
     321template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
     322template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
     323template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
     324template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
     325template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
     326template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    320327template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2);
    321328template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lt(bitblock128_t arg1, bitblock128_t arg2);
     
    326333template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lt(bitblock128_t arg1, bitblock128_t arg2);
    327334template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lt(bitblock128_t arg1, bitblock128_t arg2);
    328 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    329 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
    330 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2);
    331 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2);
    332 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2);
    333 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2);
    334 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
    335 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
    336335template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask();
    337336template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask();
     
    349348template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    350349template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    351 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    352 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    353 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    354 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    355 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    356 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    357 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     350template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     351template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     352template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     353template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     354template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     355template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     356template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     357template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
    358358template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    359359template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    //simd_not: XORs arg1 with a constant whose 32-bit fields are all 4294967295 (0xFFFFFFFF).
    //Presumably this is the bitwise complement of arg1 (simd_xor/constant are defined outside this view).
    //The two return lines are the old (-1) and new (unsigned literal) spelling of the same all-ones field.
    555555IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1)
    556556{
    557         return simd_xor(arg1, simd128<32>::constant<-1>());
     557        return simd_xor(arg1, simd128<32>::constant<4294967295ULL>());
     558}
     559
     560//The total number of operations is 1.0
     //simd_andc: returns arg1 AND (NOT arg2).
     //_mm_andnot_si128 complements its FIRST operand, which is why arg2 is passed first.
     561IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
     562{
     563        return _mm_andnot_si128(arg2, arg1);
    558564}
    559565
     
    562568{
    563569        return _mm_or_si128(arg1, arg2);
    564 }
    565 
    566 //The total number of operations is 1.0
    567 IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
    568 {
    569         return _mm_andnot_si128(arg2, arg1);
    570570}
    571571
     
    14081408}
    14091409
    1410 //The total number of operations is 4.0
    1411 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1412 {
    1413         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1414 }
    1415 
    1416 //The total number of operations is 10.0
    1417 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1418 {
    1419         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1420         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1421 }
    1422 
    1423 //The total number of operations is 5.0
    1424 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1425 {
    1426         bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1427         return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1428 }
    1429 
    1430 //The total number of operations is 1.0
    1431 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1432 {
    1433         return _mm_srai_epi16(arg1, (int32_t)(sh));
    1434 }
    1435 
    1436 //The total number of operations is 1.0
    1437 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1438 {
    1439         return _mm_srai_epi32(arg1, (int32_t)(sh));
    1440 }
    1441 
    1442 //The total number of operations is 4.5
    1443 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1444 {
    1445         return simd_or(simd_and(simd128<64>::himask(), simd128<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd128<64>::srli<sh>(arg1) : simd128<(32)>::srai<(sh-(32))>(simd128<64>::srli<(32)>(arg1))));
    1446 }
    1447 
    1448 //The total number of operations is 11.0833333333
    1449 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1450 {
    1451         return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    1452 }
    1453 
    14541410//The total number of operations is 10.0
    14551411template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask)
     
    15141470}
    15151471
    1516 //The total number of operations is 0
    1517 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
    1518 {
    1519         return simd128<2>::constant<(1)>();
    1520 }
    1521 
    1522 //The total number of operations is 0
    1523 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
    1524 {
    1525         return simd128<4>::constant<(3)>();
    1526 }
    1527 
    1528 //The total number of operations is 0
    1529 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
    1530 {
    1531         return simd128<8>::constant<(15)>();
    1532 }
    1533 
    1534 //The total number of operations is 0
    1535 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
    1536 {
    1537         return simd128<16>::constant<(255)>();
    1538 }
    1539 
    1540 //The total number of operations is 0
    1541 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
    1542 {
    1543         return simd128<32>::constant<(65535)>();
    1544 }
    1545 
    1546 //The total number of operations is 0
    1547 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
    1548 {
    1549         return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
    1550 }
    1551 
    1552 //The total number of operations is 0
    1553 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
    1554 {
    1555         return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
    1556 }
    1557 
    15581472//The total number of operations is 10.0
    15591473template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsll(bitblock128_t arg1, bitblock128_t shift_mask)
     
    //simd128<1>::constant<val>: 1-bit-field constant expressed through a wider field constant.
    //New form: 2-bit constant val+val+val (3*val), i.e. 0 for val==0 and 3 (binary 11) for val==1.
    //Old form (the replaced line): 32-bit constant -1*val; the change avoids the negative literal.
    15721486template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
    15731487{
    1574         return simd128<32>::constant<(-1*val)>();
     1488        return simd128<2>::constant<((val+val)+val)>();
    15751489}
    15761490
     
    16671581{
    16681582        return simd128<1>::ifh(simd128<128>::gt(arg1, arg2), arg2, arg1);
     1583}
     1584
     1585//The total number of operations is 0
     //lomask for 2-bit fields: field constant 1 (binary 01), i.e. only the low bit of each field.
     //Assumes constant<v>() replicates v into every field -- TODO confirm (defined outside this view).
     1586template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
     1587{
     1588        return simd128<2>::constant<(1)>();
     1589}
     1590
     1591//The total number of operations is 0
     //lomask for 4-bit fields: field constant 3 (binary 0011), i.e. the low 2 bits of each field.
     //Assumes constant<v>() replicates v into every field -- TODO confirm (defined outside this view).
     1592template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
     1593{
     1594        return simd128<4>::constant<(3)>();
     1595}
     1596
     1597//The total number of operations is 0
     //lomask for 8-bit fields: field constant 15 (0x0F), i.e. the low nibble of each byte.
     //Assumes constant<v>() replicates v into every field -- TODO confirm (defined outside this view).
     1598template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
     1599{
     1600        return simd128<8>::constant<(15)>();
     1601}
     1602
     1603//The total number of operations is 0
     //lomask for 16-bit fields: field constant 255 (0x00FF), i.e. the low byte of each field.
     //Assumes constant<v>() replicates v into every field -- TODO confirm (defined outside this view).
     1604template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
     1605{
     1606        return simd128<16>::constant<(255)>();
     1607}
     1608
     1609//The total number of operations is 0
     //lomask for 32-bit fields: field constant 65535 (0x0000FFFF), i.e. the low 16 bits of each field.
     //Assumes constant<v>() replicates v into every field -- TODO confirm (defined outside this view).
     1610template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
     1611{
     1612        return simd128<32>::constant<(65535)>();
     1613}
     1614
     1615//The total number of operations is 0
     //lomask for 64-bit fields. _mm_set_epi32 lists lanes from the highest 32 bits down to the lowest,
     //so each 64-bit field gets 0 in its upper half and all ones in its lower half.
     //((4294967296ULL)-1) is 0xFFFFFFFF; the (int32_t) cast is expected to wrap it to -1 (all bits set).
     1616template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
     1617{
     1618        return _mm_set_epi32((int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)));
     1619}
     1620
     1621//The total number of operations is 0
     //lomask for the single 128-bit field. _mm_set_epi32 lists lanes from highest to lowest,
     //so the upper 64 bits are 0 and the lower 64 bits are all ones.
     //((4294967296ULL)-1) is 0xFFFFFFFF; the (int32_t) cast is expected to wrap it to -1 (all bits set).
     1622template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
     1623{
     1624        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)));
    16691625}
    16701626
     
    17251681}
    17261682
    1727 //The total number of operations is 1.0
    1728 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1729 {
    1730         return simd_or(arg1, arg2);
    1731 }
    1732 
    1733 //The total number of operations is 15.6666666667
    1734 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1735 {
    1736         return simd128<1>::ifh(simd128<2>::himask(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd128<128>::srli<1>(simd_or(simd_not(arg1), arg2))), simd_and(arg1, simd128<128>::srli<1>(simd_or(arg1, simd_not(arg2))))));
    1737 }
    1738 
    1739 //The total number of operations is 6.0
    1740 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1741 {
    1742         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::umax(arg1, arg2)), simd128<(8)>::umax(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2)));
    1743 }
    1744 
    1745 //The total number of operations is 1.0
    1746 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1747 {
    1748         return _mm_max_epu8(arg1, arg2);
     1683//The total number of operations is 7.33333333333
     //abs for 2-bit signed fields. The low bit of each field passes through unchanged.
     //The high (sign) bit is kept only where the low bit is 0: the 1-bit left shift of simd_not(arg1)
     //moves each field's inverted low bit up into its high-bit position before the AND.
     //Per field: 11 (-1) -> 01; 00, 01 and 10 stay as they are.
     //Assumes ifh(m, a, b) takes a where m is set and b elsewhere -- TODO confirm.
     1684template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
     1685{
     1686        return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
     1687}
     1688
     1689//The total number of operations is 19.0
     //abs for 4-bit signed fields. gtMask marks fields where arg1 > 0; those keep arg1.
     //Elsewhere gtMask is 0, so sub(gtMask, arg1) is 0 - arg1, the negated field.
     //Assumes gt yields an all-ones/all-zero mask per field and ifh is a bitwise select -- TODO confirm.
     1690template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
     1691{
     1692        bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
     1693        return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
     1694}
     1695
     1696//The total number of operations is 5.0
     //abs for 8-bit signed fields. gtMask marks fields where arg1 > 0; those keep arg1.
     //Elsewhere gtMask is 0, so sub(gtMask, arg1) is 0 - arg1, the negated field.
     //Assumes gt yields an all-ones/all-zero mask per field and ifh is a bitwise select -- TODO confirm.
     1697template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
     1698{
     1699        bitblock128_t gtMask = simd128<8>::gt(arg1, simd128<8>::constant<0>());
     1700        return simd128<1>::ifh(gtMask, arg1, simd128<8>::sub(gtMask, arg1));
     1701}
     1702
     1703//The total number of operations is 5.0
     //abs for 16-bit signed fields. gtMask marks fields where arg1 > 0; those keep arg1.
     //Elsewhere gtMask is 0, so sub(gtMask, arg1) is 0 - arg1, the negated field.
     //Assumes gt yields an all-ones/all-zero mask per field and ifh is a bitwise select -- TODO confirm.
     1704template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
     1705{
     1706        bitblock128_t gtMask = simd128<16>::gt(arg1, simd128<16>::constant<0>());
     1707        return simd128<1>::ifh(gtMask, arg1, simd128<16>::sub(gtMask, arg1));
     1708}
     1709
     1710//The total number of operations is 5.0
     //abs for 32-bit signed fields. gtMask marks fields where arg1 > 0; those keep arg1.
     //Elsewhere gtMask is 0, so sub(gtMask, arg1) is 0 - arg1, the negated field.
     //Assumes gt yields an all-ones/all-zero mask per field and ifh is a bitwise select -- TODO confirm.
     1711template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
     1712{
     1713        bitblock128_t gtMask = simd128<32>::gt(arg1, simd128<32>::constant<0>());
     1714        return simd128<1>::ifh(gtMask, arg1, simd128<32>::sub(gtMask, arg1));
     1715}
     1716
     1717//The total number of operations is 17.0
     //abs for 64-bit signed fields, built on the 32-bit abs.
     //The upper 32 bits of each field are replaced by their 32-bit abs and the result is compared
     //with arg1 as 64-bit fields: eqMask is set where that replacement changed nothing.
     //Fields with eqMask set keep arg1; the others become 0 - arg1 via sub(eqMask, arg1).
     //Assumes eq yields an all-ones/all-zero mask per field and ifh is a bitwise select -- TODO confirm.
     //NOTE(review): an upper half of exactly 0x80000000 looks like it is also left unchanged by the
     //32-bit abs, so such a negative field would be returned un-negated -- confirm.
     1718template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
     1719{
     1720        bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
     1721        return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
     1722}
     1723
     1724//The total number of operations is 44.0
     //abs for the single 128-bit signed field, built on the 64-bit abs.
     //The upper 64 bits are replaced by their 64-bit abs and the result is compared with arg1 as a
     //128-bit field: eqMask is set where that replacement changed nothing.
     //If eqMask is set the result is arg1; otherwise it is 0 - arg1 via sub(eqMask, arg1).
     //Assumes eq yields an all-ones/all-zero mask and ifh is a bitwise select -- TODO confirm.
     //NOTE(review): an upper half that the 64-bit abs leaves unchanged although it is negative
     //would be returned un-negated -- confirm.
     1725template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
     1726{
     1727        bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
     1728        return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1729}
     1730
     1731//The total number of operations is 2.0
     //eq for 1-bit fields: NOT(arg1 XOR arg2), so a result bit is 1 exactly where the inputs agree.
     1732template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1733{
     1734        return simd_not(simd_xor(arg1, arg2));
     1735}
     1736
     1737//The total number of operations is 8.0
     //eq for 2-bit fields, derived from the 1-bit comparison.
     //loMask: a field's low bit is set only if both of its bits compared equal (low AND high, the
     //high bit being brought down by srli<1>). hiMask copies that bit into the high position.
     //A matching field therefore becomes 11 and any other field 00.
     //Assumes srli/slli are per-field logical shifts -- TODO confirm.
     1738template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1739{
     1740        bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
     1741        bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
     1742        bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
     1743        return simd_or(loMask, hiMask);
     1744}
     1745
     1746//The total number of operations is 9.0
     //eq for 4-bit fields via two 8-bit compares.
     //High nibbles and low nibbles of every byte are compared separately; the other nibble is masked
     //to zero on both sides so it cannot cause a mismatch. Each 8-bit result is then masked back to
     //the nibble it belongs to, and the two halves are OR-ed together.
     1747template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1748{
     1749        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
     1750}
     1751
     1752//The total number of operations is 1.0
     //eq for 8-bit fields: a single SSE2 byte compare; each byte is 0xFF where equal, 0x00 otherwise.
     1753template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1754{
     1755        return _mm_cmpeq_epi8(arg1, arg2);
     1756}
     1757
     1758//The total number of operations is 1.0
     //eq for 16-bit fields: a single SSE2 compare; each field is 0xFFFF where equal, 0 otherwise.
     1759template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1760{
     1761        return _mm_cmpeq_epi16(arg1, arg2);
     1762}
     1763
     1764//The total number of operations is 1.0
     //eq for 32-bit fields: a single SSE2 compare; each field is 0xFFFFFFFF where equal, 0 otherwise.
     1765template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1766{
     1767        return _mm_cmpeq_epi32(arg1, arg2);
     1768}
     1769
     1770//The total number of operations is 5.0
     //eq for 64-bit fields, derived from the 32-bit comparison.
     //loMask: the low half of a field is all ones only if both 32-bit halves compared equal (low AND
     //high, the high half being brought down by srli<32>). hiMask copies it into the high half.
     //A matching field therefore becomes all ones and any other field all zeros.
     //Assumes srli/slli are per-field logical shifts -- TODO confirm.
     1771template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1772{
     1773        bitblock128_t tmpAns = simd128<(32)>::eq(arg1, arg2);
     1774        bitblock128_t loMask = simd_and(tmpAns, simd128<64>::srli<(32)>(tmpAns));
     1775        bitblock128_t hiMask = simd128<64>::slli<(32)>(loMask);
     1776        return simd_or(loMask, hiMask);
     1777}
     1778
     1779//The total number of operations is 11.6666666667
     //eq for the single 128-bit field, derived from the 64-bit comparison.
     //loMask: the low half is all ones only if both 64-bit halves compared equal (low AND high, the
     //high half being brought down by srli<64>). hiMask copies it into the high half.
     //The result is all ones when the two registers are equal and all zeros otherwise.
     //Assumes srli/slli are logical shifts of the 128-bit field -- TODO confirm.
     1780template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1781{
     1782        bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
     1783        bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
     1784        bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
     1785        return simd_or(loMask, hiMask);
    17491786}
    17501787
    17511788//The total number of operations is 4.0
    1752 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1753 {
    1754         bitblock128_t high_bit = simd128<16>::constant<(32768)>();
    1755         return simd_xor(simd128<16>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1756 }
    1757 
    1758 //The total number of operations is 7.0
    1759 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1760 {
    1761         bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
    1762         return simd_xor(simd128<32>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1763 }
    1764 
    1765 //The total number of operations is 20.0
    1766 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1767 {
    1768         bitblock128_t tmpAns = simd128<(32)>::umax(arg1, arg2);
    1769         bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
    1770         bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
    1771         return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1772 }
    1773 
    1774 //The total number of operations is 43.6666666667
    1775 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1776 {
    1777         bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
    1778         bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
    1779         bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    1780         return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     //srai for 2-bit fields (arithmetic right shift by the compile-time count sh).
     //sh == 0 returns arg1 unchanged. For every other sh the sign (high) bit is kept in place through
     //himask and also copied into the low bit by srli<1>, so all nonzero counts give the same result.
     1789template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
     1790{
     1791        return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
     1792}
     1793
     1794//The total number of operations is 10.0
     //srai for 4-bit fields. The shift count is clamped to at most 3; the (sh < 0) arm can never
     //be taken because sh is unsigned.
     //tmp is the logical shift. constant<1 << (4 - count - 1)> selects the position where the
     //original sign bit lands after that shift. Subtracting the isolated bit from 0 gives ones from
     //that position up to the top of the field, which OR-ed into tmp fills the vacated bits with
     //copies of the sign. Assumes sub is per-field modular subtraction -- TODO confirm.
     1795template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
     1796{
     1797        bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1798        return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1799}
     1800
     1801//The total number of operations is 5.0
     //srai for 8-bit fields. The shift count is clamped to at most 7; the (sh < 0) arm can never
     //be taken because sh is unsigned.
     //tmp is the logical shift. constant<1 << (8 - count - 1)> selects the position where the
     //original sign bit lands after that shift. Subtracting the isolated bit from 0 gives ones from
     //that position up to the top of the field, which OR-ed into tmp fills the vacated bits with
     //copies of the sign. Assumes sub is per-field modular subtraction -- TODO confirm.
     1802template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
     1803{
     1804        bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1805        return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1806}
     1807
     1808//The total number of operations is 1.0
     //srai for 16-bit fields: maps directly onto the SSE2 arithmetic shift, with sh as the count.
     1809template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
     1810{
     1811        return _mm_srai_epi16(arg1, (int32_t)(sh));
     1812}
     1813
     1814//The total number of operations is 1.0
     //srai for 32-bit fields: maps directly onto the SSE2 arithmetic shift, with sh as the count.
     1815template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
     1816{
     1817        return _mm_srai_epi32(arg1, (int32_t)(sh));
     1818}
     1819
     1820//The total number of operations is 4.5
     //srai for 64-bit fields, assembled from 32-bit arithmetic and 64-bit logical shifts.
     //Upper half of each field: 32-bit srai of arg1 by min(sh, 32), kept through himask.
     //Remainder, sh <= 32: the 64-bit logical shift of arg1 by sh.
     //Remainder, sh > 32: the upper half is first moved down by 32, then shifted arithmetically
     //by the remaining sh - 32 as a 32-bit field.
     //NOTE(review): both arms of ?: are instantiated for every sh, so srai<(sh-(32))> is also formed
     //when sh < 32 (the argument wraps as uint16_t) -- confirm this is intended.
     1821template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
     1822{
     1823        return simd_or(simd_and(simd128<64>::himask(), simd128<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd128<64>::srli<sh>(arg1) : simd128<(32)>::srai<(sh-(32))>(simd128<64>::srli<(32)>(arg1))));
     1824}
     1825
     1826//The total number of operations is 11.0833333333
     //srai for the single 128-bit field, assembled from 64-bit arithmetic and 128-bit logical shifts.
     //Upper 64 bits: 64-bit srai of arg1 by min(sh, 64), kept through himask.
     //Remainder, sh <= 64: the 128-bit logical shift of arg1 by sh.
     //Remainder, sh > 64: the upper half is first moved down by 64, then shifted arithmetically
     //by the remaining sh - 64 as a 64-bit field.
     //NOTE(review): both arms of ?: are instantiated for every sh, so srai<(sh-(64))> is also formed
     //when sh < 64 (the argument wraps as uint16_t) -- confirm this is intended.
     1827template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
     1828{
     1829        return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    17811830}
    17821831
     
    18361885}
    18371886
    1838 //The total number of operations is 2.0
    1839 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1840 {
    1841         return simd_not(simd_xor(arg1, arg2));
    1842 }
    1843 
    1844 //The total number of operations is 8.0
    1845 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1846 {
    1847         bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
    1848         bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
    1849         bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
    1850         return simd_or(loMask, hiMask);
    1851 }
    1852 
    1853 //The total number of operations is 9.0
    1854 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1855 {
    1856         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
    1857 }
    1858 
    1859 //The total number of operations is 1.0
    1860 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1861 {
    1862         return _mm_cmpeq_epi8(arg1, arg2);
    1863 }
    1864 
    1865 //The total number of operations is 1.0
    1866 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1867 {
    1868         return _mm_cmpeq_epi16(arg1, arg2);
    1869 }
    1870 
    1871 //The total number of operations is 1.0
    1872 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1873 {
    1874         return _mm_cmpeq_epi32(arg1, arg2);
    1875 }
    1876 
    1877 //The total number of operations is 5.0
    1878 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1879 {
    1880         bitblock128_t tmpAns = simd128<(32)>::eq(arg1, arg2);
    1881         bitblock128_t loMask = simd_and(tmpAns, simd128<64>::srli<(32)>(tmpAns));
    1882         bitblock128_t hiMask = simd128<64>::slli<(32)>(loMask);
    1883         return simd_or(loMask, hiMask);
    1884 }
    1885 
    1886 //The total number of operations is 11.6666666667
    1887 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1888 {
    1889         bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
    1890         bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
    1891         bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
    1892         return simd_or(loMask, hiMask);
    1893 }
    1894 
    18951887//The total number of operations is 0
    18961888template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
     
    //himask for 32-bit fields: 4294901760 is 0xFFFF0000, i.e. the upper 16 bits of each field.
    //The two return lines are the old (-65536) and new (unsigned literal) spelling of that bit pattern.
    //Assumes constant<v>() replicates v into every field -- TODO confirm (defined outside this view).
    19201912template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
    19211913{
    1922         return simd128<32>::constant<-65536>();
     1914        return simd128<32>::constant<4294901760ULL>();
    19231915}
    19241916
     
    //himask for 64-bit fields. _mm_set_epi32 lists lanes from the highest 32 bits down to the lowest,
    //so each 64-bit field gets all ones in its upper half and 0 in its lower half.
    //The new line writes the all-ones lane as ((4294967296ULL)-1) cast to int32_t instead of -1.
    19261918template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
    19271919{
    1928         return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
     1920        return _mm_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0));
    19291921}
    19301922
     
    //himask for the single 128-bit field. _mm_set_epi32 lists lanes from highest to lowest,
    //so the upper 64 bits are all ones and the lower 64 bits are 0.
    //The new line writes the all-ones lanes as ((4294967296ULL)-1) cast to int32_t instead of -1.
    19321924template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
    19331925{
    1934         return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
     1926        return _mm_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(0));
    19351927}
    19361928
     
    19871979}
    19881980
    1989 //The total number of operations is 7.33333333333
    1990 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
    1991 {
    1992         return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
    1993 }
    1994 
    1995 //The total number of operations is 19.0
    1996 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
    1997 {
    1998         bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
    1999         return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
    2000 }
    2001 
    2002 //The total number of operations is 5.0
    2003 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
    2004 {
    2005         bitblock128_t gtMask = simd128<8>::gt(arg1, simd128<8>::constant<0>());
    2006         return simd128<1>::ifh(gtMask, arg1, simd128<8>::sub(gtMask, arg1));
    2007 }
    2008 
    2009 //The total number of operations is 5.0
    2010 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
    2011 {
    2012         bitblock128_t gtMask = simd128<16>::gt(arg1, simd128<16>::constant<0>());
    2013         return simd128<1>::ifh(gtMask, arg1, simd128<16>::sub(gtMask, arg1));
    2014 }
    2015 
    2016 //The total number of operations is 5.0
    2017 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
    2018 {
    2019         bitblock128_t gtMask = simd128<32>::gt(arg1, simd128<32>::constant<0>());
    2020         return simd128<1>::ifh(gtMask, arg1, simd128<32>::sub(gtMask, arg1));
    2021 }
    2022 
    2023 //The total number of operations is 17.0
    2024 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
    2025 {
    2026         bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
    2027         return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
    2028 }
    2029 
    2030 //The total number of operations is 44.0
    2031 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
    2032 {
    2033         bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
    2034         return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1981//The total number of operations is 1.0
     //umax for 1-bit fields: the larger of two single bits is their OR.
     1982template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1983{
     1984        return simd_or(arg1, arg2);
     1985}
     1986
     1987//The total number of operations is 15.6666666667
     //umax for 2-bit unsigned fields, computed with bitwise logic only.
     //High bit of each field: OR of the two high bits.
     //Low bit: arg2's low bit where (NOT arg1.high OR arg2.high), OR-ed with arg1's low bit where
     //(arg1.high OR NOT arg2.high). The 1-bit right shifts of the whole register bring those
     //high-bit conditions down onto the low-bit position.
     //Assumes ifh(m, a, b) takes a where m is set and b elsewhere -- TODO confirm.
     1988template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1989{
     1990        return simd128<1>::ifh(simd128<2>::himask(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd128<128>::srli<1>(simd_or(simd_not(arg1), arg2))), simd_and(arg1, simd128<128>::srli<1>(simd_or(arg1, simd_not(arg2))))));
     1991}
     1992
     1993//The total number of operations is 6.0
     //umax for 4-bit unsigned fields via two 8-bit unsigned maxima.
     //High nibble: taken from the byte-wise max of the full inputs, masked with the 8-bit himask.
     //Low nibble: byte-wise max of the inputs with everything but the lomask bits cleared first.
     //The two nibble results are OR-ed together.
     1994template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1995{
     1996        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::umax(arg1, arg2)), simd128<(8)>::umax(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2)));
     1997}
     1998
     1999//The total number of operations is 1.0
     2000template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2001{
     2002        return _mm_max_epu8(arg1, arg2);
     2003}
     2004
     2005//The total number of operations is 4.0
     2006template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2007{
     2008        bitblock128_t high_bit = simd128<16>::constant<(32768)>();
     2009        return simd_xor(simd128<16>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     2010}
     2011
     2012//The total number of operations is 7.0
     2013template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2014{
     2015        bitblock128_t high_bit = simd128<32>::constant<(2147483648ULL)>();
     2016        return simd_xor(simd128<32>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     2017}
     2018
     2019//The total number of operations is 20.0
     2020template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2021{
     2022        bitblock128_t tmpAns = simd128<(32)>::umax(arg1, arg2);
     2023        bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
     2024        bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
     2025        return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     2026}
     2027
     2028//The total number of operations is 43.6666666667
     2029template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2030{
     2031        bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
     2032        bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
     2033        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
     2034        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    20352035}
    20362036
     
    31693169IDISA_ALWAYS_INLINE bool bitblock128::all(bitblock128_t arg1)
    31703170{
    3171         return hsimd128<8>::signmask(simd128<8>::eq(arg1, simd128<8>::constant<-1>())) == 65535;
     3171        return hsimd128<8>::signmask(simd128<8>::eq(arg1, simd128<8>::constant<255>())) == 65535;
    31723172}
    31733173
     
    31783178}
    31793179
     3180//The total number of operations is 2.33333333333
     3181template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
     3182{
     3183        return simd128<128>::slli<sh>(arg1);
     3184}
     3185
    31803186//The total number of operations is 2.0
    31813187IDISA_ALWAYS_INLINE bool bitblock128::any(bitblock128_t arg1)
     
    31913197
    31923198//The total number of operations is 1.0
     3199IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
     3200{
     3201        _mm_store_si128((bitblock128_t*)(arg2), arg1);
     3202}
     3203
     3204//The total number of operations is 1.0
    31933205IDISA_ALWAYS_INLINE void bitblock128::store_unaligned(bitblock128_t arg1, bitblock128_t* arg2)
    31943206{
     
    31963208}
    31973209
    3198 //The total number of operations is 2.33333333333
    3199 template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
    3200 {
    3201         return simd128<128>::slli<sh>(arg1);
    3202 }
    3203 
    3204 //The total number of operations is 1.0
    3205 IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
    3206 {
    3207         _mm_store_si128((bitblock128_t*)(arg2), arg1);
    3208 }
    3209 
    32103210#endif
  • trunk/lib/idisa_cpp/idisa_sse4_1.cpp

    r3526 r3576  
    5656        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t shift_mask);
    5757        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
     58        static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
    5859        static IDISA_ALWAYS_INLINE bitblock128_t vsll(bitblock128_t arg1, bitblock128_t shift_mask);
    5960        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    6061        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    6162        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
    62         static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    6363        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    6464        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
     
    6666        static IDISA_ALWAYS_INLINE bitblock128_t any(bitblock128_t arg1);
    6767        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srai(bitblock128_t arg1);
    68         static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
     68        static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    6969        static IDISA_ALWAYS_INLINE bitblock128_t ugt(bitblock128_t arg1, bitblock128_t arg2);
    7070};
     
    135135IDISA_ALWAYS_INLINE bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
    136136IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1);
     137IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    137138IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
    138 IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    139139IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
    140140IDISA_ALWAYS_INLINE bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
     
    261261template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    262262template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    263 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
    264 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
    265 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
    266 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
    267 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
    268 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
    269 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    270263template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
    271264template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
     
    278271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1);
    279272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    280 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    281 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    282 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    283 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    284 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    285 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    286 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    287273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
    288274template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
     
    303289template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    304290template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
     291template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     292template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     293template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     294template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     295template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     296template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     297template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    305298template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    306299template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    311304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    312305template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    313 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    314 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    315 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    316 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    317 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    318 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    319 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    320 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     306template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     307template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     308template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     309template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     310template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     312template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     313template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
     314template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     315template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2);
     316template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2);
     317template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2);
     318template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2);
     319template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
     320template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
     321template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
     322template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
     323template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
     324template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
     325template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
     326template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
     327template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    321328template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2);
    322329template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lt(bitblock128_t arg1, bitblock128_t arg2);
     
    327334template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lt(bitblock128_t arg1, bitblock128_t arg2);
    328335template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lt(bitblock128_t arg1, bitblock128_t arg2);
    329 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    330 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
    331 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2);
    332 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2);
    333 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2);
    334 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2);
    335 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
    336 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
    337336template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask();
    338337template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask();
     
    350349template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    351350template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    352 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    353 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    354 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    355 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    356 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    357 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    358 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     351template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     352template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     353template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     354template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     355template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     356template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     357template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     358template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
    359359template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    360360template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    560560IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1)
    561561{
    562         return simd_xor(arg1, simd128<32>::constant<-1>());
     562        return simd_xor(arg1, simd128<32>::constant<4294967295ULL>());
     563}
     564
     565//The total number of operations is 1.0
     566IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
     567{
     568        return _mm_andnot_si128(arg2, arg1);
    563569}
    564570
     
    567573{
    568574        return _mm_or_si128(arg1, arg2);
    569 }
    570 
    571 //The total number of operations is 1.0
    572 IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
    573 {
    574         return _mm_andnot_si128(arg2, arg1);
    575575}
    576576
     
    13131313template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::neg(bitblock128_t arg1)
    13141314{
    1315         return _mm_sign_epi32(arg1, simd128<32>::constant<-1>());
     1315        return _mm_sign_epi32(arg1, simd128<32>::constant<((4294967296ULL)-1)>());
    13161316}
    13171317
     
    14181418}
    14191419
    1420 //The total number of operations is 4.0
    1421 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1422 {
    1423         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1424 }
    1425 
    1426 //The total number of operations is 10.0
    1427 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1428 {
    1429         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1430         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1431 }
    1432 
    1433 //The total number of operations is 5.0
    1434 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1435 {
    1436         bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1437         return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1438 }
    1439 
    1440 //The total number of operations is 1.0
    1441 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1442 {
    1443         return _mm_srai_epi16(arg1, (int32_t)(sh));
    1444 }
    1445 
    1446 //The total number of operations is 1.0
    1447 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1448 {
    1449         return _mm_srai_epi32(arg1, (int32_t)(sh));
    1450 }
    1451 
    1452 //The total number of operations is 4.5
    1453 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1454 {
    1455         return simd_or(simd_and(simd128<64>::himask(), simd128<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd128<64>::srli<sh>(arg1) : simd128<(32)>::srai<(sh-(32))>(simd128<64>::srli<(32)>(arg1))));
    1456 }
    1457 
    1458 //The total number of operations is 11.0833333333
    1459 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1460 {
    1461         return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    1462 }
    1463 
    14641420//The total number of operations is 10.0
    14651421template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask)
     
    15241480}
    15251481
    1526 //The total number of operations is 0
    1527 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
    1528 {
    1529         return simd128<2>::constant<(1)>();
    1530 }
    1531 
    1532 //The total number of operations is 0
    1533 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
    1534 {
    1535         return simd128<4>::constant<(3)>();
    1536 }
    1537 
    1538 //The total number of operations is 0
    1539 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
    1540 {
    1541         return simd128<8>::constant<(15)>();
    1542 }
    1543 
    1544 //The total number of operations is 0
    1545 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
    1546 {
    1547         return simd128<16>::constant<(255)>();
    1548 }
    1549 
    1550 //The total number of operations is 0
    1551 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
    1552 {
    1553         return simd128<32>::constant<(65535)>();
    1554 }
    1555 
    1556 //The total number of operations is 0
    1557 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
    1558 {
    1559         return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
    1560 }
    1561 
    1562 //The total number of operations is 0
    1563 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
    1564 {
    1565         return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
    1566 }
    1567 
    15681482//The total number of operations is 10.0
    15691483template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsll(bitblock128_t arg1, bitblock128_t shift_mask)
     
    15821496template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
    15831497{
    1584         return simd128<32>::constant<(-1*val)>();
     1498        return simd128<2>::constant<((val+val)+val)>();
    15851499}
    15861500
     
    16861600}
    16871601
     1602//The total number of operations is 0
     1603template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
     1604{
     1605        return simd128<2>::constant<(1)>();
     1606}
     1607
     1608//The total number of operations is 0
     1609template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
     1610{
     1611        return simd128<4>::constant<(3)>();
     1612}
     1613
     1614//The total number of operations is 0
     1615template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
     1616{
     1617        return simd128<8>::constant<(15)>();
     1618}
     1619
     1620//The total number of operations is 0
     1621template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
     1622{
     1623        return simd128<16>::constant<(255)>();
     1624}
     1625
     1626//The total number of operations is 0
     1627template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
     1628{
     1629        return simd128<32>::constant<(65535)>();
     1630}
     1631
     1632//The total number of operations is 0
     1633template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
     1634{
     1635        return _mm_set_epi32((int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)));
     1636}
     1637
     1638//The total number of operations is 0
     1639template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
     1640{
     1641        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)));
     1642}
     1643
    16881644//The total number of operations is 1.0
    16891645template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2)
     
    17401696}
    17411697
    1742 //The total number of operations is 1.0
    1743 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1744 {
    1745         return simd_or(arg1, arg2);
    1746 }
    1747 
    1748 //The total number of operations is 15.6666666667
    1749 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1750 {
    1751         return simd128<1>::ifh(simd128<2>::himask(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd128<128>::srli<1>(simd_or(simd_not(arg1), arg2))), simd_and(arg1, simd128<128>::srli<1>(simd_or(arg1, simd_not(arg2))))));
    1752 }
    1753 
    1754 //The total number of operations is 6.0
    1755 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1756 {
    1757         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::umax(arg1, arg2)), simd128<(8)>::umax(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2)));
    1758 }
    1759 
    1760 //The total number of operations is 1.0
    1761 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1762 {
    1763         return _mm_max_epu8(arg1, arg2);
    1764 }
    1765 
    1766 //The total number of operations is 1.0
    1767 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1768 {
    1769         return _mm_max_epu16(arg1, arg2);
    1770 }
    1771 
    1772 //The total number of operations is 1.0
    1773 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1774 {
    1775         return _mm_max_epu32(arg1, arg2);
    1776 }
    1777 
    1778 //The total number of operations is 14.0
    1779 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1780 {
    1781         bitblock128_t tmpAns = simd128<(32)>::umax(arg1, arg2);
    1782         bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
    1783         bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
    1784         return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1785 }
    1786 
    1787 //The total number of operations is 29.6666666667
    1788 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1789 {
    1790         bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
    1791         bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
    1792         bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    1793         return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1698//The total number of operations is 7.33333333333
     1699template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
     1700{
     1701        return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
     1702}
     1703
     1704//The total number of operations is 19.0
     1705template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
     1706{
     1707        bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
     1708        return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
     1709}
     1710
     1711//The total number of operations is 1.0
     1712template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
     1713{
     1714        return _mm_abs_epi8(arg1);
     1715}
     1716
     1717//The total number of operations is 1.0
     1718template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
     1719{
     1720        return _mm_abs_epi16(arg1);
     1721}
     1722
     1723//The total number of operations is 1.0
     1724template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
     1725{
     1726        return _mm_abs_epi32(arg1);
     1727}
     1728
     1729//The total number of operations is 9.0
     1730template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
     1731{
     1732        bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
     1733        return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
     1734}
     1735
     1736//The total number of operations is 32.0
     1737template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
     1738{
     1739        bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
     1740        return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1741}
     1742
     1743//The total number of operations is 2.0
     1744template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1745{
     1746        return simd_not(simd_xor(arg1, arg2));
     1747}
     1748
     1749//The total number of operations is 8.0
     1750template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1751{
     1752        bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
     1753        bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
     1754        bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
     1755        return simd_or(loMask, hiMask);
     1756}
     1757
     1758//The total number of operations is 9.0
     1759template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1760{
     1761        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
     1762}
     1763
     1764//The total number of operations is 1.0
     1765template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1766{
     1767        return _mm_cmpeq_epi8(arg1, arg2);
     1768}
     1769
     1770//The total number of operations is 1.0
     1771template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1772{
     1773        return _mm_cmpeq_epi16(arg1, arg2);
     1774}
     1775
     1776//The total number of operations is 1.0
     1777template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1778{
     1779        return _mm_cmpeq_epi32(arg1, arg2);
     1780}
     1781
     1782//The total number of operations is 1.0
     1783template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1784{
     1785        return _mm_cmpeq_epi64(arg1, arg2);
     1786}
     1787
     1788//The total number of operations is 7.66666666667
     1789template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1790{
     1791        bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
     1792        bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
     1793        bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
     1794        return simd_or(loMask, hiMask);
     1795}
     1796
     1797//The total number of operations is 4.0
     1798template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
     1799{
     1800        return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
     1801}
     1802
     1803//The total number of operations is 10.0
     1804template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
     1805{
     1806        bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1807        return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1808}
     1809
     1810//The total number of operations is 5.0
     1811template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
     1812{
     1813        bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1814        return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1815}
     1816
     1817//The total number of operations is 1.0
     1818template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
     1819{
     1820        return _mm_srai_epi16(arg1, (int32_t)(sh));
     1821}
     1822
     1823//The total number of operations is 1.0
     1824template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
     1825{
     1826        return _mm_srai_epi32(arg1, (int32_t)(sh));
     1827}
     1828
     1829//The total number of operations is 4.5
     1830template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
     1831{
     1832        return simd_or(simd_and(simd128<64>::himask(), simd128<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd128<64>::srli<sh>(arg1) : simd128<(32)>::srai<(sh-(32))>(simd128<64>::srli<(32)>(arg1))));
     1833}
     1834
     1835//The total number of operations is 11.0833333333
     1836template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
     1837{
     1838        return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    17941839}
    17951840
     
    18491894}
    18501895
    1851 //The total number of operations is 2.0
    1852 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1853 {
    1854         return simd_not(simd_xor(arg1, arg2));
    1855 }
    1856 
    1857 //The total number of operations is 8.0
    1858 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1859 {
    1860         bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
    1861         bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
    1862         bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
    1863         return simd_or(loMask, hiMask);
    1864 }
    1865 
    1866 //The total number of operations is 9.0
    1867 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1868 {
    1869         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
    1870 }
    1871 
    1872 //The total number of operations is 1.0
    1873 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1874 {
    1875         return _mm_cmpeq_epi8(arg1, arg2);
    1876 }
    1877 
    1878 //The total number of operations is 1.0
    1879 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1880 {
    1881         return _mm_cmpeq_epi16(arg1, arg2);
    1882 }
    1883 
    1884 //The total number of operations is 1.0
    1885 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1886 {
    1887         return _mm_cmpeq_epi32(arg1, arg2);
    1888 }
    1889 
    1890 //The total number of operations is 1.0
    1891 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1892 {
    1893         return _mm_cmpeq_epi64(arg1, arg2);
    1894 }
    1895 
    1896 //The total number of operations is 7.66666666667
    1897 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1898 {
    1899         bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
    1900         bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
    1901         bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
    1902         return simd_or(loMask, hiMask);
    1903 }
    1904 
    19051896//The total number of operations is 0
    19061897template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
     
    19301921template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
    19311922{
    1932         return simd128<32>::constant<-65536>();
     1923        return simd128<32>::constant<4294901760ULL>();
    19331924}
    19341925
     
    19361927template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
    19371928{
    1938         return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
     1929        return _mm_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0));
    19391930}
    19401931
     
    19421933template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
    19431934{
    1944         return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
     1935        return _mm_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(0));
    19451936}
    19461937
     
    19971988}
    19981989
    1999 //The total number of operations is 7.33333333333
    2000 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
    2001 {
    2002         return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
    2003 }
    2004 
    2005 //The total number of operations is 19.0
    2006 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
    2007 {
    2008         bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
    2009         return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
    2010 }
    2011 
    2012 //The total number of operations is 1.0
    2013 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
    2014 {
    2015         return _mm_abs_epi8(arg1);
    2016 }
    2017 
    2018 //The total number of operations is 1.0
    2019 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
    2020 {
    2021         return _mm_abs_epi16(arg1);
    2022 }
    2023 
    2024 //The total number of operations is 1.0
    2025 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
    2026 {
    2027         return _mm_abs_epi32(arg1);
    2028 }
    2029 
    2030 //The total number of operations is 9.0
    2031 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
    2032 {
    2033         bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
    2034         return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
    2035 }
    2036 
    2037 //The total number of operations is 32.0
    2038 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
    2039 {
    2040         bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
    2041         return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1990//The total number of operations is 1.0
     1991template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1992{
     1993        return simd_or(arg1, arg2);
     1994}
     1995
     1996//The total number of operations is 15.6666666667
     1997template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1998{
     1999        return simd128<1>::ifh(simd128<2>::himask(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd128<128>::srli<1>(simd_or(simd_not(arg1), arg2))), simd_and(arg1, simd128<128>::srli<1>(simd_or(arg1, simd_not(arg2))))));
     2000}
     2001
     2002//The total number of operations is 6.0
     2003template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2004{
     2005        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::umax(arg1, arg2)), simd128<(8)>::umax(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2)));
     2006}
     2007
     2008//The total number of operations is 1.0
     2009template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2010{
     2011        return _mm_max_epu8(arg1, arg2);
     2012}
     2013
     2014//The total number of operations is 1.0
     2015template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2016{
     2017        return _mm_max_epu16(arg1, arg2);
     2018}
     2019
     2020//The total number of operations is 1.0
     2021template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2022{
     2023        return _mm_max_epu32(arg1, arg2);
     2024}
     2025
     2026//The total number of operations is 14.0
     2027template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2028{
     2029        bitblock128_t tmpAns = simd128<(32)>::umax(arg1, arg2);
     2030        bitblock128_t eqMask1 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg1));
     2031        bitblock128_t eqMask2 = simd128<64>::srli<(32)>(simd128<(32)>::eq(tmpAns, arg2));
     2032        return simd128<1>::ifh(simd128<64>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     2033}
     2034
     2035//The total number of operations is 29.6666666667
     2036template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2037{
     2038        bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
     2039        bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
     2040        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
     2041        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    20422042}
    20432043
     
    32083208IDISA_ALWAYS_INLINE bool bitblock128::all(bitblock128_t arg1)
    32093209{
    3210         return hsimd128<8>::signmask(simd128<8>::eq(arg1, simd128<8>::constant<-1>())) == 65535;
     3210        return hsimd128<8>::signmask(simd128<8>::eq(arg1, simd128<8>::constant<255>())) == 65535;
    32113211}
    32123212
     
    32173217}
    32183218
     3219//The total number of operations is 2.33333333333
     3220template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
     3221{
     3222        return simd128<128>::slli<sh>(arg1);
     3223}
     3224
    32193225//The total number of operations is 2.0
    32203226IDISA_ALWAYS_INLINE bool bitblock128::any(bitblock128_t arg1)
     
    32303236
    32313237//The total number of operations is 1.0
     3238IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
     3239{
     3240        _mm_store_si128((bitblock128_t*)(arg2), arg1);
     3241}
     3242
     3243//The total number of operations is 1.0
    32323244IDISA_ALWAYS_INLINE void bitblock128::store_unaligned(bitblock128_t arg1, bitblock128_t* arg2)
    32333245{
     
    32353247}
    32363248
    3237 //The total number of operations is 2.33333333333
    3238 template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
    3239 {
    3240         return simd128<128>::slli<sh>(arg1);
    3241 }
    3242 
    3243 //The total number of operations is 1.0
    3244 IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
    3245 {
    3246         _mm_store_si128((bitblock128_t*)(arg2), arg1);
    3247 }
    3248 
    32493249#endif
  • trunk/lib/idisa_cpp/idisa_sse4_2.cpp

    r3526 r3576  
    5656        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t shift_mask);
    5757        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
     58        static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
    5859        static IDISA_ALWAYS_INLINE bitblock128_t vsll(bitblock128_t arg1, bitblock128_t shift_mask);
    5960        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    6061        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    6162        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
    62         static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    6363        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    6464        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
     
    6666        static IDISA_ALWAYS_INLINE bitblock128_t any(bitblock128_t arg1);
    6767        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srai(bitblock128_t arg1);
    68         static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
     68        static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    6969        static IDISA_ALWAYS_INLINE bitblock128_t ugt(bitblock128_t arg1, bitblock128_t arg2);
    7070};
     
    135135IDISA_ALWAYS_INLINE bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
    136136IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1);
     137IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    137138IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
    138 IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    139139IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
    140140IDISA_ALWAYS_INLINE bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
     
    261261template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    262262template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    263 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
    264 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
    265 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
    266 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
    267 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
    268 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
    269 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    270263template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
    271264template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
     
    278271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1);
    279272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    280 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    281 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    282 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    283 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    284 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    285 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    286 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    287273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
    288274template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
     
    303289template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    304290template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
     291template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     292template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     293template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     294template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     295template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     296template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     297template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    305298template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    306299template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    311304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    312305template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    313 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    314 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    315 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    316 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    317 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    318 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    319 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    320 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     306template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     307template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     308template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     309template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     310template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     312template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     313template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
     314template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     315template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2);
     316template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2);
     317template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2);
     318template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2);
     319template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
     320template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
     321template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
     322template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
     323template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
     324template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
     325template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
     326template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
     327template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    321328template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2);
    322329template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lt(bitblock128_t arg1, bitblock128_t arg2);
     
    327334template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lt(bitblock128_t arg1, bitblock128_t arg2);
    328335template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lt(bitblock128_t arg1, bitblock128_t arg2);
    329 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    330 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
    331 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2);
    332 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2);
    333 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2);
    334 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2);
    335 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
    336 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
    337336template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask();
    338337template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask();
     
    350349template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    351350template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    352 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    353 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    354 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    355 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    356 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    357 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    358 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     351template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     352template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     353template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     354template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     355template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     356template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     357template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     358template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
    359359template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    360360template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    560560IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1)
    561561{
    562         return simd_xor(arg1, simd128<32>::constant<-1>());
     562        return simd_xor(arg1, simd128<32>::constant<4294967295ULL>());
     563}
     564
     565//The total number of operations is 1.0
     566IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
     567{
     568        return _mm_andnot_si128(arg2, arg1);
    563569}
    564570
     
    567573{
    568574        return _mm_or_si128(arg1, arg2);
    569 }
    570 
    571 //The total number of operations is 1.0
    572 IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
    573 {
    574         return _mm_andnot_si128(arg2, arg1);
    575575}
    576576
     
    13041304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::neg(bitblock128_t arg1)
    13051305{
    1306         return _mm_sign_epi32(arg1, simd128<32>::constant<-1>());
     1306        return _mm_sign_epi32(arg1, simd128<32>::constant<((4294967296ULL)-1)>());
    13071307}
    13081308
     
    14091409}
    14101410
    1411 //The total number of operations is 4.0
    1412 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1413 {
    1414         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1415 }
    1416 
    1417 //The total number of operations is 10.0
    1418 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1419 {
    1420         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1421         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1422 }
    1423 
    1424 //The total number of operations is 5.0
    1425 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1426 {
    1427         bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1428         return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1429 }
    1430 
    1431 //The total number of operations is 1.0
    1432 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1433 {
    1434         return _mm_srai_epi16(arg1, (int32_t)(sh));
    1435 }
    1436 
    1437 //The total number of operations is 1.0
    1438 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1439 {
    1440         return _mm_srai_epi32(arg1, (int32_t)(sh));
    1441 }
    1442 
    1443 //The total number of operations is 4.5
    1444 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1445 {
    1446         return simd_or(simd_and(simd128<64>::himask(), simd128<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd128<64>::srli<sh>(arg1) : simd128<(32)>::srai<(sh-(32))>(simd128<64>::srli<(32)>(arg1))));
    1447 }
    1448 
    1449 //The total number of operations is 11.0833333333
    1450 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1451 {
    1452         return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    1453 }
    1454 
    14551411//The total number of operations is 10.0
    14561412template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask)
     
    15151471}
    15161472
    1517 //The total number of operations is 0
    1518 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
    1519 {
    1520         return simd128<2>::constant<(1)>();
    1521 }
    1522 
    1523 //The total number of operations is 0
    1524 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
    1525 {
    1526         return simd128<4>::constant<(3)>();
    1527 }
    1528 
    1529 //The total number of operations is 0
    1530 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
    1531 {
    1532         return simd128<8>::constant<(15)>();
    1533 }
    1534 
    1535 //The total number of operations is 0
    1536 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
    1537 {
    1538         return simd128<16>::constant<(255)>();
    1539 }
    1540 
    1541 //The total number of operations is 0
    1542 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
    1543 {
    1544         return simd128<32>::constant<(65535)>();
    1545 }
    1546 
    1547 //The total number of operations is 0
    1548 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
    1549 {
    1550         return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
    1551 }
    1552 
    1553 //The total number of operations is 0
    1554 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
    1555 {
    1556         return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
    1557 }
    1558 
    15591473//The total number of operations is 10.0
    15601474template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsll(bitblock128_t arg1, bitblock128_t shift_mask)
     
    15731487template <> template <FieldType<1>::T val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
    15741488{
    1575         return simd128<32>::constant<(-1*val)>();
     1489        return simd128<2>::constant<((val+val)+val)>();
    15761490}
    15771491
     
    16731587}
    16741588
     1589//The total number of operations is 0
     1590template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
     1591{
     1592        return simd128<2>::constant<(1)>();
     1593}
     1594
     1595//The total number of operations is 0
     1596template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
     1597{
     1598        return simd128<4>::constant<(3)>();
     1599}
     1600
     1601//The total number of operations is 0
     1602template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
     1603{
     1604        return simd128<8>::constant<(15)>();
     1605}
     1606
     1607//The total number of operations is 0
     1608template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
     1609{
     1610        return simd128<16>::constant<(255)>();
     1611}
     1612
     1613//The total number of operations is 0
     1614template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
     1615{
     1616        return simd128<32>::constant<(65535)>();
     1617}
     1618
     1619//The total number of operations is 0
     1620template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
     1621{
     1622        return _mm_set_epi32((int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)));
     1623}
     1624
     1625//The total number of operations is 0
     1626template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
     1627{
     1628        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)));
     1629}
     1630
    16751631//The total number of operations is 1.0
    16761632template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2)
     
    17251681}
    17261682
    1727 //The total number of operations is 1.0
    1728 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1729 {
    1730         return simd_or(arg1, arg2);
    1731 }
    1732 
    1733 //The total number of operations is 15.6666666667
    1734 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1735 {
    1736         return simd128<1>::ifh(simd128<2>::himask(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd128<128>::srli<1>(simd_or(simd_not(arg1), arg2))), simd_and(arg1, simd128<128>::srli<1>(simd_or(arg1, simd_not(arg2))))));
    1737 }
    1738 
    1739 //The total number of operations is 6.0
    1740 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1741 {
    1742         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::umax(arg1, arg2)), simd128<(8)>::umax(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2)));
    1743 }
    1744 
    1745 //The total number of operations is 1.0
    1746 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1747 {
    1748         return _mm_max_epu8(arg1, arg2);
    1749 }
    1750 
    1751 //The total number of operations is 1.0
    1752 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1753 {
    1754         return _mm_max_epu16(arg1, arg2);
    1755 }
    1756 
    1757 //The total number of operations is 1.0
    1758 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1759 {
    1760         return _mm_max_epu32(arg1, arg2);
    1761 }
    1762 
    1763 //The total number of operations is 7.0
    1764 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1765 {
    1766         bitblock128_t high_bit = simd128<64>::constant<(9223372036854775808ULL)>();
    1767         return simd_xor(simd128<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1768 }
    1769 
    1770 //The total number of operations is 22.6666666667
    1771 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
    1772 {
    1773         bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
    1774         bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
    1775         bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    1776         return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1683//The total number of operations is 7.33333333333
     1684template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
     1685{
     1686        return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
     1687}
     1688
     1689//The total number of operations is 19.0
     1690template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
     1691{
     1692        bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
     1693        return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
     1694}
     1695
     1696//The total number of operations is 1.0
     1697template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
     1698{
     1699        return _mm_abs_epi8(arg1);
     1700}
     1701
     1702//The total number of operations is 1.0
     1703template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
     1704{
     1705        return _mm_abs_epi16(arg1);
     1706}
     1707
     1708//The total number of operations is 1.0
     1709template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
     1710{
     1711        return _mm_abs_epi32(arg1);
     1712}
     1713
     1714//The total number of operations is 5.0
     1715template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
     1716{
     1717        bitblock128_t gtMask = simd128<64>::gt(arg1, simd128<64>::constant<0>());
     1718        return simd128<1>::ifh(gtMask, arg1, simd128<64>::sub(gtMask, arg1));
     1719}
     1720
     1721//The total number of operations is 28.0
     1722template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
     1723{
     1724        bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
     1725        return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1726}
     1727
     1728//The total number of operations is 2.0
     1729template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1730{
     1731        return simd_not(simd_xor(arg1, arg2));
     1732}
     1733
     1734//The total number of operations is 8.0
     1735template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1736{
     1737        bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
     1738        bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
     1739        bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
     1740        return simd_or(loMask, hiMask);
     1741}
     1742
     1743//The total number of operations is 9.0
     1744template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1745{
     1746        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
     1747}
     1748
     1749//The total number of operations is 1.0
     1750template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1751{
     1752        return _mm_cmpeq_epi8(arg1, arg2);
     1753}
     1754
     1755//The total number of operations is 1.0
     1756template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1757{
     1758        return _mm_cmpeq_epi16(arg1, arg2);
     1759}
     1760
     1761//The total number of operations is 1.0
     1762template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1763{
     1764        return _mm_cmpeq_epi32(arg1, arg2);
     1765}
     1766
     1767//The total number of operations is 1.0
     1768template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1769{
     1770        return _mm_cmpeq_epi64(arg1, arg2);
     1771}
     1772
     1773//The total number of operations is 7.66666666667
     1774template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1775{
     1776        bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
     1777        bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
     1778        bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
     1779        return simd_or(loMask, hiMask);
     1780}
     1781
     1782//The total number of operations is 4.0
     1783template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
     1784{
     1785        return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
     1786}
     1787
     1788//The total number of operations is 10.0
     1789template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
     1790{
     1791        bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1792        return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1793}
     1794
     1795//The total number of operations is 5.0
     1796template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
     1797{
     1798        bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1799        return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1800}
     1801
     1802//The total number of operations is 1.0
     1803template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
     1804{
     1805        return _mm_srai_epi16(arg1, (int32_t)(sh));
     1806}
     1807
     1808//The total number of operations is 1.0
     1809template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
     1810{
     1811        return _mm_srai_epi32(arg1, (int32_t)(sh));
     1812}
     1813
     1814//The total number of operations is 4.5
     1815template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
     1816{
     1817        return simd_or(simd_and(simd128<64>::himask(), simd128<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd128<64>::srli<sh>(arg1) : simd128<(32)>::srai<(sh-(32))>(simd128<64>::srli<(32)>(arg1))));
     1818}
     1819
     1820//The total number of operations is 11.0833333333
     1821template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
     1822{
     1823        return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    17771824}
    17781825
     
    18311878}
    18321879
    1833 //The total number of operations is 2.0
    1834 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1835 {
    1836         return simd_not(simd_xor(arg1, arg2));
    1837 }
    1838 
    1839 //The total number of operations is 8.0
    1840 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1841 {
    1842         bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
    1843         bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
    1844         bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
    1845         return simd_or(loMask, hiMask);
    1846 }
    1847 
    1848 //The total number of operations is 9.0
    1849 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1850 {
    1851         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
    1852 }
    1853 
    1854 //The total number of operations is 1.0
    1855 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1856 {
    1857         return _mm_cmpeq_epi8(arg1, arg2);
    1858 }
    1859 
    1860 //The total number of operations is 1.0
    1861 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1862 {
    1863         return _mm_cmpeq_epi16(arg1, arg2);
    1864 }
    1865 
    1866 //The total number of operations is 1.0
    1867 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1868 {
    1869         return _mm_cmpeq_epi32(arg1, arg2);
    1870 }
    1871 
    1872 //The total number of operations is 1.0
    1873 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1874 {
    1875         return _mm_cmpeq_epi64(arg1, arg2);
    1876 }
    1877 
    1878 //The total number of operations is 7.66666666667
    1879 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1880 {
    1881         bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
    1882         bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
    1883         bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
    1884         return simd_or(loMask, hiMask);
    1885 }
    1886 
    18871880//The total number of operations is 0
    18881881template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
     
    19121905template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
    19131906{
    1914         return simd128<32>::constant<-65536>();
     1907        return simd128<32>::constant<4294901760ULL>();
    19151908}
    19161909
     
    19181911template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
    19191912{
    1920         return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
     1913        return _mm_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(((4294967296ULL)-1)), (int32_t)(0));
    19211914}
    19221915
     
    19241917template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
    19251918{
    1926         return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
     1919        return _mm_set_epi32((int32_t)(((4294967296ULL)-1)), (int32_t)(((4294967296ULL)-1)), (int32_t)(0), (int32_t)(0));
    19271920}
    19281921
     
    19791972}
    19801973
    1981 //The total number of operations is 7.33333333333
    1982 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
    1983 {
    1984         return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
    1985 }
    1986 
    1987 //The total number of operations is 19.0
    1988 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
    1989 {
    1990         bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
    1991         return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
    1992 }
    1993 
    1994 //The total number of operations is 1.0
    1995 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
    1996 {
    1997         return _mm_abs_epi8(arg1);
    1998 }
    1999 
    2000 //The total number of operations is 1.0
    2001 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
    2002 {
    2003         return _mm_abs_epi16(arg1);
    2004 }
    2005 
    2006 //The total number of operations is 1.0
    2007 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
    2008 {
    2009         return _mm_abs_epi32(arg1);
    2010 }
    2011 
    2012 //The total number of operations is 5.0
    2013 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
    2014 {
    2015         bitblock128_t gtMask = simd128<64>::gt(arg1, simd128<64>::constant<0>());
    2016         return simd128<1>::ifh(gtMask, arg1, simd128<64>::sub(gtMask, arg1));
    2017 }
    2018 
    2019 //The total number of operations is 28.0
    2020 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
    2021 {
    2022         bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
    2023         return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1974//The total number of operations is 1.0
     1975template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1976{
     1977        return simd_or(arg1, arg2);
     1978}
     1979
     1980//The total number of operations is 15.6666666667
     1981template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1982{
     1983        return simd128<1>::ifh(simd128<2>::himask(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd128<128>::srli<1>(simd_or(simd_not(arg1), arg2))), simd_and(arg1, simd128<128>::srli<1>(simd_or(arg1, simd_not(arg2))))));
     1984}
     1985
     1986//The total number of operations is 6.0
     1987template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1988{
     1989        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::umax(arg1, arg2)), simd128<(8)>::umax(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2)));
     1990}
     1991
     1992//The total number of operations is 1.0
     1993template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2)
     1994{
     1995        return _mm_max_epu8(arg1, arg2);
     1996}
     1997
     1998//The total number of operations is 1.0
     1999template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2000{
     2001        return _mm_max_epu16(arg1, arg2);
     2002}
     2003
     2004//The total number of operations is 1.0
     2005template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2006{
     2007        return _mm_max_epu32(arg1, arg2);
     2008}
     2009
     2010//The total number of operations is 7.0
     2011template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2012{
     2013        bitblock128_t high_bit = simd128<64>::constant<(9223372036854775808ULL)>();
     2014        return simd_xor(simd128<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     2015}
     2016
     2017//The total number of operations is 22.6666666667
     2018template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2)
     2019{
     2020        bitblock128_t tmpAns = simd128<(64)>::umax(arg1, arg2);
     2021        bitblock128_t eqMask1 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg1));
     2022        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
     2023        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    20242024}
    20252025
     
    31903190IDISA_ALWAYS_INLINE bool bitblock128::all(bitblock128_t arg1)
    31913191{
    3192         return hsimd128<8>::signmask(simd128<8>::eq(arg1, simd128<8>::constant<-1>())) == 65535;
     3192        return hsimd128<8>::signmask(simd128<8>::eq(arg1, simd128<8>::constant<255>())) == 65535;
    31933193}
    31943194
     
    31993199}
    32003200
     3201//The total number of operations is 2.33333333333
     3202template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
     3203{
     3204        return simd128<128>::slli<sh>(arg1);
     3205}
     3206
    32013207//The total number of operations is 2.0
    32023208IDISA_ALWAYS_INLINE bool bitblock128::any(bitblock128_t arg1)
     
    32123218
    32133219//The total number of operations is 1.0
     3220IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
     3221{
     3222        _mm_store_si128((bitblock128_t*)(arg2), arg1);
     3223}
     3224
     3225//The total number of operations is 1.0
    32143226IDISA_ALWAYS_INLINE void bitblock128::store_unaligned(bitblock128_t arg1, bitblock128_t* arg2)
    32153227{
     
    32173229}
    32183230
    3219 //The total number of operations is 2.33333333333
    3220 template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
    3221 {
    3222         return simd128<128>::slli<sh>(arg1);
    3223 }
    3224 
    3225 //The total number of operations is 1.0
    3226 IDISA_ALWAYS_INLINE void bitblock128::store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
    3227 {
    3228         _mm_store_si128((bitblock128_t*)(arg2), arg1);
    3229 }
    3230 
    32313231#endif
  • trunk/lib/idisa_cpp/idisa_ssse3.cpp

    r3526 r3576  
    5656        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t shift_mask);
    5757        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
     58        static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
    5859        static IDISA_ALWAYS_INLINE bitblock128_t vsll(bitblock128_t arg1, bitblock128_t shift_mask);
    5960        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    6061        template <typename FieldType<fw>::T val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    6162        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
    62         static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    6363        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    6464        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
     
    6666        static IDISA_ALWAYS_INLINE bitblock128_t any(bitblock128_t arg1);
    6767        template <uint16_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srai(bitblock128_t arg1);
    68         static IDISA_ALWAYS_INLINE bitblock128_t lt(bitblock128_t arg1, bitblock128_t arg2);
     68        static IDISA_ALWAYS_INLINE bitblock128_t add(bitblock128_t arg1, bitblock128_t arg2);
    6969        static IDISA_ALWAYS_INLINE bitblock128_t ugt(bitblock128_t arg1, bitblock128_t arg2);
    7070};
     
    135135IDISA_ALWAYS_INLINE bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
    136136IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1);
     137IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    137138IDISA_ALWAYS_INLINE bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
    138 IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
    139139IDISA_ALWAYS_INLINE bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
    140140IDISA_ALWAYS_INLINE bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
     
    261261template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    262262template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::ifh(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
    263 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
    264 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
    265 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
    266 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
    267 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
    268 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
    269 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    270263template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
    271264template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask);
     
    278271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add_hl(bitblock128_t arg1);
    279272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    280 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    281 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    282 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    283 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    284 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    285 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    286 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    287273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
    288274template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::vsll(bitblock128_t arg1, bitblock128_t shift_mask);
     
    303289template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    304290template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
     291template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     292template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     293template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     294template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     295template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     296template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     297template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    305298template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    306299template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    311304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    312305template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    313 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    314 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    315 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    316 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    317 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    318 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    319 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    320 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     306template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     307template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     308template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     309template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     310template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     312template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     313template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
     314template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     315template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2);
     316template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2);
     317template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2);
     318template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2);
     319template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
     320template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
     321template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1);
     322template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1);
     323template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1);
     324template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1);
     325template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1);
     326template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1);
     327template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1);
    321328template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::lt(bitblock128_t arg1, bitblock128_t arg2);
    322329template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lt(bitblock128_t arg1, bitblock128_t arg2);
     
    // Declarations of explicit specializations of simd128<fw>::lt(arg1, arg2) for fw = 64 and 128.
    327334template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lt(bitblock128_t arg1, bitblock128_t arg2);
    328335template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lt(bitblock128_t arg1, bitblock128_t arg2);
    // Declarations of explicit specializations of simd128<fw>::eq(arg1, arg2) for fw = 1 through 128.
    329 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    330 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
    331 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2);
    332 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2);
    333 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2);
    334 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2);
    335 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2);
    336 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2);
    // Declarations of explicit specializations of simd128<fw>::himask(), which takes no arguments.
    337336template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask();
    338337template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask();
     
    // Declarations of explicit specializations of simd128<fw>::add(arg1, arg2) for fw = 64 and 128.
    350349template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    351350template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    // Declarations of explicit specializations of the unary simd128<fw>::abs(arg1) for fw = 2 through 128.
    352 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    353 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    354 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    355 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    356 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    357 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    358 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     // Declarations of explicit specializations of simd128<fw>::umax(arg1, arg2) for fw = 1 through 128.
     351template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     352template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     353template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     354template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     355template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     356template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     357template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     358template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
    // Declarations of explicit specializations of hsimd128<fw>::umin_hl(arg1, arg2) for fw = 2 and 4.
    359359template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    360360template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    // Returns arg1 XOR simd128<32>::constant<v>().
    // The two spellings of v shown here, -1 and 4294967295 (2^32 - 1), are the same
    // all-ones pattern in 32 bits; the second avoids a negative template argument.
    // presumably constant<v>() replicates v into every 32-bit field, which would make
    // the result the bitwise complement of arg1 -- TODO confirm against its definition.
    560560IDISA_ALWAYS_INLINE bitblock128_t simd_not(bitblock128_t arg1)
    561561{
    562         return simd_xor(arg1, simd128<32>::constant<-1>());
     562        return simd_xor(arg1, simd128<32>::constant<4294967295ULL>());
     563}
     564
     565//The total number of operations is 1.0
     // Returns arg1 AND (NOT arg2).
     // _mm_andnot_si128(a, b) computes (~a) & b, so arg2 is passed first because it
     // is the operand that gets complemented.
     566IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
     567{
     568        return _mm_andnot_si128(arg2, arg1);
    563569}
    564570
     
    567573{
    568574        return _mm_or_si128(arg1, arg2);
    569 }
    570 
    571 //The total number of operations is 1.0
    // Returns arg1 AND (NOT arg2).
    // _mm_andnot_si128(a, b) computes (~a) & b, so arg2 is passed first because it
    // is the operand that gets complemented.
    572 IDISA_ALWAYS_INLINE bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
    573 {
    574         return _mm_andnot_si128(arg2, arg1);
    575575}
    576576
     
    // Calls _mm_sign_epi32(arg1, c) with c = simd128<32>::constant<v>().
    // _mm_sign_epi32(a, b) negates each 32-bit element of a whose matching element of b
    // is negative (and zeroes it where b is zero).
    // The two spellings of v shown here, -1 and (4294967296ULL)-1 == 4294967295, are the
    // same 32-bit pattern, which reads as -1 when interpreted as a signed 32-bit value.
    // presumably constant<v>() replicates v into every 32-bit field, so every element of
    // arg1 is negated -- TODO confirm against its definition.
    13081308template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::neg(bitblock128_t arg1)
    13091309{
    1310         return _mm_sign_epi32(arg1, simd128<32>::constant<-1>());
     1310        return _mm_sign_epi32(arg1, simd128<32>::constant<((4294967296ULL)-1)>());
    13111311}
    13121312
     
    14131413}
    14141414
    1415 //The total number of operations is 4.0
    // srai for 2-bit fields. sh == 0 returns arg1 unchanged; for every other sh the
    // result is (simd128<2>::himask() & arg1) | simd128<2>::srli<1>(arg1), so the exact
    // value of a nonzero sh is not consulted.
    // presumably himask() selects the high bit of each 2-bit field, keeping the sign
    // bit in place while the shifted copy fills the low bit -- TODO confirm.
    1416 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1417 {
    1418         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1419 }
    1420 
    1421 //The total number of operations is 10.0
    // srai for 4-bit fields, built from a logical shift plus a fill term.
    // The shift amount is clamped: sh >= 4 is treated as 3. The (sh < 0) arm can never
    // be taken because sh is a uint16_t.
    // tmp is arg1 shifted right logically by the clamped amount s. The return value ORs
    // tmp with 0 - (tmp & (1 << (4 - s - 1))): bit index 4-s-1 is where the original top
    // bit of a field lands after the shift, and subtracting that isolated bit from zero
    // is what produces the fill above it (assuming sub wraps within each 4-bit field --
    // TODO confirm).
    1422 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1423 {
    1424         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1425         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1426 }
    1427 
    1428 //The total number of operations is 5.0
    // srai for 8-bit fields, built from a logical shift plus a fill term.
    // The shift amount is clamped: sh >= 8 is treated as 7. The (sh < 0) arm can never
    // be taken because sh is a uint16_t.
    // tmp is arg1 shifted right logically by the clamped amount s. The return value ORs
    // tmp with 0 - (tmp & (1 << (8 - s - 1))): bit index 8-s-1 is where the original top
    // bit of a field lands after the shift, and subtracting that isolated bit from zero
    // is what produces the fill above it (assuming sub wraps within each 8-bit field --
    // TODO confirm).
    1429 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1430 {
    1431         bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1432         return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1433 }
    1434 
    1435 //The total number of operations is 1.0
    // srai for 16-bit fields: forwards directly to _mm_srai_epi16, which shifts each
    // 16-bit element right by the given count while shifting in sign bits.
    // sh is widened from uint16_t to int32_t to match the intrinsic's count parameter.
    1436 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1437 {
    1438         return _mm_srai_epi16(arg1, (int32_t)(sh));
    1439 }
    1440 
    1441 //The total number of operations is 1.0
    // srai for 32-bit fields: forwards directly to _mm_srai_epi32, which shifts each
    // 32-bit element right by the given count while shifting in sign bits.
    // sh is widened from uint16_t to int32_t to match the intrinsic's count parameter.
    1442 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1443 {
    1444         return _mm_srai_epi32(arg1, (int32_t)(sh));
    1445 }
    1446 
    1447 //The total number of operations is 4.5
    // srai for 64-bit fields, composed from 32-bit srai and 64-bit srli. The result is
    // the OR of two terms:
    //   1) simd128<64>::himask() & simd128<32>::srai<min(sh, 32)>(arg1)
    //   2) sh <= 32 ? simd128<64>::srli<sh>(arg1)
    //               : simd128<32>::srai<sh - 32>(simd128<64>::srli<32>(arg1))
    // NOTE(review): sh is a compile-time constant, but both arms of ?: are still
    // compiled, so srai<(sh-(32))> is named even when sh < 32, where sh-32 is negative
    // before conversion to uint16_t -- confirm this is intended.
    1448 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1449 {
    1450         return simd_or(simd_and(simd128<64>::himask(), simd128<(32)>::srai<((sh < (32)) ? sh : (32))>(arg1)), ((sh <= (32)) ? simd128<64>::srli<sh>(arg1) : simd128<(32)>::srai<(sh-(32))>(simd128<64>::srli<(32)>(arg1))));
    1451 }
    1452 
    1453 //The total number of operations is 11.0833333333
    // srai for the single 128-bit field, composed from 64-bit srai and 128-bit srli.
    // The result is the OR of two terms:
    //   1) simd128<128>::himask() & simd128<64>::srai<min(sh, 64)>(arg1)
    //   2) sh <= 64 ? simd128<128>::srli<sh>(arg1)
    //               : simd128<64>::srai<sh - 64>(simd128<128>::srli<64>(arg1))
    // NOTE(review): sh is a compile-time constant, but both arms of ?: are still
    // compiled, so srai<(sh-(64))> is named even when sh < 64, where sh-64 is negative
    // before conversion to uint16_t -- confirm this is intended.
    1454 template <> template <uint16_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1455 {
    1456         return simd_or(simd_and(simd128<128>::himask(), simd128<(64)>::srai<((sh < (64)) ? sh : (64))>(arg1)), ((sh <= (64)) ? simd128<128>::srli<sh>(arg1) : simd128<(64)>::srai<(sh-(64))>(simd128<128>::srli<(64)>(arg1))));
    1457 }
    1458 
    14591415//The total number of operations is 10.0
    14601416template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::vsrl(bitblock128_t arg1, bitblock128_t shift_mask)
     
    15191475}
    15201476