Changeset 1580 for trunk/lib/idisa_cpp


Ignore:
Timestamp:
Oct 23, 2011, 9:43:33 AM (8 years ago)
Author:
cameron
Message:

bitblock::srl, sll, srli, slli implementations

Location:
trunk/lib/idisa_cpp
Files:
6 edited

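Legend:

Unmodified (row shows both the old r1573 and the new r1580 line number, fused together, e.g. "2626" = old line 26, new line 26)
Added (row shows only the new r1580 line number)
Removed (row shows only the old r1573 line number)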
  • trunk/lib/idisa_cpp/idisa_avx.cpp

    r1573 r1580  
    2626        static IDISA_ALWAYS_INLINE bitblock256_t sub(bitblock256_t arg1, bitblock256_t arg2);
    2727        static IDISA_ALWAYS_INLINE bitblock256_t add_hl(bitblock256_t arg1);
     28        static IDISA_ALWAYS_INLINE bitblock256_t lomask();
    2829        static IDISA_ALWAYS_INLINE bitblock256_t umin(bitblock256_t arg1, bitblock256_t arg2);
    2930        template <uint64_t val> static IDISA_ALWAYS_INLINE bitblock256_t constant();
    3031        static IDISA_ALWAYS_INLINE bitblock256_t min(bitblock256_t arg1, bitblock256_t arg2);
    31         static IDISA_ALWAYS_INLINE bitblock256_t lomask();
    3232        static IDISA_ALWAYS_INLINE bitblock256_t umax(bitblock256_t arg1, bitblock256_t arg2);
    3333        static IDISA_ALWAYS_INLINE bitblock256_t abs(bitblock256_t arg1);
     
    8686public:
    8787        static IDISA_ALWAYS_INLINE bitblock256_t load_unaligned(bitblock256_t* arg1);
     88        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
    8889        static IDISA_ALWAYS_INLINE void store_aligned(bitblock256_t* arg1, bitblock256_t arg2);
    8990        static IDISA_ALWAYS_INLINE bool all(bitblock256_t arg1);
    9091        static IDISA_ALWAYS_INLINE bool any(bitblock256_t arg1);
    9192        static IDISA_ALWAYS_INLINE uint64_t popcount(bitblock256_t arg1);
     93        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
    9294        static IDISA_ALWAYS_INLINE bitblock256_t load_aligned(bitblock256_t* arg1);
    9395        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock256_t* arg1, bitblock256_t arg2);
     
    239241template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1);
    240242template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1);
    241 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
    242 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
    243 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
    244 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
    245 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
    246 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
    247 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
    248 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
    249243template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant();
    250244template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant();
     
    265259template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::min(bitblock256_t arg1, bitblock256_t arg2);
    266260template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::min(bitblock256_t arg1, bitblock256_t arg2);
     261template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
     262template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
     263template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
     264template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
     265template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
     266template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
     267template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
     268template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
    267269template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2);
    268270template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2);
     
    274276template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2);
    275277template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2);
    276 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
    277 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
    278 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
    279 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
    280 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
    281 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
    282 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
    283 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
    284 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
     278template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1);
     279template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1);
     280template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1);
     281template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1);
     282template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1);
     283template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1);
     284template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1);
     285template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1);
    285286template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2);
    286287template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2);
     
    317318template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2);
    318319template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2);
    319 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1);
    320 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1);
    321 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1);
    322 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1);
    323 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1);
    324 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1);
    325 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1);
    326 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1);
     320template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
     321template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
     322template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
     323template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
     324template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
     325template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
     326template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
     327template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
     328template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
    327329template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
    328330template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
     
    470472template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    471473template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    472 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1);
    473 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::slli(bitblock256_t arg1);
    474 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::slli(bitblock256_t arg1);
    475 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::slli(bitblock256_t arg1);
    476 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::slli(bitblock256_t arg1);
    477 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1);
    478 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1);
    479 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1);
    480474template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    481475template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
     
    506500template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2);
    507501template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2);
     502template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1);
     503template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::slli(bitblock256_t arg1);
     504template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::slli(bitblock256_t arg1);
     505template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::slli(bitblock256_t arg1);
     506template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::slli(bitblock256_t arg1);
     507template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1);
     508template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1);
     509template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1);
    508510template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    509511template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
     
    15471549
    15481550//The total number of operations is 0
    1549 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
    1550 {
    1551         return simd256<2>::constant<(1)>();
    1552 }
    1553 
    1554 //The total number of operations is 0
    1555 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
    1556 {
    1557         return simd256<4>::constant<(3)>();
    1558 }
    1559 
    1560 //The total number of operations is 0
    1561 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
    1562 {
    1563         return simd256<8>::constant<(15)>();
    1564 }
    1565 
    1566 //The total number of operations is 0
    1567 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
    1568 {
    1569         return simd256<16>::constant<(255)>();
    1570 }
    1571 
    1572 //The total number of operations is 0
    1573 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
    1574 {
    1575         return simd256<32>::constant<(65535)>();
    1576 }
    1577 
    1578 //The total number of operations is 0
    1579 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
    1580 {
    1581         return ((bitblock256_t)_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1)));
    1582 }
    1583 
    1584 //The total number of operations is 0
    1585 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
    1586 {
    1587         return ((bitblock256_t)_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1)));
    1588 }
    1589 
    1590 //The total number of operations is 0
    1591 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
    1592 {
    1593         return ((bitblock256_t)_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1)));
    1594 }
    1595 
    1596 //The total number of operations is 0
    15971551template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant()
    15981552{
     
    17151669}
    17161670
     1671//The total number of operations is 0
     1672template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
     1673{
     1674        return simd256<2>::constant<(1)>();
     1675}
     1676
     1677//The total number of operations is 0
     1678template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
     1679{
     1680        return simd256<4>::constant<(3)>();
     1681}
     1682
     1683//The total number of operations is 0
     1684template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
     1685{
     1686        return simd256<8>::constant<(15)>();
     1687}
     1688
     1689//The total number of operations is 0
     1690template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
     1691{
     1692        return simd256<16>::constant<(255)>();
     1693}
     1694
     1695//The total number of operations is 0
     1696template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
     1697{
     1698        return simd256<32>::constant<(65535)>();
     1699}
     1700
     1701//The total number of operations is 0
     1702template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
     1703{
     1704        return ((bitblock256_t)_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1)));
     1705}
     1706
     1707//The total number of operations is 0
     1708template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
     1709{
     1710        return ((bitblock256_t)_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1)));
     1711}
     1712
     1713//The total number of operations is 0
     1714template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
     1715{
     1716        return ((bitblock256_t)_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1)));
     1717}
     1718
    17171719//The total number of operations is 1
    17181720template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)
     
    17791781}
    17801782
    1781 //The total number of operations is 1
    1782 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1783 {
    1784         return simd_or(arg1, arg2);
    1785 }
    1786 
    1787 //The total number of operations is 28
    1788 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1789 {
    1790         bitblock256_t tmpAns = simd256<(1)>::umax(arg1, arg2);
    1791         bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
    1792         bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
    1793         return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1794 }
    1795 
    1796 //The total number of operations is 20
    1797 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1798 {
    1799         return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
    1800 }
    1801 
    1802 //The total number of operations is 8
    1803 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1804 {
    1805         return avx_general_combine256(_mm_max_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1806 }
    1807 
    1808 //The total number of operations is 8
    1809 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1810 {
    1811         return avx_general_combine256(_mm_max_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1812 }
    1813 
    1814 //The total number of operations is 8
    1815 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1816 {
    1817         return avx_general_combine256(_mm_max_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
    1818 }
    1819 
    1820 //The total number of operations is 14
    1821 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1822 {
    1823         bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808UL)>();
    1824         return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
    1825 }
    1826 
    1827 //The total number of operations is 77
    1828 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1829 {
    1830         bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
    1831         bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
    1832         bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
    1833         return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1834 }
    1835 
    1836 //The total number of operations is 264
    1837 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
    1838 {
    1839         bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
    1840         bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
    1841         bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
    1842         return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     1783//The total number of operations is 45
     1784template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)
     1785{
     1786        return simd256<1>::ifh(simd256<2>::himask(), simd_and(arg1, simd256<256>::slli<1>(simd_not(arg1))), arg1);
     1787}
     1788
     1789//The total number of operations is 51
     1790template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1)
     1791{
     1792        bitblock256_t gtMask = simd256<4>::gt(arg1, simd256<4>::constant<0>());
     1793        return simd256<1>::ifh(gtMask, arg1, simd256<4>::sub(gtMask, arg1));
     1794}
     1795
     1796//The total number of operations is 6
     1797template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1)
     1798{
     1799        return avx_general_combine256(_mm_abs_epi8(avx_select_hi128(arg1)), _mm_abs_epi8(avx_select_lo128(arg1)));
     1800}
     1801
     1802//The total number of operations is 6
     1803template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1)
     1804{
     1805        return avx_general_combine256(_mm_abs_epi16(avx_select_hi128(arg1)), _mm_abs_epi16(avx_select_lo128(arg1)));
     1806}
     1807
     1808//The total number of operations is 6
     1809template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1)
     1810{
     1811        return avx_general_combine256(_mm_abs_epi32(avx_select_hi128(arg1)), _mm_abs_epi32(avx_select_lo128(arg1)));
     1812}
     1813
     1814//The total number of operations is 19
     1815template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1)
     1816{
     1817        bitblock256_t gtMask = simd256<64>::gt(arg1, simd256<64>::constant<0>());
     1818        return simd256<1>::ifh(gtMask, arg1, simd256<64>::sub(gtMask, arg1));
     1819}
     1820
     1821//The total number of operations is 117
     1822template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1)
     1823{
     1824        bitblock256_t eqMask = simd256<128>::eq(simd256<1>::ifh(simd256<128>::himask(), simd256<(64)>::abs(arg1), arg1), arg1);
     1825        return simd256<1>::ifh(eqMask, arg1, simd256<128>::sub(eqMask, arg1));
     1826}
     1827
     1828//The total number of operations is 391
     1829template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1)
     1830{
     1831        bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1);
     1832        return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1));
    18431833}
    18441834
     
    20732063}
    20742064
    2075 //The total number of operations is 45
    2076 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)
    2077 {
    2078         return simd256<1>::ifh(simd256<2>::himask(), simd_and(arg1, simd256<256>::slli<1>(simd_not(arg1))), arg1);
    2079 }
    2080 
    2081 //The total number of operations is 51
    2082 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1)
    2083 {
    2084         bitblock256_t gtMask = simd256<4>::gt(arg1, simd256<4>::constant<0>());
    2085         return simd256<1>::ifh(gtMask, arg1, simd256<4>::sub(gtMask, arg1));
    2086 }
    2087 
    2088 //The total number of operations is 6
    2089 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1)
    2090 {
    2091         return avx_general_combine256(_mm_abs_epi8(avx_select_hi128(arg1)), _mm_abs_epi8(avx_select_lo128(arg1)));
    2092 }
    2093 
    2094 //The total number of operations is 6
    2095 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1)
    2096 {
    2097         return avx_general_combine256(_mm_abs_epi16(avx_select_hi128(arg1)), _mm_abs_epi16(avx_select_lo128(arg1)));
    2098 }
    2099 
    2100 //The total number of operations is 6
    2101 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1)
    2102 {
    2103         return avx_general_combine256(_mm_abs_epi32(avx_select_hi128(arg1)), _mm_abs_epi32(avx_select_lo128(arg1)));
    2104 }
    2105 
    2106 //The total number of operations is 19
    2107 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1)
    2108 {
    2109         bitblock256_t gtMask = simd256<64>::gt(arg1, simd256<64>::constant<0>());
    2110         return simd256<1>::ifh(gtMask, arg1, simd256<64>::sub(gtMask, arg1));
    2111 }
    2112 
    2113 //The total number of operations is 117
    2114 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1)
    2115 {
    2116         bitblock256_t eqMask = simd256<128>::eq(simd256<1>::ifh(simd256<128>::himask(), simd256<(64)>::abs(arg1), arg1), arg1);
    2117         return simd256<1>::ifh(eqMask, arg1, simd256<128>::sub(eqMask, arg1));
    2118 }
    2119 
    2120 //The total number of operations is 391
    2121 template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1)
    2122 {
    2123         bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1);
    2124         return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1));
     2065//The total number of operations is 1
     2066template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2067{
     2068        return simd_or(arg1, arg2);
     2069}
     2070
     2071//The total number of operations is 28
     2072template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2073{
     2074        bitblock256_t tmpAns = simd256<(1)>::umax(arg1, arg2);
     2075        bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
     2076        bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
     2077        return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     2078}
     2079
     2080//The total number of operations is 20
     2081template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2082{
     2083        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
     2084}
     2085
     2086//The total number of operations is 8
     2087template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2088{
     2089        return avx_general_combine256(_mm_max_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2090}
     2091
     2092//The total number of operations is 8
     2093template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2094{
     2095        return avx_general_combine256(_mm_max_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2096}
     2097
     2098//The total number of operations is 8
     2099template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2100{
     2101        return avx_general_combine256(_mm_max_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
     2102}
     2103
     2104//The total number of operations is 14
     2105template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2106{
     2107        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808UL)>();
     2108        return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
     2109}
     2110
     2111//The total number of operations is 77
     2112template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2113{
     2114        bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
     2115        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
     2116        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
     2117        return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
     2118}
     2119
     2120//The total number of operations is 264
     2121template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
     2122{
     2123        bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
     2124        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
     2125        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
     2126        return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    21252127}
    21262128
     
    30503052}
    30513053
     3054//The total number of operations is 5
     3055template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3056{
     3057        return simd256<1>::ifh(simd256<(4)>::himask(), mvmd256<1>::fill2(val1, val2), mvmd256<1>::fill2(val3, val4));
     3058}
     3059
     3060//The total number of operations is 5
     3061template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3062{
     3063        return simd256<1>::ifh(simd256<(8)>::himask(), mvmd256<2>::fill2(val1, val2), mvmd256<2>::fill2(val3, val4));
     3064}
     3065
     3066//The total number of operations is 5
     3067template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3068{
     3069        return simd256<1>::ifh(simd256<(16)>::himask(), mvmd256<4>::fill2(val1, val2), mvmd256<4>::fill2(val3, val4));
     3070}
     3071
     3072//The total number of operations is 5
     3073template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3074{
     3075        return simd256<1>::ifh(simd256<(32)>::himask(), mvmd256<8>::fill2(val1, val2), mvmd256<8>::fill2(val3, val4));
     3076}
     3077
     3078//The total number of operations is 3
     3079template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3080{
     3081        return simd_or(mvmd256<(32)>::fill4((val1<<16), (val3<<16), (val1<<16), (val3<<16)), mvmd256<(32)>::fill4((val2&(65535)), (val4&(65535)), (val2&(65535)), (val4&(65535))));
     3082}
     3083
     3084//The total number of operations is 1
     3085template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     3086{
     3087        return (bitblock256_t)_mm256_set_epi32((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4));
     3088}
     3089
     3090//The total number of operations is 41
         // 2-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*2 as the shift amount (sh scaled by the field width).
     3091template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::srli(bitblock256_t arg1)
     3092{
     3093        return simd256<256>::srli<(sh*2)>(arg1);
     3094}
     3095
     3096//The total number of operations is 41
         // 4-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*4 as the shift amount (sh scaled by the field width).
     3097template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::srli(bitblock256_t arg1)
     3098{
     3099        return simd256<256>::srli<(sh*4)>(arg1);
     3100}
     3101
     3102//The total number of operations is 41
         // 8-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*8 as the shift amount (sh scaled by the field width).
     3103template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::srli(bitblock256_t arg1)
     3104{
     3105        return simd256<256>::srli<(sh*8)>(arg1);
     3106}
     3107
     3108//The total number of operations is 41
         // 16-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*16 as the shift amount (sh scaled by the field width).
     3109template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::srli(bitblock256_t arg1)
     3110{
     3111        return simd256<256>::srli<(sh*16)>(arg1);
     3112}
     3113
     3114//The total number of operations is 41
         // 32-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*32 as the shift amount (sh scaled by the field width).
     3115template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::srli(bitblock256_t arg1)
     3116{
     3117        return simd256<256>::srli<(sh*32)>(arg1);
     3118}
     3119
     3120//The total number of operations is 41
         // 64-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*64 as the shift amount (sh scaled by the field width).
     3121template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1)
     3122{
     3123        return simd256<256>::srli<(sh*64)>(arg1);
     3124}
     3125
     3126//The total number of operations is 41
         // 128-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*128 as the shift amount (sh scaled by the field width).
     3127template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1)
     3128{
     3129        return simd256<256>::srli<(sh*128)>(arg1);
     3130}
     3131
     3132//The total number of operations is 41
         // 256-bit-field specialization: forwards to simd256<256>::srli with sh*256 as the
         // shift amount, so any non-zero sh requests a shift of at least the full block width.
     3133template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1)
     3134{
     3135        return simd256<256>::srli<(sh*256)>(arg1);
     3136}
     3137
     3138//The total number of operations is 1
         // 1-bit-field specialization: packs val1 (shifted left by 1) and the low bit of val2
         // into one value and hands it to mvmd256<2>::fill. val1 is not masked here.
     3139template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill2(uint64_t val1, uint64_t val2)
     3140{
     3141        return mvmd256<(2)>::fill(((val1<<1)|(val2&(1))));
     3142}
     3143
     3144//The total number of operations is 1
         // 2-bit-field specialization: packs val1 (shifted left by 2) and the low 2 bits of val2
         // into one value and hands it to mvmd256<4>::fill. val1 is not masked here.
     3145template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill2(uint64_t val1, uint64_t val2)
     3146{
     3147        return mvmd256<(4)>::fill(((val1<<2)|(val2&(3))));
     3148}
     3149
     3150//The total number of operations is 1
         // 4-bit-field specialization: packs val1 (shifted left by 4) and the low 4 bits of val2
         // into one value and hands it to mvmd256<8>::fill. val1 is not masked here.
     3151template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill2(uint64_t val1, uint64_t val2)
     3152{
     3153        return mvmd256<(8)>::fill(((val1<<4)|(val2&(15))));
     3154}
     3155
     3156//The total number of operations is 1
         // 8-bit-field specialization: packs val1 (shifted left by 8) and the low 8 bits of val2
         // into one value and hands it to mvmd256<16>::fill. val1 is not masked here.
     3157template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill2(uint64_t val1, uint64_t val2)
     3158{
     3159        return mvmd256<(16)>::fill(((val1<<8)|(val2&(255))));
     3160}
     3161
     3162//The total number of operations is 1
         // 16-bit-field specialization: packs val1 (shifted left by 16) and the low 16 bits of val2
         // into one value and hands it to mvmd256<32>::fill. val1 is not masked here.
     3163template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill2(uint64_t val1, uint64_t val2)
     3164{
     3165        return mvmd256<(32)>::fill(((val1<<16)|(val2&(65535))));
     3166}
     3167
     3168//The total number of operations is 5
         // 32-bit-field specialization: combines mvmd256<32>::fill(val1) and
         // mvmd256<32>::fill(val2) through simd256<1>::ifh, with simd256<64>::himask()
         // as the selector. Presumably ifh is a bitwise select - confirm against its definition.
     3169template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill2(uint64_t val1, uint64_t val2)
     3170{
     3171        return simd256<1>::ifh(simd256<(64)>::himask(), mvmd256<32>::fill(val1), mvmd256<32>::fill(val2));
     3172}
     3173
     3174//The total number of operations is 82
         // 2-bit-field specialization: ORs mvmd256<2>::slli<sh>(arg1) with
         // mvmd256<2>::srli<128-sh>(arg2); 128 = 256/2. sh is unsigned, so a value
         // above 128 would wrap the srli template argument.
     3175template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3176{
     3177        return simd_or(mvmd256<2>::slli<sh>(arg1), mvmd256<2>::srli<((128)-sh)>(arg2));
     3178}
     3179
     3180//The total number of operations is 82
         // 4-bit-field specialization: ORs mvmd256<4>::slli<sh>(arg1) with
         // mvmd256<4>::srli<64-sh>(arg2); 64 = 256/4. sh is unsigned, so a value
         // above 64 would wrap the srli template argument.
     3181template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3182{
     3183        return simd_or(mvmd256<4>::slli<sh>(arg1), mvmd256<4>::srli<((64)-sh)>(arg2));
     3184}
     3185
     3186//The total number of operations is 82
         // 8-bit-field specialization: ORs mvmd256<8>::slli<sh>(arg1) with
         // mvmd256<8>::srli<32-sh>(arg2); 32 = 256/8. sh is unsigned, so a value
         // above 32 would wrap the srli template argument.
     3187template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3188{
     3189        return simd_or(mvmd256<8>::slli<sh>(arg1), mvmd256<8>::srli<((32)-sh)>(arg2));
     3190}
     3191
     3192//The total number of operations is 82
         // 16-bit-field specialization: ORs mvmd256<16>::slli<sh>(arg1) with
         // mvmd256<16>::srli<16-sh>(arg2); 16 = 256/16. sh is unsigned, so a value
         // above 16 would wrap the srli template argument.
     3193template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3194{
     3195        return simd_or(mvmd256<16>::slli<sh>(arg1), mvmd256<16>::srli<((16)-sh)>(arg2));
     3196}
     3197
     3198//The total number of operations is 82
         // 32-bit-field specialization: ORs mvmd256<32>::slli<sh>(arg1) with
         // mvmd256<32>::srli<8-sh>(arg2); 8 = 256/32. sh is unsigned, so a value
         // above 8 would wrap the srli template argument.
     3199template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3200{
     3201        return simd_or(mvmd256<32>::slli<sh>(arg1), mvmd256<32>::srli<((8)-sh)>(arg2));
     3202}
     3203
     3204//The total number of operations is 82
         // 64-bit-field specialization: ORs mvmd256<64>::slli<sh>(arg1) with
         // mvmd256<64>::srli<4-sh>(arg2); 4 = 256/64. sh is unsigned, so a value
         // above 4 would wrap the srli template argument.
     3205template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3206{
     3207        return simd_or(mvmd256<64>::slli<sh>(arg1), mvmd256<64>::srli<((4)-sh)>(arg2));
     3208}
     3209
     3210//The total number of operations is 82
         // 128-bit-field specialization: ORs mvmd256<128>::slli<sh>(arg1) with
         // mvmd256<128>::srli<2-sh>(arg2); 2 = 256/128. sh is unsigned, so a value
         // above 2 would wrap the srli template argument.
     3211template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3212{
     3213        return simd_or(mvmd256<128>::slli<sh>(arg1), mvmd256<128>::srli<((2)-sh)>(arg2));
     3214}
     3215
     3216//The total number of operations is 82
         // 256-bit-field specialization: ORs mvmd256<256>::slli<sh>(arg1) with
         // mvmd256<256>::srli<1-sh>(arg2); 1 = 256/256. sh is unsigned, so a value
         // above 1 would wrap the srli template argument.
     3217template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2)
     3218{
     3219        return simd_or(mvmd256<256>::slli<sh>(arg1), mvmd256<256>::srli<((1)-sh)>(arg2));
     3220}
     3221
    30523222//The total number of operations is 40
    30533223template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1)
     
    30983268}
    30993269
    3100 //The total number of operations is 5
         // 1-bit-field specialization: combines mvmd256<1>::fill2(val1, val2) and
         // mvmd256<1>::fill2(val3, val4) through simd256<1>::ifh, with simd256<4>::himask()
         // as the selector. Presumably ifh is a bitwise select - confirm against its definition.
    3101 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
    3102 {
    3103         return simd256<1>::ifh(simd256<(4)>::himask(), mvmd256<1>::fill2(val1, val2), mvmd256<1>::fill2(val3, val4));
    3104 }
    3105 
    3106 //The total number of operations is 5
         // 2-bit-field specialization: combines mvmd256<2>::fill2(val1, val2) and
         // mvmd256<2>::fill2(val3, val4) through simd256<1>::ifh, with simd256<8>::himask()
         // as the selector. Presumably ifh is a bitwise select - confirm against its definition.
    3107 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
    3108 {
    3109         return simd256<1>::ifh(simd256<(8)>::himask(), mvmd256<2>::fill2(val1, val2), mvmd256<2>::fill2(val3, val4));
    3110 }
    3111 
    3112 //The total number of operations is 5
         // 4-bit-field specialization: combines mvmd256<4>::fill2(val1, val2) and
         // mvmd256<4>::fill2(val3, val4) through simd256<1>::ifh, with simd256<16>::himask()
         // as the selector. Presumably ifh is a bitwise select - confirm against its definition.
    3113 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
    3114 {
    3115         return simd256<1>::ifh(simd256<(16)>::himask(), mvmd256<4>::fill2(val1, val2), mvmd256<4>::fill2(val3, val4));
    3116 }
    3117 
    3118 //The total number of operations is 5
         // 8-bit-field specialization: combines mvmd256<8>::fill2(val1, val2) and
         // mvmd256<8>::fill2(val3, val4) through simd256<1>::ifh, with simd256<32>::himask()
         // as the selector. Presumably ifh is a bitwise select - confirm against its definition.
    3119 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
    3120 {
    3121         return simd256<1>::ifh(simd256<(32)>::himask(), mvmd256<8>::fill2(val1, val2), mvmd256<8>::fill2(val3, val4));
    3122 }
    3123 
    3124 //The total number of operations is 3
         // 16-bit-field specialization built from two 32-bit fills that are ORed together:
         // the first carries val1 and val3 shifted left by 16, the second carries the low
         // 16 bits of val2 and val4. Each 32-bit value passed on is therefore
         // (val1<<16)|(val2&65535) or (val3<<16)|(val4&65535).
    3125 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
    3126 {
    3127         return simd_or(mvmd256<(32)>::fill4((val1<<16), (val3<<16), (val1<<16), (val3<<16)), mvmd256<(32)>::fill4((val2&(65535)), (val4&(65535)), (val2&(65535)), (val4&(65535))));
    3128 }
    3129 
    3130 //The total number of operations is 1
         // 32-bit-field specialization: truncates each value to int32_t and passes the
         // sequence val1..val4 twice to _mm256_set_epi32. That intrinsic takes its arguments
         // from the highest 32-bit element down, so val4 lands in element 0 and val1 in element 7.
    3131 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
    3132 {
    3133         return (bitblock256_t)_mm256_set_epi32((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4));
    3134 }
    3135 
    3136 //The total number of operations is 41
         // 2-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*2 as the shift amount (sh scaled by the field width).
    3137 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::srli(bitblock256_t arg1)
    3138 {
    3139         return simd256<256>::srli<(sh*2)>(arg1);
    3140 }
    3141 
    3142 //The total number of operations is 41
         // 4-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*4 as the shift amount (sh scaled by the field width).
    3143 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::srli(bitblock256_t arg1)
    3144 {
    3145         return simd256<256>::srli<(sh*4)>(arg1);
    3146 }
    3147 
    3148 //The total number of operations is 41
         // 8-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*8 as the shift amount (sh scaled by the field width).
    3149 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::srli(bitblock256_t arg1)
    3150 {
    3151         return simd256<256>::srli<(sh*8)>(arg1);
    3152 }
    3153 
    3154 //The total number of operations is 41
         // 16-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*16 as the shift amount (sh scaled by the field width).
    3155 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::srli(bitblock256_t arg1)
    3156 {
    3157         return simd256<256>::srli<(sh*16)>(arg1);
    3158 }
    3159 
    3160 //The total number of operations is 41
         // 32-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*32 as the shift amount (sh scaled by the field width).
    3161 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::srli(bitblock256_t arg1)
    3162 {
    3163         return simd256<256>::srli<(sh*32)>(arg1);
    3164 }
    3165 
    3166 //The total number of operations is 41
         // 64-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*64 as the shift amount (sh scaled by the field width).
    3167 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1)
    3168 {
    3169         return simd256<256>::srli<(sh*64)>(arg1);
    3170 }
    3171 
    3172 //The total number of operations is 41
         // 128-bit-field specialization: forwards to the full-width simd256<256>::srli,
         // passing sh*128 as the shift amount (sh scaled by the field width).
    3173 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1)
    3174 {
    3175         return simd256<256>::srli<(sh*128)>(arg1);
    3176 }
    3177 
    3178 //The total number of operations is 41
         // 256-bit-field specialization: forwards to simd256<256>::srli with sh*256 as the
         // shift amount, so any non-zero sh requests a shift of at least the full block width.
    3179 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1)
    3180 {
    3181         return simd256<256>::srli<(sh*256)>(arg1);
    3182 }
    3183 
    3184 //The total number of operations is 1
         // 1-bit-field specialization: packs val1 (shifted left by 1) and the low bit of val2
         // into one value and hands it to mvmd256<2>::fill. val1 is not masked here.
    3185 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill2(uint64_t val1, uint64_t val2)
    3186 {
    3187         return mvmd256<(2)>::fill(((val1<<1)|(val2&(1))));
    3188 }
    3189 
    3190 //The total number of operations is 1
         // 2-bit-field specialization: packs val1 (shifted left by 2) and the low 2 bits of val2
         // into one value and hands it to mvmd256<4>::fill. val1 is not masked here.
    3191 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill2(uint64_t val1, uint64_t val2)
    3192 {
    3193         return mvmd256<(4)>::fill(((val1<<2)|(val2&(3))));
    3194 }
    3195 
    3196 //The total number of operations is 1
         // 4-bit-field specialization: packs val1 (shifted left by 4) and the low 4 bits of val2
         // into one value and hands it to mvmd256<8>::fill. val1 is not masked here.
    3197 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill2(uint64_t val1, uint64_t val2)
    3198 {
    3199         return mvmd256<(8)>::fill(((val1<<4)|(val2&(15))));
    3200 }
    3201 
    3202 //The total number of operations is 1
         // 8-bit-field specialization: packs val1 (shifted left by 8) and the low 8 bits of val2
         // into one value and hands it to mvmd256<16>::fill. val1 is not masked here.
    3203 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill2(uint64_t val1, uint64_t val2)
    3204 {
    3205         return mvmd256<(16)>::fill(((val1<<8)|(val2&(255))));
    3206 }
    3207 
    3208 //The total number of operations is 1
         // 16-bit-field specialization: packs val1 (shifted left by 16) and the low 16 bits of val2
         // into one value and hands it to mvmd256<32>::fill. val1 is not masked here.
    3209 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill2(uint64_t val1, uint64_t val2)
    3210 {
    3211         return mvmd256<(32)>::fill(((val1<<16)|(val2&(65535))));
    3212 }
    3213 
    3214 //The total number of operations is 5
         // 32-bit-field specialization: combines mvmd256<32>::fill(val1) and
         // mvmd256<32>::fill(val2) through simd256<1>::ifh, with simd256<64>::himask()
         // as the selector. Presumably ifh is a bitwise select - confirm against its definition.
    3215 template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill2(uint64_t val1, uint64_t val2)
    3216 {
    3217         return simd256<1>::ifh(simd256<(64)>::himask(), mvmd256<32>::fill(val1), mvmd256<32>::fill(val2));
    3218 }
    3219 
    3220 //The total number of operations is 82
         // 2-bit-field specialization: ORs mvmd256<2>::slli<sh>(arg1) with
         // mvmd256<2>::srli<128-sh>(arg2); 128 = 256/2. sh is unsigned, so a value
         // above 128 would wrap the srli template argument.
    3221 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    3222 {
    3223         return simd_or(mvmd256<2>::slli<sh>(arg1), mvmd256<2>::srli<((128)-sh)>(arg2));
    3224 }
    3225 
    3226 //The total number of operations is 82
         // 4-bit-field specialization: ORs mvmd256<4>::slli<sh>(arg1) with
         // mvmd256<4>::srli<64-sh>(arg2); 64 = 256/4. sh is unsigned, so a value
         // above 64 would wrap the srli template argument.
    3227 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    3228 {
    3229         return simd_or(mvmd256<4>::slli<sh>(arg1), mvmd256<4>::srli<((64)-sh)>(arg2));
    3230 }
    3231 
    3232 //The total number of operations is 82
         // 8-bit-field specialization: ORs mvmd256<8>::slli<sh>(arg1) with
         // mvmd256<8>::srli<32-sh>(arg2); 32 = 256/8. sh is unsigned, so a value
         // above 32 would wrap the srli template argument.
    3233 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    3234 {
    3235         return simd_or(mvmd256<8>::slli<sh>(arg1), mvmd256<8>::srli<((32)-sh)>(arg2));
    3236 }
    3237 
    3238 //The total number of operations is 82
         // 16-bit-field specialization: ORs mvmd256<16>::slli<sh>(arg1) with
         // mvmd256<16>::srli<16-sh>(arg2); 16 = 256/16. sh is unsigned, so a value
         // above 16 would wrap the srli template argument.
    3239 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    3240 {
    3241         return simd_or(mvmd256<16>::slli<sh>(arg1), mvmd256<16>::srli<((16)-sh)>(arg2));
    3242 }
    3243 
    3244 //The total number of operations is 82
         // 32-bit-field specialization: ORs mvmd256<32>::slli<sh>(arg1) with
         // mvmd256<32>::srli<8-sh>(arg2); 8 = 256/32. sh is unsigned, so a value
         // above 8 would wrap the srli template argument.
    3245 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    3246 {
    3247         return simd_or(mvmd256<32>::slli<sh>(arg1), mvmd256<32>::srli<((8)-sh)>(arg2));
    3248 }
    3249 
    3250 //The total number of operations is 82
         // 64-bit-field specialization: ORs mvmd256<64>::slli<sh>(arg1) with
         // mvmd256<64>::srli<4-sh>(arg2); 4 = 256/64. sh is unsigned, so a value
         // above 4 would wrap the srli template argument.
    3251 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    3252 {
    3253         return simd_or(mvmd256<64>::slli<sh>(arg1), mvmd256<64>::srli<((4)-sh)>(arg2));
    3254 }
    3255 
    3256 //The total number of operations is 82
         // 128-bit-field specialization: ORs mvmd256<128>::slli<sh>(arg1) with
         // mvmd256<128>::srli<2-sh>(arg2); 2 = 256/128. sh is unsigned, so a value
         // above 2 would wrap the srli template argument.
    3257 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    3258 {
    3259         return simd_or(mvmd256<128>::slli<sh>(arg1), mvmd256<128>::srli<((2)-sh)>(arg2));
    3260 }
    3261 
    3262 //The total number of operations is 82
         // 256-bit-field specialization: ORs mvmd256<256>::slli<sh>(arg1) with
         // mvmd256<256>::srli<1-sh>(arg2); 1 = 256/256. sh is unsigned, so a value
         // above 1 would wrap the srli template argument.
    3263 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2)
    3264 {
    3265         return simd_or(mvmd256<256>::slli<sh>(arg1), mvmd256<256>::srli<((1)-sh)>(arg2));
    3266 }
    3267 
    32683270//The total number of operations is 13
    32693271template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     
    33083310}
    33093311
     3312//The total number of operations is 41
         // Whole-block immediate right shift: a thin wrapper that forwards arg1 and the
         // compile-time shift sh unchanged to simd256<256>::srli.
     3313template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::srli(bitblock256_t arg1)
     3314{
     3315        return simd256<256>::srli<sh>(arg1);
     3316}
     3317
    33103318//The total number of operations is 1
    33113319IDISA_ALWAYS_INLINE void bitblock256::store_aligned(bitblock256_t* arg1, bitblock256_t arg2)
     
    33263334}
    33273335
     3336//The total number of operations is 40
         // Whole-block immediate left shift: a thin wrapper that forwards arg1 and the
         // compile-time shift sh unchanged to simd256<256>::slli.
     3337template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::slli(bitblock256_t arg1)
     3338{
     3339        return simd256<256>::slli<sh>(arg1);
     3340}
     3341
    33283342//The total number of operations is 1
    33293343IDISA_ALWAYS_INLINE bool bitblock256::any(bitblock256_t arg1)
  • trunk/lib/idisa_cpp/idisa_sse2.cpp

    r1573 r1580  
    2828        static IDISA_ALWAYS_INLINE bitblock128_t add_hl(bitblock128_t arg1);
    2929        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t shift_mask);
     30        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
    3031        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    3132        template <uint64_t val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    3233        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
    33         static IDISA_ALWAYS_INLINE bitblock128_t lomask();
    3434        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    3535        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
     
    8888{
    8989public:
     90        static IDISA_ALWAYS_INLINE bitblock128_t sll(bitblock128_t arg1, bitblock128_t arg2);
    9091        static IDISA_ALWAYS_INLINE bitblock128_t load_unaligned(bitblock128_t* arg1);
     92        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     93        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t arg2);
    9194        static IDISA_ALWAYS_INLINE void store_aligned(bitblock128_t* arg1, bitblock128_t arg2);
    9295        static IDISA_ALWAYS_INLINE bool all(bitblock128_t arg1);
    9396        static IDISA_ALWAYS_INLINE bool any(bitblock128_t arg1);
    9497        static IDISA_ALWAYS_INLINE uint64_t popcount(bitblock128_t arg1);
     98        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    9599        static IDISA_ALWAYS_INLINE bitblock128_t load_aligned(bitblock128_t* arg1);
    96100        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock128_t* arg1, bitblock128_t arg2);
     
    230234template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    231235template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    232 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    233 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    234 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    235 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    236 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    237 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    238 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    239236template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
    240237template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
     
    253250template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    254251template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
     252template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     253template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     254template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     255template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     256template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     257template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     258template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    255259template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    256260template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    261265template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    262266template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    263 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    264 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    265 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    266 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    267 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    268 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    269 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    270 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     267template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     268template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     269template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     270template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
    271274template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    272275template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     
    299302template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    300303template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    301 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    302 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    303 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    304 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    305 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    306 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    307 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     305template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     306template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     307template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     308template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     309template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     310template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
    308312template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    309313template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    440444template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    441445template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    442 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
    443 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
    444 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
    445 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
    446 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
    447 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
    448 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
    449446template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    450447template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
     
    474471template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    475472template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     473template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
     474template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
     475template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
     476template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
     477template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
     478template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
     479template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
    476480template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    477481template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
     
    13901394
    13911395//The total number of operations is 0
         // Low-half mask for 2-bit fields: returns simd128<2>::constant<1>(); 1 has only
         // the low bit of a 2-bit value set.
    1392 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
    1393 {
    1394         return simd128<2>::constant<(1)>();
    1395 }
    1396 
    1397 //The total number of operations is 0
         // Low-half mask for 4-bit fields: returns simd128<4>::constant<3>(); 3 has the
         // low 2 bits of a 4-bit value set.
    1398 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
    1399 {
    1400         return simd128<4>::constant<(3)>();
    1401 }
    1402 
    1403 //The total number of operations is 0
         // Low-half mask for 8-bit fields: returns simd128<8>::constant<15>(); 15 has the
         // low 4 bits of an 8-bit value set.
    1404 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
    1405 {
    1406         return simd128<8>::constant<(15)>();
    1407 }
    1408 
    1409 //The total number of operations is 0
         // Low-half mask for 16-bit fields: returns simd128<16>::constant<255>(); 255 has the
         // low 8 bits of a 16-bit value set.
    1410 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
    1411 {
    1412         return simd128<16>::constant<(255)>();
    1413 }
    1414 
    1415 //The total number of operations is 0
         // Low-half mask for 32-bit fields: returns simd128<32>::constant<65535>(); 65535 has
         // the low 16 bits of a 32-bit value set.
    1416 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
    1417 {
    1418         return simd128<32>::constant<(65535)>();
    1419 }
    1420 
    1421 //The total number of operations is 0
         // Low-half mask for 64-bit fields, built directly with _mm_set_epi32. Its arguments
         // run from 32-bit element 3 down to element 0, so elements 0 and 2 (the low 32 bits
         // of each 64-bit half) are all ones and elements 1 and 3 are zero.
    1422 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
    1423 {
    1424         return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
    1425 }
    1426 
    1427 //The total number of operations is 0
         // Low-half mask for the single 128-bit field, built directly with _mm_set_epi32. Its
         // arguments run from 32-bit element 3 down to element 0, so elements 0 and 1 (the low
         // 64 bits) are all ones and elements 2 and 3 are zero.
    1428 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
    1429 {
    1430         return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
    1431 }
    1432 
    1433 //The total number of operations is 0
    14341396template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
    14351397{
     
    15321494}
    15331495
     1496//The total number of operations is 0
         // Low-half mask for 2-bit fields: returns simd128<2>::constant<1>(); 1 has only
         // the low bit of a 2-bit value set.
     1497template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
     1498{
     1499        return simd128<2>::constant<(1)>();
     1500}
     1501
     1502//The total number of operations is 0
         // Low-half mask for 4-bit fields: returns simd128<4>::constant<3>(); 3 has the
         // low 2 bits of a 4-bit value set.
     1503template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
     1504{
     1505        return simd128<4>::constant<(3)>();
     1506}
     1507
     1508//The total number of operations is 0
         // Low-half mask for 8-bit fields: returns simd128<8>::constant<15>(); 15 has the
         // low 4 bits of an 8-bit value set.
     1509template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
     1510{
     1511        return simd128<8>::constant<(15)>();
     1512}
     1513
     1514//The total number of operations is 0
         // Low-half mask for 16-bit fields: returns simd128<16>::constant<255>(); 255 has the
         // low 8 bits of a 16-bit value set.
     1515template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
     1516{
     1517        return simd128<16>::constant<(255)>();
     1518}
     1519
     1520//The total number of operations is 0
         // Low-half mask for 32-bit fields: returns simd128<32>::constant<65535>(); 65535 has
         // the low 16 bits of a 32-bit value set.
     1521template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
     1522{
     1523        return simd128<32>::constant<(65535)>();
     1524}
     1525
     1526//The total number of operations is 0
         // Low-half mask for 64-bit fields, built directly with _mm_set_epi32. Its arguments
         // run from 32-bit element 3 down to element 0, so elements 0 and 2 (the low 32 bits
         // of each 64-bit half) are all ones and elements 1 and 3 are zero.
     1527template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
     1528{
     1529        return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
     1530}
     1531
     1532//The total number of operations is 0
         // Low-half mask for the single 128-bit field, built directly with _mm_set_epi32. Its
         // arguments run from 32-bit element 3 down to element 0, so elements 0 and 1 (the low
         // 64 bits) are all ones and elements 2 and 3 are zero.
     1533template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
     1534{
     1535        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
     1536}
     1537
    15341538//The total number of operations is 1
    15351539template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2)
     
    15881592}
    15891593
     1594//The total number of operations is 9
         // Absolute value for 2-bit fields, done with bit logic only. Where simd128<2>::himask()
         // selects, the result is arg1 AND (NOT arg1 shifted left one bit across the whole
         // 128-bit value); elsewhere arg1 is passed through.
         // NOTE(review): presumably himask selects the high bit of each field, so the high bit
         // survives only when the low bit is clear - confirm against himask/ifh.
     1595template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
     1596{
     1597        return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
     1598}
     1599
     1600//The total number of operations is 19
         // Absolute value for 4-bit fields. gtMask is the result of comparing arg1 > 0 per field;
         // ifh then picks arg1 under gtMask and simd128<4>::sub(gtMask, arg1) otherwise.
         // NOTE(review): where gtMask is clear it is presumably zero, making that branch
         // 0 - arg1 - confirm against gt/ifh.
     1601template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
     1602{
     1603        bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
     1604        return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
     1605}
     1606
     1607//The total number of operations is 5
         // Absolute value for 8-bit fields. gtMask is the result of comparing arg1 > 0 per field;
         // ifh then picks arg1 under gtMask and simd128<8>::sub(gtMask, arg1) otherwise.
         // NOTE(review): where gtMask is clear it is presumably zero, making that branch
         // 0 - arg1 - confirm against gt/ifh.
     1608template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
     1609{
     1610        bitblock128_t gtMask = simd128<8>::gt(arg1, simd128<8>::constant<0>());
     1611        return simd128<1>::ifh(gtMask, arg1, simd128<8>::sub(gtMask, arg1));
     1612}
     1613
     1614//The total number of operations is 5
         // Absolute value for 16-bit fields. gtMask is the result of comparing arg1 > 0 per field;
         // ifh then picks arg1 under gtMask and simd128<16>::sub(gtMask, arg1) otherwise.
         // NOTE(review): where gtMask is clear it is presumably zero, making that branch
         // 0 - arg1 - confirm against gt/ifh.
     1615template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
     1616{
     1617        bitblock128_t gtMask = simd128<16>::gt(arg1, simd128<16>::constant<0>());
     1618        return simd128<1>::ifh(gtMask, arg1, simd128<16>::sub(gtMask, arg1));
     1619}
     1620
     1621//The total number of operations is 5
         // Absolute value for 32-bit fields. gtMask is the result of comparing arg1 > 0 per field;
         // ifh then picks arg1 under gtMask and simd128<32>::sub(gtMask, arg1) otherwise.
         // NOTE(review): where gtMask is clear it is presumably zero, making that branch
         // 0 - arg1 - confirm against gt/ifh.
     1622template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
     1623{
     1624        bitblock128_t gtMask = simd128<32>::gt(arg1, simd128<32>::constant<0>());
     1625        return simd128<1>::ifh(gtMask, arg1, simd128<32>::sub(gtMask, arg1));
     1626}
     1627
     1628//The total number of operations is 17
         // Absolute value for 64-bit fields, built on the 32-bit abs instead of a 64-bit compare.
         // eqMask compares arg1 (per 64-bit field) with a copy whose simd128<64>::himask()-selected
         // part has been replaced by simd128<32>::abs(arg1). ifh then picks arg1 under eqMask and
         // simd128<64>::sub(eqMask, arg1) otherwise.
         // NOTE(review): presumably the fields left unchanged by the 32-bit abs are the
         // non-negative ones, and a clear eqMask is zero so the other branch is 0 - arg1 - confirm.
     1629template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
     1630{
     1631        bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
     1632        return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
     1633}
     1634
     1635//The total number of operations is 49
     1636template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
     1637{
     1638        bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
     1639        return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1640}
     1641
     1642//The total number of operations is 2. Bitwise XNOR: a result bit is set exactly where arg1 and arg2 hold the same bit.
     1643template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1644{
     1645        return simd_not(simd_xor(arg1, arg2));
     1646}
     1647
     1648//The total number of operations is 8. Takes the 1-bit equality, ANDs it with itself shifted right by 1 within each 2-bit field, then ORs that with the same value shifted left by 1.
     1649template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1650{
     1651        bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
     1652        bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
     1653        bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
     1654        return simd_or(loMask, hiMask);
     1655}
     1656
     1657//The total number of operations is 9. Runs the 8-bit compare twice, once on the operands masked with simd128<8>::himask() and once masked with simd128<8>::lomask(), masks each result the same way and ORs them.
     1658template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1659{
     1660        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
     1661}
     1662
     1663//The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_cmpeq_epi8.
     1664template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1665{
     1666        return _mm_cmpeq_epi8(arg1, arg2);
     1667}
     1668
     1669//The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_cmpeq_epi16.
     1670template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1671{
     1672        return _mm_cmpeq_epi16(arg1, arg2);
     1673}
     1674
     1675//The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_cmpeq_epi32.
     1676template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1677{
     1678        return _mm_cmpeq_epi32(arg1, arg2);
     1679}
     1680
     1681//The total number of operations is 5. Takes the 32-bit equality, ANDs it with itself shifted right by 32 within each 64-bit field, then ORs that with the same value shifted left by 32.
     1682template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1683{
     1684        bitblock128_t tmpAns = simd128<(32)>::eq(arg1, arg2);
     1685        bitblock128_t loMask = simd_and(tmpAns, simd128<64>::srli<(32)>(tmpAns));
     1686        bitblock128_t hiMask = simd128<64>::slli<(32)>(loMask);
     1687        return simd_or(loMask, hiMask);
     1688}
     1689
     1690//The total number of operations is 15. Takes the 64-bit equality, ANDs it with itself shifted right by 64 within the 128-bit block, then ORs that with the same value shifted left by 64.
     1691template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1692{
     1693        bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
     1694        bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
     1695        bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
     1696        return simd_or(loMask, hiMask);
     1697}
     1698
     1699//The total number of operations is 4. Returns arg1 unchanged when sh == 0; for any other sh it ORs the high bit of each 2-bit field (arg1 & himask) with the field shifted right logically by 1.
     1700template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
     1701{
     1702        return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
     1703}
     1704
     1705//The total number of operations is 10
     1706template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
     1707{
     1708        bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1709        return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1710}
     1711
     1712//The total number of operations is 5
     1713template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
     1714{
     1715        bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1716        return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1717}
     1718
     1719//The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_srai_epi16, with the shift count cast to int32_t.
     1720template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
     1721{
     1722        return _mm_srai_epi16(arg1, (int32_t)(sh));
     1723}
     1724
     1725//The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_srai_epi32, with the shift count cast to int32_t.
     1726template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
     1727{
     1728        return _mm_srai_epi32(arg1, (int32_t)(sh));
     1729}
     1730
     1731//The total number of operations is 5
     1732template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
     1733{
     1734        bitblock128_t tmp = simd128<64>::srli<((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh))>(arg1);
     1735        return simd_or(tmp, simd128<64>::sub(simd128<64>::constant<0>(), simd_and(simd128<64>::slli<((64-((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh)))-1)>(simd128<64>::constant<1>()), tmp)));
     1736}
     1737
     1738//The total number of operations is 21
     1739template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
     1740{
     1741        bitblock128_t tmp = simd128<128>::srli<((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh))>(arg1);
     1742        return simd_or(tmp, simd128<128>::sub(simd128<128>::constant<0>(), simd_and(simd128<128>::slli<((128-((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh)))-1)>(simd128<128>::constant<1>()), tmp)));
     1743}
     1744
     1745//The total number of operations is 0. Returns simd128<2>::constant<2>(); 2 is binary 10, the upper bit of a 2-bit field.
     1746template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
     1747{
     1748        return simd128<2>::constant<(2)>();
     1749}
     1750
     1751//The total number of operations is 0. Returns simd128<4>::constant<12>(); 12 is binary 1100, the upper two bits of a 4-bit field.
     1752template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask()
     1753{
     1754        return simd128<4>::constant<(12)>();
     1755}
     1756
     1757//The total number of operations is 0. Returns simd128<8>::constant<240>(); 240 is 0xF0, the upper four bits of an 8-bit field.
     1758template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::himask()
     1759{
     1760        return simd128<8>::constant<(240)>();
     1761}
     1762
     1763//The total number of operations is 0. Returns simd128<16>::constant<65280>(); 65280 is 0xFF00, the upper eight bits of a 16-bit field.
     1764template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::himask()
     1765{
     1766        return simd128<16>::constant<(65280)>();
     1767}
     1768
     1769//The total number of operations is 0
     1770template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
     1771{
     1772        return simd128<32>::constant<-65536>();
     1773}
     1774
     1775//The total number of operations is 0. Built with _mm_set_epi32(-1, 0, -1, 0): in each 64-bit half the upper 32 bits are all ones and the lower 32 bits are zero.
     1776template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
     1777{
     1778        return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
     1779}
     1780
     1781//The total number of operations is 0. Built with _mm_set_epi32(-1, -1, 0, 0): the upper 64 bits are all ones and the lower 64 bits are zero.
     1782template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
     1783{
     1784        return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
     1785}
     1786
     1787//The total number of operations is 1. 1-bit addition is the XOR of the operands.
     1788template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::add(bitblock128_t arg1, bitblock128_t arg2)
     1789{
     1790        return simd_xor(arg1, arg2);
     1791}
     1792
     1793//The total number of operations is 10. tmp = arg1 ^ arg2 is the carry-less sum; under himask (via simd128<1>::ifh) the carry arg1 & arg2, shifted left by 1, is XORed in, elsewhere tmp is passed through.
     1794template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add(bitblock128_t arg1, bitblock128_t arg2)
     1795{
     1796        bitblock128_t tmp = simd_xor(arg1, arg2);
     1797        return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(arg1, arg2))), tmp);
     1798}
     1799
     1800//The total number of operations is 6. Uses two 8-bit adds: one of arg1 with arg2 masked by simd128<8>::himask(), one of arg1 with arg2 unmasked; simd128<1>::ifh combines them under simd128<8>::himask().
     1801template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add(bitblock128_t arg1, bitblock128_t arg2)
     1802{
     1803        return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::add(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::add(arg1, arg2));
     1804}
     1805
     1806//The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_add_epi8.
     1807template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add(bitblock128_t arg1, bitblock128_t arg2)
     1808{
     1809        return _mm_add_epi8(arg1, arg2);
     1810}
     1811
     1812//The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_add_epi16.
     1813template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add(bitblock128_t arg1, bitblock128_t arg2)
     1814{
     1815        return _mm_add_epi16(arg1, arg2);
     1816}
     1817
     1818//The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_add_epi32.
     1819template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add(bitblock128_t arg1, bitblock128_t arg2)
     1820{
     1821        return _mm_add_epi32(arg1, arg2);
     1822}
     1823
     1824//The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_add_epi64.
     1825template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2)
     1826{
     1827        return _mm_add_epi64(arg1, arg2);
     1828}
     1829
     1830//The total number of operations is 11. Adds the 64-bit halves, forms a carry mask (arg1 & arg2) | ((arg1 ^ arg2) & ~partial), moves bit 63 of each half to bit 0, shifts the block left by 64 so the low half's carry lands in the high half, and adds that to the partial sum.
     1831template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2)
     1832{
     1833        bitblock128_t partial = simd128<(64)>::add(arg1, arg2);
     1834        bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
     1835        bitblock128_t carry = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(carryMask));
     1836        return simd128<(64)>::add(partial, carry);
     1837}
     1838
    15901839//The total number of operations is 1
    15911840template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
     
    16421891        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    16431892        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1644 }
    1645 
    1646 //The total number of operations is 2. Bitwise XNOR: a result bit is set exactly where arg1 and arg2 hold the same bit.
    1647 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1648 {
    1649         return simd_not(simd_xor(arg1, arg2));
    1650 }
    1651 
    1652 //The total number of operations is 8. Takes the 1-bit equality, ANDs it with itself shifted right by 1 within each 2-bit field, then ORs that with the same value shifted left by 1.
    1653 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1654 {
    1655         bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
    1656         bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
    1657         bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
    1658         return simd_or(loMask, hiMask);
    1659 }
    1660 
    1661 //The total number of operations is 9. Runs the 8-bit compare twice, once on the operands masked with simd128<8>::himask() and once masked with simd128<8>::lomask(), masks each result the same way and ORs them.
    1662 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1663 {
    1664         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
    1665 }
    1666 
    1667 //The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_cmpeq_epi8.
    1668 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1669 {
    1670         return _mm_cmpeq_epi8(arg1, arg2);
    1671 }
    1672 
    1673 //The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_cmpeq_epi16.
    1674 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1675 {
    1676         return _mm_cmpeq_epi16(arg1, arg2);
    1677 }
    1678 
    1679 //The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_cmpeq_epi32.
    1680 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1681 {
    1682         return _mm_cmpeq_epi32(arg1, arg2);
    1683 }
    1684 
    1685 //The total number of operations is 5. Takes the 32-bit equality, ANDs it with itself shifted right by 32 within each 64-bit field, then ORs that with the same value shifted left by 32.
    1686 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1687 {
    1688         bitblock128_t tmpAns = simd128<(32)>::eq(arg1, arg2);
    1689         bitblock128_t loMask = simd_and(tmpAns, simd128<64>::srli<(32)>(tmpAns));
    1690         bitblock128_t hiMask = simd128<64>::slli<(32)>(loMask);
    1691         return simd_or(loMask, hiMask);
    1692 }
    1693 
    1694 //The total number of operations is 15. Takes the 64-bit equality, ANDs it with itself shifted right by 64 within the 128-bit block, then ORs that with the same value shifted left by 64.
    1695 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1696 {
    1697         bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
    1698         bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
    1699         bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
    1700         return simd_or(loMask, hiMask);
    1701 }
    1702 
    1703 //The total number of operations is 4. Returns arg1 unchanged when sh == 0; for any other sh it ORs the high bit of each 2-bit field (arg1 & himask) with the field shifted right logically by 1.
    1704 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1705 {
    1706         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1707 }
    1708 
    1709 //The total number of operations is 10. Arithmetic right shift of each 4-bit field: shifts logically by min(sh, 3), isolates the shifted sign bit with a constant mask and subtracts it from zero to fill the vacated high bits. sh is unsigned, so only the upper clamp is needed.
    1710 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1711 {
    1712         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : sh)>(arg1);
    1713         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : sh))-1))>(), tmp)));
    1714 }
    1715 
    1716 //The total number of operations is 5. Arithmetic right shift of each 8-bit field: shifts logically by min(sh, 7), isolates the shifted sign bit with a constant mask and subtracts it from zero to fill the vacated high bits. sh is unsigned, so only the upper clamp is needed.
    1717 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1718 {
    1719         bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : sh)>(arg1);
    1720         return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : sh))-1))>(), tmp)));
    1721 }
    1722 
    1723 //The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_srai_epi16, with the shift count cast to int32_t.
    1724 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1725 {
    1726         return _mm_srai_epi16(arg1, (int32_t)(sh));
    1727 }
    1728 
    1729 //The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_srai_epi32, with the shift count cast to int32_t.
    1730 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1731 {
    1732         return _mm_srai_epi32(arg1, (int32_t)(sh));
    1733 }
    1734 
    1735 //The total number of operations is 5. Arithmetic right shift of each 64-bit field: shifts logically by min(sh, 63), isolates the shifted sign bit with a mask built by shifting constant 1 left, and subtracts it from zero to fill the vacated high bits. sh is unsigned, so only the upper clamp is needed.
    1736 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1737 {
    1738         bitblock128_t tmp = simd128<64>::srli<((sh >= 64) ? (63) : sh)>(arg1);
    1739         return simd_or(tmp, simd128<64>::sub(simd128<64>::constant<0>(), simd_and(simd128<64>::slli<((64-((sh >= 64) ? (63) : sh))-1)>(simd128<64>::constant<1>()), tmp)));
    1740 }
    1741 
    1742 //The total number of operations is 21. Arithmetic right shift of the 128-bit field: shifts logically by min(sh, 127), isolates the shifted sign bit with a mask built by shifting constant 1 left, and subtracts it from zero to fill the vacated high bits. sh is unsigned, so only the upper clamp is needed.
    1743 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1744 {
    1745         bitblock128_t tmp = simd128<128>::srli<((sh >= 128) ? (127) : sh)>(arg1);
    1746         return simd_or(tmp, simd128<128>::sub(simd128<128>::constant<0>(), simd_and(simd128<128>::slli<((128-((sh >= 128) ? (127) : sh))-1)>(simd128<128>::constant<1>()), tmp)));
    1747 }
    1748 
    1749 //The total number of operations is 0. Returns simd128<2>::constant<2>(); 2 is binary 10, the upper bit of a 2-bit field.
    1750 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
    1751 {
    1752         return simd128<2>::constant<(2)>();
    1753 }
    1754 
    1755 //The total number of operations is 0. Returns simd128<4>::constant<12>(); 12 is binary 1100, the upper two bits of a 4-bit field.
    1756 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask()
    1757 {
    1758         return simd128<4>::constant<(12)>();
    1759 }
    1760 
    1761 //The total number of operations is 0. Returns simd128<8>::constant<240>(); 240 is 0xF0, the upper four bits of an 8-bit field.
    1762 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::himask()
    1763 {
    1764         return simd128<8>::constant<(240)>();
    1765 }
    1766 
    1767 //The total number of operations is 0. Returns simd128<16>::constant<65280>(); 65280 is 0xFF00, the upper eight bits of a 16-bit field.
    1768 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::himask()
    1769 {
    1770         return simd128<16>::constant<(65280)>();
    1771 }
    1772 
    1773 //The total number of operations is 0. Each 32-bit field is 0xFFFF0000 (-65536 as int32_t): upper sixteen bits set. Built directly with _mm_set1_epi32 because -65536 is not a valid argument for a uint64_t template parameter (narrowing) from C++11 on.
    1774 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
    1775 {
    1776         return _mm_set1_epi32((int32_t)(-65536));
    1777 }
    1778 
    1779 //The total number of operations is 0. Built with _mm_set_epi32(-1, 0, -1, 0): in each 64-bit half the upper 32 bits are all ones and the lower 32 bits are zero.
    1780 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
    1781 {
    1782         return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
    1783 }
    1784 
    1785 //The total number of operations is 0. Built with _mm_set_epi32(-1, -1, 0, 0): the upper 64 bits are all ones and the lower 64 bits are zero.
    1786 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
    1787 {
    1788         return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
    1789 }
    1790 
    1791 //The total number of operations is 1. 1-bit addition is the XOR of the operands.
    1792 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::add(bitblock128_t arg1, bitblock128_t arg2)
    1793 {
    1794         return simd_xor(arg1, arg2);
    1795 }
    1796 
    1797 //The total number of operations is 10. tmp = arg1 ^ arg2 is the carry-less sum; under himask (via simd128<1>::ifh) the carry arg1 & arg2, shifted left by 1, is XORed in, elsewhere tmp is passed through.
    1798 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add(bitblock128_t arg1, bitblock128_t arg2)
    1799 {
    1800         bitblock128_t tmp = simd_xor(arg1, arg2);
    1801         return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(arg1, arg2))), tmp);
    1802 }
    1803 
    1804 //The total number of operations is 6. Uses two 8-bit adds: one of arg1 with arg2 masked by simd128<8>::himask(), one of arg1 with arg2 unmasked; simd128<1>::ifh combines them under simd128<8>::himask().
    1805 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add(bitblock128_t arg1, bitblock128_t arg2)
    1806 {
    1807         return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::add(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::add(arg1, arg2));
    1808 }
    1809 
    1810 //The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_add_epi8.
    1811 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add(bitblock128_t arg1, bitblock128_t arg2)
    1812 {
    1813         return _mm_add_epi8(arg1, arg2);
    1814 }
    1815 
    1816 //The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_add_epi16.
    1817 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add(bitblock128_t arg1, bitblock128_t arg2)
    1818 {
    1819         return _mm_add_epi16(arg1, arg2);
    1820 }
    1821 
    1822 //The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_add_epi32.
    1823 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add(bitblock128_t arg1, bitblock128_t arg2)
    1824 {
    1825         return _mm_add_epi32(arg1, arg2);
    1826 }
    1827 
    1828 //The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_add_epi64.
    1829 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2)
    1830 {
    1831         return _mm_add_epi64(arg1, arg2);
    1832 }
    1833 
    1834 //The total number of operations is 11. Adds the 64-bit halves, forms a carry mask (arg1 & arg2) | ((arg1 ^ arg2) & ~partial), moves bit 63 of each half to bit 0, shifts the block left by 64 so the low half's carry lands in the high half, and adds that to the partial sum.
    1835 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2)
    1836 {
    1837         bitblock128_t partial = simd128<(64)>::add(arg1, arg2);
    1838         bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
    1839         bitblock128_t carry = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(carryMask));
    1840         return simd128<(64)>::add(partial, carry);
    1841 }
    1842 
    1843 //The total number of operations is 9. abs on 2-bit fields: passes himask, arg1 & (~arg1 shifted left by one bit) and arg1 to simd128<1>::ifh (presumably a bitwise select on the mask).
    1844 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
    1845 {
    1846         return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
    1847 }
    1848 
    1849 //The total number of operations is 19. abs on 4-bit fields: builds gtMask = gt(arg1, 0), then passes gtMask, arg1 and (gtMask - arg1) to simd128<1>::ifh (presumably a bitwise select on the mask).
    1850 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
    1851 {
    1852         bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
    1853         return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
    1854 }
    1855 
    1856 //The total number of operations is 5. abs on 8-bit fields: builds gtMask = gt(arg1, 0), then passes gtMask, arg1 and (gtMask - arg1) to simd128<1>::ifh (presumably a bitwise select on the mask).
    1857 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
    1858 {
    1859         bitblock128_t gtMask = simd128<8>::gt(arg1, simd128<8>::constant<0>());
    1860         return simd128<1>::ifh(gtMask, arg1, simd128<8>::sub(gtMask, arg1));
    1861 }
    1862 
    1863 //The total number of operations is 5. abs on 16-bit fields: builds gtMask = gt(arg1, 0), then passes gtMask, arg1 and (gtMask - arg1) to simd128<1>::ifh (presumably a bitwise select on the mask).
    1864 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
    1865 {
    1866         bitblock128_t gtMask = simd128<16>::gt(arg1, simd128<16>::constant<0>());
    1867         return simd128<1>::ifh(gtMask, arg1, simd128<16>::sub(gtMask, arg1));
    1868 }
    1869 
    1870 //The total number of operations is 5. abs on 32-bit fields: builds gtMask = gt(arg1, 0), then passes gtMask, arg1 and (gtMask - arg1) to simd128<1>::ifh (presumably a bitwise select on the mask).
    1871 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
    1872 {
    1873         bitblock128_t gtMask = simd128<32>::gt(arg1, simd128<32>::constant<0>());
    1874         return simd128<1>::ifh(gtMask, arg1, simd128<32>::sub(gtMask, arg1));
    1875 }
    1876 
    1877 //abs on 64-bit fields via a sign mask: signMask is the field's sign bit smeared over all 64 bits (srai by 63), and (arg1 ^ signMask) - signMask negates exactly the negative fields. The earlier eq-based form left a field un-negated when its upper 32 bits were 0x80000000. The generated operation count no longer applies.
    1878 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
    1879 {
    1880         bitblock128_t signMask = simd128<64>::srai<63>(arg1);
    1881         return simd128<64>::sub(simd_xor(arg1, signMask), signMask);
    1882 }
    1883 
    1884 //abs on the 128-bit field via a sign mask: signMask is the sign bit smeared over all 128 bits (srai by 127), and (arg1 ^ signMask) - signMask negates the value exactly when it is negative. The earlier eq-based form left the value un-negated when its upper 64 bits were 0x8000000000000000. The generated operation count no longer applies.
    1885 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
    1886 {
    1887         bitblock128_t signMask = simd128<128>::srai<127>(arg1);
    1888         return simd128<128>::sub(simd_xor(arg1, signMask), signMask);
    18891893}
    18901894
     
    27412745}
    27422746
    2743 //The total number of operations is 4. Shifts the whole block left by sh fields of 2 bits, i.e. by sh*2 bits, by forwarding to simd128<128>::slli.
    2744 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
    2745 {
    2746         return simd128<128>::slli<(sh*2)>(arg1);
    2747 }
    2748 
    2749 //The total number of operations is 4. Shifts the whole block left by sh fields of 4 bits, i.e. by sh*4 bits, by forwarding to simd128<128>::slli.
    2750 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
    2751 {
    2752         return simd128<128>::slli<(sh*4)>(arg1);
    2753 }
    2754 
    2755 //The total number of operations is 4. Shifts the whole block left by sh fields of 8 bits, i.e. by sh*8 bits, by forwarding to simd128<128>::slli.
    2756 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
    2757 {
    2758         return simd128<128>::slli<(sh*8)>(arg1);
    2759 }
    2760 
    2761 //The total number of operations is 4. Shifts the whole block left by sh fields of 16 bits, i.e. by sh*16 bits, by forwarding to simd128<128>::slli.
    2762 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
    2763 {
    2764         return simd128<128>::slli<(sh*16)>(arg1);
    2765 }
    2766 
    2767 //The total number of operations is 4. Shifts the whole block left by sh fields of 32 bits, i.e. by sh*32 bits, by forwarding to simd128<128>::slli.
    2768 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
    2769 {
    2770         return simd128<128>::slli<(sh*32)>(arg1);
    2771 }
    2772 
    2773 //The total number of operations is 4. Shifts the whole block left by sh fields of 64 bits, i.e. by sh*64 bits, by forwarding to simd128<128>::slli.
    2774 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
    2775 {
    2776         return simd128<128>::slli<(sh*64)>(arg1);
    2777 }
    2778 
    2779 //The total number of operations is 4. Shifts the whole block left by sh fields of 128 bits, i.e. by sh*128 bits, by forwarding to simd128<128>::slli.
    2780 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
    2781 {
    2782         return simd128<128>::slli<(sh*128)>(arg1);
    2783 }
    2784 
    27852747//The total number of operations is 5
    27862748template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     
    29452907}
    29462908
     2909//The total number of operations is 4. Shifts the whole block left by sh fields of 2 bits, i.e. by sh*2 bits, by forwarding to simd128<128>::slli.
     2910template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
     2911{
     2912        return simd128<128>::slli<(sh*2)>(arg1);
     2913}
     2914
     2915//The total number of operations is 4. Shifts the whole block left by sh fields of 4 bits, i.e. by sh*4 bits, by forwarding to simd128<128>::slli.
     2916template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
     2917{
     2918        return simd128<128>::slli<(sh*4)>(arg1);
     2919}
     2920
     2921//The total number of operations is 4. Shifts the whole block left by sh fields of 8 bits, i.e. by sh*8 bits, by forwarding to simd128<128>::slli.
     2922template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
     2923{
     2924        return simd128<128>::slli<(sh*8)>(arg1);
     2925}
     2926
     2927//The total number of operations is 4. Shifts the whole block left by sh fields of 16 bits, i.e. by sh*16 bits, by forwarding to simd128<128>::slli.
     2928template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
     2929{
     2930        return simd128<128>::slli<(sh*16)>(arg1);
     2931}
     2932
     2933//The total number of operations is 4. Shifts the whole block left by sh fields of 32 bits, i.e. by sh*32 bits, by forwarding to simd128<128>::slli.
     2934template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
     2935{
     2936        return simd128<128>::slli<(sh*32)>(arg1);
     2937}
     2938
     2939//The total number of operations is 4. Shifts the whole block left by sh fields of 64 bits, i.e. by sh*64 bits, by forwarding to simd128<128>::slli.
     2940template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
     2941{
     2942        return simd128<128>::slli<(sh*64)>(arg1);
     2943}
     2944
     2945//The total number of operations is 4. Shifts the whole block left by sh fields of 128 bits, i.e. by sh*128 bits, by forwarding to simd128<128>::slli.
     2946template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
     2947{
     2948        return simd128<128>::slli<(sh*128)>(arg1);
     2949}
     2950
    29472951//The total number of operations is 13
    29482952template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     
    29752979}
    29762980
     2981//The total number of operations is 11. Whole-block left shift by a run-time amount: forwards both arguments unchanged to simd128<128>::sll.
     2982IDISA_ALWAYS_INLINE bitblock128_t bitblock128::sll(bitblock128_t arg1, bitblock128_t arg2)
     2983{
     2984        return simd128<128>::sll(arg1, arg2);
     2985}
     2986
    29772987//The total number of operations is 1. Thin wrapper over the SSE2 intrinsic _mm_loadu_si128 (load from a possibly unaligned address).
    29782988IDISA_ALWAYS_INLINE bitblock128_t bitblock128::load_unaligned(bitblock128_t* arg1)
    29792989{
    29802990        return _mm_loadu_si128((bitblock128_t*)(arg1));
     2991}
     2992
     2993//The total number of operations is 4. Whole-block right shift by the compile-time amount sh: forwards to simd128<128>::srli<sh>.
     2994template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srli(bitblock128_t arg1)
     2995{
     2996        return simd128<128>::srli<sh>(arg1);
    29812997}
    29822998
     
    29993015}
    30003016
     3017//The total number of operations is 11. Whole-block right shift by a run-time amount: forwards both arguments unchanged to simd128<128>::srl.
     3018IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srl(bitblock128_t arg1, bitblock128_t arg2)
     3019{
     3020        return simd128<128>::srl(arg1, arg2);
     3021}
     3022
     3023//The total number of operations is 4. Whole-block left shift by the compile-time amount sh: forwards to simd128<128>::slli<sh>.
     3024template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
     3025{
     3026        return simd128<128>::slli<sh>(arg1);
     3027}
     3028
    30013029//The total number of operations is 2
    30023030IDISA_ALWAYS_INLINE bool bitblock128::any(bitblock128_t arg1)
  • trunk/lib/idisa_cpp/idisa_sse3.cpp

    r1573 r1580  
    2828        static IDISA_ALWAYS_INLINE bitblock128_t add_hl(bitblock128_t arg1);
    2929        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t shift_mask);
     30        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
    3031        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    3132        template <uint64_t val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    3233        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
    33         static IDISA_ALWAYS_INLINE bitblock128_t lomask();
    3434        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    3535        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
     
    8888{
    8989public:
     90        static IDISA_ALWAYS_INLINE bitblock128_t sll(bitblock128_t arg1, bitblock128_t arg2);
    9091        static IDISA_ALWAYS_INLINE bitblock128_t load_unaligned(bitblock128_t* arg1);
     92        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     93        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t arg2);
    9194        static IDISA_ALWAYS_INLINE void store_aligned(bitblock128_t* arg1, bitblock128_t arg2);
    9295        static IDISA_ALWAYS_INLINE bool all(bitblock128_t arg1);
    9396        static IDISA_ALWAYS_INLINE bool any(bitblock128_t arg1);
    9497        static IDISA_ALWAYS_INLINE uint64_t popcount(bitblock128_t arg1);
     98        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    9599        static IDISA_ALWAYS_INLINE bitblock128_t load_aligned(bitblock128_t* arg1);
    96100        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock128_t* arg1, bitblock128_t arg2);
     
    230234template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    231235template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    232 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    233 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    234 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    235 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    236 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    237 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    238 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    239236template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
    240237template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
     
    253250template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    254251template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
     252template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     253template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     254template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     255template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     256template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     257template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     258template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    255259template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    256260template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    261265template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    262266template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    263 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    264 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    265 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    266 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    267 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    268 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    269 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    270 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     267template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     268template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     269template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     270template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
    271274template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    272275template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     
    299302template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    300303template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    301 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    302 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    303 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    304 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    305 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    306 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    307 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     305template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     306template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     307template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     308template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     309template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     310template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
    308312template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    309313template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    440444template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    441445template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    442 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
    443 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
    444 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
    445 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
    446 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
    447 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
    448 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
    449446template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    450447template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
     
    474471template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    475472template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     473template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
     474template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
     475template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
     476template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
     477template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
     478template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
     479template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
    476480template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    477481template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
     
    13901394
    13911395//The total number of operations is 0
    1392 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
    1393 {
    1394         return simd128<2>::constant<(1)>();
    1395 }
    1396 
    1397 //The total number of operations is 0
    1398 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
    1399 {
    1400         return simd128<4>::constant<(3)>();
    1401 }
    1402 
    1403 //The total number of operations is 0
    1404 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
    1405 {
    1406         return simd128<8>::constant<(15)>();
    1407 }
    1408 
    1409 //The total number of operations is 0
    1410 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
    1411 {
    1412         return simd128<16>::constant<(255)>();
    1413 }
    1414 
    1415 //The total number of operations is 0
    1416 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
    1417 {
    1418         return simd128<32>::constant<(65535)>();
    1419 }
    1420 
    1421 //The total number of operations is 0
    1422 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
    1423 {
    1424         return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
    1425 }
    1426 
    1427 //The total number of operations is 0
    1428 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
    1429 {
    1430         return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
    1431 }
    1432 
    1433 //The total number of operations is 0
    14341396template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
    14351397{
     
    15321494}
    15331495
     1496//The total number of operations is 0
     1497template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
     1498{
     1499        return simd128<2>::constant<(1)>();
     1500}
     1501
     1502//The total number of operations is 0
     1503template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
     1504{
     1505        return simd128<4>::constant<(3)>();
     1506}
     1507
     1508//The total number of operations is 0
     1509template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
     1510{
     1511        return simd128<8>::constant<(15)>();
     1512}
     1513
     1514//The total number of operations is 0
     1515template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
     1516{
     1517        return simd128<16>::constant<(255)>();
     1518}
     1519
     1520//The total number of operations is 0
     1521template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
     1522{
     1523        return simd128<32>::constant<(65535)>();
     1524}
     1525
     1526//The total number of operations is 0
     1527template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
     1528{
     1529        return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
     1530}
     1531
     1532//The total number of operations is 0
     1533template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
     1534{
     1535        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
     1536}
     1537
    15341538//The total number of operations is 1
    15351539template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2)
     
    15881592}
    15891593
     1594//The total number of operations is 9
     1595template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
     1596{
     1597        return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
     1598}
     1599
     1600//The total number of operations is 19
     1601template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
     1602{
     1603        bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
     1604        return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
     1605}
     1606
     1607//The total number of operations is 5
     1608template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
     1609{
     1610        bitblock128_t gtMask = simd128<8>::gt(arg1, simd128<8>::constant<0>());
     1611        return simd128<1>::ifh(gtMask, arg1, simd128<8>::sub(gtMask, arg1));
     1612}
     1613
     1614//The total number of operations is 5
     1615template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
     1616{
     1617        bitblock128_t gtMask = simd128<16>::gt(arg1, simd128<16>::constant<0>());
     1618        return simd128<1>::ifh(gtMask, arg1, simd128<16>::sub(gtMask, arg1));
     1619}
     1620
     1621//The total number of operations is 5
     1622template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
     1623{
     1624        bitblock128_t gtMask = simd128<32>::gt(arg1, simd128<32>::constant<0>());
     1625        return simd128<1>::ifh(gtMask, arg1, simd128<32>::sub(gtMask, arg1));
     1626}
     1627
     1628//The total number of operations is 17
     1629template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
     1630{
     1631        bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
     1632        return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
     1633}
     1634
     1635//The total number of operations is 49
     1636template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
     1637{
     1638        bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
     1639        return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1640}
     1641
     1642//The total number of operations is 2
     1643template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1644{
     1645        return simd_not(simd_xor(arg1, arg2));
     1646}
     1647
     1648//The total number of operations is 8
     1649template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1650{
     1651        bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
     1652        bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
     1653        bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
     1654        return simd_or(loMask, hiMask);
     1655}
     1656
     1657//The total number of operations is 9
     1658template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1659{
     1660        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
     1661}
     1662
     1663//The total number of operations is 1
     1664template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1665{
     1666        return _mm_cmpeq_epi8(arg1, arg2);
     1667}
     1668
     1669//The total number of operations is 1
     1670template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1671{
     1672        return _mm_cmpeq_epi16(arg1, arg2);
     1673}
     1674
     1675//The total number of operations is 1
     1676template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1677{
     1678        return _mm_cmpeq_epi32(arg1, arg2);
     1679}
     1680
     1681//The total number of operations is 5
     1682template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1683{
     1684        bitblock128_t tmpAns = simd128<(32)>::eq(arg1, arg2);
     1685        bitblock128_t loMask = simd_and(tmpAns, simd128<64>::srli<(32)>(tmpAns));
     1686        bitblock128_t hiMask = simd128<64>::slli<(32)>(loMask);
     1687        return simd_or(loMask, hiMask);
     1688}
     1689
     1690//The total number of operations is 15
     1691template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1692{
     1693        bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
     1694        bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
     1695        bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
     1696        return simd_or(loMask, hiMask);
     1697}
     1698
     1699//The total number of operations is 4
     1700template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
     1701{
     1702        return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
     1703}
     1704
     1705//The total number of operations is 10
     1706template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
     1707{
     1708        bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1709        return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1710}
     1711
     1712//The total number of operations is 5
     1713template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
     1714{
     1715        bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1716        return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1717}
     1718
     1719//The total number of operations is 1
     1720template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
     1721{
     1722        return _mm_srai_epi16(arg1, (int32_t)(sh));
     1723}
     1724
     1725//The total number of operations is 1
     1726template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
     1727{
     1728        return _mm_srai_epi32(arg1, (int32_t)(sh));
     1729}
     1730
     1731//The total number of operations is 5
     1732template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
     1733{
     1734        bitblock128_t tmp = simd128<64>::srli<((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh))>(arg1);
     1735        return simd_or(tmp, simd128<64>::sub(simd128<64>::constant<0>(), simd_and(simd128<64>::slli<((64-((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh)))-1)>(simd128<64>::constant<1>()), tmp)));
     1736}
     1737
     1738//The total number of operations is 21
     1739template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
     1740{
     1741        bitblock128_t tmp = simd128<128>::srli<((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh))>(arg1);
     1742        return simd_or(tmp, simd128<128>::sub(simd128<128>::constant<0>(), simd_and(simd128<128>::slli<((128-((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh)))-1)>(simd128<128>::constant<1>()), tmp)));
     1743}
     1744
     1745//The total number of operations is 0
     1746template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
     1747{
     1748        return simd128<2>::constant<(2)>();
     1749}
     1750
     1751//The total number of operations is 0
     1752template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask()
     1753{
     1754        return simd128<4>::constant<(12)>();
     1755}
     1756
     1757//The total number of operations is 0
     1758template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::himask()
     1759{
     1760        return simd128<8>::constant<(240)>();
     1761}
     1762
     1763//The total number of operations is 0
     1764template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::himask()
     1765{
     1766        return simd128<16>::constant<(65280)>();
     1767}
     1768
     1769//The total number of operations is 0
     1770template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
     1771{
     1772        return simd128<32>::constant<-65536>();
     1773}
     1774
     1775//The total number of operations is 0
     1776template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
     1777{
     1778        return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
     1779}
     1780
     1781//The total number of operations is 0
     1782template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
     1783{
     1784        return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
     1785}
     1786
     1787//The total number of operations is 1
     1788template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::add(bitblock128_t arg1, bitblock128_t arg2)
     1789{
     1790        return simd_xor(arg1, arg2);
     1791}
     1792
     1793//The total number of operations is 10
     1794template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add(bitblock128_t arg1, bitblock128_t arg2)
     1795{
     1796        bitblock128_t tmp = simd_xor(arg1, arg2);
     1797        return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(arg1, arg2))), tmp);
     1798}
     1799
     1800//The total number of operations is 6
     1801template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add(bitblock128_t arg1, bitblock128_t arg2)
     1802{
     1803        return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::add(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::add(arg1, arg2));
     1804}
     1805
     1806//The total number of operations is 1
     1807template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add(bitblock128_t arg1, bitblock128_t arg2)
     1808{
     1809        return _mm_add_epi8(arg1, arg2);
     1810}
     1811
     1812//The total number of operations is 1
     1813template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add(bitblock128_t arg1, bitblock128_t arg2)
     1814{
     1815        return _mm_add_epi16(arg1, arg2);
     1816}
     1817
     1818//The total number of operations is 1
     1819template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add(bitblock128_t arg1, bitblock128_t arg2)
     1820{
     1821        return _mm_add_epi32(arg1, arg2);
     1822}
     1823
     1824//The total number of operations is 1
     1825template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2)
     1826{
     1827        return _mm_add_epi64(arg1, arg2);
     1828}
     1829
     1830//The total number of operations is 11
     1831template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2)
     1832{
     1833        bitblock128_t partial = simd128<(64)>::add(arg1, arg2);
     1834        bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
     1835        bitblock128_t carry = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(carryMask));
     1836        return simd128<(64)>::add(partial, carry);
     1837}
     1838
    15901839//The total number of operations is 1
    15911840template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
     
    16421891        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    16431892        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1644 }
    1645 
    1646 //The total number of operations is 2
    1647 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1648 {
    1649         return simd_not(simd_xor(arg1, arg2));
    1650 }
    1651 
    1652 //The total number of operations is 8
    1653 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1654 {
    1655         bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
    1656         bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
    1657         bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
    1658         return simd_or(loMask, hiMask);
    1659 }
    1660 
    1661 //The total number of operations is 9
    1662 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1663 {
    1664         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
    1665 }
    1666 
    1667 //The total number of operations is 1
    1668 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1669 {
    1670         return _mm_cmpeq_epi8(arg1, arg2);
    1671 }
    1672 
    1673 //The total number of operations is 1
    1674 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1675 {
    1676         return _mm_cmpeq_epi16(arg1, arg2);
    1677 }
    1678 
    1679 //The total number of operations is 1
    1680 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1681 {
    1682         return _mm_cmpeq_epi32(arg1, arg2);
    1683 }
    1684 
    1685 //The total number of operations is 5
    1686 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1687 {
    1688         bitblock128_t tmpAns = simd128<(32)>::eq(arg1, arg2);
    1689         bitblock128_t loMask = simd_and(tmpAns, simd128<64>::srli<(32)>(tmpAns));
    1690         bitblock128_t hiMask = simd128<64>::slli<(32)>(loMask);
    1691         return simd_or(loMask, hiMask);
    1692 }
    1693 
    1694 //The total number of operations is 15
    1695 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1696 {
    1697         bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
    1698         bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
    1699         bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
    1700         return simd_or(loMask, hiMask);
    1701 }
    1702 
    1703 //The total number of operations is 4
    1704 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1705 {
    1706         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1707 }
    1708 
    1709 //The total number of operations is 10
    1710 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1711 {
    1712         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1713         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1714 }
    1715 
    1716 //The total number of operations is 5
    1717 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1718 {
    1719         bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1720         return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1721 }
    1722 
    1723 //The total number of operations is 1
    1724 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1725 {
    1726         return _mm_srai_epi16(arg1, (int32_t)(sh));
    1727 }
    1728 
    1729 //The total number of operations is 1
    1730 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1731 {
    1732         return _mm_srai_epi32(arg1, (int32_t)(sh));
    1733 }
    1734 
    1735 //The total number of operations is 5
    1736 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1737 {
    1738         bitblock128_t tmp = simd128<64>::srli<((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh))>(arg1);
    1739         return simd_or(tmp, simd128<64>::sub(simd128<64>::constant<0>(), simd_and(simd128<64>::slli<((64-((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh)))-1)>(simd128<64>::constant<1>()), tmp)));
    1740 }
    1741 
    1742 //The total number of operations is 21
    1743 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1744 {
    1745         bitblock128_t tmp = simd128<128>::srli<((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh))>(arg1);
    1746         return simd_or(tmp, simd128<128>::sub(simd128<128>::constant<0>(), simd_and(simd128<128>::slli<((128-((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh)))-1)>(simd128<128>::constant<1>()), tmp)));
    1747 }
    1748 
    1749 //The total number of operations is 0
    1750 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
    1751 {
    1752         return simd128<2>::constant<(2)>();
    1753 }
    1754 
    1755 //The total number of operations is 0
    1756 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask()
    1757 {
    1758         return simd128<4>::constant<(12)>();
    1759 }
    1760 
    1761 //The total number of operations is 0
    1762 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::himask()
    1763 {
    1764         return simd128<8>::constant<(240)>();
    1765 }
    1766 
    1767 //The total number of operations is 0
    1768 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::himask()
    1769 {
    1770         return simd128<16>::constant<(65280)>();
    1771 }
    1772 
    1773 //The total number of operations is 0
    1774 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
    1775 {
    1776         return simd128<32>::constant<-65536>();
    1777 }
    1778 
    1779 //The total number of operations is 0
    1780 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
    1781 {
    1782         return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
    1783 }
    1784 
    1785 //The total number of operations is 0
    1786 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
    1787 {
    1788         return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
    1789 }
    1790 
    1791 //The total number of operations is 1
    1792 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::add(bitblock128_t arg1, bitblock128_t arg2)
    1793 {
    1794         return simd_xor(arg1, arg2);
    1795 }
    1796 
    1797 //The total number of operations is 10
    1798 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add(bitblock128_t arg1, bitblock128_t arg2)
    1799 {
    1800         bitblock128_t tmp = simd_xor(arg1, arg2);
    1801         return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(arg1, arg2))), tmp);
    1802 }
    1803 
    1804 //The total number of operations is 6
    1805 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add(bitblock128_t arg1, bitblock128_t arg2)
    1806 {
    1807         return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::add(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::add(arg1, arg2));
    1808 }
    1809 
    1810 //The total number of operations is 1
    // 8-bit field addition: thin wrapper over the _mm_add_epi8 intrinsic.
    1811 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add(bitblock128_t arg1, bitblock128_t arg2)
    1812 {
    1813         return _mm_add_epi8(arg1, arg2);
    1814 }
    1815 
    1816 //The total number of operations is 1
    // 16-bit field addition: thin wrapper over the _mm_add_epi16 intrinsic.
    1817 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add(bitblock128_t arg1, bitblock128_t arg2)
    1818 {
    1819         return _mm_add_epi16(arg1, arg2);
    1820 }
    1821 
    1822 //The total number of operations is 1
    // 32-bit field addition: thin wrapper over the _mm_add_epi32 intrinsic.
    1823 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add(bitblock128_t arg1, bitblock128_t arg2)
    1824 {
    1825         return _mm_add_epi32(arg1, arg2);
    1826 }
    1827 
    1828 //The total number of operations is 1
    // 64-bit field addition: thin wrapper over the _mm_add_epi64 intrinsic.
    1829 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2)
    1830 {
    1831         return _mm_add_epi64(arg1, arg2);
    1832 }
    1833 
    1834 //The total number of operations is 11
    // 128-bit addition assembled from 64-bit field additions.
    //   partial   - 64-bit field sums of the operands.
    //   carryMask - (arg1 & arg2) | ((arg1 ^ arg2) combined with partial via simd_andc); the
    //               top bit of each 64-bit field is then used as that field's carry-out.
    //               NOTE(review): assumes simd_andc(x, y) is x & ~y - confirm.
    //   carry     - the 64-bit srli<63> moves each field's top bit down to bit 0, and the
    //               128-bit slli<64> then moves it up by one 64-bit field.
    // The final 64-bit add folds that carry into partial.
    1835 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2)
    1836 {
    1837         bitblock128_t partial = simd128<(64)>::add(arg1, arg2);
    1838         bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
    1839         bitblock128_t carry = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(carryMask));
    1840         return simd128<(64)>::add(partial, carry);
    1841 }
    1842 
    1843 //The total number of operations is 9
    // 2-bit absolute value from bitwise operations: simd128<1>::ifh chooses, under
    // simd128<2>::himask(), between arg1 AND (NOT arg1 shifted left one bit) and arg1 itself.
    // NOTE(review): assumes ifh(m, a, b) yields a where m is set, so the high bit of each
    // field becomes hi & ~lo while the low bit is kept - confirm.
    1844 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
    1845 {
    1846         return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
    1847 }
    1848 
    1849 //The total number of operations is 19
    // 4-bit absolute value. gtMask is the result of comparing arg1 against the zero constant
    // with simd128<4>::gt; ifh keeps arg1 under that mask and uses sub(gtMask, arg1) elsewhere.
    // NOTE(review): presumably gtMask is all-zero in non-positive fields, making the
    // subtraction 0 - arg1 there - confirm.
    1850 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
    1851 {
    1852         bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
    1853         return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
    1854 }
    1855 
    1856 //The total number of operations is 5
    // 8-bit absolute value. gtMask is the result of comparing arg1 against the zero constant
    // with simd128<8>::gt; ifh keeps arg1 under that mask and uses sub(gtMask, arg1) elsewhere.
    // NOTE(review): presumably gtMask is all-zero in non-positive fields, making the
    // subtraction 0 - arg1 there - confirm.
    1857 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
    1858 {
    1859         bitblock128_t gtMask = simd128<8>::gt(arg1, simd128<8>::constant<0>());
    1860         return simd128<1>::ifh(gtMask, arg1, simd128<8>::sub(gtMask, arg1));
    1861 }
    1862 
    1863 //The total number of operations is 5
    // 16-bit absolute value. gtMask is the result of comparing arg1 against the zero constant
    // with simd128<16>::gt; ifh keeps arg1 under that mask and uses sub(gtMask, arg1) elsewhere.
    // NOTE(review): presumably gtMask is all-zero in non-positive fields, making the
    // subtraction 0 - arg1 there - confirm.
    1864 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
    1865 {
    1866         bitblock128_t gtMask = simd128<16>::gt(arg1, simd128<16>::constant<0>());
    1867         return simd128<1>::ifh(gtMask, arg1, simd128<16>::sub(gtMask, arg1));
    1868 }
    1869 
    1870 //The total number of operations is 5
    // 32-bit absolute value. gtMask is the result of comparing arg1 against the zero constant
    // with simd128<32>::gt; ifh keeps arg1 under that mask and uses sub(gtMask, arg1) elsewhere.
    // NOTE(review): presumably gtMask is all-zero in non-positive fields, making the
    // subtraction 0 - arg1 there - confirm.
    1871 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
    1872 {
    1873         bitblock128_t gtMask = simd128<32>::gt(arg1, simd128<32>::constant<0>());
    1874         return simd128<1>::ifh(gtMask, arg1, simd128<32>::sub(gtMask, arg1));
    1875 }
    1876 
    1877 //The total number of operations is 17
    // 64-bit absolute value derived from the 32-bit one. A probe value is formed by ifh from
    // simd128<32>::abs(arg1) and arg1 under simd128<64>::himask(), then compared to arg1 with
    // the 64-bit eq. Fields that compare equal keep arg1; the others get sub(eqMask, arg1).
    // NOTE(review): presumably equality means the upper 32 bits were unchanged by abs (field
    // not negative), and eqMask is zero elsewhere so the subtraction is 0 - arg1 - confirm.
    1878 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
    1879 {
    1880         bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
    1881         return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
    1882 }
    1883 
    1884 //The total number of operations is 49
    // 128-bit absolute value derived from the 64-bit one. A probe value is formed by ifh from
    // simd128<64>::abs(arg1) and arg1 under simd128<128>::himask(), then compared to arg1 with
    // the 128-bit eq. If equal, arg1 is kept; otherwise sub(eqMask, arg1) is used.
    // NOTE(review): presumably equality means the upper 64 bits were unchanged by abs (value
    // not negative), and eqMask is zero otherwise so the subtraction is 0 - arg1 - confirm.
    1885 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
    1886 {
    1887         bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
    1888         return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
    18891893}
    18901894
     
    27412745}
    27422746
    2743 //The total number of operations is 4
    // Left shift by sh fields of 2 bits: forwards to the block-wide simd128<128>::slli with
    // the compile-time shift scaled to a bit count of sh*2.
    2744 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
    2745 {
    2746         return simd128<128>::slli<(sh*2)>(arg1);
    2747 }
    2748 
    2749 //The total number of operations is 4
    // Left shift by sh fields of 4 bits: forwards to the block-wide simd128<128>::slli with
    // the compile-time shift scaled to a bit count of sh*4.
    2750 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
    2751 {
    2752         return simd128<128>::slli<(sh*4)>(arg1);
    2753 }
    2754 
    2755 //The total number of operations is 4
    // Left shift by sh fields of 8 bits: forwards to the block-wide simd128<128>::slli with
    // the compile-time shift scaled to a bit count of sh*8.
    2756 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
    2757 {
    2758         return simd128<128>::slli<(sh*8)>(arg1);
    2759 }
    2760 
    2761 //The total number of operations is 4
    // Left shift by sh fields of 16 bits: forwards to the block-wide simd128<128>::slli with
    // the compile-time shift scaled to a bit count of sh*16.
    2762 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
    2763 {
    2764         return simd128<128>::slli<(sh*16)>(arg1);
    2765 }
    2766 
    2767 //The total number of operations is 4
    // Left shift by sh fields of 32 bits: forwards to the block-wide simd128<128>::slli with
    // the compile-time shift scaled to a bit count of sh*32.
    2768 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
    2769 {
    2770         return simd128<128>::slli<(sh*32)>(arg1);
    2771 }
    2772 
    2773 //The total number of operations is 4
    // Left shift by sh fields of 64 bits: forwards to the block-wide simd128<128>::slli with
    // the compile-time shift scaled to a bit count of sh*64.
    2774 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
    2775 {
    2776         return simd128<128>::slli<(sh*64)>(arg1);
    2777 }
    2778 
    2779 //The total number of operations is 4
    // Left shift by sh fields of 128 bits: forwards to simd128<128>::slli with a bit count
    // of sh*128.
    // NOTE(review): any sh >= 1 produces a count of at least 128; the result depends on how
    // simd128<128>::slli treats counts that large - confirm.
    2780 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
    2781 {
    2782         return simd128<128>::slli<(sh*128)>(arg1);
    2783 }
    2784 
    27852747//The total number of operations is 5
    27862748template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     
    29452907}
    29462908
     2909//The total number of operations is 4
     // Left shift by sh fields of 2 bits: forwards to the block-wide simd128<128>::slli with
     // the compile-time shift scaled to a bit count of sh*2.
     2910template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
     2911{
     2912        return simd128<128>::slli<(sh*2)>(arg1);
     2913}
     2914
     2915//The total number of operations is 4
     // Left shift by sh fields of 4 bits: forwards to the block-wide simd128<128>::slli with
     // the compile-time shift scaled to a bit count of sh*4.
     2916template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
     2917{
     2918        return simd128<128>::slli<(sh*4)>(arg1);
     2919}
     2920
     2921//The total number of operations is 4
     // Left shift by sh fields of 8 bits: forwards to the block-wide simd128<128>::slli with
     // the compile-time shift scaled to a bit count of sh*8.
     2922template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
     2923{
     2924        return simd128<128>::slli<(sh*8)>(arg1);
     2925}
     2926
     2927//The total number of operations is 4
     // Left shift by sh fields of 16 bits: forwards to the block-wide simd128<128>::slli with
     // the compile-time shift scaled to a bit count of sh*16.
     2928template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
     2929{
     2930        return simd128<128>::slli<(sh*16)>(arg1);
     2931}
     2932
     2933//The total number of operations is 4
     // Left shift by sh fields of 32 bits: forwards to the block-wide simd128<128>::slli with
     // the compile-time shift scaled to a bit count of sh*32.
     2934template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
     2935{
     2936        return simd128<128>::slli<(sh*32)>(arg1);
     2937}
     2938
     2939//The total number of operations is 4
     // Left shift by sh fields of 64 bits: forwards to the block-wide simd128<128>::slli with
     // the compile-time shift scaled to a bit count of sh*64.
     2940template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
     2941{
     2942        return simd128<128>::slli<(sh*64)>(arg1);
     2943}
     2944
     2945//The total number of operations is 4
     // Left shift by sh fields of 128 bits: forwards to simd128<128>::slli with a bit count
     // of sh*128.
     // NOTE(review): any sh >= 1 produces a count of at least 128; the result depends on how
     // simd128<128>::slli treats counts that large - confirm.
     2946template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
     2947{
     2948        return simd128<128>::slli<(sh*128)>(arg1);
     2949}
     2950
    29472951//The total number of operations is 13
    29482952template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     
    29752979}
    29762980
     2981//The total number of operations is 11
     // Whole-block left shift with a run-time shift operand: forwards both arguments
     // unchanged to simd128<128>::sll.
     2982IDISA_ALWAYS_INLINE bitblock128_t bitblock128::sll(bitblock128_t arg1, bitblock128_t arg2)
     2983{
     2984        return simd128<128>::sll(arg1, arg2);
     2985}
     2986
    29772987//The total number of operations is 1
    // Loads one 128-bit block from arg1 using _mm_loadu_si128, the unaligned-load intrinsic,
    // so the pointer does not need 16-byte alignment.
    29782988IDISA_ALWAYS_INLINE bitblock128_t bitblock128::load_unaligned(bitblock128_t* arg1)
    29792989{
    29802990        return _mm_loadu_si128((bitblock128_t*)(arg1));
     2991}
     2992
     2993//The total number of operations is 4
     // Whole-block right shift by the compile-time amount sh: forwards to
     // simd128<128>::srli<sh>.
     2994template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srli(bitblock128_t arg1)
     2995{
     2996        return simd128<128>::srli<sh>(arg1);
    29812997}
    29822998
     
    29993015}
    30003016
     3017//The total number of operations is 11
     // Whole-block right shift with a run-time shift operand: forwards both arguments
     // unchanged to simd128<128>::srl.
     3018IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srl(bitblock128_t arg1, bitblock128_t arg2)
     3019{
     3020        return simd128<128>::srl(arg1, arg2);
     3021}
     3022
     3023//The total number of operations is 4
     // Whole-block left shift by the compile-time amount sh: forwards to
     // simd128<128>::slli<sh>.
     3024template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
     3025{
     3026        return simd128<128>::slli<sh>(arg1);
     3027}
     3028
    30013029//The total number of operations is 2
    30023030IDISA_ALWAYS_INLINE bool bitblock128::any(bitblock128_t arg1)
  • trunk/lib/idisa_cpp/idisa_sse4_1.cpp

    r1573 r1580  
    2828        static IDISA_ALWAYS_INLINE bitblock128_t add_hl(bitblock128_t arg1);
    2929        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t shift_mask);
     30        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
    3031        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    3132        template <uint64_t val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    3233        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
    33         static IDISA_ALWAYS_INLINE bitblock128_t lomask();
    3434        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    3535        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
     
    8989{
    9090public:
     91        static IDISA_ALWAYS_INLINE bitblock128_t sll(bitblock128_t arg1, bitblock128_t arg2);
    9192        static IDISA_ALWAYS_INLINE bitblock128_t load_unaligned(bitblock128_t* arg1);
     93        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     94        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t arg2);
    9295        static IDISA_ALWAYS_INLINE void store_aligned(bitblock128_t* arg1, bitblock128_t arg2);
    9396        static IDISA_ALWAYS_INLINE bool all(bitblock128_t arg1);
    9497        static IDISA_ALWAYS_INLINE bool any(bitblock128_t arg1);
    9598        static IDISA_ALWAYS_INLINE uint64_t popcount(bitblock128_t arg1);
     99        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    96100        static IDISA_ALWAYS_INLINE bitblock128_t load_aligned(bitblock128_t* arg1);
    97101        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock128_t* arg1, bitblock128_t arg2);
     
    231235template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    232236template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    233 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    234 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    235 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    236 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    237 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    238 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    239 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    240237template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
    241238template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
     
    254251template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    255252template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
     253template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     254template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     255template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     256template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     257template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     258template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     259template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    256260template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    257261template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    262266template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    263267template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    264 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    265 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    266 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    267 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    268 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    269 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    270 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    271 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     268template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     269template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     270template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     274template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
    272275template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    273276template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     
    300303template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    301304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    302 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    303 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    304 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    305 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    306 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    307 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    308 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     305template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     306template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     307template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     308template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     309template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     310template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     312template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
    309313template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    310314template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    445449template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    446450template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    447 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
    448 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
    449 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
    450 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
    451 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
    452 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
    453 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
    454451template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    455452template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
     
    479476template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    480477template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     478template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
     479template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
     480template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
     481template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
     482template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
     483template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
     484template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
    481485template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    482486template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
     
    13961400
    13971401//The total number of operations is 0
    // Low-half mask for 2-bit fields: the constant 1 is binary 01 (low bit of the pair set).
    // NOTE(review): assumes constant<v>() replicates v into every 2-bit field - confirm.
    1398 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
    1399 {
    1400         return simd128<2>::constant<(1)>();
    1401 }
    1402 
    1403 //The total number of operations is 0
    // Low-half mask for 4-bit fields: the constant 3 is binary 0011 (lower 2 of 4 bits set).
    // NOTE(review): assumes constant<v>() replicates v into every 4-bit field - confirm.
    1404 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
    1405 {
    1406         return simd128<4>::constant<(3)>();
    1407 }
    1408 
    1409 //The total number of operations is 0
    // Low-half mask for 8-bit fields: the constant 15 is 0x0F (lower 4 of 8 bits set).
    // NOTE(review): assumes constant<v>() replicates v into every 8-bit field - confirm.
    1410 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
    1411 {
    1412         return simd128<8>::constant<(15)>();
    1413 }
    1414 
    1415 //The total number of operations is 0
    // Low-half mask for 16-bit fields: the constant 255 is 0x00FF (lower 8 of 16 bits set).
    // NOTE(review): assumes constant<v>() replicates v into every 16-bit field - confirm.
    1416 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
    1417 {
    1418         return simd128<16>::constant<(255)>();
    1419 }
    1420 
    1421 //The total number of operations is 0
    // Low-half mask for 32-bit fields: the constant 65535 is 0x0000FFFF (lower 16 of 32 bits set).
    // NOTE(review): assumes constant<v>() replicates v into every 32-bit field - confirm.
    1422 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
    1423 {
    1424         return simd128<32>::constant<(65535)>();
    1425 }
    1426 
    1427 //The total number of operations is 0
    // Low-half mask for 64-bit fields. _mm_set_epi32 takes its 32-bit elements from most to
    // least significant, so (0, -1, 0, -1) sets the lower 32 bits of each 64-bit half.
    1428 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
    1429 {
    1430         return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
    1431 }
    1432 
    1433 //The total number of operations is 0
    // Low-half mask for the single 128-bit field: the two most significant 32-bit elements
    // are zero and the two least significant are all ones (lower 64 bits set).
    1434 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
    1435 {
    1436         return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
    1437 }
    1438 
    1439 //The total number of operations is 0
    14401402template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
    14411403{
     
    15421504}
    15431505
     1506//The total number of operations is 0
     // Low-half mask for 2-bit fields: the constant 1 is binary 01 (low bit of the pair set).
     // NOTE(review): assumes constant<v>() replicates v into every 2-bit field - confirm.
     1507template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
     1508{
     1509        return simd128<2>::constant<(1)>();
     1510}
     1511
     1512//The total number of operations is 0
     // Low-half mask for 4-bit fields: the constant 3 is binary 0011 (lower 2 of 4 bits set).
     // NOTE(review): assumes constant<v>() replicates v into every 4-bit field - confirm.
     1513template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
     1514{
     1515        return simd128<4>::constant<(3)>();
     1516}
     1517
     1518//The total number of operations is 0
     // Low-half mask for 8-bit fields: the constant 15 is 0x0F (lower 4 of 8 bits set).
     // NOTE(review): assumes constant<v>() replicates v into every 8-bit field - confirm.
     1519template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
     1520{
     1521        return simd128<8>::constant<(15)>();
     1522}
     1523
     1524//The total number of operations is 0
     // Low-half mask for 16-bit fields: the constant 255 is 0x00FF (lower 8 of 16 bits set).
     // NOTE(review): assumes constant<v>() replicates v into every 16-bit field - confirm.
     1525template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
     1526{
     1527        return simd128<16>::constant<(255)>();
     1528}
     1529
     1530//The total number of operations is 0
     // Low-half mask for 32-bit fields: the constant 65535 is 0x0000FFFF (lower 16 of 32 bits set).
     // NOTE(review): assumes constant<v>() replicates v into every 32-bit field - confirm.
     1531template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
     1532{
     1533        return simd128<32>::constant<(65535)>();
     1534}
     1535
     1536//The total number of operations is 0
     // Low-half mask for 64-bit fields. _mm_set_epi32 takes its 32-bit elements from most to
     // least significant, so (0, -1, 0, -1) sets the lower 32 bits of each 64-bit half.
     1537template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
     1538{
     1539        return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
     1540}
     1541
     1542//The total number of operations is 0
     // Low-half mask for the single 128-bit field: the two most significant 32-bit elements
     // are zero and the two least significant are all ones (lower 64 bits set).
     1543template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
     1544{
     1545        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
     1546}
     1547
    15441548//The total number of operations is 1
    15451549template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2)
     
    15961600}
    15971601
     1602//The total number of operations is 9
     // 2-bit absolute value from bitwise operations: simd128<1>::ifh chooses, under
     // simd128<2>::himask(), between arg1 AND (NOT arg1 shifted left one bit) and arg1 itself.
     // NOTE(review): assumes ifh(m, a, b) yields a where m is set, so the high bit of each
     // field becomes hi & ~lo while the low bit is kept - confirm.
     1603template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
     1604{
     1605        return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
     1606}
     1607
     1608//The total number of operations is 19
     // 4-bit absolute value. gtMask is the result of comparing arg1 against the zero constant
     // with simd128<4>::gt; ifh keeps arg1 under that mask and uses sub(gtMask, arg1) elsewhere.
     // NOTE(review): presumably gtMask is all-zero in non-positive fields, making the
     // subtraction 0 - arg1 there - confirm.
     1609template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
     1610{
     1611        bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
     1612        return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
     1613}
     1614
     1615//The total number of operations is 1
     // 8-bit absolute value: thin wrapper over the _mm_abs_epi8 intrinsic.
     1616template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
     1617{
     1618        return _mm_abs_epi8(arg1);
     1619}
     1620
     1621//The total number of operations is 1
     // 16-bit absolute value: thin wrapper over the _mm_abs_epi16 intrinsic.
     1622template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
     1623{
     1624        return _mm_abs_epi16(arg1);
     1625}
     1626
     1627//The total number of operations is 1
     // 32-bit absolute value: thin wrapper over the _mm_abs_epi32 intrinsic.
     1628template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
     1629{
     1630        return _mm_abs_epi32(arg1);
     1631}
     1632
     1633//The total number of operations is 9
     // 64-bit absolute value derived from the 32-bit one. A probe value is formed by ifh from
     // simd128<32>::abs(arg1) and arg1 under simd128<64>::himask(), then compared to arg1 with
     // the 64-bit eq. Fields that compare equal keep arg1; the others get sub(eqMask, arg1).
     // NOTE(review): presumably equality means the upper 32 bits were unchanged by abs (field
     // not negative), and eqMask is zero elsewhere so the subtraction is 0 - arg1 - confirm.
     1634template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
     1635{
     1636        bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
     1637        return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
     1638}
     1639
     1640//The total number of operations is 37
     // 128-bit absolute value derived from the 64-bit one. A probe value is formed by ifh from
     // simd128<64>::abs(arg1) and arg1 under simd128<128>::himask(), then compared to arg1 with
     // the 128-bit eq. If equal, arg1 is kept; otherwise sub(eqMask, arg1) is used.
     // NOTE(review): presumably equality means the upper 64 bits were unchanged by abs (value
     // not negative), and eqMask is zero otherwise so the subtraction is 0 - arg1 - confirm.
     1641template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
     1642{
     1643        bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
     1644        return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1645}
     1646
     1647//The total number of operations is 2
     // 1-bit equality: the complement of the XOR of the operands (simd_not over simd_xor),
     // so a result bit is set exactly where the two input bits agree.
     1648template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1649{
     1650        return simd_not(simd_xor(arg1, arg2));
     1651}
     1652
     1653//The total number of operations is 8
     // 2-bit equality built on the 1-bit result.
     //   tmpAns - per-bit equality from simd128<1>::eq.
     //   loMask - tmpAns ANDed with itself shifted right by one (2-bit srli).
     //   hiMask - loMask shifted left by one (2-bit slli).
     // The OR of loMask and hiMask is returned.
     // NOTE(review): presumably the 2-bit shifts stay inside each field, so loMask holds
     // "both bits equal" in the low bit and hiMask copies it to the high bit - confirm.
     1654template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1655{
     1656        bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
     1657        bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
     1658        bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
     1659        return simd_or(loMask, hiMask);
     1660}
     1661
     1662//The total number of operations is 9
     // 4-bit equality emulated with two 8-bit comparisons. The operands are masked with
     // simd128<8>::himask() and compared, then masked with simd128<8>::lomask() and compared;
     // each comparison result is ANDed with its own mask and the two parts are ORed together.
     1663template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1664{
     1665        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
     1666}
     1667
     1668//The total number of operations is 1
     // 8-bit equality: thin wrapper over the _mm_cmpeq_epi8 intrinsic.
     1669template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1670{
     1671        return _mm_cmpeq_epi8(arg1, arg2);
     1672}
     1673
     1674//The total number of operations is 1
     // 16-bit equality: thin wrapper over the _mm_cmpeq_epi16 intrinsic.
     1675template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1676{
     1677        return _mm_cmpeq_epi16(arg1, arg2);
     1678}
     1679
     1680//The total number of operations is 1
     // 32-bit equality: thin wrapper over the _mm_cmpeq_epi32 intrinsic.
     1681template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1682{
     1683        return _mm_cmpeq_epi32(arg1, arg2);
     1684}
     1685
     1686//The total number of operations is 1
     // 64-bit equality: thin wrapper over the _mm_cmpeq_epi64 intrinsic.
     1687template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1688{
     1689        return _mm_cmpeq_epi64(arg1, arg2);
     1690}
     1691
     1692//The total number of operations is 11
     // 128-bit equality built on the 64-bit result.
     //   tmpAns - 64-bit field equality from simd128<64>::eq.
     //   loMask - tmpAns ANDed with itself shifted right by 64 bits, which combines the
     //            upper field's result with the lower field's in the lower 64 bits.
     //   hiMask - loMask shifted left by 64 bits, copying that value to the upper half.
     // The OR of loMask and hiMask is returned.
     1693template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1694{
     1695        bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
     1696        bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
     1697        bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
     1698        return simd_or(loMask, hiMask);
     1699}
     1700
     1701//The total number of operations is 4
     // Arithmetic right shift for 2-bit fields. sh == 0 returns arg1 unchanged. For every
     // other sh the same expression is used (it does not depend on sh): the bits selected by
     // simd128<2>::himask() are kept and ORed with arg1 shifted right by one (2-bit srli).
     1702template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
     1703{
     1704        return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
     1705}
     1706
     1707//The total number of operations is 10
     // Arithmetic right shift for 4-bit fields, emulated with a logical shift plus a fill.
     // The shift amount is clamped: sh >= 4 becomes 3; the (sh < 0) arm can never be taken
     // because sh is a uint64_t. tmp is the logical shift by that amount. The constant
     // 1 << (4 - shift - 1) is the bit position the original top bit moves to; that bit of
     // tmp is isolated, subtracted from the zero constant, and ORed back into tmp.
     // NOTE(review): presumably simd128<4>::sub wraps within each field, so 0 - bit yields
     // ones from that position upward when the bit is set - confirm.
     1708template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
     1709{
     1710        bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1711        return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1712}
     1713
     1714//The total number of operations is 5
     // Arithmetic right shift for 8-bit fields, emulated with a logical shift plus a fill.
     // The shift amount is clamped: sh >= 8 becomes 7; the (sh < 0) arm can never be taken
     // because sh is a uint64_t. tmp is the logical shift by that amount. The constant
     // 1 << (8 - shift - 1) is the bit position the original top bit moves to; that bit of
     // tmp is isolated, subtracted from the zero constant, and ORed back into tmp.
     // NOTE(review): presumably simd128<8>::sub wraps within each field, so 0 - bit yields
     // ones from that position upward when the bit is set - confirm.
     1715template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
     1716{
     1717        bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1718        return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1719}
     1720
     1721//The total number of operations is 1
     // Arithmetic right shift for 16-bit fields: thin wrapper over _mm_srai_epi16, with the
     // compile-time amount sh cast to int32_t for the intrinsic's count argument.
     1722template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
     1723{
     1724        return _mm_srai_epi16(arg1, (int32_t)(sh));
     1725}
     1726
     1727//The total number of operations is 1
     // Arithmetic right shift for 32-bit fields: thin wrapper over _mm_srai_epi32, with the
     // compile-time amount sh cast to int32_t for the intrinsic's count argument.
     1728template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
     1729{
     1730        return _mm_srai_epi32(arg1, (int32_t)(sh));
     1731}
     1732
     1733//The total number of operations is 5
     // Arithmetic right shift for 64-bit fields, emulated with a logical shift plus a fill.
     // The shift amount is clamped: sh >= 64 becomes 63; the (sh < 0) arm can never be taken
     // because sh is a uint64_t. tmp is the logical shift by that amount. A single-bit marker
     // is built by shifting the constant 1 left by (64 - shift - 1), the position the original
     // top bit moves to; that bit of tmp is isolated, subtracted from the zero constant, and
     // ORed back into tmp.
     // NOTE(review): presumably simd128<64>::sub wraps within each field, so 0 - bit yields
     // ones from that position upward when the bit is set - confirm.
     1734template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
     1735{
     1736        bitblock128_t tmp = simd128<64>::srli<((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh))>(arg1);
     1737        return simd_or(tmp, simd128<64>::sub(simd128<64>::constant<0>(), simd_and(simd128<64>::slli<((64-((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh)))-1)>(simd128<64>::constant<1>()), tmp)));
     1738}
     1739
     1740//The total number of operations is 21
     // Arithmetic right shift for the 128-bit field, emulated with a logical shift plus a fill.
     // The shift amount is clamped: sh >= 128 becomes 127; the (sh < 0) arm can never be taken
     // because sh is a uint64_t. tmp is the logical shift by that amount. A single-bit marker
     // is built by shifting the constant 1 left by (128 - shift - 1), the position the original
     // top bit moves to; that bit of tmp is isolated, subtracted from the zero constant, and
     // ORed back into tmp.
     // NOTE(review): presumably simd128<128>::sub wraps modulo 2^128, so 0 - bit yields ones
     // from that position upward when the bit is set - confirm.
     1741template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
     1742{
     1743        bitblock128_t tmp = simd128<128>::srli<((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh))>(arg1);
     1744        return simd_or(tmp, simd128<128>::sub(simd128<128>::constant<0>(), simd_and(simd128<128>::slli<((128-((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh)))-1)>(simd128<128>::constant<1>()), tmp)));
     1745}
     1746
     1747//The total number of operations is 0
     // High-half mask for 2-bit fields: the constant 2 is binary 10 (high bit of the pair set).
     // NOTE(review): assumes constant<v>() replicates v into every 2-bit field - confirm.
     1748template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
     1749{
     1750        return simd128<2>::constant<(2)>();
     1751}
     1752
     1753//The total number of operations is 0
     // High-half mask for 4-bit fields: the constant 12 is binary 1100 (upper 2 of 4 bits set).
     // NOTE(review): assumes constant<v>() replicates v into every 4-bit field - confirm.
     1754template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask()
     1755{
     1756        return simd128<4>::constant<(12)>();
     1757}
     1758
     1759//The total number of operations is 0
     // High-half mask for 8-bit fields: the constant 240 is 0xF0 (upper 4 of 8 bits set).
     // NOTE(review): assumes constant<v>() replicates v into every 8-bit field - confirm.
     1760template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::himask()
     1761{
     1762        return simd128<8>::constant<(240)>();
     1763}
     1764
     1765//The total number of operations is 0
     1766template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::himask()
     1767{
     1768        return simd128<16>::constant<(65280)>();
     1769}
     1770
     1771//The total number of operations is 0
     1772template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
     1773{
     1774        return simd128<32>::constant<-65536>();
     1775}
     1776
     1777//The total number of operations is 0
     1778template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
     1779{
     1780        return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
     1781}
     1782
     1783//The total number of operations is 0
     1784template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
     1785{
     1786        return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
     1787}
     1788
     1789//The total number of operations is 1
     1790template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::add(bitblock128_t arg1, bitblock128_t arg2)
     1791{
     1792        return simd_xor(arg1, arg2);
     1793}
     1794
     1795//The total number of operations is 10
     1796template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add(bitblock128_t arg1, bitblock128_t arg2)
     1797{
     1798        bitblock128_t tmp = simd_xor(arg1, arg2);
     1799        return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(arg1, arg2))), tmp);
     1800}
     1801
     1802//The total number of operations is 6
     1803template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add(bitblock128_t arg1, bitblock128_t arg2)
     1804{
     1805        return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::add(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::add(arg1, arg2));
     1806}
     1807
     1808//The total number of operations is 1
     1809template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add(bitblock128_t arg1, bitblock128_t arg2)
     1810{
     1811        return _mm_add_epi8(arg1, arg2);
     1812}
     1813
     1814//The total number of operations is 1
     1815template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add(bitblock128_t arg1, bitblock128_t arg2)
     1816{
     1817        return _mm_add_epi16(arg1, arg2);
     1818}
     1819
     1820//The total number of operations is 1
     1821template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add(bitblock128_t arg1, bitblock128_t arg2)
     1822{
     1823        return _mm_add_epi32(arg1, arg2);
     1824}
     1825
     1826//The total number of operations is 1
     1827template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2)
     1828{
     1829        return _mm_add_epi64(arg1, arg2);
     1830}
     1831
     1832//The total number of operations is 11
     1833template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2)
     1834{
     1835        bitblock128_t partial = simd128<(64)>::add(arg1, arg2);
     1836        bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
     1837        bitblock128_t carry = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(carryMask));
     1838        return simd128<(64)>::add(partial, carry);
     1839}
     1840
    15981841//The total number of operations is 1
    15991842template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
     
    16481891        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    16491892        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1650 }
    1651 
    1652 //The total number of operations is 2
    1653 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1654 {
    1655         return simd_not(simd_xor(arg1, arg2));
    1656 }
    1657 
    1658 //The total number of operations is 8
    1659 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1660 {
    1661         bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
    1662         bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
    1663         bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
    1664         return simd_or(loMask, hiMask);
    1665 }
    1666 
    1667 //The total number of operations is 9
    1668 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1669 {
    1670         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
    1671 }
    1672 
    1673 //The total number of operations is 1
    1674 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1675 {
    1676         return _mm_cmpeq_epi8(arg1, arg2);
    1677 }
    1678 
    1679 //The total number of operations is 1
    1680 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1681 {
    1682         return _mm_cmpeq_epi16(arg1, arg2);
    1683 }
    1684 
    1685 //The total number of operations is 1
    1686 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1687 {
    1688         return _mm_cmpeq_epi32(arg1, arg2);
    1689 }
    1690 
    1691 //The total number of operations is 1
    1692 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1693 {
    1694         return _mm_cmpeq_epi64(arg1, arg2);
    1695 }
    1696 
    1697 //The total number of operations is 11
    1698 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1699 {
    1700         bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
    1701         bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
    1702         bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
    1703         return simd_or(loMask, hiMask);
    1704 }
    1705 
    1706 //The total number of operations is 4
    1707 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1708 {
    1709         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1710 }
    1711 
    1712 //The total number of operations is 10
    1713 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1714 {
    1715         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1716         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1717 }
    1718 
    1719 //The total number of operations is 5
    1720 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1721 {
    1722         bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1723         return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1724 }
    1725 
    1726 //The total number of operations is 1
    1727 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1728 {
    1729         return _mm_srai_epi16(arg1, (int32_t)(sh));
    1730 }
    1731 
    1732 //The total number of operations is 1
    1733 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1734 {
    1735         return _mm_srai_epi32(arg1, (int32_t)(sh));
    1736 }
    1737 
    1738 //The total number of operations is 5
    1739 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1740 {
    1741         bitblock128_t tmp = simd128<64>::srli<((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh))>(arg1);
    1742         return simd_or(tmp, simd128<64>::sub(simd128<64>::constant<0>(), simd_and(simd128<64>::slli<((64-((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh)))-1)>(simd128<64>::constant<1>()), tmp)));
    1743 }
    1744 
    1745 //The total number of operations is 21
    1746 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1747 {
    1748         bitblock128_t tmp = simd128<128>::srli<((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh))>(arg1);
    1749         return simd_or(tmp, simd128<128>::sub(simd128<128>::constant<0>(), simd_and(simd128<128>::slli<((128-((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh)))-1)>(simd128<128>::constant<1>()), tmp)));
    1750 }
    1751 
    1752 //The total number of operations is 0
    1753 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
    1754 {
    1755         return simd128<2>::constant<(2)>();
    1756 }
    1757 
    1758 //The total number of operations is 0
    1759 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask()
    1760 {
    1761         return simd128<4>::constant<(12)>();
    1762 }
    1763 
    1764 //The total number of operations is 0
    1765 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::himask()
    1766 {
    1767         return simd128<8>::constant<(240)>();
    1768 }
    1769 
    1770 //The total number of operations is 0
    1771 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::himask()
    1772 {
    1773         return simd128<16>::constant<(65280)>();
    1774 }
    1775 
    1776 //The total number of operations is 0
    1777 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
    1778 {
    1779         return simd128<32>::constant<-65536>();
    1780 }
    1781 
    1782 //The total number of operations is 0
    1783 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
    1784 {
    1785         return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
    1786 }
    1787 
    1788 //The total number of operations is 0
    1789 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
    1790 {
    1791         return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
    1792 }
    1793 
    1794 //The total number of operations is 1
    1795 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::add(bitblock128_t arg1, bitblock128_t arg2)
    1796 {
    1797         return simd_xor(arg1, arg2);
    1798 }
    1799 
    1800 //The total number of operations is 10
    1801 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add(bitblock128_t arg1, bitblock128_t arg2)
    1802 {
    1803         bitblock128_t tmp = simd_xor(arg1, arg2);
    1804         return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(arg1, arg2))), tmp);
    1805 }
    1806 
    1807 //The total number of operations is 6
    1808 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add(bitblock128_t arg1, bitblock128_t arg2)
    1809 {
    1810         return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::add(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::add(arg1, arg2));
    1811 }
    1812 
    1813 //The total number of operations is 1
    1814 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add(bitblock128_t arg1, bitblock128_t arg2)
    1815 {
    1816         return _mm_add_epi8(arg1, arg2);
    1817 }
    1818 
    1819 //The total number of operations is 1
    1820 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add(bitblock128_t arg1, bitblock128_t arg2)
    1821 {
    1822         return _mm_add_epi16(arg1, arg2);
    1823 }
    1824 
    1825 //The total number of operations is 1
    1826 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add(bitblock128_t arg1, bitblock128_t arg2)
    1827 {
    1828         return _mm_add_epi32(arg1, arg2);
    1829 }
    1830 
    1831 //The total number of operations is 1
    1832 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2)
    1833 {
    1834         return _mm_add_epi64(arg1, arg2);
    1835 }
    1836 
    1837 //The total number of operations is 11
    1838 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2)
    1839 {
    1840         bitblock128_t partial = simd128<(64)>::add(arg1, arg2);
    1841         bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
    1842         bitblock128_t carry = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(carryMask));
    1843         return simd128<(64)>::add(partial, carry);
    1844 }
    1845 
    1846 //The total number of operations is 9
    1847 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
    1848 {
    1849         return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
    1850 }
    1851 
    1852 //The total number of operations is 19
    1853 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
    1854 {
    1855         bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
    1856         return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
    1857 }
    1858 
    1859 //The total number of operations is 1
    1860 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
    1861 {
    1862         return _mm_abs_epi8(arg1);
    1863 }
    1864 
    1865 //The total number of operations is 1
    1866 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
    1867 {
    1868         return _mm_abs_epi16(arg1);
    1869 }
    1870 
    1871 //The total number of operations is 1
    1872 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
    1873 {
    1874         return _mm_abs_epi32(arg1);
    1875 }
    1876 
    1877 //The total number of operations is 9
    1878 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
    1879 {
    1880         bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
    1881         return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
    1882 }
    1883 
    1884 //The total number of operations is 37
    1885 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
    1886 {
    1887         bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
    1888         return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
    18891893}
    18901894
     
    27732777}
    27742778
    2775 //The total number of operations is 4
    2776 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
    2777 {
    2778         return simd128<128>::slli<(sh*2)>(arg1);
    2779 }
    2780 
    2781 //The total number of operations is 4
    2782 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
    2783 {
    2784         return simd128<128>::slli<(sh*4)>(arg1);
    2785 }
    2786 
    2787 //The total number of operations is 4
    2788 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
    2789 {
    2790         return simd128<128>::slli<(sh*8)>(arg1);
    2791 }
    2792 
    2793 //The total number of operations is 4
    2794 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
    2795 {
    2796         return simd128<128>::slli<(sh*16)>(arg1);
    2797 }
    2798 
    2799 //The total number of operations is 4
    2800 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
    2801 {
    2802         return simd128<128>::slli<(sh*32)>(arg1);
    2803 }
    2804 
    2805 //The total number of operations is 4
    2806 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
    2807 {
    2808         return simd128<128>::slli<(sh*64)>(arg1);
    2809 }
    2810 
    2811 //The total number of operations is 4
    2812 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
    2813 {
    2814         return simd128<128>::slli<(sh*128)>(arg1);
    2815 }
    2816 
    28172779//The total number of operations is 5
    28182780template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     
    29772939}
    29782940
     2941//The total number of operations is 4
     2942template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
     2943{
     2944        return simd128<128>::slli<(sh*2)>(arg1);
     2945}
     2946
     2947//The total number of operations is 4
     2948template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
     2949{
     2950        return simd128<128>::slli<(sh*4)>(arg1);
     2951}
     2952
     2953//The total number of operations is 4
     2954template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
     2955{
     2956        return simd128<128>::slli<(sh*8)>(arg1);
     2957}
     2958
     2959//The total number of operations is 4
     2960template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
     2961{
     2962        return simd128<128>::slli<(sh*16)>(arg1);
     2963}
     2964
     2965//The total number of operations is 4
     2966template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
     2967{
     2968        return simd128<128>::slli<(sh*32)>(arg1);
     2969}
     2970
     2971//The total number of operations is 4
     2972template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
     2973{
     2974        return simd128<128>::slli<(sh*64)>(arg1);
     2975}
     2976
     2977//The total number of operations is 4
     2978template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
     2979{
     2980        return simd128<128>::slli<(sh*128)>(arg1);
     2981}
     2982
    29792983//The total number of operations is 13
    29802984template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     
    30073011}
    30083012
     3013//The total number of operations is 11
     3014IDISA_ALWAYS_INLINE bitblock128_t bitblock128::sll(bitblock128_t arg1, bitblock128_t arg2)
     3015{
     3016        return simd128<128>::sll(arg1, arg2);
     3017}
     3018
    30093019//The total number of operations is 1
    30103020IDISA_ALWAYS_INLINE bitblock128_t bitblock128::load_unaligned(bitblock128_t* arg1)
    30113021{
    30123022        return _mm_loadu_si128((bitblock128_t*)(arg1));
     3023}
     3024
     3025//The total number of operations is 4
     3026template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srli(bitblock128_t arg1)
     3027{
     3028        return simd128<128>::srli<sh>(arg1);
    30133029}
    30143030
     
    30313047}
    30323048
     3049//The total number of operations is 11
     3050IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srl(bitblock128_t arg1, bitblock128_t arg2)
     3051{
     3052        return simd128<128>::srl(arg1, arg2);
     3053}
     3054
     3055//The total number of operations is 4
     3056template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
     3057{
     3058        return simd128<128>::slli<sh>(arg1);
     3059}
     3060
    30333061//The total number of operations is 2
    30343062IDISA_ALWAYS_INLINE bool bitblock128::any(bitblock128_t arg1)
  • trunk/lib/idisa_cpp/idisa_sse4_2.cpp

    r1573 r1580  
    2828        static IDISA_ALWAYS_INLINE bitblock128_t add_hl(bitblock128_t arg1);
    2929        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t shift_mask);
     30        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
    3031        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    3132        template <uint64_t val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    3233        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
    33         static IDISA_ALWAYS_INLINE bitblock128_t lomask();
    3434        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    3535        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
     
    8989{
    9090public:
     91        static IDISA_ALWAYS_INLINE bitblock128_t sll(bitblock128_t arg1, bitblock128_t arg2);
    9192        static IDISA_ALWAYS_INLINE bitblock128_t load_unaligned(bitblock128_t* arg1);
     93        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     94        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t arg2);
    9295        static IDISA_ALWAYS_INLINE void store_aligned(bitblock128_t* arg1, bitblock128_t arg2);
    9396        static IDISA_ALWAYS_INLINE bool all(bitblock128_t arg1);
    9497        static IDISA_ALWAYS_INLINE bool any(bitblock128_t arg1);
    9598        static IDISA_ALWAYS_INLINE uint64_t popcount(bitblock128_t arg1);
     99        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    96100        static IDISA_ALWAYS_INLINE bitblock128_t load_aligned(bitblock128_t* arg1);
    97101        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock128_t* arg1, bitblock128_t arg2);
     
    231235template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    232236template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    233 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    234 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    235 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    236 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    237 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    238 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    239 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    240237template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
    241238template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
     
    254251template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    255252template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
     253template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     254template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     255template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     256template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     257template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     258template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     259template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    256260template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    257261template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    262266template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    263267template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    264 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    265 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    266 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    267 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    268 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    269 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    270 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    271 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     268template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     269template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     270template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     274template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
    272275template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    273276template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     
    300303template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    301304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    302 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    303 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    304 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    305 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    306 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    307 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    308 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     305template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     306template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     307template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     308template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     309template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     310template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     312template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
    309313template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    310314template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    445449template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    446450template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    447 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
    448 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
    449 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
    450 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
    451 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
    452 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
    453 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
    454451template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    455452template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
     
    479476template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    480477template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     478template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
     479template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
     480template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
     481template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
     482template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
     483template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
     484template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
    481485template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    482486template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
     
    13861390
    13871391//The total number of operations is 0
    1388 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
    1389 {
    1390         return simd128<2>::constant<(1)>();
    1391 }
    1392 
    1393 //The total number of operations is 0
    1394 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
    1395 {
    1396         return simd128<4>::constant<(3)>();
    1397 }
    1398 
    1399 //The total number of operations is 0
    1400 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
    1401 {
    1402         return simd128<8>::constant<(15)>();
    1403 }
    1404 
    1405 //The total number of operations is 0
    1406 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
    1407 {
    1408         return simd128<16>::constant<(255)>();
    1409 }
    1410 
    1411 //The total number of operations is 0
    1412 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
    1413 {
    1414         return simd128<32>::constant<(65535)>();
    1415 }
    1416 
    1417 //The total number of operations is 0
    1418 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
    1419 {
    1420         return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
    1421 }
    1422 
    1423 //The total number of operations is 0
    1424 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
    1425 {
    1426         return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
    1427 }
    1428 
    1429 //The total number of operations is 0
    14301392template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
    14311393{
     
    15281490}
    15291491
     1492//The total number of operations is 0
     1493template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
     1494{
     1495        return simd128<2>::constant<(1)>();
     1496}
     1497
     1498//The total number of operations is 0
     1499template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
     1500{
     1501        return simd128<4>::constant<(3)>();
     1502}
     1503
     1504//The total number of operations is 0
     1505template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
     1506{
     1507        return simd128<8>::constant<(15)>();
     1508}
     1509
     1510//The total number of operations is 0
     1511template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
     1512{
     1513        return simd128<16>::constant<(255)>();
     1514}
     1515
     1516//The total number of operations is 0
     1517template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
     1518{
     1519        return simd128<32>::constant<(65535)>();
     1520}
     1521
     1522//The total number of operations is 0
     1523template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
     1524{
     1525        return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
     1526}
     1527
     1528//The total number of operations is 0
     1529template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
     1530{
     1531        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
     1532}
     1533
    15301534//The total number of operations is 1
    15311535template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2)
     
    15801584}
    15811585
     1586//The total number of operations is 9
     1587template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
     1588{
     1589        return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
     1590}
     1591
     1592//The total number of operations is 19
     1593template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
     1594{
     1595        bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
     1596        return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
     1597}
     1598
     1599//The total number of operations is 1
     1600template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
     1601{
     1602        return _mm_abs_epi8(arg1);
     1603}
     1604
     1605//The total number of operations is 1
     1606template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
     1607{
     1608        return _mm_abs_epi16(arg1);
     1609}
     1610
     1611//The total number of operations is 1
     1612template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
     1613{
     1614        return _mm_abs_epi32(arg1);
     1615}
     1616
     1617//The total number of operations is 5
     1618template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
     1619{
     1620        bitblock128_t gtMask = simd128<64>::gt(arg1, simd128<64>::constant<0>());
     1621        return simd128<1>::ifh(gtMask, arg1, simd128<64>::sub(gtMask, arg1));
     1622}
     1623
     1624//The total number of operations is 33
     1625template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
     1626{
     1627        bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
     1628        return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1629}
     1630
     1631//The total number of operations is 2
     1632template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1633{
     1634        return simd_not(simd_xor(arg1, arg2));
     1635}
     1636
     1637//The total number of operations is 8
     1638template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1639{
     1640        bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
     1641        bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
     1642        bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
     1643        return simd_or(loMask, hiMask);
     1644}
     1645
     1646//The total number of operations is 9
     1647template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1648{
     1649        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
     1650}
     1651
     1652//The total number of operations is 1
     1653template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1654{
     1655        return _mm_cmpeq_epi8(arg1, arg2);
     1656}
     1657
     1658//The total number of operations is 1
     1659template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1660{
     1661        return _mm_cmpeq_epi16(arg1, arg2);
     1662}
     1663
     1664//The total number of operations is 1
     1665template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1666{
     1667        return _mm_cmpeq_epi32(arg1, arg2);
     1668}
     1669
     1670//The total number of operations is 1
     1671template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1672{
     1673        return _mm_cmpeq_epi64(arg1, arg2);
     1674}
     1675
     1676//The total number of operations is 11
     1677template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1678{
     1679        bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
     1680        bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
     1681        bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
     1682        return simd_or(loMask, hiMask);
     1683}
     1684
     1685//The total number of operations is 4
     1686template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
     1687{
     1688        return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
     1689}
     1690
     1691//The total number of operations is 10
     1692template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
     1693{
     1694        bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1695        return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1696}
     1697
     1698//The total number of operations is 5
     1699template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
     1700{
     1701        bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1702        return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1703}
     1704
     1705//The total number of operations is 1
     1706template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
     1707{
     1708        return _mm_srai_epi16(arg1, (int32_t)(sh));
     1709}
     1710
     1711//The total number of operations is 1
     1712template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
     1713{
     1714        return _mm_srai_epi32(arg1, (int32_t)(sh));
     1715}
     1716
     1717//The total number of operations is 5
     1718template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
     1719{
     1720        bitblock128_t tmp = simd128<64>::srli<((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh))>(arg1);
     1721        return simd_or(tmp, simd128<64>::sub(simd128<64>::constant<0>(), simd_and(simd128<64>::slli<((64-((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh)))-1)>(simd128<64>::constant<1>()), tmp)));
     1722}
     1723
     1724//The total number of operations is 21
     1725template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
     1726{
     1727        bitblock128_t tmp = simd128<128>::srli<((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh))>(arg1);
     1728        return simd_or(tmp, simd128<128>::sub(simd128<128>::constant<0>(), simd_and(simd128<128>::slli<((128-((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh)))-1)>(simd128<128>::constant<1>()), tmp)));
     1729}
     1730
     1731//The total number of operations is 0
     1732template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
     1733{
     1734        return simd128<2>::constant<(2)>();
     1735}
     1736
     1737//The total number of operations is 0
     1738template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask()
     1739{
     1740        return simd128<4>::constant<(12)>();
     1741}
     1742
     1743//The total number of operations is 0
     1744template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::himask()
     1745{
     1746        return simd128<8>::constant<(240)>();
     1747}
     1748
     1749//The total number of operations is 0
     1750template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::himask()
     1751{
     1752        return simd128<16>::constant<(65280)>();
     1753}
     1754
     1755//The total number of operations is 0
     1756template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
     1757{
     1758        return simd128<32>::constant<-65536>();
     1759}
     1760
     1761//The total number of operations is 0
     1762template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
     1763{
     1764        return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
     1765}
     1766
     1767//The total number of operations is 0
     1768template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
     1769{
     1770        return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
     1771}
     1772
     1773//The total number of operations is 1
     1774template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::add(bitblock128_t arg1, bitblock128_t arg2)
     1775{
     1776        return simd_xor(arg1, arg2);
     1777}
     1778
     1779//The total number of operations is 10
     1780template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add(bitblock128_t arg1, bitblock128_t arg2)
     1781{
     1782        bitblock128_t tmp = simd_xor(arg1, arg2);
     1783        return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(arg1, arg2))), tmp);
     1784}
     1785
     1786//The total number of operations is 6
     1787template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add(bitblock128_t arg1, bitblock128_t arg2)
     1788{
     1789        return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::add(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::add(arg1, arg2));
     1790}
     1791
     1792//The total number of operations is 1
     1793template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add(bitblock128_t arg1, bitblock128_t arg2)
     1794{
     1795        return _mm_add_epi8(arg1, arg2);
     1796}
     1797
     1798//The total number of operations is 1
     1799template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add(bitblock128_t arg1, bitblock128_t arg2)
     1800{
     1801        return _mm_add_epi16(arg1, arg2);
     1802}
     1803
     1804//The total number of operations is 1
     1805template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add(bitblock128_t arg1, bitblock128_t arg2)
     1806{
     1807        return _mm_add_epi32(arg1, arg2);
     1808}
     1809
     1810//The total number of operations is 1
     1811template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2)
     1812{
     1813        return _mm_add_epi64(arg1, arg2);
     1814}
     1815
     1816//The total number of operations is 11
     1817template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2)
     1818{
     1819        bitblock128_t partial = simd128<(64)>::add(arg1, arg2);
     1820        bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
     1821        bitblock128_t carry = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(carryMask));
     1822        return simd128<(64)>::add(partial, carry);
     1823}
     1824
    15821825//The total number of operations is 1
    15831826template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
     
    16301873        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    16311874        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1632 }
    1633 
    1634 //The total number of operations is 2
    1635 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1636 {
    1637         return simd_not(simd_xor(arg1, arg2));
    1638 }
    1639 
    1640 //The total number of operations is 8
    1641 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1642 {
    1643         bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
    1644         bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
    1645         bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
    1646         return simd_or(loMask, hiMask);
    1647 }
    1648 
    1649 //The total number of operations is 9
    1650 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1651 {
    1652         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
    1653 }
    1654 
    1655 //The total number of operations is 1
    1656 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1657 {
    1658         return _mm_cmpeq_epi8(arg1, arg2);
    1659 }
    1660 
    1661 //The total number of operations is 1
    1662 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1663 {
    1664         return _mm_cmpeq_epi16(arg1, arg2);
    1665 }
    1666 
    1667 //The total number of operations is 1
    1668 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1669 {
    1670         return _mm_cmpeq_epi32(arg1, arg2);
    1671 }
    1672 
    1673 //The total number of operations is 1
    1674 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1675 {
    1676         return _mm_cmpeq_epi64(arg1, arg2);
    1677 }
    1678 
    1679 //The total number of operations is 11
    1680 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1681 {
    1682         bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
    1683         bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
    1684         bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
    1685         return simd_or(loMask, hiMask);
    1686 }
    1687 
    1688 //The total number of operations is 4
    1689 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1690 {
    1691         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1692 }
    1693 
    1694 //The total number of operations is 10
    1695 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1696 {
    1697         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1698         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1699 }
    1700 
    1701 //The total number of operations is 5
    1702 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1703 {
    1704         bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1705         return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1706 }
    1707 
    1708 //The total number of operations is 1
    1709 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1710 {
    1711         return _mm_srai_epi16(arg1, (int32_t)(sh));
    1712 }
    1713 
    1714 //The total number of operations is 1
    1715 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1716 {
    1717         return _mm_srai_epi32(arg1, (int32_t)(sh));
    1718 }
    1719 
    1720 //The total number of operations is 5
    1721 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1722 {
    1723         bitblock128_t tmp = simd128<64>::srli<((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh))>(arg1);
    1724         return simd_or(tmp, simd128<64>::sub(simd128<64>::constant<0>(), simd_and(simd128<64>::slli<((64-((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh)))-1)>(simd128<64>::constant<1>()), tmp)));
    1725 }
    1726 
    1727 //The total number of operations is 21
    1728 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1729 {
    1730         bitblock128_t tmp = simd128<128>::srli<((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh))>(arg1);
    1731         return simd_or(tmp, simd128<128>::sub(simd128<128>::constant<0>(), simd_and(simd128<128>::slli<((128-((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh)))-1)>(simd128<128>::constant<1>()), tmp)));
    1732 }
    1733 
    1734 //The total number of operations is 0
    1735 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
    1736 {
    1737         return simd128<2>::constant<(2)>();
    1738 }
    1739 
    1740 //The total number of operations is 0
    1741 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask()
    1742 {
    1743         return simd128<4>::constant<(12)>();
    1744 }
    1745 
    1746 //The total number of operations is 0
    1747 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::himask()
    1748 {
    1749         return simd128<8>::constant<(240)>();
    1750 }
    1751 
    1752 //The total number of operations is 0
    1753 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::himask()
    1754 {
    1755         return simd128<16>::constant<(65280)>();
    1756 }
    1757 
    1758 //The total number of operations is 0
    1759 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
    1760 {
    1761         return simd128<32>::constant<-65536>();
    1762 }
    1763 
    1764 //The total number of operations is 0
    1765 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
    1766 {
    1767         return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
    1768 }
    1769 
    1770 //The total number of operations is 0
    1771 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
    1772 {
    1773         return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
    1774 }
    1775 
    1776 //The total number of operations is 1
    1777 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::add(bitblock128_t arg1, bitblock128_t arg2)
    1778 {
    1779         return simd_xor(arg1, arg2);
    1780 }
    1781 
    1782 //The total number of operations is 10
    1783 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add(bitblock128_t arg1, bitblock128_t arg2)
    1784 {
    1785         bitblock128_t tmp = simd_xor(arg1, arg2);
    1786         return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(arg1, arg2))), tmp);
    1787 }
    1788 
    1789 //The total number of operations is 6
    1790 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add(bitblock128_t arg1, bitblock128_t arg2)
    1791 {
    1792         return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::add(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::add(arg1, arg2));
    1793 }
    1794 
    1795 //The total number of operations is 1
    1796 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add(bitblock128_t arg1, bitblock128_t arg2)
    1797 {
    1798         return _mm_add_epi8(arg1, arg2);
    1799 }
    1800 
    1801 //The total number of operations is 1
    1802 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add(bitblock128_t arg1, bitblock128_t arg2)
    1803 {
    1804         return _mm_add_epi16(arg1, arg2);
    1805 }
    1806 
    1807 //The total number of operations is 1
    1808 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add(bitblock128_t arg1, bitblock128_t arg2)
    1809 {
    1810         return _mm_add_epi32(arg1, arg2);
    1811 }
    1812 
    1813 //The total number of operations is 1
    1814 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2)
    1815 {
    1816         return _mm_add_epi64(arg1, arg2);
    1817 }
    1818 
    1819 //The total number of operations is 11
    1820 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2)
    1821 {
    1822         bitblock128_t partial = simd128<(64)>::add(arg1, arg2);
    1823         bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
    1824         bitblock128_t carry = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(carryMask));
    1825         return simd128<(64)>::add(partial, carry);
    1826 }
    1827 
    1828 //The total number of operations is 9
    1829 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
    1830 {
    1831         return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
    1832 }
    1833 
    1834 //The total number of operations is 19
    1835 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
    1836 {
    1837         bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
    1838         return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
    1839 }
    1840 
    1841 //The total number of operations is 1
    1842 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
    1843 {
    1844         return _mm_abs_epi8(arg1);
    1845 }
    1846 
    1847 //The total number of operations is 1
    1848 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
    1849 {
    1850         return _mm_abs_epi16(arg1);
    1851 }
    1852 
    1853 //The total number of operations is 1
    1854 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
    1855 {
    1856         return _mm_abs_epi32(arg1);
    1857 }
    1858 
    1859 //The total number of operations is 5
    1860 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
    1861 {
    1862         bitblock128_t gtMask = simd128<64>::gt(arg1, simd128<64>::constant<0>());
    1863         return simd128<1>::ifh(gtMask, arg1, simd128<64>::sub(gtMask, arg1));
    1864 }
    1865 
    1866 //The total number of operations is 33
    1867 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
    1868 {
    1869         bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
    1870         return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
    18711875}
    18721876
     
    27552759}
    27562760
    2757 //The total number of operations is 4
    2758 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
    2759 {
    2760         return simd128<128>::slli<(sh*2)>(arg1);
    2761 }
    2762 
    2763 //The total number of operations is 4
    2764 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
    2765 {
    2766         return simd128<128>::slli<(sh*4)>(arg1);
    2767 }
    2768 
    2769 //The total number of operations is 4
    2770 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
    2771 {
    2772         return simd128<128>::slli<(sh*8)>(arg1);
    2773 }
    2774 
    2775 //The total number of operations is 4
    2776 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
    2777 {
    2778         return simd128<128>::slli<(sh*16)>(arg1);
    2779 }
    2780 
    2781 //The total number of operations is 4
    2782 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
    2783 {
    2784         return simd128<128>::slli<(sh*32)>(arg1);
    2785 }
    2786 
    2787 //The total number of operations is 4
    2788 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
    2789 {
    2790         return simd128<128>::slli<(sh*64)>(arg1);
    2791 }
    2792 
    2793 //The total number of operations is 4
    2794 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
    2795 {
    2796         return simd128<128>::slli<(sh*128)>(arg1);
    2797 }
    2798 
    27992761//The total number of operations is 5
    28002762template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     
    29592921}
    29602922
     2923//The total number of operations is 4
     2924template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
     2925{
     2926        return simd128<128>::slli<(sh*2)>(arg1);
     2927}
     2928
     2929//The total number of operations is 4
     2930template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
     2931{
     2932        return simd128<128>::slli<(sh*4)>(arg1);
     2933}
     2934
     2935//The total number of operations is 4
     2936template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
     2937{
     2938        return simd128<128>::slli<(sh*8)>(arg1);
     2939}
     2940
     2941//The total number of operations is 4
     2942template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
     2943{
     2944        return simd128<128>::slli<(sh*16)>(arg1);
     2945}
     2946
     2947//The total number of operations is 4
     2948template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
     2949{
     2950        return simd128<128>::slli<(sh*32)>(arg1);
     2951}
     2952
     2953//The total number of operations is 4
     2954template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
     2955{
     2956        return simd128<128>::slli<(sh*64)>(arg1);
     2957}
     2958
     2959//The total number of operations is 4
     2960template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
     2961{
     2962        return simd128<128>::slli<(sh*128)>(arg1);
     2963}
     2964
    29612965//The total number of operations is 13
    29622966template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     
    29892993}
    29902994
     2995//The total number of operations is 11
     2996IDISA_ALWAYS_INLINE bitblock128_t bitblock128::sll(bitblock128_t arg1, bitblock128_t arg2)
     2997{
     2998        return simd128<128>::sll(arg1, arg2);
     2999}
     3000
    29913001//The total number of operations is 1
    29923002IDISA_ALWAYS_INLINE bitblock128_t bitblock128::load_unaligned(bitblock128_t* arg1)
    29933003{
    29943004        return _mm_loadu_si128((bitblock128_t*)(arg1));
     3005}
     3006
     3007//The total number of operations is 4
     3008template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srli(bitblock128_t arg1)
     3009{
     3010        return simd128<128>::srli<sh>(arg1);
    29953011}
    29963012
     
    30133029}
    30143030
     3031//The total number of operations is 11
     3032IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srl(bitblock128_t arg1, bitblock128_t arg2)
     3033{
     3034        return simd128<128>::srl(arg1, arg2);
     3035}
     3036
     3037//The total number of operations is 4
     3038template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
     3039{
     3040        return simd128<128>::slli<sh>(arg1);
     3041}
     3042
    30153043//The total number of operations is 2
    30163044IDISA_ALWAYS_INLINE bool bitblock128::any(bitblock128_t arg1)
  • trunk/lib/idisa_cpp/idisa_ssse3.cpp

    r1573 r1580  
    2828        static IDISA_ALWAYS_INLINE bitblock128_t add_hl(bitblock128_t arg1);
    2929        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t shift_mask);
     30        static IDISA_ALWAYS_INLINE bitblock128_t lomask();
    3031        static IDISA_ALWAYS_INLINE bitblock128_t umin(bitblock128_t arg1, bitblock128_t arg2);
    3132        template <uint64_t val> static IDISA_ALWAYS_INLINE bitblock128_t constant();
    3233        static IDISA_ALWAYS_INLINE bitblock128_t min(bitblock128_t arg1, bitblock128_t arg2);
    33         static IDISA_ALWAYS_INLINE bitblock128_t lomask();
    3434        static IDISA_ALWAYS_INLINE bitblock128_t umax(bitblock128_t arg1, bitblock128_t arg2);
    3535        static IDISA_ALWAYS_INLINE bitblock128_t abs(bitblock128_t arg1);
     
    8989{
    9090public:
     91        static IDISA_ALWAYS_INLINE bitblock128_t sll(bitblock128_t arg1, bitblock128_t arg2);
    9192        static IDISA_ALWAYS_INLINE bitblock128_t load_unaligned(bitblock128_t* arg1);
     93        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t srli(bitblock128_t arg1);
     94        static IDISA_ALWAYS_INLINE bitblock128_t srl(bitblock128_t arg1, bitblock128_t arg2);
    9295        static IDISA_ALWAYS_INLINE void store_aligned(bitblock128_t* arg1, bitblock128_t arg2);
    9396        static IDISA_ALWAYS_INLINE bool all(bitblock128_t arg1);
    9497        static IDISA_ALWAYS_INLINE bool any(bitblock128_t arg1);
    9598        static IDISA_ALWAYS_INLINE uint64_t popcount(bitblock128_t arg1);
     99        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock128_t slli(bitblock128_t arg1);
    96100        static IDISA_ALWAYS_INLINE bitblock128_t load_aligned(bitblock128_t* arg1);
    97101        static IDISA_ALWAYS_INLINE void store_unaligned(bitblock128_t* arg1, bitblock128_t arg2);
     
    231235template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    232236template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srl(bitblock128_t arg1, bitblock128_t shift_mask);
    233 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
    234 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
    235 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
    236 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
    237 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
    238 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
    239 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    240237template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant();
    241238template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::constant();
     
    254251template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::min(bitblock128_t arg1, bitblock128_t arg2);
    255252template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::min(bitblock128_t arg1, bitblock128_t arg2);
     253template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask();
     254template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask();
     255template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask();
     256template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask();
     257template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask();
     258template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask();
     259template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask();
    256260template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2);
    257261template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umin(bitblock128_t arg1, bitblock128_t arg2);
     
    262266template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umin(bitblock128_t arg1, bitblock128_t arg2);
    263267template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umin(bitblock128_t arg1, bitblock128_t arg2);
    264 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
    265 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
    266 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
    267 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
    268 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
    269 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
    270 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
    271 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
     268template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
     269template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
     270template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
     271template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
     272template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
     273template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
     274template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
    272275template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2);
    273276template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2);
     
    300303template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2);
    301304template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2);
    302 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1);
    303 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1);
    304 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1);
    305 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1);
    306 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1);
    307 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1);
    308 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1);
     305template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2);
     306template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::umax(bitblock128_t arg1, bitblock128_t arg2);
     307template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::umax(bitblock128_t arg1, bitblock128_t arg2);
     308template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::umax(bitblock128_t arg1, bitblock128_t arg2);
     309template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::umax(bitblock128_t arg1, bitblock128_t arg2);
     310template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::umax(bitblock128_t arg1, bitblock128_t arg2);
     311template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::umax(bitblock128_t arg1, bitblock128_t arg2);
     312template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::umax(bitblock128_t arg1, bitblock128_t arg2);
    309313template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<2>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
    310314template <> IDISA_ALWAYS_INLINE bitblock128_t hsimd128<4>::umin_hl(bitblock128_t arg1, bitblock128_t arg2);
     
    445449template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    446450template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
    447 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
    448 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
    449 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
    450 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
    451 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
    452 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
    453 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
    454451template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
    455452template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
     
    479476template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::dslli(bitblock128_t arg1, bitblock128_t arg2);
    480477template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::dslli(bitblock128_t arg1, bitblock128_t arg2);
     478template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1);
     479template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1);
     480template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1);
     481template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1);
     482template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1);
     483template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1);
     484template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1);
    481485template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
    482486template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
     
    13951399
    13961400//The total number of operations is 0
    1397 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
    1398 {
    1399         return simd128<2>::constant<(1)>();
    1400 }
    1401 
    1402 //The total number of operations is 0
    1403 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
    1404 {
    1405         return simd128<4>::constant<(3)>();
    1406 }
    1407 
    1408 //The total number of operations is 0
    1409 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
    1410 {
    1411         return simd128<8>::constant<(15)>();
    1412 }
    1413 
    1414 //The total number of operations is 0
    1415 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
    1416 {
    1417         return simd128<16>::constant<(255)>();
    1418 }
    1419 
    1420 //The total number of operations is 0
    1421 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
    1422 {
    1423         return simd128<32>::constant<(65535)>();
    1424 }
    1425 
    1426 //The total number of operations is 0
    1427 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
    1428 {
    1429         return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
    1430 }
    1431 
    1432 //The total number of operations is 0
    1433 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
    1434 {
    1435         return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
    1436 }
    1437 
    1438 //The total number of operations is 0
    14391401template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::constant()
    14401402{
     
    15371499}
    15381500
     1501//The total number of operations is 0
     1502template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::lomask()
     1503{
     1504        return simd128<2>::constant<(1)>();
     1505}
     1506
     1507//The total number of operations is 0
     1508template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::lomask()
     1509{
     1510        return simd128<4>::constant<(3)>();
     1511}
     1512
     1513//The total number of operations is 0
     1514template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::lomask()
     1515{
     1516        return simd128<8>::constant<(15)>();
     1517}
     1518
     1519//The total number of operations is 0
     1520template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::lomask()
     1521{
     1522        return simd128<16>::constant<(255)>();
     1523}
     1524
     1525//The total number of operations is 0
     1526template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::lomask()
     1527{
     1528        return simd128<32>::constant<(65535)>();
     1529}
     1530
     1531//The total number of operations is 0
     1532template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::lomask()
     1533{
     1534        return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
     1535}
     1536
     1537//The total number of operations is 0
     1538template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::lomask()
     1539{
     1540        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
     1541}
     1542
    15391543//The total number of operations is 1
    15401544template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umin(bitblock128_t arg1, bitblock128_t arg2)
     
    15931597}
    15941598
     1599//The total number of operations is 9
     1600template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
     1601{
     1602        return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
     1603}
     1604
     1605//The total number of operations is 19
     1606template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
     1607{
     1608        bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
     1609        return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
     1610}
     1611
     1612//The total number of operations is 1
     1613template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
     1614{
     1615        return _mm_abs_epi8(arg1);
     1616}
     1617
     1618//The total number of operations is 1
     1619template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
     1620{
     1621        return _mm_abs_epi16(arg1);
     1622}
     1623
     1624//The total number of operations is 1
     1625template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
     1626{
     1627        return _mm_abs_epi32(arg1);
     1628}
     1629
     1630//The total number of operations is 13
     1631template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
     1632{
     1633        bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
     1634        return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
     1635}
     1636
     1637//The total number of operations is 45
     1638template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
     1639{
     1640        bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
     1641        return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
     1642}
     1643
     1644//The total number of operations is 2
     1645template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1646{
     1647        return simd_not(simd_xor(arg1, arg2));
     1648}
     1649
     1650//The total number of operations is 8
     1651template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1652{
     1653        bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
     1654        bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
     1655        bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
     1656        return simd_or(loMask, hiMask);
     1657}
     1658
     1659//The total number of operations is 9
     1660template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1661{
     1662        return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
     1663}
     1664
     1665//The total number of operations is 1
     1666template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1667{
     1668        return _mm_cmpeq_epi8(arg1, arg2);
     1669}
     1670
     1671//The total number of operations is 1
     1672template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1673{
     1674        return _mm_cmpeq_epi16(arg1, arg2);
     1675}
     1676
     1677//The total number of operations is 1
     1678template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1679{
     1680        return _mm_cmpeq_epi32(arg1, arg2);
     1681}
     1682
     1683//The total number of operations is 5
     1684template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1685{
     1686        bitblock128_t tmpAns = simd128<(32)>::eq(arg1, arg2);
     1687        bitblock128_t loMask = simd_and(tmpAns, simd128<64>::srli<(32)>(tmpAns));
     1688        bitblock128_t hiMask = simd128<64>::slli<(32)>(loMask);
     1689        return simd_or(loMask, hiMask);
     1690}
     1691
     1692//The total number of operations is 15
     1693template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
     1694{
     1695        bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
     1696        bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
     1697        bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
     1698        return simd_or(loMask, hiMask);
     1699}
     1700
     1701//The total number of operations is 4
     1702template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
     1703{
     1704        return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
     1705}
     1706
     1707//The total number of operations is 10
     1708template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
     1709{
     1710        bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
     1711        return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1712}
     1713
     1714//The total number of operations is 5
     1715template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
     1716{
     1717        bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
     1718        return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
     1719}
     1720
     1721//The total number of operations is 1
     1722template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
     1723{
     1724        return _mm_srai_epi16(arg1, (int32_t)(sh));
     1725}
     1726
     1727//The total number of operations is 1
     1728template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
     1729{
     1730        return _mm_srai_epi32(arg1, (int32_t)(sh));
     1731}
     1732
     1733//The total number of operations is 5
     1734template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
     1735{
     1736        bitblock128_t tmp = simd128<64>::srli<((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh))>(arg1);
     1737        return simd_or(tmp, simd128<64>::sub(simd128<64>::constant<0>(), simd_and(simd128<64>::slli<((64-((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh)))-1)>(simd128<64>::constant<1>()), tmp)));
     1738}
     1739
     1740//The total number of operations is 21
     1741template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
     1742{
     1743        bitblock128_t tmp = simd128<128>::srli<((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh))>(arg1);
     1744        return simd_or(tmp, simd128<128>::sub(simd128<128>::constant<0>(), simd_and(simd128<128>::slli<((128-((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh)))-1)>(simd128<128>::constant<1>()), tmp)));
     1745}
     1746
     1747//The total number of operations is 0
     1748template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
     1749{
     1750        return simd128<2>::constant<(2)>();
     1751}
     1752
     1753//The total number of operations is 0
     1754template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask()
     1755{
     1756        return simd128<4>::constant<(12)>();
     1757}
     1758
     1759//The total number of operations is 0
     1760template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::himask()
     1761{
     1762        return simd128<8>::constant<(240)>();
     1763}
     1764
     1765//The total number of operations is 0
     1766template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::himask()
     1767{
     1768        return simd128<16>::constant<(65280)>();
     1769}
     1770
     1771//The total number of operations is 0
     1772template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
     1773{
     1774        return simd128<32>::constant<-65536>();
     1775}
     1776
     1777//The total number of operations is 0
     1778template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
     1779{
     1780        return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
     1781}
     1782
     1783//The total number of operations is 0
     1784template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
     1785{
     1786        return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
     1787}
     1788
     1789//The total number of operations is 1
     1790template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::add(bitblock128_t arg1, bitblock128_t arg2)
     1791{
     1792        return simd_xor(arg1, arg2);
     1793}
     1794
     1795//The total number of operations is 10
     1796template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add(bitblock128_t arg1, bitblock128_t arg2)
     1797{
     1798        bitblock128_t tmp = simd_xor(arg1, arg2);
     1799        return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(arg1, arg2))), tmp);
     1800}
     1801
     1802//The total number of operations is 6
     1803template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add(bitblock128_t arg1, bitblock128_t arg2)
     1804{
     1805        return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::add(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::add(arg1, arg2));
     1806}
     1807
     1808//The total number of operations is 1
     1809template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add(bitblock128_t arg1, bitblock128_t arg2)
     1810{
     1811        return _mm_add_epi8(arg1, arg2);
     1812}
     1813
     1814//The total number of operations is 1
     1815template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add(bitblock128_t arg1, bitblock128_t arg2)
     1816{
     1817        return _mm_add_epi16(arg1, arg2);
     1818}
     1819
     1820//The total number of operations is 1
     1821template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add(bitblock128_t arg1, bitblock128_t arg2)
     1822{
     1823        return _mm_add_epi32(arg1, arg2);
     1824}
     1825
     1826//The total number of operations is 1
     1827template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2)
     1828{
     1829        return _mm_add_epi64(arg1, arg2);
     1830}
     1831
     1832//The total number of operations is 11
     1833template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2)
     1834{
     1835        bitblock128_t partial = simd128<(64)>::add(arg1, arg2);
     1836        bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
     1837        bitblock128_t carry = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(carryMask));
     1838        return simd128<(64)>::add(partial, carry);
     1839}
     1840
    15951841//The total number of operations is 1
    15961842template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::umax(bitblock128_t arg1, bitblock128_t arg2)
     
    16471893        bitblock128_t eqMask2 = simd128<128>::srli<(64)>(simd128<(64)>::eq(tmpAns, arg2));
    16481894        return simd128<1>::ifh(simd128<128>::himask(), tmpAns, simd128<1>::ifh(eqMask1, simd128<1>::ifh(eqMask2, tmpAns, arg1), arg2));
    1649 }
    1650 
    1651 //The total number of operations is 2
    1652 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1653 {
    1654         return simd_not(simd_xor(arg1, arg2));
    1655 }
    1656 
    1657 //The total number of operations is 8
    1658 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1659 {
    1660         bitblock128_t tmpAns = simd128<(1)>::eq(arg1, arg2);
    1661         bitblock128_t loMask = simd_and(tmpAns, simd128<2>::srli<(1)>(tmpAns));
    1662         bitblock128_t hiMask = simd128<2>::slli<(1)>(loMask);
    1663         return simd_or(loMask, hiMask);
    1664 }
    1665 
    1666 //The total number of operations is 9
    1667 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1668 {
    1669         return simd_or(simd_and(simd128<(8)>::himask(), simd128<(8)>::eq(simd_and(simd128<(8)>::himask(), arg1), simd_and(simd128<(8)>::himask(), arg2))), simd_and(simd128<(8)>::lomask(), simd128<(8)>::eq(simd_and(simd128<(8)>::lomask(), arg1), simd_and(simd128<(8)>::lomask(), arg2))));
    1670 }
    1671 
    1672 //The total number of operations is 1
    1673 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1674 {
    1675         return _mm_cmpeq_epi8(arg1, arg2);
    1676 }
    1677 
    1678 //The total number of operations is 1
    1679 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1680 {
    1681         return _mm_cmpeq_epi16(arg1, arg2);
    1682 }
    1683 
    1684 //The total number of operations is 1
    1685 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1686 {
    1687         return _mm_cmpeq_epi32(arg1, arg2);
    1688 }
    1689 
    1690 //The total number of operations is 5
    1691 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1692 {
    1693         bitblock128_t tmpAns = simd128<(32)>::eq(arg1, arg2);
    1694         bitblock128_t loMask = simd_and(tmpAns, simd128<64>::srli<(32)>(tmpAns));
    1695         bitblock128_t hiMask = simd128<64>::slli<(32)>(loMask);
    1696         return simd_or(loMask, hiMask);
    1697 }
    1698 
    1699 //The total number of operations is 15
    1700 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::eq(bitblock128_t arg1, bitblock128_t arg2)
    1701 {
    1702         bitblock128_t tmpAns = simd128<(64)>::eq(arg1, arg2);
    1703         bitblock128_t loMask = simd_and(tmpAns, simd128<128>::srli<(64)>(tmpAns));
    1704         bitblock128_t hiMask = simd128<128>::slli<(64)>(loMask);
    1705         return simd_or(loMask, hiMask);
    1706 }
    1707 
    1708 //The total number of operations is 4
    1709 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::srai(bitblock128_t arg1)
    1710 {
    1711         return ((sh == 0) ? arg1 : simd_or(simd_and(simd128<2>::himask(), arg1), simd128<2>::srli<1>(arg1)));
    1712 }
    1713 
    1714 //The total number of operations is 10
    1715 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::srai(bitblock128_t arg1)
    1716 {
    1717         bitblock128_t tmp = simd128<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
    1718         return simd_or(tmp, simd128<4>::sub(simd128<4>::constant<0>(), simd_and(simd128<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1719 }
    1720 
    1721 //The total number of operations is 5
    1722 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::srai(bitblock128_t arg1)
    1723 {
    1724         bitblock128_t tmp = simd128<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
    1725         return simd_or(tmp, simd128<8>::sub(simd128<8>::constant<0>(), simd_and(simd128<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
    1726 }
    1727 
    1728 //The total number of operations is 1
    1729 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::srai(bitblock128_t arg1)
    1730 {
    1731         return _mm_srai_epi16(arg1, (int32_t)(sh));
    1732 }
    1733 
    1734 //The total number of operations is 1
    1735 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::srai(bitblock128_t arg1)
    1736 {
    1737         return _mm_srai_epi32(arg1, (int32_t)(sh));
    1738 }
    1739 
    1740 //The total number of operations is 5
         //srai on 64-bit fields. The shift count is clamped to 63 when sh >= 64; the `sh < 0`
         //arm can never be taken because sh is uint64_t. tmp is the srli result. The mask bit
         //at position (63 - count) is built as slli<63 - count>(constant<1>()) instead of a
         //literal 1 << k, and the return ORs tmp with 0 - (tmp & that bit).
         //NOTE(review): relies on simd128<64>::sub wrapping per field - confirm at its definition.
    1741 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::srai(bitblock128_t arg1)
    1742 {
    1743         bitblock128_t tmp = simd128<64>::srli<((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh))>(arg1);
    1744         return simd_or(tmp, simd128<64>::sub(simd128<64>::constant<0>(), simd_and(simd128<64>::slli<((64-((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh)))-1)>(simd128<64>::constant<1>()), tmp)));
    1745 }
    1746 
    1747 //The total number of operations is 21
         //srai on the single 128-bit field. The shift count is clamped to 127 when sh >= 128;
         //the `sh < 0` arm can never be taken because sh is uint64_t. tmp is the srli result.
         //The mask bit at position (127 - count) is built as slli<127 - count>(constant<1>()),
         //and the return ORs tmp with 0 - (tmp & that bit).
    1748 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::srai(bitblock128_t arg1)
    1749 {
    1750         bitblock128_t tmp = simd128<128>::srli<((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh))>(arg1);
    1751         return simd_or(tmp, simd128<128>::sub(simd128<128>::constant<0>(), simd_and(simd128<128>::slli<((128-((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh)))-1)>(simd128<128>::constant<1>()), tmp)));
    1752 }
    1753 
    1754 //The total number of operations is 0
         //High-half mask for 2-bit fields: returns simd128<2>::constant<2>(). 2 is binary 10,
         //i.e. only the upper bit of a 2-bit value (presumably replicated into every field by
         //constant<>() - confirm at its definition).
    1755 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::himask()
    1756 {
    1757         return simd128<2>::constant<(2)>();
    1758 }
    1759 
    1760 //The total number of operations is 0
         //High-half mask for 4-bit fields: returns simd128<4>::constant<12>(). 12 is binary
         //1100, i.e. the upper two bits of a 4-bit value.
    1761 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::himask()
    1762 {
    1763         return simd128<4>::constant<(12)>();
    1764 }
    1765 
    1766 //The total number of operations is 0
         //High-half mask for 8-bit fields: returns simd128<8>::constant<240>(). 240 is 0xF0,
         //i.e. the upper four bits of an 8-bit value.
    1767 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::himask()
    1768 {
    1769         return simd128<8>::constant<(240)>();
    1770 }
    1771 
    1772 //The total number of operations is 0
         //High-half mask for 16-bit fields: returns simd128<16>::constant<65280>(). 65280 is
         //0xFF00, i.e. the upper eight bits of a 16-bit value.
    1773 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::himask()
    1774 {
    1775         return simd128<16>::constant<(65280)>();
    1776 }
    1777 
    1778 //The total number of operations is 0
         //High-half mask for 32-bit fields: returns simd128<32>::constant<-65536>(). The low
         //32 bits of -65536 are 0xFFFF0000, i.e. the upper sixteen bits of a 32-bit value.
         //NOTE(review): elsewhere in this file constant is declared as
         //template <uint64_t val>; passing a negative literal to an unsigned non-type template
         //parameter is a narrowing conversion that C++11 and later reject. If the build moves
         //past C++03, an unsigned spelling of the same 32-bit pattern is probably needed -
         //confirm against the simd128<32>::constant definition.
    1779 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::himask()
    1780 {
    1781         return simd128<32>::constant<-65536>();
    1782 }
    1783 
    1784 //The total number of operations is 0
         //High-half mask for 64-bit fields, built directly with _mm_set_epi32. That intrinsic
         //takes its arguments from the most significant 32-bit lane down to the least, so
         //(-1, 0, -1, 0) sets the upper 32 bits of each 64-bit half and clears the lower 32.
    1785 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::himask()
    1786 {
    1787         return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
    1788 }
    1789 
    1790 //The total number of operations is 0
         //High-half mask for the 128-bit field, built directly with _mm_set_epi32. Arguments
         //run from the most significant 32-bit lane down, so (-1, -1, 0, 0) sets the upper
         //64 bits of the register and clears the lower 64.
    1791 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::himask()
    1792 {
    1793         return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
    1794 }
    1795 
    1796 //The total number of operations is 1
         //Addition on 1-bit fields: a single-bit sum with the carry discarded is XOR, so this
         //just returns simd_xor(arg1, arg2).
    1797 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<1>::add(bitblock128_t arg1, bitblock128_t arg2)
    1798 {
    1799         return simd_xor(arg1, arg2);
    1800 }
    1801 
    1802 //The total number of operations is 10
         //Addition on 2-bit fields. tmp = arg1 ^ arg2 is the carry-less sum. The carry term is
         //(arg1 & arg2) shifted left by one bit across the whole 128-bit block, XOR-ed into tmp.
         //simd128<1>::ifh with simd128<2>::himask() then chooses between the carry-adjusted
         //value and plain tmp.
         //NOTE(review): presumably ifh takes its second argument where the mask bit is set and
         //its third elsewhere, so only the upper bit of each field receives the carry -
         //confirm at the ifh definition.
    1803 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::add(bitblock128_t arg1, bitblock128_t arg2)
    1804 {
    1805         bitblock128_t tmp = simd_xor(arg1, arg2);
    1806         return simd128<1>::ifh(simd128<2>::himask(), simd_xor(tmp, simd128<128>::slli<1>(simd_and(arg1, arg2))), tmp);
    1807 }
    1808 
    1809 //The total number of operations is 6
         //Addition on 4-bit fields, built from two 8-bit additions. One adds arg1 to arg2 with
         //arg2 first masked by simd128<8>::himask(); the other adds arg1 to the full arg2.
         //simd128<1>::ifh with simd128<8>::himask() chooses between the two results.
         //NOTE(review): presumably the masked sum supplies the upper nibble (no carry can come
         //in from the zeroed lower nibble of arg2) and the full sum supplies the lower nibble -
         //confirm the ifh argument order at its definition.
    1810 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::add(bitblock128_t arg1, bitblock128_t arg2)
    1811 {
    1812         return simd128<1>::ifh(simd128<(8)>::himask(), simd128<(8)>::add(arg1, simd_and(simd128<(8)>::himask(), arg2)), simd128<(8)>::add(arg1, arg2));
    1813 }
    1814 
    1815 //The total number of operations is 1
         //Addition on 8-bit fields: forwards to the _mm_add_epi8 intrinsic.
    1816 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::add(bitblock128_t arg1, bitblock128_t arg2)
    1817 {
    1818         return _mm_add_epi8(arg1, arg2);
    1819 }
    1820 
    1821 //The total number of operations is 1
         //Addition on 16-bit fields: forwards to the _mm_add_epi16 intrinsic.
    1822 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::add(bitblock128_t arg1, bitblock128_t arg2)
    1823 {
    1824         return _mm_add_epi16(arg1, arg2);
    1825 }
    1826 
    1827 //The total number of operations is 1
         //Addition on 32-bit fields: forwards to the _mm_add_epi32 intrinsic.
    1828 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::add(bitblock128_t arg1, bitblock128_t arg2)
    1829 {
    1830         return _mm_add_epi32(arg1, arg2);
    1831 }
    1832 
    1833 //The total number of operations is 1
         //Addition on 64-bit fields: forwards to the _mm_add_epi64 intrinsic.
    1834 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::add(bitblock128_t arg1, bitblock128_t arg2)
    1835 {
    1836         return _mm_add_epi64(arg1, arg2);
    1837 }
    1838 
    1839 //The total number of operations is 11
         //Addition on the full 128-bit field, assembled from 64-bit additions.
         //  partial   - the two 64-bit halves added independently.
         //  carryMask - (arg1 & arg2) | simd_andc(arg1 ^ arg2, partial).
         //              NOTE(review): presumably simd_andc(x, y) is x & ~y, which makes the top
         //              bit of each half the carry out of that half - confirm at its definition.
         //  carry     - srli<63> reduces each half of carryMask to its top bit; the 128-bit
         //              slli<64> then moves the low half's bit into the high half and drops the
         //              high half's own bit.
         //The result is partial plus carry, again as a 64-bit-field addition.
    1840 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::add(bitblock128_t arg1, bitblock128_t arg2)
    1841 {
    1842         bitblock128_t partial = simd128<(64)>::add(arg1, arg2);
    1843         bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
    1844         bitblock128_t carry = simd128<128>::slli<(64)>(simd128<(64)>::srli<(63)>(carryMask));
    1845         return simd128<(64)>::add(partial, carry);
    1846 }
    1847 
    1848 //The total number of operations is 9
         //abs on 2-bit fields. simd128<1>::ifh with simd128<2>::himask() chooses between
         //arg1 & (~arg1 shifted left by one bit across the 128-bit block) and arg1 itself.
         //NOTE(review): presumably the first value is used for the upper bit of each field, so
         //the upper bit survives only when the lower bit is clear, and the lower bit is passed
         //through unchanged - confirm the ifh argument order at its definition.
    1849 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<2>::abs(bitblock128_t arg1)
    1850 {
    1851         return simd128<1>::ifh(simd128<2>::himask(), simd_and(arg1, simd128<128>::slli<1>(simd_not(arg1))), arg1);
    1852 }
    1853 
    1853 //The total number of operations is 19
         //abs on 4-bit fields. gtMask is simd128<4>::gt(arg1, 0). simd128<1>::ifh then chooses
         //between arg1 and simd128<4>::sub(gtMask, arg1) under that mask.
         //NOTE(review): presumably gtMask is all-zero in the fields where the subtraction is
         //selected, making that branch 0 - arg1 - confirm against the gt and ifh definitions.
    1854 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<4>::abs(bitblock128_t arg1)
    1855 {
    1856         bitblock128_t gtMask = simd128<4>::gt(arg1, simd128<4>::constant<0>());
    1857         return simd128<1>::ifh(gtMask, arg1, simd128<4>::sub(gtMask, arg1));
    1858 }
    1860 
    1860 //The total number of operations is 1
         //abs on 8-bit fields: forwards to the _mm_abs_epi8 intrinsic (an SSSE3 instruction).
    1861 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<8>::abs(bitblock128_t arg1)
    1862 {
    1863         return _mm_abs_epi8(arg1);
    1864 }
    1866 
    1866 //The total number of operations is 1
         //abs on 16-bit fields: forwards to the _mm_abs_epi16 intrinsic (an SSSE3 instruction).
    1867 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<16>::abs(bitblock128_t arg1)
    1868 {
    1869         return _mm_abs_epi16(arg1);
    1870 }
    1872 
    1872 //The total number of operations is 1
         //abs on 32-bit fields: forwards to the _mm_abs_epi32 intrinsic (an SSSE3 instruction).
    1873 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<32>::abs(bitblock128_t arg1)
    1874 {
    1875         return _mm_abs_epi32(arg1);
    1876 }
    1878 
    1878 //The total number of operations is 13
         //abs on 64-bit fields, built on the 32-bit abs. A candidate is formed by choosing,
         //under simd128<64>::himask(), between simd128<32>::abs(arg1) and arg1. eqMask is the
         //64-bit-field equality of that candidate with arg1. The result is arg1 where eqMask
         //selects and simd128<64>::sub(eqMask, arg1) elsewhere.
         //NOTE(review): presumably a field compares equal exactly when its upper 32 bits are
         //already non-negative, and eqMask is all-zero in the other fields so the subtraction
         //is 0 - arg1 - confirm against the eq and ifh definitions.
    1879 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<64>::abs(bitblock128_t arg1)
    1880 {
    1881         bitblock128_t eqMask = simd128<64>::eq(simd128<1>::ifh(simd128<64>::himask(), simd128<(32)>::abs(arg1), arg1), arg1);
    1882         return simd128<1>::ifh(eqMask, arg1, simd128<64>::sub(eqMask, arg1));
    1883 }
    1885 
    1886 //The total number of operations is 45
         //abs on the 128-bit field, built on the 64-bit abs. A candidate is formed by choosing,
         //under simd128<128>::himask(), between simd128<64>::abs(arg1) and arg1. eqMask is the
         //128-bit equality of that candidate with arg1. The result is arg1 where eqMask selects
         //and simd128<128>::sub(eqMask, arg1) elsewhere.
         //NOTE(review): presumably eqMask is all-zero when the comparison fails, so the
         //subtraction is 0 - arg1 - confirm against the eq and ifh definitions.
    1887 template <> IDISA_ALWAYS_INLINE bitblock128_t simd128<128>::abs(bitblock128_t arg1)
    1888 {
    1889         bitblock128_t eqMask = simd128<128>::eq(simd128<1>::ifh(simd128<128>::himask(), simd128<(64)>::abs(arg1), arg1), arg1);
    1890         return simd128<1>::ifh(eqMask, arg1, simd128<128>::sub(eqMask, arg1));
    18911895}
    18921896
     
    27762780}
    27772781
    2778 //The total number of operations is 4
         //Field-granular left shift for 2-bit fields: sh counts fields, so the work is handed
         //to the 128-bit shift simd128<128>::slli with a bit count of sh*2.
    2779 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
    2780 {
    2781         return simd128<128>::slli<(sh*2)>(arg1);
    2782 }
    2783 
    2784 //The total number of operations is 4
         //Field-granular left shift for 4-bit fields: sh counts fields, so the work is handed
         //to the 128-bit shift simd128<128>::slli with a bit count of sh*4.
    2785 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
    2786 {
    2787         return simd128<128>::slli<(sh*4)>(arg1);
    2788 }
    2789 
    2790 //The total number of operations is 4
         //Field-granular left shift for 8-bit fields: sh counts fields, so the work is handed
         //to the 128-bit shift simd128<128>::slli with a bit count of sh*8.
    2791 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
    2792 {
    2793         return simd128<128>::slli<(sh*8)>(arg1);
    2794 }
    2795 
    2796 //The total number of operations is 4
         //Field-granular left shift for 16-bit fields: sh counts fields, so the work is handed
         //to the 128-bit shift simd128<128>::slli with a bit count of sh*16.
    2797 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
    2798 {
    2799         return simd128<128>::slli<(sh*16)>(arg1);
    2800 }
    2801 
    2802 //The total number of operations is 4
         //Field-granular left shift for 32-bit fields: sh counts fields, so the work is handed
         //to the 128-bit shift simd128<128>::slli with a bit count of sh*32.
    2803 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
    2804 {
    2805         return simd128<128>::slli<(sh*32)>(arg1);
    2806 }
    2807 
    2808 //The total number of operations is 4
         //Field-granular left shift for 64-bit fields: sh counts fields, so the work is handed
         //to the 128-bit shift simd128<128>::slli with a bit count of sh*64.
    2809 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
    2810 {
    2811         return simd128<128>::slli<(sh*64)>(arg1);
    2812 }
    2813 
    2814 //The total number of operations is 4
         //Field-granular left shift for the single 128-bit field: forwards to
         //simd128<128>::slli with a bit count of sh*128, so any sh >= 1 requests a shift of at
         //least the full register width.
         //NOTE(review): how simd128<128>::slli treats counts >= 128 is not visible here.
    2815 template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
    2816 {
    2817         return simd128<128>::slli<(sh*128)>(arg1);
    2818 }
    2819 
    28202782//The total number of operations is 5
    28212783template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
     
    29802942}
    29812943
     2944//The total number of operations is 4
         //Field-granular left shift for 2-bit fields: sh counts fields, so the work is handed
         //to the 128-bit shift simd128<128>::slli with a bit count of sh*2.
     2945template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<2>::slli(bitblock128_t arg1)
     2946{
     2947        return simd128<128>::slli<(sh*2)>(arg1);
     2948}
     2949
     2950//The total number of operations is 4
         //Field-granular left shift for 4-bit fields: sh counts fields, so the work is handed
         //to the 128-bit shift simd128<128>::slli with a bit count of sh*4.
     2951template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<4>::slli(bitblock128_t arg1)
     2952{
     2953        return simd128<128>::slli<(sh*4)>(arg1);
     2954}
     2955
     2956//The total number of operations is 4
         //Field-granular left shift for 8-bit fields: sh counts fields, so the work is handed
         //to the 128-bit shift simd128<128>::slli with a bit count of sh*8.
     2957template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<8>::slli(bitblock128_t arg1)
     2958{
     2959        return simd128<128>::slli<(sh*8)>(arg1);
     2960}
     2961
     2962//The total number of operations is 4
         //Field-granular left shift for 16-bit fields: sh counts fields, so the work is handed
         //to the 128-bit shift simd128<128>::slli with a bit count of sh*16.
     2963template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<16>::slli(bitblock128_t arg1)
     2964{
     2965        return simd128<128>::slli<(sh*16)>(arg1);
     2966}
     2967
     2968//The total number of operations is 4
         //Field-granular left shift for 32-bit fields: sh counts fields, so the work is handed
         //to the 128-bit shift simd128<128>::slli with a bit count of sh*32.
     2969template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<32>::slli(bitblock128_t arg1)
     2970{
     2971        return simd128<128>::slli<(sh*32)>(arg1);
     2972}
     2973
     2974//The total number of operations is 4
         //Field-granular left shift for 64-bit fields: sh counts fields, so the work is handed
         //to the 128-bit shift simd128<128>::slli with a bit count of sh*64.
     2975template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<64>::slli(bitblock128_t arg1)
     2976{
     2977        return simd128<128>::slli<(sh*64)>(arg1);
     2978}
     2979
     2980//The total number of operations is 4
         //Field-granular left shift for the single 128-bit field: forwards to
         //simd128<128>::slli with a bit count of sh*128, so any sh >= 1 requests a shift of at
         //least the full register width.
         //NOTE(review): how simd128<128>::slli treats counts >= 128 is not visible here.
     2981template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<128>::slli(bitblock128_t arg1)
     2982{
     2983        return simd128<128>::slli<(sh*128)>(arg1);
     2984}
     2985
    29822986//The total number of operations is 13
    29832987template <> IDISA_ALWAYS_INLINE bitblock128_t mvmd128<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
     
    30103014}
    30113015
     3016//The total number of operations is 11
         //Whole-block left shift by a count held in a register (arg2): a thin wrapper that
         //forwards both arguments to simd128<128>::sll.
     3017IDISA_ALWAYS_INLINE bitblock128_t bitblock128::sll(bitblock128_t arg1, bitblock128_t arg2)
     3018{
     3019        return simd128<128>::sll(arg1, arg2);
     3020}
     3021
    30123022//The total number of operations is 1
            //Loads one 128-bit block from arg1 via _mm_loadu_si128, the load intrinsic that
            //does not require a 16-byte-aligned address.
    30133023IDISA_ALWAYS_INLINE bitblock128_t bitblock128::load_unaligned(bitblock128_t* arg1)
    30143024{
    30153025        return _mm_loadu_si128((bitblock128_t*)(arg1));
     3026}
     3027
     3028//The total number of operations is 4
         //Whole-block right shift by the compile-time count sh: a thin wrapper that forwards
         //to simd128<128>::srli<sh>.
     3029template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srli(bitblock128_t arg1)
     3030{
     3031        return simd128<128>::srli<sh>(arg1);
    30163032}
    30173033
     
    30343050}
    30353051
     3052//The total number of operations is 11
         //Whole-block right shift by a count held in a register (arg2): a thin wrapper that
         //forwards both arguments to simd128<128>::srl.
     3053IDISA_ALWAYS_INLINE bitblock128_t bitblock128::srl(bitblock128_t arg1, bitblock128_t arg2)
     3054{
     3055        return simd128<128>::srl(arg1, arg2);
     3056}
     3057
     3058//The total number of operations is 4
         //Whole-block left shift by the compile-time count sh: a thin wrapper that forwards
         //to simd128<128>::slli<sh>.
     3059template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock128_t bitblock128::slli(bitblock128_t arg1)
     3060{
     3061        return simd128<128>::slli<sh>(arg1);
     3062}
     3063
    30363064//The total number of operations is 2
    30373065IDISA_ALWAYS_INLINE bool bitblock128::any(bitblock128_t arg1)
Note: See TracChangeset for help on using the changeset viewer.