source: trunk/lib/idisa_cpp/idisa_avx.cpp @ 1584

Last change on this file since 1584 was 1584, checked in by huah, 8 years ago

fixed the bitblock::load/store for AVX; added testing modules for bitblock::srl/sll/srli/slli

File size: 173.6 KB
Line 
1#ifndef IDISA_AVX_CPP
2#define IDISA_AVX_CPP
3#include <stdint.h>
4#include "../config.hpp"
5
6#include "immintrin.h"
7
8typedef __m256 bitblock256_t;
9template <uint32_t fw>
10class simd256
11{
12public:
13        static IDISA_ALWAYS_INLINE bitblock256_t max(bitblock256_t arg1, bitblock256_t arg2);
14        static IDISA_ALWAYS_INLINE bitblock256_t mult(bitblock256_t arg1, bitblock256_t arg2);
15        static IDISA_ALWAYS_INLINE bitblock256_t gt(bitblock256_t arg1, bitblock256_t arg2);
16        static IDISA_ALWAYS_INLINE bitblock256_t umult(bitblock256_t arg1, bitblock256_t arg2);
17        static IDISA_ALWAYS_INLINE bitblock256_t ult(bitblock256_t arg1, bitblock256_t arg2);
18        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
19        static IDISA_ALWAYS_INLINE bitblock256_t ctz(bitblock256_t arg1);
20        static IDISA_ALWAYS_INLINE bitblock256_t eq(bitblock256_t arg1, bitblock256_t arg2);
21        static IDISA_ALWAYS_INLINE bitblock256_t popcount(bitblock256_t arg1);
22        static IDISA_ALWAYS_INLINE bitblock256_t neg(bitblock256_t arg1);
23        static IDISA_ALWAYS_INLINE bitblock256_t himask();
24        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
25        static IDISA_ALWAYS_INLINE bitblock256_t ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
26        static IDISA_ALWAYS_INLINE bitblock256_t sub(bitblock256_t arg1, bitblock256_t arg2);
27        static IDISA_ALWAYS_INLINE bitblock256_t add_hl(bitblock256_t arg1);
28        static IDISA_ALWAYS_INLINE bitblock256_t lomask();
29        static IDISA_ALWAYS_INLINE bitblock256_t umin(bitblock256_t arg1, bitblock256_t arg2);
30        template <uint64_t val> static IDISA_ALWAYS_INLINE bitblock256_t constant();
31        static IDISA_ALWAYS_INLINE bitblock256_t min(bitblock256_t arg1, bitblock256_t arg2);
32        static IDISA_ALWAYS_INLINE bitblock256_t umax(bitblock256_t arg1, bitblock256_t arg2);
33        static IDISA_ALWAYS_INLINE bitblock256_t abs(bitblock256_t arg1);
34        static IDISA_ALWAYS_INLINE bitblock256_t xor_hl(bitblock256_t arg1);
35        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srai(bitblock256_t arg1);
36        static IDISA_ALWAYS_INLINE bitblock256_t lt(bitblock256_t arg1, bitblock256_t arg2);
37        static IDISA_ALWAYS_INLINE bitblock256_t add(bitblock256_t arg1, bitblock256_t arg2);
38        static IDISA_ALWAYS_INLINE bitblock256_t ugt(bitblock256_t arg1, bitblock256_t arg2);
39};
40
41template <uint32_t fw>
42class hsimd256
43{
44public:
45        static IDISA_ALWAYS_INLINE bitblock256_t umin_hl(bitblock256_t arg1, bitblock256_t arg2);
46        static IDISA_ALWAYS_INLINE bitblock256_t add_hl(bitblock256_t arg1, bitblock256_t arg2);
47        static IDISA_ALWAYS_INLINE bitblock256_t packss(bitblock256_t arg1, bitblock256_t arg2);
48        static IDISA_ALWAYS_INLINE bitblock256_t packh(bitblock256_t arg1, bitblock256_t arg2);
49        static IDISA_ALWAYS_INLINE uint64_t signmask(bitblock256_t arg1);
50        static IDISA_ALWAYS_INLINE bitblock256_t packl(bitblock256_t arg1, bitblock256_t arg2);
51        static IDISA_ALWAYS_INLINE bitblock256_t min_hl(bitblock256_t arg1, bitblock256_t arg2);
52        static IDISA_ALWAYS_INLINE bitblock256_t packus(bitblock256_t arg1, bitblock256_t arg2);
53};
54
55template <uint32_t fw>
56class esimd256
57{
58public:
59        static IDISA_ALWAYS_INLINE bitblock256_t mergel(bitblock256_t arg1, bitblock256_t arg2);
60        static IDISA_ALWAYS_INLINE bitblock256_t signextendh(bitblock256_t arg1);
61        static IDISA_ALWAYS_INLINE bitblock256_t mergeh(bitblock256_t arg1, bitblock256_t arg2);
62        static IDISA_ALWAYS_INLINE bitblock256_t zeroextendh(bitblock256_t arg1);
63        static IDISA_ALWAYS_INLINE bitblock256_t zeroextendl(bitblock256_t arg1);
64        static IDISA_ALWAYS_INLINE bitblock256_t signextendl(bitblock256_t arg1);
65};
66
67template <uint32_t fw>
68class mvmd256
69{
70public:
71        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dsrli(bitblock256_t arg1, bitblock256_t arg2);
72        static IDISA_ALWAYS_INLINE bitblock256_t fill(uint64_t val1);
73        template <uint64_t pos> static IDISA_ALWAYS_INLINE uint64_t extract(bitblock256_t arg1);
74        template <uint64_t pos> static IDISA_ALWAYS_INLINE bitblock256_t splat(bitblock256_t arg1);
75        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
76        static IDISA_ALWAYS_INLINE bitblock256_t fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
77        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
78        static IDISA_ALWAYS_INLINE bitblock256_t fill2(uint64_t val1, uint64_t val2);
79        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t dslli(bitblock256_t arg1, bitblock256_t arg2);
80        static IDISA_ALWAYS_INLINE bitblock256_t fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
81        static IDISA_ALWAYS_INLINE bitblock256_t fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
82};
83
84class bitblock256
85{
86public:
87        static IDISA_ALWAYS_INLINE bitblock256_t load_unaligned(float const* arg1);
88        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t srli(bitblock256_t arg1);
89        static IDISA_ALWAYS_INLINE void store_aligned(float* arg1, bitblock256_t arg2);
90        static IDISA_ALWAYS_INLINE bool all(bitblock256_t arg1);
91        static IDISA_ALWAYS_INLINE bool any(bitblock256_t arg1);
92        static IDISA_ALWAYS_INLINE uint64_t popcount(bitblock256_t arg1);
93        template <uint64_t sh> static IDISA_ALWAYS_INLINE bitblock256_t slli(bitblock256_t arg1);
94        static IDISA_ALWAYS_INLINE bitblock256_t load_aligned(float const* arg1);
95        static IDISA_ALWAYS_INLINE void store_unaligned(float* arg1, bitblock256_t arg2);
96};
97
98//Declaration Part
99IDISA_ALWAYS_INLINE bitblock256_t simd_nor(bitblock256_t arg1, bitblock256_t arg2);
100IDISA_ALWAYS_INLINE bitblock256_t simd_not(bitblock256_t arg1);
101IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2);
102IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2);
103IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2);
104IDISA_ALWAYS_INLINE bitblock256_t simd_xor(bitblock256_t arg1, bitblock256_t arg2);
105template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::max(bitblock256_t arg1, bitblock256_t arg2);
106template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::max(bitblock256_t arg1, bitblock256_t arg2);
107template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::max(bitblock256_t arg1, bitblock256_t arg2);
108template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::max(bitblock256_t arg1, bitblock256_t arg2);
109template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::max(bitblock256_t arg1, bitblock256_t arg2);
110template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::max(bitblock256_t arg1, bitblock256_t arg2);
111template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::max(bitblock256_t arg1, bitblock256_t arg2);
112template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::max(bitblock256_t arg1, bitblock256_t arg2);
113template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::max(bitblock256_t arg1, bitblock256_t arg2);
114template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::mult(bitblock256_t arg1, bitblock256_t arg2);
115template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::mult(bitblock256_t arg1, bitblock256_t arg2);
116template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::mult(bitblock256_t arg1, bitblock256_t arg2);
117template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::mult(bitblock256_t arg1, bitblock256_t arg2);
118template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::mult(bitblock256_t arg1, bitblock256_t arg2);
119template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::mult(bitblock256_t arg1, bitblock256_t arg2);
120template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::mult(bitblock256_t arg1, bitblock256_t arg2);
121template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::mult(bitblock256_t arg1, bitblock256_t arg2);
122template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::mult(bitblock256_t arg1, bitblock256_t arg2);
123template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::gt(bitblock256_t arg1, bitblock256_t arg2);
124template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::gt(bitblock256_t arg1, bitblock256_t arg2);
125template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::gt(bitblock256_t arg1, bitblock256_t arg2);
126template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::gt(bitblock256_t arg1, bitblock256_t arg2);
127template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::gt(bitblock256_t arg1, bitblock256_t arg2);
128template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::gt(bitblock256_t arg1, bitblock256_t arg2);
129template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::gt(bitblock256_t arg1, bitblock256_t arg2);
130template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::gt(bitblock256_t arg1, bitblock256_t arg2);
131template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::gt(bitblock256_t arg1, bitblock256_t arg2);
132template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umult(bitblock256_t arg1, bitblock256_t arg2);
133template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umult(bitblock256_t arg1, bitblock256_t arg2);
134template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umult(bitblock256_t arg1, bitblock256_t arg2);
135template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umult(bitblock256_t arg1, bitblock256_t arg2);
136template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umult(bitblock256_t arg1, bitblock256_t arg2);
137template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umult(bitblock256_t arg1, bitblock256_t arg2);
138template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umult(bitblock256_t arg1, bitblock256_t arg2);
139template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umult(bitblock256_t arg1, bitblock256_t arg2);
140template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ult(bitblock256_t arg1, bitblock256_t arg2);
141template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ult(bitblock256_t arg1, bitblock256_t arg2);
142template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ult(bitblock256_t arg1, bitblock256_t arg2);
143template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ult(bitblock256_t arg1, bitblock256_t arg2);
144template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ult(bitblock256_t arg1, bitblock256_t arg2);
145template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ult(bitblock256_t arg1, bitblock256_t arg2);
146template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ult(bitblock256_t arg1, bitblock256_t arg2);
147template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ult(bitblock256_t arg1, bitblock256_t arg2);
148template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ult(bitblock256_t arg1, bitblock256_t arg2);
149template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2);
150template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2);
151template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2);
152template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2);
153template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2);
154template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2);
155template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2);
156template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2);
157template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2);
158template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srli(bitblock256_t arg1);
159template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srli(bitblock256_t arg1);
160template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srli(bitblock256_t arg1);
161template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srli(bitblock256_t arg1);
162template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srli(bitblock256_t arg1);
163template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srli(bitblock256_t arg1);
164template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srli(bitblock256_t arg1);
165template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srli(bitblock256_t arg1);
166template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ctz(bitblock256_t arg1);
167template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ctz(bitblock256_t arg1);
168template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ctz(bitblock256_t arg1);
169template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ctz(bitblock256_t arg1);
170template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ctz(bitblock256_t arg1);
171template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ctz(bitblock256_t arg1);
172template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ctz(bitblock256_t arg1);
173template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ctz(bitblock256_t arg1);
174template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ctz(bitblock256_t arg1);
175template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2);
176template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2);
177template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ugt(bitblock256_t arg1, bitblock256_t arg2);
178template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ugt(bitblock256_t arg1, bitblock256_t arg2);
179template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ugt(bitblock256_t arg1, bitblock256_t arg2);
180template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ugt(bitblock256_t arg1, bitblock256_t arg2);
181template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ugt(bitblock256_t arg1, bitblock256_t arg2);
182template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ugt(bitblock256_t arg1, bitblock256_t arg2);
183template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ugt(bitblock256_t arg1, bitblock256_t arg2);
184template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::xor_hl(bitblock256_t arg1);
185template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::xor_hl(bitblock256_t arg1);
186template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::xor_hl(bitblock256_t arg1);
187template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::xor_hl(bitblock256_t arg1);
188template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::xor_hl(bitblock256_t arg1);
189template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::xor_hl(bitblock256_t arg1);
190template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::xor_hl(bitblock256_t arg1);
191template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::xor_hl(bitblock256_t arg1);
192template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::popcount(bitblock256_t arg1);
193template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::popcount(bitblock256_t arg1);
194template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::popcount(bitblock256_t arg1);
195template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::popcount(bitblock256_t arg1);
196template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::popcount(bitblock256_t arg1);
197template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::popcount(bitblock256_t arg1);
198template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::popcount(bitblock256_t arg1);
199template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1);
200template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1);
201template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1);
202template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1);
203template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::neg(bitblock256_t arg1);
204template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::neg(bitblock256_t arg1);
205template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::neg(bitblock256_t arg1);
206template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::neg(bitblock256_t arg1);
207template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1);
208template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1);
209template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1);
210template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1);
211template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1);
212template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1);
213template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1);
214template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1);
215template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1);
216template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1);
217template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
218template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
219template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
220template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
221template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
222template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
223template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
224template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
225template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3);
226template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2);
227template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::sub(bitblock256_t arg1, bitblock256_t arg2);
228template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::sub(bitblock256_t arg1, bitblock256_t arg2);
229template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::sub(bitblock256_t arg1, bitblock256_t arg2);
230template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sub(bitblock256_t arg1, bitblock256_t arg2);
231template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sub(bitblock256_t arg1, bitblock256_t arg2);
232template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sub(bitblock256_t arg1, bitblock256_t arg2);
233template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::sub(bitblock256_t arg1, bitblock256_t arg2);
234template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sub(bitblock256_t arg1, bitblock256_t arg2);
235template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1);
236template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add_hl(bitblock256_t arg1);
237template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add_hl(bitblock256_t arg1);
238template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add_hl(bitblock256_t arg1);
239template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add_hl(bitblock256_t arg1);
240template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add_hl(bitblock256_t arg1);
241template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1);
242template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1);
243template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant();
244template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant();
245template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::constant();
246template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::constant();
247template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::constant();
248template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::constant();
249template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::constant();
250template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::constant();
251template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::constant();
252template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::min(bitblock256_t arg1, bitblock256_t arg2);
253template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::min(bitblock256_t arg1, bitblock256_t arg2);
254template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::min(bitblock256_t arg1, bitblock256_t arg2);
255template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::min(bitblock256_t arg1, bitblock256_t arg2);
256template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::min(bitblock256_t arg1, bitblock256_t arg2);
257template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::min(bitblock256_t arg1, bitblock256_t arg2);
258template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::min(bitblock256_t arg1, bitblock256_t arg2);
259template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::min(bitblock256_t arg1, bitblock256_t arg2);
260template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::min(bitblock256_t arg1, bitblock256_t arg2);
261template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask();
262template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask();
263template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask();
264template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask();
265template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask();
266template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask();
267template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask();
268template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask();
269template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2);
270template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2);
271template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2);
272template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2);
273template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2);
274template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2);
275template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2);
276template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2);
277template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2);
278template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1);
279template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1);
280template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1);
281template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1);
282template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1);
283template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1);
284template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1);
285template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1);
286template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2);
287template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2);
288template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2);
289template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2);
290template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2);
291template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2);
292template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2);
293template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2);
294template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2);
295template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1);
296template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1);
297template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1);
298template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1);
299template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1);
300template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1);
301template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1);
302template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1);
303template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask();
304template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask();
305template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask();
306template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask();
307template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask();
308template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask();
309template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask();
310template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask();
311template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::add(bitblock256_t arg1, bitblock256_t arg2);
312template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add(bitblock256_t arg1, bitblock256_t arg2);
313template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add(bitblock256_t arg1, bitblock256_t arg2);
314template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add(bitblock256_t arg1, bitblock256_t arg2);
315template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add(bitblock256_t arg1, bitblock256_t arg2);
316template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add(bitblock256_t arg1, bitblock256_t arg2);
317template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add(bitblock256_t arg1, bitblock256_t arg2);
318template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2);
319template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2);
320template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2);
321template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2);
322template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2);
323template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2);
324template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2);
325template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2);
326template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2);
327template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2);
328template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2);
329template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
330template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
331template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
// Forward declarations of explicit specializations of hsimd256<fw> members,
// one declaration per supported field width fw. Only declarations appear here;
// no bodies are given in this part of the file.

// hsimd256<fw>::umin_hl (field widths 16 through 256 in this run)
332template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
333template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
334template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
335template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
336template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::umin_hl(bitblock256_t arg1, bitblock256_t arg2);
// hsimd256<fw>::add_hl, fw = 2 .. 256
337template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::add_hl(bitblock256_t arg1, bitblock256_t arg2);
338template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::add_hl(bitblock256_t arg1, bitblock256_t arg2);
339template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::add_hl(bitblock256_t arg1, bitblock256_t arg2);
340template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::add_hl(bitblock256_t arg1, bitblock256_t arg2);
341template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::add_hl(bitblock256_t arg1, bitblock256_t arg2);
342template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::add_hl(bitblock256_t arg1, bitblock256_t arg2);
343template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::add_hl(bitblock256_t arg1, bitblock256_t arg2);
344template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::add_hl(bitblock256_t arg1, bitblock256_t arg2);
// hsimd256<fw>::packss, fw = 2 .. 256
345template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::packss(bitblock256_t arg1, bitblock256_t arg2);
346template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::packss(bitblock256_t arg1, bitblock256_t arg2);
347template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::packss(bitblock256_t arg1, bitblock256_t arg2);
348template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::packss(bitblock256_t arg1, bitblock256_t arg2);
349template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::packss(bitblock256_t arg1, bitblock256_t arg2);
350template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::packss(bitblock256_t arg1, bitblock256_t arg2);
351template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::packss(bitblock256_t arg1, bitblock256_t arg2);
352template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::packss(bitblock256_t arg1, bitblock256_t arg2);
// hsimd256<fw>::signmask, fw = 8 .. 256; takes one block and returns a uint64_t
353template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<8>::signmask(bitblock256_t arg1);
354template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<16>::signmask(bitblock256_t arg1);
355template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<32>::signmask(bitblock256_t arg1);
356template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<64>::signmask(bitblock256_t arg1);
357template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<128>::signmask(bitblock256_t arg1);
358template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<256>::signmask(bitblock256_t arg1);
// hsimd256<fw>::packl, fw = 2 .. 256
359template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::packl(bitblock256_t arg1, bitblock256_t arg2);
360template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::packl(bitblock256_t arg1, bitblock256_t arg2);
361template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::packl(bitblock256_t arg1, bitblock256_t arg2);
362template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::packl(bitblock256_t arg1, bitblock256_t arg2);
363template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::packl(bitblock256_t arg1, bitblock256_t arg2);
364template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::packl(bitblock256_t arg1, bitblock256_t arg2);
365template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::packl(bitblock256_t arg1, bitblock256_t arg2);
366template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::packl(bitblock256_t arg1, bitblock256_t arg2);
// hsimd256<fw>::packh, fw = 2 .. 256
367template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::packh(bitblock256_t arg1, bitblock256_t arg2);
368template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::packh(bitblock256_t arg1, bitblock256_t arg2);
369template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::packh(bitblock256_t arg1, bitblock256_t arg2);
370template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::packh(bitblock256_t arg1, bitblock256_t arg2);
371template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::packh(bitblock256_t arg1, bitblock256_t arg2);
372template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::packh(bitblock256_t arg1, bitblock256_t arg2);
373template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::packh(bitblock256_t arg1, bitblock256_t arg2);
374template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::packh(bitblock256_t arg1, bitblock256_t arg2);
// hsimd256<fw>::min_hl, fw = 2 .. 256
375template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::min_hl(bitblock256_t arg1, bitblock256_t arg2);
376template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::min_hl(bitblock256_t arg1, bitblock256_t arg2);
377template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::min_hl(bitblock256_t arg1, bitblock256_t arg2);
378template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::min_hl(bitblock256_t arg1, bitblock256_t arg2);
379template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::min_hl(bitblock256_t arg1, bitblock256_t arg2);
380template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::min_hl(bitblock256_t arg1, bitblock256_t arg2);
381template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::min_hl(bitblock256_t arg1, bitblock256_t arg2);
382template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::min_hl(bitblock256_t arg1, bitblock256_t arg2);
// hsimd256<fw>::packus, fw = 2 .. 256
383template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::packus(bitblock256_t arg1, bitblock256_t arg2);
384template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::packus(bitblock256_t arg1, bitblock256_t arg2);
385template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::packus(bitblock256_t arg1, bitblock256_t arg2);
386template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::packus(bitblock256_t arg1, bitblock256_t arg2);
387template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::packus(bitblock256_t arg1, bitblock256_t arg2);
388template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::packus(bitblock256_t arg1, bitblock256_t arg2);
389template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::packus(bitblock256_t arg1, bitblock256_t arg2);
390template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::packus(bitblock256_t arg1, bitblock256_t arg2);
// Forward declarations of explicit specializations of esimd256<fw> members,
// one declaration per field width fw = 1 .. 128. Declarations only.

// esimd256<fw>::mergel (two block operands)
391template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<1>::mergel(bitblock256_t arg1, bitblock256_t arg2);
392template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<2>::mergel(bitblock256_t arg1, bitblock256_t arg2);
393template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<4>::mergel(bitblock256_t arg1, bitblock256_t arg2);
394template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<8>::mergel(bitblock256_t arg1, bitblock256_t arg2);
395template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<16>::mergel(bitblock256_t arg1, bitblock256_t arg2);
396template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<32>::mergel(bitblock256_t arg1, bitblock256_t arg2);
397template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::mergel(bitblock256_t arg1, bitblock256_t arg2);
398template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::mergel(bitblock256_t arg1, bitblock256_t arg2);
// esimd256<fw>::mergeh (two block operands)
399template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<1>::mergeh(bitblock256_t arg1, bitblock256_t arg2);
400template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<2>::mergeh(bitblock256_t arg1, bitblock256_t arg2);
401template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<4>::mergeh(bitblock256_t arg1, bitblock256_t arg2);
402template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<8>::mergeh(bitblock256_t arg1, bitblock256_t arg2);
403template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<16>::mergeh(bitblock256_t arg1, bitblock256_t arg2);
404template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<32>::mergeh(bitblock256_t arg1, bitblock256_t arg2);
405template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::mergeh(bitblock256_t arg1, bitblock256_t arg2);
406template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::mergeh(bitblock256_t arg1, bitblock256_t arg2);
// esimd256<fw>::zeroextendh (single block operand)
407template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<1>::zeroextendh(bitblock256_t arg1);
408template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<2>::zeroextendh(bitblock256_t arg1);
409template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<4>::zeroextendh(bitblock256_t arg1);
410template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<8>::zeroextendh(bitblock256_t arg1);
411template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<16>::zeroextendh(bitblock256_t arg1);
412template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<32>::zeroextendh(bitblock256_t arg1);
413template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::zeroextendh(bitblock256_t arg1);
414template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::zeroextendh(bitblock256_t arg1);
// esimd256<fw>::zeroextendl (single block operand)
415template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<1>::zeroextendl(bitblock256_t arg1);
416template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<2>::zeroextendl(bitblock256_t arg1);
417template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<4>::zeroextendl(bitblock256_t arg1);
418template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<8>::zeroextendl(bitblock256_t arg1);
419template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<16>::zeroextendl(bitblock256_t arg1);
420template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<32>::zeroextendl(bitblock256_t arg1);
421template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::zeroextendl(bitblock256_t arg1);
422template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::zeroextendl(bitblock256_t arg1);
// esimd256<fw>::signextendh (single block operand)
423template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<1>::signextendh(bitblock256_t arg1);
424template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<2>::signextendh(bitblock256_t arg1);
425template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<4>::signextendh(bitblock256_t arg1);
426template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<8>::signextendh(bitblock256_t arg1);
427template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<16>::signextendh(bitblock256_t arg1);
428template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<32>::signextendh(bitblock256_t arg1);
429template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::signextendh(bitblock256_t arg1);
430template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::signextendh(bitblock256_t arg1);
// esimd256<fw>::signextendl (single block operand)
431template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<1>::signextendl(bitblock256_t arg1);
432template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<2>::signextendl(bitblock256_t arg1);
433template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<4>::signextendl(bitblock256_t arg1);
434template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<8>::signextendl(bitblock256_t arg1);
435template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<16>::signextendl(bitblock256_t arg1);
436template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<32>::signextendl(bitblock256_t arg1);
437template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::signextendl(bitblock256_t arg1);
438template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::signextendl(bitblock256_t arg1);
// Forward declarations of explicit specializations of mvmd256<fw> members.
// Members declared with a second "template <uint64_t ...>" header are member
// templates taking a compile-time shift amount (sh) or position (pos).
// Declarations only.

// mvmd256<fw>::dsrli<sh> (two block operands), fw = 2 .. 256
439template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
440template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
441template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
442template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
443template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
444template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
445template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
446template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2);
// mvmd256<fw>::fill (one uint64_t value), fw = 1 .. 32
447template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(uint64_t val1);
448template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill(uint64_t val1);
449template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill(uint64_t val1);
450template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill(uint64_t val1);
451template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill(uint64_t val1);
452template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill(uint64_t val1);
// mvmd256<fw>::extract<pos> (returns uint64_t), fw = 1 .. 64
453template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<1>::extract(bitblock256_t arg1);
454template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<2>::extract(bitblock256_t arg1);
455template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<4>::extract(bitblock256_t arg1);
456template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<8>::extract(bitblock256_t arg1);
457template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<16>::extract(bitblock256_t arg1);
458template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<32>::extract(bitblock256_t arg1);
459template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<64>::extract(bitblock256_t arg1);
// mvmd256<fw>::splat<pos>, fw = 1 .. 256
460template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1);
461template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1);
462template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::splat(bitblock256_t arg1);
463template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::splat(bitblock256_t arg1);
464template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::splat(bitblock256_t arg1);
465template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::splat(bitblock256_t arg1);
466template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::splat(bitblock256_t arg1);
467template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1);
468template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1);
// mvmd256<fw>::fill16 (sixteen uint64_t values), fw = 1 .. 16
469template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
470template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
471template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
472template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
473template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
// mvmd256<fw>::fill4 (four uint64_t values), fw = 1 .. 32
474template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
475template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
476template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
477template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
478template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
479template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
// mvmd256<fw>::srli<sh>, fw = 2 .. 256
480template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::srli(bitblock256_t arg1);
481template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::srli(bitblock256_t arg1);
482template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::srli(bitblock256_t arg1);
483template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::srli(bitblock256_t arg1);
484template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::srli(bitblock256_t arg1);
485template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1);
486template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1);
487template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1);
// mvmd256<fw>::fill2 (two uint64_t values), fw = 1 .. 32
488template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill2(uint64_t val1, uint64_t val2);
489template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill2(uint64_t val1, uint64_t val2);
490template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill2(uint64_t val1, uint64_t val2);
491template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill2(uint64_t val1, uint64_t val2);
492template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill2(uint64_t val1, uint64_t val2);
493template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill2(uint64_t val1, uint64_t val2);
// mvmd256<fw>::dslli<sh> (two block operands), fw = 2 .. 256
494template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dslli(bitblock256_t arg1, bitblock256_t arg2);
495template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dslli(bitblock256_t arg1, bitblock256_t arg2);
496template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dslli(bitblock256_t arg1, bitblock256_t arg2);
497template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dslli(bitblock256_t arg1, bitblock256_t arg2);
498template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dslli(bitblock256_t arg1, bitblock256_t arg2);
499template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2);
500template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2);
501template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2);
// mvmd256<fw>::slli<sh>, fw = 2 .. 256
502template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1);
503template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::slli(bitblock256_t arg1);
504template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::slli(bitblock256_t arg1);
505template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::slli(bitblock256_t arg1);
506template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::slli(bitblock256_t arg1);
507template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1);
508template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1);
509template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1);
// mvmd256<fw>::fill8 (eight uint64_t values), fw = 1 .. 32
510template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
511template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
512template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
513template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
514template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
515template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
516
517//Implementation Part
518
// Helper macros for operating on the two 128-bit halves of a 256-bit AVX value.
// Every macro parameter is parenthesized where it is used as a cast operand or
// sub-expression, so compound expressions can be passed safely.
// NOTE: avx_move_*, avx_byte_shift_* expand their first argument twice; pass
// side-effect-free expressions only.

// Build a 256-bit value from two 128-bit values: x becomes the upper half,
// y the lower half. Both are reinterpreted as __m128 by a vector cast.
#define avx_general_combine256(x, y) \
        (_mm256_insertf128_ps(_mm256_castps128_ps256((__m128)(y)), (__m128)(x), 1))

// Lower 128 bits of x, viewed as __m128i.
#define avx_select_lo128(x) \
        ((__m128i) _mm256_castps256_ps128((x)))

// Upper 128 bits of x, viewed as __m128i.
#define avx_select_hi128(x) \
        ((__m128i)(_mm256_extractf128_ps((x), 1)))

// Place the lower half of x in the upper half of the result. The control
// value 0 + 8 selects x's lower half for the upper lane (0) and sets the
// zeroing bit (8) for the lower lane.
#define avx_move_lo128_to_hi128(x) \
        (_mm256_permute2f128_ps((x), (x), 0 + 8))

// Place the upper half of x in the lower half of the result. The control
// value 1 + 128 selects x's upper half for the lower lane (1) and sets the
// zeroing bit (128) for the upper lane.
#define avx_move_hi128_to_lo128(x) \
        (_mm256_permute2f128_ps((x), (x), 1 + 128))

// Shift each 128-bit half of x right by y bytes. The halves are shifted
// independently with _mm_srli_si128 and then recombined.
#define avx_byte_shift_right(x, y) \
        ((bitblock256_t)avx_general_combine256(_mm_srli_si128(avx_select_hi128(x), (y)), _mm_srli_si128(avx_select_lo128(x), (y))))

// Shift each 128-bit half of x left by y bytes. The halves are shifted
// independently with _mm_slli_si128 and then recombined.
#define avx_byte_shift_left(x, y) \
        ((bitblock256_t)avx_general_combine256(_mm_slli_si128(avx_select_hi128(x), (y)), _mm_slli_si128(avx_select_lo128(x), (y))))
539//The total number of operations is 2
540IDISA_ALWAYS_INLINE bitblock256_t simd_nor(bitblock256_t arg1, bitblock256_t arg2)
541{
542        return simd_not(simd_or(arg1, arg2));
543}
544
545//The total number of operations is 1
546IDISA_ALWAYS_INLINE bitblock256_t simd_not(bitblock256_t arg1)
547{
548        return simd_xor(arg1, simd256<32>::constant<-1>());
549}
550
551//The total number of operations is 1
552IDISA_ALWAYS_INLINE bitblock256_t simd_andc(bitblock256_t arg1, bitblock256_t arg2)
553{
554        return _mm256_andnot_ps(arg2, arg1);
555}
556
557//The total number of operations is 1
558IDISA_ALWAYS_INLINE bitblock256_t simd_or(bitblock256_t arg1, bitblock256_t arg2)
559{
560        return _mm256_or_ps(arg1, arg2);
561}
562
563//The total number of operations is 1
564IDISA_ALWAYS_INLINE bitblock256_t simd_and(bitblock256_t arg1, bitblock256_t arg2)
565{
566        return _mm256_and_ps(arg1, arg2);
567}
568
569//The total number of operations is 1
570IDISA_ALWAYS_INLINE bitblock256_t simd_xor(bitblock256_t arg1, bitblock256_t arg2)
571{
572        return _mm256_xor_ps(arg1, arg2);
573}
574
575//The total number of operations is 1
576template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::max(bitblock256_t arg1, bitblock256_t arg2)
577{
578        return simd_and(arg1, arg2);
579}
580
581//The total number of operations is 29
582template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::max(bitblock256_t arg1, bitblock256_t arg2)
583{
584        bitblock256_t hiAns = simd256<(1)>::max(arg1, arg2);
585        bitblock256_t loAns = simd256<(1)>::umax(arg1, arg2);
586        bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(hiAns, arg1));
587        bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(hiAns, arg2));
588        return simd256<1>::ifh(simd256<2>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2));
589}
590
591//The total number of operations is 23
592template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::max(bitblock256_t arg1, bitblock256_t arg2)
593{
594        bitblock256_t high_bit = simd256<4>::constant<(8)>();
595        return simd_xor(simd256<4>::umax(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
596}
597
598//The total number of operations is 8
599template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::max(bitblock256_t arg1, bitblock256_t arg2)
600{
601        return avx_general_combine256(_mm_max_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
602}
603
604//The total number of operations is 8
605template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::max(bitblock256_t arg1, bitblock256_t arg2)
606{
607        return avx_general_combine256(_mm_max_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
608}
609
610//The total number of operations is 8
611template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::max(bitblock256_t arg1, bitblock256_t arg2)
612{
613        return avx_general_combine256(_mm_max_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
614}
615
616//The total number of operations is 11
617template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::max(bitblock256_t arg1, bitblock256_t arg2)
618{
619        return simd256<1>::ifh(simd256<64>::gt(arg1, arg2), arg1, arg2);
620}
621
622//The total number of operations is 88
623template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::max(bitblock256_t arg1, bitblock256_t arg2)
624{
625        bitblock256_t hiAns = simd256<(64)>::max(arg1, arg2);
626        bitblock256_t loAns = simd256<(64)>::umax(arg1, arg2);
627        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(hiAns, arg1));
628        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(hiAns, arg2));
629        return simd256<1>::ifh(simd256<128>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2));
630}
631
632//The total number of operations is 352
633template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::max(bitblock256_t arg1, bitblock256_t arg2)
634{
635        bitblock256_t hiAns = simd256<(128)>::max(arg1, arg2);
636        bitblock256_t loAns = simd256<(128)>::umax(arg1, arg2);
637        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(hiAns, arg1));
638        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(hiAns, arg2));
639        return simd256<1>::ifh(simd256<256>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2));
640}
641
642//The total number of operations is 1
643template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::mult(bitblock256_t arg1, bitblock256_t arg2)
644{
645        return simd_and(arg1, arg2);
646}
647
648//The total number of operations is 95
649template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::mult(bitblock256_t arg1, bitblock256_t arg2)
650{
651        bitblock256_t tmp1 = simd256<256>::slli<1>(arg1);
652        bitblock256_t tmp2 = simd256<256>::slli<1>(arg2);
653        return simd256<1>::ifh(simd256<2>::himask(), simd_or(simd_and(tmp1, simd_and(arg2, simd_or(simd_not(arg1), simd_not(tmp2)))), simd_and(arg1, simd_and(tmp2, simd_or(simd_not(tmp1), simd_not(arg2))))), simd_and(arg1, arg2));
654}
655
656//The total number of operations is 104
657template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::mult(bitblock256_t arg1, bitblock256_t arg2)
658{
659        bitblock256_t loMask = simd256<(8)>::lomask();
660        bitblock256_t tmpAns1 = simd256<(8)>::mult(simd_and(loMask, arg1), simd_and(loMask, arg2));
661        bitblock256_t tmpAns2 = simd256<(8)>::mult(simd256<(8)>::srli<4>(arg1), simd256<(8)>::srli<4>(arg2));
662        return simd256<1>::ifh(loMask, tmpAns1, simd256<(8)>::slli<4>(tmpAns2));
663}
664
665//The total number of operations is 39
666template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::mult(bitblock256_t arg1, bitblock256_t arg2)
667{
668        bitblock256_t loMask = simd256<(16)>::lomask();
669        bitblock256_t tmpAns1 = simd256<(16)>::mult(simd_and(loMask, arg1), simd_and(loMask, arg2));
670        bitblock256_t tmpAns2 = simd256<(16)>::mult(simd256<(16)>::srli<8>(arg1), simd256<(16)>::srli<8>(arg2));
671        return simd256<1>::ifh(loMask, tmpAns1, simd256<(16)>::slli<8>(tmpAns2));
672}
673
674//The total number of operations is 8
675template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::mult(bitblock256_t arg1, bitblock256_t arg2)
676{
677        return avx_general_combine256(_mm_mullo_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_mullo_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
678}
679
680//The total number of operations is 8
681template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::mult(bitblock256_t arg1, bitblock256_t arg2)
682{
683        return avx_general_combine256(_mm_mullo_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_mullo_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
684}
685
686//The total number of operations is 66
687template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::mult(bitblock256_t arg1, bitblock256_t arg2)
688{
689        bitblock256_t loMask = simd256<64>::lomask();
690        bitblock256_t arg1_low = simd_and(arg1, loMask);
691        bitblock256_t arg1_high = simd256<64>::srli<(32)>(arg1);
692        bitblock256_t arg2_low = simd_and(arg2, loMask);
693        bitblock256_t arg2_high = simd256<64>::srli<(32)>(arg2);
694        bitblock256_t tmpAns1 = simd256<(32)>::umult(arg1_low, arg2_low);
695        bitblock256_t tmpAns2 = simd256<64>::slli<(32)>(simd256<(32)>::umult(arg1_low, arg2_high));
696        bitblock256_t tmpAns3 = simd256<64>::slli<(32)>(simd256<(32)>::umult(arg1_high, arg2_low));
697        return simd256<64>::add(tmpAns1, simd256<64>::add(tmpAns2, tmpAns3));
698}
699
700//The total number of operations is 877
701template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::mult(bitblock256_t arg1, bitblock256_t arg2)
702{
703        bitblock256_t loMask = simd256<128>::lomask();
704        bitblock256_t arg1_low = simd_and(arg1, loMask);
705        bitblock256_t arg1_high = simd256<128>::srli<(64)>(arg1);
706        bitblock256_t arg2_low = simd_and(arg2, loMask);
707        bitblock256_t arg2_high = simd256<128>::srli<(64)>(arg2);
708        bitblock256_t tmpAns1 = simd256<(64)>::umult(arg1_low, arg2_low);
709        bitblock256_t tmpAns2 = simd256<128>::slli<(64)>(simd256<(64)>::umult(arg1_low, arg2_high));
710        bitblock256_t tmpAns3 = simd256<128>::slli<(64)>(simd256<(64)>::umult(arg1_high, arg2_low));
711        return simd256<128>::add(tmpAns1, simd256<128>::add(tmpAns2, tmpAns3));
712}
713
714//The total number of operations is 5001
715template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::mult(bitblock256_t arg1, bitblock256_t arg2)
716{
717        bitblock256_t loMask = simd256<256>::lomask();
718        bitblock256_t arg1_low = simd_and(arg1, loMask);
719        bitblock256_t arg1_high = simd256<256>::srli<(128)>(arg1);
720        bitblock256_t arg2_low = simd_and(arg2, loMask);
721        bitblock256_t arg2_high = simd256<256>::srli<(128)>(arg2);
722        bitblock256_t tmpAns1 = simd256<(128)>::umult(arg1_low, arg2_low);
723        bitblock256_t tmpAns2 = simd256<256>::slli<(128)>(simd256<(128)>::umult(arg1_low, arg2_high));
724        bitblock256_t tmpAns3 = simd256<256>::slli<(128)>(simd256<(128)>::umult(arg1_high, arg2_low));
725        return simd256<256>::add(tmpAns1, simd256<256>::add(tmpAns2, tmpAns3));
726}
727
728//The total number of operations is 1
729template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::gt(bitblock256_t arg1, bitblock256_t arg2)
730{
731        return simd_andc(arg2, arg1);
732}
733
734//The total number of operations is 30
735template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::gt(bitblock256_t arg1, bitblock256_t arg2)
736{
737        bitblock256_t hiAns = simd256<(1)>::gt(arg1, arg2);
738        bitblock256_t loAns = simd256<(1)>::ugt(arg1, arg2);
739        bitblock256_t mask = simd_and(loAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
740        mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
741        return simd_or(simd256<2>::srai<(1)>(hiAns), mask);
742}
743
744//The total number of operations is 28
745template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::gt(bitblock256_t arg1, bitblock256_t arg2)
746{
747        bitblock256_t high_bit = simd256<4>::constant<(8)>();
748        return simd256<4>::ugt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
749}
750
751//The total number of operations is 8
752template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::gt(bitblock256_t arg1, bitblock256_t arg2)
753{
754        return avx_general_combine256(_mm_cmpgt_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpgt_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
755}
756
757//The total number of operations is 8
758template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::gt(bitblock256_t arg1, bitblock256_t arg2)
759{
760        return avx_general_combine256(_mm_cmpgt_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpgt_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
761}
762
763//The total number of operations is 8
764template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::gt(bitblock256_t arg1, bitblock256_t arg2)
765{
766        return avx_general_combine256(_mm_cmpgt_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpgt_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
767}
768
769//The total number of operations is 8
770template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::gt(bitblock256_t arg1, bitblock256_t arg2)
771{
772        return avx_general_combine256(_mm_cmpgt_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpgt_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
773}
774
775//The total number of operations is 151
776template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::gt(bitblock256_t arg1, bitblock256_t arg2)
777{
778        bitblock256_t hiAns = simd256<(64)>::gt(arg1, arg2);
779        bitblock256_t loAns = simd256<(64)>::ugt(arg1, arg2);
780        bitblock256_t mask = simd_and(loAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
781        mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
782        return simd_or(simd256<128>::srai<(64)>(hiAns), mask);
783}
784
785//The total number of operations is 646
786template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::gt(bitblock256_t arg1, bitblock256_t arg2)
787{
788        bitblock256_t hiAns = simd256<(128)>::gt(arg1, arg2);
789        bitblock256_t loAns = simd256<(128)>::ugt(arg1, arg2);
790        bitblock256_t mask = simd_and(loAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
791        mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
792        return simd_or(simd256<256>::srai<(128)>(hiAns), mask);
793}
794
795//The total number of operations is 978
796template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umult(bitblock256_t arg1, bitblock256_t arg2)
797{
798        bitblock256_t loMask = simd256<(2)>::lomask();
799        bitblock256_t tmpAns1 = simd256<(2)>::umult(simd_and(loMask, arg1), simd_and(loMask, arg2));
800        bitblock256_t tmpAns2 = simd256<(2)>::umult(simd_and(loMask, simd256<(4)>::srli<(2)>(arg1)), simd_and(loMask, simd256<(4)>::srli<(2)>(arg2)));
801        return simd_or(tmpAns1, simd256<(4)>::slli<(2)>(tmpAns2));
802}
803
804//The total number of operations is 476
805template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umult(bitblock256_t arg1, bitblock256_t arg2)
806{
807        bitblock256_t loMask = simd256<(4)>::lomask();
808        bitblock256_t tmpAns1 = simd256<(4)>::umult(simd_and(loMask, arg1), simd_and(loMask, arg2));
809        bitblock256_t tmpAns2 = simd256<(4)>::umult(simd_and(loMask, simd256<(8)>::srli<(4)>(arg1)), simd_and(loMask, simd256<(8)>::srli<(4)>(arg2)));
810        return simd_or(tmpAns1, simd256<(8)>::slli<(4)>(tmpAns2));
811}
812
813//The total number of operations is 225
814template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umult(bitblock256_t arg1, bitblock256_t arg2)
815{
816        bitblock256_t loMask = simd256<(8)>::lomask();
817        bitblock256_t tmpAns1 = simd256<(8)>::umult(simd_and(loMask, arg1), simd_and(loMask, arg2));
818        bitblock256_t tmpAns2 = simd256<(8)>::umult(simd_and(loMask, simd256<(16)>::srli<(8)>(arg1)), simd_and(loMask, simd256<(16)>::srli<(8)>(arg2)));
819        return simd_or(tmpAns1, simd256<(16)>::slli<(8)>(tmpAns2));
820}
821
822//The total number of operations is 101
823template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umult(bitblock256_t arg1, bitblock256_t arg2)
824{
825        bitblock256_t loMask = simd256<(16)>::lomask();
826        bitblock256_t tmpAns1 = simd256<(16)>::umult(simd_and(loMask, arg1), simd_and(loMask, arg2));
827        bitblock256_t tmpAns2 = simd256<(16)>::umult(simd_and(loMask, simd256<(32)>::srli<(16)>(arg1)), simd_and(loMask, simd256<(32)>::srli<(16)>(arg2)));
828        return simd_or(tmpAns1, simd256<(32)>::slli<(16)>(tmpAns2));
829}
830
831//The total number of operations is 39
832template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umult(bitblock256_t arg1, bitblock256_t arg2)
833{
834        bitblock256_t loMask = simd256<(32)>::lomask();
835        bitblock256_t tmpAns1 = simd256<(32)>::umult(simd_and(loMask, arg1), simd_and(loMask, arg2));
836        bitblock256_t tmpAns2 = simd256<(32)>::umult(simd_and(loMask, simd256<(64)>::srli<(32)>(arg1)), simd_and(loMask, simd256<(64)>::srli<(32)>(arg2)));
837        return simd_or(tmpAns1, simd256<(64)>::slli<(32)>(tmpAns2));
838}
839
840//The total number of operations is 8
841template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umult(bitblock256_t arg1, bitblock256_t arg2)
842{
843        return avx_general_combine256(_mm_mul_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_mul_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
844}
845
846//The total number of operations is 237
847template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umult(bitblock256_t arg1, bitblock256_t arg2)
848{
849        bitblock256_t loMask1 = simd256<(128)>::lomask();
850        bitblock256_t arg11 = simd_and(arg1, loMask1);
851        bitblock256_t arg22 = simd_and(arg2, loMask1);
852        bitblock256_t loMask2 = simd256<64>::lomask();
853        bitblock256_t arg1_low = simd_and(arg11, loMask2);
854        bitblock256_t arg1_high = simd256<64>::srli<(32)>(arg11);
855        bitblock256_t arg2_low = simd_and(arg22, loMask2);
856        bitblock256_t arg2_high = simd256<64>::srli<(32)>(arg22);
857        bitblock256_t tmpAns1 = simd256<(32)>::umult(arg1_low, arg2_low);
858        bitblock256_t tmpAns2 = simd256<(128)>::slli<(32)>(simd256<(32)>::umult(arg1_low, arg2_high));
859        bitblock256_t tmpAns3 = simd256<(128)>::slli<(32)>(simd256<(32)>::umult(arg1_high, arg2_low));
860        bitblock256_t tmpAns4 = simd256<(128)>::slli<64>(simd256<(32)>::umult(arg1_high, arg2_high));
861        return simd256<(128)>::add(tmpAns1, simd256<(128)>::add(tmpAns2, simd256<(128)>::add(tmpAns3, tmpAns4)));
862}
863
864//The total number of operations is 1521
865template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umult(bitblock256_t arg1, bitblock256_t arg2)
866{
867        bitblock256_t loMask1 = simd256<(256)>::lomask();
868        bitblock256_t arg11 = simd_and(arg1, loMask1);
869        bitblock256_t arg22 = simd_and(arg2, loMask1);
870        bitblock256_t loMask2 = simd256<128>::lomask();
871        bitblock256_t arg1_low = simd_and(arg11, loMask2);
872        bitblock256_t arg1_high = simd256<128>::srli<(64)>(arg11);
873        bitblock256_t arg2_low = simd_and(arg22, loMask2);
874        bitblock256_t arg2_high = simd256<128>::srli<(64)>(arg22);
875        bitblock256_t tmpAns1 = simd256<(64)>::umult(arg1_low, arg2_low);
876        bitblock256_t tmpAns2 = simd256<(256)>::slli<(64)>(simd256<(64)>::umult(arg1_low, arg2_high));
877        bitblock256_t tmpAns3 = simd256<(256)>::slli<(64)>(simd256<(64)>::umult(arg1_high, arg2_low));
878        bitblock256_t tmpAns4 = simd256<(256)>::slli<128>(simd256<(64)>::umult(arg1_high, arg2_high));
879        return simd256<(256)>::add(tmpAns1, simd256<(256)>::add(tmpAns2, simd256<(256)>::add(tmpAns3, tmpAns4)));
880}
881
882//The total number of operations is 1
883template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ult(bitblock256_t arg1, bitblock256_t arg2)
884{
885        return simd_andc(arg2, arg1);
886}
887
888//The total number of operations is 29
889template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ult(bitblock256_t arg1, bitblock256_t arg2)
890{
891        bitblock256_t tmpAns = simd256<(1)>::ult(arg1, arg2);
892        bitblock256_t mask = simd_and(tmpAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
893        mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
894        return simd_or(simd256<2>::srai<(1)>(tmpAns), mask);
895}
896
897//The total number of operations is 48
898template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ult(bitblock256_t arg1, bitblock256_t arg2)
899{
900        return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::ult(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::ult(simd_andc(arg1, simd256<(8)>::himask()), simd_andc(arg2, simd256<(8)>::himask())));
901}
902
903//The total number of operations is 21
904template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ult(bitblock256_t arg1, bitblock256_t arg2)
905{
906        bitblock256_t high_bit = simd256<8>::constant<(128)>();
907        return simd256<8>::lt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
908}
909
910//The total number of operations is 21
911template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ult(bitblock256_t arg1, bitblock256_t arg2)
912{
913        bitblock256_t high_bit = simd256<16>::constant<(32768)>();
914        return simd256<16>::lt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
915}
916
917//The total number of operations is 21
918template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ult(bitblock256_t arg1, bitblock256_t arg2)
919{
920        bitblock256_t high_bit = simd256<32>::constant<(2147483648UL)>();
921        return simd256<32>::lt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
922}
923
924//The total number of operations is 21
925template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ult(bitblock256_t arg1, bitblock256_t arg2)
926{
927        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808UL)>();
928        return simd256<64>::lt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
929}
930
931//The total number of operations is 154
932template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ult(bitblock256_t arg1, bitblock256_t arg2)
933{
934        bitblock256_t tmpAns = simd256<(64)>::ult(arg1, arg2);
935        bitblock256_t mask = simd_and(tmpAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
936        mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
937        return simd_or(simd256<128>::srai<(64)>(tmpAns), mask);
938}
939
940//The total number of operations is 496
941template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ult(bitblock256_t arg1, bitblock256_t arg2)
942{
943        return simd_and(simd256<256>::srai<(255)>(simd_or(simd_and(simd_not(arg1), arg2), simd_and(simd_not(simd_xor(arg1, arg2)), simd256<256>::sub(arg1, arg2)))), simd_not(simd256<256>::eq(arg1, arg2)));
944}
945
946//The total number of operations is 1
947template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::lt(bitblock256_t arg1, bitblock256_t arg2)
948{
949        return simd_andc(arg1, arg2);
950}
951
952//The total number of operations is 30
953template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lt(bitblock256_t arg1, bitblock256_t arg2)
954{
955        bitblock256_t hiAns = simd256<(1)>::lt(arg1, arg2);
956        bitblock256_t loAns = simd256<(1)>::ult(arg1, arg2);
957        bitblock256_t mask = simd_and(loAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
958        mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
959        return simd_or(simd256<2>::srai<(1)>(hiAns), mask);
960}
961
962//The total number of operations is 50
963template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lt(bitblock256_t arg1, bitblock256_t arg2)
964{
965        bitblock256_t high_bit = simd256<4>::constant<(8)>();
966        return simd256<4>::ult(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
967}
968
969//The total number of operations is 19
970template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lt(bitblock256_t arg1, bitblock256_t arg2)
971{
972        return simd_and(simd_not(simd256<8>::gt(arg1, arg2)), simd_not(simd256<8>::eq(arg1, arg2)));
973}
974
975//The total number of operations is 19
976template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lt(bitblock256_t arg1, bitblock256_t arg2)
977{
978        return simd_and(simd_not(simd256<16>::gt(arg1, arg2)), simd_not(simd256<16>::eq(arg1, arg2)));
979}
980
981//The total number of operations is 19
982template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lt(bitblock256_t arg1, bitblock256_t arg2)
983{
984        return simd_and(simd_not(simd256<32>::gt(arg1, arg2)), simd_not(simd256<32>::eq(arg1, arg2)));
985}
986
987//The total number of operations is 19
988template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lt(bitblock256_t arg1, bitblock256_t arg2)
989{
990        return simd_and(simd_not(simd256<64>::gt(arg1, arg2)), simd_not(simd256<64>::eq(arg1, arg2)));
991}
992
993//The total number of operations is 173
994template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lt(bitblock256_t arg1, bitblock256_t arg2)
995{
996        bitblock256_t hiAns = simd256<(64)>::lt(arg1, arg2);
997        bitblock256_t loAns = simd256<(64)>::ult(arg1, arg2);
998        bitblock256_t mask = simd_and(loAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
999        mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
1000        return simd_or(simd256<128>::srai<(64)>(hiAns), mask);
1001}
1002
1003//The total number of operations is 679
1004template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lt(bitblock256_t arg1, bitblock256_t arg2)
1005{
1006        bitblock256_t hiAns = simd256<(128)>::lt(arg1, arg2);
1007        bitblock256_t loAns = simd256<(128)>::ult(arg1, arg2);
1008        bitblock256_t mask = simd_and(loAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
1009        mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
1010        return simd_or(simd256<256>::srai<(128)>(hiAns), mask);
1011}
1012
1013//The total number of operations is 7
1014template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srli(bitblock256_t arg1)
1015{
1016        return simd_and(simd256<32>::srli<sh>(arg1), simd256<2>::constant<((3)>>sh)>());
1017}
1018
1019//The total number of operations is 7
1020template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srli(bitblock256_t arg1)
1021{
1022        return simd_and(simd256<32>::srli<sh>(arg1), simd256<4>::constant<((15)>>sh)>());
1023}
1024
1025//The total number of operations is 7
1026template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srli(bitblock256_t arg1)
1027{
1028        return simd_and(simd256<32>::srli<sh>(arg1), simd256<8>::constant<((255)>>sh)>());
1029}
1030
1031//The total number of operations is 6
1032template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srli(bitblock256_t arg1)
1033{
1034        return avx_general_combine256(_mm_srli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
1035}
1036
1037//The total number of operations is 6
1038template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srli(bitblock256_t arg1)
1039{
1040        return avx_general_combine256(_mm_srli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
1041}
1042
1043//The total number of operations is 6
1044template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srli(bitblock256_t arg1)
1045{
1046        return avx_general_combine256(_mm_srli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srli_epi64(avx_select_lo128(arg1), (int32_t)(sh)));
1047}
1048
1049//The total number of operations is 19
1050template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srli(bitblock256_t arg1)
1051{
1052        return (((sh%8) == 0) ? avx_byte_shift_right(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::srli<(sh-64)>(avx_byte_shift_right(arg1, 8)) : simd_or(simd256<64>::srli<sh>(arg1), avx_byte_shift_right(simd256<64>::slli<(64-sh)>(arg1), 8))));
1053}
1054
1055//The total number of operations is 41
1056template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srli(bitblock256_t arg1)
1057{
1058        return ((sh < 128) ? simd_or(simd256<128>::srli<sh>(arg1), simd256<128>::slli<(128-sh)>(((bitblock256_t)_mm256_castsi128_si256(avx_select_hi128(arg1))))) : simd256<128>::srli<(sh-128)>(avx_move_hi128_to_lo128(arg1)));
1059}
1060
1061//The total number of operations is 1
1062template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ctz(bitblock256_t arg1)
1063{
1064        return simd_not(arg1);
1065}
1066
1067//The total number of operations is 34
1068template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ctz(bitblock256_t arg1)
1069{
1070        return simd256<2>::popcount(simd_andc(simd256<2>::sub(arg1, simd256<2>::constant<1>()), arg1));
1071}
1072
1073//The total number of operations is 52
1074template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ctz(bitblock256_t arg1)
1075{
1076        return simd256<4>::popcount(simd_andc(simd256<4>::sub(arg1, simd256<4>::constant<1>()), arg1));
1077}
1078
1079//The total number of operations is 56
1080template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ctz(bitblock256_t arg1)
1081{
1082        return simd256<8>::popcount(simd_andc(simd256<8>::sub(arg1, simd256<8>::constant<1>()), arg1));
1083}
1084
1085//The total number of operations is 71
1086template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ctz(bitblock256_t arg1)
1087{
1088        return simd256<16>::popcount(simd_andc(simd256<16>::sub(arg1, simd256<16>::constant<1>()), arg1));
1089}
1090
1091//The total number of operations is 86
1092template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ctz(bitblock256_t arg1)
1093{
1094        return simd256<32>::popcount(simd_andc(simd256<32>::sub(arg1, simd256<32>::constant<1>()), arg1));
1095}
1096
1097//The total number of operations is 64
1098template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ctz(bitblock256_t arg1)
1099{
1100        return simd256<64>::popcount(simd_andc(simd256<64>::sub(arg1, simd256<64>::constant<1>()), arg1));
1101}
1102
1103//The total number of operations is 164
1104template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ctz(bitblock256_t arg1)
1105{
1106        return simd256<128>::popcount(simd_andc(simd256<128>::sub(arg1, simd256<128>::constant<1>()), arg1));
1107}
1108
1109//The total number of operations is 343
1110template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ctz(bitblock256_t arg1)
1111{
1112        return simd256<256>::popcount(simd_andc(simd256<256>::sub(arg1, simd256<256>::constant<1>()), arg1));
1113}
1114
1115//The total number of operations is 1
1116template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ugt(bitblock256_t arg1, bitblock256_t arg2)
1117{
1118        return simd_andc(arg1, arg2);
1119}
1120
1121//The total number of operations is 29
1122template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ugt(bitblock256_t arg1, bitblock256_t arg2)
1123{
1124        bitblock256_t tmpAns = simd256<(1)>::ugt(arg1, arg2);
1125        bitblock256_t mask = simd_and(tmpAns, simd256<2>::srli<(1)>(simd256<(1)>::eq(arg1, arg2)));
1126        mask = simd_or(mask, simd256<2>::slli<(1)>(mask));
1127        return simd_or(simd256<2>::srai<(1)>(tmpAns), mask);
1128}
1129
1130//The total number of operations is 26
1131template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ugt(bitblock256_t arg1, bitblock256_t arg2)
1132{
1133        return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::ugt(simd_and(simd256<(8)>::himask(), arg1), arg2), simd256<(8)>::ugt(simd_andc(arg1, simd256<(8)>::himask()), simd_andc(arg2, simd256<(8)>::himask())));
1134}
1135
1136//The total number of operations is 10
1137template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ugt(bitblock256_t arg1, bitblock256_t arg2)
1138{
1139        bitblock256_t high_bit = simd256<8>::constant<(128)>();
1140        return simd256<8>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1141}
1142
1143//The total number of operations is 10
1144template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ugt(bitblock256_t arg1, bitblock256_t arg2)
1145{
1146        bitblock256_t high_bit = simd256<16>::constant<(32768)>();
1147        return simd256<16>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1148}
1149
1150//The total number of operations is 10
1151template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ugt(bitblock256_t arg1, bitblock256_t arg2)
1152{
1153        bitblock256_t high_bit = simd256<32>::constant<(2147483648UL)>();
1154        return simd256<32>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1155}
1156
1157//The total number of operations is 10
1158template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ugt(bitblock256_t arg1, bitblock256_t arg2)
1159{
1160        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808UL)>();
1161        return simd256<64>::gt(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1162}
1163
1164//The total number of operations is 143
1165template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ugt(bitblock256_t arg1, bitblock256_t arg2)
1166{
1167        bitblock256_t tmpAns = simd256<(64)>::ugt(arg1, arg2);
1168        bitblock256_t mask = simd_and(tmpAns, simd256<128>::srli<(64)>(simd256<(64)>::eq(arg1, arg2)));
1169        mask = simd_or(mask, simd256<128>::slli<(64)>(mask));
1170        return simd_or(simd256<128>::srai<(64)>(tmpAns), mask);
1171}
1172
1173//The total number of operations is 495
1174template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ugt(bitblock256_t arg1, bitblock256_t arg2)
1175{
1176        bitblock256_t tmpAns = simd256<(128)>::ugt(arg1, arg2);
1177        bitblock256_t mask = simd_and(tmpAns, simd256<256>::srli<(128)>(simd256<(128)>::eq(arg1, arg2)));
1178        mask = simd_or(mask, simd256<256>::slli<(128)>(mask));
1179        return simd_or(simd256<256>::srai<(128)>(tmpAns), mask);
1180}
1181
1182//The total number of operations is 9
1183template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::xor_hl(bitblock256_t arg1)
1184{
1185        return simd_xor(simd256<2>::srli<(1)>(arg1), simd_and(arg1, simd256<2>::lomask()));
1186}
1187
1188//The total number of operations is 9
1189template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::xor_hl(bitblock256_t arg1)
1190{
1191        return simd_xor(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));
1192}
1193
1194//The total number of operations is 9
1195template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::xor_hl(bitblock256_t arg1)
1196{
1197        return simd_xor(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));
1198}
1199
1200//The total number of operations is 8
1201template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::xor_hl(bitblock256_t arg1)
1202{
1203        return simd_xor(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));
1204}
1205
1206//The total number of operations is 8
1207template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::xor_hl(bitblock256_t arg1)
1208{
1209        return simd_xor(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));
1210}
1211
1212//The total number of operations is 8
1213template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::xor_hl(bitblock256_t arg1)
1214{
1215        return simd_xor(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));
1216}
1217
1218//The total number of operations is 21
1219template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::xor_hl(bitblock256_t arg1)
1220{
1221        return simd_xor(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));
1222}
1223
1224//The total number of operations is 43
1225template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::xor_hl(bitblock256_t arg1)
1226{
1227        return simd_xor(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
1228}
1229
1230//The total number of operations is 0
1231template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::popcount(bitblock256_t arg1)
1232{
1233        return arg1;
1234}
1235
1236//The total number of operations is 15
1237template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::popcount(bitblock256_t arg1)
1238{
1239        return simd256<2>::add_hl(simd256<(1)>::popcount(arg1));
1240}
1241
1242//The total number of operations is 31
1243template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::popcount(bitblock256_t arg1)
1244{
1245        return simd256<4>::add_hl(simd256<(2)>::popcount(arg1));
1246}
1247
1248//The total number of operations is 47
1249template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::popcount(bitblock256_t arg1)
1250{
1251        return simd256<8>::add_hl(simd256<(4)>::popcount(arg1));
1252}
1253
1254//The total number of operations is 62
1255template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::popcount(bitblock256_t arg1)
1256{
1257        return simd256<16>::add_hl(simd256<(8)>::popcount(arg1));
1258}
1259
1260//The total number of operations is 77
1261template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::popcount(bitblock256_t arg1)
1262{
1263        return simd256<32>::add_hl(simd256<(16)>::popcount(arg1));
1264}
1265
1266//The total number of operations is 55
1267template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::popcount(bitblock256_t arg1)
1268{
1269        bitblock256_t tmpAns = simd256<8>::popcount(arg1);
1270        return avx_general_combine256(_mm_sad_epu8(avx_select_hi128(tmpAns), _mm_set1_epi32((int32_t)(0))), _mm_sad_epu8(avx_select_lo128(tmpAns), _mm_set1_epi32((int32_t)(0))));
1271}
1272
1273//The total number of operations is 119
1274template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::popcount(bitblock256_t arg1)
1275{
1276        return simd256<128>::add_hl(simd256<(64)>::popcount(arg1));
1277}
1278
1279//The total number of operations is 205
1280template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::popcount(bitblock256_t arg1)
1281{
1282        bitblock256_t tmpAns = simd256<(128)>::popcount(arg1);
1283        return simd256<(128)>::add(simd_and(tmpAns, simd256<256>::lomask()), simd256<256>::srli<(128)>(tmpAns));
1284}
1285
1286//The total number of operations is 18
1287template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::neg(bitblock256_t arg1)
1288{
1289        return simd256<2>::sub(simd256<2>::constant<0>(), arg1);
1290}
1291
1292//The total number of operations is 20
1293template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::neg(bitblock256_t arg1)
1294{
1295        return simd256<4>::sub(simd256<4>::constant<0>(), arg1);
1296}
1297
1298//The total number of operations is 8
1299template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::neg(bitblock256_t arg1)
1300{
1301        return simd256<8>::sub(simd256<8>::constant<0>(), arg1);
1302}
1303
1304//The total number of operations is 8
1305template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::neg(bitblock256_t arg1)
1306{
1307        return simd256<16>::sub(simd256<16>::constant<0>(), arg1);
1308}
1309
1310//The total number of operations is 8
1311template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::neg(bitblock256_t arg1)
1312{
1313        return simd256<32>::sub(simd256<32>::constant<0>(), arg1);
1314}
1315
1316//The total number of operations is 8
1317template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::neg(bitblock256_t arg1)
1318{
1319        return simd256<64>::sub(simd256<64>::constant<0>(), arg1);
1320}
1321
1322//The total number of operations is 44
1323template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::neg(bitblock256_t arg1)
1324{
1325        return simd256<128>::sub(simd256<128>::constant<0>(), arg1);
1326}
1327
1328//The total number of operations is 137
1329template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::neg(bitblock256_t arg1)
1330{
1331        return simd256<256>::sub(simd256<256>::constant<0>(), arg1);
1332}
1333
1334//The total number of operations is 7
1335template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::slli(bitblock256_t arg1)
1336{
1337        return simd_and(simd256<32>::slli<sh>(arg1), simd256<2>::constant<(((3)<<sh)&(3))>());
1338}
1339
1340//The total number of operations is 7
1341template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::slli(bitblock256_t arg1)
1342{
1343        return simd_and(simd256<32>::slli<sh>(arg1), simd256<4>::constant<(((15)<<sh)&(15))>());
1344}
1345
1346//The total number of operations is 7
1347template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::slli(bitblock256_t arg1)
1348{
1349        return simd_and(simd256<32>::slli<sh>(arg1), simd256<8>::constant<(((255)<<sh)&(255))>());
1350}
1351
1352//The total number of operations is 6
1353template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::slli(bitblock256_t arg1)
1354{
1355        return avx_general_combine256(_mm_slli_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
1356}
1357
1358//The total number of operations is 6
1359template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::slli(bitblock256_t arg1)
1360{
1361        return avx_general_combine256(_mm_slli_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
1362}
1363
1364//The total number of operations is 6
1365template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::slli(bitblock256_t arg1)
1366{
1367        return avx_general_combine256(_mm_slli_epi64(avx_select_hi128(arg1), (int32_t)(sh)), _mm_slli_epi64(avx_select_lo128(arg1), (int32_t)(sh)));
1368}
1369
1370//The total number of operations is 19
1371template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::slli(bitblock256_t arg1)
1372{
1373        return (((sh%8) == 0) ? avx_byte_shift_left(arg1, (sh/8)) : ((sh >= 64) ? simd256<64>::slli<(sh-64)>(avx_byte_shift_left(arg1, 8)) : simd_or(simd256<64>::slli<sh>(arg1), avx_byte_shift_left(simd256<64>::srli<(64-sh)>(arg1), 8))));
1374}
1375
1376//The total number of operations is 40
1377template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::slli(bitblock256_t arg1)
1378{
1379        return ((sh < 128) ? simd_or(simd256<128>::slli<sh>(arg1), avx_move_lo128_to_hi128(simd256<128>::srli<(128-sh)>(arg1))) : simd256<128>::slli<(sh-128)>(avx_move_lo128_to_hi128(arg1)));
1380}
1381
1382//The total number of operations is 3
1383template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
1384{
1385        return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
1386}
1387
1388//The total number of operations is 13
1389template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
1390{
1391        return simd256<(1)>::ifh(simd256<1>::ifh(simd256<2>::himask(), arg1, simd256<2>::srli<(1)>(arg1)), arg2, arg3);
1392}
1393
1394//The total number of operations is 23
1395template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
1396{
1397        return simd256<(2)>::ifh(simd256<1>::ifh(simd256<4>::himask(), arg1, simd256<4>::srli<(2)>(arg1)), arg2, arg3);
1398}
1399
1400//The total number of operations is 11
1401template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
1402{
1403        return simd256<1>::ifh(simd256<8>::gt(simd256<8>::constant<0>(), arg1), arg2, arg3);
1404}
1405
1406//The total number of operations is 11
1407template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
1408{
1409        return simd256<1>::ifh(simd256<16>::gt(simd256<16>::constant<0>(), arg1), arg2, arg3);
1410}
1411
1412//The total number of operations is 11
1413template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
1414{
1415        return simd256<1>::ifh(simd256<32>::gt(simd256<32>::constant<0>(), arg1), arg2, arg3);
1416}
1417
1418//The total number of operations is 1
1419template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
1420{
1421        return (bitblock256_t)_mm256_blendv_pd((__m256d)(arg3), (__m256d)(arg2), (__m256d)(arg1));
1422}
1423
1424//The total number of operations is 23
1425template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
1426{
1427        return simd256<(64)>::ifh(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg1)), arg2, arg3);
1428}
1429
1430//The total number of operations is 67
1431template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::ifh(bitblock256_t arg1, bitblock256_t arg2, bitblock256_t arg3)
1432{
1433        return simd256<(128)>::ifh(simd256<1>::ifh(simd256<256>::himask(), arg1, simd256<256>::srli<(128)>(arg1)), arg2, arg3);
1434}
1435
1436//The total number of operations is 1
1437template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::sub(bitblock256_t arg1, bitblock256_t arg2)
1438{
1439        return simd_xor(arg1, arg2);
1440}
1441
1442//The total number of operations is 18
1443template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::sub(bitblock256_t arg1, bitblock256_t arg2)
1444{
1445        bitblock256_t ans = simd256<(1)>::sub(arg1, arg2);
1446        bitblock256_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_and(simd_not(simd_xor(arg1, arg2)), ans));
1447        bitblock256_t loMask = simd256<2>::lomask();
1448        bitblock256_t borrow = simd256<2>::slli<1>(simd_and(borrowMask, loMask));
1449        return simd256<1>::ifh(loMask, ans, simd256<(1)>::sub(ans, borrow));
1450}
1451
1452//The total number of operations is 20
1453template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::sub(bitblock256_t arg1, bitblock256_t arg2)
1454{
1455        return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::sub(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::sub(arg1, arg2));
1456}
1457
1458//The total number of operations is 8
1459template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::sub(bitblock256_t arg1, bitblock256_t arg2)
1460{
1461        return avx_general_combine256(_mm_sub_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1462}
1463
1464//The total number of operations is 8
1465template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::sub(bitblock256_t arg1, bitblock256_t arg2)
1466{
1467        return avx_general_combine256(_mm_sub_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1468}
1469
1470//The total number of operations is 8
1471template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::sub(bitblock256_t arg1, bitblock256_t arg2)
1472{
1473        return avx_general_combine256(_mm_sub_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1474}
1475
1476//The total number of operations is 8
1477template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::sub(bitblock256_t arg1, bitblock256_t arg2)
1478{
1479        return avx_general_combine256(_mm_sub_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_sub_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1480}
1481
1482//The total number of operations is 44
1483template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::sub(bitblock256_t arg1, bitblock256_t arg2)
1484{
1485        bitblock256_t ans = simd256<(64)>::sub(arg1, arg2);
1486        bitblock256_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_and(simd_not(simd_xor(arg1, arg2)), ans));
1487        bitblock256_t loMask = simd256<128>::lomask();
1488        bitblock256_t borrow = simd256<128>::slli<1>(simd_and(borrowMask, loMask));
1489        return simd256<1>::ifh(loMask, ans, simd256<(64)>::sub(ans, borrow));
1490}
1491
1492//The total number of operations is 137
1493template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::sub(bitblock256_t arg1, bitblock256_t arg2)
1494{
1495        bitblock256_t ans = simd256<(128)>::sub(arg1, arg2);
1496        bitblock256_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_and(simd_not(simd_xor(arg1, arg2)), ans));
1497        bitblock256_t loMask = simd256<256>::lomask();
1498        bitblock256_t borrow = simd256<256>::slli<1>(simd_and(borrowMask, loMask));
1499        return simd256<1>::ifh(loMask, ans, simd256<(128)>::sub(ans, borrow));
1500}
1501
1502//The total number of operations is 15
1503template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add_hl(bitblock256_t arg1)
1504{
1505        return simd256<16>::sub(arg1, simd_and(simd256<2>::lomask(), simd256<16>::srli<1>(arg1)));
1506}
1507
1508//The total number of operations is 16
1509template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add_hl(bitblock256_t arg1)
1510{
1511        return simd256<(8)>::add(simd256<4>::srli<(2)>(arg1), simd_and(arg1, simd256<4>::lomask()));
1512}
1513
1514//The total number of operations is 16
1515template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add_hl(bitblock256_t arg1)
1516{
1517        return simd256<(16)>::add(simd256<8>::srli<(4)>(arg1), simd_and(arg1, simd256<8>::lomask()));
1518}
1519
1520//The total number of operations is 15
1521template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add_hl(bitblock256_t arg1)
1522{
1523        return simd256<(32)>::add(simd256<16>::srli<(8)>(arg1), simd_and(arg1, simd256<16>::lomask()));
1524}
1525
1526//The total number of operations is 15
1527template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add_hl(bitblock256_t arg1)
1528{
1529        return simd256<(64)>::add(simd256<32>::srli<(16)>(arg1), simd_and(arg1, simd256<32>::lomask()));
1530}
1531
1532//The total number of operations is 15
1533template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add_hl(bitblock256_t arg1)
1534{
1535        return simd256<64>::add(simd256<64>::srli<(32)>(arg1), simd_and(arg1, simd256<64>::lomask()));
1536}
1537
1538//The total number of operations is 64
1539template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add_hl(bitblock256_t arg1)
1540{
1541        return simd256<128>::add(simd256<128>::srli<(64)>(arg1), simd_and(arg1, simd256<128>::lomask()));
1542}
1543
1544//The total number of operations is 179
1545template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add_hl(bitblock256_t arg1)
1546{
1547        return simd256<256>::add(simd256<256>::srli<(128)>(arg1), simd_and(arg1, simd256<256>::lomask()));
1548}
1549
1550//The total number of operations is 0
1551template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::constant()
1552{
1553        return simd256<32>::constant<(-1*val)>();
1554}
1555
1556//The total number of operations is 0
1557template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::constant()
1558{
1559        return ((val < 0) ? simd256<(4)>::constant<((val<<2)|(val^(-4)))>() : simd256<(4)>::constant<((val<<2)|val)>());
1560}
1561
1562//The total number of operations is 0
1563template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::constant()
1564{
1565        return ((val < 0) ? simd256<(8)>::constant<((val<<4)|(val^(-16)))>() : simd256<(8)>::constant<((val<<4)|val)>());
1566}
1567
1568//The total number of operations is 0
1569template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::constant()
1570{
1571        return (bitblock256_t)_mm256_set1_epi8((int32_t)(val));
1572}
1573
1574//The total number of operations is 0
1575template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::constant()
1576{
1577        return (bitblock256_t)_mm256_set1_epi16((int32_t)(val));
1578}
1579
1580//The total number of operations is 0
1581template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::constant()
1582{
1583        return (bitblock256_t)_mm256_set1_epi32((int32_t)(val));
1584}
1585
1586//The total number of operations is 0
1587template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::constant()
1588{
1589        return ((bitblock256_t)_mm256_set_epi32((int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val), (int32_t)((val>>32)), (int32_t)(val)));
1590}
1591
1592//The total number of operations is 0
1593template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::constant()
1594{
1595        return ((bitblock256_t)_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val), (int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val)));
1596}
1597
1598//The total number of operations is 0
1599template <> template <uint64_t val> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::constant()
1600{
1601        return ((bitblock256_t)_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)((val>>32)), (int32_t)(val)));
1602}
1603
1604//The total number of operations is 1
1605template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::min(bitblock256_t arg1, bitblock256_t arg2)
1606{
1607        return simd_or(arg1, arg2);
1608}
1609
1610//The total number of operations is 29
1611template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::min(bitblock256_t arg1, bitblock256_t arg2)
1612{
1613        bitblock256_t hiAns = simd256<(1)>::min(arg1, arg2);
1614        bitblock256_t loAns = simd256<(1)>::umin(arg1, arg2);
1615        bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(hiAns, arg1));
1616        bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(hiAns, arg2));
1617        return simd256<1>::ifh(simd256<2>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2));
1618}
1619
1620//The total number of operations is 23
1621template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::min(bitblock256_t arg1, bitblock256_t arg2)
1622{
1623        bitblock256_t high_bit = simd256<4>::constant<(8)>();
1624        return simd_xor(simd256<4>::umin(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
1625}
1626
1627//The total number of operations is 8
1628template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::min(bitblock256_t arg1, bitblock256_t arg2)
1629{
1630        return avx_general_combine256(_mm_min_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1631}
1632
1633//The total number of operations is 8
1634template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::min(bitblock256_t arg1, bitblock256_t arg2)
1635{
1636        return avx_general_combine256(_mm_min_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1637}
1638
1639//The total number of operations is 8
1640template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::min(bitblock256_t arg1, bitblock256_t arg2)
1641{
1642        return avx_general_combine256(_mm_min_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1643}
1644
1645//The total number of operations is 11
1646template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::min(bitblock256_t arg1, bitblock256_t arg2)
1647{
1648        return simd256<1>::ifh(simd256<64>::gt(arg1, arg2), arg2, arg1);
1649}
1650
1651//The total number of operations is 88
1652template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::min(bitblock256_t arg1, bitblock256_t arg2)
1653{
1654        bitblock256_t hiAns = simd256<(64)>::min(arg1, arg2);
1655        bitblock256_t loAns = simd256<(64)>::umin(arg1, arg2);
1656        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(hiAns, arg1));
1657        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(hiAns, arg2));
1658        return simd256<1>::ifh(simd256<128>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2));
1659}
1660
1661//The total number of operations is 352
1662template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::min(bitblock256_t arg1, bitblock256_t arg2)
1663{
1664        bitblock256_t hiAns = simd256<(128)>::min(arg1, arg2);
1665        bitblock256_t loAns = simd256<(128)>::umin(arg1, arg2);
1666        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(hiAns, arg1));
1667        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(hiAns, arg2));
1668        return simd256<1>::ifh(simd256<256>::himask(), hiAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, loAns, arg1), arg2));
1669}
1670
1671//The total number of operations is 0
1672template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::lomask()
1673{
1674        return simd256<2>::constant<(1)>();
1675}
1676
1677//The total number of operations is 0
1678template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::lomask()
1679{
1680        return simd256<4>::constant<(3)>();
1681}
1682
1683//The total number of operations is 0
1684template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::lomask()
1685{
1686        return simd256<8>::constant<(15)>();
1687}
1688
1689//The total number of operations is 0
1690template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::lomask()
1691{
1692        return simd256<16>::constant<(255)>();
1693}
1694
1695//The total number of operations is 0
1696template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::lomask()
1697{
1698        return simd256<32>::constant<(65535)>();
1699}
1700
1701//The total number of operations is 0
1702template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::lomask()
1703{
1704        return ((bitblock256_t)_mm256_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1)));
1705}
1706
1707//The total number of operations is 0
1708template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::lomask()
1709{
1710        return ((bitblock256_t)_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1)));
1711}
1712
1713//The total number of operations is 0
1714template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::lomask()
1715{
1716        return ((bitblock256_t)_mm256_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1)));
1717}
1718
1719//The total number of operations is 1
1720template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umin(bitblock256_t arg1, bitblock256_t arg2)
1721{
1722        return simd_and(arg1, arg2);
1723}
1724
1725//The total number of operations is 28
1726template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umin(bitblock256_t arg1, bitblock256_t arg2)
1727{
1728        bitblock256_t tmpAns = simd256<(1)>::umin(arg1, arg2);
1729        bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
1730        bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
1731        return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
1732}
1733
1734//The total number of operations is 20
1735template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umin(bitblock256_t arg1, bitblock256_t arg2)
1736{
1737        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umin(arg1, arg2)), simd256<(8)>::umin(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
1738}
1739
1740//The total number of operations is 8
1741template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umin(bitblock256_t arg1, bitblock256_t arg2)
1742{
1743        return avx_general_combine256(_mm_min_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1744}
1745
1746//The total number of operations is 8
1747template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umin(bitblock256_t arg1, bitblock256_t arg2)
1748{
1749        return avx_general_combine256(_mm_min_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1750}
1751
1752//The total number of operations is 8
1753template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umin(bitblock256_t arg1, bitblock256_t arg2)
1754{
1755        return avx_general_combine256(_mm_min_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_min_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1756}
1757
1758//The total number of operations is 14
1759template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umin(bitblock256_t arg1, bitblock256_t arg2)
1760{
1761        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808UL)>();
1762        return simd_xor(simd256<64>::min(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
1763}
1764
1765//The total number of operations is 77
1766template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umin(bitblock256_t arg1, bitblock256_t arg2)
1767{
1768        bitblock256_t tmpAns = simd256<(64)>::umin(arg1, arg2);
1769        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
1770        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
1771        return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
1772}
1773
1774//The total number of operations is 264
1775template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umin(bitblock256_t arg1, bitblock256_t arg2)
1776{
1777        bitblock256_t tmpAns = simd256<(128)>::umin(arg1, arg2);
1778        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
1779        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
1780        return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
1781}
1782
1783//The total number of operations is 45
1784template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::abs(bitblock256_t arg1)
1785{
1786        return simd256<1>::ifh(simd256<2>::himask(), simd_and(arg1, simd256<256>::slli<1>(simd_not(arg1))), arg1);
1787}
1788
1789//The total number of operations is 51
1790template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::abs(bitblock256_t arg1)
1791{
1792        bitblock256_t gtMask = simd256<4>::gt(arg1, simd256<4>::constant<0>());
1793        return simd256<1>::ifh(gtMask, arg1, simd256<4>::sub(gtMask, arg1));
1794}
1795
1796//The total number of operations is 6
1797template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::abs(bitblock256_t arg1)
1798{
1799        return avx_general_combine256(_mm_abs_epi8(avx_select_hi128(arg1)), _mm_abs_epi8(avx_select_lo128(arg1)));
1800}
1801
1802//The total number of operations is 6
1803template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::abs(bitblock256_t arg1)
1804{
1805        return avx_general_combine256(_mm_abs_epi16(avx_select_hi128(arg1)), _mm_abs_epi16(avx_select_lo128(arg1)));
1806}
1807
1808//The total number of operations is 6
1809template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::abs(bitblock256_t arg1)
1810{
1811        return avx_general_combine256(_mm_abs_epi32(avx_select_hi128(arg1)), _mm_abs_epi32(avx_select_lo128(arg1)));
1812}
1813
1814//The total number of operations is 19
1815template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::abs(bitblock256_t arg1)
1816{
1817        bitblock256_t gtMask = simd256<64>::gt(arg1, simd256<64>::constant<0>());
1818        return simd256<1>::ifh(gtMask, arg1, simd256<64>::sub(gtMask, arg1));
1819}
1820
1821//The total number of operations is 117
1822template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::abs(bitblock256_t arg1)
1823{
1824        bitblock256_t eqMask = simd256<128>::eq(simd256<1>::ifh(simd256<128>::himask(), simd256<(64)>::abs(arg1), arg1), arg1);
1825        return simd256<1>::ifh(eqMask, arg1, simd256<128>::sub(eqMask, arg1));
1826}
1827
1828//The total number of operations is 391
1829template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::abs(bitblock256_t arg1)
1830{
1831        bitblock256_t eqMask = simd256<256>::eq(simd256<1>::ifh(simd256<256>::himask(), simd256<(128)>::abs(arg1), arg1), arg1);
1832        return simd256<1>::ifh(eqMask, arg1, simd256<256>::sub(eqMask, arg1));
1833}
1834
1835//The total number of operations is 2
1836template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::eq(bitblock256_t arg1, bitblock256_t arg2)
1837{
1838        return simd_not(simd_xor(arg1, arg2));
1839}
1840
1841//The total number of operations is 18
1842template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::eq(bitblock256_t arg1, bitblock256_t arg2)
1843{
1844        bitblock256_t tmpAns = simd256<(1)>::eq(arg1, arg2);
1845        bitblock256_t loMask = simd_and(tmpAns, simd256<2>::srli<(1)>(tmpAns));
1846        bitblock256_t hiMask = simd256<2>::slli<(1)>(loMask);
1847        return simd_or(loMask, hiMask);
1848}
1849
1850//The total number of operations is 23
1851template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::eq(bitblock256_t arg1, bitblock256_t arg2)
1852{
1853        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::eq(simd_and(simd256<(8)>::himask(), arg1), simd_and(simd256<(8)>::himask(), arg2))), simd_and(simd256<(8)>::lomask(), simd256<(8)>::eq(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2))));
1854}
1855
1856//The total number of operations is 8
1857template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::eq(bitblock256_t arg1, bitblock256_t arg2)
1858{
1859        return avx_general_combine256(_mm_cmpeq_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1860}
1861
1862//The total number of operations is 8
1863template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::eq(bitblock256_t arg1, bitblock256_t arg2)
1864{
1865        return avx_general_combine256(_mm_cmpeq_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1866}
1867
1868//The total number of operations is 8
1869template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::eq(bitblock256_t arg1, bitblock256_t arg2)
1870{
1871        return avx_general_combine256(_mm_cmpeq_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1872}
1873
1874//The total number of operations is 8
1875template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::eq(bitblock256_t arg1, bitblock256_t arg2)
1876{
1877        return avx_general_combine256(_mm_cmpeq_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_cmpeq_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
1878}
1879
1880//The total number of operations is 48
1881template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::eq(bitblock256_t arg1, bitblock256_t arg2)
1882{
1883        bitblock256_t tmpAns = simd256<(64)>::eq(arg1, arg2);
1884        bitblock256_t loMask = simd_and(tmpAns, simd256<128>::srli<(64)>(tmpAns));
1885        bitblock256_t hiMask = simd256<128>::slli<(64)>(loMask);
1886        return simd_or(loMask, hiMask);
1887}
1888
1889//The total number of operations is 131
1890template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::eq(bitblock256_t arg1, bitblock256_t arg2)
1891{
1892        bitblock256_t tmpAns = simd256<(128)>::eq(arg1, arg2);
1893        bitblock256_t loMask = simd_and(tmpAns, simd256<256>::srli<(128)>(tmpAns));
1894        bitblock256_t hiMask = simd256<256>::slli<(128)>(loMask);
1895        return simd_or(loMask, hiMask);
1896}
1897
1898//The total number of operations is 9
1899template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::srai(bitblock256_t arg1)
1900{
1901        return ((sh == 0) ? arg1 : simd_or(simd_and(simd256<2>::himask(), arg1), simd256<2>::srli<1>(arg1)));
1902}
1903
1904//The total number of operations is 29
1905template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::srai(bitblock256_t arg1)
1906{
1907        bitblock256_t tmp = simd256<4>::srli<((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh))>(arg1);
1908        return simd_or(tmp, simd256<4>::sub(simd256<4>::constant<0>(), simd_and(simd256<4>::constant<(1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
1909}
1910
1911//The total number of operations is 17
1912template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::srai(bitblock256_t arg1)
1913{
1914        bitblock256_t tmp = simd256<8>::srli<((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh))>(arg1);
1915        return simd_or(tmp, simd256<8>::sub(simd256<8>::constant<0>(), simd_and(simd256<8>::constant<(1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))>(), tmp)));
1916}
1917
1918//The total number of operations is 6
1919template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::srai(bitblock256_t arg1)
1920{
1921        return avx_general_combine256(_mm_srai_epi16(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi16(avx_select_lo128(arg1), (int32_t)(sh)));
1922}
1923
1924//The total number of operations is 6
1925template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::srai(bitblock256_t arg1)
1926{
1927        return avx_general_combine256(_mm_srai_epi32(avx_select_hi128(arg1), (int32_t)(sh)), _mm_srai_epi32(avx_select_lo128(arg1), (int32_t)(sh)));
1928}
1929
1930//The total number of operations is 22
1931template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::srai(bitblock256_t arg1)
1932{
1933        bitblock256_t tmp = simd256<64>::srli<((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh))>(arg1);
1934        return simd_or(tmp, simd256<64>::sub(simd256<64>::constant<0>(), simd_and(simd256<64>::slli<((64-((sh >= 64) ? (63) : ((sh < 0) ? 0 : sh)))-1)>(simd256<64>::constant<1>()), tmp)));
1935}
1936
1937//The total number of operations is 84
1938template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::srai(bitblock256_t arg1)
1939{
1940        bitblock256_t tmp = simd256<128>::srli<((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh))>(arg1);
1941        return simd_or(tmp, simd256<128>::sub(simd256<128>::constant<0>(), simd_and(simd256<128>::slli<((128-((sh >= 128) ? (127) : ((sh < 0) ? 0 : sh)))-1)>(simd256<128>::constant<1>()), tmp)));
1942}
1943
1944//The total number of operations is 220
1945template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::srai(bitblock256_t arg1)
1946{
1947        bitblock256_t tmp = simd256<256>::srli<((sh >= 256) ? (255) : ((sh < 0) ? 0 : sh))>(arg1);
1948        return simd_or(tmp, simd256<256>::sub(simd256<256>::constant<0>(), simd_and(simd256<256>::slli<((256-((sh >= 256) ? (255) : ((sh < 0) ? 0 : sh)))-1)>(simd256<256>::constant<1>()), tmp)));
1949}
1950
1951//The total number of operations is 0
1952template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::himask()
1953{
1954        return simd256<2>::constant<(2)>();
1955}
1956
1957//The total number of operations is 0
1958template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::himask()
1959{
1960        return simd256<4>::constant<(12)>();
1961}
1962
1963//The total number of operations is 0
1964template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::himask()
1965{
1966        return simd256<8>::constant<(240)>();
1967}
1968
1969//The total number of operations is 0
1970template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::himask()
1971{
1972        return simd256<16>::constant<(65280)>();
1973}
1974
1975//The total number of operations is 0
1976template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::himask()
1977{
1978        return simd256<32>::constant<-65536>();
1979}
1980
1981//The total number of operations is 0
1982template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::himask()
1983{
1984        return ((bitblock256_t)_mm256_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0)));
1985}
1986
1987//The total number of operations is 0
1988template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::himask()
1989{
1990        return ((bitblock256_t)_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0)));
1991}
1992
1993//The total number of operations is 0
1994template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::himask()
1995{
1996        return ((bitblock256_t)_mm256_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0), (int32_t)(0), (int32_t)(0)));
1997}
1998
1999//The total number of operations is 1
2000template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::add(bitblock256_t arg1, bitblock256_t arg2)
2001{
2002        return simd_xor(arg1, arg2);
2003}
2004
2005//The total number of operations is 18
2006template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::add(bitblock256_t arg1, bitblock256_t arg2)
2007{
2008        bitblock256_t ans = simd256<(1)>::add(arg1, arg2);
2009        bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_and(simd_xor(arg1, arg2), simd_not(ans)));
2010        bitblock256_t loMask = simd256<2>::lomask();
2011        bitblock256_t carry = simd256<2>::slli<1>(simd_and(carryMask, loMask));
2012        return simd256<1>::ifh(loMask, ans, simd256<(1)>::add(ans, carry));
2013}
2014
2015//The total number of operations is 20
2016template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::add(bitblock256_t arg1, bitblock256_t arg2)
2017{
2018        return simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::add(arg1, simd_and(simd256<(8)>::himask(), arg2)), simd256<(8)>::add(arg1, arg2));
2019}
2020
2021//The total number of operations is 8
2022template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::add(bitblock256_t arg1, bitblock256_t arg2)
2023{
2024        return avx_general_combine256(_mm_add_epi8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
2025}
2026
2027//The total number of operations is 8
2028template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::add(bitblock256_t arg1, bitblock256_t arg2)
2029{
2030        return avx_general_combine256(_mm_add_epi16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
2031}
2032
2033//The total number of operations is 8
2034template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::add(bitblock256_t arg1, bitblock256_t arg2)
2035{
2036        return avx_general_combine256(_mm_add_epi32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
2037}
2038
2039//The total number of operations is 8
2040template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::add(bitblock256_t arg1, bitblock256_t arg2)
2041{
2042        return avx_general_combine256(_mm_add_epi64(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_add_epi64(avx_select_lo128(arg1), avx_select_lo128(arg2)));
2043}
2044
2045//The total number of operations is 44
2046template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::add(bitblock256_t arg1, bitblock256_t arg2)
2047{
2048        bitblock256_t ans = simd256<(64)>::add(arg1, arg2);
2049        bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_and(simd_xor(arg1, arg2), simd_not(ans)));
2050        bitblock256_t loMask = simd256<128>::lomask();
2051        bitblock256_t carry = simd256<128>::slli<1>(simd_and(carryMask, loMask));
2052        return simd256<1>::ifh(loMask, ans, simd256<(64)>::add(ans, carry));
2053}
2054
2055//The total number of operations is 137
2056template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::add(bitblock256_t arg1, bitblock256_t arg2)
2057{
2058        bitblock256_t ans = simd256<(128)>::add(arg1, arg2);
2059        bitblock256_t carryMask = simd_or(simd_and(arg1, arg2), simd_and(simd_xor(arg1, arg2), simd_not(ans)));
2060        bitblock256_t loMask = simd256<256>::lomask();
2061        bitblock256_t carry = simd256<256>::slli<1>(simd_and(carryMask, loMask));
2062        return simd256<1>::ifh(loMask, ans, simd256<(128)>::add(ans, carry));
2063}
2064
2065//The total number of operations is 1
2066template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<1>::umax(bitblock256_t arg1, bitblock256_t arg2)
2067{
2068        return simd_or(arg1, arg2);
2069}
2070
2071//The total number of operations is 28
2072template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<2>::umax(bitblock256_t arg1, bitblock256_t arg2)
2073{
2074        bitblock256_t tmpAns = simd256<(1)>::umax(arg1, arg2);
2075        bitblock256_t eqMask1 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg1));
2076        bitblock256_t eqMask2 = simd256<2>::srli<(1)>(simd256<(1)>::eq(tmpAns, arg2));
2077        return simd256<1>::ifh(simd256<2>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
2078}
2079
2080//The total number of operations is 20
2081template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<4>::umax(bitblock256_t arg1, bitblock256_t arg2)
2082{
2083        return simd_or(simd_and(simd256<(8)>::himask(), simd256<(8)>::umax(arg1, arg2)), simd256<(8)>::umax(simd_and(simd256<(8)>::lomask(), arg1), simd_and(simd256<(8)>::lomask(), arg2)));
2084}
2085
2086//The total number of operations is 8
2087template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<8>::umax(bitblock256_t arg1, bitblock256_t arg2)
2088{
2089        return avx_general_combine256(_mm_max_epu8(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu8(avx_select_lo128(arg1), avx_select_lo128(arg2)));
2090}
2091
2092//The total number of operations is 8
2093template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<16>::umax(bitblock256_t arg1, bitblock256_t arg2)
2094{
2095        return avx_general_combine256(_mm_max_epu16(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu16(avx_select_lo128(arg1), avx_select_lo128(arg2)));
2096}
2097
2098//The total number of operations is 8
2099template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<32>::umax(bitblock256_t arg1, bitblock256_t arg2)
2100{
2101        return avx_general_combine256(_mm_max_epu32(avx_select_hi128(arg1), avx_select_hi128(arg2)), _mm_max_epu32(avx_select_lo128(arg1), avx_select_lo128(arg2)));
2102}
2103
2104//The total number of operations is 14
2105template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<64>::umax(bitblock256_t arg1, bitblock256_t arg2)
2106{
2107        bitblock256_t high_bit = simd256<64>::constant<(9223372036854775808UL)>();
2108        return simd_xor(simd256<64>::max(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
2109}
2110
2111//The total number of operations is 77
2112template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<128>::umax(bitblock256_t arg1, bitblock256_t arg2)
2113{
2114        bitblock256_t tmpAns = simd256<(64)>::umax(arg1, arg2);
2115        bitblock256_t eqMask1 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg1));
2116        bitblock256_t eqMask2 = simd256<128>::srli<(64)>(simd256<(64)>::eq(tmpAns, arg2));
2117        return simd256<1>::ifh(simd256<128>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
2118}
2119
2120//The total number of operations is 264
2121template <> IDISA_ALWAYS_INLINE bitblock256_t simd256<256>::umax(bitblock256_t arg1, bitblock256_t arg2)
2122{
2123        bitblock256_t tmpAns = simd256<(128)>::umax(arg1, arg2);
2124        bitblock256_t eqMask1 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg1));
2125        bitblock256_t eqMask2 = simd256<256>::srli<(128)>(simd256<(128)>::eq(tmpAns, arg2));
2126        return simd256<1>::ifh(simd256<256>::himask(), tmpAns, simd256<1>::ifh(eqMask1, simd256<1>::ifh(eqMask2, tmpAns, arg1), arg2));
2127}
2128
2129//The total number of operations is 561
2130template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::umin_hl(bitblock256_t arg1, bitblock256_t arg2)
2131{
2132        return simd256<(1)>::umin(hsimd256<2>::packh(arg1, arg2), hsimd256<2>::packl(arg1, arg2));
2133}
2134
2135//The total number of operations is 412
2136template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::umin_hl(bitblock256_t arg1, bitblock256_t arg2)
2137{
2138        return simd256<(2)>::umin(hsimd256<4>::packh(arg1, arg2), hsimd256<4>::packl(arg1, arg2));
2139}
2140
2141//The total number of operations is 228
2142template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::umin_hl(bitblock256_t arg1, bitblock256_t arg2)
2143{
2144        return simd256<(4)>::umin(hsimd256<8>::packh(arg1, arg2), hsimd256<8>::packl(arg1, arg2));
2145}
2146
2147//The total number of operations is 38
2148template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::umin_hl(bitblock256_t arg1, bitblock256_t arg2)
2149{
2150        return simd256<(8)>::umin(hsimd256<16>::packh(arg1, arg2), hsimd256<16>::packl(arg1, arg2));
2151}
2152
2153//The total number of operations is 38
2154template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::umin_hl(bitblock256_t arg1, bitblock256_t arg2)
2155{
2156        return simd256<(16)>::umin(hsimd256<32>::packh(arg1, arg2), hsimd256<32>::packl(arg1, arg2));
2157}
2158
2159//The total number of operations is 450
2160template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::umin_hl(bitblock256_t arg1, bitblock256_t arg2)
2161{
2162        return simd256<(32)>::umin(hsimd256<64>::packh(arg1, arg2), hsimd256<64>::packl(arg1, arg2));
2163}
2164
2165//The total number of operations is 456
2166template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::umin_hl(bitblock256_t arg1, bitblock256_t arg2)
2167{
2168        return simd256<(64)>::umin(hsimd256<128>::packh(arg1, arg2), hsimd256<128>::packl(arg1, arg2));
2169}
2170
2171//The total number of operations is 164
2172template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::umin_hl(bitblock256_t arg1, bitblock256_t arg2)
2173{
2174        return simd256<(128)>::umin(hsimd256<256>::packh(arg1, arg2), hsimd256<256>::packl(arg1, arg2));
2175}
2176
2177//The total number of operations is 561
2178template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::add_hl(bitblock256_t arg1, bitblock256_t arg2)
2179{
2180        return simd256<(1)>::add(hsimd256<2>::packh(arg1, arg2), hsimd256<2>::packl(arg1, arg2));
2181}
2182
2183//The total number of operations is 402
2184template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::add_hl(bitblock256_t arg1, bitblock256_t arg2)
2185{
2186        return simd256<(2)>::add(hsimd256<4>::packh(arg1, arg2), hsimd256<4>::packl(arg1, arg2));
2187}
2188
2189//The total number of operations is 228
2190template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::add_hl(bitblock256_t arg1, bitblock256_t arg2)
2191{
2192        return simd256<(4)>::add(hsimd256<8>::packh(arg1, arg2), hsimd256<8>::packl(arg1, arg2));
2193}
2194
2195//The total number of operations is 38
2196template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::add_hl(bitblock256_t arg1, bitblock256_t arg2)
2197{
2198        return simd256<(8)>::add(hsimd256<16>::packh(arg1, arg2), hsimd256<16>::packl(arg1, arg2));
2199}
2200
2201//The total number of operations is 8
2202template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::add_hl(bitblock256_t arg1, bitblock256_t arg2)
2203{
2204        return avx_general_combine256(_mm_hadd_epi16(avx_select_lo128(arg1), avx_select_hi128(arg1)),_mm_hadd_epi16(avx_select_lo128(arg2), avx_select_hi128(arg2)));
2205}
2206
2207//The total number of operations is 8
2208template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::add_hl(bitblock256_t arg1, bitblock256_t arg2)
2209{
2210        return avx_general_combine256(_mm_hadd_epi32(avx_select_lo128(arg1), avx_select_hi128(arg1)),_mm_hadd_epi32(avx_select_lo128(arg2), avx_select_hi128(arg2)));
2211}
2212
2213//The total number of operations is 450
2214template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::add_hl(bitblock256_t arg1, bitblock256_t arg2)
2215{
2216        return simd256<(64)>::add(hsimd256<128>::packh(arg1, arg2), hsimd256<128>::packl(arg1, arg2));
2217}
2218
2219//The total number of operations is 131
2220template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::add_hl(bitblock256_t arg1, bitblock256_t arg2)
2221{
2222        return simd256<(128)>::add(hsimd256<256>::packh(arg1, arg2), hsimd256<256>::packl(arg1, arg2));
2223}
2224
2225//The total number of operations is 414
2226template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::packss(bitblock256_t arg1, bitblock256_t arg2)
2227{
2228        bitblock256_t hiBound = simd256<2>::srli<1>(simd256<2>::lomask());
2229        bitblock256_t loBound = simd_not(hiBound);
2230        return hsimd256<2>::packl(simd256<1>::ifh(simd256<2>::gt(arg1, hiBound), hiBound, simd256<1>::ifh(simd256<2>::gt(arg1, loBound), arg1, loBound)), simd256<1>::ifh(simd256<2>::gt(arg2, hiBound), hiBound, simd256<1>::ifh(simd256<2>::gt(arg2, loBound), arg2, loBound)));
2231}
2232
2233//The total number of operations is 318
2234template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::packss(bitblock256_t arg1, bitblock256_t arg2)
2235{
2236        bitblock256_t hiBound = simd256<4>::srli<1>(simd256<4>::lomask());
2237        bitblock256_t loBound = simd_not(hiBound);
2238        return hsimd256<4>::packl(simd256<1>::ifh(simd256<4>::gt(arg1, hiBound), hiBound, simd256<1>::ifh(simd256<4>::gt(arg1, loBound), arg1, loBound)), simd256<1>::ifh(simd256<4>::gt(arg2, hiBound), hiBound, simd256<1>::ifh(simd256<4>::gt(arg2, loBound), arg2, loBound)));
2239}
2240
2241//The total number of operations is 150
2242template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::packss(bitblock256_t arg1, bitblock256_t arg2)
2243{
2244        bitblock256_t hiBound = simd256<8>::srli<1>(simd256<8>::lomask());
2245        bitblock256_t loBound = simd_not(hiBound);
2246        return hsimd256<8>::packl(simd256<1>::ifh(simd256<8>::gt(arg1, hiBound), hiBound, simd256<1>::ifh(simd256<8>::gt(arg1, loBound), arg1, loBound)), simd256<1>::ifh(simd256<8>::gt(arg2, hiBound), hiBound, simd256<1>::ifh(simd256<8>::gt(arg2, loBound), arg2, loBound)));
2247}
2248
2249//The total number of operations is 8
2250template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::packss(bitblock256_t arg1, bitblock256_t arg2)
2251{
2252        return avx_general_combine256(_mm_packs_epi16(avx_select_lo128(arg1), avx_select_hi128(arg1)),_mm_packs_epi16(avx_select_lo128(arg2), avx_select_hi128(arg2)));
2253}
2254
2255//The total number of operations is 8
2256template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::packss(bitblock256_t arg1, bitblock256_t arg2)
2257{
2258        return avx_general_combine256(_mm_packs_epi32(avx_select_lo128(arg1), avx_select_hi128(arg1)),_mm_packs_epi32(avx_select_lo128(arg2), avx_select_hi128(arg2)));
2259}
2260
2261//The total number of operations is 266
2262template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::packss(bitblock256_t arg1, bitblock256_t arg2)
2263{
2264        bitblock256_t hiBound = simd256<64>::srli<1>(simd256<64>::lomask());
2265        bitblock256_t loBound = simd_not(hiBound);
2266        return hsimd256<64>::packl(simd256<1>::ifh(simd256<64>::gt(arg1, hiBound), hiBound, simd256<1>::ifh(simd256<64>::gt(arg1, loBound), arg1, loBound)), simd256<1>::ifh(simd256<64>::gt(arg2, hiBound), hiBound, simd256<1>::ifh(simd256<64>::gt(arg2, loBound), arg2, loBound)));
2267}
2268
2269//The total number of operations is 763
2270template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::packss(bitblock256_t arg1, bitblock256_t arg2)
2271{
2272        bitblock256_t hiBound = simd256<128>::srli<1>(simd256<128>::lomask());
2273        bitblock256_t loBound = simd_not(hiBound);
2274        return hsimd256<128>::packl(simd256<1>::ifh(simd256<128>::gt(arg1, hiBound), hiBound, simd256<1>::ifh(simd256<128>::gt(arg1, loBound), arg1, loBound)), simd256<1>::ifh(simd256<128>::gt(arg2, hiBound), hiBound, simd256<1>::ifh(simd256<128>::gt(arg2, loBound), arg2, loBound)));
2275}
2276
2277//The total number of operations is 2681
2278template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::packss(bitblock256_t arg1, bitblock256_t arg2)
2279{
2280        bitblock256_t hiBound = simd256<256>::srli<1>(simd256<256>::lomask());
2281        bitblock256_t loBound = simd_not(hiBound);
2282        return hsimd256<256>::packl(simd256<1>::ifh(simd256<256>::gt(arg1, hiBound), hiBound, simd256<1>::ifh(simd256<256>::gt(arg1, loBound), arg1, loBound)), simd256<1>::ifh(simd256<256>::gt(arg2, hiBound), hiBound, simd256<1>::ifh(simd256<256>::gt(arg2, loBound), arg2, loBound)));
2283}
2284
2285//The total number of operations is 4
2286template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<8>::signmask(bitblock256_t arg1)
2287{
2288        return ((((uint64_t)_mm_movemask_epi8(((__m128i)avx_select_hi128(arg1))))<<16)|((uint64_t)_mm_movemask_epi8(((__m128i)avx_select_lo128(arg1)))));
2289}
2290
2291//The total number of operations is 24
2292template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<16>::signmask(bitblock256_t arg1)
2293{
2294        return hsimd256<(8)>::signmask(hsimd256<16>::packh(simd256<16>::constant<0>(), arg1));
2295}
2296
2297//The total number of operations is 44
2298template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<32>::signmask(bitblock256_t arg1)
2299{
2300        return hsimd256<(16)>::signmask(hsimd256<32>::packh(simd256<32>::constant<0>(), arg1));
2301}
2302
2303//The total number of operations is 271
2304template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<64>::signmask(bitblock256_t arg1)
2305{
2306        return hsimd256<(32)>::signmask(hsimd256<64>::packh(simd256<64>::constant<0>(), arg1));
2307}
2308
2309//The total number of operations is 586
2310template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<128>::signmask(bitblock256_t arg1)
2311{
2312        return hsimd256<(64)>::signmask(hsimd256<128>::packh(simd256<128>::constant<0>(), arg1));
2313}
2314
2315//The total number of operations is 630
2316template <> IDISA_ALWAYS_INLINE uint64_t hsimd256<256>::signmask(bitblock256_t arg1)
2317{
2318        return hsimd256<(128)>::signmask(hsimd256<256>::packh(simd256<256>::constant<0>(), arg1));
2319}
2320
2321//The total number of operations is 274
2322template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::packl(bitblock256_t arg1, bitblock256_t arg2)
2323{
2324        return hsimd256<(4)>::packl(simd256<1>::ifh(simd256<2>::himask(), simd256<256>::srli<(1)>(arg1), arg1), simd256<1>::ifh(simd256<2>::himask(), simd256<256>::srli<(1)>(arg2), arg2));
2325}
2326
2327//The total number of operations is 186
2328template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::packl(bitblock256_t arg1, bitblock256_t arg2)
2329{
2330        return hsimd256<(8)>::packl(simd256<1>::ifh(simd256<4>::himask(), simd256<256>::srli<(2)>(arg1), arg1), simd256<1>::ifh(simd256<4>::himask(), simd256<256>::srli<(2)>(arg2), arg2));
2331}
2332
2333//The total number of operations is 98
2334template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::packl(bitblock256_t arg1, bitblock256_t arg2)
2335{
2336        return hsimd256<(16)>::packl(simd256<1>::ifh(simd256<8>::himask(), simd256<256>::srli<(4)>(arg1), arg1), simd256<1>::ifh(simd256<8>::himask(), simd256<256>::srli<(4)>(arg2), arg2));
2337}
2338
2339//The total number of operations is 10
2340template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::packl(bitblock256_t arg1, bitblock256_t arg2)
2341{
2342        return hsimd256<16>::packus(simd_and(arg1, simd256<16>::lomask()), simd_and(arg2, simd256<16>::lomask()));
2343}
2344
2345//The total number of operations is 10
2346template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::packl(bitblock256_t arg1, bitblock256_t arg2)
2347{
2348        return hsimd256<32>::packus(simd_and(arg1, simd256<32>::lomask()), simd_and(arg2, simd256<32>::lomask()));
2349}
2350
2351//The total number of operations is 215
2352template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::packl(bitblock256_t arg1, bitblock256_t arg2)
2353{
2354        return hsimd256<(128)>::packl(simd256<1>::ifh(simd256<64>::himask(), simd256<256>::srli<(32)>(arg1), arg1), simd256<1>::ifh(simd256<64>::himask(), simd256<256>::srli<(32)>(arg2), arg2));
2355}
2356
2357//The total number of operations is 127
2358template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::packl(bitblock256_t arg1, bitblock256_t arg2)
2359{
2360        return hsimd256<(256)>::packl(simd256<64>::ifh(simd256<128>::himask(), simd256<256>::srli<(64)>(arg1), arg1), simd256<64>::ifh(simd256<128>::himask(), simd256<256>::srli<(64)>(arg2), arg2));
2361}
2362
2363//The total number of operations is 43
2364template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::packl(bitblock256_t arg1, bitblock256_t arg2)
2365{
2366        return simd256<1>::ifh(simd256<256>::himask(), simd256<256>::slli<(128)>(arg1), arg2);
2367}
2368
2369//The total number of operations is 286
2370template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::packh(bitblock256_t arg1, bitblock256_t arg2)
2371{
2372        return hsimd256<2>::packl(simd256<64>::srli<(1)>(arg1), simd256<64>::srli<(1)>(arg2));
2373}
2374
2375//The total number of operations is 198
2376template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::packh(bitblock256_t arg1, bitblock256_t arg2)
2377{
2378        return hsimd256<4>::packl(simd256<64>::srli<(2)>(arg1), simd256<64>::srli<(2)>(arg2));
2379}
2380
2381//The total number of operations is 110
2382template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::packh(bitblock256_t arg1, bitblock256_t arg2)
2383{
2384        return hsimd256<8>::packl(simd256<64>::srli<(4)>(arg1), simd256<64>::srli<(4)>(arg2));
2385}
2386
2387//The total number of operations is 20
2388template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::packh(bitblock256_t arg1, bitblock256_t arg2)
2389{
2390        return hsimd256<16>::packus(simd256<16>::srli<(8)>(arg1), simd256<16>::srli<(8)>(arg2));
2391}
2392
2393//The total number of operations is 20
2394template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::packh(bitblock256_t arg1, bitblock256_t arg2)
2395{
2396        return hsimd256<32>::packus(simd256<32>::srli<(16)>(arg1), simd256<32>::srli<(16)>(arg2));
2397}
2398
2399//The total number of operations is 227
2400template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::packh(bitblock256_t arg1, bitblock256_t arg2)
2401{
2402        return hsimd256<64>::packl(simd256<64>::srli<(32)>(arg1), simd256<64>::srli<(32)>(arg2));
2403}
2404
2405//The total number of operations is 315
2406template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::packh(bitblock256_t arg1, bitblock256_t arg2)
2407{
2408        return hsimd256<128>::packus(simd256<128>::srli<(64)>(arg1), simd256<128>::srli<(64)>(arg2));
2409}
2410
2411//The total number of operations is 44
2412template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::packh(bitblock256_t arg1, bitblock256_t arg2)
2413{
2414        return simd256<1>::ifh(simd256<256>::himask(), arg1, simd256<256>::srli<(128)>(arg2));
2415}
2416
2417//The total number of operations is 561
2418template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::min_hl(bitblock256_t arg1, bitblock256_t arg2)
2419{
2420        return simd256<(1)>::min(hsimd256<2>::packh(arg1, arg2), hsimd256<2>::packl(arg1, arg2));
2421}
2422
2423//The total number of operations is 413
2424template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::min_hl(bitblock256_t arg1, bitblock256_t arg2)
2425{
2426        return simd256<(2)>::min(hsimd256<4>::packh(arg1, arg2), hsimd256<4>::packl(arg1, arg2));
2427}
2428
2429//The total number of operations is 231
2430template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::min_hl(bitblock256_t arg1, bitblock256_t arg2)
2431{
2432        return simd256<(4)>::min(hsimd256<8>::packh(arg1, arg2), hsimd256<8>::packl(arg1, arg2));
2433}
2434
2435//The total number of operations is 38
2436template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::min_hl(bitblock256_t arg1, bitblock256_t arg2)
2437{
2438        return simd256<(8)>::min(hsimd256<16>::packh(arg1, arg2), hsimd256<16>::packl(arg1, arg2));
2439}
2440
2441//The total number of operations is 38
2442template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::min_hl(bitblock256_t arg1, bitblock256_t arg2)
2443{
2444        return simd256<(16)>::min(hsimd256<32>::packh(arg1, arg2), hsimd256<32>::packl(arg1, arg2));
2445}
2446
2447//The total number of operations is 450
2448template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::min_hl(bitblock256_t arg1, bitblock256_t arg2)
2449{
2450        return simd256<(32)>::min(hsimd256<64>::packh(arg1, arg2), hsimd256<64>::packl(arg1, arg2));
2451}
2452
2453//The total number of operations is 453
2454template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::min_hl(bitblock256_t arg1, bitblock256_t arg2)
2455{
2456        return simd256<(64)>::min(hsimd256<128>::packh(arg1, arg2), hsimd256<128>::packl(arg1, arg2));
2457}
2458
2459//The total number of operations is 175
2460template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::min_hl(bitblock256_t arg1, bitblock256_t arg2)
2461{
2462        return simd256<(128)>::min(hsimd256<256>::packh(arg1, arg2), hsimd256<256>::packl(arg1, arg2));
2463}
2464
2465//The total number of operations is 344
2466template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<2>::packus(bitblock256_t arg1, bitblock256_t arg2)
2467{
2468        bitblock256_t arg11 = simd256<2>::ifh(arg1, simd256<2>::constant<0>(), arg1);
2469        bitblock256_t arg12 = simd_and(simd256<2>::lomask(), arg11);
2470        bitblock256_t arg21 = simd256<2>::ifh(arg2, simd256<2>::constant<0>(), arg2);
2471        bitblock256_t arg22 = simd_and(simd256<2>::lomask(), arg21);
2472        return hsimd256<2>::packl(simd256<1>::ifh(simd256<2>::eq(arg12, arg11), arg12, simd256<2>::lomask()), simd256<1>::ifh(simd256<2>::eq(arg22, arg21), arg22, simd256<2>::lomask()));
2473}
2474
2475//The total number of operations is 286
2476template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<4>::packus(bitblock256_t arg1, bitblock256_t arg2)
2477{
2478        bitblock256_t arg11 = simd256<4>::ifh(arg1, simd256<4>::constant<0>(), arg1);
2479        bitblock256_t arg12 = simd_and(simd256<4>::lomask(), arg11);
2480        bitblock256_t arg21 = simd256<4>::ifh(arg2, simd256<4>::constant<0>(), arg2);
2481        bitblock256_t arg22 = simd_and(simd256<4>::lomask(), arg21);
2482        return hsimd256<4>::packl(simd256<1>::ifh(simd256<4>::eq(arg12, arg11), arg12, simd256<4>::lomask()), simd256<1>::ifh(simd256<4>::eq(arg22, arg21), arg22, simd256<4>::lomask()));
2483}
2484
2485//The total number of operations is 144
2486template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<8>::packus(bitblock256_t arg1, bitblock256_t arg2)
2487{
2488        bitblock256_t arg11 = simd256<8>::ifh(arg1, simd256<8>::constant<0>(), arg1);
2489        bitblock256_t arg12 = simd_and(simd256<8>::lomask(), arg11);
2490        bitblock256_t arg21 = simd256<8>::ifh(arg2, simd256<8>::constant<0>(), arg2);
2491        bitblock256_t arg22 = simd_and(simd256<8>::lomask(), arg21);
2492        return hsimd256<8>::packl(simd256<1>::ifh(simd256<8>::eq(arg12, arg11), arg12, simd256<8>::lomask()), simd256<1>::ifh(simd256<8>::eq(arg22, arg21), arg22, simd256<8>::lomask()));
2493}
2494
2495//The total number of operations is 8
2496template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<16>::packus(bitblock256_t arg1, bitblock256_t arg2)
2497{
2498        return avx_general_combine256(_mm_packus_epi16(avx_select_lo128(arg1), avx_select_hi128(arg1)),_mm_packus_epi16(avx_select_lo128(arg2), avx_select_hi128(arg2)));
2499}
2500
2501//The total number of operations is 8
2502template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<32>::packus(bitblock256_t arg1, bitblock256_t arg2)
2503{
2504        return avx_general_combine256(_mm_packus_epi32(avx_select_lo128(arg1), avx_select_hi128(arg1)),_mm_packus_epi32(avx_select_lo128(arg2), avx_select_hi128(arg2)));
2505}
2506
2507//The total number of operations is 241
2508template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<64>::packus(bitblock256_t arg1, bitblock256_t arg2)
2509{
2510        bitblock256_t arg11 = simd256<64>::ifh(arg1, simd256<64>::constant<0>(), arg1);
2511        bitblock256_t arg12 = simd_and(simd256<64>::lomask(), arg11);
2512        bitblock256_t arg21 = simd256<64>::ifh(arg2, simd256<64>::constant<0>(), arg2);
2513        bitblock256_t arg22 = simd_and(simd256<64>::lomask(), arg21);
2514        return hsimd256<64>::packl(simd256<1>::ifh(simd256<64>::eq(arg12, arg11), arg12, simd256<64>::lomask()), simd256<1>::ifh(simd256<64>::eq(arg22, arg21), arg22, simd256<64>::lomask()));
2515}
2516
2517//The total number of operations is 277
2518template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<128>::packus(bitblock256_t arg1, bitblock256_t arg2)
2519{
2520        bitblock256_t arg11 = simd256<128>::ifh(arg1, simd256<128>::constant<0>(), arg1);
2521        bitblock256_t arg12 = simd_and(simd256<128>::lomask(), arg11);
2522        bitblock256_t arg21 = simd256<128>::ifh(arg2, simd256<128>::constant<0>(), arg2);
2523        bitblock256_t arg22 = simd_and(simd256<128>::lomask(), arg21);
2524        return hsimd256<128>::packl(simd256<1>::ifh(simd256<128>::eq(arg12, arg11), arg12, simd256<128>::lomask()), simd256<1>::ifh(simd256<128>::eq(arg22, arg21), arg22, simd256<128>::lomask()));
2525}
2526
2527//The total number of operations is 262
2528template <> IDISA_ALWAYS_INLINE bitblock256_t hsimd256<256>::packus(bitblock256_t arg1, bitblock256_t arg2)
2529{
2530        bitblock256_t hiPart = hsimd256<256>::packh(arg1, arg2);
2531        return simd256<(128)>::ifh(hiPart, simd256<(128)>::constant<0>(), simd_or(simd256<(128)>::gt(hiPart, simd256<(128)>::constant<0>()), hsimd256<256>::packl(arg1, arg2)));
2532}
2533
2534//The total number of operations is 64
2535template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<1>::mergel(bitblock256_t arg1, bitblock256_t arg2)
2536{
2537        return esimd256<(2)>::mergel(simd256<1>::ifh(simd256<(2)>::himask(), arg1, simd256<(2)>::srli<1>(arg2)), simd256<1>::ifh(simd256<(2)>::himask(), simd256<(2)>::slli<1>(arg1), arg2));
2538}
2539
2540//The total number of operations is 44
2541template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<2>::mergel(bitblock256_t arg1, bitblock256_t arg2)
2542{
2543        return esimd256<(4)>::mergel(simd256<1>::ifh(simd256<(4)>::himask(), arg1, simd256<(4)>::srli<2>(arg2)), simd256<1>::ifh(simd256<(4)>::himask(), simd256<(4)>::slli<2>(arg1), arg2));
2544}
2545
2546//The total number of operations is 24
2547template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<4>::mergel(bitblock256_t arg1, bitblock256_t arg2)
2548{
2549        return esimd256<(8)>::mergel(simd256<1>::ifh(simd256<(8)>::himask(), arg1, simd256<(8)>::srli<4>(arg2)), simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::slli<4>(arg1), arg2));
2550}
2551
2552//The total number of operations is 4
2553template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<8>::mergel(bitblock256_t arg1, bitblock256_t arg2)
2554{
2555        __m128i loPart2 = avx_select_lo128(arg2);
2556        __m128i loPart1 = avx_select_lo128(arg1);
2557        return avx_general_combine256(_mm_unpackhi_epi8(loPart2, loPart1), _mm_unpacklo_epi8(loPart2, loPart1));
2558}
2559
2560//The total number of operations is 4
2561template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<16>::mergel(bitblock256_t arg1, bitblock256_t arg2)
2562{
2563        __m128i loPart2 = avx_select_lo128(arg2);
2564        __m128i loPart1 = avx_select_lo128(arg1);
2565        return avx_general_combine256(_mm_unpackhi_epi16(loPart2, loPart1), _mm_unpacklo_epi16(loPart2, loPart1));
2566}
2567
2568//The total number of operations is 4
2569template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<32>::mergel(bitblock256_t arg1, bitblock256_t arg2)
2570{
2571        __m128i loPart2 = avx_select_lo128(arg2);
2572        __m128i loPart1 = avx_select_lo128(arg1);
2573        return avx_general_combine256(_mm_unpackhi_epi32(loPart2, loPart1), _mm_unpacklo_epi32(loPart2, loPart1));
2574}
2575
2576//The total number of operations is 4
2577template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::mergel(bitblock256_t arg1, bitblock256_t arg2)
2578{
2579        __m128i loPart2 = avx_select_lo128(arg2);
2580        __m128i loPart1 = avx_select_lo128(arg1);
2581        return avx_general_combine256(_mm_unpackhi_epi64(loPart2, loPart1), _mm_unpacklo_epi64(loPart2, loPart1));
2582}
2583
2584//The total number of operations is 48
2585template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::mergel(bitblock256_t arg1, bitblock256_t arg2)
2586{
2587        return esimd256<(64)>::mergel(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg2)), simd256<1>::ifh(simd256<128>::himask(), simd256<128>::slli<(64)>(arg1), arg2));
2588}
2589
2590//The total number of operations is 64
2591template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<1>::mergeh(bitblock256_t arg1, bitblock256_t arg2)
2592{
2593        return esimd256<(2)>::mergeh(simd256<1>::ifh(simd256<(2)>::himask(), arg1, simd256<(2)>::srli<1>(arg2)), simd256<1>::ifh(simd256<(2)>::himask(), simd256<(2)>::slli<1>(arg1), arg2));
2594}
2595
2596//The total number of operations is 44
2597template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<2>::mergeh(bitblock256_t arg1, bitblock256_t arg2)
2598{
2599        return esimd256<(4)>::mergeh(simd256<1>::ifh(simd256<(4)>::himask(), arg1, simd256<(4)>::srli<2>(arg2)), simd256<1>::ifh(simd256<(4)>::himask(), simd256<(4)>::slli<2>(arg1), arg2));
2600}
2601
2602//The total number of operations is 24
2603template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<4>::mergeh(bitblock256_t arg1, bitblock256_t arg2)
2604{
2605        return esimd256<(8)>::mergeh(simd256<1>::ifh(simd256<(8)>::himask(), arg1, simd256<(8)>::srli<4>(arg2)), simd256<1>::ifh(simd256<(8)>::himask(), simd256<(8)>::slli<4>(arg1), arg2));
2606}
2607
2608//The total number of operations is 4
2609template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<8>::mergeh(bitblock256_t arg1, bitblock256_t arg2)
2610{
2611        __m128i hiPart2 = avx_select_hi128(arg2);
2612        __m128i hiPart1 = avx_select_hi128(arg1);
2613        return avx_general_combine256(_mm_unpackhi_epi8(hiPart2, hiPart1), _mm_unpacklo_epi8(hiPart2, hiPart1));
2614}
2615
2616//The total number of operations is 4
2617template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<16>::mergeh(bitblock256_t arg1, bitblock256_t arg2)
2618{
2619        __m128i hiPart2 = avx_select_hi128(arg2);
2620        __m128i hiPart1 = avx_select_hi128(arg1);
2621        return avx_general_combine256(_mm_unpackhi_epi16(hiPart2, hiPart1), _mm_unpacklo_epi16(hiPart2, hiPart1));
2622}
2623
2624//The total number of operations is 4
2625template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<32>::mergeh(bitblock256_t arg1, bitblock256_t arg2)
2626{
2627        __m128i hiPart2 = avx_select_hi128(arg2);
2628        __m128i hiPart1 = avx_select_hi128(arg1);
2629        return avx_general_combine256(_mm_unpackhi_epi32(hiPart2, hiPart1), _mm_unpacklo_epi32(hiPart2, hiPart1));
2630}
2631
2632//The total number of operations is 4
2633template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::mergeh(bitblock256_t arg1, bitblock256_t arg2)
2634{
2635        __m128i hiPart2 = avx_select_hi128(arg2);
2636        __m128i hiPart1 = avx_select_hi128(arg1);
2637        return avx_general_combine256(_mm_unpackhi_epi64(hiPart2, hiPart1), _mm_unpacklo_epi64(hiPart2, hiPart1));
2638}
2639
2640//The total number of operations is 48
2641template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::mergeh(bitblock256_t arg1, bitblock256_t arg2)
2642{
2643        return esimd256<(64)>::mergeh(simd256<1>::ifh(simd256<128>::himask(), arg1, simd256<128>::srli<(64)>(arg2)), simd256<1>::ifh(simd256<128>::himask(), simd256<128>::slli<(64)>(arg1), arg2));
2644}
2645
2646//The total number of operations is 52
2647template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<1>::zeroextendh(bitblock256_t arg1)
2648{
2649        return esimd256<(2)>::mergeh(simd256<(2)>::srli<1>(arg1), simd_and(simd256<(2)>::lomask(), arg1));
2650}
2651
2652//The total number of operations is 32
2653template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<2>::zeroextendh(bitblock256_t arg1)
2654{
2655        return esimd256<(4)>::mergeh(simd256<(4)>::srli<2>(arg1), simd_and(simd256<(4)>::lomask(), arg1));
2656}
2657
2658//The total number of operations is 12
2659template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<4>::zeroextendh(bitblock256_t arg1)
2660{
2661        return esimd256<(8)>::mergeh(simd256<(8)>::srli<4>(arg1), simd_and(simd256<(8)>::lomask(), arg1));
2662}
2663
2664//The total number of operations is 11
2665template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<8>::zeroextendh(bitblock256_t arg1)
2666{
2667        return esimd256<(16)>::mergeh(simd256<(16)>::srli<8>(arg1), simd_and(simd256<(16)>::lomask(), arg1));
2668}
2669
2670//The total number of operations is 11
2671template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<16>::zeroextendh(bitblock256_t arg1)
2672{
2673        return esimd256<(32)>::mergeh(simd256<(32)>::srli<16>(arg1), simd_and(simd256<(32)>::lomask(), arg1));
2674}
2675
2676//The total number of operations is 11
2677template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<32>::zeroextendh(bitblock256_t arg1)
2678{
2679        return esimd256<(64)>::mergeh(simd256<(64)>::srli<32>(arg1), simd_and(simd256<(64)>::lomask(), arg1));
2680}
2681
2682//The total number of operations is 68
2683template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::zeroextendh(bitblock256_t arg1)
2684{
2685        return esimd256<(128)>::mergeh(simd256<(128)>::srli<64>(arg1), simd_and(simd256<(128)>::lomask(), arg1));
2686}
2687
2688//The total number of operations is 41
2689template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::zeroextendh(bitblock256_t arg1)
2690{
2691        return simd256<(256)>::srli<128>(arg1);
2692}
2693
2694//The total number of operations is 52
2695template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<1>::zeroextendl(bitblock256_t arg1)
2696{
2697        return esimd256<(2)>::mergel(simd256<(2)>::srli<1>(arg1), simd_and(simd256<(2)>::lomask(), arg1));
2698}
2699
2700//The total number of operations is 32
2701template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<2>::zeroextendl(bitblock256_t arg1)
2702{
2703        return esimd256<(4)>::mergel(simd256<(4)>::srli<2>(arg1), simd_and(simd256<(4)>::lomask(), arg1));
2704}
2705
2706//The total number of operations is 12
2707template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<4>::zeroextendl(bitblock256_t arg1)
2708{
2709        return esimd256<(8)>::mergel(simd256<(8)>::srli<4>(arg1), simd_and(simd256<(8)>::lomask(), arg1));
2710}
2711
2712//The total number of operations is 11
2713template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<8>::zeroextendl(bitblock256_t arg1)
2714{
2715        return esimd256<(16)>::mergel(simd256<(16)>::srli<8>(arg1), simd_and(simd256<(16)>::lomask(), arg1));
2716}
2717
2718//The total number of operations is 11
2719template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<16>::zeroextendl(bitblock256_t arg1)
2720{
2721        return esimd256<(32)>::mergel(simd256<(32)>::srli<16>(arg1), simd_and(simd256<(32)>::lomask(), arg1));
2722}
2723
2724//The total number of operations is 11
2725template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<32>::zeroextendl(bitblock256_t arg1)
2726{
2727        return esimd256<(64)>::mergel(simd256<(64)>::srli<32>(arg1), simd_and(simd256<(64)>::lomask(), arg1));
2728}
2729
2730//The total number of operations is 68
2731template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::zeroextendl(bitblock256_t arg1)
2732{
2733        return esimd256<(128)>::mergel(simd256<(128)>::srli<64>(arg1), simd_and(simd256<(128)>::lomask(), arg1));
2734}
2735
2736//The total number of operations is 1
2737template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::zeroextendl(bitblock256_t arg1)
2738{
2739        return simd_and(simd256<(256)>::lomask(), arg1);
2740}
2741
2742//The total number of operations is 69
2743template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<1>::signextendh(bitblock256_t arg1)
2744{
2745        return esimd256<(2)>::mergeh(simd256<(2)>::srai<1>(arg1), simd256<(2)>::srai<1>(simd256<(2)>::slli<1>(arg1)));
2746}
2747
2748//The total number of operations is 89
2749template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<2>::signextendh(bitblock256_t arg1)
2750{
2751        return esimd256<(4)>::mergeh(simd256<(4)>::srai<2>(arg1), simd256<(4)>::srai<2>(simd256<(4)>::slli<2>(arg1)));
2752}
2753
2754//The total number of operations is 45
2755template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<4>::signextendh(bitblock256_t arg1)
2756{
2757        return esimd256<(8)>::mergeh(simd256<(8)>::srai<4>(arg1), simd256<(8)>::srai<4>(simd256<(8)>::slli<4>(arg1)));
2758}
2759
2760//The total number of operations is 22
2761template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<8>::signextendh(bitblock256_t arg1)
2762{
2763        return esimd256<(16)>::mergeh(simd256<(16)>::srai<8>(arg1), simd256<(16)>::srai<8>(simd256<(16)>::slli<8>(arg1)));
2764}
2765
2766//The total number of operations is 22
2767template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<16>::signextendh(bitblock256_t arg1)
2768{
2769        return esimd256<(32)>::mergeh(simd256<(32)>::srai<16>(arg1), simd256<(32)>::srai<16>(simd256<(32)>::slli<16>(arg1)));
2770}
2771
2772//The total number of operations is 54
2773template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<32>::signextendh(bitblock256_t arg1)
2774{
2775        return esimd256<(64)>::mergeh(simd256<(64)>::srai<32>(arg1), simd256<(64)>::srai<32>(simd256<(64)>::slli<32>(arg1)));
2776}
2777
2778//The total number of operations is 235
2779template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::signextendh(bitblock256_t arg1)
2780{
2781        return esimd256<(128)>::mergeh(simd256<(128)>::srai<64>(arg1), simd256<(128)>::srai<64>(simd256<(128)>::slli<64>(arg1)));
2782}
2783
2784//The total number of operations is 220
2785template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::signextendh(bitblock256_t arg1)
2786{
2787        return simd256<(256)>::srai<128>(arg1);
2788}
2789
2790//The total number of operations is 69
2791template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<1>::signextendl(bitblock256_t arg1)
2792{
2793        return esimd256<(2)>::mergel(simd256<(2)>::srai<1>(arg1), simd256<(2)>::srai<1>(simd256<(2)>::slli<1>(arg1)));
2794}
2795
2796//The total number of operations is 89
2797template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<2>::signextendl(bitblock256_t arg1)
2798{
2799        return esimd256<(4)>::mergel(simd256<(4)>::srai<2>(arg1), simd256<(4)>::srai<2>(simd256<(4)>::slli<2>(arg1)));
2800}
2801
2802//The total number of operations is 45
2803template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<4>::signextendl(bitblock256_t arg1)
2804{
2805        return esimd256<(8)>::mergel(simd256<(8)>::srai<4>(arg1), simd256<(8)>::srai<4>(simd256<(8)>::slli<4>(arg1)));
2806}
2807
2808//The total number of operations is 22
2809template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<8>::signextendl(bitblock256_t arg1)
2810{
2811        return esimd256<(16)>::mergel(simd256<(16)>::srai<8>(arg1), simd256<(16)>::srai<8>(simd256<(16)>::slli<8>(arg1)));
2812}
2813
2814//The total number of operations is 22
2815template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<16>::signextendl(bitblock256_t arg1)
2816{
2817        return esimd256<(32)>::mergel(simd256<(32)>::srai<16>(arg1), simd256<(32)>::srai<16>(simd256<(32)>::slli<16>(arg1)));
2818}
2819
2820//The total number of operations is 54
2821template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<32>::signextendl(bitblock256_t arg1)
2822{
2823        return esimd256<(64)>::mergel(simd256<(64)>::srai<32>(arg1), simd256<(64)>::srai<32>(simd256<(64)>::slli<32>(arg1)));
2824}
2825
2826//The total number of operations is 235
2827template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<64>::signextendl(bitblock256_t arg1)
2828{
2829        return esimd256<(128)>::mergel(simd256<(128)>::srai<64>(arg1), simd256<(128)>::srai<64>(simd256<(128)>::slli<64>(arg1)));
2830}
2831
2832//The total number of operations is 260
2833template <> IDISA_ALWAYS_INLINE bitblock256_t esimd256<128>::signextendl(bitblock256_t arg1)
2834{
2835        return simd256<(256)>::srai<128>(simd256<(256)>::slli<128>(arg1));
2836}
2837
2838//The total number of operations is 82
2839template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
2840{
2841        return simd_or(mvmd256<2>::srli<sh>(arg1), mvmd256<2>::slli<((128)-sh)>(arg2));
2842}
2843
2844//The total number of operations is 82
2845template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
2846{
2847        return simd_or(mvmd256<4>::srli<sh>(arg1), mvmd256<4>::slli<((64)-sh)>(arg2));
2848}
2849
2850//The total number of operations is 82
2851template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
2852{
2853        return simd_or(mvmd256<8>::srli<sh>(arg1), mvmd256<8>::slli<((32)-sh)>(arg2));
2854}
2855
2856//The total number of operations is 82
2857template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
2858{
2859        return simd_or(mvmd256<16>::srli<sh>(arg1), mvmd256<16>::slli<((16)-sh)>(arg2));
2860}
2861
2862//The total number of operations is 82
2863template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
2864{
2865        return simd_or(mvmd256<32>::srli<sh>(arg1), mvmd256<32>::slli<((8)-sh)>(arg2));
2866}
2867
2868//The total number of operations is 82
2869template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
2870{
2871        return simd_or(mvmd256<64>::srli<sh>(arg1), mvmd256<64>::slli<((4)-sh)>(arg2));
2872}
2873
2874//The total number of operations is 82
2875template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
2876{
2877        return simd_or(mvmd256<128>::srli<sh>(arg1), mvmd256<128>::slli<((2)-sh)>(arg2));
2878}
2879
2880//The total number of operations is 82
2881template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dsrli(bitblock256_t arg1, bitblock256_t arg2)
2882{
2883        return simd_or(mvmd256<256>::srli<sh>(arg1), mvmd256<256>::slli<((1)-sh)>(arg2));
2884}
2885
2886//The total number of operations is 1
2887template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill(uint64_t val1)
2888{
2889        return mvmd256<32>::fill((-1*val1));
2890}
2891
2892//The total number of operations is 1
2893template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill(uint64_t val1)
2894{
2895        return mvmd256<(4)>::fill(((val1<<2)|val1));
2896}
2897
2898//The total number of operations is 1
2899template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill(uint64_t val1)
2900{
2901        return mvmd256<(8)>::fill(((val1<<4)|val1));
2902}
2903
2904//The total number of operations is 1
2905template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill(uint64_t val1)
2906{
2907        return (bitblock256_t)_mm256_set1_epi8((int32_t)(val1));
2908}
2909
2910//The total number of operations is 1
2911template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill(uint64_t val1)
2912{
2913        return (bitblock256_t)_mm256_set1_epi16((int32_t)(val1));
2914}
2915
2916//The total number of operations is 1
2917template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill(uint64_t val1)
2918{
2919        return (bitblock256_t)_mm256_set1_epi32((int32_t)(val1));
2920}
2921
2922//The total number of operations is 2
2923template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<1>::extract(bitblock256_t arg1)
2924{
2925        return (((pos%2) == 0) ? (mvmd256<(2)>::extract<(pos/2)>(arg1)&(1)) : (mvmd256<(2)>::extract<(pos/2)>(arg1)>>1));
2926}
2927
2928//The total number of operations is 2
2929template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<2>::extract(bitblock256_t arg1)
2930{
2931        return (((pos%2) == 0) ? (mvmd256<(4)>::extract<(pos/2)>(arg1)&(3)) : (mvmd256<(4)>::extract<(pos/2)>(arg1)>>2));
2932}
2933
2934//The total number of operations is 2
2935template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<4>::extract(bitblock256_t arg1)
2936{
2937        return (((pos%2) == 0) ? (mvmd256<(8)>::extract<(pos/2)>(arg1)&(15)) : (mvmd256<(8)>::extract<(pos/2)>(arg1)>>4));
2938}
2939
2940//The total number of operations is 2
2941template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<8>::extract(bitblock256_t arg1)
2942{
2943        return (((pos%2) == 0) ? (mvmd256<(16)>::extract<(pos/2)>(arg1)&(255)) : (mvmd256<(16)>::extract<(pos/2)>(arg1)>>8));
2944}
2945
2946//The total number of operations is 2
2947template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<16>::extract(bitblock256_t arg1)
2948{
2949        return ((pos < 8) ? (65535&_mm_extract_epi16(avx_select_lo128(arg1), (int32_t)(pos))) : (65535&_mm_extract_epi16(avx_select_hi128(arg1), (int32_t)((pos-8)))));
2950}
2951
2952//The total number of operations is 2
2953template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<32>::extract(bitblock256_t arg1)
2954{
2955        return ((pos < 4) ? (((uint64_t)((4294967296UL)-1))&_mm_extract_epi32(avx_select_lo128(arg1), (int32_t)(pos))) : (((uint64_t)((4294967296UL)-1))&_mm_extract_epi32(avx_select_hi128(arg1), (int32_t)((pos-4)))));
2956}
2957
2958//The total number of operations is 4
2959template <> template <uint64_t pos> IDISA_ALWAYS_INLINE uint64_t mvmd256<64>::extract(bitblock256_t arg1)
2960{
2961        return ((((uint64_t)mvmd256<(32)>::extract<((2*pos)+1)>(arg1))<<(32))|mvmd256<(32)>::extract<(2*pos)>(arg1));
2962}
2963
2964//The total number of operations is 30
2965template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::splat(bitblock256_t arg1)
2966{
2967        bitblock256_t tmpArg = (((pos%2) == 0) ? simd256<(2)>::slli<1>(arg1) : simd256<(2)>::srli<1>(arg1));
2968        bitblock256_t arg11 = (((pos%2) == 0) ? simd_and(simd256<(2)>::lomask(), arg1) : simd_and(simd256<(2)>::himask(), arg1));
2969        return mvmd256<(2)>::splat<(pos/2)>(simd_or(tmpArg, arg11));
2970}
2971
2972//The total number of operations is 21
2973template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::splat(bitblock256_t arg1)
2974{
2975        bitblock256_t tmpArg = (((pos%2) == 0) ? simd256<(4)>::slli<2>(arg1) : simd256<(4)>::srli<2>(arg1));
2976        bitblock256_t arg11 = (((pos%2) == 0) ? simd_and(simd256<(4)>::lomask(), arg1) : simd_and(simd256<(4)>::himask(), arg1));
2977        return mvmd256<(4)>::splat<(pos/2)>(simd_or(tmpArg, arg11));
2978}
2979
2980//The total number of operations is 12
2981template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::splat(bitblock256_t arg1)
2982{
2983        bitblock256_t tmpArg = (((pos%2) == 0) ? simd256<(8)>::slli<4>(arg1) : simd256<(8)>::srli<4>(arg1));
2984        bitblock256_t arg11 = (((pos%2) == 0) ? simd_and(simd256<(8)>::lomask(), arg1) : simd_and(simd256<(8)>::himask(), arg1));
2985        return mvmd256<(8)>::splat<(pos/2)>(simd_or(tmpArg, arg11));
2986}
2987
2988//The total number of operations is 3
2989template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::splat(bitblock256_t arg1)
2990{
2991        return ((pos < 16) ? mvmd256<8>::fill(_mm_extract_epi8(avx_select_lo128(arg1), (int32_t)(pos))) : mvmd256<8>::fill(_mm_extract_epi8(avx_select_hi128(arg1), (int32_t)((pos-16)))));
2992}
2993
2994//The total number of operations is 3
2995template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::splat(bitblock256_t arg1)
2996{
2997        return ((pos < 8) ? mvmd256<16>::fill(_mm_extract_epi16(avx_select_lo128(arg1), (int32_t)(pos))) : mvmd256<16>::fill(_mm_extract_epi16(avx_select_hi128(arg1), (int32_t)((pos-8)))));
2998}
2999
3000//The total number of operations is 3
3001template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::splat(bitblock256_t arg1)
3002{
3003        return ((pos < 4) ? mvmd256<32>::fill(_mm_extract_epi32(avx_select_lo128(arg1), (int32_t)(pos))) : mvmd256<32>::fill(_mm_extract_epi32(avx_select_hi128(arg1), (int32_t)((pos-4)))));
3004}
3005
3006//The total number of operations is 9
3007template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::splat(bitblock256_t arg1)
3008{
3009        return simd256<1>::ifh(simd256<64>::himask(), mvmd256<(32)>::splat<((2*pos)+1)>(arg1), mvmd256<(32)>::splat<(2*pos)>(arg1));
3010}
3011
3012//The total number of operations is 21
3013template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::splat(bitblock256_t arg1)
3014{
3015        return simd256<1>::ifh(simd256<128>::himask(), mvmd256<(64)>::splat<((2*pos)+1)>(arg1), mvmd256<(64)>::splat<(2*pos)>(arg1));
3016}
3017
3018//The total number of operations is 45
3019template <> template <uint64_t pos> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::splat(bitblock256_t arg1)
3020{
3021        return simd256<1>::ifh(simd256<256>::himask(), mvmd256<(128)>::splat<((2*pos)+1)>(arg1), mvmd256<(128)>::splat<(2*pos)>(arg1));
3022}
3023
3024//The total number of operations is 15
3025template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
3026{
3027        return simd_or(mvmd256<(2)>::fill16((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd256<(2)>::fill16((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
3028}
3029
3030//The total number of operations is 7
3031template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
3032{
3033        return simd_or(mvmd256<(4)>::fill16((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd256<(4)>::fill16((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
3034}
3035
3036//The total number of operations is 3
3037template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
3038{
3039        return simd_or(mvmd256<(8)>::fill16((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd256<(8)>::fill16((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
3040}
3041
3042//The total number of operations is 1
3043template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
3044{
3045        return (bitblock256_t)_mm256_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
3046}
3047
3048//The total number of operations is 5
3049template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
3050{
3051        return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<16>::fill8(val1, val2, val3, val4, val5, val6, val7, val8), mvmd256<16>::fill8(val9, val10, val11, val12, val13, val14, val15, val16));
3052}
3053
3054//The total number of operations is 5
3055template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
3056{
3057        return simd256<1>::ifh(simd256<(4)>::himask(), mvmd256<1>::fill2(val1, val2), mvmd256<1>::fill2(val3, val4));
3058}
3059
3060//The total number of operations is 5
3061template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
3062{
3063        return simd256<1>::ifh(simd256<(8)>::himask(), mvmd256<2>::fill2(val1, val2), mvmd256<2>::fill2(val3, val4));
3064}
3065
3066//The total number of operations is 5
3067template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
3068{
3069        return simd256<1>::ifh(simd256<(16)>::himask(), mvmd256<4>::fill2(val1, val2), mvmd256<4>::fill2(val3, val4));
3070}
3071
3072//The total number of operations is 5
3073template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
3074{
3075        return simd256<1>::ifh(simd256<(32)>::himask(), mvmd256<8>::fill2(val1, val2), mvmd256<8>::fill2(val3, val4));
3076}
3077
3078//The total number of operations is 3
3079template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
3080{
3081        return simd_or(mvmd256<(32)>::fill4((val1<<16), (val3<<16), (val1<<16), (val3<<16)), mvmd256<(32)>::fill4((val2&(65535)), (val4&(65535)), (val2&(65535)), (val4&(65535))));
3082}
3083
3084//The total number of operations is 1
3085template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
3086{
3087        return (bitblock256_t)_mm256_set_epi32((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4));
3088}
3089
3090//The total number of operations is 41
3091template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::srli(bitblock256_t arg1)
3092{
3093        return simd256<256>::srli<(sh*2)>(arg1);
3094}
3095
3096//The total number of operations is 41
3097template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::srli(bitblock256_t arg1)
3098{
3099        return simd256<256>::srli<(sh*4)>(arg1);
3100}
3101
3102//The total number of operations is 41
3103template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::srli(bitblock256_t arg1)
3104{
3105        return simd256<256>::srli<(sh*8)>(arg1);
3106}
3107
3108//The total number of operations is 41
3109template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::srli(bitblock256_t arg1)
3110{
3111        return simd256<256>::srli<(sh*16)>(arg1);
3112}
3113
3114//The total number of operations is 41
3115template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::srli(bitblock256_t arg1)
3116{
3117        return simd256<256>::srli<(sh*32)>(arg1);
3118}
3119
3120//The total number of operations is 41
3121template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::srli(bitblock256_t arg1)
3122{
3123        return simd256<256>::srli<(sh*64)>(arg1);
3124}
3125
3126//The total number of operations is 41
3127template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::srli(bitblock256_t arg1)
3128{
3129        return simd256<256>::srli<(sh*128)>(arg1);
3130}
3131
3132//The total number of operations is 41
3133template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::srli(bitblock256_t arg1)
3134{
3135        return simd256<256>::srli<(sh*256)>(arg1);
3136}
3137
3138//The total number of operations is 1
3139template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill2(uint64_t val1, uint64_t val2)
3140{
3141        return mvmd256<(2)>::fill(((val1<<1)|(val2&(1))));
3142}
3143
3144//The total number of operations is 1
3145template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill2(uint64_t val1, uint64_t val2)
3146{
3147        return mvmd256<(4)>::fill(((val1<<2)|(val2&(3))));
3148}
3149
3150//The total number of operations is 1
3151template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill2(uint64_t val1, uint64_t val2)
3152{
3153        return mvmd256<(8)>::fill(((val1<<4)|(val2&(15))));
3154}
3155
3156//The total number of operations is 1
3157template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill2(uint64_t val1, uint64_t val2)
3158{
3159        return mvmd256<(16)>::fill(((val1<<8)|(val2&(255))));
3160}
3161
3162//The total number of operations is 1
3163template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill2(uint64_t val1, uint64_t val2)
3164{
3165        return mvmd256<(32)>::fill(((val1<<16)|(val2&(65535))));
3166}
3167
3168//The total number of operations is 5
3169template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill2(uint64_t val1, uint64_t val2)
3170{
3171        return simd256<1>::ifh(simd256<(64)>::himask(), mvmd256<32>::fill(val1), mvmd256<32>::fill(val2));
3172}
3173
3174//The total number of operations is 82
3175template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::dslli(bitblock256_t arg1, bitblock256_t arg2)
3176{
3177        return simd_or(mvmd256<2>::slli<sh>(arg1), mvmd256<2>::srli<((128)-sh)>(arg2));
3178}
3179
3180//The total number of operations is 82
3181template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::dslli(bitblock256_t arg1, bitblock256_t arg2)
3182{
3183        return simd_or(mvmd256<4>::slli<sh>(arg1), mvmd256<4>::srli<((64)-sh)>(arg2));
3184}
3185
3186//The total number of operations is 82
3187template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::dslli(bitblock256_t arg1, bitblock256_t arg2)
3188{
3189        return simd_or(mvmd256<8>::slli<sh>(arg1), mvmd256<8>::srli<((32)-sh)>(arg2));
3190}
3191
3192//The total number of operations is 82
3193template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::dslli(bitblock256_t arg1, bitblock256_t arg2)
3194{
3195        return simd_or(mvmd256<16>::slli<sh>(arg1), mvmd256<16>::srli<((16)-sh)>(arg2));
3196}
3197
3198//The total number of operations is 82
3199template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::dslli(bitblock256_t arg1, bitblock256_t arg2)
3200{
3201        return simd_or(mvmd256<32>::slli<sh>(arg1), mvmd256<32>::srli<((8)-sh)>(arg2));
3202}
3203
3204//The total number of operations is 82
3205template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::dslli(bitblock256_t arg1, bitblock256_t arg2)
3206{
3207        return simd_or(mvmd256<64>::slli<sh>(arg1), mvmd256<64>::srli<((4)-sh)>(arg2));
3208}
3209
3210//The total number of operations is 82
3211template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::dslli(bitblock256_t arg1, bitblock256_t arg2)
3212{
3213        return simd_or(mvmd256<128>::slli<sh>(arg1), mvmd256<128>::srli<((2)-sh)>(arg2));
3214}
3215
3216//The total number of operations is 82
3217template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::dslli(bitblock256_t arg1, bitblock256_t arg2)
3218{
3219        return simd_or(mvmd256<256>::slli<sh>(arg1), mvmd256<256>::srli<((1)-sh)>(arg2));
3220}
3221
3222//The total number of operations is 40
3223template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::slli(bitblock256_t arg1)
3224{
3225        return simd256<256>::slli<(sh*2)>(arg1);
3226}
3227
3228//The total number of operations is 40
3229template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::slli(bitblock256_t arg1)
3230{
3231        return simd256<256>::slli<(sh*4)>(arg1);
3232}
3233
3234//The total number of operations is 40
3235template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::slli(bitblock256_t arg1)
3236{
3237        return simd256<256>::slli<(sh*8)>(arg1);
3238}
3239
3240//The total number of operations is 40
3241template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::slli(bitblock256_t arg1)
3242{
3243        return simd256<256>::slli<(sh*16)>(arg1);
3244}
3245
3246//The total number of operations is 40
3247template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::slli(bitblock256_t arg1)
3248{
3249        return simd256<256>::slli<(sh*32)>(arg1);
3250}
3251
3252//The total number of operations is 40
3253template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<64>::slli(bitblock256_t arg1)
3254{
3255        return simd256<256>::slli<(sh*64)>(arg1);
3256}
3257
3258//The total number of operations is 40
3259template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<128>::slli(bitblock256_t arg1)
3260{
3261        return simd256<256>::slli<(sh*128)>(arg1);
3262}
3263
3264//The total number of operations is 40
3265template <> template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<256>::slli(bitblock256_t arg1)
3266{
3267        return simd256<256>::slli<(sh*256)>(arg1);
3268}
3269
3270//The total number of operations is 13
3271template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<1>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
3272{
3273        return simd256<1>::ifh(simd256<(8)>::himask(), mvmd256<1>::fill4(val1, val2, val3, val4), mvmd256<1>::fill4(val5, val6, val7, val8));
3274}
3275
3276//The total number of operations is 13
3277template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<2>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
3278{
3279        return simd256<1>::ifh(simd256<(16)>::himask(), mvmd256<2>::fill4(val1, val2, val3, val4), mvmd256<2>::fill4(val5, val6, val7, val8));
3280}
3281
3282//The total number of operations is 7
3283template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<4>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
3284{
3285        return simd_or(mvmd256<(8)>::fill8((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4)), mvmd256<(8)>::fill8((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15))));
3286}
3287
3288//The total number of operations is 3
3289template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<8>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
3290{
3291        return simd_or(mvmd256<(16)>::fill8((val1<<8), (val3<<8), (val5<<8), (val7<<8), (val1<<8), (val3<<8), (val5<<8), (val7<<8)), mvmd256<(16)>::fill8((val2&(255)), (val4&(255)), (val6&(255)), (val8&(255)), (val2&(255)), (val4&(255)), (val6&(255)), (val8&(255))));
3292}
3293
3294//The total number of operations is 1
3295template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<16>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
3296{
3297        return (bitblock256_t)_mm256_set_epi16((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8));
3298}
3299
3300//The total number of operations is 5
3301template <> IDISA_ALWAYS_INLINE bitblock256_t mvmd256<32>::fill8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
3302{
3303        return simd256<1>::ifh(simd256<(256)>::himask(), mvmd256<32>::fill4(val1, val2, val3, val4), mvmd256<32>::fill4(val5, val6, val7, val8));
3304}
3305
3306//The total number of operations is 1
3307IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_unaligned(float const* arg1)
3308{
3309        return _mm256_loadu_ps((float const*)(arg1));
3310}
3311
3312//The total number of operations is 41
3313template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::srli(bitblock256_t arg1)
3314{
3315        return simd256<256>::srli<sh>(arg1);
3316}
3317
3318//The total number of operations is 1
3319IDISA_ALWAYS_INLINE void bitblock256::store_aligned(float* arg1, bitblock256_t arg2)
3320{
3321        _mm256_store_ps((float*)(arg1), arg2);
3322}
3323
3324//The total number of operations is 209
3325IDISA_ALWAYS_INLINE uint64_t bitblock256::popcount(bitblock256_t arg1)
3326{
3327        return mvmd256<64>::extract<0>(simd256<256>::popcount(arg1));
3328}
3329
3330//The total number of operations is 2
3331IDISA_ALWAYS_INLINE bool bitblock256::all(bitblock256_t arg1)
3332{
3333        return _mm256_testz_si256(((__m256i)simd_not(arg1)), ((__m256i)simd256<8>::constant<-1>())) == 1;
3334}
3335
3336//The total number of operations is 40
3337template <uint64_t sh> IDISA_ALWAYS_INLINE bitblock256_t bitblock256::slli(bitblock256_t arg1)
3338{
3339        return simd256<256>::slli<sh>(arg1);
3340}
3341
3342//The total number of operations is 1
3343IDISA_ALWAYS_INLINE bool bitblock256::any(bitblock256_t arg1)
3344{
3345        return _mm256_testz_si256(((__m256i)arg1), ((__m256i)arg1)) == 0;
3346}
3347
3348//The total number of operations is 1
3349IDISA_ALWAYS_INLINE bitblock256_t bitblock256::load_aligned(float const* arg1)
3350{
3351        return _mm256_load_ps((float const*)(arg1));
3352}
3353
3354//The total number of operations is 1
3355IDISA_ALWAYS_INLINE void bitblock256::store_unaligned(float* arg1, bitblock256_t arg2)
3356{
3357        _mm256_storeu_ps((float*)(arg1), arg2);
3358}
3359
3360#endif
Note: See TracBrowser for help on using the repository browser.