source: trunk/lib_c/idisa_c/idisa_sse4_2_c.h @ 3391

Last change on this file since 3391 was 3391, checked in by linmengl, 6 years ago

check in IDISA C library and other support libraries. Some template features still remain.

File size: 110.6 KB
Line 
1
2/* Copyright (c) 2011, Hua Huang and Robert D. Cameron.
3   Licensed under the Academic Free License 3.0.
4   This file is generated by the IDISA+ generator;
5   modifications should be made only by changing the
6   generator configuration and data files. */
7
8#ifndef _IDISA_SSE4_2_C_H
9#define _IDISA_SSE4_2_C_H
#include <stdbool.h>
#include <stdint.h>

#include "smmintrin.h"
13typedef __m128i bitblock128_t;
14
/* Pack four 2-bit selectors into one 8-bit shuffle immediate: s1 lands in
   bits 7..6, s2 in bits 5..4, s3 in bits 3..2 and s4 in bits 1..0.
   Each argument is parenthesized so that an expression argument (for
   example a|b) is evaluated as a whole before it is shifted. */
#define shufflemask4(s1, s2, s3, s4) \
        (((s1)<<6) | ((s2)<<4) | ((s3)<<2) | (s4))
17
/* Translate a 2-bit mask (values 0..3) into the equivalent four-selector
   immediate built by shufflemask4: 3 -> 238 (3,2,3,2), 2 -> 228 (3,2,1,0),
   1 -> 78 (1,0,3,2), anything else -> 68 (1,0,1,0).
   msk is parenthesized so that expression arguments compare as a whole. */
#define shufflemask4_from_shufflemask2(msk) \
        ((msk)==3 ? 238 : ((msk)==2 ? 228 : ((msk)==1 ? 78 : 68)))
20
/* Convert a mask made of eight 3-bit selectors into eight 2-bit selectors
   by keeping the low two bits of each 3-bit group. Selector i is read from
   bits (3*i+1)..(3*i) of msk and written to bits (2*i+1)..(2*i) of the
   result. msk is parenthesized so that expression arguments are masked and
   shifted as a whole. */
#define shufflemask8_to_shufflemask4(msk) \
        (((msk)&3) | ((((msk)>>3)&3)<<2) | ((((msk)>>6)&3)<<4) | ((((msk)>>9)&3)<<6) | ((((msk)>>12)&3)<<8) | ((((msk)>>15)&3)<<10) | ((((msk)>>18)&3)<<12) | ((((msk)>>21)&3)<<14))
23
24//Declaration Starts here
25static inline bitblock128_t esimd_mergel_32(bitblock128_t arg1, bitblock128_t arg2);
26static inline bitblock128_t esimd_mergel_1(bitblock128_t arg1, bitblock128_t arg2);
27static inline bitblock128_t esimd_mergel_2(bitblock128_t arg1, bitblock128_t arg2);
28static inline bitblock128_t esimd_mergel_4(bitblock128_t arg1, bitblock128_t arg2);
29static inline bitblock128_t esimd_mergel_8(bitblock128_t arg1, bitblock128_t arg2);
30static inline bitblock128_t esimd_mergel_64(bitblock128_t arg1, bitblock128_t arg2);
31static inline bitblock128_t esimd_mergel_16(bitblock128_t arg1, bitblock128_t arg2);
32static inline bitblock128_t esimd_signextendh_32(bitblock128_t arg1);
33static inline bitblock128_t esimd_signextendh_1(bitblock128_t arg1);
34static inline bitblock128_t esimd_signextendh_2(bitblock128_t arg1);
35static inline bitblock128_t esimd_signextendh_4(bitblock128_t arg1);
36static inline bitblock128_t esimd_signextendh_8(bitblock128_t arg1);
37static inline bitblock128_t esimd_signextendh_64(bitblock128_t arg1);
38static inline bitblock128_t esimd_signextendh_16(bitblock128_t arg1);
39static inline bitblock128_t simd_max_32(bitblock128_t arg1, bitblock128_t arg2);
40static inline bitblock128_t simd_max_1(bitblock128_t arg1, bitblock128_t arg2);
41static inline bitblock128_t simd_max_2(bitblock128_t arg1, bitblock128_t arg2);
42static inline bitblock128_t simd_max_4(bitblock128_t arg1, bitblock128_t arg2);
43static inline bitblock128_t simd_max_8(bitblock128_t arg1, bitblock128_t arg2);
44static inline bitblock128_t simd_max_64(bitblock128_t arg1, bitblock128_t arg2);
45static inline bitblock128_t simd_max_128(bitblock128_t arg1, bitblock128_t arg2);
46static inline bitblock128_t simd_max_16(bitblock128_t arg1, bitblock128_t arg2);
47static inline bitblock128_t esimd_mergeh_32(bitblock128_t arg1, bitblock128_t arg2);
48static inline bitblock128_t esimd_mergeh_1(bitblock128_t arg1, bitblock128_t arg2);
49static inline bitblock128_t esimd_mergeh_2(bitblock128_t arg1, bitblock128_t arg2);
50static inline bitblock128_t esimd_mergeh_4(bitblock128_t arg1, bitblock128_t arg2);
51static inline bitblock128_t esimd_mergeh_8(bitblock128_t arg1, bitblock128_t arg2);
52static inline bitblock128_t esimd_mergeh_64(bitblock128_t arg1, bitblock128_t arg2);
53static inline bitblock128_t esimd_mergeh_16(bitblock128_t arg1, bitblock128_t arg2);
54static inline bitblock128_t simd_mult_32(bitblock128_t arg1, bitblock128_t arg2);
55static inline bitblock128_t simd_mult_1(bitblock128_t arg1, bitblock128_t arg2);
56static inline bitblock128_t simd_mult_2(bitblock128_t arg1, bitblock128_t arg2);
57static inline bitblock128_t simd_mult_4(bitblock128_t arg1, bitblock128_t arg2);
58static inline bitblock128_t simd_mult_8(bitblock128_t arg1, bitblock128_t arg2);
59static inline bitblock128_t simd_mult_64(bitblock128_t arg1, bitblock128_t arg2);
60static inline bitblock128_t simd_mult_128(bitblock128_t arg1, bitblock128_t arg2);
61static inline bitblock128_t simd_mult_16(bitblock128_t arg1, bitblock128_t arg2);
62static inline bitblock128_t hsimd_umin_hl_32(bitblock128_t arg1, bitblock128_t arg2);
63static inline bitblock128_t hsimd_umin_hl_2(bitblock128_t arg1, bitblock128_t arg2);
64static inline bitblock128_t hsimd_umin_hl_4(bitblock128_t arg1, bitblock128_t arg2);
65static inline bitblock128_t hsimd_umin_hl_8(bitblock128_t arg1, bitblock128_t arg2);
66static inline bitblock128_t hsimd_umin_hl_64(bitblock128_t arg1, bitblock128_t arg2);
67static inline bitblock128_t hsimd_umin_hl_128(bitblock128_t arg1, bitblock128_t arg2);
68static inline bitblock128_t hsimd_umin_hl_16(bitblock128_t arg1, bitblock128_t arg2);
69static inline bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
70static inline bitblock128_t simd_gt_32(bitblock128_t arg1, bitblock128_t arg2);
71static inline bitblock128_t simd_gt_1(bitblock128_t arg1, bitblock128_t arg2);
72static inline bitblock128_t simd_gt_2(bitblock128_t arg1, bitblock128_t arg2);
73static inline bitblock128_t simd_gt_4(bitblock128_t arg1, bitblock128_t arg2);
74static inline bitblock128_t simd_gt_8(bitblock128_t arg1, bitblock128_t arg2);
75static inline bitblock128_t simd_gt_64(bitblock128_t arg1, bitblock128_t arg2);
76static inline bitblock128_t simd_gt_128(bitblock128_t arg1, bitblock128_t arg2);
77static inline bitblock128_t simd_gt_16(bitblock128_t arg1, bitblock128_t arg2);
78static inline bitblock128_t simd_not(bitblock128_t arg1);
79static inline bitblock128_t bitblock_sll(bitblock128_t arg1, bitblock128_t arg2);
80static inline bitblock128_t simd_umult_32(bitblock128_t arg1, bitblock128_t arg2);
81static inline bitblock128_t simd_umult_1(bitblock128_t arg1, bitblock128_t arg2);
82static inline bitblock128_t simd_umult_2(bitblock128_t arg1, bitblock128_t arg2);
83static inline bitblock128_t simd_umult_4(bitblock128_t arg1, bitblock128_t arg2);
84static inline bitblock128_t simd_umult_8(bitblock128_t arg1, bitblock128_t arg2);
85static inline bitblock128_t simd_umult_64(bitblock128_t arg1, bitblock128_t arg2);
86static inline bitblock128_t simd_umult_16(bitblock128_t arg1, bitblock128_t arg2);
87static inline bitblock128_t hsimd_add_hl_32(bitblock128_t arg1, bitblock128_t arg2);
88static inline bitblock128_t hsimd_add_hl_2(bitblock128_t arg1, bitblock128_t arg2);
89static inline bitblock128_t hsimd_add_hl_4(bitblock128_t arg1, bitblock128_t arg2);
90static inline bitblock128_t hsimd_add_hl_8(bitblock128_t arg1, bitblock128_t arg2);
91static inline bitblock128_t hsimd_add_hl_64(bitblock128_t arg1, bitblock128_t arg2);
92static inline bitblock128_t hsimd_add_hl_128(bitblock128_t arg1, bitblock128_t arg2);
93static inline bitblock128_t hsimd_add_hl_16(bitblock128_t arg1, bitblock128_t arg2);
94static inline bitblock128_t simd_ult_32(bitblock128_t arg1, bitblock128_t arg2);
95static inline bitblock128_t simd_ult_1(bitblock128_t arg1, bitblock128_t arg2);
96static inline bitblock128_t simd_ult_2(bitblock128_t arg1, bitblock128_t arg2);
97static inline bitblock128_t simd_ult_4(bitblock128_t arg1, bitblock128_t arg2);
98static inline bitblock128_t simd_ult_8(bitblock128_t arg1, bitblock128_t arg2);
99static inline bitblock128_t simd_ult_64(bitblock128_t arg1, bitblock128_t arg2);
100static inline bitblock128_t simd_ult_128(bitblock128_t arg1, bitblock128_t arg2);
101static inline bitblock128_t simd_ult_16(bitblock128_t arg1, bitblock128_t arg2);
/* Immediate-mask field shuffles. msk must be a compile-time constant because
   it ends up as the immediate operand of an SSE shuffle intrinsic. */

//The total number of operations is 1.0
/* Shuffle the four 32-bit fields of arg1 with the 8-bit immediate msk. */
#define mvmd_shufflei_32(msk, arg1) \
        _mm_shuffle_epi32(arg1, (int32_t)(msk))

//The total number of operations is 1.0
/* Shuffle the two 64-bit fields: the 2-bit msk is first widened to the
   equivalent 32-bit-field immediate. msk is passed on parenthesized because
   the helper macro compares it with ==. */
#define mvmd_shufflei_64(msk, arg1) \
        mvmd_shufflei_32(shufflemask4_from_shufflemask2((msk)), arg1)

//The total number of operations is 13.6666666667
/* Shuffle the eight 16-bit fields; msk holds eight 3-bit selectors.
   _mm_shufflehi_epi16/_mm_shufflelo_epi16 can only move fields within one
   64-bit half, so two candidates are built and blended with simd_ifh_1:
     - arg1 shuffled within its own halves, and
     - arg1 with its halves swapped (via the 64-bit block shifts) and then
       shuffled.
   The blend mask is derived from bit 2 of each selector: for the four upper
   result fields the first candidate is chosen when that bit is set, for the
   four lower result fields when it is clear.
   NOTE(review): 131071 (0x1FFFF) is wider than 16 bits; presumably
   mvmd_fill8_16 truncates it to an all-ones 16-bit field -- confirm. */
#define mvmd_shufflei_16(msk, arg1) \
        simd_ifh_1( \
            mvmd_fill8_16( \
                (((((msk)>>21)&4) == 0) ? 0 : (131071)), \
                (((((msk)>>18)&4) == 0) ? 0 : (131071)), \
                (((((msk)>>15)&4) == 0) ? 0 : (131071)), \
                (((((msk)>>12)&4) == 0) ? 0 : (131071)), \
                (((((msk)>>9)&4) == 0) ? (131071) : 0), \
                (((((msk)>>6)&4) == 0) ? (131071) : 0), \
                (((((msk)>>3)&4) == 0) ? (131071) : 0), \
                ((((msk)&4) == 0) ? (131071) : 0)), \
            _mm_shufflelo_epi16( \
                _mm_shufflehi_epi16(arg1, (int32_t)((shufflemask8_to_shufflemask4((msk))>>8))), \
                (int32_t)((shufflemask8_to_shufflemask4((msk))&255))), \
            simd_or( \
                _mm_shufflehi_epi16(simd_slli_128(64, arg1), (int32_t)((shufflemask8_to_shufflemask4((msk))>>8))), \
                _mm_shufflelo_epi16(simd_srli_128(64, arg1), (int32_t)((shufflemask8_to_shufflemask4((msk))&255)))))
113
/* Logical right shift of every field by the immediate count sh.
   sh is parenthesized wherever it takes part in arithmetic so that
   expression arguments are evaluated as a whole. */

//The total number of operations is 1.0
#define simd_srli_32(sh, arg1) \
        _mm_srli_epi32(arg1, (int32_t)(sh))

//The total number of operations is 2.0
/* No native 2-bit shift: shift as 32-bit fields, then mask off the bits that
   crossed in from the neighbouring 2-bit field. */
#define simd_srli_2(sh, arg1) \
        simd_and(simd_srli_32(sh, arg1), simd_constant_2(((3)>>(sh))))

//The total number of operations is 2.0
/* Same technique as simd_srli_2, for 4-bit fields. */
#define simd_srli_4(sh, arg1) \
        simd_and(simd_srli_32(sh, arg1), simd_constant_4(((15)>>(sh))))

//The total number of operations is 2.0
/* Same technique as simd_srli_2, for 8-bit fields. */
#define simd_srli_8(sh, arg1) \
        simd_and(simd_srli_32(sh, arg1), simd_constant_8(((255)>>(sh))))

//The total number of operations is 1.0
#define simd_srli_64(sh, arg1) \
        _mm_srli_epi64(arg1, (int32_t)(sh))

//The total number of operations is 2.33333333333
/* Whole-register right shift by sh bits, in three cases:
     - sh is a multiple of 8: a single byte shift;
     - sh >= 64: byte-shift by 8 bytes, then shift each 64-bit field by the
       remaining sh & 63 bits;
     - otherwise: shift both 64-bit fields by sh and OR in the bits that
       fall from the upper field into the lower one. */
#define simd_srli_128(sh, arg1) \
        ((((sh)%8) == 0) \
            ? _mm_srli_si128(arg1, (int32_t)(((sh)/8))) \
            : (((sh) >= 64) \
                ? simd_srli_64(((sh)&63), _mm_srli_si128(arg1, (int32_t)(8))) \
                : simd_or(simd_srli_64(sh, arg1), _mm_srli_si128(simd_slli_64(((128-(sh))&63), arg1), (int32_t)(8)))))

//The total number of operations is 1.0
#define simd_srli_16(sh, arg1) \
        _mm_srli_epi16(arg1, (int32_t)(sh))
141
142static inline bitblock128_t bitblock_load_unaligned(const bitblock128_t* arg1);
/* Double-register field shift right: the result is arg1 moved right by sh
   fields, ORed with arg2 moved left by (N - sh) fields, where N is the
   constant written in each macro (4 for 32-bit fields, 64 for 2-bit, ...).
   sh is parenthesized so that expression arguments are evaluated as a whole
   before the subtraction and before the scaling done by the helper macros. */

//The total number of operations is 3.0
#define mvmd_dsrli_32(sh, arg1, arg2) \
        simd_or(mvmd_srli_32((sh), arg1), mvmd_slli_32(((4)-(sh)), arg2))

//The total number of operations is 5.66666666667
#define mvmd_dsrli_2(sh, arg1, arg2) \
        simd_or(mvmd_srli_2((sh), arg1), mvmd_slli_2(((64)-(sh)), arg2))

//The total number of operations is 5.66666666667
#define mvmd_dsrli_4(sh, arg1, arg2) \
        simd_or(mvmd_srli_4((sh), arg1), mvmd_slli_4(((32)-(sh)), arg2))

//The total number of operations is 3.0
#define mvmd_dsrli_8(sh, arg1, arg2) \
        simd_or(mvmd_srli_8((sh), arg1), mvmd_slli_8(((16)-(sh)), arg2))

//The total number of operations is 3.0
#define mvmd_dsrli_64(sh, arg1, arg2) \
        simd_or(mvmd_srli_64((sh), arg1), mvmd_slli_64(((2)-(sh)), arg2))

//The total number of operations is 3.0
#define mvmd_dsrli_128(sh, arg1, arg2) \
        simd_or(mvmd_srli_128((sh), arg1), mvmd_slli_128(((1)-(sh)), arg2))

//The total number of operations is 3.0
#define mvmd_dsrli_16(sh, arg1, arg2) \
        simd_or(mvmd_srli_16((sh), arg1), mvmd_slli_16(((8)-(sh)), arg2))
170
//The total number of operations is 2.33333333333
/* Whole-block right shift by the immediate bit count sh; an alias for
   simd_srli_128. sh is passed on parenthesized because simd_srli_128 does
   arithmetic on it. */
#define bitblock_srli(sh, arg1) \
        simd_srli_128((sh), arg1)
174
175static inline bitblock128_t simd_ctz_32(bitblock128_t arg1);
176static inline bitblock128_t simd_ctz_1(bitblock128_t arg1);
177static inline bitblock128_t simd_ctz_2(bitblock128_t arg1);
178static inline bitblock128_t simd_ctz_4(bitblock128_t arg1);
179static inline bitblock128_t simd_ctz_8(bitblock128_t arg1);
180static inline bitblock128_t simd_ctz_64(bitblock128_t arg1);
181static inline bitblock128_t simd_ctz_128(bitblock128_t arg1);
182static inline bitblock128_t simd_ctz_16(bitblock128_t arg1);
183static inline bitblock128_t simd_sll_64(bitblock128_t arg1, bitblock128_t shift_mask);
184static inline bitblock128_t simd_sll_128(bitblock128_t arg1, bitblock128_t shift_mask);
185static inline bitblock128_t mvmd_fill_32(uint64_t val1);
186static inline bitblock128_t mvmd_fill_1(uint64_t val1);
187static inline bitblock128_t mvmd_fill_2(uint64_t val1);
188static inline bitblock128_t mvmd_fill_4(uint64_t val1);
189static inline bitblock128_t mvmd_fill_8(uint64_t val1);
190static inline bitblock128_t mvmd_fill_64(uint64_t val1);
191static inline bitblock128_t mvmd_fill_128(uint64_t val1);
192static inline bitblock128_t mvmd_fill_16(uint64_t val1);
193static inline bitblock128_t mvmd_shuffle_32(bitblock128_t arg1, bitblock128_t arg2);
194static inline bitblock128_t mvmd_shuffle_8(bitblock128_t arg1, bitblock128_t arg2);
195static inline bitblock128_t mvmd_shuffle_64(bitblock128_t arg1, bitblock128_t arg2);
196static inline bitblock128_t mvmd_shuffle_16(bitblock128_t arg1, bitblock128_t arg2);
197static inline bitblock128_t hsimd_packss_32(bitblock128_t arg1, bitblock128_t arg2);
198static inline bitblock128_t hsimd_packss_2(bitblock128_t arg1, bitblock128_t arg2);
199static inline bitblock128_t hsimd_packss_4(bitblock128_t arg1, bitblock128_t arg2);
200static inline bitblock128_t hsimd_packss_8(bitblock128_t arg1, bitblock128_t arg2);
201static inline bitblock128_t hsimd_packss_64(bitblock128_t arg1, bitblock128_t arg2);
202static inline bitblock128_t hsimd_packss_128(bitblock128_t arg1, bitblock128_t arg2);
203static inline bitblock128_t hsimd_packss_16(bitblock128_t arg1, bitblock128_t arg2);
204static inline bitblock128_t bitblock_srl(bitblock128_t arg1, bitblock128_t arg2);
205static inline void bitblock_store_aligned(bitblock128_t arg1, bitblock128_t* arg2);
206static inline bitblock128_t simd_eq_32(bitblock128_t arg1, bitblock128_t arg2);
207static inline bitblock128_t simd_eq_1(bitblock128_t arg1, bitblock128_t arg2);
208static inline bitblock128_t simd_eq_2(bitblock128_t arg1, bitblock128_t arg2);
209static inline bitblock128_t simd_eq_4(bitblock128_t arg1, bitblock128_t arg2);
210static inline bitblock128_t simd_eq_8(bitblock128_t arg1, bitblock128_t arg2);
211static inline bitblock128_t simd_eq_64(bitblock128_t arg1, bitblock128_t arg2);
212static inline bitblock128_t simd_eq_128(bitblock128_t arg1, bitblock128_t arg2);
213static inline bitblock128_t simd_eq_16(bitblock128_t arg1, bitblock128_t arg2);
214static inline bitblock128_t simd_popcount_32(bitblock128_t arg1);
215static inline bitblock128_t simd_popcount_1(bitblock128_t arg1);
216static inline bitblock128_t simd_popcount_2(bitblock128_t arg1);
217static inline bitblock128_t simd_popcount_4(bitblock128_t arg1);
218static inline bitblock128_t simd_popcount_8(bitblock128_t arg1);
219static inline bitblock128_t simd_popcount_64(bitblock128_t arg1);
220static inline bitblock128_t simd_popcount_128(bitblock128_t arg1);
221static inline bitblock128_t simd_popcount_16(bitblock128_t arg1);
222static inline bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
/* Extract the field at index pos of arg1 as an unsigned integer value.
   pos must be a compile-time constant (it reaches an intrinsic immediate).
   Fields narrower than 16 bits are obtained by extracting the enclosing
   field of twice the width and keeping its low half (even pos) or its high
   half (odd pos). pos is parenthesized so that expression arguments are
   evaluated as a whole before %, / and *. */

//The total number of operations is 1.0
/* The mask with 2^32-1 removes the sign extension of the int result. */
#define mvmd_extract_32(pos, arg1) \
        (((uint64_t)(((4294967296ULL)-1)))&_mm_extract_epi32(arg1, (int32_t)(pos)))

//The total number of operations is 1.0
#define mvmd_extract_1(pos, arg1) \
        ((((pos)%2) == 0) ? (mvmd_extract_2(((pos)/2), arg1)&(1)) : (mvmd_extract_2(((pos)/2), arg1)>>1))

//The total number of operations is 1.0
#define mvmd_extract_2(pos, arg1) \
        ((((pos)%2) == 0) ? (mvmd_extract_4(((pos)/2), arg1)&(3)) : (mvmd_extract_4(((pos)/2), arg1)>>2))

//The total number of operations is 1.0
#define mvmd_extract_4(pos, arg1) \
        ((((pos)%2) == 0) ? (mvmd_extract_8(((pos)/2), arg1)&(15)) : (mvmd_extract_8(((pos)/2), arg1)>>4))

//The total number of operations is 1.0
#define mvmd_extract_8(pos, arg1) \
        ((((pos)%2) == 0) ? (mvmd_extract_16(((pos)/2), arg1)&(255)) : (mvmd_extract_16(((pos)/2), arg1)>>8))

//The total number of operations is 2.0
/* Assemble a 64-bit value from the two 32-bit fields 2*pos+1 (high half)
   and 2*pos (low half). */
#define mvmd_extract_64(pos, arg1) \
        ((((uint64_t)(mvmd_extract_32(((2*(pos))+1), arg1)))<<(32))|mvmd_extract_32((2*(pos)), arg1))

//The total number of operations is 1.0
#define mvmd_extract_16(pos, arg1) \
        (65535&_mm_extract_epi16(arg1, (int32_t)(pos)))
250
251static inline bitblock128_t simd_neg_32(bitblock128_t arg1);
252static inline bitblock128_t simd_neg_2(bitblock128_t arg1);
253static inline bitblock128_t simd_neg_4(bitblock128_t arg1);
254static inline bitblock128_t simd_neg_8(bitblock128_t arg1);
255static inline bitblock128_t simd_neg_64(bitblock128_t arg1);
256static inline bitblock128_t simd_neg_128(bitblock128_t arg1);
257static inline bitblock128_t simd_neg_16(bitblock128_t arg1);
/* Replicate the field at index pos of arg1 into every field of the result.
   pos must be a compile-time constant. pos is parenthesized so that
   expression arguments are evaluated as a whole before %, / and *. */

//The total number of operations is 1.0
#define mvmd_splat_32(pos, arg1) \
        mvmd_shufflei_32(shufflemask4((pos), (pos), (pos), (pos)), arg1)

//The total number of operations is 12.6666666667
/* Move bit pos down to bit 0, isolate it, and subtract it from zero as a
   128-bit quantity. */
#define mvmd_splat_1(pos, arg1) \
        simd_sub_128(simd_constant_128(0), simd_and(simd_constant_128(1), simd_srli_128((pos), arg1)))

//The total number of operations is 10.0
/* Duplicate the selected 2-bit half inside every 4-bit field, then splat the
   enclosing 4-bit field at index pos/2. */
#define mvmd_splat_2(pos, arg1) \
        mvmd_splat_4(((pos)/2), simd_or(((((pos)%2) == 0) ? simd_slli_4(2, arg1) : simd_srli_4(2, arg1)), ((((pos)%2) == 0) ? simd_and(simd_lomask_4(), arg1) : simd_and(simd_himask_4(), arg1))))

//The total number of operations is 6.0
/* Duplicate the selected 4-bit half inside every 8-bit field, then splat the
   enclosing 8-bit field at index pos/2. */
#define mvmd_splat_4(pos, arg1) \
        mvmd_splat_8(((pos)/2), simd_or(((((pos)%2) == 0) ? simd_slli_8(4, arg1) : simd_srli_8(4, arg1)), ((((pos)%2) == 0) ? simd_and(simd_lomask_8(), arg1) : simd_and(simd_himask_8(), arg1))))

//The total number of operations is 2.0
#define mvmd_splat_8(pos, arg1) \
        mvmd_fill_8(_mm_extract_epi8(arg1, (int32_t)(pos)))

//The total number of operations is 5.0
/* Blend the splats of the two 32-bit fields that make up 64-bit field pos. */
#define mvmd_splat_64(pos, arg1) \
        simd_ifh_1(simd_himask_64(), mvmd_splat_32(((2*(pos))+1), arg1), mvmd_splat_32((2*(pos)), arg1))

//The total number of operations is 13.0
/* Blend the splats of the two 64-bit fields that make up 128-bit field pos. */
#define mvmd_splat_128(pos, arg1) \
        simd_ifh_1(simd_himask_128(), mvmd_splat_64(((2*(pos))+1), arg1), mvmd_splat_64((2*(pos)), arg1))

//The total number of operations is 2.0
#define mvmd_splat_16(pos, arg1) \
        mvmd_fill_16(_mm_extract_epi16(arg1, (int32_t)(pos)))
289
290static inline bitblock128_t hsimd_packh_32(bitblock128_t arg1, bitblock128_t arg2);
291static inline bitblock128_t hsimd_packh_2(bitblock128_t arg1, bitblock128_t arg2);
292static inline bitblock128_t hsimd_packh_4(bitblock128_t arg1, bitblock128_t arg2);
293static inline bitblock128_t hsimd_packh_8(bitblock128_t arg1, bitblock128_t arg2);
294static inline bitblock128_t hsimd_packh_64(bitblock128_t arg1, bitblock128_t arg2);
295static inline bitblock128_t hsimd_packh_128(bitblock128_t arg1, bitblock128_t arg2);
296static inline bitblock128_t hsimd_packh_16(bitblock128_t arg1, bitblock128_t arg2);
297static inline bitblock128_t simd_himask_32();
298static inline bitblock128_t simd_himask_2();
299static inline bitblock128_t simd_himask_4();
300static inline bitblock128_t simd_himask_8();
301static inline bitblock128_t simd_himask_64();
302static inline bitblock128_t simd_himask_128();
303static inline bitblock128_t simd_himask_16();
/* Left shift of every field by the immediate count sh.
   sh is parenthesized wherever it takes part in arithmetic so that
   expression arguments are evaluated as a whole. */

//The total number of operations is 1.0
#define simd_slli_32(sh, arg1) \
        _mm_slli_epi32(arg1, (int32_t)(sh))

//The total number of operations is 2.0
/* No native 2-bit shift: shift as 32-bit fields, then mask off the bits that
   crossed in from the neighbouring 2-bit field. */
#define simd_slli_2(sh, arg1) \
        simd_and(simd_slli_32(sh, arg1), simd_constant_2((((3)<<(sh))&(3))))

//The total number of operations is 2.0
/* Same technique as simd_slli_2, for 4-bit fields. */
#define simd_slli_4(sh, arg1) \
        simd_and(simd_slli_32(sh, arg1), simd_constant_4((((15)<<(sh))&(15))))

//The total number of operations is 2.0
/* Same technique as simd_slli_2, for 8-bit fields. */
#define simd_slli_8(sh, arg1) \
        simd_and(simd_slli_32(sh, arg1), simd_constant_8((((255)<<(sh))&(255))))

//The total number of operations is 1.0
#define simd_slli_64(sh, arg1) \
        _mm_slli_epi64(arg1, (int32_t)(sh))

//The total number of operations is 2.33333333333
/* Whole-register left shift by sh bits, in three cases:
     - sh is a multiple of 8: a single byte shift;
     - sh >= 64: byte-shift by 8 bytes, then shift each 64-bit field by the
       remaining sh & 63 bits;
     - otherwise: shift both 64-bit fields by sh and OR in the bits that
       carry from the lower field into the upper one. */
#define simd_slli_128(sh, arg1) \
        ((((sh)%8) == 0) \
            ? _mm_slli_si128(arg1, (int32_t)(((sh)/8))) \
            : (((sh) >= 64) \
                ? simd_slli_64(((sh)&63), _mm_slli_si128(arg1, (int32_t)(8))) \
                : simd_or(simd_slli_64(sh, arg1), _mm_slli_si128(simd_srli_64(((128-(sh))&63), arg1), (int32_t)(8)))))

//The total number of operations is 1.0
#define simd_slli_16(sh, arg1) \
        _mm_slli_epi16(arg1, (int32_t)(sh))
331
332static inline bool bitblock_all(bitblock128_t arg1);
333static inline bitblock128_t simd_ifh_32(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
334static inline bitblock128_t simd_ifh_1(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
335static inline bitblock128_t simd_ifh_2(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
336static inline bitblock128_t simd_ifh_4(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
337static inline bitblock128_t simd_ifh_8(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
338static inline bitblock128_t simd_ifh_64(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
339static inline bitblock128_t simd_ifh_128(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
340static inline bitblock128_t simd_ifh_16(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
341static inline bitblock128_t simd_sub_32(bitblock128_t arg1, bitblock128_t arg2);
342static inline bitblock128_t simd_sub_1(bitblock128_t arg1, bitblock128_t arg2);
343static inline bitblock128_t simd_sub_2(bitblock128_t arg1, bitblock128_t arg2);
344static inline bitblock128_t simd_sub_4(bitblock128_t arg1, bitblock128_t arg2);
345static inline bitblock128_t simd_sub_8(bitblock128_t arg1, bitblock128_t arg2);
346static inline bitblock128_t simd_sub_64(bitblock128_t arg1, bitblock128_t arg2);
347static inline bitblock128_t simd_sub_128(bitblock128_t arg1, bitblock128_t arg2);
348static inline bitblock128_t simd_sub_16(bitblock128_t arg1, bitblock128_t arg2);
349static inline bitblock128_t simd_add_hl_32(bitblock128_t arg1);
350static inline bitblock128_t simd_add_hl_2(bitblock128_t arg1);
351static inline bitblock128_t simd_add_hl_4(bitblock128_t arg1);
352static inline bitblock128_t simd_add_hl_8(bitblock128_t arg1);
353static inline bitblock128_t simd_add_hl_64(bitblock128_t arg1);
354static inline bitblock128_t simd_add_hl_128(bitblock128_t arg1);
355static inline bitblock128_t simd_add_hl_16(bitblock128_t arg1);
356static inline bitblock128_t simd_srl_64(bitblock128_t arg1, bitblock128_t shift_mask);
357static inline bitblock128_t simd_srl_128(bitblock128_t arg1, bitblock128_t shift_mask);
/* Shift the whole register left by sh fields of the given width.
   Widths of 16 bits and above are reduced step by step to a byte shift
   (mvmd_slli_8) by doubling the count; the 2- and 4-bit widths are reduced
   to a bit shift (simd_slli_128). sh is parenthesized so that expression
   arguments are scaled as a whole. */

//The total number of operations is 1.0
#define mvmd_slli_32(sh, arg1) \
        mvmd_slli_16(((sh)*2), arg1)

//The total number of operations is 2.33333333333
#define mvmd_slli_2(sh, arg1) \
        simd_slli_128(((sh)*2), arg1)

//The total number of operations is 2.33333333333
#define mvmd_slli_4(sh, arg1) \
        mvmd_slli_2(((sh)*2), arg1)

//The total number of operations is 1.0
#define mvmd_slli_8(sh, arg1) \
        _mm_slli_si128(arg1, (int32_t)(sh))

//The total number of operations is 1.0
#define mvmd_slli_64(sh, arg1) \
        mvmd_slli_32(((sh)*2), arg1)

//The total number of operations is 1.0
#define mvmd_slli_128(sh, arg1) \
        mvmd_slli_64(((sh)*2), arg1)

//The total number of operations is 1.0
#define mvmd_slli_16(sh, arg1) \
        mvmd_slli_8(((sh)*2), arg1)
385
386static inline bitblock128_t simd_lomask_32();
387static inline bitblock128_t simd_lomask_2();
388static inline bitblock128_t simd_lomask_4();
389static inline bitblock128_t simd_lomask_8();
390static inline bitblock128_t simd_lomask_64();
391static inline bitblock128_t simd_lomask_128();
392static inline bitblock128_t simd_lomask_16();
393static inline uint64_t hsimd_signmask_32(bitblock128_t arg1);
394static inline uint64_t hsimd_signmask_4(bitblock128_t arg1);
395static inline uint64_t hsimd_signmask_8(bitblock128_t arg1);
396static inline uint64_t hsimd_signmask_64(bitblock128_t arg1);
397static inline uint64_t hsimd_signmask_128(bitblock128_t arg1);
398static inline uint64_t hsimd_signmask_16(bitblock128_t arg1);
399static inline bitblock128_t esimd_zeroextendh_32(bitblock128_t arg1);
400static inline bitblock128_t esimd_zeroextendh_1(bitblock128_t arg1);
401static inline bitblock128_t esimd_zeroextendh_2(bitblock128_t arg1);
402static inline bitblock128_t esimd_zeroextendh_4(bitblock128_t arg1);
403static inline bitblock128_t esimd_zeroextendh_8(bitblock128_t arg1);
404static inline bitblock128_t esimd_zeroextendh_64(bitblock128_t arg1);
405static inline bitblock128_t esimd_zeroextendh_16(bitblock128_t arg1);
406static inline bitblock128_t esimd_zeroextendl_32(bitblock128_t arg1);
407static inline bitblock128_t esimd_zeroextendl_1(bitblock128_t arg1);
408static inline bitblock128_t esimd_zeroextendl_2(bitblock128_t arg1);
409static inline bitblock128_t esimd_zeroextendl_4(bitblock128_t arg1);
410static inline bitblock128_t esimd_zeroextendl_8(bitblock128_t arg1);
411static inline bitblock128_t esimd_zeroextendl_64(bitblock128_t arg1);
412static inline bitblock128_t esimd_zeroextendl_16(bitblock128_t arg1);
413static inline bitblock128_t mvmd_fill4_32(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
414static inline bitblock128_t mvmd_fill4_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
415static inline bitblock128_t mvmd_fill4_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
416static inline bitblock128_t mvmd_fill4_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
417static inline bitblock128_t mvmd_fill4_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
418static inline bitblock128_t mvmd_fill4_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
419static inline bitblock128_t simd_umin_32(bitblock128_t arg1, bitblock128_t arg2);
420static inline bitblock128_t simd_umin_1(bitblock128_t arg1, bitblock128_t arg2);
421static inline bitblock128_t simd_umin_2(bitblock128_t arg1, bitblock128_t arg2);
422static inline bitblock128_t simd_umin_4(bitblock128_t arg1, bitblock128_t arg2);
423static inline bitblock128_t simd_umin_8(bitblock128_t arg1, bitblock128_t arg2);
424static inline bitblock128_t simd_umin_64(bitblock128_t arg1, bitblock128_t arg2);
425static inline bitblock128_t simd_umin_128(bitblock128_t arg1, bitblock128_t arg2);
426static inline bitblock128_t simd_umin_16(bitblock128_t arg1, bitblock128_t arg2);
/* Shift the whole register right by sh fields of the given width.
   Widths of 16 bits and above are reduced step by step to a byte shift
   (mvmd_srli_8) by doubling the count; the 2- and 4-bit widths are reduced
   to a bit shift (simd_srli_128). sh is parenthesized so that expression
   arguments are scaled as a whole. */

//The total number of operations is 1.0
#define mvmd_srli_32(sh, arg1) \
        mvmd_srli_16(((sh)*2), arg1)

//The total number of operations is 2.33333333333
#define mvmd_srli_2(sh, arg1) \
        simd_srli_128(((sh)*2), arg1)

//The total number of operations is 2.33333333333
#define mvmd_srli_4(sh, arg1) \
        simd_srli_128(((sh)*4), arg1)

//The total number of operations is 1.0
#define mvmd_srli_8(sh, arg1) \
        _mm_srli_si128(arg1, (int32_t)(sh))

//The total number of operations is 1.0
#define mvmd_srli_64(sh, arg1) \
        mvmd_srli_32(((sh)*2), arg1)

//The total number of operations is 1.0
#define mvmd_srli_128(sh, arg1) \
        mvmd_srli_64(((sh)*2), arg1)

//The total number of operations is 1.0
#define mvmd_srli_16(sh, arg1) \
        mvmd_srli_8(((sh)*2), arg1)
454
/* Build a register in which every field of the given width holds val.
   val is parenthesized everywhere so that expression arguments are
   evaluated as a whole. */

//The total number of operations is 0
#define simd_constant_32(val) \
        _mm_set1_epi32((int32_t)(val))

//The total number of operations is 0
/* val 1 becomes -1, i.e. all bits set; val 0 stays 0. */
#define simd_constant_1(val) \
        simd_constant_32((-1*(val)))

//The total number of operations is 0
/* Duplicate the 2-bit value into both halves of a 4-bit value. A negative
   val has its bits above the field cleared by the XOR with -4 before being
   ORed in. Multiplying by 4 replaces the former left shift by 2, which is
   undefined for negative operands; the value is the same. */
#define simd_constant_2(val) \
        (((val) < 0) ? simd_constant_4((((val)*4)|((val)^(-4)))) : simd_constant_4((((val)*4)|(val))))

//The total number of operations is 0
/* Same construction as simd_constant_2, one level up: 4-bit value
   duplicated into an 8-bit value (multiply by 16 instead of shifting by 4). */
#define simd_constant_4(val) \
        (((val) < 0) ? simd_constant_8((((val)*16)|((val)^(-16)))) : simd_constant_8((((val)*16)|(val))))

//The total number of operations is 0
#define simd_constant_8(val) \
        _mm_set1_epi8((int32_t)(val))

//The total number of operations is 0
/* val is widened to uint64_t before the shift by 32: shifting a 32-bit int
   (for example the literal 1) by 32 is undefined behaviour. */
#define simd_constant_64(val) \
        _mm_set_epi32((int32_t)((((uint64_t)(val))>>32)), (int32_t)(val), (int32_t)((((uint64_t)(val))>>32)), (int32_t)(val))

//The total number of operations is 0
/* Only the low 64 bits can be supplied; the upper 64 bits are always zero.
   val is widened to uint64_t before the shift by 32 for the same reason as
   in simd_constant_64. */
#define simd_constant_128(val) \
        _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((((uint64_t)(val))>>32)), (int32_t)(val))

//The total number of operations is 0
#define simd_constant_16(val) \
        _mm_set1_epi16((int32_t)(val))
486
487static inline bitblock128_t simd_min_32(bitblock128_t arg1, bitblock128_t arg2);
488static inline bitblock128_t simd_min_1(bitblock128_t arg1, bitblock128_t arg2);
489static inline bitblock128_t simd_min_2(bitblock128_t arg1, bitblock128_t arg2);
490static inline bitblock128_t simd_min_4(bitblock128_t arg1, bitblock128_t arg2);
491static inline bitblock128_t simd_min_8(bitblock128_t arg1, bitblock128_t arg2);
492static inline bitblock128_t simd_min_64(bitblock128_t arg1, bitblock128_t arg2);
493static inline bitblock128_t simd_min_128(bitblock128_t arg1, bitblock128_t arg2);
494static inline bitblock128_t simd_min_16(bitblock128_t arg1, bitblock128_t arg2);
495static inline bitblock128_t mvmd_fill2_32(uint64_t val1, uint64_t val2);
496static inline bitblock128_t mvmd_fill2_1(uint64_t val1, uint64_t val2);
497static inline bitblock128_t mvmd_fill2_2(uint64_t val1, uint64_t val2);
498static inline bitblock128_t mvmd_fill2_4(uint64_t val1, uint64_t val2);
499static inline bitblock128_t mvmd_fill2_8(uint64_t val1, uint64_t val2);
500static inline bitblock128_t mvmd_fill2_64(uint64_t val1, uint64_t val2);
501static inline bitblock128_t mvmd_fill2_16(uint64_t val1, uint64_t val2);
502static inline bool bitblock_any(bitblock128_t arg1);
503static inline uint64_t bitblock_popcount(bitblock128_t arg1);
//The total number of operations is 2.33333333333
/* Whole-block left shift by the immediate bit count sh; an alias for
   simd_slli_128. sh is passed on parenthesized because simd_slli_128 does
   arithmetic on it. */
#define bitblock_slli(sh, arg1) \
        simd_slli_128((sh), arg1)
507
508static inline bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
509static inline bitblock128_t hsimd_packl_32(bitblock128_t arg1, bitblock128_t arg2);
510static inline bitblock128_t hsimd_packl_2(bitblock128_t arg1, bitblock128_t arg2);
511static inline bitblock128_t hsimd_packl_4(bitblock128_t arg1, bitblock128_t arg2);
512static inline bitblock128_t hsimd_packl_8(bitblock128_t arg1, bitblock128_t arg2);
513static inline bitblock128_t hsimd_packl_64(bitblock128_t arg1, bitblock128_t arg2);
514static inline bitblock128_t hsimd_packl_128(bitblock128_t arg1, bitblock128_t arg2);
515static inline bitblock128_t hsimd_packl_16(bitblock128_t arg1, bitblock128_t arg2);
/* Double-register field shift left: the result is arg1 moved left by sh
   fields, ORed with arg2 moved right by (N - sh) fields, where N is the
   constant written in each macro (4 for 32-bit fields, 64 for 2-bit, ...).
   sh is parenthesized so that expression arguments are evaluated as a whole
   before the subtraction and before the scaling done by the helper macros. */

//The total number of operations is 3.0
#define mvmd_dslli_32(sh, arg1, arg2) \
        simd_or(mvmd_slli_32((sh), arg1), mvmd_srli_32(((4)-(sh)), arg2))

//The total number of operations is 5.66666666667
#define mvmd_dslli_2(sh, arg1, arg2) \
        simd_or(mvmd_slli_2((sh), arg1), mvmd_srli_2(((64)-(sh)), arg2))

//The total number of operations is 5.66666666667
#define mvmd_dslli_4(sh, arg1, arg2) \
        simd_or(mvmd_slli_4((sh), arg1), mvmd_srli_4(((32)-(sh)), arg2))

//The total number of operations is 3.0
#define mvmd_dslli_8(sh, arg1, arg2) \
        simd_or(mvmd_slli_8((sh), arg1), mvmd_srli_8(((16)-(sh)), arg2))

//The total number of operations is 3.0
#define mvmd_dslli_64(sh, arg1, arg2) \
        simd_or(mvmd_slli_64((sh), arg1), mvmd_srli_64(((2)-(sh)), arg2))

//The total number of operations is 3.0
#define mvmd_dslli_128(sh, arg1, arg2) \
        simd_or(mvmd_slli_128((sh), arg1), mvmd_srli_128(((1)-(sh)), arg2))

//The total number of operations is 3.0
#define mvmd_dslli_16(sh, arg1, arg2) \
        simd_or(mvmd_slli_16((sh), arg1), mvmd_srli_16(((8)-(sh)), arg2))
543
544static inline bitblock128_t mvmd_fill8_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
545static inline bitblock128_t mvmd_fill8_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
546static inline bitblock128_t mvmd_fill8_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
547static inline bitblock128_t mvmd_fill8_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
548static inline bitblock128_t mvmd_fill8_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
549static inline bitblock128_t hsimd_min_hl_32(bitblock128_t arg1, bitblock128_t arg2);
550static inline bitblock128_t hsimd_min_hl_2(bitblock128_t arg1, bitblock128_t arg2);
551static inline bitblock128_t hsimd_min_hl_4(bitblock128_t arg1, bitblock128_t arg2);
552static inline bitblock128_t hsimd_min_hl_8(bitblock128_t arg1, bitblock128_t arg2);
553static inline bitblock128_t hsimd_min_hl_64(bitblock128_t arg1, bitblock128_t arg2);
554static inline bitblock128_t hsimd_min_hl_128(bitblock128_t arg1, bitblock128_t arg2);
555static inline bitblock128_t hsimd_min_hl_16(bitblock128_t arg1, bitblock128_t arg2);
556static inline bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
557static inline bitblock128_t simd_umax_32(bitblock128_t arg1, bitblock128_t arg2);
558static inline bitblock128_t simd_umax_1(bitblock128_t arg1, bitblock128_t arg2);
559static inline bitblock128_t simd_umax_2(bitblock128_t arg1, bitblock128_t arg2);
560static inline bitblock128_t simd_umax_4(bitblock128_t arg1, bitblock128_t arg2);
561static inline bitblock128_t simd_umax_8(bitblock128_t arg1, bitblock128_t arg2);
562static inline bitblock128_t simd_umax_64(bitblock128_t arg1, bitblock128_t arg2);
563static inline bitblock128_t simd_umax_128(bitblock128_t arg1, bitblock128_t arg2);
564static inline bitblock128_t simd_umax_16(bitblock128_t arg1, bitblock128_t arg2);
565static inline bitblock128_t bitblock_load_aligned(const bitblock128_t* arg1);
566static inline void bitblock_store_unaligned(bitblock128_t arg1, bitblock128_t* arg2);
567static inline bitblock128_t esimd_signextendl_32(bitblock128_t arg1);
568static inline bitblock128_t esimd_signextendl_1(bitblock128_t arg1);
569static inline bitblock128_t esimd_signextendl_2(bitblock128_t arg1);
570static inline bitblock128_t esimd_signextendl_4(bitblock128_t arg1);
571static inline bitblock128_t esimd_signextendl_8(bitblock128_t arg1);
572static inline bitblock128_t esimd_signextendl_64(bitblock128_t arg1);
573static inline bitblock128_t esimd_signextendl_16(bitblock128_t arg1);
574static inline bitblock128_t hsimd_packus_32(bitblock128_t arg1, bitblock128_t arg2);
575static inline bitblock128_t hsimd_packus_2(bitblock128_t arg1, bitblock128_t arg2);
576static inline bitblock128_t hsimd_packus_4(bitblock128_t arg1, bitblock128_t arg2);
577static inline bitblock128_t hsimd_packus_8(bitblock128_t arg1, bitblock128_t arg2);
578static inline bitblock128_t hsimd_packus_64(bitblock128_t arg1, bitblock128_t arg2);
579static inline bitblock128_t hsimd_packus_128(bitblock128_t arg1, bitblock128_t arg2);
580static inline bitblock128_t hsimd_packus_16(bitblock128_t arg1, bitblock128_t arg2);
581static inline bitblock128_t simd_abs_32(bitblock128_t arg1);
582static inline bitblock128_t simd_abs_2(bitblock128_t arg1);
583static inline bitblock128_t simd_abs_4(bitblock128_t arg1);
584static inline bitblock128_t simd_abs_8(bitblock128_t arg1);
585static inline bitblock128_t simd_abs_64(bitblock128_t arg1);
586static inline bitblock128_t simd_abs_128(bitblock128_t arg1);
587static inline bitblock128_t simd_abs_16(bitblock128_t arg1);
588static inline bitblock128_t simd_xor_hl_32(bitblock128_t arg1);
589static inline bitblock128_t simd_xor_hl_2(bitblock128_t arg1);
590static inline bitblock128_t simd_xor_hl_4(bitblock128_t arg1);
591static inline bitblock128_t simd_xor_hl_8(bitblock128_t arg1);
592static inline bitblock128_t simd_xor_hl_64(bitblock128_t arg1);
593static inline bitblock128_t simd_xor_hl_128(bitblock128_t arg1);
594static inline bitblock128_t simd_xor_hl_16(bitblock128_t arg1);
// Total operation count: 1.0
// Forwards to _mm_srai_epi32 with the shift count cast to int32_t.
#define simd_srai_32(sh, arg1) \
        (_mm_srai_epi32((arg1), (int32_t)(sh)))
598
//The total number of operations is 4.0
// Yields arg1 unchanged when sh is 0; otherwise ORs
// (simd_himask_2() AND arg1) with simd_srli_2(1, arg1).
// sh and the ternary's arg1 operand are parenthesized so expression
// arguments (e.g. `a & b`) are compared/selected as a whole.
// Note: arg1 is expanded more than once.
#define simd_srai_2(sh, arg1) \
        (((sh) == 0) ? (arg1) : simd_or(simd_and(simd_himask_2(), arg1), simd_srli_2(1, arg1)))
602
603static inline bitblock128_t simd_srai_4(uint64_t sh, bitblock128_t arg1);
604static inline bitblock128_t simd_srai_8(uint64_t sh, bitblock128_t arg1);
//The total number of operations is 4.5
// ORs two terms:
//  - simd_himask_64() AND simd_srai_32(min(sh, 32), arg1);
//  - simd_srli_64(sh, arg1) when sh <= 32, otherwise
//    simd_srai_32(sh - 32, simd_srli_64(32, arg1)).
// Every use of sh is parenthesized so that expression arguments
// (e.g. `a | b`) are compared and subtracted as a whole.
// Note: sh and arg1 are expanded more than once.
#define simd_srai_64(sh, arg1) \
        simd_or(simd_and(simd_himask_64(), simd_srai_32((((sh) < (32)) ? (sh) : (32)), arg1)), (((sh) <= (32)) ? simd_srli_64((sh), arg1) : simd_srai_32(((sh)-(32)), simd_srli_64((32), arg1))))
608
//The total number of operations is 11.0833333333
// ORs two terms:
//  - simd_himask_128() AND simd_srai_64(min(sh, 64), arg1);
//  - simd_srli_128(sh, arg1) when sh <= 64, otherwise
//    simd_srai_64(sh - 64, simd_srli_128(64, arg1)).
// Every use of sh is parenthesized so that expression arguments
// (e.g. `a | b`) are compared and subtracted as a whole.
// Note: sh and arg1 are expanded more than once.
#define simd_srai_128(sh, arg1) \
        simd_or(simd_and(simd_himask_128(), simd_srai_64((((sh) < (64)) ? (sh) : (64)), arg1)), (((sh) <= (64)) ? simd_srli_128((sh), arg1) : simd_srai_64(((sh)-(64)), simd_srli_128((64), arg1))))
612
// Total operation count: 1.0
// Forwards to _mm_srai_epi16 with the shift count cast to int32_t.
#define simd_srai_16(sh, arg1) \
        (_mm_srai_epi16((arg1), (int32_t)(sh)))
616
617static inline bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
618static inline bitblock128_t mvmd_fill16_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
619static inline bitblock128_t mvmd_fill16_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
620static inline bitblock128_t mvmd_fill16_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
621static inline bitblock128_t mvmd_fill16_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
622static inline bitblock128_t simd_lt_32(bitblock128_t arg1, bitblock128_t arg2);
623static inline bitblock128_t simd_lt_1(bitblock128_t arg1, bitblock128_t arg2);
624static inline bitblock128_t simd_lt_2(bitblock128_t arg1, bitblock128_t arg2);
625static inline bitblock128_t simd_lt_4(bitblock128_t arg1, bitblock128_t arg2);
626static inline bitblock128_t simd_lt_8(bitblock128_t arg1, bitblock128_t arg2);
627static inline bitblock128_t simd_lt_64(bitblock128_t arg1, bitblock128_t arg2);
628static inline bitblock128_t simd_lt_128(bitblock128_t arg1, bitblock128_t arg2);
629static inline bitblock128_t simd_lt_16(bitblock128_t arg1, bitblock128_t arg2);
630static inline bitblock128_t simd_add_32(bitblock128_t arg1, bitblock128_t arg2);
631static inline bitblock128_t simd_add_1(bitblock128_t arg1, bitblock128_t arg2);
632static inline bitblock128_t simd_add_2(bitblock128_t arg1, bitblock128_t arg2);
633static inline bitblock128_t simd_add_4(bitblock128_t arg1, bitblock128_t arg2);
634static inline bitblock128_t simd_add_8(bitblock128_t arg1, bitblock128_t arg2);
635static inline bitblock128_t simd_add_64(bitblock128_t arg1, bitblock128_t arg2);
636static inline bitblock128_t simd_add_128(bitblock128_t arg1, bitblock128_t arg2);
637static inline bitblock128_t simd_add_16(bitblock128_t arg1, bitblock128_t arg2);
638static inline bitblock128_t simd_ugt_32(bitblock128_t arg1, bitblock128_t arg2);
639static inline bitblock128_t simd_ugt_1(bitblock128_t arg1, bitblock128_t arg2);
640static inline bitblock128_t simd_ugt_2(bitblock128_t arg1, bitblock128_t arg2);
641static inline bitblock128_t simd_ugt_4(bitblock128_t arg1, bitblock128_t arg2);
642static inline bitblock128_t simd_ugt_8(bitblock128_t arg1, bitblock128_t arg2);
643static inline bitblock128_t simd_ugt_64(bitblock128_t arg1, bitblock128_t arg2);
644static inline bitblock128_t simd_ugt_128(bitblock128_t arg1, bitblock128_t arg2);
645static inline bitblock128_t simd_ugt_16(bitblock128_t arg1, bitblock128_t arg2);
646
647//Implementation Starts here
648//The total number of operations is 1.0
649static inline bitblock128_t esimd_mergel_32(bitblock128_t arg1, bitblock128_t arg2)
650{
651        return _mm_unpacklo_epi32(arg2, arg1);
652}
653//The total number of operations is 31.0
654static inline bitblock128_t esimd_mergel_1(bitblock128_t arg1, bitblock128_t arg2)
655{
656        return esimd_mergel_2(simd_ifh_1(simd_himask_2(), arg1, simd_srli_2(1, arg2)), simd_ifh_1(simd_himask_2(), simd_slli_2(1, arg1), arg2));
657}
658//The total number of operations is 21.0
659static inline bitblock128_t esimd_mergel_2(bitblock128_t arg1, bitblock128_t arg2)
660{
661        return esimd_mergel_4(simd_ifh_1(simd_himask_4(), arg1, simd_srli_4(2, arg2)), simd_ifh_1(simd_himask_4(), simd_slli_4(2, arg1), arg2));
662}
663//The total number of operations is 11.0
664static inline bitblock128_t esimd_mergel_4(bitblock128_t arg1, bitblock128_t arg2)
665{
666        return esimd_mergel_8(simd_ifh_1(simd_himask_8(), arg1, simd_srli_8(4, arg2)), simd_ifh_1(simd_himask_8(), simd_slli_8(4, arg1), arg2));
667}
668//The total number of operations is 1.0
669static inline bitblock128_t esimd_mergel_8(bitblock128_t arg1, bitblock128_t arg2)
670{
671        return _mm_unpacklo_epi8(arg2, arg1);
672}
673//The total number of operations is 1.0
674static inline bitblock128_t esimd_mergel_64(bitblock128_t arg1, bitblock128_t arg2)
675{
676        return _mm_unpacklo_epi64(arg2, arg1);
677}
678//The total number of operations is 1.0
679static inline bitblock128_t esimd_mergel_16(bitblock128_t arg1, bitblock128_t arg2)
680{
681        return _mm_unpacklo_epi16(arg2, arg1);
682}
683//The total number of operations is 3.33333333333
684static inline bitblock128_t esimd_signextendh_32(bitblock128_t arg1)
685{
686        return esimd_signextendl_32(simd_srli_128((64), arg1));
687}
688//The total number of operations is 31.0
689static inline bitblock128_t esimd_signextendh_1(bitblock128_t arg1)
690{
691        return esimd_mergeh_2(simd_srai_2(1, arg1), simd_srai_2(1, simd_slli_2(1, arg1)));
692}
693//The total number of operations is 33.0
694static inline bitblock128_t esimd_signextendh_2(bitblock128_t arg1)
695{
696        return esimd_mergeh_4(simd_srai_4(2, arg1), simd_srai_4(2, simd_slli_4(2, arg1)));
697}
698//The total number of operations is 13.0
699static inline bitblock128_t esimd_signextendh_4(bitblock128_t arg1)
700{
701        return esimd_mergeh_8(simd_srai_8(4, arg1), simd_srai_8(4, simd_slli_8(4, arg1)));
702}
703//The total number of operations is 3.33333333333
704static inline bitblock128_t esimd_signextendh_8(bitblock128_t arg1)
705{
706        return esimd_signextendl_8(simd_srli_128((64), arg1));
707}
708//The total number of operations is 11.0833333333
709static inline bitblock128_t esimd_signextendh_64(bitblock128_t arg1)
710{
711        return simd_srai_128(64, arg1);
712}
713//The total number of operations is 3.33333333333
714static inline bitblock128_t esimd_signextendh_16(bitblock128_t arg1)
715{
716        return esimd_signextendl_16(simd_srli_128((64), arg1));
717}
718//The total number of operations is 1.0
719static inline bitblock128_t simd_max_32(bitblock128_t arg1, bitblock128_t arg2)
720{
721        return _mm_max_epi32(arg1, arg2);
722}
723//The total number of operations is 1.0
724static inline bitblock128_t simd_max_1(bitblock128_t arg1, bitblock128_t arg2)
725{
726        return simd_and(arg1, arg2);
727}
728//The total number of operations is 15.6666666667
729static inline bitblock128_t simd_max_2(bitblock128_t arg1, bitblock128_t arg2)
730{
731        return simd_ifh_1(simd_himask_2(), simd_and(arg1, arg2), simd_or(simd_and(arg2, simd_srli_128(1, simd_or(arg1, simd_not(arg2)))), simd_and(arg1, simd_srli_128(1, simd_or(simd_not(arg1), arg2)))));
732}
733//The total number of operations is 9.0
734static inline bitblock128_t simd_max_4(bitblock128_t arg1, bitblock128_t arg2)
735{
736        bitblock128_t high_bit = simd_constant_4((8));
737        return simd_xor(simd_umax_4(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
738}
739//The total number of operations is 1.0
740static inline bitblock128_t simd_max_8(bitblock128_t arg1, bitblock128_t arg2)
741{
742        return _mm_max_epi8(arg1, arg2);
743}
744//The total number of operations is 4.0
745static inline bitblock128_t simd_max_64(bitblock128_t arg1, bitblock128_t arg2)
746{
747        return simd_ifh_1(simd_gt_64(arg1, arg2), arg1, arg2);
748}
749//The total number of operations is 26.6666666667
750static inline bitblock128_t simd_max_128(bitblock128_t arg1, bitblock128_t arg2)
751{
752        bitblock128_t hiAns = simd_max_64(arg1, arg2);
753        bitblock128_t loAns = simd_umax_64(arg1, arg2);
754        bitblock128_t eqMask1 = simd_srli_128((64), simd_eq_64(hiAns, arg1));
755        bitblock128_t eqMask2 = simd_srli_128((64), simd_eq_64(hiAns, arg2));
756        return simd_ifh_1(simd_himask_128(), hiAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, loAns, arg1), arg2));
757}
758//The total number of operations is 1.0
759static inline bitblock128_t simd_max_16(bitblock128_t arg1, bitblock128_t arg2)
760{
761        return _mm_max_epi16(arg1, arg2);
762}
763//The total number of operations is 1.0
764static inline bitblock128_t esimd_mergeh_32(bitblock128_t arg1, bitblock128_t arg2)
765{
766        return _mm_unpackhi_epi32(arg2, arg1);
767}
768//The total number of operations is 31.0
769static inline bitblock128_t esimd_mergeh_1(bitblock128_t arg1, bitblock128_t arg2)
770{
771        return esimd_mergeh_2(simd_ifh_1(simd_himask_2(), arg1, simd_srli_2(1, arg2)), simd_ifh_1(simd_himask_2(), simd_slli_2(1, arg1), arg2));
772}
773//The total number of operations is 21.0
774static inline bitblock128_t esimd_mergeh_2(bitblock128_t arg1, bitblock128_t arg2)
775{
776        return esimd_mergeh_4(simd_ifh_1(simd_himask_4(), arg1, simd_srli_4(2, arg2)), simd_ifh_1(simd_himask_4(), simd_slli_4(2, arg1), arg2));
777}
778//The total number of operations is 11.0
779static inline bitblock128_t esimd_mergeh_4(bitblock128_t arg1, bitblock128_t arg2)
780{
781        return esimd_mergeh_8(simd_ifh_1(simd_himask_8(), arg1, simd_srli_8(4, arg2)), simd_ifh_1(simd_himask_8(), simd_slli_8(4, arg1), arg2));
782}
783//The total number of operations is 1.0
784static inline bitblock128_t esimd_mergeh_8(bitblock128_t arg1, bitblock128_t arg2)
785{
786        return _mm_unpackhi_epi8(arg2, arg1);
787}
788//The total number of operations is 1.0
789static inline bitblock128_t esimd_mergeh_64(bitblock128_t arg1, bitblock128_t arg2)
790{
791        return _mm_unpackhi_epi64(arg2, arg1);
792}
793//The total number of operations is 1.0
794static inline bitblock128_t esimd_mergeh_16(bitblock128_t arg1, bitblock128_t arg2)
795{
796        return _mm_unpackhi_epi16(arg2, arg1);
797}
798//The total number of operations is 1.0
799static inline bitblock128_t simd_mult_32(bitblock128_t arg1, bitblock128_t arg2)
800{
801        return _mm_mullo_epi32(arg1, arg2);
802}
803//The total number of operations is 1.0
804static inline bitblock128_t simd_mult_1(bitblock128_t arg1, bitblock128_t arg2)
805{
806        return simd_and(arg1, arg2);
807}
808//The total number of operations is 19.6666666667
809static inline bitblock128_t simd_mult_2(bitblock128_t arg1, bitblock128_t arg2)
810{
811        bitblock128_t tmp1 = simd_slli_128(1, arg1);
812        bitblock128_t tmp2 = simd_slli_128(1, arg2);
813        return simd_ifh_1(simd_himask_2(), simd_or(simd_and(tmp1, simd_and(arg2, simd_or(simd_not(arg1), simd_not(tmp2)))), simd_and(arg1, simd_and(tmp2, simd_or(simd_not(tmp1), simd_not(arg2))))), simd_and(arg1, arg2));
814}
815//The total number of operations is 31.0
816static inline bitblock128_t simd_mult_4(bitblock128_t arg1, bitblock128_t arg2)
817{
818        bitblock128_t loMask = simd_lomask_8();
819        bitblock128_t tmpAns1 = simd_mult_8(simd_and(loMask, arg1), simd_and(loMask, arg2));
820        bitblock128_t tmpAns2 = simd_mult_8(simd_srli_8(4, arg1), simd_srli_8(4, arg2));
821        return simd_ifh_1(loMask, tmpAns1, simd_slli_8(4, tmpAns2));
822}
823//The total number of operations is 10.0
824static inline bitblock128_t simd_mult_8(bitblock128_t arg1, bitblock128_t arg2)
825{
826        bitblock128_t loMask = simd_lomask_16();
827        bitblock128_t tmpAns1 = simd_mult_16(simd_and(loMask, arg1), simd_and(loMask, arg2));
828        bitblock128_t tmpAns2 = simd_mult_16(simd_srli_16(8, arg1), simd_srli_16(8, arg2));
829        return simd_ifh_1(loMask, tmpAns1, simd_slli_16(8, tmpAns2));
830}
831//The total number of operations is 11.0
832static inline bitblock128_t simd_mult_64(bitblock128_t arg1, bitblock128_t arg2)
833{
834        bitblock128_t loMask = simd_lomask_64();
835        bitblock128_t arg1_low = simd_and(arg1, loMask);
836        bitblock128_t arg1_high = simd_srli_64((32), arg1);
837        bitblock128_t arg2_low = simd_and(arg2, loMask);
838        bitblock128_t arg2_high = simd_srli_64((32), arg2);
839        bitblock128_t tmpAns1 = simd_umult_32(arg1_low, arg2_low);
840        bitblock128_t tmpAns2 = simd_slli_64((32), simd_umult_32(arg1_low, arg2_high));
841        bitblock128_t tmpAns3 = simd_slli_64((32), simd_umult_32(arg1_high, arg2_low));
842        return simd_add_64(tmpAns1, simd_add_64(tmpAns2, tmpAns3));
843}
844//The total number of operations is 165.0
845static inline bitblock128_t simd_mult_128(bitblock128_t arg1, bitblock128_t arg2)
846{
847        bitblock128_t loMask = simd_lomask_128();
848        bitblock128_t arg1_low = simd_and(arg1, loMask);
849        bitblock128_t arg1_high = simd_srli_128((64), arg1);
850        bitblock128_t arg2_low = simd_and(arg2, loMask);
851        bitblock128_t arg2_high = simd_srli_128((64), arg2);
852        bitblock128_t tmpAns1 = simd_umult_64(arg1_low, arg2_low);
853        bitblock128_t tmpAns2 = simd_slli_128((64), simd_umult_64(arg1_low, arg2_high));
854        bitblock128_t tmpAns3 = simd_slli_128((64), simd_umult_64(arg1_high, arg2_low));
855        return simd_add_128(tmpAns1, simd_add_128(tmpAns2, tmpAns3));
856}
857//The total number of operations is 1.0
858static inline bitblock128_t simd_mult_16(bitblock128_t arg1, bitblock128_t arg2)
859{
860        return _mm_mullo_epi16(arg1, arg2);
861}
862//The total number of operations is 7.0
863static inline bitblock128_t hsimd_umin_hl_32(bitblock128_t arg1, bitblock128_t arg2)
864{
865        return simd_umin_16(hsimd_packh_32(arg1, arg2), hsimd_packl_32(arg1, arg2));
866}
867//The total number of operations is 73.0
868static inline bitblock128_t hsimd_umin_hl_2(bitblock128_t arg1, bitblock128_t arg2)
869{
870        return simd_umin_1(hsimd_packh_2(arg1, arg2), hsimd_packl_2(arg1, arg2));
871}
872//The total number of operations is 66.6666666667
873static inline bitblock128_t hsimd_umin_hl_4(bitblock128_t arg1, bitblock128_t arg2)
874{
875        return simd_umin_2(hsimd_packh_4(arg1, arg2), hsimd_packl_4(arg1, arg2));
876}
877//The total number of operations is 35.3333333333
878static inline bitblock128_t hsimd_umin_hl_8(bitblock128_t arg1, bitblock128_t arg2)
879{
880        return simd_umin_4(hsimd_packh_8(arg1, arg2), hsimd_packl_8(arg1, arg2));
881}
882//The total number of operations is 7.0
883static inline bitblock128_t hsimd_umin_hl_64(bitblock128_t arg1, bitblock128_t arg2)
884{
885        return simd_umin_32(hsimd_packh_64(arg1, arg2), hsimd_packl_64(arg1, arg2));
886}
887//The total number of operations is 17.6666666667
888static inline bitblock128_t hsimd_umin_hl_128(bitblock128_t arg1, bitblock128_t arg2)
889{
890        return simd_umin_64(hsimd_packh_128(arg1, arg2), hsimd_packl_128(arg1, arg2));
891}
892//The total number of operations is 7.0
893static inline bitblock128_t hsimd_umin_hl_16(bitblock128_t arg1, bitblock128_t arg2)
894{
895        return simd_umin_8(hsimd_packh_16(arg1, arg2), hsimd_packl_16(arg1, arg2));
896}
897//The total number of operations is 2.0
898static inline bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2)
899{
900        return simd_not(simd_or(arg1, arg2));
901}
902//The total number of operations is 1.0
903static inline bitblock128_t simd_gt_32(bitblock128_t arg1, bitblock128_t arg2)
904{
905        return _mm_cmpgt_epi32(arg1, arg2);
906}
907//The total number of operations is 1.0
908static inline bitblock128_t simd_gt_1(bitblock128_t arg1, bitblock128_t arg2)
909{
910        return simd_andc(arg2, arg1);
911}
912//The total number of operations is 14.6666666667
913static inline bitblock128_t simd_gt_2(bitblock128_t arg1, bitblock128_t arg2)
914{
915        bitblock128_t tmp = simd_not(arg1);
916        bitblock128_t tmpAns = simd_or(simd_and(tmp, arg2), simd_and(simd_slli_128(1, simd_and(arg1, simd_not(arg2))), simd_or(tmp, arg2)));
917        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(1, tmpAns));
918}
919//The total number of operations is 10.0
920static inline bitblock128_t simd_gt_4(bitblock128_t arg1, bitblock128_t arg2)
921{
922        return simd_ifh_1(simd_himask_8(), simd_gt_8(simd_and(simd_himask_8(), arg1), arg2), simd_gt_8(simd_slli_8(4, arg1), simd_slli_8(4, arg2)));
923}
924//The total number of operations is 1.0
925static inline bitblock128_t simd_gt_8(bitblock128_t arg1, bitblock128_t arg2)
926{
927        return _mm_cmpgt_epi8(arg1, arg2);
928}
929//The total number of operations is 1.0
930static inline bitblock128_t simd_gt_64(bitblock128_t arg1, bitblock128_t arg2)
931{
932        return _mm_cmpgt_epi64(arg1, arg2);
933}
934//The total number of operations is 23.75
935static inline bitblock128_t simd_gt_128(bitblock128_t arg1, bitblock128_t arg2)
936{
937        bitblock128_t hiAns = simd_gt_64(arg1, arg2);
938        bitblock128_t loAns = simd_ugt_64(arg1, arg2);
939        bitblock128_t mask = simd_and(loAns, simd_srli_128((64), simd_eq_64(arg1, arg2)));
940        mask = simd_or(mask, simd_slli_128((64), mask));
941        return simd_or(simd_srai_128((64), hiAns), mask);
942}
943//The total number of operations is 1.0
944static inline bitblock128_t simd_gt_16(bitblock128_t arg1, bitblock128_t arg2)
945{
946        return _mm_cmpgt_epi16(arg1, arg2);
947}
948//The total number of operations is 1.0
949static inline bitblock128_t simd_not(bitblock128_t arg1)
950{
951        return simd_xor(arg1, simd_constant_32(-1));
952}
953//The total number of operations is 13.0
954static inline bitblock128_t bitblock_sll(bitblock128_t arg1, bitblock128_t arg2)
955{
956        return simd_sll_128(arg1, arg2);
957}
958//The total number of operations is 1.0
959static inline bitblock128_t simd_umult_32(bitblock128_t arg1, bitblock128_t arg2)
960{
961        return _mm_mul_epu32(arg1, arg2);
962}
963//The total number of operations is 289.0
964static inline bitblock128_t simd_umult_1(bitblock128_t arg1, bitblock128_t arg2)
965{
966        bitblock128_t loMask = simd_lomask_2();
967        bitblock128_t tmpAns1 = simd_umult_2(simd_and(loMask, arg1), simd_and(loMask, arg2));
968        bitblock128_t tmpAns2 = simd_umult_2(simd_and(loMask, simd_srli_4((2), arg1)), simd_and(loMask, simd_srli_4((2), arg2)));
969        return simd_or(tmpAns1, simd_slli_4((2), tmpAns2));
970}
971//The total number of operations is 139.0
972static inline bitblock128_t simd_umult_2(bitblock128_t arg1, bitblock128_t arg2)
973{
974        bitblock128_t loMask = simd_lomask_4();
975        bitblock128_t tmpAns1 = simd_umult_4(simd_and(loMask, arg1), simd_and(loMask, arg2));
976        bitblock128_t tmpAns2 = simd_umult_4(simd_and(loMask, simd_srli_8((4), arg1)), simd_and(loMask, simd_srli_8((4), arg2)));
977        return simd_or(tmpAns1, simd_slli_8((4), tmpAns2));
978}
979//The total number of operations is 64.0
980static inline bitblock128_t simd_umult_4(bitblock128_t arg1, bitblock128_t arg2)
981{
982        bitblock128_t loMask = simd_lomask_8();
983        bitblock128_t tmpAns1 = simd_umult_8(simd_and(loMask, arg1), simd_and(loMask, arg2));
984        bitblock128_t tmpAns2 = simd_umult_8(simd_and(loMask, simd_srli_16((8), arg1)), simd_and(loMask, simd_srli_16((8), arg2)));
985        return simd_or(tmpAns1, simd_slli_16((8), tmpAns2));
986}
987//The total number of operations is 28.0
988static inline bitblock128_t simd_umult_8(bitblock128_t arg1, bitblock128_t arg2)
989{
990        bitblock128_t loMask = simd_lomask_16();
991        bitblock128_t tmpAns1 = simd_umult_16(simd_and(loMask, arg1), simd_and(loMask, arg2));
992        bitblock128_t tmpAns2 = simd_umult_16(simd_and(loMask, simd_srli_32((16), arg1)), simd_and(loMask, simd_srli_32((16), arg2)));
993        return simd_or(tmpAns1, simd_slli_32((16), tmpAns2));
994}
995//The total number of operations is 45.0
996static inline bitblock128_t simd_umult_64(bitblock128_t arg1, bitblock128_t arg2)
997{
998        bitblock128_t loMask1 = simd_lomask_128();
999        bitblock128_t arg11 = simd_and(arg1, loMask1);
1000        bitblock128_t arg22 = simd_and(arg2, loMask1);
1001        bitblock128_t loMask2 = simd_lomask_64();
1002        bitblock128_t arg1_low = simd_and(arg11, loMask2);
1003        bitblock128_t arg1_high = simd_srli_64((32), arg11);
1004        bitblock128_t arg2_low = simd_and(arg22, loMask2);
1005        bitblock128_t arg2_high = simd_srli_64((32), arg22);
1006        bitblock128_t tmpAns1 = simd_umult_32(arg1_low, arg2_low);
1007        bitblock128_t tmpAns2 = simd_slli_128((32), simd_umult_32(arg1_low, arg2_high));
1008        bitblock128_t tmpAns3 = simd_slli_128((32), simd_umult_32(arg1_high, arg2_low));
1009        bitblock128_t tmpAns4 = simd_slli_128(64, simd_umult_32(arg1_high, arg2_high));
1010        return simd_add_128(tmpAns1, simd_add_128(tmpAns2, simd_add_128(tmpAns3, tmpAns4)));
1011}
1012//The total number of operations is 10.0
1013static inline bitblock128_t simd_umult_16(bitblock128_t arg1, bitblock128_t arg2)
1014{
1015        bitblock128_t loMask = simd_lomask_32();
1016        bitblock128_t tmpAns1 = simd_umult_32(simd_and(loMask, arg1), simd_and(loMask, arg2));
1017        bitblock128_t tmpAns2 = simd_umult_32(simd_and(loMask, simd_srli_64((32), arg1)), simd_and(loMask, simd_srli_64((32), arg2)));
1018        return simd_or(tmpAns1, simd_slli_64((32), tmpAns2));
1019}
1020//The total number of operations is 1.0
1021static inline bitblock128_t hsimd_add_hl_32(bitblock128_t arg1, bitblock128_t arg2)
1022{
1023        return _mm_hadd_epi16(arg2, arg1);
1024}
1025//The total number of operations is 73.0
1026static inline bitblock128_t hsimd_add_hl_2(bitblock128_t arg1, bitblock128_t arg2)
1027{
1028        return simd_add_1(hsimd_packh_2(arg1, arg2), hsimd_packl_2(arg1, arg2));
1029}
1030//The total number of operations is 59.0
1031static inline bitblock128_t hsimd_add_hl_4(bitblock128_t arg1, bitblock128_t arg2)
1032{
1033        return simd_add_2(hsimd_packh_4(arg1, arg2), hsimd_packl_4(arg1, arg2));
1034}
1035//The total number of operations is 35.3333333333
1036static inline bitblock128_t hsimd_add_hl_8(bitblock128_t arg1, bitblock128_t arg2)
1037{
1038        return simd_add_4(hsimd_packh_8(arg1, arg2), hsimd_packl_8(arg1, arg2));
1039}
1040//The total number of operations is 1.0
1041static inline bitblock128_t hsimd_add_hl_64(bitblock128_t arg1, bitblock128_t arg2)
1042{
1043        return _mm_hadd_epi32(arg2, arg1);
1044}
1045//The total number of operations is 11.6666666667
1046static inline bitblock128_t hsimd_add_hl_128(bitblock128_t arg1, bitblock128_t arg2)
1047{
1048        return simd_add_64(hsimd_packh_128(arg1, arg2), hsimd_packl_128(arg1, arg2));
1049}
1050//The total number of operations is 7.0
1051static inline bitblock128_t hsimd_add_hl_16(bitblock128_t arg1, bitblock128_t arg2)
1052{
1053        return simd_add_8(hsimd_packh_16(arg1, arg2), hsimd_packl_16(arg1, arg2));
1054}
1055//The total number of operations is 7.0
1056static inline bitblock128_t simd_ult_32(bitblock128_t arg1, bitblock128_t arg2)
1057{
1058        bitblock128_t high_bit = simd_constant_32((2147483648ULL));
1059        return simd_lt_32(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1060}
1061//The total number of operations is 1.0
1062static inline bitblock128_t simd_ult_1(bitblock128_t arg1, bitblock128_t arg2)
1063{
1064        return simd_andc(arg2, arg1);
1065}
1066//The total number of operations is 13.6666666667
1067static inline bitblock128_t simd_ult_2(bitblock128_t arg1, bitblock128_t arg2)
1068{
1069        bitblock128_t tmp = simd_not(arg1);
1070        bitblock128_t tmpAns = simd_or(simd_and(tmp, arg2), simd_and(simd_slli_128(1, simd_and(tmp, arg2)), simd_or(tmp, arg2)));
1071        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(1, tmpAns));
1072}
1073//The total number of operations is 20.0
1074static inline bitblock128_t simd_ult_4(bitblock128_t arg1, bitblock128_t arg2)
1075{
1076        return simd_ifh_1(simd_himask_8(), simd_ult_8(arg1, simd_and(simd_himask_8(), arg2)), simd_ult_8(simd_andc(arg1, simd_himask_8()), simd_andc(arg2, simd_himask_8())));
1077}
1078//The total number of operations is 7.0
1079static inline bitblock128_t simd_ult_8(bitblock128_t arg1, bitblock128_t arg2)
1080{
1081        bitblock128_t high_bit = simd_constant_8((128));
1082        return simd_lt_8(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1083}
1084//The total number of operations is 7.0
1085static inline bitblock128_t simd_ult_64(bitblock128_t arg1, bitblock128_t arg2)
1086{
1087        bitblock128_t high_bit = simd_constant_64((9223372036854775808ULL));
1088        return simd_lt_64(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1089}
1090//The total number of operations is 26.75
1091static inline bitblock128_t simd_ult_128(bitblock128_t arg1, bitblock128_t arg2)
1092{
1093        bitblock128_t tmpAns = simd_ult_64(arg1, arg2);
1094        bitblock128_t mask = simd_and(tmpAns, simd_srli_128((64), simd_eq_64(arg1, arg2)));
1095        mask = simd_or(mask, simd_slli_128((64), mask));
1096        return simd_or(simd_srai_128((64), tmpAns), mask);
1097}
1098//The total number of operations is 7.0
1099static inline bitblock128_t simd_ult_16(bitblock128_t arg1, bitblock128_t arg2)
1100{
1101        bitblock128_t high_bit = simd_constant_16((32768));
1102        return simd_lt_16(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1103}
1104//The total number of operations is 1.0
1105static inline bitblock128_t bitblock_load_unaligned(const bitblock128_t* arg1)
1106{
1107        return _mm_loadu_si128((bitblock128_t*)(arg1));
1108}
1109//The total number of operations is 19.0
1110static inline bitblock128_t simd_ctz_32(bitblock128_t arg1)
1111{
1112        return simd_popcount_32(simd_andc(simd_sub_32(arg1, simd_constant_32(1)), arg1));
1113}
1114//The total number of operations is 1.0
1115static inline bitblock128_t simd_ctz_1(bitblock128_t arg1)
1116{
1117        return simd_not(arg1);
1118}
1119//The total number of operations is 10.6666666667
1120static inline bitblock128_t simd_ctz_2(bitblock128_t arg1)
1121{
1122        bitblock128_t tmp = simd_not(arg1);
1123        return simd_ifh_1(simd_himask_2(), simd_and(tmp, simd_slli_128(1, tmp)), simd_and(simd_srli_128(1, arg1), tmp));
1124}
1125//The total number of operations is 14.0
1126static inline bitblock128_t simd_ctz_4(bitblock128_t arg1)
1127{
1128        return simd_popcount_4(simd_andc(simd_sub_4(arg1, simd_constant_4(1)), arg1));
1129}
1130//The total number of operations is 13.0
1131static inline bitblock128_t simd_ctz_8(bitblock128_t arg1)
1132{
1133        return simd_popcount_8(simd_andc(simd_sub_8(arg1, simd_constant_8(1)), arg1));
1134}
1135//The total number of operations is 14.0
1136static inline bitblock128_t simd_ctz_64(bitblock128_t arg1)
1137{
1138        return simd_popcount_64(simd_andc(simd_sub_64(arg1, simd_constant_64(1)), arg1));
1139}
1140//The total number of operations is 26.6666666667
1141static inline bitblock128_t simd_ctz_128(bitblock128_t arg1)
1142{
1143        return simd_popcount_128(simd_andc(simd_sub_128(arg1, simd_constant_128(1)), arg1));
1144}
1145//The total number of operations is 16.0
1146static inline bitblock128_t simd_ctz_16(bitblock128_t arg1)
1147{
1148        return simd_popcount_16(simd_andc(simd_sub_16(arg1, simd_constant_16(1)), arg1));
1149}
1150//The total number of operations is 10.0
1151static inline bitblock128_t simd_sll_64(bitblock128_t arg1, bitblock128_t shift_mask)
1152{
1153        return simd_ifh_1(simd_himask_128(), _mm_sll_epi64(arg1, simd_and(_mm_srli_si128(shift_mask, (int32_t)(8)), _mm_cvtsi32_si128((int32_t)(63)))), _mm_sll_epi64(arg1, simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(63)))));
1154}
1155//The total number of operations is 13.0
1156static inline bitblock128_t simd_sll_128(bitblock128_t arg1, bitblock128_t shift_mask)
1157{
1158        bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(127)));
1159        return simd_or(_mm_sll_epi64(arg1, shift), simd_or(_mm_slli_si128(_mm_sll_epi64(arg1, simd_sub_32(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_slli_si128(_mm_srl_epi64(arg1, simd_sub_32(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
1160}
1161//The total number of operations is 1.0
1162static inline bitblock128_t mvmd_fill_32(uint64_t val1)
1163{
1164        return _mm_set1_epi32((int32_t)(val1));
1165}
1166//The total number of operations is 1.0
1167static inline bitblock128_t mvmd_fill_1(uint64_t val1)
1168{
1169        return mvmd_fill_32((-1*val1));
1170}
1171//The total number of operations is 1.0
1172static inline bitblock128_t mvmd_fill_2(uint64_t val1)
1173{
1174        return mvmd_fill_4(((val1<<2)|val1));
1175}
1176//The total number of operations is 1.0
1177static inline bitblock128_t mvmd_fill_4(uint64_t val1)
1178{
1179        return mvmd_fill_8(((val1<<4)|val1));
1180}
1181//The total number of operations is 1.0
1182static inline bitblock128_t mvmd_fill_8(uint64_t val1)
1183{
1184        return _mm_set1_epi8((int32_t)(val1));
1185}
1186//The total number of operations is 1.0
1187static inline bitblock128_t mvmd_fill_64(uint64_t val1)
1188{
1189        return _mm_set_epi32((int32_t)((val1>>32)), (int32_t)(val1), (int32_t)((val1>>32)), (int32_t)(val1));
1190}
1191//The total number of operations is 1.0
1192static inline bitblock128_t mvmd_fill_128(uint64_t val1)
1193{
1194        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((val1>>32)), (int32_t)(val1));
1195}
1196//The total number of operations is 1.0
1197static inline bitblock128_t mvmd_fill_16(uint64_t val1)
1198{
1199        return _mm_set1_epi16((int32_t)(val1));
1200}
1201//The total number of operations is 19.0
1202static inline bitblock128_t mvmd_shuffle_32(bitblock128_t arg1, bitblock128_t arg2)
1203{
1204        bitblock128_t tmp1 = simd_and(simd_constant_32((3)), arg2);
1205        bitblock128_t msk1 = simd_add_32(tmp1, tmp1);
1206        bitblock128_t msk2 = simd_add_32(msk1, simd_constant_32(1));
1207        bitblock128_t msk = simd_or(msk1, simd_slli_32((16), msk2));
1208        return simd_ifh_32(arg2, simd_constant_32(0), mvmd_shuffle_16(arg1, msk));
1209}
1210//The total number of operations is 1.0
1211static inline bitblock128_t mvmd_shuffle_8(bitblock128_t arg1, bitblock128_t arg2)
1212{
1213        return _mm_shuffle_epi8(arg1, arg2);
1214}
1215//The total number of operations is 28.0
1216static inline bitblock128_t mvmd_shuffle_64(bitblock128_t arg1, bitblock128_t arg2)
1217{
1218        bitblock128_t tmp1 = simd_and(simd_constant_64((1)), arg2);
1219        bitblock128_t msk1 = simd_add_64(tmp1, tmp1);
1220        bitblock128_t msk2 = simd_add_64(msk1, simd_constant_64(1));
1221        bitblock128_t msk = simd_or(msk1, simd_slli_64((32), msk2));
1222        return simd_ifh_64(arg2, simd_constant_64(0), mvmd_shuffle_32(arg1, msk));
1223}
1224//The total number of operations is 10.0
1225static inline bitblock128_t mvmd_shuffle_16(bitblock128_t arg1, bitblock128_t arg2)
1226{
1227        bitblock128_t tmp1 = simd_and(simd_constant_16((7)), arg2);
1228        bitblock128_t msk1 = simd_add_16(tmp1, tmp1);
1229        bitblock128_t msk2 = simd_add_16(msk1, simd_constant_16(1));
1230        bitblock128_t msk = simd_or(msk1, simd_slli_16((8), msk2));
1231        return simd_ifh_16(arg2, simd_constant_16(0), mvmd_shuffle_8(arg1, msk));
1232}
1233//The total number of operations is 1.0
1234static inline bitblock128_t hsimd_packss_32(bitblock128_t arg1, bitblock128_t arg2)
1235{
1236        return _mm_packs_epi32(arg2, arg1);
1237}
1238//The total number of operations is 108.666666667
1239static inline bitblock128_t hsimd_packss_2(bitblock128_t arg1, bitblock128_t arg2)
1240{
1241        bitblock128_t hiBound = simd_srli_2(1, simd_lomask_2());
1242        bitblock128_t loBound = simd_not(hiBound);
1243        return hsimd_packl_2(simd_ifh_1(simd_gt_2(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_2(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_2(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_2(arg2, loBound), arg2, loBound)));
1244}
1245//The total number of operations is 79.3333333333
1246static inline bitblock128_t hsimd_packss_4(bitblock128_t arg1, bitblock128_t arg2)
1247{
1248        bitblock128_t hiBound = simd_srli_4(1, simd_lomask_4());
1249        bitblock128_t loBound = simd_not(hiBound);
1250        return hsimd_packl_4(simd_ifh_1(simd_gt_4(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_4(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_4(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_4(arg2, loBound), arg2, loBound)));
1251}
1252//The total number of operations is 32.6666666667
1253static inline bitblock128_t hsimd_packss_8(bitblock128_t arg1, bitblock128_t arg2)
1254{
1255        bitblock128_t hiBound = simd_srli_8(1, simd_lomask_8());
1256        bitblock128_t loBound = simd_not(hiBound);
1257        return hsimd_packl_8(simd_ifh_1(simd_gt_8(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_8(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_8(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_8(arg2, loBound), arg2, loBound)));
1258}
1259//The total number of operations is 21.0
1260static inline bitblock128_t hsimd_packss_64(bitblock128_t arg1, bitblock128_t arg2)
1261{
1262        bitblock128_t hiBound = simd_srli_64(1, simd_lomask_64());
1263        bitblock128_t loBound = simd_not(hiBound);
1264        return hsimd_packl_64(simd_ifh_1(simd_gt_64(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_64(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_64(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_64(arg2, loBound), arg2, loBound)));
1265}
1266//The total number of operations is 115.666666667
1267static inline bitblock128_t hsimd_packss_128(bitblock128_t arg1, bitblock128_t arg2)
1268{
1269        bitblock128_t hiBound = simd_srli_128(1, simd_lomask_128());
1270        bitblock128_t loBound = simd_not(hiBound);
1271        return hsimd_packl_128(simd_ifh_1(simd_gt_128(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_128(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_128(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_128(arg2, loBound), arg2, loBound)));
1272}
1273//The total number of operations is 1.0
1274static inline bitblock128_t hsimd_packss_16(bitblock128_t arg1, bitblock128_t arg2)
1275{
1276        return _mm_packs_epi16(arg2, arg1);
1277}
1278//The total number of operations is 13.0
1279static inline bitblock128_t bitblock_srl(bitblock128_t arg1, bitblock128_t arg2)
1280{
1281        return simd_srl_128(arg1, arg2);
1282}
1283//The total number of operations is 1.0
1284static inline void bitblock_store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
1285{
1286        _mm_store_si128((bitblock128_t*)(arg2), arg1);
1287}
1288//The total number of operations is 1.0
1289static inline bitblock128_t simd_eq_32(bitblock128_t arg1, bitblock128_t arg2)
1290{
1291        return _mm_cmpeq_epi32(arg1, arg2);
1292}
1293//The total number of operations is 2.0
1294static inline bitblock128_t simd_eq_1(bitblock128_t arg1, bitblock128_t arg2)
1295{
1296        return simd_not(simd_xor(arg1, arg2));
1297}
1298//The total number of operations is 8.0
1299static inline bitblock128_t simd_eq_2(bitblock128_t arg1, bitblock128_t arg2)
1300{
1301        bitblock128_t tmpAns = simd_eq_1(arg1, arg2);
1302        bitblock128_t loMask = simd_and(tmpAns, simd_srli_2((1), tmpAns));
1303        bitblock128_t hiMask = simd_slli_2((1), loMask);
1304        return simd_or(loMask, hiMask);
1305}
1306//The total number of operations is 9.0
1307static inline bitblock128_t simd_eq_4(bitblock128_t arg1, bitblock128_t arg2)
1308{
1309        return simd_or(simd_and(simd_himask_8(), simd_eq_8(simd_and(simd_himask_8(), arg1), simd_and(simd_himask_8(), arg2))), simd_and(simd_lomask_8(), simd_eq_8(simd_and(simd_lomask_8(), arg1), simd_and(simd_lomask_8(), arg2))));
1310}
1311//The total number of operations is 1.0
1312static inline bitblock128_t simd_eq_8(bitblock128_t arg1, bitblock128_t arg2)
1313{
1314        return _mm_cmpeq_epi8(arg1, arg2);
1315}
1316//The total number of operations is 1.0
1317static inline bitblock128_t simd_eq_64(bitblock128_t arg1, bitblock128_t arg2)
1318{
1319        return _mm_cmpeq_epi64(arg1, arg2);
1320}
1321//The total number of operations is 7.66666666667
1322static inline bitblock128_t simd_eq_128(bitblock128_t arg1, bitblock128_t arg2)
1323{
1324        bitblock128_t tmpAns = simd_eq_64(arg1, arg2);
1325        bitblock128_t loMask = simd_and(tmpAns, simd_srli_128((64), tmpAns));
1326        bitblock128_t hiMask = simd_slli_128((64), loMask);
1327        return simd_or(loMask, hiMask);
1328}
1329//The total number of operations is 1.0
1330static inline bitblock128_t simd_eq_16(bitblock128_t arg1, bitblock128_t arg2)
1331{
1332        return _mm_cmpeq_epi16(arg1, arg2);
1333}
1334//The total number of operations is 17.0
1335static inline bitblock128_t simd_popcount_32(bitblock128_t arg1)
1336{
1337        return simd_add_hl_32(simd_popcount_16(arg1));
1338}
1339//The total number of operations is 0
1340static inline bitblock128_t simd_popcount_1(bitblock128_t arg1)
1341{
1342        return arg1;
1343}
1344//The total number of operations is 3.0
1345static inline bitblock128_t simd_popcount_2(bitblock128_t arg1)
1346{
1347        return simd_add_hl_2(simd_popcount_1(arg1));
1348}
1349//The total number of operations is 7.0
1350static inline bitblock128_t simd_popcount_4(bitblock128_t arg1)
1351{
1352        return simd_add_hl_4(simd_popcount_2(arg1));
1353}
1354//The total number of operations is 11.0
1355static inline bitblock128_t simd_popcount_8(bitblock128_t arg1)
1356{
1357        return simd_add_hl_8(simd_popcount_4(arg1));
1358}
1359//The total number of operations is 12.0
1360static inline bitblock128_t simd_popcount_64(bitblock128_t arg1)
1361{
1362        return _mm_sad_epu8(simd_popcount_8(arg1), simd_constant_8(0));
1363}
1364//The total number of operations is 16.3333333333
1365static inline bitblock128_t simd_popcount_128(bitblock128_t arg1)
1366{
1367        bitblock128_t tmpAns = simd_popcount_64(arg1);
1368        return simd_add_64(simd_and(tmpAns, simd_lomask_128()), simd_srli_128((64), tmpAns));
1369}
1370//The total number of operations is 14.0
1371static inline bitblock128_t simd_popcount_16(bitblock128_t arg1)
1372{
1373        return simd_add_hl_16(simd_popcount_8(arg1));
1374}
1375//The total number of operations is 1.0
1376static inline bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
1377{
1378        return _mm_andnot_si128(arg2, arg1);
1379}
1380//The total number of operations is 1.0
1381static inline bitblock128_t simd_neg_32(bitblock128_t arg1)
1382{
1383        return _mm_sign_epi32(arg1, simd_constant_32(-1));
1384}
1385//The total number of operations is 6.33333333333
1386static inline bitblock128_t simd_neg_2(bitblock128_t arg1)
1387{
1388        return simd_ifh_1(simd_himask_2(), simd_xor(arg1, simd_slli_128(1, arg1)), arg1);
1389}
1390//The total number of operations is 6.0
1391static inline bitblock128_t simd_neg_4(bitblock128_t arg1)
1392{
1393        return simd_sub_4(simd_constant_4(0), arg1);
1394}
1395//The total number of operations is 1.0
1396static inline bitblock128_t simd_neg_8(bitblock128_t arg1)
1397{
1398        return simd_sub_8(simd_constant_8(0), arg1);
1399}
1400//The total number of operations is 1.0
1401static inline bitblock128_t simd_neg_64(bitblock128_t arg1)
1402{
1403        return simd_sub_64(simd_constant_64(0), arg1);
1404}
1405//The total number of operations is 9.33333333333
1406static inline bitblock128_t simd_neg_128(bitblock128_t arg1)
1407{
1408        return simd_sub_128(simd_constant_128(0), arg1);
1409}
1410//The total number of operations is 1.0
1411static inline bitblock128_t simd_neg_16(bitblock128_t arg1)
1412{
1413        return simd_sub_16(simd_constant_16(0), arg1);
1414}
1415//The total number of operations is 3.0
1416static inline bitblock128_t hsimd_packh_32(bitblock128_t arg1, bitblock128_t arg2)
1417{
1418        return _mm_hsub_epi16(simd_srli_32((16), arg2), simd_srli_32((16), arg1));
1419}
1420//The total number of operations is 37.0
1421static inline bitblock128_t hsimd_packh_2(bitblock128_t arg1, bitblock128_t arg2)
1422{
1423        return hsimd_packl_2(simd_srli_64((1), arg1), simd_srli_64((1), arg2));
1424}
1425//The total number of operations is 26.3333333333
1426static inline bitblock128_t hsimd_packh_4(bitblock128_t arg1, bitblock128_t arg2)
1427{
1428        return hsimd_packl_4(simd_srli_64((2), arg1), simd_srli_64((2), arg2));
1429}
1430//The total number of operations is 15.6666666667
1431static inline bitblock128_t hsimd_packh_8(bitblock128_t arg1, bitblock128_t arg2)
1432{
1433        return hsimd_packl_8(simd_srli_64((4), arg1), simd_srli_64((4), arg2));
1434}
1435//The total number of operations is 3.0
1436static inline bitblock128_t hsimd_packh_64(bitblock128_t arg1, bitblock128_t arg2)
1437{
1438        return _mm_hsub_epi32(simd_srli_64((32), arg2), simd_srli_64((32), arg1));
1439}
1440//The total number of operations is 5.33333333333
1441static inline bitblock128_t hsimd_packh_128(bitblock128_t arg1, bitblock128_t arg2)
1442{
1443        return simd_ifh_1(simd_himask_128(), arg1, simd_srli_128((64), arg2));
1444}
1445//The total number of operations is 3.0
1446static inline bitblock128_t hsimd_packh_16(bitblock128_t arg1, bitblock128_t arg2)
1447{
1448        return hsimd_packus_16(simd_srli_16((8), arg1), simd_srli_16((8), arg2));
1449}
1450//The total number of operations is 0
1451static inline bitblock128_t simd_himask_32()
1452{
1453        return simd_constant_32(-65536);
1454}
1455//The total number of operations is 0
1456static inline bitblock128_t simd_himask_2()
1457{
1458        return simd_constant_2((2));
1459}
1460//The total number of operations is 0
1461static inline bitblock128_t simd_himask_4()
1462{
1463        return simd_constant_4((12));
1464}
1465//The total number of operations is 0
1466static inline bitblock128_t simd_himask_8()
1467{
1468        return simd_constant_8((240));
1469}
1470//The total number of operations is 0
1471static inline bitblock128_t simd_himask_64()
1472{
1473        return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
1474}
1475//The total number of operations is 0
1476static inline bitblock128_t simd_himask_128()
1477{
1478        return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
1479}
1480//The total number of operations is 0
1481static inline bitblock128_t simd_himask_16()
1482{
1483        return simd_constant_16((65280));
1484}
1485//The total number of operations is 2.0
1486static inline bool bitblock_all(bitblock128_t arg1)
1487{
1488        return hsimd_signmask_8(simd_eq_8(arg1, simd_constant_8(-1))) == 65535;
1489}
1490//The total number of operations is 4.0
1491static inline bitblock128_t simd_ifh_32(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1492{
1493        return simd_ifh_1(simd_gt_32(simd_constant_32(0), arg1), arg2, arg3);
1494}
1495//The total number of operations is 3.0
1496static inline bitblock128_t simd_ifh_1(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1497{
1498        return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
1499}
1500//The total number of operations is 8.0
1501static inline bitblock128_t simd_ifh_2(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1502{
1503        return simd_ifh_1(simd_ifh_1(simd_himask_2(), arg1, simd_srli_2((1), arg1)), arg2, arg3);
1504}
1505//The total number of operations is 13.0
1506static inline bitblock128_t simd_ifh_4(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1507{
1508        return simd_ifh_1(simd_gt_4(simd_constant_4(0), arg1), arg2, arg3);
1509}
1510//The total number of operations is 1.0
1511static inline bitblock128_t simd_ifh_8(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1512{
1513        return _mm_blendv_epi8(arg3, arg2, arg1);
1514}
1515//The total number of operations is 4.0
1516static inline bitblock128_t simd_ifh_64(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1517{
1518        return simd_ifh_1(simd_gt_64(simd_constant_64(0), arg1), arg2, arg3);
1519}
1520//The total number of operations is 9.33333333333
1521static inline bitblock128_t simd_ifh_128(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1522{
1523        return simd_ifh_64(simd_ifh_1(simd_himask_128(), arg1, simd_srli_128((64), arg1)), arg2, arg3);
1524}
1525//The total number of operations is 4.0
1526static inline bitblock128_t simd_ifh_16(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1527{
1528        return simd_ifh_1(simd_gt_16(simd_constant_16(0), arg1), arg2, arg3);
1529}
1530//The total number of operations is 1.0
1531static inline bitblock128_t simd_sub_32(bitblock128_t arg1, bitblock128_t arg2)
1532{
1533        return _mm_sub_epi32(arg1, arg2);
1534}
1535//The total number of operations is 1.0
1536static inline bitblock128_t simd_sub_1(bitblock128_t arg1, bitblock128_t arg2)
1537{
1538        return simd_xor(arg1, arg2);
1539}
1540//The total number of operations is 9.33333333333
1541static inline bitblock128_t simd_sub_2(bitblock128_t arg1, bitblock128_t arg2)
1542{
1543        bitblock128_t tmp = simd_xor(arg1, arg2);
1544        return simd_ifh_1(simd_himask_2(), simd_xor(tmp, simd_slli_128(1, simd_and(simd_not(arg1), arg2))), tmp);
1545}
1546//The total number of operations is 6.0
1547static inline bitblock128_t simd_sub_4(bitblock128_t arg1, bitblock128_t arg2)
1548{
1549        return simd_ifh_1(simd_himask_8(), simd_sub_8(arg1, simd_and(simd_himask_8(), arg2)), simd_sub_8(arg1, arg2));
1550}
1551//The total number of operations is 1.0
1552static inline bitblock128_t simd_sub_8(bitblock128_t arg1, bitblock128_t arg2)
1553{
1554        return _mm_sub_epi8(arg1, arg2);
1555}
1556//The total number of operations is 1.0
1557static inline bitblock128_t simd_sub_64(bitblock128_t arg1, bitblock128_t arg2)
1558{
1559        return _mm_sub_epi64(arg1, arg2);
1560}
1561//The total number of operations is 9.33333333333
1562static inline bitblock128_t simd_sub_128(bitblock128_t arg1, bitblock128_t arg2)
1563{
1564        bitblock128_t partial = simd_sub_64(arg1, arg2);
1565        bitblock128_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_andc(partial, simd_xor(arg1, arg2)));
1566        bitblock128_t borrow = simd_slli_128((64), simd_srli_64((63), borrowMask));
1567        return simd_sub_64(partial, borrow);
1568}
1569//The total number of operations is 1.0
1570static inline bitblock128_t simd_sub_16(bitblock128_t arg1, bitblock128_t arg2)
1571{
1572        return _mm_sub_epi16(arg1, arg2);
1573}
1574//The total number of operations is 3.0
1575static inline bitblock128_t simd_add_hl_32(bitblock128_t arg1)
1576{
1577        return simd_add_64(simd_srli_32((16), arg1), simd_and(arg1, simd_lomask_32()));
1578}
1579//The total number of operations is 3.0
1580static inline bitblock128_t simd_add_hl_2(bitblock128_t arg1)
1581{
1582        return simd_sub_16(arg1, simd_and(simd_lomask_2(), simd_srli_16(1, arg1)));
1583}
1584//The total number of operations is 4.0
1585static inline bitblock128_t simd_add_hl_4(bitblock128_t arg1)
1586{
1587        return simd_add_8(simd_srli_4((2), arg1), simd_and(arg1, simd_lomask_4()));
1588}
1589//The total number of operations is 4.0
1590static inline bitblock128_t simd_add_hl_8(bitblock128_t arg1)
1591{
1592        return simd_add_16(simd_srli_8((4), arg1), simd_and(arg1, simd_lomask_8()));
1593}
1594//The total number of operations is 3.0
1595static inline bitblock128_t simd_add_hl_64(bitblock128_t arg1)
1596{
1597        return simd_add_64(simd_srli_64((32), arg1), simd_and(arg1, simd_lomask_64()));
1598}
1599//The total number of operations is 12.6666666667
1600static inline bitblock128_t simd_add_hl_128(bitblock128_t arg1)
1601{
1602        return simd_add_128(simd_srli_128((64), arg1), simd_and(arg1, simd_lomask_128()));
1603}
1604//The total number of operations is 3.0
1605static inline bitblock128_t simd_add_hl_16(bitblock128_t arg1)
1606{
1607        return simd_add_32(simd_srli_16((8), arg1), simd_and(arg1, simd_lomask_16()));
1608}
1609//The total number of operations is 10.0
1610static inline bitblock128_t simd_srl_64(bitblock128_t arg1, bitblock128_t shift_mask)
1611{
1612        return simd_ifh_1(simd_himask_128(), _mm_srl_epi64(arg1, simd_and(_mm_srli_si128(shift_mask, (int32_t)(8)), _mm_cvtsi32_si128((int32_t)(63)))), _mm_srl_epi64(arg1, simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(63)))));
1613}
1614//The total number of operations is 13.0
1615static inline bitblock128_t simd_srl_128(bitblock128_t arg1, bitblock128_t shift_mask)
1616{
1617        bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(127)));
1618        return simd_or(_mm_srl_epi64(arg1, shift), simd_or(_mm_srli_si128(_mm_srl_epi64(arg1, simd_sub_32(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_srli_si128(_mm_sll_epi64(arg1, simd_sub_32(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
1619}
1620//The total number of operations is 0
1621static inline bitblock128_t simd_lomask_32()
1622{
1623        return simd_constant_32((65535));
1624}
1625//The total number of operations is 0
1626static inline bitblock128_t simd_lomask_2()
1627{
1628        return simd_constant_2((1));
1629}
1630//The total number of operations is 0
1631static inline bitblock128_t simd_lomask_4()
1632{
1633        return simd_constant_4((3));
1634}
1635//The total number of operations is 0
1636static inline bitblock128_t simd_lomask_8()
1637{
1638        return simd_constant_8((15));
1639}
1640//The total number of operations is 0
1641static inline bitblock128_t simd_lomask_64()
1642{
1643        return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
1644}
1645//The total number of operations is 0
1646static inline bitblock128_t simd_lomask_128()
1647{
1648        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
1649}
1650//The total number of operations is 0
1651static inline bitblock128_t simd_lomask_16()
1652{
1653        return simd_constant_16((255));
1654}
1655//The total number of operations is 3.0
1656static inline uint64_t hsimd_signmask_32(bitblock128_t arg1)
1657{
1658        return hsimd_signmask_16(hsimd_packss_32(simd_constant_32(0), arg1));
1659}
1660//The total number of operations is 24.0
1661static inline uint64_t hsimd_signmask_4(bitblock128_t arg1)
1662{
1663        uint64_t tmpAns1 = hsimd_signmask_8(esimd_mergeh_4(arg1, simd_constant_4(0)));
1664        uint64_t tmpAns2 = hsimd_signmask_8(esimd_mergel_4(arg1, simd_constant_4(0)));
1665        return ((tmpAns1<<(16))+tmpAns2);
1666}
1667//The total number of operations is 1.0
1668static inline uint64_t hsimd_signmask_8(bitblock128_t arg1)
1669{
1670        return _mm_movemask_epi8(arg1);
1671}
1672//The total number of operations is 1.0
1673static inline uint64_t hsimd_signmask_64(bitblock128_t arg1)
1674{
1675        return _mm_movemask_pd(_mm_castsi128_pd(arg1));
1676}
1677//The total number of operations is 6.33333333333
1678static inline uint64_t hsimd_signmask_128(bitblock128_t arg1)
1679{
1680        return hsimd_signmask_64(hsimd_packh_128(simd_constant_128(0), arg1));
1681}
1682//The total number of operations is 2.0
1683static inline uint64_t hsimd_signmask_16(bitblock128_t arg1)
1684{
1685        return hsimd_signmask_8(hsimd_packss_16(simd_constant_16(0), arg1));
1686}
1687//The total number of operations is 3.0
1688static inline bitblock128_t esimd_zeroextendh_32(bitblock128_t arg1)
1689{
1690        return esimd_mergeh_64(simd_srli_64(32, arg1), simd_and(simd_lomask_64(), arg1));
1691}
1692//The total number of operations is 24.0
1693static inline bitblock128_t esimd_zeroextendh_1(bitblock128_t arg1)
1694{
1695        return esimd_mergeh_2(simd_srli_2(1, arg1), simd_and(simd_lomask_2(), arg1));
1696}
1697//The total number of operations is 14.0
1698static inline bitblock128_t esimd_zeroextendh_2(bitblock128_t arg1)
1699{
1700        return esimd_mergeh_4(simd_srli_4(2, arg1), simd_and(simd_lomask_4(), arg1));
1701}
1702//The total number of operations is 4.0
1703static inline bitblock128_t esimd_zeroextendh_4(bitblock128_t arg1)
1704{
1705        return esimd_mergeh_8(simd_srli_8(4, arg1), simd_and(simd_lomask_8(), arg1));
1706}
1707//The total number of operations is 3.0
1708static inline bitblock128_t esimd_zeroextendh_8(bitblock128_t arg1)
1709{
1710        return esimd_mergeh_16(simd_srli_16(8, arg1), simd_and(simd_lomask_16(), arg1));
1711}
1712//The total number of operations is 2.33333333333
1713static inline bitblock128_t esimd_zeroextendh_64(bitblock128_t arg1)
1714{
1715        return simd_srli_128(64, arg1);
1716}
1717//The total number of operations is 3.0
1718static inline bitblock128_t esimd_zeroextendh_16(bitblock128_t arg1)
1719{
1720        return esimd_mergeh_32(simd_srli_32(16, arg1), simd_and(simd_lomask_32(), arg1));
1721}
1722//The total number of operations is 1.0
1723static inline bitblock128_t esimd_zeroextendl_32(bitblock128_t arg1)
1724{
1725        return _mm_cvtepu32_epi64(arg1);
1726}
1727//The total number of operations is 24.0
1728static inline bitblock128_t esimd_zeroextendl_1(bitblock128_t arg1)
1729{
1730        return esimd_mergel_2(simd_srli_2(1, arg1), simd_and(simd_lomask_2(), arg1));
1731}
1732//The total number of operations is 14.0
1733static inline bitblock128_t esimd_zeroextendl_2(bitblock128_t arg1)
1734{
1735        return esimd_mergel_4(simd_srli_4(2, arg1), simd_and(simd_lomask_4(), arg1));
1736}
1737//The total number of operations is 4.0
1738static inline bitblock128_t esimd_zeroextendl_4(bitblock128_t arg1)
1739{
1740        return esimd_mergel_8(simd_srli_8(4, arg1), simd_and(simd_lomask_8(), arg1));
1741}
1742//The total number of operations is 1.0
1743static inline bitblock128_t esimd_zeroextendl_8(bitblock128_t arg1)
1744{
1745        return _mm_cvtepu8_epi16(arg1);
1746}
1747//The total number of operations is 1.0
1748static inline bitblock128_t esimd_zeroextendl_64(bitblock128_t arg1)
1749{
1750        return simd_and(simd_lomask_128(), arg1);
1751}
1752//The total number of operations is 1.0
1753static inline bitblock128_t esimd_zeroextendl_16(bitblock128_t arg1)
1754{
1755        return _mm_cvtepu16_epi32(arg1);
1756}
1757//The total number of operations is 1.0
1758static inline bitblock128_t mvmd_fill4_32(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1759{
1760        return _mm_set_epi32((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4));
1761}
1762//The total number of operations is 5.0
1763static inline bitblock128_t mvmd_fill4_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1764{
1765        return simd_ifh_1(simd_himask_4(), mvmd_fill2_1(val1, val2), mvmd_fill2_1(val3, val4));
1766}
1767//The total number of operations is 5.0
1768static inline bitblock128_t mvmd_fill4_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1769{
1770        return simd_ifh_1(simd_himask_8(), mvmd_fill2_2(val1, val2), mvmd_fill2_2(val3, val4));
1771}
1772//The total number of operations is 5.0
1773static inline bitblock128_t mvmd_fill4_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1774{
1775        return simd_ifh_1(simd_himask_16(), mvmd_fill2_4(val1, val2), mvmd_fill2_4(val3, val4));
1776}
1777//The total number of operations is 5.0
1778static inline bitblock128_t mvmd_fill4_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1779{
1780        return simd_ifh_1(simd_himask_32(), mvmd_fill2_8(val1, val2), mvmd_fill2_8(val3, val4));
1781}
1782//The total number of operations is 3.0
1783static inline bitblock128_t mvmd_fill4_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1784{
1785        return simd_or(mvmd_fill4_32((val1<<16), (val3<<16), (val1<<16), (val3<<16)), mvmd_fill4_32((val2&(65535)), (val4&(65535)), (val2&(65535)), (val4&(65535))));
1786}
1787//The total number of operations is 1.0
1788static inline bitblock128_t simd_umin_32(bitblock128_t arg1, bitblock128_t arg2)
1789{
1790        return _mm_min_epu32(arg1, arg2);
1791}
1792//The total number of operations is 1.0
1793static inline bitblock128_t simd_umin_1(bitblock128_t arg1, bitblock128_t arg2)
1794{
1795        return simd_and(arg1, arg2);
1796}
1797//The total number of operations is 16.0
1798static inline bitblock128_t simd_umin_2(bitblock128_t arg1, bitblock128_t arg2)
1799{
1800        return simd_or(simd_and(simd_himask_4(), simd_umin_4(arg1, arg2)), simd_umin_4(simd_and(simd_lomask_4(), arg1), simd_and(simd_lomask_4(), arg2)));
1801}
1802//The total number of operations is 6.0
1803static inline bitblock128_t simd_umin_4(bitblock128_t arg1, bitblock128_t arg2)
1804{
1805        return simd_or(simd_and(simd_himask_8(), simd_umin_8(arg1, arg2)), simd_umin_8(simd_and(simd_lomask_8(), arg1), simd_and(simd_lomask_8(), arg2)));
1806}
1807//The total number of operations is 1.0
1808static inline bitblock128_t simd_umin_8(bitblock128_t arg1, bitblock128_t arg2)
1809{
1810        return _mm_min_epu8(arg1, arg2);
1811}
1812//The total number of operations is 7.0
1813static inline bitblock128_t simd_umin_64(bitblock128_t arg1, bitblock128_t arg2)
1814{
1815        bitblock128_t high_bit = simd_constant_64((9223372036854775808ULL));
1816        return simd_xor(simd_min_64(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
1817}
1818//The total number of operations is 22.6666666667
1819static inline bitblock128_t simd_umin_128(bitblock128_t arg1, bitblock128_t arg2)
1820{
1821        bitblock128_t tmpAns = simd_umin_64(arg1, arg2);
1822        bitblock128_t eqMask1 = simd_srli_128((64), simd_eq_64(tmpAns, arg1));
1823        bitblock128_t eqMask2 = simd_srli_128((64), simd_eq_64(tmpAns, arg2));
1824        return simd_ifh_1(simd_himask_128(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
1825}
1826//The total number of operations is 1.0
1827static inline bitblock128_t simd_umin_16(bitblock128_t arg1, bitblock128_t arg2)
1828{
1829        return _mm_min_epu16(arg1, arg2);
1830}
1831//The total number of operations is 1.0
1832static inline bitblock128_t simd_min_32(bitblock128_t arg1, bitblock128_t arg2)
1833{
1834        return _mm_min_epi32(arg1, arg2);
1835}
1836//The total number of operations is 1.0
1837static inline bitblock128_t simd_min_1(bitblock128_t arg1, bitblock128_t arg2)
1838{
1839        return simd_or(arg1, arg2);
1840}
1841//The total number of operations is 16.6666666667
1842static inline bitblock128_t simd_min_2(bitblock128_t arg1, bitblock128_t arg2)
1843{
1844        bitblock128_t tmp1 = simd_srli_128(1, arg1);
1845        bitblock128_t tmp2 = simd_srli_128(1, arg2);
1846        return simd_ifh_1(simd_himask_2(), simd_or(arg1, arg2), simd_or(simd_and(arg1, simd_and(tmp1, simd_not(tmp2))), simd_and(arg2, simd_or(simd_and(simd_not(tmp1), tmp2), arg1))));
1847}
1848//The total number of operations is 9.0
1849static inline bitblock128_t simd_min_4(bitblock128_t arg1, bitblock128_t arg2)
1850{
1851        bitblock128_t high_bit = simd_constant_4((8));
1852        return simd_xor(simd_umin_4(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
1853}
1854//The total number of operations is 1.0
1855static inline bitblock128_t simd_min_8(bitblock128_t arg1, bitblock128_t arg2)
1856{
1857        return _mm_min_epi8(arg1, arg2);
1858}
1859//The total number of operations is 4.0
1860static inline bitblock128_t simd_min_64(bitblock128_t arg1, bitblock128_t arg2)
1861{
1862        return simd_ifh_1(simd_gt_64(arg1, arg2), arg2, arg1);
1863}
1864//The total number of operations is 26.6666666667
1865static inline bitblock128_t simd_min_128(bitblock128_t arg1, bitblock128_t arg2)
1866{
1867        bitblock128_t hiAns = simd_min_64(arg1, arg2);
1868        bitblock128_t loAns = simd_umin_64(arg1, arg2);
1869        bitblock128_t eqMask1 = simd_srli_128((64), simd_eq_64(hiAns, arg1));
1870        bitblock128_t eqMask2 = simd_srli_128((64), simd_eq_64(hiAns, arg2));
1871        return simd_ifh_1(simd_himask_128(), hiAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, loAns, arg1), arg2));
1872}
1873//The total number of operations is 1.0
1874static inline bitblock128_t simd_min_16(bitblock128_t arg1, bitblock128_t arg2)
1875{
1876        return _mm_min_epi16(arg1, arg2);
1877}
1878//The total number of operations is 5.0
1879static inline bitblock128_t mvmd_fill2_32(uint64_t val1, uint64_t val2)
1880{
1881        return simd_ifh_1(simd_himask_64(), mvmd_fill_32(val1), mvmd_fill_32(val2));
1882}
1883//The total number of operations is 1.0
1884static inline bitblock128_t mvmd_fill2_1(uint64_t val1, uint64_t val2)
1885{
1886        return mvmd_fill_2(((val1<<1)|(val2&(1))));
1887}
1888//The total number of operations is 1.0
1889static inline bitblock128_t mvmd_fill2_2(uint64_t val1, uint64_t val2)
1890{
1891        return mvmd_fill_4(((val1<<2)|(val2&(3))));
1892}
1893//The total number of operations is 1.0
1894static inline bitblock128_t mvmd_fill2_4(uint64_t val1, uint64_t val2)
1895{
1896        return mvmd_fill_8(((val1<<4)|(val2&(15))));
1897}
1898//The total number of operations is 1.0
1899static inline bitblock128_t mvmd_fill2_8(uint64_t val1, uint64_t val2)
1900{
1901        return mvmd_fill_16(((val1<<8)|(val2&(255))));
1902}
1903//The total number of operations is 5.0
1904static inline bitblock128_t mvmd_fill2_64(uint64_t val1, uint64_t val2)
1905{
1906        return simd_ifh_1(simd_himask_128(), mvmd_fill_64(val1), mvmd_fill_64(val2));
1907}
1908//The total number of operations is 1.0
1909static inline bitblock128_t mvmd_fill2_16(uint64_t val1, uint64_t val2)
1910{
1911        return mvmd_fill_32(((val1<<16)|(val2&(65535))));
1912}
1913//The total number of operations is 2.0
1914static inline bool bitblock_any(bitblock128_t arg1)
1915{
1916        return hsimd_signmask_8(simd_eq_8(arg1, simd_constant_8(0))) != 65535;
1917}
1918//The total number of operations is 18.3333333333
1919static inline uint64_t bitblock_popcount(bitblock128_t arg1)
1920{
1921        return mvmd_extract_64(0, simd_popcount_128(arg1));
1922}
1923//The total number of operations is 1.0
1924static inline bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2)
1925{
1926        return _mm_or_si128(arg1, arg2);
1927}
1928//The total number of operations is 3.0
1929static inline bitblock128_t hsimd_packl_32(bitblock128_t arg1, bitblock128_t arg2)
1930{
1931        return hsimd_packus_32(simd_and(arg1, simd_lomask_32()), simd_and(arg2, simd_lomask_32()));
1932}
1933//The total number of operations is 35.0
1934static inline bitblock128_t hsimd_packl_2(bitblock128_t arg1, bitblock128_t arg2)
1935{
1936        return hsimd_packl_4(simd_ifh_1(simd_himask_2(), simd_srli_128((1), arg1), arg1), simd_ifh_1(simd_himask_2(), simd_srli_128((1), arg2), arg2));
1937}
1938//The total number of operations is 24.3333333333
1939static inline bitblock128_t hsimd_packl_4(bitblock128_t arg1, bitblock128_t arg2)
1940{
1941        return hsimd_packl_8(simd_ifh_1(simd_himask_4(), simd_srli_128((2), arg1), arg1), simd_ifh_1(simd_himask_4(), simd_srli_128((2), arg2), arg2));
1942}
1943//The total number of operations is 13.6666666667
1944static inline bitblock128_t hsimd_packl_8(bitblock128_t arg1, bitblock128_t arg2)
1945{
1946        return hsimd_packl_16(simd_ifh_1(simd_himask_8(), simd_srli_128((4), arg1), arg1), simd_ifh_1(simd_himask_8(), simd_srli_128((4), arg2), arg2));
1947}
1948//The total number of operations is 3.0
1949static inline bitblock128_t hsimd_packl_64(bitblock128_t arg1, bitblock128_t arg2)
1950{
1951        return _mm_hsub_epi32(simd_and(arg2, simd_lomask_64()), simd_and(arg1, simd_lomask_64()));
1952}
1953//The total number of operations is 5.33333333333
1954static inline bitblock128_t hsimd_packl_128(bitblock128_t arg1, bitblock128_t arg2)
1955{
1956        return simd_ifh_1(simd_himask_128(), simd_slli_128((64), arg1), arg2);
1957}
1958//The total number of operations is 3.0
1959static inline bitblock128_t hsimd_packl_16(bitblock128_t arg1, bitblock128_t arg2)
1960{
1961        return hsimd_packus_16(simd_and(arg1, simd_lomask_16()), simd_and(arg2, simd_lomask_16()));
1962}
1963//The total number of operations is 13.0
1964static inline bitblock128_t mvmd_fill8_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1965{
1966        return simd_ifh_1(simd_himask_8(), mvmd_fill4_1(val1, val2, val3, val4), mvmd_fill4_1(val5, val6, val7, val8));
1967}
1968//The total number of operations is 13.0
1969static inline bitblock128_t mvmd_fill8_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1970{
1971        return simd_ifh_1(simd_himask_16(), mvmd_fill4_2(val1, val2, val3, val4), mvmd_fill4_2(val5, val6, val7, val8));
1972}
1973//The total number of operations is 7.0
1974static inline bitblock128_t mvmd_fill8_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1975{
1976        return simd_or(mvmd_fill8_8((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4)), mvmd_fill8_8((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15))));
1977}
1978//The total number of operations is 3.0
1979static inline bitblock128_t mvmd_fill8_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1980{
1981        return simd_or(mvmd_fill8_16((val1<<8), (val3<<8), (val5<<8), (val7<<8), (val1<<8), (val3<<8), (val5<<8), (val7<<8)), mvmd_fill8_16((val2&(255)), (val4&(255)), (val6&(255)), (val8&(255)), (val2&(255)), (val4&(255)), (val6&(255)), (val8&(255))));
1982}
1983//The total number of operations is 1.0
1984static inline bitblock128_t mvmd_fill8_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1985{
1986        return _mm_set_epi16((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8));
1987}
1988//The total number of operations is 7.0
1989static inline bitblock128_t hsimd_min_hl_32(bitblock128_t arg1, bitblock128_t arg2)
1990{
1991        return simd_min_16(hsimd_packh_32(arg1, arg2), hsimd_packl_32(arg1, arg2));
1992}
1993//The total number of operations is 73.0
1994static inline bitblock128_t hsimd_min_hl_2(bitblock128_t arg1, bitblock128_t arg2)
1995{
1996        return simd_min_1(hsimd_packh_2(arg1, arg2), hsimd_packl_2(arg1, arg2));
1997}
1998//The total number of operations is 67.3333333333
1999static inline bitblock128_t hsimd_min_hl_4(bitblock128_t arg1, bitblock128_t arg2)
2000{
2001        return simd_min_2(hsimd_packh_4(arg1, arg2), hsimd_packl_4(arg1, arg2));
2002}
2003//The total number of operations is 38.3333333333
2004static inline bitblock128_t hsimd_min_hl_8(bitblock128_t arg1, bitblock128_t arg2)
2005{
2006        return simd_min_4(hsimd_packh_8(arg1, arg2), hsimd_packl_8(arg1, arg2));
2007}
2008//The total number of operations is 7.0
2009static inline bitblock128_t hsimd_min_hl_64(bitblock128_t arg1, bitblock128_t arg2)
2010{
2011        return simd_min_32(hsimd_packh_64(arg1, arg2), hsimd_packl_64(arg1, arg2));
2012}
2013//The total number of operations is 14.6666666667
2014static inline bitblock128_t hsimd_min_hl_128(bitblock128_t arg1, bitblock128_t arg2)
2015{
2016        return simd_min_64(hsimd_packh_128(arg1, arg2), hsimd_packl_128(arg1, arg2));
2017}
2018//The total number of operations is 7.0
2019static inline bitblock128_t hsimd_min_hl_16(bitblock128_t arg1, bitblock128_t arg2)
2020{
2021        return simd_min_8(hsimd_packh_16(arg1, arg2), hsimd_packl_16(arg1, arg2));
2022}
2023//The total number of operations is 1.0
2024static inline bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2)
2025{
2026        return _mm_xor_si128(arg1, arg2);
2027}
2028//The total number of operations is 1.0
2029static inline bitblock128_t simd_umax_32(bitblock128_t arg1, bitblock128_t arg2)
2030{
2031        return _mm_max_epu32(arg1, arg2);
2032}
2033//The total number of operations is 1.0
2034static inline bitblock128_t simd_umax_1(bitblock128_t arg1, bitblock128_t arg2)
2035{
2036        return simd_or(arg1, arg2);
2037}
2038//The total number of operations is 15.6666666667
2039static inline bitblock128_t simd_umax_2(bitblock128_t arg1, bitblock128_t arg2)
2040{
2041        return simd_ifh_1(simd_himask_2(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd_srli_128(1, simd_or(simd_not(arg1), arg2))), simd_and(arg1, simd_srli_128(1, simd_or(arg1, simd_not(arg2))))));
2042}
2043//The total number of operations is 6.0
2044static inline bitblock128_t simd_umax_4(bitblock128_t arg1, bitblock128_t arg2)
2045{
2046        return simd_or(simd_and(simd_himask_8(), simd_umax_8(arg1, arg2)), simd_umax_8(simd_and(simd_lomask_8(), arg1), simd_and(simd_lomask_8(), arg2)));
2047}
2048//The total number of operations is 1.0
2049static inline bitblock128_t simd_umax_8(bitblock128_t arg1, bitblock128_t arg2)
2050{
2051        return _mm_max_epu8(arg1, arg2);
2052}
2053//The total number of operations is 7.0
2054static inline bitblock128_t simd_umax_64(bitblock128_t arg1, bitblock128_t arg2)
2055{
2056        bitblock128_t high_bit = simd_constant_64((9223372036854775808ULL));
2057        return simd_xor(simd_max_64(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
2058}
2059//The total number of operations is 22.6666666667
2060static inline bitblock128_t simd_umax_128(bitblock128_t arg1, bitblock128_t arg2)
2061{
2062        bitblock128_t tmpAns = simd_umax_64(arg1, arg2);
2063        bitblock128_t eqMask1 = simd_srli_128((64), simd_eq_64(tmpAns, arg1));
2064        bitblock128_t eqMask2 = simd_srli_128((64), simd_eq_64(tmpAns, arg2));
2065        return simd_ifh_1(simd_himask_128(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
2066}
2067//The total number of operations is 1.0
2068static inline bitblock128_t simd_umax_16(bitblock128_t arg1, bitblock128_t arg2)
2069{
2070        return _mm_max_epu16(arg1, arg2);
2071}
2072//The total number of operations is 1.0
2073static inline bitblock128_t bitblock_load_aligned(const bitblock128_t* arg1)
2074{
2075        return _mm_load_si128((bitblock128_t*)(arg1));
2076}
2077//The total number of operations is 1.0
2078static inline void bitblock_store_unaligned(bitblock128_t arg1, bitblock128_t* arg2)
2079{
2080        _mm_storeu_si128((bitblock128_t*)(arg2), arg1);
2081}
2082//The total number of operations is 1.0
2083static inline bitblock128_t esimd_signextendl_32(bitblock128_t arg1)
2084{
2085        return _mm_cvtepi32_epi64(arg1);
2086}
2087//The total number of operations is 31.0
2088static inline bitblock128_t esimd_signextendl_1(bitblock128_t arg1)
2089{
2090        return esimd_mergel_2(simd_srai_2(1, arg1), simd_srai_2(1, simd_slli_2(1, arg1)));
2091}
2092//The total number of operations is 33.0
2093static inline bitblock128_t esimd_signextendl_2(bitblock128_t arg1)
2094{
2095        return esimd_mergel_4(simd_srai_4(2, arg1), simd_srai_4(2, simd_slli_4(2, arg1)));
2096}
2097//The total number of operations is 13.0
2098static inline bitblock128_t esimd_signextendl_4(bitblock128_t arg1)
2099{
2100        return esimd_mergel_8(simd_srai_8(4, arg1), simd_srai_8(4, simd_slli_8(4, arg1)));
2101}
2102//The total number of operations is 1.0
2103static inline bitblock128_t esimd_signextendl_8(bitblock128_t arg1)
2104{
2105        return _mm_cvtepi8_epi16(arg1);
2106}
2107//The total number of operations is 13.4166666667
2108static inline bitblock128_t esimd_signextendl_64(bitblock128_t arg1)
2109{
2110        return simd_srai_128(64, simd_slli_128(64, arg1));
2111}
2112//The total number of operations is 1.0
2113static inline bitblock128_t esimd_signextendl_16(bitblock128_t arg1)
2114{
2115        return _mm_cvtepi16_epi32(arg1);
2116}
2117//The total number of operations is 1.0
2118static inline bitblock128_t hsimd_packus_32(bitblock128_t arg1, bitblock128_t arg2)
2119{
2120        return _mm_packus_epi32(arg2, arg1);
2121}
2122//The total number of operations is 75.0
2123static inline bitblock128_t hsimd_packus_2(bitblock128_t arg1, bitblock128_t arg2)
2124{
2125        bitblock128_t arg11 = simd_ifh_2(arg1, simd_constant_2(0), arg1);
2126        bitblock128_t arg12 = simd_and(simd_lomask_2(), arg11);
2127        bitblock128_t arg21 = simd_ifh_2(arg2, simd_constant_2(0), arg2);
2128        bitblock128_t arg22 = simd_and(simd_lomask_2(), arg21);
2129        return hsimd_packl_2(simd_ifh_1(simd_eq_2(arg12, arg11), arg12, simd_lomask_2()), simd_ifh_1(simd_eq_2(arg22, arg21), arg22, simd_lomask_2()));
2130}
2131//The total number of operations is 74.3333333333
2132static inline bitblock128_t hsimd_packus_4(bitblock128_t arg1, bitblock128_t arg2)
2133{
2134        bitblock128_t hiPart = hsimd_packh_4(arg1, arg2);
2135        return simd_ifh_2(hiPart, simd_constant_2(0), simd_or(simd_gt_2(hiPart, simd_constant_2(0)), hsimd_packl_4(arg1, arg2)));
2136}
2137//The total number of operations is 25.6666666667
2138static inline bitblock128_t hsimd_packus_8(bitblock128_t arg1, bitblock128_t arg2)
2139{
2140        bitblock128_t arg11 = simd_ifh_8(arg1, simd_constant_8(0), arg1);
2141        bitblock128_t arg12 = simd_and(simd_lomask_8(), arg11);
2142        bitblock128_t arg21 = simd_ifh_8(arg2, simd_constant_8(0), arg2);
2143        bitblock128_t arg22 = simd_and(simd_lomask_8(), arg21);
2144        return hsimd_packl_8(simd_ifh_1(simd_eq_8(arg12, arg11), arg12, simd_lomask_8()), simd_ifh_1(simd_eq_8(arg22, arg21), arg22, simd_lomask_8()));
2145}
2146//The total number of operations is 12.0
2147static inline bitblock128_t hsimd_packus_64(bitblock128_t arg1, bitblock128_t arg2)
2148{
2149        bitblock128_t hiPart = hsimd_packh_64(arg1, arg2);
2150        return simd_ifh_32(hiPart, simd_constant_32(0), simd_or(simd_gt_32(hiPart, simd_constant_32(0)), hsimd_packl_64(arg1, arg2)));
2151}
2152//The total number of operations is 16.6666666667
2153static inline bitblock128_t hsimd_packus_128(bitblock128_t arg1, bitblock128_t arg2)
2154{
2155        bitblock128_t hiPart = hsimd_packh_128(arg1, arg2);
2156        return simd_ifh_64(hiPart, simd_constant_64(0), simd_or(simd_gt_64(hiPart, simd_constant_64(0)), hsimd_packl_128(arg1, arg2)));
2157}
2158//The total number of operations is 1.0
2159static inline bitblock128_t hsimd_packus_16(bitblock128_t arg1, bitblock128_t arg2)
2160{
2161        return _mm_packus_epi16(arg2, arg1);
2162}
2163//The total number of operations is 1.0
2164static inline bitblock128_t simd_abs_32(bitblock128_t arg1)
2165{
2166        return _mm_abs_epi32(arg1);
2167}
2168//The total number of operations is 7.33333333333
2169static inline bitblock128_t simd_abs_2(bitblock128_t arg1)
2170{
2171        return simd_ifh_1(simd_himask_2(), simd_and(arg1, simd_slli_128(1, simd_not(arg1))), arg1);
2172}
2173//The total number of operations is 19.0
2174static inline bitblock128_t simd_abs_4(bitblock128_t arg1)
2175{
2176        bitblock128_t gtMask = simd_gt_4(arg1, simd_constant_4(0));
2177        return simd_ifh_1(gtMask, arg1, simd_sub_4(gtMask, arg1));
2178}
2179//The total number of operations is 1.0
2180static inline bitblock128_t simd_abs_8(bitblock128_t arg1)
2181{
2182        return _mm_abs_epi8(arg1);
2183}
2184//The total number of operations is 5.0
2185static inline bitblock128_t simd_abs_64(bitblock128_t arg1)
2186{
2187        bitblock128_t gtMask = simd_gt_64(arg1, simd_constant_64(0));
2188        return simd_ifh_1(gtMask, arg1, simd_sub_64(gtMask, arg1));
2189}
2190//The total number of operations is 28.0
2191static inline bitblock128_t simd_abs_128(bitblock128_t arg1)
2192{
2193        bitblock128_t eqMask = simd_eq_128(simd_ifh_1(simd_himask_128(), simd_abs_64(arg1), arg1), arg1);
2194        return simd_ifh_1(eqMask, arg1, simd_sub_128(eqMask, arg1));
2195}
2196//The total number of operations is 1.0
2197static inline bitblock128_t simd_abs_16(bitblock128_t arg1)
2198{
2199        return _mm_abs_epi16(arg1);
2200}
2201//The total number of operations is 3.0
2202static inline bitblock128_t simd_xor_hl_32(bitblock128_t arg1)
2203{
2204        return simd_xor(simd_srli_32((16), arg1), simd_and(arg1, simd_lomask_32()));
2205}
2206//The total number of operations is 4.0
2207static inline bitblock128_t simd_xor_hl_2(bitblock128_t arg1)
2208{
2209        return simd_xor(simd_srli_2((1), arg1), simd_and(arg1, simd_lomask_2()));
2210}
2211//The total number of operations is 4.0
2212static inline bitblock128_t simd_xor_hl_4(bitblock128_t arg1)
2213{
2214        return simd_xor(simd_srli_4((2), arg1), simd_and(arg1, simd_lomask_4()));
2215}
2216//The total number of operations is 4.0
2217static inline bitblock128_t simd_xor_hl_8(bitblock128_t arg1)
2218{
2219        return simd_xor(simd_srli_8((4), arg1), simd_and(arg1, simd_lomask_8()));
2220}
2221//The total number of operations is 3.0
2222static inline bitblock128_t simd_xor_hl_64(bitblock128_t arg1)
2223{
2224        return simd_xor(simd_srli_64((32), arg1), simd_and(arg1, simd_lomask_64()));
2225}
2226//The total number of operations is 4.33333333333
2227static inline bitblock128_t simd_xor_hl_128(bitblock128_t arg1)
2228{
2229        return simd_xor(simd_srli_128((64), arg1), simd_and(arg1, simd_lomask_128()));
2230}
2231//The total number of operations is 3.0
2232static inline bitblock128_t simd_xor_hl_16(bitblock128_t arg1)
2233{
2234        return simd_xor(simd_srli_16((8), arg1), simd_and(arg1, simd_lomask_16()));
2235}
2236//The total number of operations is 10.0
2237static inline bitblock128_t simd_srai_4(uint64_t sh, bitblock128_t arg1)
2238{
2239        bitblock128_t tmp = simd_srli_4(((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)), arg1);
2240        return simd_or(tmp, simd_sub_4(simd_constant_4(0), simd_and(simd_constant_4((1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))), tmp)));
2241}
2242//The total number of operations is 5.0
2243static inline bitblock128_t simd_srai_8(uint64_t sh, bitblock128_t arg1)
2244{
2245        bitblock128_t tmp = simd_srli_8(((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)), arg1);
2246        return simd_or(tmp, simd_sub_8(simd_constant_8(0), simd_and(simd_constant_8((1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))), tmp)));
2247}
2248//The total number of operations is 1.0
2249static inline bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2)
2250{
2251        return _mm_and_si128(arg1, arg2);
2252}
2253//The total number of operations is 15.0
2254static inline bitblock128_t mvmd_fill16_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2255{
2256        return simd_or(mvmd_fill16_2((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd_fill16_2((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
2257}
2258//The total number of operations is 7.0
2259static inline bitblock128_t mvmd_fill16_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2260{
2261        return simd_or(mvmd_fill16_4((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd_fill16_4((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
2262}
2263//The total number of operations is 3.0
2264static inline bitblock128_t mvmd_fill16_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2265{
2266        return simd_or(mvmd_fill16_8((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd_fill16_8((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
2267}
2268//The total number of operations is 1.0
2269static inline bitblock128_t mvmd_fill16_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2270{
2271        return _mm_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
2272}
2273//The total number of operations is 5.0
2274static inline bitblock128_t simd_lt_32(bitblock128_t arg1, bitblock128_t arg2)
2275{
2276        return simd_and(simd_not(simd_gt_32(arg1, arg2)), simd_not(simd_eq_32(arg1, arg2)));
2277}
2278//The total number of operations is 1.0
2279static inline bitblock128_t simd_lt_1(bitblock128_t arg1, bitblock128_t arg2)
2280{
2281        return simd_andc(arg1, arg2);
2282}
2283//The total number of operations is 14.6666666667
2284static inline bitblock128_t simd_lt_2(bitblock128_t arg1, bitblock128_t arg2)
2285{
2286        bitblock128_t tmp = simd_not(arg2);
2287        bitblock128_t tmpAns = simd_or(simd_and(arg1, tmp), simd_and(simd_slli_128(1, simd_and(simd_not(arg1), arg2)), simd_or(arg1, tmp)));
2288        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(1, tmpAns));
2289}
2290//The total number of operations is 18.0
2291static inline bitblock128_t simd_lt_4(bitblock128_t arg1, bitblock128_t arg2)
2292{
2293        return simd_ifh_1(simd_himask_8(), simd_lt_8(arg1, simd_and(simd_himask_8(), arg2)), simd_lt_8(simd_slli_8(4, arg1), simd_slli_8(4, arg2)));
2294}
2295//The total number of operations is 5.0
2296static inline bitblock128_t simd_lt_8(bitblock128_t arg1, bitblock128_t arg2)
2297{
2298        return simd_and(simd_not(simd_gt_8(arg1, arg2)), simd_not(simd_eq_8(arg1, arg2)));
2299}
2300//The total number of operations is 5.0
2301static inline bitblock128_t simd_lt_64(bitblock128_t arg1, bitblock128_t arg2)
2302{
2303        return simd_and(simd_not(simd_gt_64(arg1, arg2)), simd_not(simd_eq_64(arg1, arg2)));
2304}
2305//The total number of operations is 31.75
2306static inline bitblock128_t simd_lt_128(bitblock128_t arg1, bitblock128_t arg2)
2307{
2308        bitblock128_t hiAns = simd_lt_64(arg1, arg2);
2309        bitblock128_t loAns = simd_ult_64(arg1, arg2);
2310        bitblock128_t mask = simd_and(loAns, simd_srli_128((64), simd_eq_64(arg1, arg2)));
2311        mask = simd_or(mask, simd_slli_128((64), mask));
2312        return simd_or(simd_srai_128((64), hiAns), mask);
2313}
2314//The total number of operations is 5.0
2315static inline bitblock128_t simd_lt_16(bitblock128_t arg1, bitblock128_t arg2)
2316{
2317        return simd_and(simd_not(simd_gt_16(arg1, arg2)), simd_not(simd_eq_16(arg1, arg2)));
2318}
2319//The total number of operations is 1.0
2320static inline bitblock128_t simd_add_32(bitblock128_t arg1, bitblock128_t arg2)
2321{
2322        return _mm_add_epi32(arg1, arg2);
2323}
2324//The total number of operations is 1.0
2325static inline bitblock128_t simd_add_1(bitblock128_t arg1, bitblock128_t arg2)
2326{
2327        return simd_xor(arg1, arg2);
2328}
2329//The total number of operations is 8.33333333333
2330static inline bitblock128_t simd_add_2(bitblock128_t arg1, bitblock128_t arg2)
2331{
2332        bitblock128_t tmp = simd_xor(arg1, arg2);
2333        return simd_ifh_1(simd_himask_2(), simd_xor(tmp, simd_slli_128(1, simd_and(arg1, arg2))), tmp);
2334}
2335//The total number of operations is 6.0
2336static inline bitblock128_t simd_add_4(bitblock128_t arg1, bitblock128_t arg2)
2337{
2338        return simd_ifh_1(simd_himask_8(), simd_add_8(arg1, simd_and(simd_himask_8(), arg2)), simd_add_8(arg1, arg2));
2339}
2340//The total number of operations is 1.0
2341static inline bitblock128_t simd_add_8(bitblock128_t arg1, bitblock128_t arg2)
2342{
2343        return _mm_add_epi8(arg1, arg2);
2344}
2345//The total number of operations is 1.0
2346static inline bitblock128_t simd_add_64(bitblock128_t arg1, bitblock128_t arg2)
2347{
2348        return _mm_add_epi64(arg1, arg2);
2349}
2350//The total number of operations is 9.33333333333
2351static inline bitblock128_t simd_add_128(bitblock128_t arg1, bitblock128_t arg2)
2352{
2353        bitblock128_t partial = simd_add_64(arg1, arg2);
2354        bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
2355        bitblock128_t carry = simd_slli_128((64), simd_srli_64((63), carryMask));
2356        return simd_add_64(partial, carry);
2357}
2358//The total number of operations is 1.0
2359static inline bitblock128_t simd_add_16(bitblock128_t arg1, bitblock128_t arg2)
2360{
2361        return _mm_add_epi16(arg1, arg2);
2362}
2363//The total number of operations is 3.0
2364static inline bitblock128_t simd_ugt_32(bitblock128_t arg1, bitblock128_t arg2)
2365{
2366        bitblock128_t high_bit = simd_constant_32((2147483648ULL));
2367        return simd_gt_32(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2368}
2369//The total number of operations is 1.0
2370static inline bitblock128_t simd_ugt_1(bitblock128_t arg1, bitblock128_t arg2)
2371{
2372        return simd_andc(arg1, arg2);
2373}
2374//The total number of operations is 13.6666666667
2375static inline bitblock128_t simd_ugt_2(bitblock128_t arg1, bitblock128_t arg2)
2376{
2377        bitblock128_t tmp = simd_not(arg2);
2378        bitblock128_t tmpAns = simd_or(simd_and(arg1, tmp), simd_and(simd_slli_128(1, simd_and(arg1, tmp)), simd_or(arg1, tmp)));
2379        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(1, tmpAns));
2380}
2381//The total number of operations is 12.0
2382static inline bitblock128_t simd_ugt_4(bitblock128_t arg1, bitblock128_t arg2)
2383{
2384        return simd_ifh_1(simd_himask_8(), simd_ugt_8(simd_and(simd_himask_8(), arg1), arg2), simd_ugt_8(simd_andc(arg1, simd_himask_8()), simd_andc(arg2, simd_himask_8())));
2385}
2386//The total number of operations is 3.0
2387static inline bitblock128_t simd_ugt_8(bitblock128_t arg1, bitblock128_t arg2)
2388{
2389        bitblock128_t high_bit = simd_constant_8((128));
2390        return simd_gt_8(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2391}
2392//The total number of operations is 3.0
2393static inline bitblock128_t simd_ugt_64(bitblock128_t arg1, bitblock128_t arg2)
2394{
2395        bitblock128_t high_bit = simd_constant_64((9223372036854775808ULL));
2396        return simd_gt_64(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2397}
2398//The total number of operations is 22.75
2399static inline bitblock128_t simd_ugt_128(bitblock128_t arg1, bitblock128_t arg2)
2400{
2401        bitblock128_t tmpAns = simd_ugt_64(arg1, arg2);
2402        bitblock128_t mask = simd_and(tmpAns, simd_srli_128((64), simd_eq_64(arg1, arg2)));
2403        mask = simd_or(mask, simd_slli_128((64), mask));
2404        return simd_or(simd_srai_128((64), tmpAns), mask);
2405}
2406//The total number of operations is 3.0
2407static inline bitblock128_t simd_ugt_16(bitblock128_t arg1, bitblock128_t arg2)
2408{
2409        bitblock128_t high_bit = simd_constant_16((32768));
2410        return simd_gt_16(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2411}
2412#endif
Note: See TracBrowser for help on using the repository browser.