source: trunk/lib_c/idisa_c/idisa_sse4_1_c.h

Last change on this file was 3391, checked in by linmengl, 6 years ago

check in IDISA C library and other support libraries. Some template features still remain.

File size: 111.9 KB
Line 
1
/* Copyright (c) 2011, Hua Huang and Robert D. Cameron.
   Licensed under the Academic Free License 3.0.
   This file is generated by the IDISA+ generator;
   modifications should be made only by changing the
   generator configuration and data files. */
7
8#ifndef _IDISA_SSE4_1_C_H
9#define _IDISA_SSE4_1_C_H
#include "smmintrin.h"

#include <stdbool.h>
#include <stdint.h>
13typedef __m128i bitblock128_t;
14
/* Pack four selectors into one immediate: s1 is placed at bits 7..6,
 * s2 at bits 5..4, s3 at bits 3..2 and s4 at bits 1..0.
 * Each parameter is parenthesized so that expression arguments such as
 * `a | b` are shifted as a whole. */
#define shufflemask4(s1, s2, s3, s4) \
        (((s1)<<6) | ((s2)<<4) | ((s3)<<2) | (s4))
17
/* Translate a 2-bit shuffle mask (values 0..3) into the equivalent
 * 8-bit immediate for a four-field shuffle: 3 -> 238, 2 -> 228,
 * 1 -> 78, anything else -> 68.
 * msk is parenthesized so that an argument like `x & 3` is compared as a
 * whole instead of binding as `x & (3 == 3)`. */
#define shufflemask4_from_shufflemask2(msk) \
        ((msk)==3 ? 238 : ((msk)==2 ? 228 : ((msk)==1 ? 78 : 68)))
20
/* Compress a mask made of eight 3-bit selectors into eight 2-bit selectors:
 * the low two bits of selector i (bits 3i+1..3i of msk) are moved to bits
 * 2i+1..2i of the result; the third bit of every selector is dropped.
 * msk is parenthesized so that expression arguments expand correctly. */
#define shufflemask8_to_shufflemask4(msk) \
        (((msk)&3) | ((((msk)>>3)&3)<<2) | ((((msk)>>6)&3)<<4) | ((((msk)>>9)&3)<<6) | \
         ((((msk)>>12)&3)<<8) | ((((msk)>>15)&3)<<10) | ((((msk)>>18)&3)<<12) | ((((msk)>>21)&3)<<14))
23
24//Declaration Starts here
25static inline bitblock128_t esimd_mergel_32(bitblock128_t arg1, bitblock128_t arg2);
26static inline bitblock128_t esimd_mergel_1(bitblock128_t arg1, bitblock128_t arg2);
27static inline bitblock128_t esimd_mergel_2(bitblock128_t arg1, bitblock128_t arg2);
28static inline bitblock128_t esimd_mergel_4(bitblock128_t arg1, bitblock128_t arg2);
29static inline bitblock128_t esimd_mergel_8(bitblock128_t arg1, bitblock128_t arg2);
30static inline bitblock128_t esimd_mergel_64(bitblock128_t arg1, bitblock128_t arg2);
31static inline bitblock128_t esimd_mergel_16(bitblock128_t arg1, bitblock128_t arg2);
32static inline bitblock128_t esimd_signextendh_32(bitblock128_t arg1);
33static inline bitblock128_t esimd_signextendh_1(bitblock128_t arg1);
34static inline bitblock128_t esimd_signextendh_2(bitblock128_t arg1);
35static inline bitblock128_t esimd_signextendh_4(bitblock128_t arg1);
36static inline bitblock128_t esimd_signextendh_8(bitblock128_t arg1);
37static inline bitblock128_t esimd_signextendh_64(bitblock128_t arg1);
38static inline bitblock128_t esimd_signextendh_16(bitblock128_t arg1);
39static inline bitblock128_t simd_max_32(bitblock128_t arg1, bitblock128_t arg2);
40static inline bitblock128_t simd_max_1(bitblock128_t arg1, bitblock128_t arg2);
41static inline bitblock128_t simd_max_2(bitblock128_t arg1, bitblock128_t arg2);
42static inline bitblock128_t simd_max_4(bitblock128_t arg1, bitblock128_t arg2);
43static inline bitblock128_t simd_max_8(bitblock128_t arg1, bitblock128_t arg2);
44static inline bitblock128_t simd_max_64(bitblock128_t arg1, bitblock128_t arg2);
45static inline bitblock128_t simd_max_128(bitblock128_t arg1, bitblock128_t arg2);
46static inline bitblock128_t simd_max_16(bitblock128_t arg1, bitblock128_t arg2);
47static inline bitblock128_t esimd_mergeh_32(bitblock128_t arg1, bitblock128_t arg2);
48static inline bitblock128_t esimd_mergeh_1(bitblock128_t arg1, bitblock128_t arg2);
49static inline bitblock128_t esimd_mergeh_2(bitblock128_t arg1, bitblock128_t arg2);
50static inline bitblock128_t esimd_mergeh_4(bitblock128_t arg1, bitblock128_t arg2);
51static inline bitblock128_t esimd_mergeh_8(bitblock128_t arg1, bitblock128_t arg2);
52static inline bitblock128_t esimd_mergeh_64(bitblock128_t arg1, bitblock128_t arg2);
53static inline bitblock128_t esimd_mergeh_16(bitblock128_t arg1, bitblock128_t arg2);
54static inline bitblock128_t simd_mult_32(bitblock128_t arg1, bitblock128_t arg2);
55static inline bitblock128_t simd_mult_1(bitblock128_t arg1, bitblock128_t arg2);
56static inline bitblock128_t simd_mult_2(bitblock128_t arg1, bitblock128_t arg2);
57static inline bitblock128_t simd_mult_4(bitblock128_t arg1, bitblock128_t arg2);
58static inline bitblock128_t simd_mult_8(bitblock128_t arg1, bitblock128_t arg2);
59static inline bitblock128_t simd_mult_64(bitblock128_t arg1, bitblock128_t arg2);
60static inline bitblock128_t simd_mult_128(bitblock128_t arg1, bitblock128_t arg2);
61static inline bitblock128_t simd_mult_16(bitblock128_t arg1, bitblock128_t arg2);
62static inline bitblock128_t hsimd_umin_hl_32(bitblock128_t arg1, bitblock128_t arg2);
63static inline bitblock128_t hsimd_umin_hl_2(bitblock128_t arg1, bitblock128_t arg2);
64static inline bitblock128_t hsimd_umin_hl_4(bitblock128_t arg1, bitblock128_t arg2);
65static inline bitblock128_t hsimd_umin_hl_8(bitblock128_t arg1, bitblock128_t arg2);
66static inline bitblock128_t hsimd_umin_hl_64(bitblock128_t arg1, bitblock128_t arg2);
67static inline bitblock128_t hsimd_umin_hl_128(bitblock128_t arg1, bitblock128_t arg2);
68static inline bitblock128_t hsimd_umin_hl_16(bitblock128_t arg1, bitblock128_t arg2);
69static inline bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
70static inline bitblock128_t simd_gt_32(bitblock128_t arg1, bitblock128_t arg2);
71static inline bitblock128_t simd_gt_1(bitblock128_t arg1, bitblock128_t arg2);
72static inline bitblock128_t simd_gt_2(bitblock128_t arg1, bitblock128_t arg2);
73static inline bitblock128_t simd_gt_4(bitblock128_t arg1, bitblock128_t arg2);
74static inline bitblock128_t simd_gt_8(bitblock128_t arg1, bitblock128_t arg2);
75static inline bitblock128_t simd_gt_64(bitblock128_t arg1, bitblock128_t arg2);
76static inline bitblock128_t simd_gt_128(bitblock128_t arg1, bitblock128_t arg2);
77static inline bitblock128_t simd_gt_16(bitblock128_t arg1, bitblock128_t arg2);
78static inline bitblock128_t simd_not(bitblock128_t arg1);
79static inline bitblock128_t bitblock_sll(bitblock128_t arg1, bitblock128_t arg2);
80static inline bitblock128_t simd_umult_32(bitblock128_t arg1, bitblock128_t arg2);
81static inline bitblock128_t simd_umult_1(bitblock128_t arg1, bitblock128_t arg2);
82static inline bitblock128_t simd_umult_2(bitblock128_t arg1, bitblock128_t arg2);
83static inline bitblock128_t simd_umult_4(bitblock128_t arg1, bitblock128_t arg2);
84static inline bitblock128_t simd_umult_8(bitblock128_t arg1, bitblock128_t arg2);
85static inline bitblock128_t simd_umult_64(bitblock128_t arg1, bitblock128_t arg2);
86static inline bitblock128_t simd_umult_16(bitblock128_t arg1, bitblock128_t arg2);
87static inline bitblock128_t hsimd_add_hl_32(bitblock128_t arg1, bitblock128_t arg2);
88static inline bitblock128_t hsimd_add_hl_2(bitblock128_t arg1, bitblock128_t arg2);
89static inline bitblock128_t hsimd_add_hl_4(bitblock128_t arg1, bitblock128_t arg2);
90static inline bitblock128_t hsimd_add_hl_8(bitblock128_t arg1, bitblock128_t arg2);
91static inline bitblock128_t hsimd_add_hl_64(bitblock128_t arg1, bitblock128_t arg2);
92static inline bitblock128_t hsimd_add_hl_128(bitblock128_t arg1, bitblock128_t arg2);
93static inline bitblock128_t hsimd_add_hl_16(bitblock128_t arg1, bitblock128_t arg2);
94static inline bitblock128_t simd_ult_32(bitblock128_t arg1, bitblock128_t arg2);
95static inline bitblock128_t simd_ult_1(bitblock128_t arg1, bitblock128_t arg2);
96static inline bitblock128_t simd_ult_2(bitblock128_t arg1, bitblock128_t arg2);
97static inline bitblock128_t simd_ult_4(bitblock128_t arg1, bitblock128_t arg2);
98static inline bitblock128_t simd_ult_8(bitblock128_t arg1, bitblock128_t arg2);
99static inline bitblock128_t simd_ult_64(bitblock128_t arg1, bitblock128_t arg2);
100static inline bitblock128_t simd_ult_128(bitblock128_t arg1, bitblock128_t arg2);
101static inline bitblock128_t simd_ult_16(bitblock128_t arg1, bitblock128_t arg2);
/* mvmd_shufflei_N(msk, arg1): shuffle the N-bit fields of arg1 according to
 * the immediate mask msk. Macro parameters are parenthesized at every use
 * so that expression arguments keep their intended precedence. */

//The total number of operations is 1.0
/* Direct wrapper around _mm_shuffle_epi32. */
#define mvmd_shufflei_32(msk, arg1) \
        _mm_shuffle_epi32((arg1), (int32_t)(msk))

//The total number of operations is 1.0
/* The 2-bit mask is widened to a 32-bit-field shuffle immediate first. */
#define mvmd_shufflei_64(msk, arg1) \
        mvmd_shufflei_32(shufflemask4_from_shufflemask2((msk)), (arg1))

//The total number of operations is 13.6666666667
/* msk holds eight 3-bit selectors. Bit 2 of each selector (the `&4` tests)
 * builds a per-field mask through mvmd_fill8_16; simd_ifh_1 then chooses
 * between a shuffle of arg1 within its own 64-bit halves
 * (_mm_shufflehi_epi16 / _mm_shufflelo_epi16) and the same shuffle applied
 * to arg1 with its halves moved across (simd_slli_128 / simd_srli_128 by 64).
 * NOTE(review): 131071 is 0x1FFFF, one bit wider than a 16-bit field;
 * presumably mvmd_fill8_16 truncates it to 0xFFFF - confirm. */
#define mvmd_shufflei_16(msk, arg1) \
        simd_ifh_1( \
            mvmd_fill8_16( \
                (((((msk)>>21)&4) == 0) ? 0 : (131071)), \
                (((((msk)>>18)&4) == 0) ? 0 : (131071)), \
                (((((msk)>>15)&4) == 0) ? 0 : (131071)), \
                (((((msk)>>12)&4) == 0) ? 0 : (131071)), \
                (((((msk)>>9)&4) == 0) ? (131071) : 0), \
                (((((msk)>>6)&4) == 0) ? (131071) : 0), \
                (((((msk)>>3)&4) == 0) ? (131071) : 0), \
                ((((msk)&4) == 0) ? (131071) : 0)), \
            _mm_shufflelo_epi16( \
                _mm_shufflehi_epi16((arg1), (int32_t)((shufflemask8_to_shufflemask4((msk))>>8))), \
                (int32_t)((shufflemask8_to_shufflemask4((msk))&255))), \
            simd_or( \
                _mm_shufflehi_epi16(simd_slli_128(64, (arg1)), (int32_t)((shufflemask8_to_shufflemask4((msk))>>8))), \
                _mm_shufflelo_epi16(simd_srli_128(64, (arg1)), (int32_t)((shufflemask8_to_shufflemask4((msk))&255)))))
113
/* simd_srli_N(sh, arg1): logical right shift by the immediate sh inside every
 * N-bit field of arg1. Widths with a native SSE shift map straight to the
 * intrinsic; widths 2, 4 and 8 shift 32-bit lanes and then mask off the bits
 * that crossed a field boundary. Macro parameters are parenthesized at every
 * use so that expression arguments keep their intended precedence. */

//The total number of operations is 1.0
#define simd_srli_32(sh, arg1) \
        _mm_srli_epi32((arg1), (int32_t)(sh))

//The total number of operations is 2.0
#define simd_srli_2(sh, arg1) \
        simd_and(simd_srli_32((sh), (arg1)), simd_constant_2(((3)>>(sh))))

//The total number of operations is 2.0
#define simd_srli_4(sh, arg1) \
        simd_and(simd_srli_32((sh), (arg1)), simd_constant_4(((15)>>(sh))))

//The total number of operations is 2.0
#define simd_srli_8(sh, arg1) \
        simd_and(simd_srli_32((sh), (arg1)), simd_constant_8(((255)>>(sh))))

//The total number of operations is 1.0
#define simd_srli_64(sh, arg1) \
        _mm_srli_epi64((arg1), (int32_t)(sh))

//The total number of operations is 2.33333333333
/* Three cases: a whole number of bytes uses the byte shift _mm_srli_si128;
 * sh >= 64 moves the high half down by 8 bytes and shifts by the remainder;
 * otherwise both 64-bit halves are shifted and the bits leaving the high
 * half are carried into the top of the low half. */
#define simd_srli_128(sh, arg1) \
        ((((sh)%8) == 0) \
            ? _mm_srli_si128((arg1), (int32_t)(((sh)/8))) \
            : (((sh) >= 64) \
                ? simd_srli_64(((sh)&63), _mm_srli_si128((arg1), (int32_t)(8))) \
                : simd_or(simd_srli_64((sh), (arg1)), \
                          _mm_srli_si128(simd_slli_64(((128-(sh))&63), (arg1)), (int32_t)(8)))))

//The total number of operations is 1.0
#define simd_srli_16(sh, arg1) \
        _mm_srli_epi16((arg1), (int32_t)(sh))
141
142static inline bitblock128_t bitblock_load_unaligned(const bitblock128_t* arg1);
/* mvmd_dsrli_N(sh, arg1, arg2): double-block right shift by sh fields of N
 * bits. The result is arg1 moved right by sh fields OR-ed with arg2 moved
 * left by (128/N - sh) fields, so the fields vacated in arg1 are filled from
 * arg2. Macro parameters are parenthesized at every use; without that an
 * argument such as `a - b` turned `(count)-sh` into `(count)-a - b`. */

//The total number of operations is 3.0
#define mvmd_dsrli_32(sh, arg1, arg2) \
        simd_or(mvmd_srli_32((sh), (arg1)), mvmd_slli_32(((4)-(sh)), (arg2)))

//The total number of operations is 5.66666666667
#define mvmd_dsrli_2(sh, arg1, arg2) \
        simd_or(mvmd_srli_2((sh), (arg1)), mvmd_slli_2(((64)-(sh)), (arg2)))

//The total number of operations is 5.66666666667
#define mvmd_dsrli_4(sh, arg1, arg2) \
        simd_or(mvmd_srli_4((sh), (arg1)), mvmd_slli_4(((32)-(sh)), (arg2)))

//The total number of operations is 3.0
#define mvmd_dsrli_8(sh, arg1, arg2) \
        simd_or(mvmd_srli_8((sh), (arg1)), mvmd_slli_8(((16)-(sh)), (arg2)))

//The total number of operations is 3.0
#define mvmd_dsrli_64(sh, arg1, arg2) \
        simd_or(mvmd_srli_64((sh), (arg1)), mvmd_slli_64(((2)-(sh)), (arg2)))

//The total number of operations is 3.0
#define mvmd_dsrli_128(sh, arg1, arg2) \
        simd_or(mvmd_srli_128((sh), (arg1)), mvmd_slli_128(((1)-(sh)), (arg2)))

//The total number of operations is 3.0
#define mvmd_dsrli_16(sh, arg1, arg2) \
        simd_or(mvmd_srli_16((sh), (arg1)), mvmd_slli_16(((8)-(sh)), (arg2)))
170
//The total number of operations is 2.33333333333
/* Whole-block immediate right shift: an alias for simd_srli_128.
 * Parameters are parenthesized so expression arguments pass through intact. */
#define bitblock_srli(sh, arg1) \
        simd_srli_128((sh), (arg1))
174
175static inline bitblock128_t simd_ctz_32(bitblock128_t arg1);
176static inline bitblock128_t simd_ctz_1(bitblock128_t arg1);
177static inline bitblock128_t simd_ctz_2(bitblock128_t arg1);
178static inline bitblock128_t simd_ctz_4(bitblock128_t arg1);
179static inline bitblock128_t simd_ctz_8(bitblock128_t arg1);
180static inline bitblock128_t simd_ctz_64(bitblock128_t arg1);
181static inline bitblock128_t simd_ctz_128(bitblock128_t arg1);
182static inline bitblock128_t simd_ctz_16(bitblock128_t arg1);
183static inline bitblock128_t simd_sll_64(bitblock128_t arg1, bitblock128_t shift_mask);
184static inline bitblock128_t simd_sll_128(bitblock128_t arg1, bitblock128_t shift_mask);
185static inline bitblock128_t mvmd_fill_32(uint64_t val1);
186static inline bitblock128_t mvmd_fill_1(uint64_t val1);
187static inline bitblock128_t mvmd_fill_2(uint64_t val1);
188static inline bitblock128_t mvmd_fill_4(uint64_t val1);
189static inline bitblock128_t mvmd_fill_8(uint64_t val1);
190static inline bitblock128_t mvmd_fill_64(uint64_t val1);
191static inline bitblock128_t mvmd_fill_128(uint64_t val1);
192static inline bitblock128_t mvmd_fill_16(uint64_t val1);
193static inline bitblock128_t mvmd_shuffle_32(bitblock128_t arg1, bitblock128_t arg2);
194static inline bitblock128_t mvmd_shuffle_8(bitblock128_t arg1, bitblock128_t arg2);
195static inline bitblock128_t mvmd_shuffle_64(bitblock128_t arg1, bitblock128_t arg2);
196static inline bitblock128_t mvmd_shuffle_16(bitblock128_t arg1, bitblock128_t arg2);
197static inline bitblock128_t hsimd_packss_32(bitblock128_t arg1, bitblock128_t arg2);
198static inline bitblock128_t hsimd_packss_2(bitblock128_t arg1, bitblock128_t arg2);
199static inline bitblock128_t hsimd_packss_4(bitblock128_t arg1, bitblock128_t arg2);
200static inline bitblock128_t hsimd_packss_8(bitblock128_t arg1, bitblock128_t arg2);
201static inline bitblock128_t hsimd_packss_64(bitblock128_t arg1, bitblock128_t arg2);
202static inline bitblock128_t hsimd_packss_128(bitblock128_t arg1, bitblock128_t arg2);
203static inline bitblock128_t hsimd_packss_16(bitblock128_t arg1, bitblock128_t arg2);
204static inline bitblock128_t bitblock_srl(bitblock128_t arg1, bitblock128_t arg2);
205static inline void bitblock_store_aligned(bitblock128_t arg1, bitblock128_t* arg2);
206static inline bitblock128_t simd_eq_32(bitblock128_t arg1, bitblock128_t arg2);
207static inline bitblock128_t simd_eq_1(bitblock128_t arg1, bitblock128_t arg2);
208static inline bitblock128_t simd_eq_2(bitblock128_t arg1, bitblock128_t arg2);
209static inline bitblock128_t simd_eq_4(bitblock128_t arg1, bitblock128_t arg2);
210static inline bitblock128_t simd_eq_8(bitblock128_t arg1, bitblock128_t arg2);
211static inline bitblock128_t simd_eq_64(bitblock128_t arg1, bitblock128_t arg2);
212static inline bitblock128_t simd_eq_128(bitblock128_t arg1, bitblock128_t arg2);
213static inline bitblock128_t simd_eq_16(bitblock128_t arg1, bitblock128_t arg2);
214static inline bitblock128_t simd_popcount_32(bitblock128_t arg1);
215static inline bitblock128_t simd_popcount_1(bitblock128_t arg1);
216static inline bitblock128_t simd_popcount_2(bitblock128_t arg1);
217static inline bitblock128_t simd_popcount_4(bitblock128_t arg1);
218static inline bitblock128_t simd_popcount_8(bitblock128_t arg1);
219static inline bitblock128_t simd_popcount_64(bitblock128_t arg1);
220static inline bitblock128_t simd_popcount_128(bitblock128_t arg1);
221static inline bitblock128_t simd_popcount_16(bitblock128_t arg1);
222static inline bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
/* mvmd_extract_N(pos, arg1): read the N-bit field at index pos of arg1 as an
 * unsigned integer value. Widths 16 and 32 wrap the SSE extract intrinsics
 * and mask away sign extension; widths 8, 4, 2 and 1 each fetch the
 * enclosing field of twice the width and keep its low half (even pos) or
 * high half (odd pos); width 64 joins two 32-bit extracts.
 * pos and arg1 are parenthesized at every use: previously an argument such
 * as `a + b` expanded to `a + b%2` and `2*a + b`. */

//The total number of operations is 1.0
#define mvmd_extract_32(pos, arg1) \
        (((uint64_t)(((4294967296ULL)-1)))&_mm_extract_epi32((arg1), (int32_t)(pos)))

//The total number of operations is 1.0
#define mvmd_extract_1(pos, arg1) \
        ((((pos)%2) == 0) ? (mvmd_extract_2(((pos)/2), (arg1))&(1)) : (mvmd_extract_2(((pos)/2), (arg1))>>1))

//The total number of operations is 1.0
#define mvmd_extract_2(pos, arg1) \
        ((((pos)%2) == 0) ? (mvmd_extract_4(((pos)/2), (arg1))&(3)) : (mvmd_extract_4(((pos)/2), (arg1))>>2))

//The total number of operations is 1.0
#define mvmd_extract_4(pos, arg1) \
        ((((pos)%2) == 0) ? (mvmd_extract_8(((pos)/2), (arg1))&(15)) : (mvmd_extract_8(((pos)/2), (arg1))>>4))

//The total number of operations is 1.0
#define mvmd_extract_8(pos, arg1) \
        ((((pos)%2) == 0) ? (mvmd_extract_16(((pos)/2), (arg1))&(255)) : (mvmd_extract_16(((pos)/2), (arg1))>>8))

//The total number of operations is 2.0
#define mvmd_extract_64(pos, arg1) \
        ((((uint64_t)(mvmd_extract_32(((2*(pos))+1), (arg1))))<<(32))|mvmd_extract_32((2*(pos)), (arg1)))

//The total number of operations is 1.0
#define mvmd_extract_16(pos, arg1) \
        (65535&_mm_extract_epi16((arg1), (int32_t)(pos)))
250
251static inline bitblock128_t simd_neg_32(bitblock128_t arg1);
252static inline bitblock128_t simd_neg_2(bitblock128_t arg1);
253static inline bitblock128_t simd_neg_4(bitblock128_t arg1);
254static inline bitblock128_t simd_neg_8(bitblock128_t arg1);
255static inline bitblock128_t simd_neg_64(bitblock128_t arg1);
256static inline bitblock128_t simd_neg_128(bitblock128_t arg1);
257static inline bitblock128_t simd_neg_16(bitblock128_t arg1);
/* mvmd_splat_N(pos, arg1): replicate the N-bit field at index pos of arg1
 * into every N-bit field of the result. Macro parameters are parenthesized
 * at every use so that expression arguments (`pos%2`, `2*pos`, `pos/2`)
 * keep their intended precedence. */

//The total number of operations is 1.0
#define mvmd_splat_32(pos, arg1) \
        mvmd_shufflei_32(shufflemask4((pos), (pos), (pos), (pos)), (arg1))

//The total number of operations is 12.6666666667
/* Isolates bit pos as 0 or 1 and subtracts it from zero. NOTE(review): this
 * yields an all-ones block only if simd_sub_128 borrows across the full 128
 * bits - confirm against its definition. */
#define mvmd_splat_1(pos, arg1) \
        simd_sub_128(simd_constant_128(0), simd_and(simd_constant_128(1), simd_srli_128((pos), (arg1))))

//The total number of operations is 10.0
/* Duplicates the selected 2-bit half inside each 4-bit field, then defers
 * to mvmd_splat_4 on the enclosing field index. */
#define mvmd_splat_2(pos, arg1) \
        mvmd_splat_4(((pos)/2), \
            simd_or(((((pos)%2) == 0) ? simd_slli_4(2, (arg1)) : simd_srli_4(2, (arg1))), \
                    ((((pos)%2) == 0) ? simd_and(simd_lomask_4(), (arg1)) : simd_and(simd_himask_4(), (arg1)))))

//The total number of operations is 6.0
/* Same scheme one level up: duplicate the selected nibble in each byte,
 * then splat the byte. */
#define mvmd_splat_4(pos, arg1) \
        mvmd_splat_8(((pos)/2), \
            simd_or(((((pos)%2) == 0) ? simd_slli_8(4, (arg1)) : simd_srli_8(4, (arg1))), \
                    ((((pos)%2) == 0) ? simd_and(simd_lomask_8(), (arg1)) : simd_and(simd_himask_8(), (arg1)))))

//The total number of operations is 2.0
#define mvmd_splat_8(pos, arg1) \
        mvmd_fill_8(_mm_extract_epi8((arg1), (int32_t)(pos)))

//The total number of operations is 5.0
/* Builds the result from two 32-bit splats: the odd 32-bit field supplies
 * the bits selected by simd_himask_64(), the even one the rest. */
#define mvmd_splat_64(pos, arg1) \
        simd_ifh_1(simd_himask_64(), mvmd_splat_32(((2*(pos))+1), (arg1)), mvmd_splat_32((2*(pos)), (arg1)))

//The total number of operations is 13.0
#define mvmd_splat_128(pos, arg1) \
        simd_ifh_1(simd_himask_128(), mvmd_splat_64(((2*(pos))+1), (arg1)), mvmd_splat_64((2*(pos)), (arg1)))

//The total number of operations is 2.0
#define mvmd_splat_16(pos, arg1) \
        mvmd_fill_16(_mm_extract_epi16((arg1), (int32_t)(pos)))
289
290static inline bitblock128_t hsimd_packh_32(bitblock128_t arg1, bitblock128_t arg2);
291static inline bitblock128_t hsimd_packh_2(bitblock128_t arg1, bitblock128_t arg2);
292static inline bitblock128_t hsimd_packh_4(bitblock128_t arg1, bitblock128_t arg2);
293static inline bitblock128_t hsimd_packh_8(bitblock128_t arg1, bitblock128_t arg2);
294static inline bitblock128_t hsimd_packh_64(bitblock128_t arg1, bitblock128_t arg2);
295static inline bitblock128_t hsimd_packh_128(bitblock128_t arg1, bitblock128_t arg2);
296static inline bitblock128_t hsimd_packh_16(bitblock128_t arg1, bitblock128_t arg2);
297static inline bitblock128_t simd_himask_32();
298static inline bitblock128_t simd_himask_2();
299static inline bitblock128_t simd_himask_4();
300static inline bitblock128_t simd_himask_8();
301static inline bitblock128_t simd_himask_64();
302static inline bitblock128_t simd_himask_128();
303static inline bitblock128_t simd_himask_16();
/* simd_slli_N(sh, arg1): left shift by the immediate sh inside every N-bit
 * field of arg1. Widths with a native SSE shift map straight to the
 * intrinsic; widths 2, 4 and 8 shift 32-bit lanes and then mask off the bits
 * that crossed a field boundary. Macro parameters are parenthesized at every
 * use so that expression arguments keep their intended precedence. */

//The total number of operations is 1.0
#define simd_slli_32(sh, arg1) \
        _mm_slli_epi32((arg1), (int32_t)(sh))

//The total number of operations is 2.0
#define simd_slli_2(sh, arg1) \
        simd_and(simd_slli_32((sh), (arg1)), simd_constant_2((((3)<<(sh))&(3))))

//The total number of operations is 2.0
#define simd_slli_4(sh, arg1) \
        simd_and(simd_slli_32((sh), (arg1)), simd_constant_4((((15)<<(sh))&(15))))

//The total number of operations is 2.0
#define simd_slli_8(sh, arg1) \
        simd_and(simd_slli_32((sh), (arg1)), simd_constant_8((((255)<<(sh))&(255))))

//The total number of operations is 1.0
#define simd_slli_64(sh, arg1) \
        _mm_slli_epi64((arg1), (int32_t)(sh))

//The total number of operations is 2.33333333333
/* Three cases: a whole number of bytes uses the byte shift _mm_slli_si128;
 * sh >= 64 moves the low half up by 8 bytes and shifts by the remainder;
 * otherwise both 64-bit halves are shifted and the bits leaving the low
 * half are carried into the bottom of the high half. */
#define simd_slli_128(sh, arg1) \
        ((((sh)%8) == 0) \
            ? _mm_slli_si128((arg1), (int32_t)(((sh)/8))) \
            : (((sh) >= 64) \
                ? simd_slli_64(((sh)&63), _mm_slli_si128((arg1), (int32_t)(8))) \
                : simd_or(simd_slli_64((sh), (arg1)), \
                          _mm_slli_si128(simd_srli_64(((128-(sh))&63), (arg1)), (int32_t)(8)))))

//The total number of operations is 1.0
#define simd_slli_16(sh, arg1) \
        _mm_slli_epi16((arg1), (int32_t)(sh))
331
332static inline bool bitblock_all(bitblock128_t arg1);
333static inline bitblock128_t simd_ifh_32(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
334static inline bitblock128_t simd_ifh_1(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
335static inline bitblock128_t simd_ifh_2(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
336static inline bitblock128_t simd_ifh_4(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
337static inline bitblock128_t simd_ifh_8(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
338static inline bitblock128_t simd_ifh_64(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
339static inline bitblock128_t simd_ifh_128(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
340static inline bitblock128_t simd_ifh_16(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
341static inline bitblock128_t simd_sub_32(bitblock128_t arg1, bitblock128_t arg2);
342static inline bitblock128_t simd_sub_1(bitblock128_t arg1, bitblock128_t arg2);
343static inline bitblock128_t simd_sub_2(bitblock128_t arg1, bitblock128_t arg2);
344static inline bitblock128_t simd_sub_4(bitblock128_t arg1, bitblock128_t arg2);
345static inline bitblock128_t simd_sub_8(bitblock128_t arg1, bitblock128_t arg2);
346static inline bitblock128_t simd_sub_64(bitblock128_t arg1, bitblock128_t arg2);
347static inline bitblock128_t simd_sub_128(bitblock128_t arg1, bitblock128_t arg2);
348static inline bitblock128_t simd_sub_16(bitblock128_t arg1, bitblock128_t arg2);
349static inline bitblock128_t simd_add_hl_32(bitblock128_t arg1);
350static inline bitblock128_t simd_add_hl_2(bitblock128_t arg1);
351static inline bitblock128_t simd_add_hl_4(bitblock128_t arg1);
352static inline bitblock128_t simd_add_hl_8(bitblock128_t arg1);
353static inline bitblock128_t simd_add_hl_64(bitblock128_t arg1);
354static inline bitblock128_t simd_add_hl_128(bitblock128_t arg1);
355static inline bitblock128_t simd_add_hl_16(bitblock128_t arg1);
356static inline bitblock128_t simd_srl_64(bitblock128_t arg1, bitblock128_t shift_mask);
357static inline bitblock128_t simd_srl_128(bitblock128_t arg1, bitblock128_t shift_mask);
/* mvmd_slli_N(sh, arg1): shift the whole block left by sh fields of N bits.
 * Widths of 8 bits and above reduce, by doubling sh at each step, to the
 * byte shift _mm_slli_si128; widths 2 and 4 reduce to the bit shift
 * simd_slli_128. sh is parenthesized before it is multiplied: previously an
 * argument such as `a + b` expanded to `a + b*2`. */

//The total number of operations is 1.0
#define mvmd_slli_32(sh, arg1) \
        mvmd_slli_16(((sh)*2), (arg1))

//The total number of operations is 2.33333333333
#define mvmd_slli_2(sh, arg1) \
        simd_slli_128(((sh)*2), (arg1))

//The total number of operations is 2.33333333333
#define mvmd_slli_4(sh, arg1) \
        mvmd_slli_2(((sh)*2), (arg1))

//The total number of operations is 1.0
#define mvmd_slli_8(sh, arg1) \
        _mm_slli_si128((arg1), (int32_t)(sh))

//The total number of operations is 1.0
#define mvmd_slli_64(sh, arg1) \
        mvmd_slli_32(((sh)*2), (arg1))

//The total number of operations is 1.0
#define mvmd_slli_128(sh, arg1) \
        mvmd_slli_64(((sh)*2), (arg1))

//The total number of operations is 1.0
#define mvmd_slli_16(sh, arg1) \
        mvmd_slli_8(((sh)*2), (arg1))
385
386static inline bitblock128_t simd_lomask_32();
387static inline bitblock128_t simd_lomask_2();
388static inline bitblock128_t simd_lomask_4();
389static inline bitblock128_t simd_lomask_8();
390static inline bitblock128_t simd_lomask_64();
391static inline bitblock128_t simd_lomask_128();
392static inline bitblock128_t simd_lomask_16();
393static inline uint64_t hsimd_signmask_32(bitblock128_t arg1);
394static inline uint64_t hsimd_signmask_4(bitblock128_t arg1);
395static inline uint64_t hsimd_signmask_8(bitblock128_t arg1);
396static inline uint64_t hsimd_signmask_64(bitblock128_t arg1);
397static inline uint64_t hsimd_signmask_128(bitblock128_t arg1);
398static inline uint64_t hsimd_signmask_16(bitblock128_t arg1);
399static inline bitblock128_t esimd_zeroextendh_32(bitblock128_t arg1);
400static inline bitblock128_t esimd_zeroextendh_1(bitblock128_t arg1);
401static inline bitblock128_t esimd_zeroextendh_2(bitblock128_t arg1);
402static inline bitblock128_t esimd_zeroextendh_4(bitblock128_t arg1);
403static inline bitblock128_t esimd_zeroextendh_8(bitblock128_t arg1);
404static inline bitblock128_t esimd_zeroextendh_64(bitblock128_t arg1);
405static inline bitblock128_t esimd_zeroextendh_16(bitblock128_t arg1);
406static inline bitblock128_t esimd_zeroextendl_32(bitblock128_t arg1);
407static inline bitblock128_t esimd_zeroextendl_1(bitblock128_t arg1);
408static inline bitblock128_t esimd_zeroextendl_2(bitblock128_t arg1);
409static inline bitblock128_t esimd_zeroextendl_4(bitblock128_t arg1);
410static inline bitblock128_t esimd_zeroextendl_8(bitblock128_t arg1);
411static inline bitblock128_t esimd_zeroextendl_64(bitblock128_t arg1);
412static inline bitblock128_t esimd_zeroextendl_16(bitblock128_t arg1);
413static inline bitblock128_t mvmd_fill4_32(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
414static inline bitblock128_t mvmd_fill4_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
415static inline bitblock128_t mvmd_fill4_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
416static inline bitblock128_t mvmd_fill4_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
417static inline bitblock128_t mvmd_fill4_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
418static inline bitblock128_t mvmd_fill4_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
419static inline bitblock128_t simd_umin_32(bitblock128_t arg1, bitblock128_t arg2);
420static inline bitblock128_t simd_umin_1(bitblock128_t arg1, bitblock128_t arg2);
421static inline bitblock128_t simd_umin_2(bitblock128_t arg1, bitblock128_t arg2);
422static inline bitblock128_t simd_umin_4(bitblock128_t arg1, bitblock128_t arg2);
423static inline bitblock128_t simd_umin_8(bitblock128_t arg1, bitblock128_t arg2);
424static inline bitblock128_t simd_umin_64(bitblock128_t arg1, bitblock128_t arg2);
425static inline bitblock128_t simd_umin_128(bitblock128_t arg1, bitblock128_t arg2);
426static inline bitblock128_t simd_umin_16(bitblock128_t arg1, bitblock128_t arg2);
/* mvmd_srli_N(sh, arg1): shift the whole block right by sh fields of N bits.
 * Widths of 8 bits and above reduce, by doubling sh at each step, to the
 * byte shift _mm_srli_si128; widths 2 and 4 reduce to the bit shift
 * simd_srli_128. sh is parenthesized before it is multiplied: previously an
 * argument such as `a + b` expanded to `a + b*2`. */

//The total number of operations is 1.0
#define mvmd_srli_32(sh, arg1) \
        mvmd_srli_16(((sh)*2), (arg1))

//The total number of operations is 2.33333333333
#define mvmd_srli_2(sh, arg1) \
        simd_srli_128(((sh)*2), (arg1))

//The total number of operations is 2.33333333333
#define mvmd_srli_4(sh, arg1) \
        simd_srli_128(((sh)*4), (arg1))

//The total number of operations is 1.0
#define mvmd_srli_8(sh, arg1) \
        _mm_srli_si128((arg1), (int32_t)(sh))

//The total number of operations is 1.0
#define mvmd_srli_64(sh, arg1) \
        mvmd_srli_32(((sh)*2), (arg1))

//The total number of operations is 1.0
#define mvmd_srli_128(sh, arg1) \
        mvmd_srli_64(((sh)*2), (arg1))

//The total number of operations is 1.0
#define mvmd_srli_16(sh, arg1) \
        mvmd_srli_8(((sh)*2), (arg1))
454
/* simd_constant_N(val): a block in which every N-bit field holds val.
 * Widths 8, 16 and 32 wrap the _mm_set1_* intrinsics. Widths 2 and 4 copy
 * val into both halves of the next wider field and recurse; for a negative
 * val the XOR with -4 / -16 strips the sign-extension bits so only the low
 * field bits are OR-ed in. Width 1 turns 1 into -1 (all ones).
 *
 * Changes from the generated text:
 *  - every use of val is parenthesized (e.g. `-1*val` mis-expanded `a - b`);
 *  - the negative branches multiply instead of left-shifting, because
 *    left-shifting a negative value is undefined behaviour in C;
 *  - widths 64 and 128 convert val to uint64_t before `>>32`, because
 *    shifting an int by 32 is undefined behaviour (mvmd_splat_1 in this
 *    header passes the int literals 0 and 1). */

//The total number of operations is 0
#define simd_constant_32(val) \
        _mm_set1_epi32((int32_t)(val))

//The total number of operations is 0
#define simd_constant_1(val) \
        simd_constant_32((-1*(val)))

//The total number of operations is 0
#define simd_constant_2(val) \
        (((val) < 0) ? simd_constant_4((((val)*4)|((val)^(-4)))) : simd_constant_4((((val)<<2)|(val))))

//The total number of operations is 0
#define simd_constant_4(val) \
        (((val) < 0) ? simd_constant_8((((val)*16)|((val)^(-16)))) : simd_constant_8((((val)<<4)|(val))))

//The total number of operations is 0
#define simd_constant_8(val) \
        _mm_set1_epi8((int32_t)(val))

//The total number of operations is 0
#define simd_constant_64(val) \
        _mm_set_epi32((int32_t)(((uint64_t)(val))>>32), (int32_t)(val), (int32_t)(((uint64_t)(val))>>32), (int32_t)(val))

//The total number of operations is 0
/* Only the low 64 bits are populated; the upper two 32-bit words are 0. */
#define simd_constant_128(val) \
        _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(((uint64_t)(val))>>32), (int32_t)(val))

//The total number of operations is 0
#define simd_constant_16(val) \
        _mm_set1_epi16((int32_t)(val))
486
487static inline bitblock128_t simd_min_32(bitblock128_t arg1, bitblock128_t arg2);
488static inline bitblock128_t simd_min_1(bitblock128_t arg1, bitblock128_t arg2);
489static inline bitblock128_t simd_min_2(bitblock128_t arg1, bitblock128_t arg2);
490static inline bitblock128_t simd_min_4(bitblock128_t arg1, bitblock128_t arg2);
491static inline bitblock128_t simd_min_8(bitblock128_t arg1, bitblock128_t arg2);
492static inline bitblock128_t simd_min_64(bitblock128_t arg1, bitblock128_t arg2);
493static inline bitblock128_t simd_min_128(bitblock128_t arg1, bitblock128_t arg2);
494static inline bitblock128_t simd_min_16(bitblock128_t arg1, bitblock128_t arg2);
495static inline bitblock128_t mvmd_fill2_32(uint64_t val1, uint64_t val2);
496static inline bitblock128_t mvmd_fill2_1(uint64_t val1, uint64_t val2);
497static inline bitblock128_t mvmd_fill2_2(uint64_t val1, uint64_t val2);
498static inline bitblock128_t mvmd_fill2_4(uint64_t val1, uint64_t val2);
499static inline bitblock128_t mvmd_fill2_8(uint64_t val1, uint64_t val2);
500static inline bitblock128_t mvmd_fill2_64(uint64_t val1, uint64_t val2);
501static inline bitblock128_t mvmd_fill2_16(uint64_t val1, uint64_t val2);
502static inline bool bitblock_any(bitblock128_t arg1);
503static inline uint64_t bitblock_popcount(bitblock128_t arg1);
//The total number of operations is 2.33333333333
/* Whole-block immediate left shift: an alias for simd_slli_128.
 * Parameters are parenthesized so expression arguments pass through intact. */
#define bitblock_slli(sh, arg1) \
        simd_slli_128((sh), (arg1))
507
508static inline bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
509static inline bitblock128_t hsimd_packl_32(bitblock128_t arg1, bitblock128_t arg2);
510static inline bitblock128_t hsimd_packl_2(bitblock128_t arg1, bitblock128_t arg2);
511static inline bitblock128_t hsimd_packl_4(bitblock128_t arg1, bitblock128_t arg2);
512static inline bitblock128_t hsimd_packl_8(bitblock128_t arg1, bitblock128_t arg2);
513static inline bitblock128_t hsimd_packl_64(bitblock128_t arg1, bitblock128_t arg2);
514static inline bitblock128_t hsimd_packl_128(bitblock128_t arg1, bitblock128_t arg2);
515static inline bitblock128_t hsimd_packl_16(bitblock128_t arg1, bitblock128_t arg2);
/* mvmd_dslli_N(sh, arg1, arg2): double-block left shift by sh fields of N
 * bits. The result is arg1 moved left by sh fields OR-ed with arg2 moved
 * right by (128/N - sh) fields, so the fields vacated in arg1 are filled
 * from arg2. Macro parameters are parenthesized at every use; without that
 * an argument such as `a - b` turned `(count)-sh` into `(count)-a - b`. */

//The total number of operations is 3.0
#define mvmd_dslli_32(sh, arg1, arg2) \
        simd_or(mvmd_slli_32((sh), (arg1)), mvmd_srli_32(((4)-(sh)), (arg2)))

//The total number of operations is 5.66666666667
#define mvmd_dslli_2(sh, arg1, arg2) \
        simd_or(mvmd_slli_2((sh), (arg1)), mvmd_srli_2(((64)-(sh)), (arg2)))

//The total number of operations is 5.66666666667
#define mvmd_dslli_4(sh, arg1, arg2) \
        simd_or(mvmd_slli_4((sh), (arg1)), mvmd_srli_4(((32)-(sh)), (arg2)))

//The total number of operations is 3.0
#define mvmd_dslli_8(sh, arg1, arg2) \
        simd_or(mvmd_slli_8((sh), (arg1)), mvmd_srli_8(((16)-(sh)), (arg2)))

//The total number of operations is 3.0
#define mvmd_dslli_64(sh, arg1, arg2) \
        simd_or(mvmd_slli_64((sh), (arg1)), mvmd_srli_64(((2)-(sh)), (arg2)))

//The total number of operations is 3.0
#define mvmd_dslli_128(sh, arg1, arg2) \
        simd_or(mvmd_slli_128((sh), (arg1)), mvmd_srli_128(((1)-(sh)), (arg2)))

//The total number of operations is 3.0
#define mvmd_dslli_16(sh, arg1, arg2) \
        simd_or(mvmd_slli_16((sh), (arg1)), mvmd_srli_16(((8)-(sh)), (arg2)))
543
//Forward declarations for the mvmd_fill8 family: each takes eight uint64_t
//values and returns a bitblock128_t. The definitions are not part of this
//excerpt.
544static inline bitblock128_t mvmd_fill8_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
545static inline bitblock128_t mvmd_fill8_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
546static inline bitblock128_t mvmd_fill8_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
547static inline bitblock128_t mvmd_fill8_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
548static inline bitblock128_t mvmd_fill8_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
//Forward declarations for the hsimd_min_hl family (one prototype per
//field-width suffix). The definitions are not part of this excerpt.
549static inline bitblock128_t hsimd_min_hl_32(bitblock128_t arg1, bitblock128_t arg2);
550static inline bitblock128_t hsimd_min_hl_2(bitblock128_t arg1, bitblock128_t arg2);
551static inline bitblock128_t hsimd_min_hl_4(bitblock128_t arg1, bitblock128_t arg2);
552static inline bitblock128_t hsimd_min_hl_8(bitblock128_t arg1, bitblock128_t arg2);
553static inline bitblock128_t hsimd_min_hl_64(bitblock128_t arg1, bitblock128_t arg2);
554static inline bitblock128_t hsimd_min_hl_128(bitblock128_t arg1, bitblock128_t arg2);
555static inline bitblock128_t hsimd_min_hl_16(bitblock128_t arg1, bitblock128_t arg2);
//Forward declaration of simd_xor, followed by the simd_umax family (one
//prototype per field-width suffix). Both are called by the signed max
//implementations later in this header (e.g. simd_max_4, simd_max_64).
556static inline bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
557static inline bitblock128_t simd_umax_32(bitblock128_t arg1, bitblock128_t arg2);
558static inline bitblock128_t simd_umax_1(bitblock128_t arg1, bitblock128_t arg2);
559static inline bitblock128_t simd_umax_2(bitblock128_t arg1, bitblock128_t arg2);
560static inline bitblock128_t simd_umax_4(bitblock128_t arg1, bitblock128_t arg2);
561static inline bitblock128_t simd_umax_8(bitblock128_t arg1, bitblock128_t arg2);
562static inline bitblock128_t simd_umax_64(bitblock128_t arg1, bitblock128_t arg2);
563static inline bitblock128_t simd_umax_128(bitblock128_t arg1, bitblock128_t arg2);
564static inline bitblock128_t simd_umax_16(bitblock128_t arg1, bitblock128_t arg2);
//Forward declarations of two memory helpers: bitblock_load_aligned reads a
//block through a const pointer, bitblock_store_unaligned writes arg1
//through arg2. NOTE(review): the alignment requirements are implied by the
//names only; confirm against the definitions.
565static inline bitblock128_t bitblock_load_aligned(const bitblock128_t* arg1);
566static inline void bitblock_store_unaligned(bitblock128_t arg1, bitblock128_t* arg2);
//Forward declarations for the esimd_signextendl family (one prototype per
//field-width suffix). The 32-, 8- and 16-bit esimd_signextendh variants
//later in this header delegate to these.
567static inline bitblock128_t esimd_signextendl_32(bitblock128_t arg1);
568static inline bitblock128_t esimd_signextendl_1(bitblock128_t arg1);
569static inline bitblock128_t esimd_signextendl_2(bitblock128_t arg1);
570static inline bitblock128_t esimd_signextendl_4(bitblock128_t arg1);
571static inline bitblock128_t esimd_signextendl_8(bitblock128_t arg1);
572static inline bitblock128_t esimd_signextendl_64(bitblock128_t arg1);
573static inline bitblock128_t esimd_signextendl_16(bitblock128_t arg1);
//Forward declarations for the hsimd_packus family (one prototype per
//field-width suffix). The definitions are not part of this excerpt.
574static inline bitblock128_t hsimd_packus_32(bitblock128_t arg1, bitblock128_t arg2);
575static inline bitblock128_t hsimd_packus_2(bitblock128_t arg1, bitblock128_t arg2);
576static inline bitblock128_t hsimd_packus_4(bitblock128_t arg1, bitblock128_t arg2);
577static inline bitblock128_t hsimd_packus_8(bitblock128_t arg1, bitblock128_t arg2);
578static inline bitblock128_t hsimd_packus_64(bitblock128_t arg1, bitblock128_t arg2);
579static inline bitblock128_t hsimd_packus_128(bitblock128_t arg1, bitblock128_t arg2);
580static inline bitblock128_t hsimd_packus_16(bitblock128_t arg1, bitblock128_t arg2);
//Forward declarations for the single-operand simd_abs family (one prototype
//per field-width suffix). The definitions are not part of this excerpt.
581static inline bitblock128_t simd_abs_32(bitblock128_t arg1);
582static inline bitblock128_t simd_abs_2(bitblock128_t arg1);
583static inline bitblock128_t simd_abs_4(bitblock128_t arg1);
584static inline bitblock128_t simd_abs_8(bitblock128_t arg1);
585static inline bitblock128_t simd_abs_64(bitblock128_t arg1);
586static inline bitblock128_t simd_abs_128(bitblock128_t arg1);
587static inline bitblock128_t simd_abs_16(bitblock128_t arg1);
//Forward declarations for the single-operand simd_xor_hl family (one
//prototype per field-width suffix). The definitions are not part of this
//excerpt.
588static inline bitblock128_t simd_xor_hl_32(bitblock128_t arg1);
589static inline bitblock128_t simd_xor_hl_2(bitblock128_t arg1);
590static inline bitblock128_t simd_xor_hl_4(bitblock128_t arg1);
591static inline bitblock128_t simd_xor_hl_8(bitblock128_t arg1);
592static inline bitblock128_t simd_xor_hl_64(bitblock128_t arg1);
593static inline bitblock128_t simd_xor_hl_128(bitblock128_t arg1);
594static inline bitblock128_t simd_xor_hl_16(bitblock128_t arg1);
//Arithmetic right shift of every 32-bit field by sh bits, mapped straight
//onto _mm_srai_epi32 with the count converted to int32_t.
//Generator cost estimate: 1.0 operations.
#define simd_srai_32(sh, arg1) \
        (_mm_srai_epi32((arg1), (int32_t)(sh)))
598
//Arithmetic right shift on 2-bit fields. sh == 0 yields arg1 unchanged.
//Any other sh keeps the bits selected by simd_himask_2() and ORs in
//simd_srli_2(1, arg1).
//sh and arg1 are parenthesized so that expression arguments (for example
//`x & y` as sh) are compared and selected as a whole. arg1 is expanded up
//to twice, so it must be free of side effects.
//The total number of operations is 4.0
#define simd_srai_2(sh, arg1) \
        (((sh) == 0) ? (arg1) : simd_or(simd_and(simd_himask_2(), (arg1)), simd_srli_2(1, (arg1))))
602
//Forward declarations: unlike the macro-based simd_srai variants around
//them, the 4-bit and 8-bit arithmetic right shifts are functions taking the
//shift count as a uint64_t first argument.
603static inline bitblock128_t simd_srai_4(uint64_t sh, bitblock128_t arg1);
604static inline bitblock128_t simd_srai_8(uint64_t sh, bitblock128_t arg1);
//Arithmetic right shift on 64-bit fields, assembled from 32-bit arithmetic
//shifts and 64-bit logical shifts:
//  - under simd_himask_64(): simd_srai_32 of arg1 by min(sh, 32);
//  - remaining bits: simd_srli_64(sh, arg1) when sh <= 32, otherwise
//    simd_srai_32(sh - 32, simd_srli_64(32, arg1)).
//sh and arg1 are parenthesized so that expression arguments keep their
//intended precedence. Both are expanded several times, so they must be free
//of side effects.
//The total number of operations is 4.5
#define simd_srai_64(sh, arg1) \
        simd_or(simd_and(simd_himask_64(), simd_srai_32((((sh) < (32)) ? (sh) : (32)), (arg1))), (((sh) <= (32)) ? simd_srli_64((sh), (arg1)) : simd_srai_32(((sh)-(32)), simd_srli_64((32), (arg1)))))
608
//Arithmetic right shift on the full 128-bit field, assembled from 64-bit
//arithmetic shifts and 128-bit logical shifts:
//  - under simd_himask_128(): simd_srai_64 of arg1 by min(sh, 64);
//  - remaining bits: simd_srli_128(sh, arg1) when sh <= 64, otherwise
//    simd_srai_64(sh - 64, simd_srli_128(64, arg1)).
//sh and arg1 are parenthesized so that expression arguments keep their
//intended precedence. Both are expanded several times, so they must be free
//of side effects.
//The total number of operations is 11.0833333333
#define simd_srai_128(sh, arg1) \
        simd_or(simd_and(simd_himask_128(), simd_srai_64((((sh) < (64)) ? (sh) : (64)), (arg1))), (((sh) <= (64)) ? simd_srli_128((sh), (arg1)) : simd_srai_64(((sh)-(64)), simd_srli_128((64), (arg1)))))
612
//Arithmetic right shift of every 16-bit field by sh bits, mapped straight
//onto _mm_srai_epi16 with the count converted to int32_t.
//Generator cost estimate: 1.0 operations.
#define simd_srai_16(sh, arg1) \
        (_mm_srai_epi16((arg1), (int32_t)(sh)))
616
//Forward declaration of simd_and, followed by the mvmd_fill16 family: each
//fill16 variant takes sixteen uint64_t values and returns a bitblock128_t.
//The definitions are not part of this excerpt.
617static inline bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
618static inline bitblock128_t mvmd_fill16_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
619static inline bitblock128_t mvmd_fill16_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
620static inline bitblock128_t mvmd_fill16_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
621static inline bitblock128_t mvmd_fill16_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
//Forward declarations for the simd_lt family (one prototype per field-width
//suffix). The simd_ult_32/8/16 implementations later in this header call
//these after XOR-ing both operands with a high-bit constant.
622static inline bitblock128_t simd_lt_32(bitblock128_t arg1, bitblock128_t arg2);
623static inline bitblock128_t simd_lt_1(bitblock128_t arg1, bitblock128_t arg2);
624static inline bitblock128_t simd_lt_2(bitblock128_t arg1, bitblock128_t arg2);
625static inline bitblock128_t simd_lt_4(bitblock128_t arg1, bitblock128_t arg2);
626static inline bitblock128_t simd_lt_8(bitblock128_t arg1, bitblock128_t arg2);
627static inline bitblock128_t simd_lt_64(bitblock128_t arg1, bitblock128_t arg2);
628static inline bitblock128_t simd_lt_128(bitblock128_t arg1, bitblock128_t arg2);
629static inline bitblock128_t simd_lt_16(bitblock128_t arg1, bitblock128_t arg2);
//Forward declarations for the simd_add family (one prototype per
//field-width suffix). Used by the multiplication and horizontal-add
//implementations later in this header.
630static inline bitblock128_t simd_add_32(bitblock128_t arg1, bitblock128_t arg2);
631static inline bitblock128_t simd_add_1(bitblock128_t arg1, bitblock128_t arg2);
632static inline bitblock128_t simd_add_2(bitblock128_t arg1, bitblock128_t arg2);
633static inline bitblock128_t simd_add_4(bitblock128_t arg1, bitblock128_t arg2);
634static inline bitblock128_t simd_add_8(bitblock128_t arg1, bitblock128_t arg2);
635static inline bitblock128_t simd_add_64(bitblock128_t arg1, bitblock128_t arg2);
636static inline bitblock128_t simd_add_128(bitblock128_t arg1, bitblock128_t arg2);
637static inline bitblock128_t simd_add_16(bitblock128_t arg1, bitblock128_t arg2);
//Forward declarations for the simd_ugt family (one prototype per
//field-width suffix). simd_gt_64 and simd_gt_128 later in this header use
//the 32- and 64-bit variants.
638static inline bitblock128_t simd_ugt_32(bitblock128_t arg1, bitblock128_t arg2);
639static inline bitblock128_t simd_ugt_1(bitblock128_t arg1, bitblock128_t arg2);
640static inline bitblock128_t simd_ugt_2(bitblock128_t arg1, bitblock128_t arg2);
641static inline bitblock128_t simd_ugt_4(bitblock128_t arg1, bitblock128_t arg2);
642static inline bitblock128_t simd_ugt_8(bitblock128_t arg1, bitblock128_t arg2);
643static inline bitblock128_t simd_ugt_64(bitblock128_t arg1, bitblock128_t arg2);
644static inline bitblock128_t simd_ugt_128(bitblock128_t arg1, bitblock128_t arg2);
645static inline bitblock128_t simd_ugt_16(bitblock128_t arg1, bitblock128_t arg2);
646
647//Implementation Starts here
648//The total number of operations is 1.0
649static inline bitblock128_t esimd_mergel_32(bitblock128_t arg1, bitblock128_t arg2)
650{
651        return _mm_unpacklo_epi32(arg2, arg1);
652}
653//The total number of operations is 31.0
654static inline bitblock128_t esimd_mergel_1(bitblock128_t arg1, bitblock128_t arg2)
655{
656        return esimd_mergel_2(simd_ifh_1(simd_himask_2(), arg1, simd_srli_2(1, arg2)), simd_ifh_1(simd_himask_2(), simd_slli_2(1, arg1), arg2));
657}
658//The total number of operations is 21.0
659static inline bitblock128_t esimd_mergel_2(bitblock128_t arg1, bitblock128_t arg2)
660{
661        return esimd_mergel_4(simd_ifh_1(simd_himask_4(), arg1, simd_srli_4(2, arg2)), simd_ifh_1(simd_himask_4(), simd_slli_4(2, arg1), arg2));
662}
663//The total number of operations is 11.0
664static inline bitblock128_t esimd_mergel_4(bitblock128_t arg1, bitblock128_t arg2)
665{
666        return esimd_mergel_8(simd_ifh_1(simd_himask_8(), arg1, simd_srli_8(4, arg2)), simd_ifh_1(simd_himask_8(), simd_slli_8(4, arg1), arg2));
667}
668//The total number of operations is 1.0
669static inline bitblock128_t esimd_mergel_8(bitblock128_t arg1, bitblock128_t arg2)
670{
671        return _mm_unpacklo_epi8(arg2, arg1);
672}
673//The total number of operations is 1.0
674static inline bitblock128_t esimd_mergel_64(bitblock128_t arg1, bitblock128_t arg2)
675{
676        return _mm_unpacklo_epi64(arg2, arg1);
677}
678//The total number of operations is 1.0
679static inline bitblock128_t esimd_mergel_16(bitblock128_t arg1, bitblock128_t arg2)
680{
681        return _mm_unpacklo_epi16(arg2, arg1);
682}
683//The total number of operations is 3.33333333333
684static inline bitblock128_t esimd_signextendh_32(bitblock128_t arg1)
685{
686        return esimd_signextendl_32(simd_srli_128((64), arg1));
687}
688//The total number of operations is 31.0
689static inline bitblock128_t esimd_signextendh_1(bitblock128_t arg1)
690{
691        return esimd_mergeh_2(simd_srai_2(1, arg1), simd_srai_2(1, simd_slli_2(1, arg1)));
692}
693//The total number of operations is 33.0
694static inline bitblock128_t esimd_signextendh_2(bitblock128_t arg1)
695{
696        return esimd_mergeh_4(simd_srai_4(2, arg1), simd_srai_4(2, simd_slli_4(2, arg1)));
697}
698//The total number of operations is 13.0
699static inline bitblock128_t esimd_signextendh_4(bitblock128_t arg1)
700{
701        return esimd_mergeh_8(simd_srai_8(4, arg1), simd_srai_8(4, simd_slli_8(4, arg1)));
702}
703//The total number of operations is 3.33333333333
704static inline bitblock128_t esimd_signextendh_8(bitblock128_t arg1)
705{
706        return esimd_signextendl_8(simd_srli_128((64), arg1));
707}
708//The total number of operations is 11.0833333333
709static inline bitblock128_t esimd_signextendh_64(bitblock128_t arg1)
710{
711        return simd_srai_128(64, arg1);
712}
713//The total number of operations is 3.33333333333
714static inline bitblock128_t esimd_signextendh_16(bitblock128_t arg1)
715{
716        return esimd_signextendl_16(simd_srli_128((64), arg1));
717}
718//The total number of operations is 1.0
719static inline bitblock128_t simd_max_32(bitblock128_t arg1, bitblock128_t arg2)
720{
721        return _mm_max_epi32(arg1, arg2);
722}
723//The total number of operations is 1.0
724static inline bitblock128_t simd_max_1(bitblock128_t arg1, bitblock128_t arg2)
725{
726        return simd_and(arg1, arg2);
727}
728//The total number of operations is 15.6666666667
729static inline bitblock128_t simd_max_2(bitblock128_t arg1, bitblock128_t arg2)
730{
731        return simd_ifh_1(simd_himask_2(), simd_and(arg1, arg2), simd_or(simd_and(arg2, simd_srli_128(1, simd_or(arg1, simd_not(arg2)))), simd_and(arg1, simd_srli_128(1, simd_or(simd_not(arg1), arg2)))));
732}
733//The total number of operations is 9.0
734static inline bitblock128_t simd_max_4(bitblock128_t arg1, bitblock128_t arg2)
735{
736        bitblock128_t high_bit = simd_constant_4((8));
737        return simd_xor(simd_umax_4(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
738}
739//The total number of operations is 1.0
740static inline bitblock128_t simd_max_8(bitblock128_t arg1, bitblock128_t arg2)
741{
742        return _mm_max_epi8(arg1, arg2);
743}
744//The total number of operations is 15.0
745static inline bitblock128_t simd_max_64(bitblock128_t arg1, bitblock128_t arg2)
746{
747        bitblock128_t hiAns = simd_max_32(arg1, arg2);
748        bitblock128_t loAns = simd_umax_32(arg1, arg2);
749        bitblock128_t eqMask1 = simd_srli_64((32), simd_eq_32(hiAns, arg1));
750        bitblock128_t eqMask2 = simd_srli_64((32), simd_eq_32(hiAns, arg2));
751        return simd_ifh_1(simd_himask_64(), hiAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, loAns, arg1), arg2));
752}
753//The total number of operations is 44.6666666667
754static inline bitblock128_t simd_max_128(bitblock128_t arg1, bitblock128_t arg2)
755{
756        bitblock128_t hiAns = simd_max_64(arg1, arg2);
757        bitblock128_t loAns = simd_umax_64(arg1, arg2);
758        bitblock128_t eqMask1 = simd_srli_128((64), simd_eq_64(hiAns, arg1));
759        bitblock128_t eqMask2 = simd_srli_128((64), simd_eq_64(hiAns, arg2));
760        return simd_ifh_1(simd_himask_128(), hiAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, loAns, arg1), arg2));
761}
762//The total number of operations is 1.0
763static inline bitblock128_t simd_max_16(bitblock128_t arg1, bitblock128_t arg2)
764{
765        return _mm_max_epi16(arg1, arg2);
766}
767//The total number of operations is 1.0
768static inline bitblock128_t esimd_mergeh_32(bitblock128_t arg1, bitblock128_t arg2)
769{
770        return _mm_unpackhi_epi32(arg2, arg1);
771}
772//The total number of operations is 31.0
773static inline bitblock128_t esimd_mergeh_1(bitblock128_t arg1, bitblock128_t arg2)
774{
775        return esimd_mergeh_2(simd_ifh_1(simd_himask_2(), arg1, simd_srli_2(1, arg2)), simd_ifh_1(simd_himask_2(), simd_slli_2(1, arg1), arg2));
776}
777//The total number of operations is 21.0
778static inline bitblock128_t esimd_mergeh_2(bitblock128_t arg1, bitblock128_t arg2)
779{
780        return esimd_mergeh_4(simd_ifh_1(simd_himask_4(), arg1, simd_srli_4(2, arg2)), simd_ifh_1(simd_himask_4(), simd_slli_4(2, arg1), arg2));
781}
782//The total number of operations is 11.0
783static inline bitblock128_t esimd_mergeh_4(bitblock128_t arg1, bitblock128_t arg2)
784{
785        return esimd_mergeh_8(simd_ifh_1(simd_himask_8(), arg1, simd_srli_8(4, arg2)), simd_ifh_1(simd_himask_8(), simd_slli_8(4, arg1), arg2));
786}
787//The total number of operations is 1.0
788static inline bitblock128_t esimd_mergeh_8(bitblock128_t arg1, bitblock128_t arg2)
789{
790        return _mm_unpackhi_epi8(arg2, arg1);
791}
792//The total number of operations is 1.0
793static inline bitblock128_t esimd_mergeh_64(bitblock128_t arg1, bitblock128_t arg2)
794{
795        return _mm_unpackhi_epi64(arg2, arg1);
796}
797//The total number of operations is 1.0
798static inline bitblock128_t esimd_mergeh_16(bitblock128_t arg1, bitblock128_t arg2)
799{
800        return _mm_unpackhi_epi16(arg2, arg1);
801}
802//The total number of operations is 1.0
803static inline bitblock128_t simd_mult_32(bitblock128_t arg1, bitblock128_t arg2)
804{
805        return _mm_mullo_epi32(arg1, arg2);
806}
807//The total number of operations is 1.0
808static inline bitblock128_t simd_mult_1(bitblock128_t arg1, bitblock128_t arg2)
809{
810        return simd_and(arg1, arg2);
811}
812//The total number of operations is 19.6666666667
813static inline bitblock128_t simd_mult_2(bitblock128_t arg1, bitblock128_t arg2)
814{
815        bitblock128_t tmp1 = simd_slli_128(1, arg1);
816        bitblock128_t tmp2 = simd_slli_128(1, arg2);
817        return simd_ifh_1(simd_himask_2(), simd_or(simd_and(tmp1, simd_and(arg2, simd_or(simd_not(arg1), simd_not(tmp2)))), simd_and(arg1, simd_and(tmp2, simd_or(simd_not(tmp1), simd_not(arg2))))), simd_and(arg1, arg2));
818}
819//The total number of operations is 31.0
820static inline bitblock128_t simd_mult_4(bitblock128_t arg1, bitblock128_t arg2)
821{
822        bitblock128_t loMask = simd_lomask_8();
823        bitblock128_t tmpAns1 = simd_mult_8(simd_and(loMask, arg1), simd_and(loMask, arg2));
824        bitblock128_t tmpAns2 = simd_mult_8(simd_srli_8(4, arg1), simd_srli_8(4, arg2));
825        return simd_ifh_1(loMask, tmpAns1, simd_slli_8(4, tmpAns2));
826}
827//The total number of operations is 10.0
828static inline bitblock128_t simd_mult_8(bitblock128_t arg1, bitblock128_t arg2)
829{
830        bitblock128_t loMask = simd_lomask_16();
831        bitblock128_t tmpAns1 = simd_mult_16(simd_and(loMask, arg1), simd_and(loMask, arg2));
832        bitblock128_t tmpAns2 = simd_mult_16(simd_srli_16(8, arg1), simd_srli_16(8, arg2));
833        return simd_ifh_1(loMask, tmpAns1, simd_slli_16(8, tmpAns2));
834}
835//The total number of operations is 11.0
836static inline bitblock128_t simd_mult_64(bitblock128_t arg1, bitblock128_t arg2)
837{
838        bitblock128_t loMask = simd_lomask_64();
839        bitblock128_t arg1_low = simd_and(arg1, loMask);
840        bitblock128_t arg1_high = simd_srli_64((32), arg1);
841        bitblock128_t arg2_low = simd_and(arg2, loMask);
842        bitblock128_t arg2_high = simd_srli_64((32), arg2);
843        bitblock128_t tmpAns1 = simd_umult_32(arg1_low, arg2_low);
844        bitblock128_t tmpAns2 = simd_slli_64((32), simd_umult_32(arg1_low, arg2_high));
845        bitblock128_t tmpAns3 = simd_slli_64((32), simd_umult_32(arg1_high, arg2_low));
846        return simd_add_64(tmpAns1, simd_add_64(tmpAns2, tmpAns3));
847}
848//The total number of operations is 165.0
849static inline bitblock128_t simd_mult_128(bitblock128_t arg1, bitblock128_t arg2)
850{
851        bitblock128_t loMask = simd_lomask_128();
852        bitblock128_t arg1_low = simd_and(arg1, loMask);
853        bitblock128_t arg1_high = simd_srli_128((64), arg1);
854        bitblock128_t arg2_low = simd_and(arg2, loMask);
855        bitblock128_t arg2_high = simd_srli_128((64), arg2);
856        bitblock128_t tmpAns1 = simd_umult_64(arg1_low, arg2_low);
857        bitblock128_t tmpAns2 = simd_slli_128((64), simd_umult_64(arg1_low, arg2_high));
858        bitblock128_t tmpAns3 = simd_slli_128((64), simd_umult_64(arg1_high, arg2_low));
859        return simd_add_128(tmpAns1, simd_add_128(tmpAns2, tmpAns3));
860}
861//The total number of operations is 1.0
862static inline bitblock128_t simd_mult_16(bitblock128_t arg1, bitblock128_t arg2)
863{
864        return _mm_mullo_epi16(arg1, arg2);
865}
866//The total number of operations is 7.0
867static inline bitblock128_t hsimd_umin_hl_32(bitblock128_t arg1, bitblock128_t arg2)
868{
869        return simd_umin_16(hsimd_packh_32(arg1, arg2), hsimd_packl_32(arg1, arg2));
870}
871//The total number of operations is 73.0
872static inline bitblock128_t hsimd_umin_hl_2(bitblock128_t arg1, bitblock128_t arg2)
873{
874        return simd_umin_1(hsimd_packh_2(arg1, arg2), hsimd_packl_2(arg1, arg2));
875}
876//The total number of operations is 66.6666666667
877static inline bitblock128_t hsimd_umin_hl_4(bitblock128_t arg1, bitblock128_t arg2)
878{
879        return simd_umin_2(hsimd_packh_4(arg1, arg2), hsimd_packl_4(arg1, arg2));
880}
881//The total number of operations is 35.3333333333
882static inline bitblock128_t hsimd_umin_hl_8(bitblock128_t arg1, bitblock128_t arg2)
883{
884        return simd_umin_4(hsimd_packh_8(arg1, arg2), hsimd_packl_8(arg1, arg2));
885}
886//The total number of operations is 7.0
887static inline bitblock128_t hsimd_umin_hl_64(bitblock128_t arg1, bitblock128_t arg2)
888{
889        return simd_umin_32(hsimd_packh_64(arg1, arg2), hsimd_packl_64(arg1, arg2));
890}
891//The total number of operations is 24.6666666667
892static inline bitblock128_t hsimd_umin_hl_128(bitblock128_t arg1, bitblock128_t arg2)
893{
894        return simd_umin_64(hsimd_packh_128(arg1, arg2), hsimd_packl_128(arg1, arg2));
895}
896//The total number of operations is 7.0
897static inline bitblock128_t hsimd_umin_hl_16(bitblock128_t arg1, bitblock128_t arg2)
898{
899        return simd_umin_8(hsimd_packh_16(arg1, arg2), hsimd_packl_16(arg1, arg2));
900}
901//The total number of operations is 2.0
902static inline bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2)
903{
904        return simd_not(simd_or(arg1, arg2));
905}
906//The total number of operations is 1.0
907static inline bitblock128_t simd_gt_32(bitblock128_t arg1, bitblock128_t arg2)
908{
909        return _mm_cmpgt_epi32(arg1, arg2);
910}
911//The total number of operations is 1.0
912static inline bitblock128_t simd_gt_1(bitblock128_t arg1, bitblock128_t arg2)
913{
914        return simd_andc(arg2, arg1);
915}
916//The total number of operations is 14.6666666667
917static inline bitblock128_t simd_gt_2(bitblock128_t arg1, bitblock128_t arg2)
918{
919        bitblock128_t tmp = simd_not(arg1);
920        bitblock128_t tmpAns = simd_or(simd_and(tmp, arg2), simd_and(simd_slli_128(1, simd_and(arg1, simd_not(arg2))), simd_or(tmp, arg2)));
921        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(1, tmpAns));
922}
923//The total number of operations is 10.0
924static inline bitblock128_t simd_gt_4(bitblock128_t arg1, bitblock128_t arg2)
925{
926        return simd_ifh_1(simd_himask_8(), simd_gt_8(simd_and(simd_himask_8(), arg1), arg2), simd_gt_8(simd_slli_8(4, arg1), simd_slli_8(4, arg2)));
927}
928//The total number of operations is 1.0
929static inline bitblock128_t simd_gt_8(bitblock128_t arg1, bitblock128_t arg2)
930{
931        return _mm_cmpgt_epi8(arg1, arg2);
932}
933//The total number of operations is 14.5
934static inline bitblock128_t simd_gt_64(bitblock128_t arg1, bitblock128_t arg2)
935{
936        bitblock128_t hiAns = simd_gt_32(arg1, arg2);
937        bitblock128_t loAns = simd_ugt_32(arg1, arg2);
938        bitblock128_t mask = simd_and(loAns, simd_srli_64((32), simd_eq_32(arg1, arg2)));
939        mask = simd_or(mask, simd_slli_64((32), mask));
940        return simd_or(simd_srai_64((32), hiAns), mask);
941}
942//The total number of operations is 47.75
943static inline bitblock128_t simd_gt_128(bitblock128_t arg1, bitblock128_t arg2)
944{
945        bitblock128_t hiAns = simd_gt_64(arg1, arg2);
946        bitblock128_t loAns = simd_ugt_64(arg1, arg2);
947        bitblock128_t mask = simd_and(loAns, simd_srli_128((64), simd_eq_64(arg1, arg2)));
948        mask = simd_or(mask, simd_slli_128((64), mask));
949        return simd_or(simd_srai_128((64), hiAns), mask);
950}
951//The total number of operations is 1.0
952static inline bitblock128_t simd_gt_16(bitblock128_t arg1, bitblock128_t arg2)
953{
954        return _mm_cmpgt_epi16(arg1, arg2);
955}
956//The total number of operations is 1.0
957static inline bitblock128_t simd_not(bitblock128_t arg1)
958{
959        return simd_xor(arg1, simd_constant_32(-1));
960}
961//The total number of operations is 13.0
962static inline bitblock128_t bitblock_sll(bitblock128_t arg1, bitblock128_t arg2)
963{
964        return simd_sll_128(arg1, arg2);
965}
966//The total number of operations is 1.0
967static inline bitblock128_t simd_umult_32(bitblock128_t arg1, bitblock128_t arg2)
968{
969        return _mm_mul_epu32(arg1, arg2);
970}
971//The total number of operations is 289.0
972static inline bitblock128_t simd_umult_1(bitblock128_t arg1, bitblock128_t arg2)
973{
974        bitblock128_t loMask = simd_lomask_2();
975        bitblock128_t tmpAns1 = simd_umult_2(simd_and(loMask, arg1), simd_and(loMask, arg2));
976        bitblock128_t tmpAns2 = simd_umult_2(simd_and(loMask, simd_srli_4((2), arg1)), simd_and(loMask, simd_srli_4((2), arg2)));
977        return simd_or(tmpAns1, simd_slli_4((2), tmpAns2));
978}
979//The total number of operations is 139.0
980static inline bitblock128_t simd_umult_2(bitblock128_t arg1, bitblock128_t arg2)
981{
982        bitblock128_t loMask = simd_lomask_4();
983        bitblock128_t tmpAns1 = simd_umult_4(simd_and(loMask, arg1), simd_and(loMask, arg2));
984        bitblock128_t tmpAns2 = simd_umult_4(simd_and(loMask, simd_srli_8((4), arg1)), simd_and(loMask, simd_srli_8((4), arg2)));
985        return simd_or(tmpAns1, simd_slli_8((4), tmpAns2));
986}
987//The total number of operations is 64.0
988static inline bitblock128_t simd_umult_4(bitblock128_t arg1, bitblock128_t arg2)
989{
990        bitblock128_t loMask = simd_lomask_8();
991        bitblock128_t tmpAns1 = simd_umult_8(simd_and(loMask, arg1), simd_and(loMask, arg2));
992        bitblock128_t tmpAns2 = simd_umult_8(simd_and(loMask, simd_srli_16((8), arg1)), simd_and(loMask, simd_srli_16((8), arg2)));
993        return simd_or(tmpAns1, simd_slli_16((8), tmpAns2));
994}
995//The total number of operations is 28.0
996static inline bitblock128_t simd_umult_8(bitblock128_t arg1, bitblock128_t arg2)
997{
998        bitblock128_t loMask = simd_lomask_16();
999        bitblock128_t tmpAns1 = simd_umult_16(simd_and(loMask, arg1), simd_and(loMask, arg2));
1000        bitblock128_t tmpAns2 = simd_umult_16(simd_and(loMask, simd_srli_32((16), arg1)), simd_and(loMask, simd_srli_32((16), arg2)));
1001        return simd_or(tmpAns1, simd_slli_32((16), tmpAns2));
1002}
1003//The total number of operations is 45.0
1004static inline bitblock128_t simd_umult_64(bitblock128_t arg1, bitblock128_t arg2)
1005{
1006        bitblock128_t loMask1 = simd_lomask_128();
1007        bitblock128_t arg11 = simd_and(arg1, loMask1);
1008        bitblock128_t arg22 = simd_and(arg2, loMask1);
1009        bitblock128_t loMask2 = simd_lomask_64();
1010        bitblock128_t arg1_low = simd_and(arg11, loMask2);
1011        bitblock128_t arg1_high = simd_srli_64((32), arg11);
1012        bitblock128_t arg2_low = simd_and(arg22, loMask2);
1013        bitblock128_t arg2_high = simd_srli_64((32), arg22);
1014        bitblock128_t tmpAns1 = simd_umult_32(arg1_low, arg2_low);
1015        bitblock128_t tmpAns2 = simd_slli_128((32), simd_umult_32(arg1_low, arg2_high));
1016        bitblock128_t tmpAns3 = simd_slli_128((32), simd_umult_32(arg1_high, arg2_low));
1017        bitblock128_t tmpAns4 = simd_slli_128(64, simd_umult_32(arg1_high, arg2_high));
1018        return simd_add_128(tmpAns1, simd_add_128(tmpAns2, simd_add_128(tmpAns3, tmpAns4)));
1019}
1020//The total number of operations is 10.0
1021static inline bitblock128_t simd_umult_16(bitblock128_t arg1, bitblock128_t arg2)
1022{
1023        bitblock128_t loMask = simd_lomask_32();
1024        bitblock128_t tmpAns1 = simd_umult_32(simd_and(loMask, arg1), simd_and(loMask, arg2));
1025        bitblock128_t tmpAns2 = simd_umult_32(simd_and(loMask, simd_srli_64((32), arg1)), simd_and(loMask, simd_srli_64((32), arg2)));
1026        return simd_or(tmpAns1, simd_slli_64((32), tmpAns2));
1027}
1028//The total number of operations is 1.0
1029static inline bitblock128_t hsimd_add_hl_32(bitblock128_t arg1, bitblock128_t arg2)
1030{
1031        return _mm_hadd_epi16(arg2, arg1);
1032}
1033//The total number of operations is 73.0
1034static inline bitblock128_t hsimd_add_hl_2(bitblock128_t arg1, bitblock128_t arg2)
1035{
1036        return simd_add_1(hsimd_packh_2(arg1, arg2), hsimd_packl_2(arg1, arg2));
1037}
1038//The total number of operations is 59.0
1039static inline bitblock128_t hsimd_add_hl_4(bitblock128_t arg1, bitblock128_t arg2)
1040{
1041        return simd_add_2(hsimd_packh_4(arg1, arg2), hsimd_packl_4(arg1, arg2));
1042}
1043//The total number of operations is 35.3333333333
1044static inline bitblock128_t hsimd_add_hl_8(bitblock128_t arg1, bitblock128_t arg2)
1045{
1046        return simd_add_4(hsimd_packh_8(arg1, arg2), hsimd_packl_8(arg1, arg2));
1047}
1048//The total number of operations is 1.0
1049static inline bitblock128_t hsimd_add_hl_64(bitblock128_t arg1, bitblock128_t arg2)
1050{
1051        return _mm_hadd_epi32(arg2, arg1);
1052}
1053//The total number of operations is 11.6666666667
1054static inline bitblock128_t hsimd_add_hl_128(bitblock128_t arg1, bitblock128_t arg2)
1055{
1056        return simd_add_64(hsimd_packh_128(arg1, arg2), hsimd_packl_128(arg1, arg2));
1057}
1058//The total number of operations is 7.0
1059static inline bitblock128_t hsimd_add_hl_16(bitblock128_t arg1, bitblock128_t arg2)
1060{
1061        return simd_add_8(hsimd_packh_16(arg1, arg2), hsimd_packl_16(arg1, arg2));
1062}
1063//The total number of operations is 7.0
1064static inline bitblock128_t simd_ult_32(bitblock128_t arg1, bitblock128_t arg2)
1065{
1066        bitblock128_t high_bit = simd_constant_32((2147483648ULL));
1067        return simd_lt_32(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1068}
1069//The total number of operations is 1.0
1070static inline bitblock128_t simd_ult_1(bitblock128_t arg1, bitblock128_t arg2)
1071{
1072        return simd_andc(arg2, arg1);
1073}
1074//The total number of operations is 13.6666666667
1075static inline bitblock128_t simd_ult_2(bitblock128_t arg1, bitblock128_t arg2)
1076{
1077        bitblock128_t tmp = simd_not(arg1);
1078        bitblock128_t tmpAns = simd_or(simd_and(tmp, arg2), simd_and(simd_slli_128(1, simd_and(tmp, arg2)), simd_or(tmp, arg2)));
1079        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(1, tmpAns));
1080}
1081//The total number of operations is 20.0
1082static inline bitblock128_t simd_ult_4(bitblock128_t arg1, bitblock128_t arg2)
1083{
1084        return simd_ifh_1(simd_himask_8(), simd_ult_8(arg1, simd_and(simd_himask_8(), arg2)), simd_ult_8(simd_andc(arg1, simd_himask_8()), simd_andc(arg2, simd_himask_8())));
1085}
1086//The total number of operations is 7.0
1087static inline bitblock128_t simd_ult_8(bitblock128_t arg1, bitblock128_t arg2)
1088{
1089        bitblock128_t high_bit = simd_constant_8((128));
1090        return simd_lt_8(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1091}
1092//The total number of operations is 14.5
1093static inline bitblock128_t simd_ult_64(bitblock128_t arg1, bitblock128_t arg2)
1094{
1095        return simd_and(simd_srai_64((63), simd_or(simd_and(simd_not(arg1), arg2), simd_and(simd_not(simd_xor(arg1, arg2)), simd_sub_64(arg1, arg2)))), simd_not(simd_eq_64(arg1, arg2)));
1096}
1097//The total number of operations is 34.25
1098static inline bitblock128_t simd_ult_128(bitblock128_t arg1, bitblock128_t arg2)
1099{
1100        bitblock128_t tmpAns = simd_ult_64(arg1, arg2);
1101        bitblock128_t mask = simd_and(tmpAns, simd_srli_128((64), simd_eq_64(arg1, arg2)));
1102        mask = simd_or(mask, simd_slli_128((64), mask));
1103        return simd_or(simd_srai_128((64), tmpAns), mask);
1104}
1105//The total number of operations is 7.0
1106static inline bitblock128_t simd_ult_16(bitblock128_t arg1, bitblock128_t arg2)
1107{
1108        bitblock128_t high_bit = simd_constant_16((32768));
1109        return simd_lt_16(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1110}
1111//The total number of operations is 1.0
1112static inline bitblock128_t bitblock_load_unaligned(const bitblock128_t* arg1)
1113{
1114        return _mm_loadu_si128((bitblock128_t*)(arg1));
1115}
1116//The total number of operations is 19.0
1117static inline bitblock128_t simd_ctz_32(bitblock128_t arg1)
1118{
1119        return simd_popcount_32(simd_andc(simd_sub_32(arg1, simd_constant_32(1)), arg1));
1120}
1121//The total number of operations is 1.0
1122static inline bitblock128_t simd_ctz_1(bitblock128_t arg1)
1123{
1124        return simd_not(arg1);
1125}
1126//The total number of operations is 10.6666666667
1127static inline bitblock128_t simd_ctz_2(bitblock128_t arg1)
1128{
1129        bitblock128_t tmp = simd_not(arg1);
1130        return simd_ifh_1(simd_himask_2(), simd_and(tmp, simd_slli_128(1, tmp)), simd_and(simd_srli_128(1, arg1), tmp));
1131}
1132//The total number of operations is 14.0
1133static inline bitblock128_t simd_ctz_4(bitblock128_t arg1)
1134{
1135        return simd_popcount_4(simd_andc(simd_sub_4(arg1, simd_constant_4(1)), arg1));
1136}
1137//The total number of operations is 13.0
1138static inline bitblock128_t simd_ctz_8(bitblock128_t arg1)
1139{
1140        return simd_popcount_8(simd_andc(simd_sub_8(arg1, simd_constant_8(1)), arg1));
1141}
1142//The total number of operations is 14.0
1143static inline bitblock128_t simd_ctz_64(bitblock128_t arg1)
1144{
1145        return simd_popcount_64(simd_andc(simd_sub_64(arg1, simd_constant_64(1)), arg1));
1146}
1147//The total number of operations is 26.6666666667
1148static inline bitblock128_t simd_ctz_128(bitblock128_t arg1)
1149{
1150        return simd_popcount_128(simd_andc(simd_sub_128(arg1, simd_constant_128(1)), arg1));
1151}
1152//The total number of operations is 16.0
1153static inline bitblock128_t simd_ctz_16(bitblock128_t arg1)
1154{
1155        return simd_popcount_16(simd_andc(simd_sub_16(arg1, simd_constant_16(1)), arg1));
1156}
1157//The total number of operations is 10.0
1158static inline bitblock128_t simd_sll_64(bitblock128_t arg1, bitblock128_t shift_mask)
1159{
1160        return simd_ifh_1(simd_himask_128(), _mm_sll_epi64(arg1, simd_and(_mm_srli_si128(shift_mask, (int32_t)(8)), _mm_cvtsi32_si128((int32_t)(63)))), _mm_sll_epi64(arg1, simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(63)))));
1161}
1162//The total number of operations is 13.0
1163static inline bitblock128_t simd_sll_128(bitblock128_t arg1, bitblock128_t shift_mask)
1164{
1165        bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(127)));
1166        return simd_or(_mm_sll_epi64(arg1, shift), simd_or(_mm_slli_si128(_mm_sll_epi64(arg1, simd_sub_32(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_slli_si128(_mm_srl_epi64(arg1, simd_sub_32(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
1167}
1168//The total number of operations is 1.0
1169static inline bitblock128_t mvmd_fill_32(uint64_t val1)
1170{
1171        return _mm_set1_epi32((int32_t)(val1));
1172}
1173//The total number of operations is 1.0
1174static inline bitblock128_t mvmd_fill_1(uint64_t val1)
1175{
1176        return mvmd_fill_32((-1*val1));
1177}
1178//The total number of operations is 1.0
1179static inline bitblock128_t mvmd_fill_2(uint64_t val1)
1180{
1181        return mvmd_fill_4(((val1<<2)|val1));
1182}
1183//The total number of operations is 1.0
1184static inline bitblock128_t mvmd_fill_4(uint64_t val1)
1185{
1186        return mvmd_fill_8(((val1<<4)|val1));
1187}
1188//The total number of operations is 1.0
1189static inline bitblock128_t mvmd_fill_8(uint64_t val1)
1190{
1191        return _mm_set1_epi8((int32_t)(val1));
1192}
1193//The total number of operations is 1.0
1194static inline bitblock128_t mvmd_fill_64(uint64_t val1)
1195{
1196        return _mm_set_epi32((int32_t)((val1>>32)), (int32_t)(val1), (int32_t)((val1>>32)), (int32_t)(val1));
1197}
1198//The total number of operations is 1.0
1199static inline bitblock128_t mvmd_fill_128(uint64_t val1)
1200{
1201        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((val1>>32)), (int32_t)(val1));
1202}
1203//The total number of operations is 1.0
1204static inline bitblock128_t mvmd_fill_16(uint64_t val1)
1205{
1206        return _mm_set1_epi16((int32_t)(val1));
1207}
1208//The total number of operations is 19.0
1209static inline bitblock128_t mvmd_shuffle_32(bitblock128_t arg1, bitblock128_t arg2)
1210{
1211        bitblock128_t tmp1 = simd_and(simd_constant_32((3)), arg2);
1212        bitblock128_t msk1 = simd_add_32(tmp1, tmp1);
1213        bitblock128_t msk2 = simd_add_32(msk1, simd_constant_32(1));
1214        bitblock128_t msk = simd_or(msk1, simd_slli_32((16), msk2));
1215        return simd_ifh_32(arg2, simd_constant_32(0), mvmd_shuffle_16(arg1, msk));
1216}
1217//The total number of operations is 1.0
1218static inline bitblock128_t mvmd_shuffle_8(bitblock128_t arg1, bitblock128_t arg2)
1219{
1220        return _mm_shuffle_epi8(arg1, arg2);
1221}
1222//The total number of operations is 32.0
1223static inline bitblock128_t mvmd_shuffle_64(bitblock128_t arg1, bitblock128_t arg2)
1224{
1225        bitblock128_t tmp1 = simd_and(simd_constant_64((1)), arg2);
1226        bitblock128_t msk1 = simd_add_64(tmp1, tmp1);
1227        bitblock128_t msk2 = simd_add_64(msk1, simd_constant_64(1));
1228        bitblock128_t msk = simd_or(msk1, simd_slli_64((32), msk2));
1229        return simd_ifh_64(arg2, simd_constant_64(0), mvmd_shuffle_32(arg1, msk));
1230}
1231//The total number of operations is 10.0
1232static inline bitblock128_t mvmd_shuffle_16(bitblock128_t arg1, bitblock128_t arg2)
1233{
1234        bitblock128_t tmp1 = simd_and(simd_constant_16((7)), arg2);
1235        bitblock128_t msk1 = simd_add_16(tmp1, tmp1);
1236        bitblock128_t msk2 = simd_add_16(msk1, simd_constant_16(1));
1237        bitblock128_t msk = simd_or(msk1, simd_slli_16((8), msk2));
1238        return simd_ifh_16(arg2, simd_constant_16(0), mvmd_shuffle_8(arg1, msk));
1239}
1240//The total number of operations is 1.0
1241static inline bitblock128_t hsimd_packss_32(bitblock128_t arg1, bitblock128_t arg2)
1242{
1243        return _mm_packs_epi32(arg2, arg1);
1244}
1245//The total number of operations is 108.666666667
1246static inline bitblock128_t hsimd_packss_2(bitblock128_t arg1, bitblock128_t arg2)
1247{
1248        bitblock128_t hiBound = simd_srli_2(1, simd_lomask_2());
1249        bitblock128_t loBound = simd_not(hiBound);
1250        return hsimd_packl_2(simd_ifh_1(simd_gt_2(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_2(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_2(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_2(arg2, loBound), arg2, loBound)));
1251}
1252//The total number of operations is 79.3333333333
1253static inline bitblock128_t hsimd_packss_4(bitblock128_t arg1, bitblock128_t arg2)
1254{
1255        bitblock128_t hiBound = simd_srli_4(1, simd_lomask_4());
1256        bitblock128_t loBound = simd_not(hiBound);
1257        return hsimd_packl_4(simd_ifh_1(simd_gt_4(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_4(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_4(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_4(arg2, loBound), arg2, loBound)));
1258}
1259//The total number of operations is 32.6666666667
1260static inline bitblock128_t hsimd_packss_8(bitblock128_t arg1, bitblock128_t arg2)
1261{
1262        bitblock128_t hiBound = simd_srli_8(1, simd_lomask_8());
1263        bitblock128_t loBound = simd_not(hiBound);
1264        return hsimd_packl_8(simd_ifh_1(simd_gt_8(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_8(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_8(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_8(arg2, loBound), arg2, loBound)));
1265}
1266//The total number of operations is 75.0
1267static inline bitblock128_t hsimd_packss_64(bitblock128_t arg1, bitblock128_t arg2)
1268{
1269        bitblock128_t hiBound = simd_srli_64(1, simd_lomask_64());
1270        bitblock128_t loBound = simd_not(hiBound);
1271        return hsimd_packl_64(simd_ifh_1(simd_gt_64(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_64(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_64(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_64(arg2, loBound), arg2, loBound)));
1272}
1273//The total number of operations is 211.666666667
1274static inline bitblock128_t hsimd_packss_128(bitblock128_t arg1, bitblock128_t arg2)
1275{
1276        bitblock128_t hiBound = simd_srli_128(1, simd_lomask_128());
1277        bitblock128_t loBound = simd_not(hiBound);
1278        return hsimd_packl_128(simd_ifh_1(simd_gt_128(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_128(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_128(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_128(arg2, loBound), arg2, loBound)));
1279}
1280//The total number of operations is 1.0
1281static inline bitblock128_t hsimd_packss_16(bitblock128_t arg1, bitblock128_t arg2)
1282{
1283        return _mm_packs_epi16(arg2, arg1);
1284}
1285//The total number of operations is 13.0
1286static inline bitblock128_t bitblock_srl(bitblock128_t arg1, bitblock128_t arg2)
1287{
1288        return simd_srl_128(arg1, arg2);
1289}
1290//The total number of operations is 1.0
1291static inline void bitblock_store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
1292{
1293        _mm_store_si128((bitblock128_t*)(arg2), arg1);
1294}
1295//The total number of operations is 1.0
1296static inline bitblock128_t simd_eq_32(bitblock128_t arg1, bitblock128_t arg2)
1297{
1298        return _mm_cmpeq_epi32(arg1, arg2);
1299}
1300//The total number of operations is 2.0
1301static inline bitblock128_t simd_eq_1(bitblock128_t arg1, bitblock128_t arg2)
1302{
1303        return simd_not(simd_xor(arg1, arg2));
1304}
1305//The total number of operations is 8.0
1306static inline bitblock128_t simd_eq_2(bitblock128_t arg1, bitblock128_t arg2)
1307{
1308        bitblock128_t tmpAns = simd_eq_1(arg1, arg2);
1309        bitblock128_t loMask = simd_and(tmpAns, simd_srli_2((1), tmpAns));
1310        bitblock128_t hiMask = simd_slli_2((1), loMask);
1311        return simd_or(loMask, hiMask);
1312}
1313//The total number of operations is 9.0
1314static inline bitblock128_t simd_eq_4(bitblock128_t arg1, bitblock128_t arg2)
1315{
1316        return simd_or(simd_and(simd_himask_8(), simd_eq_8(simd_and(simd_himask_8(), arg1), simd_and(simd_himask_8(), arg2))), simd_and(simd_lomask_8(), simd_eq_8(simd_and(simd_lomask_8(), arg1), simd_and(simd_lomask_8(), arg2))));
1317}
1318//The total number of operations is 1.0
1319static inline bitblock128_t simd_eq_8(bitblock128_t arg1, bitblock128_t arg2)
1320{
1321        return _mm_cmpeq_epi8(arg1, arg2);
1322}
1323//The total number of operations is 1.0
1324static inline bitblock128_t simd_eq_64(bitblock128_t arg1, bitblock128_t arg2)
1325{
1326        return _mm_cmpeq_epi64(arg1, arg2);
1327}
1328//The total number of operations is 7.66666666667
1329static inline bitblock128_t simd_eq_128(bitblock128_t arg1, bitblock128_t arg2)
1330{
1331        bitblock128_t tmpAns = simd_eq_64(arg1, arg2);
1332        bitblock128_t loMask = simd_and(tmpAns, simd_srli_128((64), tmpAns));
1333        bitblock128_t hiMask = simd_slli_128((64), loMask);
1334        return simd_or(loMask, hiMask);
1335}
1336//The total number of operations is 1.0
1337static inline bitblock128_t simd_eq_16(bitblock128_t arg1, bitblock128_t arg2)
1338{
1339        return _mm_cmpeq_epi16(arg1, arg2);
1340}
1341//The total number of operations is 17.0
1342static inline bitblock128_t simd_popcount_32(bitblock128_t arg1)
1343{
1344        return simd_add_hl_32(simd_popcount_16(arg1));
1345}
1346//The total number of operations is 0
1347static inline bitblock128_t simd_popcount_1(bitblock128_t arg1)
1348{
1349        return arg1;
1350}
1351//The total number of operations is 3.0
1352static inline bitblock128_t simd_popcount_2(bitblock128_t arg1)
1353{
1354        return simd_add_hl_2(simd_popcount_1(arg1));
1355}
1356//The total number of operations is 7.0
1357static inline bitblock128_t simd_popcount_4(bitblock128_t arg1)
1358{
1359        return simd_add_hl_4(simd_popcount_2(arg1));
1360}
1361//The total number of operations is 11.0
1362static inline bitblock128_t simd_popcount_8(bitblock128_t arg1)
1363{
1364        return simd_add_hl_8(simd_popcount_4(arg1));
1365}
1366//The total number of operations is 12.0
1367static inline bitblock128_t simd_popcount_64(bitblock128_t arg1)
1368{
1369        return _mm_sad_epu8(simd_popcount_8(arg1), simd_constant_8(0));
1370}
1371//The total number of operations is 16.3333333333
1372static inline bitblock128_t simd_popcount_128(bitblock128_t arg1)
1373{
1374        bitblock128_t tmpAns = simd_popcount_64(arg1);
1375        return simd_add_64(simd_and(tmpAns, simd_lomask_128()), simd_srli_128((64), tmpAns));
1376}
1377//The total number of operations is 14.0
1378static inline bitblock128_t simd_popcount_16(bitblock128_t arg1)
1379{
1380        return simd_add_hl_16(simd_popcount_8(arg1));
1381}
1382//The total number of operations is 1.0
1383static inline bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
1384{
1385        return _mm_andnot_si128(arg2, arg1);
1386}
1387//The total number of operations is 1.0
1388static inline bitblock128_t simd_neg_32(bitblock128_t arg1)
1389{
1390        return _mm_sign_epi32(arg1, simd_constant_32(-1));
1391}
1392//The total number of operations is 6.33333333333
1393static inline bitblock128_t simd_neg_2(bitblock128_t arg1)
1394{
1395        return simd_ifh_1(simd_himask_2(), simd_xor(arg1, simd_slli_128(1, arg1)), arg1);
1396}
1397//The total number of operations is 6.0
1398static inline bitblock128_t simd_neg_4(bitblock128_t arg1)
1399{
1400        return simd_sub_4(simd_constant_4(0), arg1);
1401}
1402//The total number of operations is 1.0
1403static inline bitblock128_t simd_neg_8(bitblock128_t arg1)
1404{
1405        return simd_sub_8(simd_constant_8(0), arg1);
1406}
1407//The total number of operations is 1.0
1408static inline bitblock128_t simd_neg_64(bitblock128_t arg1)
1409{
1410        return simd_sub_64(simd_constant_64(0), arg1);
1411}
1412//The total number of operations is 9.33333333333
1413static inline bitblock128_t simd_neg_128(bitblock128_t arg1)
1414{
1415        return simd_sub_128(simd_constant_128(0), arg1);
1416}
1417//The total number of operations is 1.0
1418static inline bitblock128_t simd_neg_16(bitblock128_t arg1)
1419{
1420        return simd_sub_16(simd_constant_16(0), arg1);
1421}
1422//The total number of operations is 3.0
1423static inline bitblock128_t hsimd_packh_32(bitblock128_t arg1, bitblock128_t arg2)
1424{
1425        return _mm_hsub_epi16(simd_srli_32((16), arg2), simd_srli_32((16), arg1));
1426}
1427//The total number of operations is 37.0
1428static inline bitblock128_t hsimd_packh_2(bitblock128_t arg1, bitblock128_t arg2)
1429{
1430        return hsimd_packl_2(simd_srli_64((1), arg1), simd_srli_64((1), arg2));
1431}
1432//The total number of operations is 26.3333333333
1433static inline bitblock128_t hsimd_packh_4(bitblock128_t arg1, bitblock128_t arg2)
1434{
1435        return hsimd_packl_4(simd_srli_64((2), arg1), simd_srli_64((2), arg2));
1436}
1437//The total number of operations is 15.6666666667
1438static inline bitblock128_t hsimd_packh_8(bitblock128_t arg1, bitblock128_t arg2)
1439{
1440        return hsimd_packl_8(simd_srli_64((4), arg1), simd_srli_64((4), arg2));
1441}
1442//The total number of operations is 3.0
1443static inline bitblock128_t hsimd_packh_64(bitblock128_t arg1, bitblock128_t arg2)
1444{
1445        return _mm_hsub_epi32(simd_srli_64((32), arg2), simd_srli_64((32), arg1));
1446}
1447//The total number of operations is 5.33333333333
1448static inline bitblock128_t hsimd_packh_128(bitblock128_t arg1, bitblock128_t arg2)
1449{
1450        return simd_ifh_1(simd_himask_128(), arg1, simd_srli_128((64), arg2));
1451}
1452//The total number of operations is 3.0
1453static inline bitblock128_t hsimd_packh_16(bitblock128_t arg1, bitblock128_t arg2)
1454{
1455        return hsimd_packus_16(simd_srli_16((8), arg1), simd_srli_16((8), arg2));
1456}
1457//The total number of operations is 0
1458static inline bitblock128_t simd_himask_32()
1459{
1460        return simd_constant_32(-65536);
1461}
1462//The total number of operations is 0
1463static inline bitblock128_t simd_himask_2()
1464{
1465        return simd_constant_2((2));
1466}
1467//The total number of operations is 0
1468static inline bitblock128_t simd_himask_4()
1469{
1470        return simd_constant_4((12));
1471}
1472//The total number of operations is 0
1473static inline bitblock128_t simd_himask_8()
1474{
1475        return simd_constant_8((240));
1476}
1477//The total number of operations is 0
1478static inline bitblock128_t simd_himask_64()
1479{
1480        return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
1481}
1482//The total number of operations is 0
1483static inline bitblock128_t simd_himask_128()
1484{
1485        return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
1486}
1487//The total number of operations is 0
1488static inline bitblock128_t simd_himask_16()
1489{
1490        return simd_constant_16((65280));
1491}
1492//The total number of operations is 2.0
1493static inline bool bitblock_all(bitblock128_t arg1)
1494{
1495        return hsimd_signmask_8(simd_eq_8(arg1, simd_constant_8(-1))) == 65535;
1496}
1497//The total number of operations is 4.0
1498static inline bitblock128_t simd_ifh_32(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1499{
1500        return simd_ifh_1(simd_gt_32(simd_constant_32(0), arg1), arg2, arg3);
1501}
1502//The total number of operations is 3.0
1503static inline bitblock128_t simd_ifh_1(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1504{
1505        return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
1506}
1507//The total number of operations is 8.0
1508static inline bitblock128_t simd_ifh_2(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1509{
1510        return simd_ifh_1(simd_ifh_1(simd_himask_2(), arg1, simd_srli_2((1), arg1)), arg2, arg3);
1511}
1512//The total number of operations is 13.0
1513static inline bitblock128_t simd_ifh_4(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1514{
1515        return simd_ifh_1(simd_gt_4(simd_constant_4(0), arg1), arg2, arg3);
1516}
1517//The total number of operations is 1.0
1518static inline bitblock128_t simd_ifh_8(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1519{
1520        return _mm_blendv_epi8(arg3, arg2, arg1);
1521}
1522//The total number of operations is 8.0
1523static inline bitblock128_t simd_ifh_64(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1524{
1525        return simd_ifh_32(simd_ifh_1(simd_himask_64(), arg1, simd_srli_64((32), arg1)), arg2, arg3);
1526}
1527//The total number of operations is 13.3333333333
1528static inline bitblock128_t simd_ifh_128(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1529{
1530        return simd_ifh_64(simd_ifh_1(simd_himask_128(), arg1, simd_srli_128((64), arg1)), arg2, arg3);
1531}
1532//The total number of operations is 4.0
1533static inline bitblock128_t simd_ifh_16(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1534{
1535        return simd_ifh_1(simd_gt_16(simd_constant_16(0), arg1), arg2, arg3);
1536}
1537//The total number of operations is 1.0
1538static inline bitblock128_t simd_sub_32(bitblock128_t arg1, bitblock128_t arg2)
1539{
1540        return _mm_sub_epi32(arg1, arg2);
1541}
1542//The total number of operations is 1.0
1543static inline bitblock128_t simd_sub_1(bitblock128_t arg1, bitblock128_t arg2)
1544{
1545        return simd_xor(arg1, arg2);
1546}
1547//The total number of operations is 9.33333333333
1548static inline bitblock128_t simd_sub_2(bitblock128_t arg1, bitblock128_t arg2)
1549{
1550        bitblock128_t tmp = simd_xor(arg1, arg2);
1551        return simd_ifh_1(simd_himask_2(), simd_xor(tmp, simd_slli_128(1, simd_and(simd_not(arg1), arg2))), tmp);
1552}
1553//The total number of operations is 6.0
1554static inline bitblock128_t simd_sub_4(bitblock128_t arg1, bitblock128_t arg2)
1555{
1556        return simd_ifh_1(simd_himask_8(), simd_sub_8(arg1, simd_and(simd_himask_8(), arg2)), simd_sub_8(arg1, arg2));
1557}
1558//The total number of operations is 1.0
1559static inline bitblock128_t simd_sub_8(bitblock128_t arg1, bitblock128_t arg2)
1560{
1561        return _mm_sub_epi8(arg1, arg2);
1562}
1563//The total number of operations is 1.0
1564static inline bitblock128_t simd_sub_64(bitblock128_t arg1, bitblock128_t arg2)
1565{
1566        return _mm_sub_epi64(arg1, arg2);
1567}
1568//The total number of operations is 9.33333333333
1569static inline bitblock128_t simd_sub_128(bitblock128_t arg1, bitblock128_t arg2)
1570{
1571        bitblock128_t partial = simd_sub_64(arg1, arg2);
1572        bitblock128_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_andc(partial, simd_xor(arg1, arg2)));
1573        bitblock128_t borrow = simd_slli_128((64), simd_srli_64((63), borrowMask));
1574        return simd_sub_64(partial, borrow);
1575}
1576//The total number of operations is 1.0
1577static inline bitblock128_t simd_sub_16(bitblock128_t arg1, bitblock128_t arg2)
1578{
1579        return _mm_sub_epi16(arg1, arg2);
1580}
1581//The total number of operations is 3.0
1582static inline bitblock128_t simd_add_hl_32(bitblock128_t arg1)
1583{
1584        return simd_add_64(simd_srli_32((16), arg1), simd_and(arg1, simd_lomask_32()));
1585}
1586//The total number of operations is 3.0
1587static inline bitblock128_t simd_add_hl_2(bitblock128_t arg1)
1588{
1589        return simd_sub_16(arg1, simd_and(simd_lomask_2(), simd_srli_16(1, arg1)));
1590}
1591//The total number of operations is 4.0
1592static inline bitblock128_t simd_add_hl_4(bitblock128_t arg1)
1593{
1594        return simd_add_8(simd_srli_4((2), arg1), simd_and(arg1, simd_lomask_4()));
1595}
1596//The total number of operations is 4.0
1597static inline bitblock128_t simd_add_hl_8(bitblock128_t arg1)
1598{
1599        return simd_add_16(simd_srli_8((4), arg1), simd_and(arg1, simd_lomask_8()));
1600}
1601//The total number of operations is 3.0
1602static inline bitblock128_t simd_add_hl_64(bitblock128_t arg1)
1603{
1604        return simd_add_64(simd_srli_64((32), arg1), simd_and(arg1, simd_lomask_64()));
1605}
1606//The total number of operations is 12.6666666667
1607static inline bitblock128_t simd_add_hl_128(bitblock128_t arg1)
1608{
1609        return simd_add_128(simd_srli_128((64), arg1), simd_and(arg1, simd_lomask_128()));
1610}
1611//The total number of operations is 3.0
1612static inline bitblock128_t simd_add_hl_16(bitblock128_t arg1)
1613{
1614        return simd_add_32(simd_srli_16((8), arg1), simd_and(arg1, simd_lomask_16()));
1615}
1616//The total number of operations is 10.0
1617static inline bitblock128_t simd_srl_64(bitblock128_t arg1, bitblock128_t shift_mask)
1618{
1619        return simd_ifh_1(simd_himask_128(), _mm_srl_epi64(arg1, simd_and(_mm_srli_si128(shift_mask, (int32_t)(8)), _mm_cvtsi32_si128((int32_t)(63)))), _mm_srl_epi64(arg1, simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(63)))));
1620}
1621//The total number of operations is 13.0
1622static inline bitblock128_t simd_srl_128(bitblock128_t arg1, bitblock128_t shift_mask)
1623{
1624        bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(127)));
1625        return simd_or(_mm_srl_epi64(arg1, shift), simd_or(_mm_srli_si128(_mm_srl_epi64(arg1, simd_sub_32(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_srli_si128(_mm_sll_epi64(arg1, simd_sub_32(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
1626}
1627//The total number of operations is 0
1628static inline bitblock128_t simd_lomask_32()
1629{
1630        return simd_constant_32((65535));
1631}
1632//The total number of operations is 0
1633static inline bitblock128_t simd_lomask_2()
1634{
1635        return simd_constant_2((1));
1636}
1637//The total number of operations is 0
1638static inline bitblock128_t simd_lomask_4()
1639{
1640        return simd_constant_4((3));
1641}
1642//The total number of operations is 0
1643static inline bitblock128_t simd_lomask_8()
1644{
1645        return simd_constant_8((15));
1646}
1647//The total number of operations is 0
1648static inline bitblock128_t simd_lomask_64()
1649{
1650        return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
1651}
1652//The total number of operations is 0
1653static inline bitblock128_t simd_lomask_128()
1654{
1655        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
1656}
1657//The total number of operations is 0
1658static inline bitblock128_t simd_lomask_16()
1659{
1660        return simd_constant_16((255));
1661}
1662//The total number of operations is 3.0
1663static inline uint64_t hsimd_signmask_32(bitblock128_t arg1)
1664{
1665        return hsimd_signmask_16(hsimd_packss_32(simd_constant_32(0), arg1));
1666}
1667//The total number of operations is 24.0
1668static inline uint64_t hsimd_signmask_4(bitblock128_t arg1)
1669{
1670        uint64_t tmpAns1 = hsimd_signmask_8(esimd_mergeh_4(arg1, simd_constant_4(0)));
1671        uint64_t tmpAns2 = hsimd_signmask_8(esimd_mergel_4(arg1, simd_constant_4(0)));
1672        return ((tmpAns1<<(16))+tmpAns2);
1673}
1674//The total number of operations is 1.0
1675static inline uint64_t hsimd_signmask_8(bitblock128_t arg1)
1676{
1677        return _mm_movemask_epi8(arg1);
1678}
1679//The total number of operations is 1.0
1680static inline uint64_t hsimd_signmask_64(bitblock128_t arg1)
1681{
1682        return _mm_movemask_pd(_mm_castsi128_pd(arg1));
1683}
1684//The total number of operations is 6.33333333333
1685static inline uint64_t hsimd_signmask_128(bitblock128_t arg1)
1686{
1687        return hsimd_signmask_64(hsimd_packh_128(simd_constant_128(0), arg1));
1688}
1689//The total number of operations is 2.0
1690static inline uint64_t hsimd_signmask_16(bitblock128_t arg1)
1691{
1692        return hsimd_signmask_8(hsimd_packss_16(simd_constant_16(0), arg1));
1693}
1694//The total number of operations is 3.0
1695static inline bitblock128_t esimd_zeroextendh_32(bitblock128_t arg1)
1696{
1697        return esimd_mergeh_64(simd_srli_64(32, arg1), simd_and(simd_lomask_64(), arg1));
1698}
1699//The total number of operations is 24.0
1700static inline bitblock128_t esimd_zeroextendh_1(bitblock128_t arg1)
1701{
1702        return esimd_mergeh_2(simd_srli_2(1, arg1), simd_and(simd_lomask_2(), arg1));
1703}
1704//The total number of operations is 14.0
1705static inline bitblock128_t esimd_zeroextendh_2(bitblock128_t arg1)
1706{
1707        return esimd_mergeh_4(simd_srli_4(2, arg1), simd_and(simd_lomask_4(), arg1));
1708}
1709//The total number of operations is 4.0
1710static inline bitblock128_t esimd_zeroextendh_4(bitblock128_t arg1)
1711{
1712        return esimd_mergeh_8(simd_srli_8(4, arg1), simd_and(simd_lomask_8(), arg1));
1713}
1714//The total number of operations is 3.0
1715static inline bitblock128_t esimd_zeroextendh_8(bitblock128_t arg1)
1716{
1717        return esimd_mergeh_16(simd_srli_16(8, arg1), simd_and(simd_lomask_16(), arg1));
1718}
1719//The total number of operations is 2.33333333333
1720static inline bitblock128_t esimd_zeroextendh_64(bitblock128_t arg1)
1721{
1722        return simd_srli_128(64, arg1);
1723}
1724//The total number of operations is 3.0
1725static inline bitblock128_t esimd_zeroextendh_16(bitblock128_t arg1)
1726{
1727        return esimd_mergeh_32(simd_srli_32(16, arg1), simd_and(simd_lomask_32(), arg1));
1728}
1729//The total number of operations is 1.0
1730static inline bitblock128_t esimd_zeroextendl_32(bitblock128_t arg1)
1731{
1732        return _mm_cvtepu32_epi64(arg1);
1733}
1734//The total number of operations is 24.0
1735static inline bitblock128_t esimd_zeroextendl_1(bitblock128_t arg1)
1736{
1737        return esimd_mergel_2(simd_srli_2(1, arg1), simd_and(simd_lomask_2(), arg1));
1738}
1739//The total number of operations is 14.0
1740static inline bitblock128_t esimd_zeroextendl_2(bitblock128_t arg1)
1741{
1742        return esimd_mergel_4(simd_srli_4(2, arg1), simd_and(simd_lomask_4(), arg1));
1743}
1744//The total number of operations is 4.0
1745static inline bitblock128_t esimd_zeroextendl_4(bitblock128_t arg1)
1746{
1747        return esimd_mergel_8(simd_srli_8(4, arg1), simd_and(simd_lomask_8(), arg1));
1748}
1749//The total number of operations is 1.0
1750static inline bitblock128_t esimd_zeroextendl_8(bitblock128_t arg1)
1751{
1752        return _mm_cvtepu8_epi16(arg1);
1753}
1754//The total number of operations is 1.0
1755static inline bitblock128_t esimd_zeroextendl_64(bitblock128_t arg1)
1756{
1757        return simd_and(simd_lomask_128(), arg1);
1758}
1759//The total number of operations is 1.0
1760static inline bitblock128_t esimd_zeroextendl_16(bitblock128_t arg1)
1761{
1762        return _mm_cvtepu16_epi32(arg1);
1763}
1764//The total number of operations is 1.0
1765static inline bitblock128_t mvmd_fill4_32(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1766{
1767        return _mm_set_epi32((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4));
1768}
1769//The total number of operations is 5.0
1770static inline bitblock128_t mvmd_fill4_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1771{
1772        return simd_ifh_1(simd_himask_4(), mvmd_fill2_1(val1, val2), mvmd_fill2_1(val3, val4));
1773}
1774//The total number of operations is 5.0
1775static inline bitblock128_t mvmd_fill4_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1776{
1777        return simd_ifh_1(simd_himask_8(), mvmd_fill2_2(val1, val2), mvmd_fill2_2(val3, val4));
1778}
1779//The total number of operations is 5.0
1780static inline bitblock128_t mvmd_fill4_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1781{
1782        return simd_ifh_1(simd_himask_16(), mvmd_fill2_4(val1, val2), mvmd_fill2_4(val3, val4));
1783}
1784//The total number of operations is 5.0
1785static inline bitblock128_t mvmd_fill4_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1786{
1787        return simd_ifh_1(simd_himask_32(), mvmd_fill2_8(val1, val2), mvmd_fill2_8(val3, val4));
1788}
1789//The total number of operations is 3.0
1790static inline bitblock128_t mvmd_fill4_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1791{
1792        return simd_or(mvmd_fill4_32((val1<<16), (val3<<16), (val1<<16), (val3<<16)), mvmd_fill4_32((val2&(65535)), (val4&(65535)), (val2&(65535)), (val4&(65535))));
1793}
1794//The total number of operations is 1.0
1795static inline bitblock128_t simd_umin_32(bitblock128_t arg1, bitblock128_t arg2)
1796{
1797        return _mm_min_epu32(arg1, arg2);
1798}
1799//The total number of operations is 1.0
1800static inline bitblock128_t simd_umin_1(bitblock128_t arg1, bitblock128_t arg2)
1801{
1802        return simd_and(arg1, arg2);
1803}
1804//The total number of operations is 16.0
1805static inline bitblock128_t simd_umin_2(bitblock128_t arg1, bitblock128_t arg2)
1806{
1807        return simd_or(simd_and(simd_himask_4(), simd_umin_4(arg1, arg2)), simd_umin_4(simd_and(simd_lomask_4(), arg1), simd_and(simd_lomask_4(), arg2)));
1808}
1809//The total number of operations is 6.0
1810static inline bitblock128_t simd_umin_4(bitblock128_t arg1, bitblock128_t arg2)
1811{
1812        return simd_or(simd_and(simd_himask_8(), simd_umin_8(arg1, arg2)), simd_umin_8(simd_and(simd_lomask_8(), arg1), simd_and(simd_lomask_8(), arg2)));
1813}
1814//The total number of operations is 1.0
1815static inline bitblock128_t simd_umin_8(bitblock128_t arg1, bitblock128_t arg2)
1816{
1817        return _mm_min_epu8(arg1, arg2);
1818}
1819//The total number of operations is 14.0
1820static inline bitblock128_t simd_umin_64(bitblock128_t arg1, bitblock128_t arg2)
1821{
1822        bitblock128_t tmpAns = simd_umin_32(arg1, arg2);
1823        bitblock128_t eqMask1 = simd_srli_64((32), simd_eq_32(tmpAns, arg1));
1824        bitblock128_t eqMask2 = simd_srli_64((32), simd_eq_32(tmpAns, arg2));
1825        return simd_ifh_1(simd_himask_64(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
1826}
1827//The total number of operations is 29.6666666667
1828static inline bitblock128_t simd_umin_128(bitblock128_t arg1, bitblock128_t arg2)
1829{
1830        bitblock128_t tmpAns = simd_umin_64(arg1, arg2);
1831        bitblock128_t eqMask1 = simd_srli_128((64), simd_eq_64(tmpAns, arg1));
1832        bitblock128_t eqMask2 = simd_srli_128((64), simd_eq_64(tmpAns, arg2));
1833        return simd_ifh_1(simd_himask_128(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
1834}
1835//The total number of operations is 1.0
1836static inline bitblock128_t simd_umin_16(bitblock128_t arg1, bitblock128_t arg2)
1837{
1838        return _mm_min_epu16(arg1, arg2);
1839}
1840//The total number of operations is 1.0
1841static inline bitblock128_t simd_min_32(bitblock128_t arg1, bitblock128_t arg2)
1842{
1843        return _mm_min_epi32(arg1, arg2);
1844}
1845//The total number of operations is 1.0
1846static inline bitblock128_t simd_min_1(bitblock128_t arg1, bitblock128_t arg2)
1847{
1848        return simd_or(arg1, arg2);
1849}
1850//The total number of operations is 16.6666666667
1851static inline bitblock128_t simd_min_2(bitblock128_t arg1, bitblock128_t arg2)
1852{
1853        bitblock128_t tmp1 = simd_srli_128(1, arg1);
1854        bitblock128_t tmp2 = simd_srli_128(1, arg2);
1855        return simd_ifh_1(simd_himask_2(), simd_or(arg1, arg2), simd_or(simd_and(arg1, simd_and(tmp1, simd_not(tmp2))), simd_and(arg2, simd_or(simd_and(simd_not(tmp1), tmp2), arg1))));
1856}
1857//The total number of operations is 9.0
1858static inline bitblock128_t simd_min_4(bitblock128_t arg1, bitblock128_t arg2)
1859{
1860        bitblock128_t high_bit = simd_constant_4((8));
1861        return simd_xor(simd_umin_4(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
1862}
1863//The total number of operations is 1.0
1864static inline bitblock128_t simd_min_8(bitblock128_t arg1, bitblock128_t arg2)
1865{
1866        return _mm_min_epi8(arg1, arg2);
1867}
1868//The total number of operations is 15.0
1869static inline bitblock128_t simd_min_64(bitblock128_t arg1, bitblock128_t arg2)
1870{
1871        bitblock128_t hiAns = simd_min_32(arg1, arg2);
1872        bitblock128_t loAns = simd_umin_32(arg1, arg2);
1873        bitblock128_t eqMask1 = simd_srli_64((32), simd_eq_32(hiAns, arg1));
1874        bitblock128_t eqMask2 = simd_srli_64((32), simd_eq_32(hiAns, arg2));
1875        return simd_ifh_1(simd_himask_64(), hiAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, loAns, arg1), arg2));
1876}
1877//The total number of operations is 44.6666666667
1878static inline bitblock128_t simd_min_128(bitblock128_t arg1, bitblock128_t arg2)
1879{
1880        bitblock128_t hiAns = simd_min_64(arg1, arg2);
1881        bitblock128_t loAns = simd_umin_64(arg1, arg2);
1882        bitblock128_t eqMask1 = simd_srli_128((64), simd_eq_64(hiAns, arg1));
1883        bitblock128_t eqMask2 = simd_srli_128((64), simd_eq_64(hiAns, arg2));
1884        return simd_ifh_1(simd_himask_128(), hiAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, loAns, arg1), arg2));
1885}
1886//The total number of operations is 1.0
1887static inline bitblock128_t simd_min_16(bitblock128_t arg1, bitblock128_t arg2)
1888{
1889        return _mm_min_epi16(arg1, arg2);
1890}
1891//The total number of operations is 5.0
1892static inline bitblock128_t mvmd_fill2_32(uint64_t val1, uint64_t val2)
1893{
1894        return simd_ifh_1(simd_himask_64(), mvmd_fill_32(val1), mvmd_fill_32(val2));
1895}
1896//The total number of operations is 1.0
1897static inline bitblock128_t mvmd_fill2_1(uint64_t val1, uint64_t val2)
1898{
1899        return mvmd_fill_2(((val1<<1)|(val2&(1))));
1900}
1901//The total number of operations is 1.0
1902static inline bitblock128_t mvmd_fill2_2(uint64_t val1, uint64_t val2)
1903{
1904        return mvmd_fill_4(((val1<<2)|(val2&(3))));
1905}
1906//The total number of operations is 1.0
1907static inline bitblock128_t mvmd_fill2_4(uint64_t val1, uint64_t val2)
1908{
1909        return mvmd_fill_8(((val1<<4)|(val2&(15))));
1910}
1911//The total number of operations is 1.0
1912static inline bitblock128_t mvmd_fill2_8(uint64_t val1, uint64_t val2)
1913{
1914        return mvmd_fill_16(((val1<<8)|(val2&(255))));
1915}
1916//The total number of operations is 5.0
1917static inline bitblock128_t mvmd_fill2_64(uint64_t val1, uint64_t val2)
1918{
1919        return simd_ifh_1(simd_himask_128(), mvmd_fill_64(val1), mvmd_fill_64(val2));
1920}
1921//The total number of operations is 1.0
1922static inline bitblock128_t mvmd_fill2_16(uint64_t val1, uint64_t val2)
1923{
1924        return mvmd_fill_32(((val1<<16)|(val2&(65535))));
1925}
1926//The total number of operations is 2.0
1927static inline bool bitblock_any(bitblock128_t arg1)
1928{
1929        return hsimd_signmask_8(simd_eq_8(arg1, simd_constant_8(0))) != 65535;
1930}
1931//The total number of operations is 18.3333333333
1932static inline uint64_t bitblock_popcount(bitblock128_t arg1)
1933{
1934        return mvmd_extract_64(0, simd_popcount_128(arg1));
1935}
1936//The total number of operations is 1.0
1937static inline bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2)
1938{
1939        return _mm_or_si128(arg1, arg2);
1940}
1941//The total number of operations is 3.0
1942static inline bitblock128_t hsimd_packl_32(bitblock128_t arg1, bitblock128_t arg2)
1943{
1944        return hsimd_packus_32(simd_and(arg1, simd_lomask_32()), simd_and(arg2, simd_lomask_32()));
1945}
1946//The total number of operations is 35.0
1947static inline bitblock128_t hsimd_packl_2(bitblock128_t arg1, bitblock128_t arg2)
1948{
1949        return hsimd_packl_4(simd_ifh_1(simd_himask_2(), simd_srli_128((1), arg1), arg1), simd_ifh_1(simd_himask_2(), simd_srli_128((1), arg2), arg2));
1950}
1951//The total number of operations is 24.3333333333
1952static inline bitblock128_t hsimd_packl_4(bitblock128_t arg1, bitblock128_t arg2)
1953{
1954        return hsimd_packl_8(simd_ifh_1(simd_himask_4(), simd_srli_128((2), arg1), arg1), simd_ifh_1(simd_himask_4(), simd_srli_128((2), arg2), arg2));
1955}
1956//The total number of operations is 13.6666666667
1957static inline bitblock128_t hsimd_packl_8(bitblock128_t arg1, bitblock128_t arg2)
1958{
1959        return hsimd_packl_16(simd_ifh_1(simd_himask_8(), simd_srli_128((4), arg1), arg1), simd_ifh_1(simd_himask_8(), simd_srli_128((4), arg2), arg2));
1960}
1961//The total number of operations is 3.0
1962static inline bitblock128_t hsimd_packl_64(bitblock128_t arg1, bitblock128_t arg2)
1963{
1964        return _mm_hsub_epi32(simd_and(arg2, simd_lomask_64()), simd_and(arg1, simd_lomask_64()));
1965}
1966//The total number of operations is 5.33333333333
1967static inline bitblock128_t hsimd_packl_128(bitblock128_t arg1, bitblock128_t arg2)
1968{
1969        return simd_ifh_1(simd_himask_128(), simd_slli_128((64), arg1), arg2);
1970}
1971//The total number of operations is 3.0
1972static inline bitblock128_t hsimd_packl_16(bitblock128_t arg1, bitblock128_t arg2)
1973{
1974        return hsimd_packus_16(simd_and(arg1, simd_lomask_16()), simd_and(arg2, simd_lomask_16()));
1975}
1976//The total number of operations is 13.0
1977static inline bitblock128_t mvmd_fill8_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1978{
1979        return simd_ifh_1(simd_himask_8(), mvmd_fill4_1(val1, val2, val3, val4), mvmd_fill4_1(val5, val6, val7, val8));
1980}
1981//The total number of operations is 13.0
1982static inline bitblock128_t mvmd_fill8_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1983{
1984        return simd_ifh_1(simd_himask_16(), mvmd_fill4_2(val1, val2, val3, val4), mvmd_fill4_2(val5, val6, val7, val8));
1985}
1986//The total number of operations is 7.0
1987static inline bitblock128_t mvmd_fill8_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1988{
1989        return simd_or(mvmd_fill8_8((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4)), mvmd_fill8_8((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15))));
1990}
1991//The total number of operations is 3.0
1992static inline bitblock128_t mvmd_fill8_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1993{
1994        return simd_or(mvmd_fill8_16((val1<<8), (val3<<8), (val5<<8), (val7<<8), (val1<<8), (val3<<8), (val5<<8), (val7<<8)), mvmd_fill8_16((val2&(255)), (val4&(255)), (val6&(255)), (val8&(255)), (val2&(255)), (val4&(255)), (val6&(255)), (val8&(255))));
1995}
1996//The total number of operations is 1.0
1997static inline bitblock128_t mvmd_fill8_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1998{
1999        return _mm_set_epi16((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8));
2000}
2001//The total number of operations is 7.0
2002static inline bitblock128_t hsimd_min_hl_32(bitblock128_t arg1, bitblock128_t arg2)
2003{
2004        return simd_min_16(hsimd_packh_32(arg1, arg2), hsimd_packl_32(arg1, arg2));
2005}
2006//The total number of operations is 73.0
2007static inline bitblock128_t hsimd_min_hl_2(bitblock128_t arg1, bitblock128_t arg2)
2008{
2009        return simd_min_1(hsimd_packh_2(arg1, arg2), hsimd_packl_2(arg1, arg2));
2010}
2011//The total number of operations is 67.3333333333
2012static inline bitblock128_t hsimd_min_hl_4(bitblock128_t arg1, bitblock128_t arg2)
2013{
2014        return simd_min_2(hsimd_packh_4(arg1, arg2), hsimd_packl_4(arg1, arg2));
2015}
2016//The total number of operations is 38.3333333333
2017static inline bitblock128_t hsimd_min_hl_8(bitblock128_t arg1, bitblock128_t arg2)
2018{
2019        return simd_min_4(hsimd_packh_8(arg1, arg2), hsimd_packl_8(arg1, arg2));
2020}
2021//The total number of operations is 7.0
2022static inline bitblock128_t hsimd_min_hl_64(bitblock128_t arg1, bitblock128_t arg2)
2023{
2024        return simd_min_32(hsimd_packh_64(arg1, arg2), hsimd_packl_64(arg1, arg2));
2025}
2026//The total number of operations is 25.6666666667
2027static inline bitblock128_t hsimd_min_hl_128(bitblock128_t arg1, bitblock128_t arg2)
2028{
2029        return simd_min_64(hsimd_packh_128(arg1, arg2), hsimd_packl_128(arg1, arg2));
2030}
2031//The total number of operations is 7.0
2032static inline bitblock128_t hsimd_min_hl_16(bitblock128_t arg1, bitblock128_t arg2)
2033{
2034        return simd_min_8(hsimd_packh_16(arg1, arg2), hsimd_packl_16(arg1, arg2));
2035}
2036//The total number of operations is 1.0
2037static inline bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2)
2038{
2039        return _mm_xor_si128(arg1, arg2);
2040}
2041//The total number of operations is 1.0
2042static inline bitblock128_t simd_umax_32(bitblock128_t arg1, bitblock128_t arg2)
2043{
2044        return _mm_max_epu32(arg1, arg2);
2045}
2046//The total number of operations is 1.0
2047static inline bitblock128_t simd_umax_1(bitblock128_t arg1, bitblock128_t arg2)
2048{
2049        return simd_or(arg1, arg2);
2050}
2051//The total number of operations is 15.6666666667
2052static inline bitblock128_t simd_umax_2(bitblock128_t arg1, bitblock128_t arg2)
2053{
2054        return simd_ifh_1(simd_himask_2(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd_srli_128(1, simd_or(simd_not(arg1), arg2))), simd_and(arg1, simd_srli_128(1, simd_or(arg1, simd_not(arg2))))));
2055}
2056//The total number of operations is 6.0
2057static inline bitblock128_t simd_umax_4(bitblock128_t arg1, bitblock128_t arg2)
2058{
2059        return simd_or(simd_and(simd_himask_8(), simd_umax_8(arg1, arg2)), simd_umax_8(simd_and(simd_lomask_8(), arg1), simd_and(simd_lomask_8(), arg2)));
2060}
2061//The total number of operations is 1.0
2062static inline bitblock128_t simd_umax_8(bitblock128_t arg1, bitblock128_t arg2)
2063{
2064        return _mm_max_epu8(arg1, arg2);
2065}
2066//The total number of operations is 14.0
2067static inline bitblock128_t simd_umax_64(bitblock128_t arg1, bitblock128_t arg2)
2068{
2069        bitblock128_t tmpAns = simd_umax_32(arg1, arg2);
2070        bitblock128_t eqMask1 = simd_srli_64((32), simd_eq_32(tmpAns, arg1));
2071        bitblock128_t eqMask2 = simd_srli_64((32), simd_eq_32(tmpAns, arg2));
2072        return simd_ifh_1(simd_himask_64(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
2073}
2074//The total number of operations is 29.6666666667
2075static inline bitblock128_t simd_umax_128(bitblock128_t arg1, bitblock128_t arg2)
2076{
2077        bitblock128_t tmpAns = simd_umax_64(arg1, arg2);
2078        bitblock128_t eqMask1 = simd_srli_128((64), simd_eq_64(tmpAns, arg1));
2079        bitblock128_t eqMask2 = simd_srli_128((64), simd_eq_64(tmpAns, arg2));
2080        return simd_ifh_1(simd_himask_128(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
2081}
2082//The total number of operations is 1.0
2083static inline bitblock128_t simd_umax_16(bitblock128_t arg1, bitblock128_t arg2)
2084{
2085        return _mm_max_epu16(arg1, arg2);
2086}
2087//The total number of operations is 1.0
2088static inline bitblock128_t bitblock_load_aligned(const bitblock128_t* arg1)
2089{
2090        return _mm_load_si128((bitblock128_t*)(arg1));
2091}
2092//The total number of operations is 1.0
2093static inline void bitblock_store_unaligned(bitblock128_t arg1, bitblock128_t* arg2)
2094{
2095        _mm_storeu_si128((bitblock128_t*)(arg2), arg1);
2096}
2097//The total number of operations is 1.0
2098static inline bitblock128_t esimd_signextendl_32(bitblock128_t arg1)
2099{
2100        return _mm_cvtepi32_epi64(arg1);
2101}
2102//The total number of operations is 31.0
2103static inline bitblock128_t esimd_signextendl_1(bitblock128_t arg1)
2104{
2105        return esimd_mergel_2(simd_srai_2(1, arg1), simd_srai_2(1, simd_slli_2(1, arg1)));
2106}
2107//The total number of operations is 33.0
2108static inline bitblock128_t esimd_signextendl_2(bitblock128_t arg1)
2109{
2110        return esimd_mergel_4(simd_srai_4(2, arg1), simd_srai_4(2, simd_slli_4(2, arg1)));
2111}
2112//The total number of operations is 13.0
2113static inline bitblock128_t esimd_signextendl_4(bitblock128_t arg1)
2114{
2115        return esimd_mergel_8(simd_srai_8(4, arg1), simd_srai_8(4, simd_slli_8(4, arg1)));
2116}
2117//The total number of operations is 1.0
2118static inline bitblock128_t esimd_signextendl_8(bitblock128_t arg1)
2119{
2120        return _mm_cvtepi8_epi16(arg1);
2121}
2122//The total number of operations is 13.4166666667
2123static inline bitblock128_t esimd_signextendl_64(bitblock128_t arg1)
2124{
2125        return simd_srai_128(64, simd_slli_128(64, arg1));
2126}
2127//The total number of operations is 1.0
2128static inline bitblock128_t esimd_signextendl_16(bitblock128_t arg1)
2129{
2130        return _mm_cvtepi16_epi32(arg1);
2131}
2132//The total number of operations is 1.0
2133static inline bitblock128_t hsimd_packus_32(bitblock128_t arg1, bitblock128_t arg2)
2134{
2135        return _mm_packus_epi32(arg2, arg1);
2136}
2137//The total number of operations is 75.0
2138static inline bitblock128_t hsimd_packus_2(bitblock128_t arg1, bitblock128_t arg2)
2139{
2140        bitblock128_t arg11 = simd_ifh_2(arg1, simd_constant_2(0), arg1);
2141        bitblock128_t arg12 = simd_and(simd_lomask_2(), arg11);
2142        bitblock128_t arg21 = simd_ifh_2(arg2, simd_constant_2(0), arg2);
2143        bitblock128_t arg22 = simd_and(simd_lomask_2(), arg21);
2144        return hsimd_packl_2(simd_ifh_1(simd_eq_2(arg12, arg11), arg12, simd_lomask_2()), simd_ifh_1(simd_eq_2(arg22, arg21), arg22, simd_lomask_2()));
2145}
2146//The total number of operations is 74.3333333333
2147static inline bitblock128_t hsimd_packus_4(bitblock128_t arg1, bitblock128_t arg2)
2148{
2149        bitblock128_t hiPart = hsimd_packh_4(arg1, arg2);
2150        return simd_ifh_2(hiPart, simd_constant_2(0), simd_or(simd_gt_2(hiPart, simd_constant_2(0)), hsimd_packl_4(arg1, arg2)));
2151}
2152//The total number of operations is 25.6666666667
2153static inline bitblock128_t hsimd_packus_8(bitblock128_t arg1, bitblock128_t arg2)
2154{
2155        bitblock128_t arg11 = simd_ifh_8(arg1, simd_constant_8(0), arg1);
2156        bitblock128_t arg12 = simd_and(simd_lomask_8(), arg11);
2157        bitblock128_t arg21 = simd_ifh_8(arg2, simd_constant_8(0), arg2);
2158        bitblock128_t arg22 = simd_and(simd_lomask_8(), arg21);
2159        return hsimd_packl_8(simd_ifh_1(simd_eq_8(arg12, arg11), arg12, simd_lomask_8()), simd_ifh_1(simd_eq_8(arg22, arg21), arg22, simd_lomask_8()));
2160}
2161//The total number of operations is 12.0
2162static inline bitblock128_t hsimd_packus_64(bitblock128_t arg1, bitblock128_t arg2)
2163{
2164        bitblock128_t hiPart = hsimd_packh_64(arg1, arg2);
2165        return simd_ifh_32(hiPart, simd_constant_32(0), simd_or(simd_gt_32(hiPart, simd_constant_32(0)), hsimd_packl_64(arg1, arg2)));
2166}
2167//The total number of operations is 34.1666666667
2168static inline bitblock128_t hsimd_packus_128(bitblock128_t arg1, bitblock128_t arg2)
2169{
2170        bitblock128_t hiPart = hsimd_packh_128(arg1, arg2);
2171        return simd_ifh_64(hiPart, simd_constant_64(0), simd_or(simd_gt_64(hiPart, simd_constant_64(0)), hsimd_packl_128(arg1, arg2)));
2172}
2173//The total number of operations is 1.0
2174static inline bitblock128_t hsimd_packus_16(bitblock128_t arg1, bitblock128_t arg2)
2175{
2176        return _mm_packus_epi16(arg2, arg1);
2177}
2178//The total number of operations is 1.0
2179static inline bitblock128_t simd_abs_32(bitblock128_t arg1)
2180{
2181        return _mm_abs_epi32(arg1);
2182}
2183//The total number of operations is 7.33333333333
2184static inline bitblock128_t simd_abs_2(bitblock128_t arg1)
2185{
2186        return simd_ifh_1(simd_himask_2(), simd_and(arg1, simd_slli_128(1, simd_not(arg1))), arg1);
2187}
2188//The total number of operations is 19.0
2189static inline bitblock128_t simd_abs_4(bitblock128_t arg1)
2190{
2191        bitblock128_t gtMask = simd_gt_4(arg1, simd_constant_4(0));
2192        return simd_ifh_1(gtMask, arg1, simd_sub_4(gtMask, arg1));
2193}
2194//The total number of operations is 1.0
2195static inline bitblock128_t simd_abs_8(bitblock128_t arg1)
2196{
2197        return _mm_abs_epi8(arg1);
2198}
2199//The total number of operations is 9.0
2200static inline bitblock128_t simd_abs_64(bitblock128_t arg1)
2201{
2202        bitblock128_t eqMask = simd_eq_64(simd_ifh_1(simd_himask_64(), simd_abs_32(arg1), arg1), arg1);
2203        return simd_ifh_1(eqMask, arg1, simd_sub_64(eqMask, arg1));
2204}
2205//The total number of operations is 32.0
2206static inline bitblock128_t simd_abs_128(bitblock128_t arg1)
2207{
2208        bitblock128_t eqMask = simd_eq_128(simd_ifh_1(simd_himask_128(), simd_abs_64(arg1), arg1), arg1);
2209        return simd_ifh_1(eqMask, arg1, simd_sub_128(eqMask, arg1));
2210}
2211//The total number of operations is 1.0
2212static inline bitblock128_t simd_abs_16(bitblock128_t arg1)
2213{
2214        return _mm_abs_epi16(arg1);
2215}
2216//The total number of operations is 3.0
2217static inline bitblock128_t simd_xor_hl_32(bitblock128_t arg1)
2218{
2219        return simd_xor(simd_srli_32((16), arg1), simd_and(arg1, simd_lomask_32()));
2220}
2221//The total number of operations is 4.0
2222static inline bitblock128_t simd_xor_hl_2(bitblock128_t arg1)
2223{
2224        return simd_xor(simd_srli_2((1), arg1), simd_and(arg1, simd_lomask_2()));
2225}
2226//The total number of operations is 4.0
2227static inline bitblock128_t simd_xor_hl_4(bitblock128_t arg1)
2228{
2229        return simd_xor(simd_srli_4((2), arg1), simd_and(arg1, simd_lomask_4()));
2230}
2231//The total number of operations is 4.0
2232static inline bitblock128_t simd_xor_hl_8(bitblock128_t arg1)
2233{
2234        return simd_xor(simd_srli_8((4), arg1), simd_and(arg1, simd_lomask_8()));
2235}
2236//The total number of operations is 3.0
2237static inline bitblock128_t simd_xor_hl_64(bitblock128_t arg1)
2238{
2239        return simd_xor(simd_srli_64((32), arg1), simd_and(arg1, simd_lomask_64()));
2240}
2241//The total number of operations is 4.33333333333
2242static inline bitblock128_t simd_xor_hl_128(bitblock128_t arg1)
2243{
2244        return simd_xor(simd_srli_128((64), arg1), simd_and(arg1, simd_lomask_128()));
2245}
2246//The total number of operations is 3.0
2247static inline bitblock128_t simd_xor_hl_16(bitblock128_t arg1)
2248{
2249        return simd_xor(simd_srli_16((8), arg1), simd_and(arg1, simd_lomask_16()));
2250}
2251//The total number of operations is 10.0
2252static inline bitblock128_t simd_srai_4(uint64_t sh, bitblock128_t arg1)
2253{
2254        bitblock128_t tmp = simd_srli_4(((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)), arg1);
2255        return simd_or(tmp, simd_sub_4(simd_constant_4(0), simd_and(simd_constant_4((1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))), tmp)));
2256}
2257//The total number of operations is 5.0
2258static inline bitblock128_t simd_srai_8(uint64_t sh, bitblock128_t arg1)
2259{
2260        bitblock128_t tmp = simd_srli_8(((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)), arg1);
2261        return simd_or(tmp, simd_sub_8(simd_constant_8(0), simd_and(simd_constant_8((1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))), tmp)));
2262}
2263//The total number of operations is 1.0
2264static inline bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2)
2265{
2266        return _mm_and_si128(arg1, arg2);
2267}
2268//The total number of operations is 15.0
2269static inline bitblock128_t mvmd_fill16_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2270{
2271        return simd_or(mvmd_fill16_2((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd_fill16_2((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
2272}
2273//The total number of operations is 7.0
2274static inline bitblock128_t mvmd_fill16_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2275{
2276        return simd_or(mvmd_fill16_4((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd_fill16_4((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
2277}
2278//The total number of operations is 3.0
2279static inline bitblock128_t mvmd_fill16_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2280{
2281        return simd_or(mvmd_fill16_8((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd_fill16_8((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
2282}
2283//The total number of operations is 1.0
2284static inline bitblock128_t mvmd_fill16_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2285{
2286        return _mm_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
2287}
2288//The total number of operations is 5.0
2289static inline bitblock128_t simd_lt_32(bitblock128_t arg1, bitblock128_t arg2)
2290{
2291        return simd_and(simd_not(simd_gt_32(arg1, arg2)), simd_not(simd_eq_32(arg1, arg2)));
2292}
2293//The total number of operations is 1.0
2294static inline bitblock128_t simd_lt_1(bitblock128_t arg1, bitblock128_t arg2)
2295{
2296        return simd_andc(arg1, arg2);
2297}
2298//The total number of operations is 14.6666666667
2299static inline bitblock128_t simd_lt_2(bitblock128_t arg1, bitblock128_t arg2)
2300{
2301        bitblock128_t tmp = simd_not(arg2);
2302        bitblock128_t tmpAns = simd_or(simd_and(arg1, tmp), simd_and(simd_slli_128(1, simd_and(simd_not(arg1), arg2)), simd_or(arg1, tmp)));
2303        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(1, tmpAns));
2304}
2305//The total number of operations is 18.0
2306static inline bitblock128_t simd_lt_4(bitblock128_t arg1, bitblock128_t arg2)
2307{
2308        return simd_ifh_1(simd_himask_8(), simd_lt_8(arg1, simd_and(simd_himask_8(), arg2)), simd_lt_8(simd_slli_8(4, arg1), simd_slli_8(4, arg2)));
2309}
2310//The total number of operations is 5.0
2311static inline bitblock128_t simd_lt_8(bitblock128_t arg1, bitblock128_t arg2)
2312{
2313        return simd_and(simd_not(simd_gt_8(arg1, arg2)), simd_not(simd_eq_8(arg1, arg2)));
2314}
2315//The total number of operations is 16.5
2316static inline bitblock128_t simd_lt_64(bitblock128_t arg1, bitblock128_t arg2)
2317{
2318        bitblock128_t high_bit = simd_constant_64((9223372036854775808ULL));
2319        return simd_ult_64(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2320}
2321//The total number of operations is 50.75
2322static inline bitblock128_t simd_lt_128(bitblock128_t arg1, bitblock128_t arg2)
2323{
2324        bitblock128_t hiAns = simd_lt_64(arg1, arg2);
2325        bitblock128_t loAns = simd_ult_64(arg1, arg2);
2326        bitblock128_t mask = simd_and(loAns, simd_srli_128((64), simd_eq_64(arg1, arg2)));
2327        mask = simd_or(mask, simd_slli_128((64), mask));
2328        return simd_or(simd_srai_128((64), hiAns), mask);
2329}
2330//The total number of operations is 5.0
2331static inline bitblock128_t simd_lt_16(bitblock128_t arg1, bitblock128_t arg2)
2332{
2333        return simd_and(simd_not(simd_gt_16(arg1, arg2)), simd_not(simd_eq_16(arg1, arg2)));
2334}
2335//The total number of operations is 1.0
2336static inline bitblock128_t simd_add_32(bitblock128_t arg1, bitblock128_t arg2)
2337{
2338        return _mm_add_epi32(arg1, arg2);
2339}
2340//The total number of operations is 1.0
2341static inline bitblock128_t simd_add_1(bitblock128_t arg1, bitblock128_t arg2)
2342{
2343        return simd_xor(arg1, arg2);
2344}
2345//The total number of operations is 8.33333333333
2346static inline bitblock128_t simd_add_2(bitblock128_t arg1, bitblock128_t arg2)
2347{
2348        bitblock128_t tmp = simd_xor(arg1, arg2);
2349        return simd_ifh_1(simd_himask_2(), simd_xor(tmp, simd_slli_128(1, simd_and(arg1, arg2))), tmp);
2350}
2351//The total number of operations is 6.0
2352static inline bitblock128_t simd_add_4(bitblock128_t arg1, bitblock128_t arg2)
2353{
2354        return simd_ifh_1(simd_himask_8(), simd_add_8(arg1, simd_and(simd_himask_8(), arg2)), simd_add_8(arg1, arg2));
2355}
2356//The total number of operations is 1.0
2357static inline bitblock128_t simd_add_8(bitblock128_t arg1, bitblock128_t arg2)
2358{
2359        return _mm_add_epi8(arg1, arg2);
2360}
2361//The total number of operations is 1.0
2362static inline bitblock128_t simd_add_64(bitblock128_t arg1, bitblock128_t arg2)
2363{
2364        return _mm_add_epi64(arg1, arg2);
2365}
2366//The total number of operations is 9.33333333333
2367static inline bitblock128_t simd_add_128(bitblock128_t arg1, bitblock128_t arg2)
2368{
2369        bitblock128_t partial = simd_add_64(arg1, arg2);
2370        bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
2371        bitblock128_t carry = simd_slli_128((64), simd_srli_64((63), carryMask));
2372        return simd_add_64(partial, carry);
2373}
2374//The total number of operations is 1.0
2375static inline bitblock128_t simd_add_16(bitblock128_t arg1, bitblock128_t arg2)
2376{
2377        return _mm_add_epi16(arg1, arg2);
2378}
2379//The total number of operations is 3.0
2380static inline bitblock128_t simd_ugt_32(bitblock128_t arg1, bitblock128_t arg2)
2381{
2382        bitblock128_t high_bit = simd_constant_32((2147483648ULL));
2383        return simd_gt_32(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2384}
2385//The total number of operations is 1.0
2386static inline bitblock128_t simd_ugt_1(bitblock128_t arg1, bitblock128_t arg2)
2387{
2388        return simd_andc(arg1, arg2);
2389}
2390//The total number of operations is 13.6666666667
2391static inline bitblock128_t simd_ugt_2(bitblock128_t arg1, bitblock128_t arg2)
2392{
2393        bitblock128_t tmp = simd_not(arg2);
2394        bitblock128_t tmpAns = simd_or(simd_and(arg1, tmp), simd_and(simd_slli_128(1, simd_and(arg1, tmp)), simd_or(arg1, tmp)));
2395        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(1, tmpAns));
2396}
2397//The total number of operations is 12.0
2398static inline bitblock128_t simd_ugt_4(bitblock128_t arg1, bitblock128_t arg2)
2399{
2400        return simd_ifh_1(simd_himask_8(), simd_ugt_8(simd_and(simd_himask_8(), arg1), arg2), simd_ugt_8(simd_andc(arg1, simd_himask_8()), simd_andc(arg2, simd_himask_8())));
2401}
2402//The total number of operations is 3.0
2403static inline bitblock128_t simd_ugt_8(bitblock128_t arg1, bitblock128_t arg2)
2404{
2405        bitblock128_t high_bit = simd_constant_8((128));
2406        return simd_gt_8(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2407}
2408//The total number of operations is 13.5
2409static inline bitblock128_t simd_ugt_64(bitblock128_t arg1, bitblock128_t arg2)
2410{
2411        bitblock128_t tmpAns = simd_ugt_32(arg1, arg2);
2412        bitblock128_t mask = simd_and(tmpAns, simd_srli_64((32), simd_eq_32(arg1, arg2)));
2413        mask = simd_or(mask, simd_slli_64((32), mask));
2414        return simd_or(simd_srai_64((32), tmpAns), mask);
2415}
2416//The total number of operations is 33.25
2417static inline bitblock128_t simd_ugt_128(bitblock128_t arg1, bitblock128_t arg2)
2418{
2419        bitblock128_t tmpAns = simd_ugt_64(arg1, arg2);
2420        bitblock128_t mask = simd_and(tmpAns, simd_srli_128((64), simd_eq_64(arg1, arg2)));
2421        mask = simd_or(mask, simd_slli_128((64), mask));
2422        return simd_or(simd_srai_128((64), tmpAns), mask);
2423}
2424//The total number of operations is 3.0
2425static inline bitblock128_t simd_ugt_16(bitblock128_t arg1, bitblock128_t arg2)
2426{
2427        bitblock128_t high_bit = simd_constant_16((32768));
2428        return simd_gt_16(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2429}
2430#endif
Note: See TracBrowser for help on using the repository browser.