source: trunk/lib/idisa_c/idisa_sse3_c.h @ 3125

Last change on this file since 3125 was 3125, checked in by linmengl, 6 years ago

add IDISA C library to the wild, 'idisa128_c.h' is the main header file to use

File size: 111.4 KB
Line 
1
2/* Copyright (c) 2011, Hua Huang and Robert D. Cameron.
3   Licensed under the Academic Free License 3.0.
4   This file is generated by the IDISA+ generator;
5   modifications should be made only by changing the
6   generator configuration and data files. */
7
8#ifndef _IDISA_SSE3_C_H
9#define _IDISA_SSE3_C_H
#include "pmmintrin.h"

#include <stdbool.h>
#include <stdint.h>
13typedef __m128i bitblock128_t;
14
// Packs four selectors into one shuffle immediate: s1 is placed at bits 7..6,
// s2 at bits 5..4, s3 at bits 3..2 and s4 at bits 1..0. A selector wider than
// two bits overlaps its neighbour. Every argument is parenthesised so that an
// expression such as a|b is shifted as a whole.
#define shufflemask4(s1, s2, s3, s4) \
        (((s1)<<6) | ((s2)<<4) | ((s3)<<2) | (s4))
17
// Maps a two-field shuffle mask to the matching four-field mask:
// 3 -> 238, 2 -> 228, 1 -> 78, any other value -> 68.
// msk is parenthesised so that an expression argument is compared as a whole.
#define shufflemask4_from_shufflemask2(msk) \
        ((msk)==3 ? 238 : ((msk)==2 ? 228 : ((msk)==1 ? 78 : 68)))
20
// Compresses a mask made of eight 3-bit selectors into eight 2-bit selectors by
// keeping only the low two bits of each selector: selector i is read from bits
// 3i..3i+1 of msk and written to bits 2i..2i+1 of the result.
// msk is parenthesised so that an expression argument is shifted as a whole.
#define shufflemask8_to_shufflemask4(msk) \
        (((msk)&3) | ((((msk)>>3)&3)<<2) | ((((msk)>>6)&3)<<4) | ((((msk)>>9)&3)<<6) | \
         ((((msk)>>12)&3)<<8) | ((((msk)>>15)&3)<<10) | ((((msk)>>18)&3)<<12) | ((((msk)>>21)&3)<<14))
23
24//Declaration Starts here
25static inline bitblock128_t esimd_mergel_32(bitblock128_t arg1, bitblock128_t arg2);
26static inline bitblock128_t esimd_mergel_1(bitblock128_t arg1, bitblock128_t arg2);
27static inline bitblock128_t esimd_mergel_2(bitblock128_t arg1, bitblock128_t arg2);
28static inline bitblock128_t esimd_mergel_4(bitblock128_t arg1, bitblock128_t arg2);
29static inline bitblock128_t esimd_mergel_8(bitblock128_t arg1, bitblock128_t arg2);
30static inline bitblock128_t esimd_mergel_64(bitblock128_t arg1, bitblock128_t arg2);
31static inline bitblock128_t esimd_mergel_16(bitblock128_t arg1, bitblock128_t arg2);
32static inline bitblock128_t esimd_signextendh_32(bitblock128_t arg1);
33static inline bitblock128_t esimd_signextendh_1(bitblock128_t arg1);
34static inline bitblock128_t esimd_signextendh_2(bitblock128_t arg1);
35static inline bitblock128_t esimd_signextendh_4(bitblock128_t arg1);
36static inline bitblock128_t esimd_signextendh_8(bitblock128_t arg1);
37static inline bitblock128_t esimd_signextendh_64(bitblock128_t arg1);
38static inline bitblock128_t esimd_signextendh_16(bitblock128_t arg1);
39static inline bitblock128_t simd_max_32(bitblock128_t arg1, bitblock128_t arg2);
40static inline bitblock128_t simd_max_1(bitblock128_t arg1, bitblock128_t arg2);
41static inline bitblock128_t simd_max_2(bitblock128_t arg1, bitblock128_t arg2);
42static inline bitblock128_t simd_max_4(bitblock128_t arg1, bitblock128_t arg2);
43static inline bitblock128_t simd_max_8(bitblock128_t arg1, bitblock128_t arg2);
44static inline bitblock128_t simd_max_64(bitblock128_t arg1, bitblock128_t arg2);
45static inline bitblock128_t simd_max_128(bitblock128_t arg1, bitblock128_t arg2);
46static inline bitblock128_t simd_max_16(bitblock128_t arg1, bitblock128_t arg2);
47static inline bitblock128_t esimd_mergeh_32(bitblock128_t arg1, bitblock128_t arg2);
48static inline bitblock128_t esimd_mergeh_1(bitblock128_t arg1, bitblock128_t arg2);
49static inline bitblock128_t esimd_mergeh_2(bitblock128_t arg1, bitblock128_t arg2);
50static inline bitblock128_t esimd_mergeh_4(bitblock128_t arg1, bitblock128_t arg2);
51static inline bitblock128_t esimd_mergeh_8(bitblock128_t arg1, bitblock128_t arg2);
52static inline bitblock128_t esimd_mergeh_64(bitblock128_t arg1, bitblock128_t arg2);
53static inline bitblock128_t esimd_mergeh_16(bitblock128_t arg1, bitblock128_t arg2);
54static inline bitblock128_t simd_mult_32(bitblock128_t arg1, bitblock128_t arg2);
55static inline bitblock128_t simd_mult_1(bitblock128_t arg1, bitblock128_t arg2);
56static inline bitblock128_t simd_mult_2(bitblock128_t arg1, bitblock128_t arg2);
57static inline bitblock128_t simd_mult_4(bitblock128_t arg1, bitblock128_t arg2);
58static inline bitblock128_t simd_mult_8(bitblock128_t arg1, bitblock128_t arg2);
59static inline bitblock128_t simd_mult_64(bitblock128_t arg1, bitblock128_t arg2);
60static inline bitblock128_t simd_mult_128(bitblock128_t arg1, bitblock128_t arg2);
61static inline bitblock128_t simd_mult_16(bitblock128_t arg1, bitblock128_t arg2);
62static inline bitblock128_t hsimd_umin_hl_32(bitblock128_t arg1, bitblock128_t arg2);
63static inline bitblock128_t hsimd_umin_hl_2(bitblock128_t arg1, bitblock128_t arg2);
64static inline bitblock128_t hsimd_umin_hl_4(bitblock128_t arg1, bitblock128_t arg2);
65static inline bitblock128_t hsimd_umin_hl_8(bitblock128_t arg1, bitblock128_t arg2);
66static inline bitblock128_t hsimd_umin_hl_64(bitblock128_t arg1, bitblock128_t arg2);
67static inline bitblock128_t hsimd_umin_hl_128(bitblock128_t arg1, bitblock128_t arg2);
68static inline bitblock128_t hsimd_umin_hl_16(bitblock128_t arg1, bitblock128_t arg2);
69static inline bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
70static inline bitblock128_t simd_gt_32(bitblock128_t arg1, bitblock128_t arg2);
71static inline bitblock128_t simd_gt_1(bitblock128_t arg1, bitblock128_t arg2);
72static inline bitblock128_t simd_gt_2(bitblock128_t arg1, bitblock128_t arg2);
73static inline bitblock128_t simd_gt_4(bitblock128_t arg1, bitblock128_t arg2);
74static inline bitblock128_t simd_gt_8(bitblock128_t arg1, bitblock128_t arg2);
75static inline bitblock128_t simd_gt_64(bitblock128_t arg1, bitblock128_t arg2);
76static inline bitblock128_t simd_gt_128(bitblock128_t arg1, bitblock128_t arg2);
77static inline bitblock128_t simd_gt_16(bitblock128_t arg1, bitblock128_t arg2);
78static inline bitblock128_t simd_not(bitblock128_t arg1);
79static inline bitblock128_t bitblock_sll(bitblock128_t arg1, bitblock128_t arg2);
80static inline bitblock128_t simd_umult_32(bitblock128_t arg1, bitblock128_t arg2);
81static inline bitblock128_t simd_umult_1(bitblock128_t arg1, bitblock128_t arg2);
82static inline bitblock128_t simd_umult_2(bitblock128_t arg1, bitblock128_t arg2);
83static inline bitblock128_t simd_umult_4(bitblock128_t arg1, bitblock128_t arg2);
84static inline bitblock128_t simd_umult_8(bitblock128_t arg1, bitblock128_t arg2);
85static inline bitblock128_t simd_umult_64(bitblock128_t arg1, bitblock128_t arg2);
86static inline bitblock128_t simd_umult_16(bitblock128_t arg1, bitblock128_t arg2);
87static inline bitblock128_t hsimd_add_hl_32(bitblock128_t arg1, bitblock128_t arg2);
88static inline bitblock128_t hsimd_add_hl_2(bitblock128_t arg1, bitblock128_t arg2);
89static inline bitblock128_t hsimd_add_hl_4(bitblock128_t arg1, bitblock128_t arg2);
90static inline bitblock128_t hsimd_add_hl_8(bitblock128_t arg1, bitblock128_t arg2);
91static inline bitblock128_t hsimd_add_hl_64(bitblock128_t arg1, bitblock128_t arg2);
92static inline bitblock128_t hsimd_add_hl_128(bitblock128_t arg1, bitblock128_t arg2);
93static inline bitblock128_t hsimd_add_hl_16(bitblock128_t arg1, bitblock128_t arg2);
94static inline bitblock128_t simd_ult_32(bitblock128_t arg1, bitblock128_t arg2);
95static inline bitblock128_t simd_ult_1(bitblock128_t arg1, bitblock128_t arg2);
96static inline bitblock128_t simd_ult_2(bitblock128_t arg1, bitblock128_t arg2);
97static inline bitblock128_t simd_ult_4(bitblock128_t arg1, bitblock128_t arg2);
98static inline bitblock128_t simd_ult_8(bitblock128_t arg1, bitblock128_t arg2);
99static inline bitblock128_t simd_ult_64(bitblock128_t arg1, bitblock128_t arg2);
100static inline bitblock128_t simd_ult_128(bitblock128_t arg1, bitblock128_t arg2);
101static inline bitblock128_t simd_ult_16(bitblock128_t arg1, bitblock128_t arg2);
//The total number of operations is 1.0
// Expands to _mm_shuffle_epi32 on arg1, with msk cast to int32_t as the
// shuffle-control immediate.
#define mvmd_shufflei_32(arg1, msk) \
        _mm_shuffle_epi32((arg1), (int32_t)(msk))
105
//The total number of operations is 1.0
// Converts msk with shufflemask4_from_shufflemask2 and forwards the result to
// mvmd_shufflei_32. msk is parenthesised before being handed on so that an
// expression argument is converted as a single value.
#define mvmd_shufflei_64(arg1, msk) \
        mvmd_shufflei_32((arg1), shufflemask4_from_shufflemask2((msk)))
109
//The total number of operations is 13.6666666667
// Shuffle controlled by msk, which is read as eight 3-bit selectors.
// Two candidate results are built and merged with simd_ifh_1:
//   - _mm_shufflelo_epi16(_mm_shufflehi_epi16(arg1, ...), ...) on arg1 itself;
//   - the OR of the same two shuffles applied to arg1 shifted by 64 bits in
//     each direction (simd_slli_128 / simd_srli_128).
// The select mask comes from bit 2 of each selector: for the first four
// mvmd_fill8_16 arguments the mask is set when that bit is 1, for the last four
// when it is 0.
// NOTE(review): 131071 is 0x1FFFF, one bit wider than a 16-bit value;
// presumably mvmd_fill8_16 truncates it to 0xFFFF - confirm.
// arg1 and msk are expanded several times, so pass side-effect-free arguments.
// msk is parenthesised so that an expression argument is shifted as a whole.
#define mvmd_shufflei_16(arg1, msk) \
        simd_ifh_1( \
            mvmd_fill8_16( \
                (((((msk)>>21)&4) == 0) ? 0 : (131071)), \
                (((((msk)>>18)&4) == 0) ? 0 : (131071)), \
                (((((msk)>>15)&4) == 0) ? 0 : (131071)), \
                (((((msk)>>12)&4) == 0) ? 0 : (131071)), \
                (((((msk)>>9)&4) == 0) ? (131071) : 0), \
                (((((msk)>>6)&4) == 0) ? (131071) : 0), \
                (((((msk)>>3)&4) == 0) ? (131071) : 0), \
                ((((msk)&4) == 0) ? (131071) : 0)), \
            _mm_shufflelo_epi16( \
                _mm_shufflehi_epi16((arg1), (int32_t)(shufflemask8_to_shufflemask4((msk))>>8)), \
                (int32_t)(shufflemask8_to_shufflemask4((msk))&255)), \
            simd_or( \
                _mm_shufflehi_epi16(simd_slli_128((arg1), 64), (int32_t)(shufflemask8_to_shufflemask4((msk))>>8)), \
                _mm_shufflelo_epi16(simd_srli_128((arg1), 64), (int32_t)(shufflemask8_to_shufflemask4((msk))&255))))
113
//The total number of operations is 1.0
// Expands to _mm_srli_epi32 on arg1 with the shift count sh cast to int32_t.
#define simd_srli_32(arg1, sh) \
        _mm_srli_epi32((arg1), (int32_t)(sh))
117
//The total number of operations is 2.0
// Shifts with simd_srli_32 and then ANDs with simd_constant_2((3)>>(sh)), so
// that only the bit positions selected by (3)>>(sh) are kept.
// sh is parenthesised so that an expression argument is used as one shift count.
#define simd_srli_2(arg1, sh) \
        simd_and(simd_srli_32((arg1), (sh)), simd_constant_2(((3)>>(sh))))
121
//The total number of operations is 2.0
// Shifts with simd_srli_32 and then ANDs with simd_constant_4((15)>>(sh)), so
// that only the bit positions selected by (15)>>(sh) are kept.
// sh is parenthesised so that an expression argument is used as one shift count.
#define simd_srli_4(arg1, sh) \
        simd_and(simd_srli_32((arg1), (sh)), simd_constant_4(((15)>>(sh))))
125
//The total number of operations is 2.0
// Shifts with simd_srli_32 and then ANDs with simd_constant_8((255)>>(sh)), so
// that only the bit positions selected by (255)>>(sh) are kept.
// sh is parenthesised so that an expression argument is used as one shift count.
#define simd_srli_8(arg1, sh) \
        simd_and(simd_srli_32((arg1), (sh)), simd_constant_8(((255)>>(sh))))
129
//The total number of operations is 1.0
// Expands to _mm_srli_epi64 on arg1 with the shift count sh cast to int32_t.
#define simd_srli_64(arg1, sh) \
        _mm_srli_epi64((arg1), (int32_t)(sh))
133
//The total number of operations is 2.33333333333
// Right shift of the whole 128-bit value by sh bits, chosen from three forms:
//   - sh a multiple of 8: one byte shift, _mm_srli_si128 by sh/8;
//   - otherwise sh >= 64: byte shift by 8, then simd_srli_64 by sh&63;
//   - otherwise: simd_srli_64 by sh, ORed with the bits that cross the 64-bit
//     boundary (simd_slli_64 by (128-sh)&63, then byte shift right by 8).
// arg1 and sh are expanded several times, so pass side-effect-free arguments.
// sh is parenthesised so that an expression argument is tested as a whole.
#define simd_srli_128(arg1, sh) \
        ((((sh)%8) == 0) ? _mm_srli_si128((arg1), (int32_t)((sh)/8)) \
         : (((sh) >= 64) ? simd_srli_64(_mm_srli_si128((arg1), (int32_t)(8)), ((sh)&63)) \
            : simd_or(simd_srli_64((arg1), (sh)), \
                      _mm_srli_si128(simd_slli_64((arg1), ((128-(sh))&63)), (int32_t)(8)))))
137
//The total number of operations is 1.0
// Expands to _mm_srli_epi16 on arg1 with the shift count sh cast to int32_t.
#define simd_srli_16(arg1, sh) \
        _mm_srli_epi16((arg1), (int32_t)(sh))
141
142static inline bitblock128_t bitblock_load_unaligned(const bitblock128_t* arg1);
//The total number of operations is 3.0
// ORs mvmd_srli_32(arg1, sh) with mvmd_slli_32(arg2, 4 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dsrli_32(arg1, arg2, sh) \
        simd_or(mvmd_srli_32((arg1), (sh)), mvmd_slli_32((arg2), ((4)-(sh))))
146
//The total number of operations is 5.66666666667
// ORs mvmd_srli_2(arg1, sh) with mvmd_slli_2(arg2, 64 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dsrli_2(arg1, arg2, sh) \
        simd_or(mvmd_srli_2((arg1), (sh)), mvmd_slli_2((arg2), ((64)-(sh))))
150
//The total number of operations is 5.66666666667
// ORs mvmd_srli_4(arg1, sh) with mvmd_slli_4(arg2, 32 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dsrli_4(arg1, arg2, sh) \
        simd_or(mvmd_srli_4((arg1), (sh)), mvmd_slli_4((arg2), ((32)-(sh))))
154
//The total number of operations is 3.0
// ORs mvmd_srli_8(arg1, sh) with mvmd_slli_8(arg2, 16 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dsrli_8(arg1, arg2, sh) \
        simd_or(mvmd_srli_8((arg1), (sh)), mvmd_slli_8((arg2), ((16)-(sh))))
158
//The total number of operations is 3.0
// ORs mvmd_srli_64(arg1, sh) with mvmd_slli_64(arg2, 2 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dsrli_64(arg1, arg2, sh) \
        simd_or(mvmd_srli_64((arg1), (sh)), mvmd_slli_64((arg2), ((2)-(sh))))
162
//The total number of operations is 3.0
// ORs mvmd_srli_128(arg1, sh) with mvmd_slli_128(arg2, 1 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dsrli_128(arg1, arg2, sh) \
        simd_or(mvmd_srli_128((arg1), (sh)), mvmd_slli_128((arg2), ((1)-(sh))))
166
//The total number of operations is 3.0
// ORs mvmd_srli_16(arg1, sh) with mvmd_slli_16(arg2, 8 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dsrli_16(arg1, arg2, sh) \
        simd_or(mvmd_srli_16((arg1), (sh)), mvmd_slli_16((arg2), ((8)-(sh))))
170
//The total number of operations is 2.33333333333
// Alias for simd_srli_128. sh is forwarded in parentheses so that an
// expression argument reaches simd_srli_128 as a single value.
#define bitblock_srli(arg1, sh) \
        simd_srli_128((arg1), (sh))
174
175static inline bitblock128_t simd_ctz_32(bitblock128_t arg1);
176static inline bitblock128_t simd_ctz_1(bitblock128_t arg1);
177static inline bitblock128_t simd_ctz_2(bitblock128_t arg1);
178static inline bitblock128_t simd_ctz_4(bitblock128_t arg1);
179static inline bitblock128_t simd_ctz_8(bitblock128_t arg1);
180static inline bitblock128_t simd_ctz_64(bitblock128_t arg1);
181static inline bitblock128_t simd_ctz_128(bitblock128_t arg1);
182static inline bitblock128_t simd_ctz_16(bitblock128_t arg1);
183static inline bitblock128_t simd_sll_64(bitblock128_t arg1, bitblock128_t shift_mask);
184static inline bitblock128_t simd_sll_128(bitblock128_t arg1, bitblock128_t shift_mask);
185static inline bitblock128_t mvmd_fill_32(uint64_t val1);
186static inline bitblock128_t mvmd_fill_1(uint64_t val1);
187static inline bitblock128_t mvmd_fill_2(uint64_t val1);
188static inline bitblock128_t mvmd_fill_4(uint64_t val1);
189static inline bitblock128_t mvmd_fill_8(uint64_t val1);
190static inline bitblock128_t mvmd_fill_64(uint64_t val1);
191static inline bitblock128_t mvmd_fill_128(uint64_t val1);
192static inline bitblock128_t mvmd_fill_16(uint64_t val1);
193static inline bitblock128_t hsimd_packss_32(bitblock128_t arg1, bitblock128_t arg2);
194static inline bitblock128_t hsimd_packss_2(bitblock128_t arg1, bitblock128_t arg2);
195static inline bitblock128_t hsimd_packss_4(bitblock128_t arg1, bitblock128_t arg2);
196static inline bitblock128_t hsimd_packss_8(bitblock128_t arg1, bitblock128_t arg2);
197static inline bitblock128_t hsimd_packss_64(bitblock128_t arg1, bitblock128_t arg2);
198static inline bitblock128_t hsimd_packss_128(bitblock128_t arg1, bitblock128_t arg2);
199static inline bitblock128_t hsimd_packss_16(bitblock128_t arg1, bitblock128_t arg2);
200static inline bitblock128_t bitblock_srl(bitblock128_t arg1, bitblock128_t arg2);
201static inline void bitblock_store_aligned(bitblock128_t arg1, bitblock128_t* arg2);
202static inline bitblock128_t simd_eq_32(bitblock128_t arg1, bitblock128_t arg2);
203static inline bitblock128_t simd_eq_1(bitblock128_t arg1, bitblock128_t arg2);
204static inline bitblock128_t simd_eq_2(bitblock128_t arg1, bitblock128_t arg2);
205static inline bitblock128_t simd_eq_4(bitblock128_t arg1, bitblock128_t arg2);
206static inline bitblock128_t simd_eq_8(bitblock128_t arg1, bitblock128_t arg2);
207static inline bitblock128_t simd_eq_64(bitblock128_t arg1, bitblock128_t arg2);
208static inline bitblock128_t simd_eq_128(bitblock128_t arg1, bitblock128_t arg2);
209static inline bitblock128_t simd_eq_16(bitblock128_t arg1, bitblock128_t arg2);
210static inline bitblock128_t simd_popcount_32(bitblock128_t arg1);
211static inline bitblock128_t simd_popcount_1(bitblock128_t arg1);
212static inline bitblock128_t simd_popcount_2(bitblock128_t arg1);
213static inline bitblock128_t simd_popcount_4(bitblock128_t arg1);
214static inline bitblock128_t simd_popcount_8(bitblock128_t arg1);
215static inline bitblock128_t simd_popcount_64(bitblock128_t arg1);
216static inline bitblock128_t simd_popcount_128(bitblock128_t arg1);
217static inline bitblock128_t simd_popcount_16(bitblock128_t arg1);
218static inline bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
//The total number of operations is 2.0
// Builds the value from two mvmd_extract_16 reads: position 2*pos+1 is widened
// to uint64_t and shifted left by 16, then ORed with position 2*pos.
// pos is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_extract_32(arg1, pos) \
        ((((uint64_t)(mvmd_extract_16((arg1), ((2*(pos))+1))))<<(16))|mvmd_extract_16((arg1), (2*(pos))))
222
//The total number of operations is 1.0
// Reads mvmd_extract_2 at position pos/2 and returns its low bit when pos is
// even, or the value shifted right by 1 when pos is odd.
// pos is parenthesised so that an expression argument is halved as a whole.
#define mvmd_extract_1(arg1, pos) \
        ((((pos)%2) == 0) ? (mvmd_extract_2((arg1), ((pos)/2))&(1)) : (mvmd_extract_2((arg1), ((pos)/2))>>1))
226
//The total number of operations is 1.0
// Reads mvmd_extract_4 at position pos/2 and returns its low two bits when pos
// is even, or the value shifted right by 2 when pos is odd.
// pos is parenthesised so that an expression argument is halved as a whole.
#define mvmd_extract_2(arg1, pos) \
        ((((pos)%2) == 0) ? (mvmd_extract_4((arg1), ((pos)/2))&(3)) : (mvmd_extract_4((arg1), ((pos)/2))>>2))
230
//The total number of operations is 1.0
// Reads mvmd_extract_8 at position pos/2 and returns its low four bits when
// pos is even, or the value shifted right by 4 when pos is odd.
// pos is parenthesised so that an expression argument is halved as a whole.
#define mvmd_extract_4(arg1, pos) \
        ((((pos)%2) == 0) ? (mvmd_extract_8((arg1), ((pos)/2))&(15)) : (mvmd_extract_8((arg1), ((pos)/2))>>4))
234
//The total number of operations is 1.0
// Reads mvmd_extract_16 at position pos/2 and returns its low eight bits when
// pos is even, or the value shifted right by 8 when pos is odd.
// pos is parenthesised so that an expression argument is halved as a whole.
#define mvmd_extract_8(arg1, pos) \
        ((((pos)%2) == 0) ? (mvmd_extract_16((arg1), ((pos)/2))&(255)) : (mvmd_extract_16((arg1), ((pos)/2))>>8))
238
//The total number of operations is 4.0
// Builds the value from two mvmd_extract_32 reads: position 2*pos+1 is widened
// to uint64_t and shifted left by 32, then ORed with position 2*pos.
// pos is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_extract_64(arg1, pos) \
        ((((uint64_t)(mvmd_extract_32((arg1), ((2*(pos))+1))))<<(32))|mvmd_extract_32((arg1), (2*(pos))))
242
//The total number of operations is 1.0
// Expands to _mm_extract_epi16 at position pos (cast to int32_t) and masks the
// result with 65535 so that only the low 16 bits are returned.
#define mvmd_extract_16(arg1, pos) \
        (65535&_mm_extract_epi16((arg1), (int32_t)(pos)))
246
247static inline bitblock128_t simd_neg_32(bitblock128_t arg1);
248static inline bitblock128_t simd_neg_2(bitblock128_t arg1);
249static inline bitblock128_t simd_neg_4(bitblock128_t arg1);
250static inline bitblock128_t simd_neg_8(bitblock128_t arg1);
251static inline bitblock128_t simd_neg_64(bitblock128_t arg1);
252static inline bitblock128_t simd_neg_128(bitblock128_t arg1);
253static inline bitblock128_t simd_neg_16(bitblock128_t arg1);
//The total number of operations is 1.0
// Shuffles arg1 with a mask whose four selectors are all pos
// (shufflemask4(pos, pos, pos, pos)) via mvmd_shufflei_32.
// pos is parenthesised so that an expression argument is used as one selector.
#define mvmd_splat_32(arg1, pos) \
        mvmd_shufflei_32((arg1), shufflemask4((pos), (pos), (pos), (pos)))
257
//The total number of operations is 12.6666666667
// Shifts arg1 right by pos bits (simd_srli_128), keeps bit 0 by ANDing with
// simd_constant_128(1), and subtracts that from simd_constant_128(0) using
// simd_sub_128.
// pos is forwarded in parentheses so that an expression argument reaches
// simd_srli_128 as a single value.
#define mvmd_splat_1(arg1, pos) \
        simd_sub_128(simd_constant_128(0), simd_and(simd_constant_128(1), simd_srli_128((arg1), (pos))))
261
//The total number of operations is 13.0
// Reduces to mvmd_splat_4 at position pos/2. The value handed on is the OR of
// two terms chosen by the parity of pos:
//   even pos: simd_slli_4(arg1, 2) | (arg1 & simd_lomask_4())
//   odd pos:  simd_srli_4(arg1, 2) | (arg1 & simd_himask_4())
// arg1 and pos are expanded several times, so pass side-effect-free arguments.
// pos is parenthesised so that an expression argument is halved as a whole.
#define mvmd_splat_2(arg1, pos) \
        mvmd_splat_4(simd_or(((((pos)%2) == 0) ? simd_slli_4((arg1), 2) : simd_srli_4((arg1), 2)), \
                             ((((pos)%2) == 0) ? simd_and(simd_lomask_4(), (arg1)) : simd_and(simd_himask_4(), (arg1)))), \
                     ((pos)/2))
265
//The total number of operations is 9.0
// Reduces to mvmd_splat_8 at position pos/2. The value handed on is the OR of
// two terms chosen by the parity of pos:
//   even pos: simd_slli_8(arg1, 4) | (arg1 & simd_lomask_8())
//   odd pos:  simd_srli_8(arg1, 4) | (arg1 & simd_himask_8())
// arg1 and pos are expanded several times, so pass side-effect-free arguments.
// pos is parenthesised so that an expression argument is halved as a whole.
#define mvmd_splat_4(arg1, pos) \
        mvmd_splat_8(simd_or(((((pos)%2) == 0) ? simd_slli_8((arg1), 4) : simd_srli_8((arg1), 4)), \
                             ((((pos)%2) == 0) ? simd_and(simd_lomask_8(), (arg1)) : simd_and(simd_himask_8(), (arg1)))), \
                     ((pos)/2))
269
//The total number of operations is 5.0
// Reduces to mvmd_splat_16 at position pos/2. The value handed on is the OR of
// two terms chosen by the parity of pos:
//   even pos: simd_slli_16(arg1, 8) | (arg1 & simd_lomask_16())
//   odd pos:  simd_srli_16(arg1, 8) | (arg1 & simd_himask_16())
// arg1 and pos are expanded several times, so pass side-effect-free arguments.
// pos is parenthesised so that an expression argument is halved as a whole.
#define mvmd_splat_8(arg1, pos) \
        mvmd_splat_16(simd_or(((((pos)%2) == 0) ? simd_slli_16((arg1), 8) : simd_srli_16((arg1), 8)), \
                              ((((pos)%2) == 0) ? simd_and(simd_lomask_16(), (arg1)) : simd_and(simd_himask_16(), (arg1)))), \
                      ((pos)/2))
273
//The total number of operations is 5.0
// Merges two mvmd_splat_32 results with simd_ifh_1 under simd_himask_64():
// position 2*pos+1 is the second operand, position 2*pos the third.
// pos is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_splat_64(arg1, pos) \
        simd_ifh_1(simd_himask_64(), mvmd_splat_32((arg1), ((2*(pos))+1)), mvmd_splat_32((arg1), (2*(pos))))
277
//The total number of operations is 13.0
// Merges two mvmd_splat_64 results with simd_ifh_1 under simd_himask_128():
// position 2*pos+1 is the second operand, position 2*pos the third.
// pos is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_splat_128(arg1, pos) \
        simd_ifh_1(simd_himask_128(), mvmd_splat_64((arg1), ((2*(pos))+1)), mvmd_splat_64((arg1), (2*(pos))))
281
//The total number of operations is 2.0
// Reads the 16-bit element at pos with _mm_extract_epi16 and passes it to
// mvmd_fill_16.
#define mvmd_splat_16(arg1, pos) \
        mvmd_fill_16(_mm_extract_epi16((arg1), (int32_t)(pos)))
285
286static inline bitblock128_t hsimd_packh_32(bitblock128_t arg1, bitblock128_t arg2);
287static inline bitblock128_t hsimd_packh_2(bitblock128_t arg1, bitblock128_t arg2);
288static inline bitblock128_t hsimd_packh_4(bitblock128_t arg1, bitblock128_t arg2);
289static inline bitblock128_t hsimd_packh_8(bitblock128_t arg1, bitblock128_t arg2);
290static inline bitblock128_t hsimd_packh_64(bitblock128_t arg1, bitblock128_t arg2);
291static inline bitblock128_t hsimd_packh_128(bitblock128_t arg1, bitblock128_t arg2);
292static inline bitblock128_t hsimd_packh_16(bitblock128_t arg1, bitblock128_t arg2);
293static inline bitblock128_t simd_himask_32();
294static inline bitblock128_t simd_himask_2();
295static inline bitblock128_t simd_himask_4();
296static inline bitblock128_t simd_himask_8();
297static inline bitblock128_t simd_himask_64();
298static inline bitblock128_t simd_himask_128();
299static inline bitblock128_t simd_himask_16();
//The total number of operations is 1.0
// Expands to _mm_slli_epi32 on arg1 with the shift count sh cast to int32_t.
#define simd_slli_32(arg1, sh) \
        _mm_slli_epi32((arg1), (int32_t)(sh))
303
//The total number of operations is 2.0
// Shifts with simd_slli_32 and then ANDs with simd_constant_2(((3)<<(sh))&(3)),
// so that only the bit positions selected by that pattern are kept.
// sh is parenthesised so that an expression argument is used as one shift count.
#define simd_slli_2(arg1, sh) \
        simd_and(simd_slli_32((arg1), (sh)), simd_constant_2((((3)<<(sh))&(3))))
307
//The total number of operations is 2.0
// Shifts with simd_slli_32 and then ANDs with
// simd_constant_4(((15)<<(sh))&(15)), so that only the bit positions selected
// by that pattern are kept.
// sh is parenthesised so that an expression argument is used as one shift count.
#define simd_slli_4(arg1, sh) \
        simd_and(simd_slli_32((arg1), (sh)), simd_constant_4((((15)<<(sh))&(15))))
311
//The total number of operations is 2.0
// Shifts with simd_slli_32 and then ANDs with
// simd_constant_8(((255)<<(sh))&(255)), so that only the bit positions selected
// by that pattern are kept.
// sh is parenthesised so that an expression argument is used as one shift count.
#define simd_slli_8(arg1, sh) \
        simd_and(simd_slli_32((arg1), (sh)), simd_constant_8((((255)<<(sh))&(255))))
315
//The total number of operations is 1.0
// Expands to _mm_slli_epi64 on arg1 with the shift count sh cast to int32_t.
#define simd_slli_64(arg1, sh) \
        _mm_slli_epi64((arg1), (int32_t)(sh))
319
//The total number of operations is 2.33333333333
// Left shift of the whole 128-bit value by sh bits, chosen from three forms:
//   - sh a multiple of 8: one byte shift, _mm_slli_si128 by sh/8;
//   - otherwise sh >= 64: byte shift by 8, then simd_slli_64 by sh&63;
//   - otherwise: simd_slli_64 by sh, ORed with the bits that cross the 64-bit
//     boundary (simd_srli_64 by (128-sh)&63, then byte shift left by 8).
// arg1 and sh are expanded several times, so pass side-effect-free arguments.
// sh is parenthesised so that an expression argument is tested as a whole.
#define simd_slli_128(arg1, sh) \
        ((((sh)%8) == 0) ? _mm_slli_si128((arg1), (int32_t)((sh)/8)) \
         : (((sh) >= 64) ? simd_slli_64(_mm_slli_si128((arg1), (int32_t)(8)), ((sh)&63)) \
            : simd_or(simd_slli_64((arg1), (sh)), \
                      _mm_slli_si128(simd_srli_64((arg1), ((128-(sh))&63)), (int32_t)(8)))))
323
//The total number of operations is 1.0
// Expands to _mm_slli_epi16 on arg1 with the shift count sh cast to int32_t.
#define simd_slli_16(arg1, sh) \
        _mm_slli_epi16((arg1), (int32_t)(sh))
327
328static inline bool bitblock_all(bitblock128_t arg1);
329static inline bitblock128_t simd_ifh_32(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
330static inline bitblock128_t simd_ifh_1(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
331static inline bitblock128_t simd_ifh_2(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
332static inline bitblock128_t simd_ifh_4(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
333static inline bitblock128_t simd_ifh_8(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
334static inline bitblock128_t simd_ifh_64(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
335static inline bitblock128_t simd_ifh_128(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
336static inline bitblock128_t simd_ifh_16(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
337static inline bitblock128_t simd_sub_32(bitblock128_t arg1, bitblock128_t arg2);
338static inline bitblock128_t simd_sub_1(bitblock128_t arg1, bitblock128_t arg2);
339static inline bitblock128_t simd_sub_2(bitblock128_t arg1, bitblock128_t arg2);
340static inline bitblock128_t simd_sub_4(bitblock128_t arg1, bitblock128_t arg2);
341static inline bitblock128_t simd_sub_8(bitblock128_t arg1, bitblock128_t arg2);
342static inline bitblock128_t simd_sub_64(bitblock128_t arg1, bitblock128_t arg2);
343static inline bitblock128_t simd_sub_128(bitblock128_t arg1, bitblock128_t arg2);
344static inline bitblock128_t simd_sub_16(bitblock128_t arg1, bitblock128_t arg2);
345static inline bitblock128_t simd_add_hl_32(bitblock128_t arg1);
346static inline bitblock128_t simd_add_hl_2(bitblock128_t arg1);
347static inline bitblock128_t simd_add_hl_4(bitblock128_t arg1);
348static inline bitblock128_t simd_add_hl_8(bitblock128_t arg1);
349static inline bitblock128_t simd_add_hl_64(bitblock128_t arg1);
350static inline bitblock128_t simd_add_hl_128(bitblock128_t arg1);
351static inline bitblock128_t simd_add_hl_16(bitblock128_t arg1);
352static inline bitblock128_t simd_srl_64(bitblock128_t arg1, bitblock128_t shift_mask);
353static inline bitblock128_t simd_srl_128(bitblock128_t arg1, bitblock128_t shift_mask);
//The total number of operations is 1.0
// Forwards to mvmd_slli_16 with twice the shift count.
// sh is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_slli_32(arg1, sh) \
        mvmd_slli_16((arg1), ((sh)*2))
357
//The total number of operations is 2.33333333333
// Forwards to simd_slli_128 with a bit count of sh*2.
// sh is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_slli_2(arg1, sh) \
        simd_slli_128((arg1), ((sh)*2))
361
//The total number of operations is 2.33333333333
// Forwards to mvmd_slli_2 with twice the shift count.
// sh is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_slli_4(arg1, sh) \
        mvmd_slli_2((arg1), ((sh)*2))
365
//The total number of operations is 1.0
// Expands to _mm_slli_si128 on arg1 with sh (cast to int32_t) as the byte count.
#define mvmd_slli_8(arg1, sh) \
        _mm_slli_si128((arg1), (int32_t)(sh))
369
//The total number of operations is 1.0
// Forwards to mvmd_slli_32 with twice the shift count.
// sh is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_slli_64(arg1, sh) \
        mvmd_slli_32((arg1), ((sh)*2))
373
//The total number of operations is 1.0
// Forwards to mvmd_slli_64 with twice the shift count.
// sh is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_slli_128(arg1, sh) \
        mvmd_slli_64((arg1), ((sh)*2))
377
//The total number of operations is 1.0
// Forwards to mvmd_slli_8 with twice the shift count.
// sh is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_slli_16(arg1, sh) \
        mvmd_slli_8((arg1), ((sh)*2))
381
382static inline bitblock128_t simd_lomask_32();
383static inline bitblock128_t simd_lomask_2();
384static inline bitblock128_t simd_lomask_4();
385static inline bitblock128_t simd_lomask_8();
386static inline bitblock128_t simd_lomask_64();
387static inline bitblock128_t simd_lomask_128();
388static inline bitblock128_t simd_lomask_16();
389static inline uint64_t hsimd_signmask_32(bitblock128_t arg1);
390static inline uint64_t hsimd_signmask_4(bitblock128_t arg1);
391static inline uint64_t hsimd_signmask_8(bitblock128_t arg1);
392static inline uint64_t hsimd_signmask_64(bitblock128_t arg1);
393static inline uint64_t hsimd_signmask_128(bitblock128_t arg1);
394static inline uint64_t hsimd_signmask_16(bitblock128_t arg1);
395static inline bitblock128_t esimd_zeroextendh_32(bitblock128_t arg1);
396static inline bitblock128_t esimd_zeroextendh_1(bitblock128_t arg1);
397static inline bitblock128_t esimd_zeroextendh_2(bitblock128_t arg1);
398static inline bitblock128_t esimd_zeroextendh_4(bitblock128_t arg1);
399static inline bitblock128_t esimd_zeroextendh_8(bitblock128_t arg1);
400static inline bitblock128_t esimd_zeroextendh_64(bitblock128_t arg1);
401static inline bitblock128_t esimd_zeroextendh_16(bitblock128_t arg1);
402static inline bitblock128_t esimd_zeroextendl_32(bitblock128_t arg1);
403static inline bitblock128_t esimd_zeroextendl_1(bitblock128_t arg1);
404static inline bitblock128_t esimd_zeroextendl_2(bitblock128_t arg1);
405static inline bitblock128_t esimd_zeroextendl_4(bitblock128_t arg1);
406static inline bitblock128_t esimd_zeroextendl_8(bitblock128_t arg1);
407static inline bitblock128_t esimd_zeroextendl_64(bitblock128_t arg1);
408static inline bitblock128_t esimd_zeroextendl_16(bitblock128_t arg1);
409static inline bitblock128_t mvmd_fill4_32(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
410static inline bitblock128_t mvmd_fill4_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
411static inline bitblock128_t mvmd_fill4_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
412static inline bitblock128_t mvmd_fill4_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
413static inline bitblock128_t mvmd_fill4_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
414static inline bitblock128_t mvmd_fill4_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
415static inline bitblock128_t simd_umin_32(bitblock128_t arg1, bitblock128_t arg2);
416static inline bitblock128_t simd_umin_1(bitblock128_t arg1, bitblock128_t arg2);
417static inline bitblock128_t simd_umin_2(bitblock128_t arg1, bitblock128_t arg2);
418static inline bitblock128_t simd_umin_4(bitblock128_t arg1, bitblock128_t arg2);
419static inline bitblock128_t simd_umin_8(bitblock128_t arg1, bitblock128_t arg2);
420static inline bitblock128_t simd_umin_64(bitblock128_t arg1, bitblock128_t arg2);
421static inline bitblock128_t simd_umin_128(bitblock128_t arg1, bitblock128_t arg2);
422static inline bitblock128_t simd_umin_16(bitblock128_t arg1, bitblock128_t arg2);
//The total number of operations is 1.0
// Forwards to mvmd_srli_16 with twice the shift count.
// sh is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_srli_32(arg1, sh) \
        mvmd_srli_16((arg1), ((sh)*2))
426
//The total number of operations is 2.33333333333
// Forwards to simd_srli_128 with a bit count of sh*2.
// sh is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_srli_2(arg1, sh) \
        simd_srli_128((arg1), ((sh)*2))
430
//The total number of operations is 2.33333333333
// Forwards to simd_srli_128 with a bit count of sh*4.
// sh is parenthesised so that an expression argument is scaled as a whole.
#define mvmd_srli_4(arg1, sh) \
        simd_srli_128((arg1), ((sh)*4))
434
//The total number of operations is 1.0
// Expands to _mm_srli_si128 on arg1 with sh (cast to int32_t) as the byte count.
#define mvmd_srli_8(arg1, sh) \
        _mm_srli_si128((arg1), (int32_t)(sh))
438
//The total number of operations is 1.0
// Forwards to mvmd_srli_32 with twice the shift count.
// sh is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_srli_64(arg1, sh) \
        mvmd_srli_32((arg1), ((sh)*2))
442
//The total number of operations is 1.0
// Forwards to mvmd_srli_64 with twice the shift count.
// sh is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_srli_128(arg1, sh) \
        mvmd_srli_64((arg1), ((sh)*2))
446
//The total number of operations is 1.0
// Forwards to mvmd_srli_8 with twice the shift count.
// sh is parenthesised so that an expression argument is doubled as a whole.
#define mvmd_srli_16(arg1, sh) \
        mvmd_srli_8((arg1), ((sh)*2))
450
//The total number of operations is 0
// Expands to _mm_set1_epi32 with val cast to int32_t.
#define simd_constant_32(val) \
        _mm_set1_epi32((int32_t)(val))
454
//The total number of operations is 0
// Passes -1*val to simd_constant_32, so val == 1 yields the 32-bit pattern for
// -1 and val == 0 yields 0.
// val is parenthesised so that an expression argument is negated as a whole.
#define simd_constant_1(val) \
        simd_constant_32((-1*(val)))
458
//The total number of operations is 0
// Widens a 2-bit pattern to 4 bits and hands it to simd_constant_4.
// For val >= 0 the value passed on is (val*4)|val. For val < 0 it is
// (val*4)|(val^(-4)), which places the low two bits of val in the lower half.
// The original val<<2 is written as val*4 because left-shifting a negative
// signed value is undefined in C; the result is the same for every value the
// shift was defined for.
// val is parenthesised so that an expression argument is compared as a whole.
#define simd_constant_2(val) \
        (((val) < 0) ? simd_constant_4((((val)*4)|((val)^(-4)))) : simd_constant_4((((val)*4)|(val))))
462
//The total number of operations is 0
// Widens a 4-bit pattern to 8 bits and hands it to simd_constant_8.
// For val >= 0 the value passed on is (val*16)|val. For val < 0 it is
// (val*16)|(val^(-16)), which places the low four bits of val in the lower half.
// The original val<<4 is written as val*16 because left-shifting a negative
// signed value is undefined in C; the result is the same for every value the
// shift was defined for.
// val is parenthesised so that an expression argument is compared as a whole.
#define simd_constant_4(val) \
        (((val) < 0) ? simd_constant_8((((val)*16)|((val)^(-16)))) : simd_constant_8((((val)*16)|(val))))
466
//The total number of operations is 0
// Expands to _mm_set1_epi8 with val cast to int32_t.
#define simd_constant_8(val) \
        _mm_set1_epi8((int32_t)(val))
470
//The total number of operations is 0
// Expands to _mm_set_epi32(hi, lo, hi, lo), where lo is val cast to int32_t and
// hi is bits 63..32 of val.
// val is converted to uint64_t before the shift by 32: shifting a 32-bit int
// by its full width is undefined in C, and the conversion also sign-extends a
// negative int argument so that hi becomes all ones.
#define simd_constant_64(val) \
        _mm_set_epi32((int32_t)(((uint64_t)(val))>>32), (int32_t)(val), (int32_t)(((uint64_t)(val))>>32), (int32_t)(val))
474
//The total number of operations is 0
// Expands to _mm_set_epi32(0, 0, hi, lo), where lo is val cast to int32_t and
// hi is bits 63..32 of val. The first two arguments are always 0, so a
// negative val is not sign-extended past bit 63.
// val is converted to uint64_t before the shift by 32 because shifting a
// 32-bit int by its full width is undefined in C.
#define simd_constant_128(val) \
        _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(((uint64_t)(val))>>32), (int32_t)(val))
478
//The total number of operations is 0
// Expands to _mm_set1_epi16 with val cast to int32_t.
#define simd_constant_16(val) \
        _mm_set1_epi16((int32_t)(val))
482
483static inline bitblock128_t simd_min_32(bitblock128_t arg1, bitblock128_t arg2);
484static inline bitblock128_t simd_min_1(bitblock128_t arg1, bitblock128_t arg2);
485static inline bitblock128_t simd_min_2(bitblock128_t arg1, bitblock128_t arg2);
486static inline bitblock128_t simd_min_4(bitblock128_t arg1, bitblock128_t arg2);
487static inline bitblock128_t simd_min_8(bitblock128_t arg1, bitblock128_t arg2);
488static inline bitblock128_t simd_min_64(bitblock128_t arg1, bitblock128_t arg2);
489static inline bitblock128_t simd_min_128(bitblock128_t arg1, bitblock128_t arg2);
490static inline bitblock128_t simd_min_16(bitblock128_t arg1, bitblock128_t arg2);
491static inline bitblock128_t mvmd_fill2_32(uint64_t val1, uint64_t val2);
492static inline bitblock128_t mvmd_fill2_1(uint64_t val1, uint64_t val2);
493static inline bitblock128_t mvmd_fill2_2(uint64_t val1, uint64_t val2);
494static inline bitblock128_t mvmd_fill2_4(uint64_t val1, uint64_t val2);
495static inline bitblock128_t mvmd_fill2_8(uint64_t val1, uint64_t val2);
496static inline bitblock128_t mvmd_fill2_64(uint64_t val1, uint64_t val2);
497static inline bitblock128_t mvmd_fill2_16(uint64_t val1, uint64_t val2);
498static inline bool bitblock_any(bitblock128_t arg1);
499static inline uint64_t bitblock_popcount(bitblock128_t arg1);
//The total number of operations is 2.33333333333
// Alias for simd_slli_128. sh is forwarded in parentheses so that an
// expression argument reaches simd_slli_128 as a single value.
#define bitblock_slli(arg1, sh) \
        simd_slli_128((arg1), (sh))
503
504static inline bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
505static inline bitblock128_t hsimd_packl_32(bitblock128_t arg1, bitblock128_t arg2);
506static inline bitblock128_t hsimd_packl_2(bitblock128_t arg1, bitblock128_t arg2);
507static inline bitblock128_t hsimd_packl_4(bitblock128_t arg1, bitblock128_t arg2);
508static inline bitblock128_t hsimd_packl_8(bitblock128_t arg1, bitblock128_t arg2);
509static inline bitblock128_t hsimd_packl_64(bitblock128_t arg1, bitblock128_t arg2);
510static inline bitblock128_t hsimd_packl_128(bitblock128_t arg1, bitblock128_t arg2);
511static inline bitblock128_t hsimd_packl_16(bitblock128_t arg1, bitblock128_t arg2);
//The total number of operations is 3.0
// ORs mvmd_slli_32(arg1, sh) with mvmd_srli_32(arg2, 4 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dslli_32(arg1, arg2, sh) \
        simd_or(mvmd_slli_32((arg1), (sh)), mvmd_srli_32((arg2), ((4)-(sh))))
515
//The total number of operations is 5.66666666667
// ORs mvmd_slli_2(arg1, sh) with mvmd_srli_2(arg2, 64 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dslli_2(arg1, arg2, sh) \
        simd_or(mvmd_slli_2((arg1), (sh)), mvmd_srli_2((arg2), ((64)-(sh))))
519
//The total number of operations is 5.66666666667
// ORs mvmd_slli_4(arg1, sh) with mvmd_srli_4(arg2, 32 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dslli_4(arg1, arg2, sh) \
        simd_or(mvmd_slli_4((arg1), (sh)), mvmd_srli_4((arg2), ((32)-(sh))))
523
//The total number of operations is 3.0
// ORs mvmd_slli_8(arg1, sh) with mvmd_srli_8(arg2, 16 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dslli_8(arg1, arg2, sh) \
        simd_or(mvmd_slli_8((arg1), (sh)), mvmd_srli_8((arg2), ((16)-(sh))))
527
//The total number of operations is 3.0
// ORs mvmd_slli_64(arg1, sh) with mvmd_srli_64(arg2, 2 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dslli_64(arg1, arg2, sh) \
        simd_or(mvmd_slli_64((arg1), (sh)), mvmd_srli_64((arg2), ((2)-(sh))))
531
//The total number of operations is 3.0
// ORs mvmd_slli_128(arg1, sh) with mvmd_srli_128(arg2, 1 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dslli_128(arg1, arg2, sh) \
        simd_or(mvmd_slli_128((arg1), (sh)), mvmd_srli_128((arg2), ((1)-(sh))))
535
//The total number of operations is 3.0
// ORs mvmd_slli_16(arg1, sh) with mvmd_srli_16(arg2, 8 - sh).
// sh is parenthesised so that an expression argument is subtracted as a whole.
#define mvmd_dslli_16(arg1, arg2, sh) \
        simd_or(mvmd_slli_16((arg1), (sh)), mvmd_srli_16((arg2), ((8)-(sh))))
539
540static inline bitblock128_t mvmd_fill8_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
541static inline bitblock128_t mvmd_fill8_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
542static inline bitblock128_t mvmd_fill8_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
543static inline bitblock128_t mvmd_fill8_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
544static inline bitblock128_t mvmd_fill8_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
545static inline bitblock128_t hsimd_min_hl_32(bitblock128_t arg1, bitblock128_t arg2);
546static inline bitblock128_t hsimd_min_hl_2(bitblock128_t arg1, bitblock128_t arg2);
547static inline bitblock128_t hsimd_min_hl_4(bitblock128_t arg1, bitblock128_t arg2);
548static inline bitblock128_t hsimd_min_hl_8(bitblock128_t arg1, bitblock128_t arg2);
549static inline bitblock128_t hsimd_min_hl_64(bitblock128_t arg1, bitblock128_t arg2);
550static inline bitblock128_t hsimd_min_hl_128(bitblock128_t arg1, bitblock128_t arg2);
551static inline bitblock128_t hsimd_min_hl_16(bitblock128_t arg1, bitblock128_t arg2);
552static inline bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
553static inline bitblock128_t simd_umax_32(bitblock128_t arg1, bitblock128_t arg2);
554static inline bitblock128_t simd_umax_1(bitblock128_t arg1, bitblock128_t arg2);
555static inline bitblock128_t simd_umax_2(bitblock128_t arg1, bitblock128_t arg2);
556static inline bitblock128_t simd_umax_4(bitblock128_t arg1, bitblock128_t arg2);
557static inline bitblock128_t simd_umax_8(bitblock128_t arg1, bitblock128_t arg2);
558static inline bitblock128_t simd_umax_64(bitblock128_t arg1, bitblock128_t arg2);
559static inline bitblock128_t simd_umax_128(bitblock128_t arg1, bitblock128_t arg2);
560static inline bitblock128_t simd_umax_16(bitblock128_t arg1, bitblock128_t arg2);
561static inline bitblock128_t bitblock_load_aligned(const bitblock128_t* arg1);
562static inline void bitblock_store_unaligned(bitblock128_t arg1, bitblock128_t* arg2);
563static inline bitblock128_t esimd_signextendl_32(bitblock128_t arg1);
564static inline bitblock128_t esimd_signextendl_1(bitblock128_t arg1);
565static inline bitblock128_t esimd_signextendl_2(bitblock128_t arg1);
566static inline bitblock128_t esimd_signextendl_4(bitblock128_t arg1);
567static inline bitblock128_t esimd_signextendl_8(bitblock128_t arg1);
568static inline bitblock128_t esimd_signextendl_64(bitblock128_t arg1);
569static inline bitblock128_t esimd_signextendl_16(bitblock128_t arg1);
570static inline bitblock128_t hsimd_packus_32(bitblock128_t arg1, bitblock128_t arg2);
571static inline bitblock128_t hsimd_packus_2(bitblock128_t arg1, bitblock128_t arg2);
572static inline bitblock128_t hsimd_packus_4(bitblock128_t arg1, bitblock128_t arg2);
573static inline bitblock128_t hsimd_packus_8(bitblock128_t arg1, bitblock128_t arg2);
574static inline bitblock128_t hsimd_packus_64(bitblock128_t arg1, bitblock128_t arg2);
575static inline bitblock128_t hsimd_packus_128(bitblock128_t arg1, bitblock128_t arg2);
576static inline bitblock128_t hsimd_packus_16(bitblock128_t arg1, bitblock128_t arg2);
577static inline bitblock128_t simd_abs_32(bitblock128_t arg1);
578static inline bitblock128_t simd_abs_2(bitblock128_t arg1);
579static inline bitblock128_t simd_abs_4(bitblock128_t arg1);
580static inline bitblock128_t simd_abs_8(bitblock128_t arg1);
581static inline bitblock128_t simd_abs_64(bitblock128_t arg1);
582static inline bitblock128_t simd_abs_128(bitblock128_t arg1);
583static inline bitblock128_t simd_abs_16(bitblock128_t arg1);
584static inline bitblock128_t simd_xor_hl_32(bitblock128_t arg1);
585static inline bitblock128_t simd_xor_hl_2(bitblock128_t arg1);
586static inline bitblock128_t simd_xor_hl_4(bitblock128_t arg1);
587static inline bitblock128_t simd_xor_hl_8(bitblock128_t arg1);
588static inline bitblock128_t simd_xor_hl_64(bitblock128_t arg1);
589static inline bitblock128_t simd_xor_hl_128(bitblock128_t arg1);
590static inline bitblock128_t simd_xor_hl_16(bitblock128_t arg1);
//The total number of operations is 1.0
// Forwards to _mm_srai_epi32, converting the shift count to int32_t.
#define simd_srai_32(arg1, sh) _mm_srai_epi32((arg1), (int32_t)(sh))
594
//The total number of operations is 4.0
// Arithmetic right shift for 2-bit fields. A zero count yields arg1
// unchanged; any other count keeps the bits selected by simd_himask_2() and
// ORs in simd_srli_2(arg1, 1).
// sh and arg1 are parenthesized so that expression arguments (e.g. `a & b`)
// are compared and used as a whole. arg1 is expanded more than once, so it
// must be free of side effects.
#define simd_srai_2(arg1, sh) \
        (((sh) == 0) ? (arg1) : simd_or(simd_and(simd_himask_2(), (arg1)), simd_srli_2((arg1), 1)))
598
599static inline bitblock128_t simd_srai_4(bitblock128_t arg1, uint64_t sh);
600static inline bitblock128_t simd_srai_8(bitblock128_t arg1, uint64_t sh);
//The total number of operations is 4.5
// Arithmetic right shift for 64-bit fields, built from simd_srai_32 and
// simd_srli_64. The simd_himask_64() part takes simd_srai_32 with the count
// capped at 32; the rest is simd_srli_64 for counts up to 32, otherwise
// simd_srai_32 (count - 32) applied after a simd_srli_64 by 32.
// sh is parenthesized everywhere so expression arguments (e.g. `n-1`) pick
// the right branch and count. arg1 and sh are expanded several times and
// must be free of side effects.
#define simd_srai_64(arg1, sh) \
        simd_or(simd_and(simd_himask_64(), simd_srai_32(arg1, (((sh) < (32)) ? (sh) : (32)))), (((sh) <= (32)) ? simd_srli_64(arg1, (sh)) : simd_srai_32(simd_srli_64(arg1, (32)), ((sh)-(32)))))
604
//The total number of operations is 11.0833333333
// Arithmetic right shift for the 128-bit field, built from simd_srai_64 and
// simd_srli_128. The simd_himask_128() part takes simd_srai_64 with the
// count capped at 64; the rest is simd_srli_128 for counts up to 64,
// otherwise simd_srai_64 (count - 64) applied after a simd_srli_128 by 64.
// sh is parenthesized everywhere so expression arguments (e.g. `n-1`) pick
// the right branch and count. arg1 and sh are expanded several times and
// must be free of side effects.
#define simd_srai_128(arg1, sh) \
        simd_or(simd_and(simd_himask_128(), simd_srai_64(arg1, (((sh) < (64)) ? (sh) : (64)))), (((sh) <= (64)) ? simd_srli_128(arg1, (sh)) : simd_srai_64(simd_srli_128(arg1, (64)), ((sh)-(64)))))
608
//The total number of operations is 1.0
// Forwards to _mm_srai_epi16, converting the shift count to int32_t.
#define simd_srai_16(arg1, sh) _mm_srai_epi16((arg1), (int32_t)(sh))
612
613static inline bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
614static inline bitblock128_t mvmd_fill16_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
615static inline bitblock128_t mvmd_fill16_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
616static inline bitblock128_t mvmd_fill16_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
617static inline bitblock128_t mvmd_fill16_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
618static inline bitblock128_t simd_lt_32(bitblock128_t arg1, bitblock128_t arg2);
619static inline bitblock128_t simd_lt_1(bitblock128_t arg1, bitblock128_t arg2);
620static inline bitblock128_t simd_lt_2(bitblock128_t arg1, bitblock128_t arg2);
621static inline bitblock128_t simd_lt_4(bitblock128_t arg1, bitblock128_t arg2);
622static inline bitblock128_t simd_lt_8(bitblock128_t arg1, bitblock128_t arg2);
623static inline bitblock128_t simd_lt_64(bitblock128_t arg1, bitblock128_t arg2);
624static inline bitblock128_t simd_lt_128(bitblock128_t arg1, bitblock128_t arg2);
625static inline bitblock128_t simd_lt_16(bitblock128_t arg1, bitblock128_t arg2);
626static inline bitblock128_t simd_add_32(bitblock128_t arg1, bitblock128_t arg2);
627static inline bitblock128_t simd_add_1(bitblock128_t arg1, bitblock128_t arg2);
628static inline bitblock128_t simd_add_2(bitblock128_t arg1, bitblock128_t arg2);
629static inline bitblock128_t simd_add_4(bitblock128_t arg1, bitblock128_t arg2);
630static inline bitblock128_t simd_add_8(bitblock128_t arg1, bitblock128_t arg2);
631static inline bitblock128_t simd_add_64(bitblock128_t arg1, bitblock128_t arg2);
632static inline bitblock128_t simd_add_128(bitblock128_t arg1, bitblock128_t arg2);
633static inline bitblock128_t simd_add_16(bitblock128_t arg1, bitblock128_t arg2);
634static inline bitblock128_t simd_ugt_32(bitblock128_t arg1, bitblock128_t arg2);
635static inline bitblock128_t simd_ugt_1(bitblock128_t arg1, bitblock128_t arg2);
636static inline bitblock128_t simd_ugt_2(bitblock128_t arg1, bitblock128_t arg2);
637static inline bitblock128_t simd_ugt_4(bitblock128_t arg1, bitblock128_t arg2);
638static inline bitblock128_t simd_ugt_8(bitblock128_t arg1, bitblock128_t arg2);
639static inline bitblock128_t simd_ugt_64(bitblock128_t arg1, bitblock128_t arg2);
640static inline bitblock128_t simd_ugt_128(bitblock128_t arg1, bitblock128_t arg2);
641static inline bitblock128_t simd_ugt_16(bitblock128_t arg1, bitblock128_t arg2);
642
643//Implementation Starts here
644//The total number of operations is 1.0
645static inline bitblock128_t esimd_mergel_32(bitblock128_t arg1, bitblock128_t arg2)
646{
647        return _mm_unpacklo_epi32(arg2, arg1);
648}
649//The total number of operations is 31.0
650static inline bitblock128_t esimd_mergel_1(bitblock128_t arg1, bitblock128_t arg2)
651{
652        return esimd_mergel_2(simd_ifh_1(simd_himask_2(), arg1, simd_srli_2(arg2, 1)), simd_ifh_1(simd_himask_2(), simd_slli_2(arg1, 1), arg2));
653}
654//The total number of operations is 21.0
655static inline bitblock128_t esimd_mergel_2(bitblock128_t arg1, bitblock128_t arg2)
656{
657        return esimd_mergel_4(simd_ifh_1(simd_himask_4(), arg1, simd_srli_4(arg2, 2)), simd_ifh_1(simd_himask_4(), simd_slli_4(arg1, 2), arg2));
658}
659//The total number of operations is 11.0
660static inline bitblock128_t esimd_mergel_4(bitblock128_t arg1, bitblock128_t arg2)
661{
662        return esimd_mergel_8(simd_ifh_1(simd_himask_8(), arg1, simd_srli_8(arg2, 4)), simd_ifh_1(simd_himask_8(), simd_slli_8(arg1, 4), arg2));
663}
664//The total number of operations is 1.0
665static inline bitblock128_t esimd_mergel_8(bitblock128_t arg1, bitblock128_t arg2)
666{
667        return _mm_unpacklo_epi8(arg2, arg1);
668}
669//The total number of operations is 1.0
670static inline bitblock128_t esimd_mergel_64(bitblock128_t arg1, bitblock128_t arg2)
671{
672        return _mm_unpacklo_epi64(arg2, arg1);
673}
674//The total number of operations is 1.0
675static inline bitblock128_t esimd_mergel_16(bitblock128_t arg1, bitblock128_t arg2)
676{
677        return _mm_unpacklo_epi16(arg2, arg1);
678}
679//The total number of operations is 11.0
680static inline bitblock128_t esimd_signextendh_32(bitblock128_t arg1)
681{
682        return esimd_mergeh_64(simd_srai_64(arg1, 32), simd_srai_64(simd_slli_64(arg1, 32), 32));
683}
684//The total number of operations is 31.0
685static inline bitblock128_t esimd_signextendh_1(bitblock128_t arg1)
686{
687        return esimd_mergeh_2(simd_srai_2(arg1, 1), simd_srai_2(simd_slli_2(arg1, 1), 1));
688}
689//The total number of operations is 33.0
690static inline bitblock128_t esimd_signextendh_2(bitblock128_t arg1)
691{
692        return esimd_mergeh_4(simd_srai_4(arg1, 2), simd_srai_4(simd_slli_4(arg1, 2), 2));
693}
694//The total number of operations is 13.0
695static inline bitblock128_t esimd_signextendh_4(bitblock128_t arg1)
696{
697        return esimd_mergeh_8(simd_srai_8(arg1, 4), simd_srai_8(simd_slli_8(arg1, 4), 4));
698}
699//The total number of operations is 4.0
700static inline bitblock128_t esimd_signextendh_8(bitblock128_t arg1)
701{
702        return esimd_mergeh_16(simd_srai_16(arg1, 8), simd_srai_16(simd_slli_16(arg1, 8), 8));
703}
704//The total number of operations is 11.0833333333
705static inline bitblock128_t esimd_signextendh_64(bitblock128_t arg1)
706{
707        return simd_srai_128(arg1, 64);
708}
709//The total number of operations is 4.0
710static inline bitblock128_t esimd_signextendh_16(bitblock128_t arg1)
711{
712        return esimd_mergeh_32(simd_srai_32(arg1, 16), simd_srai_32(simd_slli_32(arg1, 16), 16));
713}
714//The total number of operations is 4.0
715static inline bitblock128_t simd_max_32(bitblock128_t arg1, bitblock128_t arg2)
716{
717        return simd_ifh_1(simd_gt_32(arg1, arg2), arg1, arg2);
718}
719//The total number of operations is 1.0
720static inline bitblock128_t simd_max_1(bitblock128_t arg1, bitblock128_t arg2)
721{
722        return simd_and(arg1, arg2);
723}
724//The total number of operations is 15.6666666667
725static inline bitblock128_t simd_max_2(bitblock128_t arg1, bitblock128_t arg2)
726{
727        return simd_ifh_1(simd_himask_2(), simd_and(arg1, arg2), simd_or(simd_and(arg2, simd_srli_128(simd_or(arg1, simd_not(arg2)), 1)), simd_and(arg1, simd_srli_128(simd_or(simd_not(arg1), arg2), 1))));
728}
729//The total number of operations is 9.0
730static inline bitblock128_t simd_max_4(bitblock128_t arg1, bitblock128_t arg2)
731{
732        bitblock128_t high_bit = simd_constant_4((8));
733        return simd_xor(simd_umax_4(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
734}
735//The total number of operations is 4.0
736static inline bitblock128_t simd_max_8(bitblock128_t arg1, bitblock128_t arg2)
737{
738        return simd_ifh_1(simd_gt_8(arg1, arg2), arg1, arg2);
739}
740//The total number of operations is 17.5
741static inline bitblock128_t simd_max_64(bitblock128_t arg1, bitblock128_t arg2)
742{
743        return simd_ifh_1(simd_gt_64(arg1, arg2), arg1, arg2);
744}
745//The total number of operations is 54.75
746static inline bitblock128_t simd_max_128(bitblock128_t arg1, bitblock128_t arg2)
747{
748        return simd_ifh_1(simd_gt_128(arg1, arg2), arg1, arg2);
749}
750//The total number of operations is 1.0
751static inline bitblock128_t simd_max_16(bitblock128_t arg1, bitblock128_t arg2)
752{
753        return _mm_max_epi16(arg1, arg2);
754}
755//The total number of operations is 1.0
756static inline bitblock128_t esimd_mergeh_32(bitblock128_t arg1, bitblock128_t arg2)
757{
758        return _mm_unpackhi_epi32(arg2, arg1);
759}
760//The total number of operations is 31.0
761static inline bitblock128_t esimd_mergeh_1(bitblock128_t arg1, bitblock128_t arg2)
762{
763        return esimd_mergeh_2(simd_ifh_1(simd_himask_2(), arg1, simd_srli_2(arg2, 1)), simd_ifh_1(simd_himask_2(), simd_slli_2(arg1, 1), arg2));
764}
765//The total number of operations is 21.0
766static inline bitblock128_t esimd_mergeh_2(bitblock128_t arg1, bitblock128_t arg2)
767{
768        return esimd_mergeh_4(simd_ifh_1(simd_himask_4(), arg1, simd_srli_4(arg2, 2)), simd_ifh_1(simd_himask_4(), simd_slli_4(arg1, 2), arg2));
769}
770//The total number of operations is 11.0
771static inline bitblock128_t esimd_mergeh_4(bitblock128_t arg1, bitblock128_t arg2)
772{
773        return esimd_mergeh_8(simd_ifh_1(simd_himask_8(), arg1, simd_srli_8(arg2, 4)), simd_ifh_1(simd_himask_8(), simd_slli_8(arg1, 4), arg2));
774}
775//The total number of operations is 1.0
776static inline bitblock128_t esimd_mergeh_8(bitblock128_t arg1, bitblock128_t arg2)
777{
778        return _mm_unpackhi_epi8(arg2, arg1);
779}
780//The total number of operations is 1.0
781static inline bitblock128_t esimd_mergeh_64(bitblock128_t arg1, bitblock128_t arg2)
782{
783        return _mm_unpackhi_epi64(arg2, arg1);
784}
785//The total number of operations is 1.0
786static inline bitblock128_t esimd_mergeh_16(bitblock128_t arg1, bitblock128_t arg2)
787{
788        return _mm_unpackhi_epi16(arg2, arg1);
789}
790//The total number of operations is 30.0
791static inline bitblock128_t simd_mult_32(bitblock128_t arg1, bitblock128_t arg2)
792{
793        bitblock128_t loMask = simd_lomask_64();
794        bitblock128_t tmpAns1 = simd_mult_64(simd_and(loMask, arg1), simd_and(loMask, arg2));
795        bitblock128_t tmpAns2 = simd_mult_64(simd_srli_64(arg1, 32), simd_srli_64(arg2, 32));
796        return simd_ifh_1(loMask, tmpAns1, simd_slli_64(tmpAns2, 32));
797}
798//The total number of operations is 1.0
799static inline bitblock128_t simd_mult_1(bitblock128_t arg1, bitblock128_t arg2)
800{
801        return simd_and(arg1, arg2);
802}
803//The total number of operations is 19.6666666667
804static inline bitblock128_t simd_mult_2(bitblock128_t arg1, bitblock128_t arg2)
805{
806        bitblock128_t tmp1 = simd_slli_128(arg1, 1);
807        bitblock128_t tmp2 = simd_slli_128(arg2, 1);
808        return simd_ifh_1(simd_himask_2(), simd_or(simd_and(tmp1, simd_and(arg2, simd_or(simd_not(arg1), simd_not(tmp2)))), simd_and(arg1, simd_and(tmp2, simd_or(simd_not(tmp1), simd_not(arg2))))), simd_and(arg1, arg2));
809}
810//The total number of operations is 31.0
811static inline bitblock128_t simd_mult_4(bitblock128_t arg1, bitblock128_t arg2)
812{
813        bitblock128_t loMask = simd_lomask_8();
814        bitblock128_t tmpAns1 = simd_mult_8(simd_and(loMask, arg1), simd_and(loMask, arg2));
815        bitblock128_t tmpAns2 = simd_mult_8(simd_srli_8(arg1, 4), simd_srli_8(arg2, 4));
816        return simd_ifh_1(loMask, tmpAns1, simd_slli_8(tmpAns2, 4));
817}
818//The total number of operations is 10.0
819static inline bitblock128_t simd_mult_8(bitblock128_t arg1, bitblock128_t arg2)
820{
821        bitblock128_t loMask = simd_lomask_16();
822        bitblock128_t tmpAns1 = simd_mult_16(simd_and(loMask, arg1), simd_and(loMask, arg2));
823        bitblock128_t tmpAns2 = simd_mult_16(simd_srli_16(arg1, 8), simd_srli_16(arg2, 8));
824        return simd_ifh_1(loMask, tmpAns1, simd_slli_16(tmpAns2, 8));
825}
826//The total number of operations is 11.0
827static inline bitblock128_t simd_mult_64(bitblock128_t arg1, bitblock128_t arg2)
828{
829        bitblock128_t loMask = simd_lomask_64();
830        bitblock128_t arg1_low = simd_and(arg1, loMask);
831        bitblock128_t arg1_high = simd_srli_64(arg1, (32));
832        bitblock128_t arg2_low = simd_and(arg2, loMask);
833        bitblock128_t arg2_high = simd_srli_64(arg2, (32));
834        bitblock128_t tmpAns1 = simd_umult_32(arg1_low, arg2_low);
835        bitblock128_t tmpAns2 = simd_slli_64(simd_umult_32(arg1_low, arg2_high), (32));
836        bitblock128_t tmpAns3 = simd_slli_64(simd_umult_32(arg1_high, arg2_low), (32));
837        return simd_add_64(tmpAns1, simd_add_64(tmpAns2, tmpAns3));
838}
839//The total number of operations is 165.0
840static inline bitblock128_t simd_mult_128(bitblock128_t arg1, bitblock128_t arg2)
841{
842        bitblock128_t loMask = simd_lomask_128();
843        bitblock128_t arg1_low = simd_and(arg1, loMask);
844        bitblock128_t arg1_high = simd_srli_128(arg1, (64));
845        bitblock128_t arg2_low = simd_and(arg2, loMask);
846        bitblock128_t arg2_high = simd_srli_128(arg2, (64));
847        bitblock128_t tmpAns1 = simd_umult_64(arg1_low, arg2_low);
848        bitblock128_t tmpAns2 = simd_slli_128(simd_umult_64(arg1_low, arg2_high), (64));
849        bitblock128_t tmpAns3 = simd_slli_128(simd_umult_64(arg1_high, arg2_low), (64));
850        return simd_add_128(tmpAns1, simd_add_128(tmpAns2, tmpAns3));
851}
852//The total number of operations is 1.0
853static inline bitblock128_t simd_mult_16(bitblock128_t arg1, bitblock128_t arg2)
854{
855        return _mm_mullo_epi16(arg1, arg2);
856}
857//The total number of operations is 37.3333333333
858static inline bitblock128_t hsimd_umin_hl_32(bitblock128_t arg1, bitblock128_t arg2)
859{
860        return simd_umin_16(hsimd_packh_32(arg1, arg2), hsimd_packl_32(arg1, arg2));
861}
862//The total number of operations is 73.0
863static inline bitblock128_t hsimd_umin_hl_2(bitblock128_t arg1, bitblock128_t arg2)
864{
865        return simd_umin_1(hsimd_packh_2(arg1, arg2), hsimd_packl_2(arg1, arg2));
866}
867//The total number of operations is 66.6666666667
868static inline bitblock128_t hsimd_umin_hl_4(bitblock128_t arg1, bitblock128_t arg2)
869{
870        return simd_umin_2(hsimd_packh_4(arg1, arg2), hsimd_packl_4(arg1, arg2));
871}
872//The total number of operations is 35.3333333333
873static inline bitblock128_t hsimd_umin_hl_8(bitblock128_t arg1, bitblock128_t arg2)
874{
875        return simd_umin_4(hsimd_packh_8(arg1, arg2), hsimd_packl_8(arg1, arg2));
876}
877//The total number of operations is 19.0
878static inline bitblock128_t hsimd_umin_hl_64(bitblock128_t arg1, bitblock128_t arg2)
879{
880        return simd_umin_32(hsimd_packh_64(arg1, arg2), hsimd_packl_64(arg1, arg2));
881}
882//The total number of operations is 30.6666666667
883static inline bitblock128_t hsimd_umin_hl_128(bitblock128_t arg1, bitblock128_t arg2)
884{
885        return simd_umin_64(hsimd_packh_128(arg1, arg2), hsimd_packl_128(arg1, arg2));
886}
887//The total number of operations is 7.0
888static inline bitblock128_t hsimd_umin_hl_16(bitblock128_t arg1, bitblock128_t arg2)
889{
890        return simd_umin_8(hsimd_packh_16(arg1, arg2), hsimd_packl_16(arg1, arg2));
891}
892//The total number of operations is 2.0
893static inline bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2)
894{
895        return simd_not(simd_or(arg1, arg2));
896}
897//The total number of operations is 1.0
898static inline bitblock128_t simd_gt_32(bitblock128_t arg1, bitblock128_t arg2)
899{
900        return _mm_cmpgt_epi32(arg1, arg2);
901}
902//The total number of operations is 1.0
903static inline bitblock128_t simd_gt_1(bitblock128_t arg1, bitblock128_t arg2)
904{
905        return simd_andc(arg2, arg1);
906}
907//The total number of operations is 14.6666666667
908static inline bitblock128_t simd_gt_2(bitblock128_t arg1, bitblock128_t arg2)
909{
910        bitblock128_t tmp = simd_not(arg1);
911        bitblock128_t tmpAns = simd_or(simd_and(tmp, arg2), simd_and(simd_slli_128(simd_and(arg1, simd_not(arg2)), 1), simd_or(tmp, arg2)));
912        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(tmpAns, 1));
913}
914//The total number of operations is 10.0
915static inline bitblock128_t simd_gt_4(bitblock128_t arg1, bitblock128_t arg2)
916{
917        return simd_ifh_1(simd_himask_8(), simd_gt_8(simd_and(simd_himask_8(), arg1), arg2), simd_gt_8(simd_slli_8(arg1, 4), simd_slli_8(arg2, 4)));
918}
919//The total number of operations is 1.0
920static inline bitblock128_t simd_gt_8(bitblock128_t arg1, bitblock128_t arg2)
921{
922        return _mm_cmpgt_epi8(arg1, arg2);
923}
924//The total number of operations is 14.5
925static inline bitblock128_t simd_gt_64(bitblock128_t arg1, bitblock128_t arg2)
926{
927        bitblock128_t hiAns = simd_gt_32(arg1, arg2);
928        bitblock128_t loAns = simd_ugt_32(arg1, arg2);
929        bitblock128_t mask = simd_and(loAns, simd_srli_64(simd_eq_32(arg1, arg2), (32)));
930        mask = simd_or(mask, simd_slli_64(mask, (32)));
931        return simd_or(simd_srai_64(hiAns, (32)), mask);
932}
933//The total number of operations is 51.75
934static inline bitblock128_t simd_gt_128(bitblock128_t arg1, bitblock128_t arg2)
935{
936        bitblock128_t hiAns = simd_gt_64(arg1, arg2);
937        bitblock128_t loAns = simd_ugt_64(arg1, arg2);
938        bitblock128_t mask = simd_and(loAns, simd_srli_128(simd_eq_64(arg1, arg2), (64)));
939        mask = simd_or(mask, simd_slli_128(mask, (64)));
940        return simd_or(simd_srai_128(hiAns, (64)), mask);
941}
942//The total number of operations is 1.0
943static inline bitblock128_t simd_gt_16(bitblock128_t arg1, bitblock128_t arg2)
944{
945        return _mm_cmpgt_epi16(arg1, arg2);
946}
947//The total number of operations is 1.0
948static inline bitblock128_t simd_not(bitblock128_t arg1)
949{
950        return simd_xor(arg1, simd_constant_32(-1));
951}
952//The total number of operations is 13.0
953static inline bitblock128_t bitblock_sll(bitblock128_t arg1, bitblock128_t arg2)
954{
955        return simd_sll_128(arg1, arg2);
956}
957//The total number of operations is 1.0
958static inline bitblock128_t simd_umult_32(bitblock128_t arg1, bitblock128_t arg2)
959{
960        return _mm_mul_epu32(arg1, arg2);
961}
962//The total number of operations is 289.0
963static inline bitblock128_t simd_umult_1(bitblock128_t arg1, bitblock128_t arg2)
964{
965        bitblock128_t loMask = simd_lomask_2();
966        bitblock128_t tmpAns1 = simd_umult_2(simd_and(loMask, arg1), simd_and(loMask, arg2));
967        bitblock128_t tmpAns2 = simd_umult_2(simd_and(loMask, simd_srli_4(arg1, (2))), simd_and(loMask, simd_srli_4(arg2, (2))));
968        return simd_or(tmpAns1, simd_slli_4(tmpAns2, (2)));
969}
970//The total number of operations is 139.0
971static inline bitblock128_t simd_umult_2(bitblock128_t arg1, bitblock128_t arg2)
972{
973        bitblock128_t loMask = simd_lomask_4();
974        bitblock128_t tmpAns1 = simd_umult_4(simd_and(loMask, arg1), simd_and(loMask, arg2));
975        bitblock128_t tmpAns2 = simd_umult_4(simd_and(loMask, simd_srli_8(arg1, (4))), simd_and(loMask, simd_srli_8(arg2, (4))));
976        return simd_or(tmpAns1, simd_slli_8(tmpAns2, (4)));
977}
978//The total number of operations is 64.0
979static inline bitblock128_t simd_umult_4(bitblock128_t arg1, bitblock128_t arg2)
980{
981        bitblock128_t loMask = simd_lomask_8();
982        bitblock128_t tmpAns1 = simd_umult_8(simd_and(loMask, arg1), simd_and(loMask, arg2));
983        bitblock128_t tmpAns2 = simd_umult_8(simd_and(loMask, simd_srli_16(arg1, (8))), simd_and(loMask, simd_srli_16(arg2, (8))));
984        return simd_or(tmpAns1, simd_slli_16(tmpAns2, (8)));
985}
986//The total number of operations is 28.0
987static inline bitblock128_t simd_umult_8(bitblock128_t arg1, bitblock128_t arg2)
988{
989        bitblock128_t loMask = simd_lomask_16();
990        bitblock128_t tmpAns1 = simd_umult_16(simd_and(loMask, arg1), simd_and(loMask, arg2));
991        bitblock128_t tmpAns2 = simd_umult_16(simd_and(loMask, simd_srli_32(arg1, (16))), simd_and(loMask, simd_srli_32(arg2, (16))));
992        return simd_or(tmpAns1, simd_slli_32(tmpAns2, (16)));
993}
994//The total number of operations is 45.0
995static inline bitblock128_t simd_umult_64(bitblock128_t arg1, bitblock128_t arg2)
996{
997        bitblock128_t loMask1 = simd_lomask_128();
998        bitblock128_t arg11 = simd_and(arg1, loMask1);
999        bitblock128_t arg22 = simd_and(arg2, loMask1);
1000        bitblock128_t loMask2 = simd_lomask_64();
1001        bitblock128_t arg1_low = simd_and(arg11, loMask2);
1002        bitblock128_t arg1_high = simd_srli_64(arg11, (32));
1003        bitblock128_t arg2_low = simd_and(arg22, loMask2);
1004        bitblock128_t arg2_high = simd_srli_64(arg22, (32));
1005        bitblock128_t tmpAns1 = simd_umult_32(arg1_low, arg2_low);
1006        bitblock128_t tmpAns2 = simd_slli_128(simd_umult_32(arg1_low, arg2_high), (32));
1007        bitblock128_t tmpAns3 = simd_slli_128(simd_umult_32(arg1_high, arg2_low), (32));
1008        bitblock128_t tmpAns4 = simd_slli_128(simd_umult_32(arg1_high, arg2_high), 64);
1009        return simd_add_128(tmpAns1, simd_add_128(tmpAns2, simd_add_128(tmpAns3, tmpAns4)));
1010}
1011//The total number of operations is 10.0
1012static inline bitblock128_t simd_umult_16(bitblock128_t arg1, bitblock128_t arg2)
1013{
1014        bitblock128_t loMask = simd_lomask_32();
1015        bitblock128_t tmpAns1 = simd_umult_32(simd_and(loMask, arg1), simd_and(loMask, arg2));
1016        bitblock128_t tmpAns2 = simd_umult_32(simd_and(loMask, simd_srli_64(arg1, (32))), simd_and(loMask, simd_srli_64(arg2, (32))));
1017        return simd_or(tmpAns1, simd_slli_64(tmpAns2, (32)));
1018}
1019//The total number of operations is 34.3333333333
1020static inline bitblock128_t hsimd_add_hl_32(bitblock128_t arg1, bitblock128_t arg2)
1021{
1022        return simd_add_16(hsimd_packh_32(arg1, arg2), hsimd_packl_32(arg1, arg2));
1023}
1024//The total number of operations is 73.0
1025static inline bitblock128_t hsimd_add_hl_2(bitblock128_t arg1, bitblock128_t arg2)
1026{
1027        return simd_add_1(hsimd_packh_2(arg1, arg2), hsimd_packl_2(arg1, arg2));
1028}
1029//The total number of operations is 59.0
1030static inline bitblock128_t hsimd_add_hl_4(bitblock128_t arg1, bitblock128_t arg2)
1031{
1032        return simd_add_2(hsimd_packh_4(arg1, arg2), hsimd_packl_4(arg1, arg2));
1033}
1034//The total number of operations is 35.3333333333
1035static inline bitblock128_t hsimd_add_hl_8(bitblock128_t arg1, bitblock128_t arg2)
1036{
1037        return simd_add_4(hsimd_packh_8(arg1, arg2), hsimd_packl_8(arg1, arg2));
1038}
1039//The total number of operations is 13.0
1040static inline bitblock128_t hsimd_add_hl_64(bitblock128_t arg1, bitblock128_t arg2)
1041{
1042        return simd_add_32(hsimd_packh_64(arg1, arg2), hsimd_packl_64(arg1, arg2));
1043}
1044//The total number of operations is 11.6666666667
1045static inline bitblock128_t hsimd_add_hl_128(bitblock128_t arg1, bitblock128_t arg2)
1046{
1047        return simd_add_64(hsimd_packh_128(arg1, arg2), hsimd_packl_128(arg1, arg2));
1048}
1049//The total number of operations is 7.0
1050static inline bitblock128_t hsimd_add_hl_16(bitblock128_t arg1, bitblock128_t arg2)
1051{
1052        return simd_add_8(hsimd_packh_16(arg1, arg2), hsimd_packl_16(arg1, arg2));
1053}
1054//The total number of operations is 7.0
1055static inline bitblock128_t simd_ult_32(bitblock128_t arg1, bitblock128_t arg2)
1056{
1057        bitblock128_t high_bit = simd_constant_32((2147483648ULL));
1058        return simd_lt_32(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1059}
1060//The total number of operations is 1.0
1061static inline bitblock128_t simd_ult_1(bitblock128_t arg1, bitblock128_t arg2)
1062{
1063        return simd_andc(arg2, arg1);
1064}
1065//The total number of operations is 13.6666666667
1066static inline bitblock128_t simd_ult_2(bitblock128_t arg1, bitblock128_t arg2)
1067{
1068        bitblock128_t tmp = simd_not(arg1);
1069        bitblock128_t tmpAns = simd_or(simd_and(tmp, arg2), simd_and(simd_slli_128(simd_and(tmp, arg2), 1), simd_or(tmp, arg2)));
1070        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(tmpAns, 1));
1071}
1072//The total number of operations is 20.0
1073static inline bitblock128_t simd_ult_4(bitblock128_t arg1, bitblock128_t arg2)
1074{
1075        return simd_ifh_1(simd_himask_8(), simd_ult_8(arg1, simd_and(simd_himask_8(), arg2)), simd_ult_8(simd_andc(arg1, simd_himask_8()), simd_andc(arg2, simd_himask_8())));
1076}
1077//The total number of operations is 7.0
1078static inline bitblock128_t simd_ult_8(bitblock128_t arg1, bitblock128_t arg2)
1079{
1080        bitblock128_t high_bit = simd_constant_8((128));
1081        return simd_lt_8(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1082}
1083//The total number of operations is 17.5
1084static inline bitblock128_t simd_ult_64(bitblock128_t arg1, bitblock128_t arg2)
1085{
1086        bitblock128_t tmpAns = simd_ult_32(arg1, arg2);
1087        bitblock128_t mask = simd_and(tmpAns, simd_srli_64(simd_eq_32(arg1, arg2), (32)));
1088        mask = simd_or(mask, simd_slli_64(mask, (32)));
1089        return simd_or(simd_srai_64(tmpAns, (32)), mask);
1090}
1091//The total number of operations is 40.0833333333
1092static inline bitblock128_t simd_ult_128(bitblock128_t arg1, bitblock128_t arg2)
1093{
1094        return simd_and(simd_srai_128(simd_or(simd_and(simd_not(arg1), arg2), simd_and(simd_not(simd_xor(arg1, arg2)), simd_sub_128(arg1, arg2))), (127)), simd_not(simd_eq_128(arg1, arg2)));
1095}
1096//The total number of operations is 7.0
1097static inline bitblock128_t simd_ult_16(bitblock128_t arg1, bitblock128_t arg2)
1098{
1099        bitblock128_t high_bit = simd_constant_16((32768));
1100        return simd_lt_16(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1101}
1102//The total number of operations is 1.0
1103static inline bitblock128_t bitblock_load_unaligned(const bitblock128_t* arg1)
1104{
1105        return _mm_loadu_si128((bitblock128_t*)(arg1));
1106}
1107//The total number of operations is 19.0
1108static inline bitblock128_t simd_ctz_32(bitblock128_t arg1)
1109{
1110        return simd_popcount_32(simd_andc(simd_sub_32(arg1, simd_constant_32(1)), arg1));
1111}
1112//The total number of operations is 1.0
1113static inline bitblock128_t simd_ctz_1(bitblock128_t arg1)
1114{
1115        return simd_not(arg1);
1116}
1117//The total number of operations is 10.6666666667
1118static inline bitblock128_t simd_ctz_2(bitblock128_t arg1)
1119{
1120        bitblock128_t tmp = simd_not(arg1);
1121        return simd_ifh_1(simd_himask_2(), simd_and(tmp, simd_slli_128(tmp, 1)), simd_and(simd_srli_128(arg1, 1), tmp));
1122}
1123//The total number of operations is 14.0
1124static inline bitblock128_t simd_ctz_4(bitblock128_t arg1)
1125{
1126        return simd_popcount_4(simd_andc(simd_sub_4(arg1, simd_constant_4(1)), arg1));
1127}
1128//The total number of operations is 13.0
1129static inline bitblock128_t simd_ctz_8(bitblock128_t arg1)
1130{
1131        return simd_popcount_8(simd_andc(simd_sub_8(arg1, simd_constant_8(1)), arg1));
1132}
1133//The total number of operations is 14.0
1134static inline bitblock128_t simd_ctz_64(bitblock128_t arg1)
1135{
1136        return simd_popcount_64(simd_andc(simd_sub_64(arg1, simd_constant_64(1)), arg1));
1137}
1138//The total number of operations is 26.6666666667
1139static inline bitblock128_t simd_ctz_128(bitblock128_t arg1)
1140{
1141        return simd_popcount_128(simd_andc(simd_sub_128(arg1, simd_constant_128(1)), arg1));
1142}
1143//The total number of operations is 16.0
1144static inline bitblock128_t simd_ctz_16(bitblock128_t arg1)
1145{
1146        return simd_popcount_16(simd_andc(simd_sub_16(arg1, simd_constant_16(1)), arg1));
1147}
1148//The total number of operations is 10.0
1149static inline bitblock128_t simd_sll_64(bitblock128_t arg1, bitblock128_t shift_mask)
1150{
1151        return simd_ifh_1(simd_himask_128(), _mm_sll_epi64(arg1, simd_and(_mm_srli_si128(shift_mask, (int32_t)(8)), _mm_cvtsi32_si128((int32_t)(63)))), _mm_sll_epi64(arg1, simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(63)))));
1152}
1153//The total number of operations is 13.0
1154static inline bitblock128_t simd_sll_128(bitblock128_t arg1, bitblock128_t shift_mask)
1155{
1156        bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(127)));
1157        return simd_or(_mm_sll_epi64(arg1, shift), simd_or(_mm_slli_si128(_mm_sll_epi64(arg1, simd_sub_32(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_slli_si128(_mm_srl_epi64(arg1, simd_sub_32(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
1158}
1159//The total number of operations is 1.0
1160static inline bitblock128_t mvmd_fill_32(uint64_t val1)
1161{
1162        return _mm_set1_epi32((int32_t)(val1));
1163}
1164//The total number of operations is 1.0
1165static inline bitblock128_t mvmd_fill_1(uint64_t val1)
1166{
1167        return mvmd_fill_32((-1*val1));
1168}
1169//The total number of operations is 1.0
1170static inline bitblock128_t mvmd_fill_2(uint64_t val1)
1171{
1172        return mvmd_fill_4(((val1<<2)|val1));
1173}
1174//The total number of operations is 1.0
1175static inline bitblock128_t mvmd_fill_4(uint64_t val1)
1176{
1177        return mvmd_fill_8(((val1<<4)|val1));
1178}
1179//The total number of operations is 1.0
1180static inline bitblock128_t mvmd_fill_8(uint64_t val1)
1181{
1182        return _mm_set1_epi8((int32_t)(val1));
1183}
1184//The total number of operations is 1.0
1185static inline bitblock128_t mvmd_fill_64(uint64_t val1)
1186{
1187        return _mm_set_epi32((int32_t)((val1>>32)), (int32_t)(val1), (int32_t)((val1>>32)), (int32_t)(val1));
1188}
1189//The total number of operations is 1.0
1190static inline bitblock128_t mvmd_fill_128(uint64_t val1)
1191{
1192        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((val1>>32)), (int32_t)(val1));
1193}
1194//The total number of operations is 1.0
1195static inline bitblock128_t mvmd_fill_16(uint64_t val1)
1196{
1197        return _mm_set1_epi16((int32_t)(val1));
1198}
1199//The total number of operations is 1.0
1200static inline bitblock128_t hsimd_packss_32(bitblock128_t arg1, bitblock128_t arg2)
1201{
1202        return _mm_packs_epi32(arg2, arg1);
1203}
1204//The total number of operations is 108.666666667
1205static inline bitblock128_t hsimd_packss_2(bitblock128_t arg1, bitblock128_t arg2)
1206{
1207        bitblock128_t hiBound = simd_srli_2(simd_lomask_2(), 1);
1208        bitblock128_t loBound = simd_not(hiBound);
1209        return hsimd_packl_2(simd_ifh_1(simd_gt_2(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_2(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_2(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_2(arg2, loBound), arg2, loBound)));
1210}
1211//The total number of operations is 79.3333333333
1212static inline bitblock128_t hsimd_packss_4(bitblock128_t arg1, bitblock128_t arg2)
1213{
1214        bitblock128_t hiBound = simd_srli_4(simd_lomask_4(), 1);
1215        bitblock128_t loBound = simd_not(hiBound);
1216        return hsimd_packl_4(simd_ifh_1(simd_gt_4(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_4(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_4(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_4(arg2, loBound), arg2, loBound)));
1217}
1218//The total number of operations is 32.6666666667
1219static inline bitblock128_t hsimd_packss_8(bitblock128_t arg1, bitblock128_t arg2)
1220{
1221        bitblock128_t hiBound = simd_srli_8(simd_lomask_8(), 1);
1222        bitblock128_t loBound = simd_not(hiBound);
1223        return hsimd_packl_8(simd_ifh_1(simd_gt_8(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_8(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_8(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_8(arg2, loBound), arg2, loBound)));
1224}
1225//The total number of operations is 77.0
1226static inline bitblock128_t hsimd_packss_64(bitblock128_t arg1, bitblock128_t arg2)
1227{
1228        bitblock128_t hiBound = simd_srli_64(simd_lomask_64(), 1);
1229        bitblock128_t loBound = simd_not(hiBound);
1230        return hsimd_packl_64(simd_ifh_1(simd_gt_64(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_64(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_64(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_64(arg2, loBound), arg2, loBound)));
1231}
1232//The total number of operations is 227.666666667
1233static inline bitblock128_t hsimd_packss_128(bitblock128_t arg1, bitblock128_t arg2)
1234{
1235        bitblock128_t hiBound = simd_srli_128(simd_lomask_128(), 1);
1236        bitblock128_t loBound = simd_not(hiBound);
1237        return hsimd_packl_128(simd_ifh_1(simd_gt_128(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_128(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_128(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_128(arg2, loBound), arg2, loBound)));
1238}
1239//The total number of operations is 1.0
1240static inline bitblock128_t hsimd_packss_16(bitblock128_t arg1, bitblock128_t arg2)
1241{
1242        return _mm_packs_epi16(arg2, arg1);
1243}
1244//The total number of operations is 13.0
1245static inline bitblock128_t bitblock_srl(bitblock128_t arg1, bitblock128_t arg2)
1246{
1247        return simd_srl_128(arg1, arg2);
1248}
1249//The total number of operations is 1.0
1250static inline void bitblock_store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
1251{
1252        _mm_store_si128((bitblock128_t*)(arg2), arg1);
1253}
1254//The total number of operations is 1.0
1255static inline bitblock128_t simd_eq_32(bitblock128_t arg1, bitblock128_t arg2)
1256{
1257        return _mm_cmpeq_epi32(arg1, arg2);
1258}
1259//The total number of operations is 2.0
1260static inline bitblock128_t simd_eq_1(bitblock128_t arg1, bitblock128_t arg2)
1261{
1262        return simd_not(simd_xor(arg1, arg2));
1263}
1264//The total number of operations is 8.0
1265static inline bitblock128_t simd_eq_2(bitblock128_t arg1, bitblock128_t arg2)
1266{
1267        bitblock128_t tmpAns = simd_eq_1(arg1, arg2);
1268        bitblock128_t loMask = simd_and(tmpAns, simd_srli_2(tmpAns, (1)));
1269        bitblock128_t hiMask = simd_slli_2(loMask, (1));
1270        return simd_or(loMask, hiMask);
1271}
1272//The total number of operations is 9.0
1273static inline bitblock128_t simd_eq_4(bitblock128_t arg1, bitblock128_t arg2)
1274{
1275        return simd_or(simd_and(simd_himask_8(), simd_eq_8(simd_and(simd_himask_8(), arg1), simd_and(simd_himask_8(), arg2))), simd_and(simd_lomask_8(), simd_eq_8(simd_and(simd_lomask_8(), arg1), simd_and(simd_lomask_8(), arg2))));
1276}
1277//The total number of operations is 1.0
1278static inline bitblock128_t simd_eq_8(bitblock128_t arg1, bitblock128_t arg2)
1279{
1280        return _mm_cmpeq_epi8(arg1, arg2);
1281}
1282//The total number of operations is 5.0
1283static inline bitblock128_t simd_eq_64(bitblock128_t arg1, bitblock128_t arg2)
1284{
1285        bitblock128_t tmpAns = simd_eq_32(arg1, arg2);
1286        bitblock128_t loMask = simd_and(tmpAns, simd_srli_64(tmpAns, (32)));
1287        bitblock128_t hiMask = simd_slli_64(loMask, (32));
1288        return simd_or(loMask, hiMask);
1289}
1290//The total number of operations is 11.6666666667
1291static inline bitblock128_t simd_eq_128(bitblock128_t arg1, bitblock128_t arg2)
1292{
1293        bitblock128_t tmpAns = simd_eq_64(arg1, arg2);
1294        bitblock128_t loMask = simd_and(tmpAns, simd_srli_128(tmpAns, (64)));
1295        bitblock128_t hiMask = simd_slli_128(loMask, (64));
1296        return simd_or(loMask, hiMask);
1297}
1298//The total number of operations is 1.0
1299static inline bitblock128_t simd_eq_16(bitblock128_t arg1, bitblock128_t arg2)
1300{
1301        return _mm_cmpeq_epi16(arg1, arg2);
1302}
1303//The total number of operations is 17.0
1304static inline bitblock128_t simd_popcount_32(bitblock128_t arg1)
1305{
1306        return simd_add_hl_32(simd_popcount_16(arg1));
1307}
1308//The total number of operations is 0
1309static inline bitblock128_t simd_popcount_1(bitblock128_t arg1)
1310{
1311        return arg1;
1312}
1313//The total number of operations is 3.0
1314static inline bitblock128_t simd_popcount_2(bitblock128_t arg1)
1315{
1316        return simd_add_hl_2(simd_popcount_1(arg1));
1317}
1318//The total number of operations is 7.0
1319static inline bitblock128_t simd_popcount_4(bitblock128_t arg1)
1320{
1321        return simd_add_hl_4(simd_popcount_2(arg1));
1322}
1323//The total number of operations is 11.0
1324static inline bitblock128_t simd_popcount_8(bitblock128_t arg1)
1325{
1326        return simd_add_hl_8(simd_popcount_4(arg1));
1327}
1328//The total number of operations is 12.0
1329static inline bitblock128_t simd_popcount_64(bitblock128_t arg1)
1330{
1331        return _mm_sad_epu8(simd_popcount_8(arg1), simd_constant_8(0));
1332}
1333//The total number of operations is 16.3333333333
1334static inline bitblock128_t simd_popcount_128(bitblock128_t arg1)
1335{
1336        bitblock128_t tmpAns = simd_popcount_64(arg1);
1337        return simd_add_64(simd_and(tmpAns, simd_lomask_128()), simd_srli_128(tmpAns, (64)));
1338}
1339//The total number of operations is 14.0
1340static inline bitblock128_t simd_popcount_16(bitblock128_t arg1)
1341{
1342        return simd_add_hl_16(simd_popcount_8(arg1));
1343}
1344//The total number of operations is 1.0
1345static inline bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
1346{
1347        return _mm_andnot_si128(arg2, arg1);
1348}
1349//The total number of operations is 1.0
1350static inline bitblock128_t simd_neg_32(bitblock128_t arg1)
1351{
1352        return simd_sub_32(simd_constant_32(0), arg1);
1353}
1354//The total number of operations is 6.33333333333
1355static inline bitblock128_t simd_neg_2(bitblock128_t arg1)
1356{
1357        return simd_ifh_1(simd_himask_2(), simd_xor(arg1, simd_slli_128(arg1, 1)), arg1);
1358}
1359//The total number of operations is 6.0
1360static inline bitblock128_t simd_neg_4(bitblock128_t arg1)
1361{
1362        return simd_sub_4(simd_constant_4(0), arg1);
1363}
1364//The total number of operations is 1.0
1365static inline bitblock128_t simd_neg_8(bitblock128_t arg1)
1366{
1367        return simd_sub_8(simd_constant_8(0), arg1);
1368}
1369//The total number of operations is 1.0
1370static inline bitblock128_t simd_neg_64(bitblock128_t arg1)
1371{
1372        return simd_sub_64(simd_constant_64(0), arg1);
1373}
1374//The total number of operations is 9.33333333333
1375static inline bitblock128_t simd_neg_128(bitblock128_t arg1)
1376{
1377        return simd_sub_128(simd_constant_128(0), arg1);
1378}
1379//The total number of operations is 1.0
1380static inline bitblock128_t simd_neg_16(bitblock128_t arg1)
1381{
1382        return simd_sub_16(simd_constant_16(0), arg1);
1383}
1384//The total number of operations is 17.6666666667
1385static inline bitblock128_t hsimd_packh_32(bitblock128_t arg1, bitblock128_t arg2)
1386{
1387        return hsimd_packl_32(simd_srli_64(arg1, (16)), simd_srli_64(arg2, (16)));
1388}
1389//The total number of operations is 37.0
1390static inline bitblock128_t hsimd_packh_2(bitblock128_t arg1, bitblock128_t arg2)
1391{
1392        return hsimd_packl_2(simd_srli_64(arg1, (1)), simd_srli_64(arg2, (1)));
1393}
1394//The total number of operations is 26.3333333333
1395static inline bitblock128_t hsimd_packh_4(bitblock128_t arg1, bitblock128_t arg2)
1396{
1397        return hsimd_packl_4(simd_srli_64(arg1, (2)), simd_srli_64(arg2, (2)));
1398}
1399//The total number of operations is 15.6666666667
1400static inline bitblock128_t hsimd_packh_8(bitblock128_t arg1, bitblock128_t arg2)
1401{
1402        return hsimd_packl_8(simd_srli_64(arg1, (4)), simd_srli_64(arg2, (4)));
1403}
1404//The total number of operations is 7.0
1405static inline bitblock128_t hsimd_packh_64(bitblock128_t arg1, bitblock128_t arg2)
1406{
1407        return hsimd_packl_64(simd_srli_64(arg1, (32)), simd_srli_64(arg2, (32)));
1408}
1409//The total number of operations is 5.33333333333
1410static inline bitblock128_t hsimd_packh_128(bitblock128_t arg1, bitblock128_t arg2)
1411{
1412        return simd_ifh_1(simd_himask_128(), arg1, simd_srli_128(arg2, (64)));
1413}
1414//The total number of operations is 3.0
1415static inline bitblock128_t hsimd_packh_16(bitblock128_t arg1, bitblock128_t arg2)
1416{
1417        return hsimd_packus_16(simd_srli_16(arg1, (8)), simd_srli_16(arg2, (8)));
1418}
1419//The total number of operations is 0
1420static inline bitblock128_t simd_himask_32()
1421{
1422        return simd_constant_32(-65536);
1423}
1424//The total number of operations is 0
1425static inline bitblock128_t simd_himask_2()
1426{
1427        return simd_constant_2((2));
1428}
1429//The total number of operations is 0
1430static inline bitblock128_t simd_himask_4()
1431{
1432        return simd_constant_4((12));
1433}
1434//The total number of operations is 0
1435static inline bitblock128_t simd_himask_8()
1436{
1437        return simd_constant_8((240));
1438}
1439//The total number of operations is 0
1440static inline bitblock128_t simd_himask_64()
1441{
1442        return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
1443}
1444//The total number of operations is 0
1445static inline bitblock128_t simd_himask_128()
1446{
1447        return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
1448}
1449//The total number of operations is 0
1450static inline bitblock128_t simd_himask_16()
1451{
1452        return simd_constant_16((65280));
1453}
1454//The total number of operations is 2.0
1455static inline bool bitblock_all(bitblock128_t arg1)
1456{
1457        return hsimd_signmask_8(simd_eq_8(arg1, simd_constant_8(-1))) == 65535;
1458}
1459//The total number of operations is 4.0
1460static inline bitblock128_t simd_ifh_32(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1461{
1462        return simd_ifh_1(simd_gt_32(simd_constant_32(0), arg1), arg2, arg3);
1463}
1464//The total number of operations is 3.0
1465static inline bitblock128_t simd_ifh_1(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1466{
1467        return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
1468}
1469//The total number of operations is 8.0
1470static inline bitblock128_t simd_ifh_2(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1471{
1472        return simd_ifh_1(simd_ifh_1(simd_himask_2(), arg1, simd_srli_2(arg1, (1))), arg2, arg3);
1473}
1474//The total number of operations is 13.0
1475static inline bitblock128_t simd_ifh_4(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1476{
1477        return simd_ifh_1(simd_gt_4(simd_constant_4(0), arg1), arg2, arg3);
1478}
1479//The total number of operations is 4.0
1480static inline bitblock128_t simd_ifh_8(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1481{
1482        return simd_ifh_1(simd_gt_8(simd_constant_8(0), arg1), arg2, arg3);
1483}
1484//The total number of operations is 8.0
1485static inline bitblock128_t simd_ifh_64(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1486{
1487        return simd_ifh_32(simd_ifh_1(simd_himask_64(), arg1, simd_srli_64(arg1, (32))), arg2, arg3);
1488}
1489//The total number of operations is 13.3333333333
1490static inline bitblock128_t simd_ifh_128(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1491{
1492        return simd_ifh_64(simd_ifh_1(simd_himask_128(), arg1, simd_srli_128(arg1, (64))), arg2, arg3);
1493}
1494//The total number of operations is 4.0
1495static inline bitblock128_t simd_ifh_16(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1496{
1497        return simd_ifh_1(simd_gt_16(simd_constant_16(0), arg1), arg2, arg3);
1498}
1499//The total number of operations is 1.0
1500static inline bitblock128_t simd_sub_32(bitblock128_t arg1, bitblock128_t arg2)
1501{
1502        return _mm_sub_epi32(arg1, arg2);
1503}
1504//The total number of operations is 1.0
1505static inline bitblock128_t simd_sub_1(bitblock128_t arg1, bitblock128_t arg2)
1506{
1507        return simd_xor(arg1, arg2);
1508}
1509//The total number of operations is 9.33333333333
1510static inline bitblock128_t simd_sub_2(bitblock128_t arg1, bitblock128_t arg2)
1511{
1512        bitblock128_t tmp = simd_xor(arg1, arg2);
1513        return simd_ifh_1(simd_himask_2(), simd_xor(tmp, simd_slli_128(simd_and(simd_not(arg1), arg2), 1)), tmp);
1514}
1515//The total number of operations is 6.0
1516static inline bitblock128_t simd_sub_4(bitblock128_t arg1, bitblock128_t arg2)
1517{
1518        return simd_ifh_1(simd_himask_8(), simd_sub_8(arg1, simd_and(simd_himask_8(), arg2)), simd_sub_8(arg1, arg2));
1519}
1520//The total number of operations is 1.0
1521static inline bitblock128_t simd_sub_8(bitblock128_t arg1, bitblock128_t arg2)
1522{
1523        return _mm_sub_epi8(arg1, arg2);
1524}
1525//The total number of operations is 1.0
1526static inline bitblock128_t simd_sub_64(bitblock128_t arg1, bitblock128_t arg2)
1527{
1528        return _mm_sub_epi64(arg1, arg2);
1529}
1530//The total number of operations is 9.33333333333
1531static inline bitblock128_t simd_sub_128(bitblock128_t arg1, bitblock128_t arg2)
1532{
1533        bitblock128_t partial = simd_sub_64(arg1, arg2);
1534        bitblock128_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_andc(partial, simd_xor(arg1, arg2)));
1535        bitblock128_t borrow = simd_slli_128(simd_srli_64(borrowMask, (63)), (64));
1536        return simd_sub_64(partial, borrow);
1537}
1538//The total number of operations is 1.0
1539static inline bitblock128_t simd_sub_16(bitblock128_t arg1, bitblock128_t arg2)
1540{
1541        return _mm_sub_epi16(arg1, arg2);
1542}
1543//The total number of operations is 3.0
1544static inline bitblock128_t simd_add_hl_32(bitblock128_t arg1)
1545{
1546        return simd_add_64(simd_srli_32(arg1, (16)), simd_and(arg1, simd_lomask_32()));
1547}
1548//The total number of operations is 3.0
1549static inline bitblock128_t simd_add_hl_2(bitblock128_t arg1)
1550{
1551        return simd_sub_16(arg1, simd_and(simd_lomask_2(), simd_srli_16(arg1, 1)));
1552}
1553//The total number of operations is 4.0
1554static inline bitblock128_t simd_add_hl_4(bitblock128_t arg1)
1555{
1556        return simd_add_8(simd_srli_4(arg1, (2)), simd_and(arg1, simd_lomask_4()));
1557}
1558//The total number of operations is 4.0
1559static inline bitblock128_t simd_add_hl_8(bitblock128_t arg1)
1560{
1561        return simd_add_16(simd_srli_8(arg1, (4)), simd_and(arg1, simd_lomask_8()));
1562}
1563//The total number of operations is 3.0
1564static inline bitblock128_t simd_add_hl_64(bitblock128_t arg1)
1565{
1566        return simd_add_64(simd_srli_64(arg1, (32)), simd_and(arg1, simd_lomask_64()));
1567}
1568//The total number of operations is 12.6666666667
1569static inline bitblock128_t simd_add_hl_128(bitblock128_t arg1)
1570{
1571        return simd_add_128(simd_srli_128(arg1, (64)), simd_and(arg1, simd_lomask_128()));
1572}
1573//The total number of operations is 3.0
1574static inline bitblock128_t simd_add_hl_16(bitblock128_t arg1)
1575{
1576        return simd_add_32(simd_srli_16(arg1, (8)), simd_and(arg1, simd_lomask_16()));
1577}
1578//The total number of operations is 10.0
1579static inline bitblock128_t simd_srl_64(bitblock128_t arg1, bitblock128_t shift_mask)
1580{
1581        return simd_ifh_1(simd_himask_128(), _mm_srl_epi64(arg1, simd_and(_mm_srli_si128(shift_mask, (int32_t)(8)), _mm_cvtsi32_si128((int32_t)(63)))), _mm_srl_epi64(arg1, simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(63)))));
1582}
1583//The total number of operations is 13.0
1584static inline bitblock128_t simd_srl_128(bitblock128_t arg1, bitblock128_t shift_mask)
1585{
1586        bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(127)));
1587        return simd_or(_mm_srl_epi64(arg1, shift), simd_or(_mm_srli_si128(_mm_srl_epi64(arg1, simd_sub_32(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_srli_si128(_mm_sll_epi64(arg1, simd_sub_32(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
1588}
1589//The total number of operations is 0
1590static inline bitblock128_t simd_lomask_32()
1591{
1592        return simd_constant_32((65535));
1593}
1594//The total number of operations is 0
1595static inline bitblock128_t simd_lomask_2()
1596{
1597        return simd_constant_2((1));
1598}
1599//The total number of operations is 0
1600static inline bitblock128_t simd_lomask_4()
1601{
1602        return simd_constant_4((3));
1603}
1604//The total number of operations is 0
1605static inline bitblock128_t simd_lomask_8()
1606{
1607        return simd_constant_8((15));
1608}
1609//The total number of operations is 0
1610static inline bitblock128_t simd_lomask_64()
1611{
1612        return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
1613}
1614//The total number of operations is 0
1615static inline bitblock128_t simd_lomask_128()
1616{
1617        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
1618}
1619//The total number of operations is 0
1620static inline bitblock128_t simd_lomask_16()
1621{
1622        return simd_constant_16((255));
1623}
1624//The total number of operations is 3.0
1625static inline uint64_t hsimd_signmask_32(bitblock128_t arg1)
1626{
1627        return hsimd_signmask_16(hsimd_packss_32(simd_constant_32(0), arg1));
1628}
1629//The total number of operations is 24.0
1630static inline uint64_t hsimd_signmask_4(bitblock128_t arg1)
1631{
1632        uint64_t tmpAns1 = hsimd_signmask_8(esimd_mergeh_4(arg1, simd_constant_4(0)));
1633        uint64_t tmpAns2 = hsimd_signmask_8(esimd_mergel_4(arg1, simd_constant_4(0)));
1634        return ((tmpAns1<<(16))+tmpAns2);
1635}
1636//The total number of operations is 1.0
1637static inline uint64_t hsimd_signmask_8(bitblock128_t arg1)
1638{
1639        return _mm_movemask_epi8(arg1);
1640}
1641//The total number of operations is 1.0
1642static inline uint64_t hsimd_signmask_64(bitblock128_t arg1)
1643{
1644        return _mm_movemask_pd(_mm_castsi128_pd(arg1));
1645}
1646//The total number of operations is 6.33333333333
1647static inline uint64_t hsimd_signmask_128(bitblock128_t arg1)
1648{
1649        return hsimd_signmask_64(hsimd_packh_128(simd_constant_128(0), arg1));
1650}
1651//The total number of operations is 2.0
1652static inline uint64_t hsimd_signmask_16(bitblock128_t arg1)
1653{
1654        return hsimd_signmask_8(hsimd_packss_16(simd_constant_16(0), arg1));
1655}
1656//The total number of operations is 3.0
1657static inline bitblock128_t esimd_zeroextendh_32(bitblock128_t arg1)
1658{
1659        return esimd_mergeh_64(simd_srli_64(arg1, 32), simd_and(simd_lomask_64(), arg1));
1660}
1661//The total number of operations is 24.0
1662static inline bitblock128_t esimd_zeroextendh_1(bitblock128_t arg1)
1663{
1664        return esimd_mergeh_2(simd_srli_2(arg1, 1), simd_and(simd_lomask_2(), arg1));
1665}
1666//The total number of operations is 14.0
1667static inline bitblock128_t esimd_zeroextendh_2(bitblock128_t arg1)
1668{
1669        return esimd_mergeh_4(simd_srli_4(arg1, 2), simd_and(simd_lomask_4(), arg1));
1670}
1671//The total number of operations is 4.0
1672static inline bitblock128_t esimd_zeroextendh_4(bitblock128_t arg1)
1673{
1674        return esimd_mergeh_8(simd_srli_8(arg1, 4), simd_and(simd_lomask_8(), arg1));
1675}
1676//The total number of operations is 3.0
1677static inline bitblock128_t esimd_zeroextendh_8(bitblock128_t arg1)
1678{
1679        return esimd_mergeh_16(simd_srli_16(arg1, 8), simd_and(simd_lomask_16(), arg1));
1680}
1681//The total number of operations is 2.33333333333
1682static inline bitblock128_t esimd_zeroextendh_64(bitblock128_t arg1)
1683{
1684        return simd_srli_128(arg1, 64);
1685}
1686//The total number of operations is 3.0
1687static inline bitblock128_t esimd_zeroextendh_16(bitblock128_t arg1)
1688{
1689        return esimd_mergeh_32(simd_srli_32(arg1, 16), simd_and(simd_lomask_32(), arg1));
1690}
1691//The total number of operations is 3.0
1692static inline bitblock128_t esimd_zeroextendl_32(bitblock128_t arg1)
1693{
1694        return esimd_mergel_64(simd_srli_64(arg1, 32), simd_and(simd_lomask_64(), arg1));
1695}
1696//The total number of operations is 24.0
1697static inline bitblock128_t esimd_zeroextendl_1(bitblock128_t arg1)
1698{
1699        return esimd_mergel_2(simd_srli_2(arg1, 1), simd_and(simd_lomask_2(), arg1));
1700}
1701//The total number of operations is 14.0
1702static inline bitblock128_t esimd_zeroextendl_2(bitblock128_t arg1)
1703{
1704        return esimd_mergel_4(simd_srli_4(arg1, 2), simd_and(simd_lomask_4(), arg1));
1705}
1706//The total number of operations is 4.0
1707static inline bitblock128_t esimd_zeroextendl_4(bitblock128_t arg1)
1708{
1709        return esimd_mergel_8(simd_srli_8(arg1, 4), simd_and(simd_lomask_8(), arg1));
1710}
1711//The total number of operations is 3.0
1712static inline bitblock128_t esimd_zeroextendl_8(bitblock128_t arg1)
1713{
1714        return esimd_mergel_16(simd_srli_16(arg1, 8), simd_and(simd_lomask_16(), arg1));
1715}
1716//The total number of operations is 1.0
1717static inline bitblock128_t esimd_zeroextendl_64(bitblock128_t arg1)
1718{
1719        return simd_and(simd_lomask_128(), arg1);
1720}
1721//The total number of operations is 3.0
1722static inline bitblock128_t esimd_zeroextendl_16(bitblock128_t arg1)
1723{
1724        return esimd_mergel_32(simd_srli_32(arg1, 16), simd_and(simd_lomask_32(), arg1));
1725}
1726//The total number of operations is 1.0
1727static inline bitblock128_t mvmd_fill4_32(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1728{
1729        return _mm_set_epi32((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4));
1730}
1731//The total number of operations is 5.0
1732static inline bitblock128_t mvmd_fill4_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1733{
1734        return simd_ifh_1(simd_himask_4(), mvmd_fill2_1(val1, val2), mvmd_fill2_1(val3, val4));
1735}
1736//The total number of operations is 5.0
1737static inline bitblock128_t mvmd_fill4_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1738{
1739        return simd_ifh_1(simd_himask_8(), mvmd_fill2_2(val1, val2), mvmd_fill2_2(val3, val4));
1740}
1741//The total number of operations is 5.0
1742static inline bitblock128_t mvmd_fill4_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1743{
1744        return simd_ifh_1(simd_himask_16(), mvmd_fill2_4(val1, val2), mvmd_fill2_4(val3, val4));
1745}
1746//The total number of operations is 5.0
1747static inline bitblock128_t mvmd_fill4_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1748{
1749        return simd_ifh_1(simd_himask_32(), mvmd_fill2_8(val1, val2), mvmd_fill2_8(val3, val4));
1750}
1751//The total number of operations is 3.0
1752static inline bitblock128_t mvmd_fill4_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1753{
1754        return simd_or(mvmd_fill4_32((val1<<16), (val3<<16), (val1<<16), (val3<<16)), mvmd_fill4_32((val2&(65535)), (val4&(65535)), (val2&(65535)), (val4&(65535))));
1755}
1756//The total number of operations is 7.0
1757static inline bitblock128_t simd_umin_32(bitblock128_t arg1, bitblock128_t arg2)
1758{
1759        bitblock128_t high_bit = simd_constant_32((2147483648ULL));
1760        return simd_xor(simd_min_32(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
1761}
1762//The total number of operations is 1.0
1763static inline bitblock128_t simd_umin_1(bitblock128_t arg1, bitblock128_t arg2)
1764{
1765        return simd_and(arg1, arg2);
1766}
1767//The total number of operations is 16.0
1768static inline bitblock128_t simd_umin_2(bitblock128_t arg1, bitblock128_t arg2)
1769{
1770        return simd_or(simd_and(simd_himask_4(), simd_umin_4(arg1, arg2)), simd_umin_4(simd_and(simd_lomask_4(), arg1), simd_and(simd_lomask_4(), arg2)));
1771}
1772//The total number of operations is 6.0
1773static inline bitblock128_t simd_umin_4(bitblock128_t arg1, bitblock128_t arg2)
1774{
1775        return simd_or(simd_and(simd_himask_8(), simd_umin_8(arg1, arg2)), simd_umin_8(simd_and(simd_lomask_8(), arg1), simd_and(simd_lomask_8(), arg2)));
1776}
1777//The total number of operations is 1.0
1778static inline bitblock128_t simd_umin_8(bitblock128_t arg1, bitblock128_t arg2)
1779{
1780        return _mm_min_epu8(arg1, arg2);
1781}
1782//The total number of operations is 20.0
1783static inline bitblock128_t simd_umin_64(bitblock128_t arg1, bitblock128_t arg2)
1784{
1785        bitblock128_t tmpAns = simd_umin_32(arg1, arg2);
1786        bitblock128_t eqMask1 = simd_srli_64(simd_eq_32(tmpAns, arg1), (32));
1787        bitblock128_t eqMask2 = simd_srli_64(simd_eq_32(tmpAns, arg2), (32));
1788        return simd_ifh_1(simd_himask_64(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
1789}
1790//The total number of operations is 43.6666666667
1791static inline bitblock128_t simd_umin_128(bitblock128_t arg1, bitblock128_t arg2)
1792{
1793        bitblock128_t tmpAns = simd_umin_64(arg1, arg2);
1794        bitblock128_t eqMask1 = simd_srli_128(simd_eq_64(tmpAns, arg1), (64));
1795        bitblock128_t eqMask2 = simd_srli_128(simd_eq_64(tmpAns, arg2), (64));
1796        return simd_ifh_1(simd_himask_128(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
1797}
1798//The total number of operations is 4.0
1799static inline bitblock128_t simd_umin_16(bitblock128_t arg1, bitblock128_t arg2)
1800{
1801        bitblock128_t high_bit = simd_constant_16((32768));
1802        return simd_xor(simd_min_16(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
1803}
1804//The total number of operations is 4.0
1805static inline bitblock128_t simd_min_32(bitblock128_t arg1, bitblock128_t arg2)
1806{
1807        return simd_ifh_1(simd_gt_32(arg1, arg2), arg2, arg1);
1808}
1809//The total number of operations is 1.0
1810static inline bitblock128_t simd_min_1(bitblock128_t arg1, bitblock128_t arg2)
1811{
1812        return simd_or(arg1, arg2);
1813}
1814//The total number of operations is 16.6666666667
1815static inline bitblock128_t simd_min_2(bitblock128_t arg1, bitblock128_t arg2)
1816{
1817        bitblock128_t tmp1 = simd_srli_128(arg1, 1);
1818        bitblock128_t tmp2 = simd_srli_128(arg2, 1);
1819        return simd_ifh_1(simd_himask_2(), simd_or(arg1, arg2), simd_or(simd_and(arg1, simd_and(tmp1, simd_not(tmp2))), simd_and(arg2, simd_or(simd_and(simd_not(tmp1), tmp2), arg1))));
1820}
1821//The total number of operations is 9.0
1822static inline bitblock128_t simd_min_4(bitblock128_t arg1, bitblock128_t arg2)
1823{
1824        bitblock128_t high_bit = simd_constant_4((8));
1825        return simd_xor(simd_umin_4(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
1826}
1827//The total number of operations is 4.0
1828static inline bitblock128_t simd_min_8(bitblock128_t arg1, bitblock128_t arg2)
1829{
1830        return simd_ifh_1(simd_gt_8(arg1, arg2), arg2, arg1);
1831}
1832//The total number of operations is 17.5
1833static inline bitblock128_t simd_min_64(bitblock128_t arg1, bitblock128_t arg2)
1834{
1835        return simd_ifh_1(simd_gt_64(arg1, arg2), arg2, arg1);
1836}
1837//The total number of operations is 54.75
1838static inline bitblock128_t simd_min_128(bitblock128_t arg1, bitblock128_t arg2)
1839{
1840        return simd_ifh_1(simd_gt_128(arg1, arg2), arg2, arg1);
1841}
1842//The total number of operations is 1.0
1843static inline bitblock128_t simd_min_16(bitblock128_t arg1, bitblock128_t arg2)
1844{
1845        return _mm_min_epi16(arg1, arg2);
1846}
1847//The total number of operations is 5.0
1848static inline bitblock128_t mvmd_fill2_32(uint64_t val1, uint64_t val2)
1849{
1850        return simd_ifh_1(simd_himask_64(), mvmd_fill_32(val1), mvmd_fill_32(val2));
1851}
1852//The total number of operations is 1.0
1853static inline bitblock128_t mvmd_fill2_1(uint64_t val1, uint64_t val2)
1854{
1855        return mvmd_fill_2(((val1<<1)|(val2&(1))));
1856}
1857//The total number of operations is 1.0
1858static inline bitblock128_t mvmd_fill2_2(uint64_t val1, uint64_t val2)
1859{
1860        return mvmd_fill_4(((val1<<2)|(val2&(3))));
1861}
1862//The total number of operations is 1.0
1863static inline bitblock128_t mvmd_fill2_4(uint64_t val1, uint64_t val2)
1864{
1865        return mvmd_fill_8(((val1<<4)|(val2&(15))));
1866}
1867//The total number of operations is 1.0
1868static inline bitblock128_t mvmd_fill2_8(uint64_t val1, uint64_t val2)
1869{
1870        return mvmd_fill_16(((val1<<8)|(val2&(255))));
1871}
1872//The total number of operations is 5.0
1873static inline bitblock128_t mvmd_fill2_64(uint64_t val1, uint64_t val2)
1874{
1875        return simd_ifh_1(simd_himask_128(), mvmd_fill_64(val1), mvmd_fill_64(val2));
1876}
1877//The total number of operations is 1.0
1878static inline bitblock128_t mvmd_fill2_16(uint64_t val1, uint64_t val2)
1879{
1880        return mvmd_fill_32(((val1<<16)|(val2&(65535))));
1881}
1882//The total number of operations is 2.0
1883static inline bool bitblock_any(bitblock128_t arg1)
1884{
1885        return hsimd_signmask_8(simd_eq_8(arg1, simd_constant_8(0))) != 65535;
1886}
1887//The total number of operations is 20.3333333333
1888static inline uint64_t bitblock_popcount(bitblock128_t arg1)
1889{
1890        return mvmd_extract_64(simd_popcount_128(arg1), 0);
1891}
1892//The total number of operations is 1.0
1893static inline bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2)
1894{
1895        return _mm_or_si128(arg1, arg2);
1896}
1897//The total number of operations is 15.6666666667
1898static inline bitblock128_t hsimd_packl_32(bitblock128_t arg1, bitblock128_t arg2)
1899{
1900        return hsimd_packl_64(simd_ifh_1(simd_himask_32(), simd_srli_128(arg1, (16)), arg1), simd_ifh_1(simd_himask_32(), simd_srli_128(arg2, (16)), arg2));
1901}
1902//The total number of operations is 35.0
1903static inline bitblock128_t hsimd_packl_2(bitblock128_t arg1, bitblock128_t arg2)
1904{
1905        return hsimd_packl_4(simd_ifh_1(simd_himask_2(), simd_srli_128(arg1, (1)), arg1), simd_ifh_1(simd_himask_2(), simd_srli_128(arg2, (1)), arg2));
1906}
1907//The total number of operations is 24.3333333333
1908static inline bitblock128_t hsimd_packl_4(bitblock128_t arg1, bitblock128_t arg2)
1909{
1910        return hsimd_packl_8(simd_ifh_1(simd_himask_4(), simd_srli_128(arg1, (2)), arg1), simd_ifh_1(simd_himask_4(), simd_srli_128(arg2, (2)), arg2));
1911}
1912//The total number of operations is 13.6666666667
1913static inline bitblock128_t hsimd_packl_8(bitblock128_t arg1, bitblock128_t arg2)
1914{
1915        return hsimd_packl_16(simd_ifh_1(simd_himask_8(), simd_srli_128(arg1, (4)), arg1), simd_ifh_1(simd_himask_8(), simd_srli_128(arg2, (4)), arg2));
1916}
1917//The total number of operations is 5.0
1918static inline bitblock128_t hsimd_packl_64(bitblock128_t arg1, bitblock128_t arg2)
1919{
1920        return simd_or(mvmd_shufflei_32(simd_andc(arg1, simd_himask_64()), shufflemask4(2, 0, 3, 3)), mvmd_shufflei_32(simd_andc(arg2, simd_himask_64()), shufflemask4(3, 3, 2, 0)));
1921}
1922//The total number of operations is 5.33333333333
1923static inline bitblock128_t hsimd_packl_128(bitblock128_t arg1, bitblock128_t arg2)
1924{
1925        return simd_ifh_1(simd_himask_128(), simd_slli_128(arg1, (64)), arg2);
1926}
1927//The total number of operations is 3.0
1928static inline bitblock128_t hsimd_packl_16(bitblock128_t arg1, bitblock128_t arg2)
1929{
1930        return hsimd_packus_16(simd_and(arg1, simd_lomask_16()), simd_and(arg2, simd_lomask_16()));
1931}
1932//The total number of operations is 13.0
1933static inline bitblock128_t mvmd_fill8_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1934{
1935        return simd_ifh_1(simd_himask_8(), mvmd_fill4_1(val1, val2, val3, val4), mvmd_fill4_1(val5, val6, val7, val8));
1936}
1937//The total number of operations is 13.0
1938static inline bitblock128_t mvmd_fill8_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1939{
1940        return simd_ifh_1(simd_himask_16(), mvmd_fill4_2(val1, val2, val3, val4), mvmd_fill4_2(val5, val6, val7, val8));
1941}
1942//The total number of operations is 7.0
1943static inline bitblock128_t mvmd_fill8_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1944{
1945        return simd_or(mvmd_fill8_8((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4)), mvmd_fill8_8((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15))));
1946}
1947//The total number of operations is 3.0
1948static inline bitblock128_t mvmd_fill8_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1949{
1950        return simd_or(mvmd_fill8_16((val1<<8), (val3<<8), (val5<<8), (val7<<8), (val1<<8), (val3<<8), (val5<<8), (val7<<8)), mvmd_fill8_16((val2&(255)), (val4&(255)), (val6&(255)), (val8&(255)), (val2&(255)), (val4&(255)), (val6&(255)), (val8&(255))));
1951}
1952//The total number of operations is 1.0
1953static inline bitblock128_t mvmd_fill8_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1954{
1955        return _mm_set_epi16((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8));
1956}
1957//The total number of operations is 34.3333333333
1958static inline bitblock128_t hsimd_min_hl_32(bitblock128_t arg1, bitblock128_t arg2)
1959{
1960        return simd_min_16(hsimd_packh_32(arg1, arg2), hsimd_packl_32(arg1, arg2));
1961}
1962//The total number of operations is 73.0
1963static inline bitblock128_t hsimd_min_hl_2(bitblock128_t arg1, bitblock128_t arg2)
1964{
1965        return simd_min_1(hsimd_packh_2(arg1, arg2), hsimd_packl_2(arg1, arg2));
1966}
1967//The total number of operations is 67.3333333333
1968static inline bitblock128_t hsimd_min_hl_4(bitblock128_t arg1, bitblock128_t arg2)
1969{
1970        return simd_min_2(hsimd_packh_4(arg1, arg2), hsimd_packl_4(arg1, arg2));
1971}
1972//The total number of operations is 38.3333333333
1973static inline bitblock128_t hsimd_min_hl_8(bitblock128_t arg1, bitblock128_t arg2)
1974{
1975        return simd_min_4(hsimd_packh_8(arg1, arg2), hsimd_packl_8(arg1, arg2));
1976}
1977//The total number of operations is 16.0
1978static inline bitblock128_t hsimd_min_hl_64(bitblock128_t arg1, bitblock128_t arg2)
1979{
1980        return simd_min_32(hsimd_packh_64(arg1, arg2), hsimd_packl_64(arg1, arg2));
1981}
1982//The total number of operations is 28.1666666667
1983static inline bitblock128_t hsimd_min_hl_128(bitblock128_t arg1, bitblock128_t arg2)
1984{
1985        return simd_min_64(hsimd_packh_128(arg1, arg2), hsimd_packl_128(arg1, arg2));
1986}
1987//The total number of operations is 10.0
1988static inline bitblock128_t hsimd_min_hl_16(bitblock128_t arg1, bitblock128_t arg2)
1989{
1990        return simd_min_8(hsimd_packh_16(arg1, arg2), hsimd_packl_16(arg1, arg2));
1991}
1992//The total number of operations is 1.0
1993static inline bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2)
1994{
1995        return _mm_xor_si128(arg1, arg2);
1996}
1997//The total number of operations is 7.0
1998static inline bitblock128_t simd_umax_32(bitblock128_t arg1, bitblock128_t arg2)
1999{
2000        bitblock128_t high_bit = simd_constant_32((2147483648ULL));
2001        return simd_xor(simd_max_32(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
2002}
2003//The total number of operations is 1.0
2004static inline bitblock128_t simd_umax_1(bitblock128_t arg1, bitblock128_t arg2)
2005{
2006        return simd_or(arg1, arg2);
2007}
2008//The total number of operations is 15.6666666667
2009static inline bitblock128_t simd_umax_2(bitblock128_t arg1, bitblock128_t arg2)
2010{
2011        return simd_ifh_1(simd_himask_2(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd_srli_128(simd_or(simd_not(arg1), arg2), 1)), simd_and(arg1, simd_srli_128(simd_or(arg1, simd_not(arg2)), 1))));
2012}
2013//The total number of operations is 6.0
2014static inline bitblock128_t simd_umax_4(bitblock128_t arg1, bitblock128_t arg2)
2015{
2016        return simd_or(simd_and(simd_himask_8(), simd_umax_8(arg1, arg2)), simd_umax_8(simd_and(simd_lomask_8(), arg1), simd_and(simd_lomask_8(), arg2)));
2017}
2018//The total number of operations is 1.0
2019static inline bitblock128_t simd_umax_8(bitblock128_t arg1, bitblock128_t arg2)
2020{
2021        return _mm_max_epu8(arg1, arg2);
2022}
2023//The total number of operations is 20.0
2024static inline bitblock128_t simd_umax_64(bitblock128_t arg1, bitblock128_t arg2)
2025{
2026        bitblock128_t tmpAns = simd_umax_32(arg1, arg2);
2027        bitblock128_t eqMask1 = simd_srli_64(simd_eq_32(tmpAns, arg1), (32));
2028        bitblock128_t eqMask2 = simd_srli_64(simd_eq_32(tmpAns, arg2), (32));
2029        return simd_ifh_1(simd_himask_64(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
2030}
2031//The total number of operations is 43.6666666667
2032static inline bitblock128_t simd_umax_128(bitblock128_t arg1, bitblock128_t arg2)
2033{
2034        bitblock128_t tmpAns = simd_umax_64(arg1, arg2);
2035        bitblock128_t eqMask1 = simd_srli_128(simd_eq_64(tmpAns, arg1), (64));
2036        bitblock128_t eqMask2 = simd_srli_128(simd_eq_64(tmpAns, arg2), (64));
2037        return simd_ifh_1(simd_himask_128(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
2038}
2039//The total number of operations is 4.0
2040static inline bitblock128_t simd_umax_16(bitblock128_t arg1, bitblock128_t arg2)
2041{
2042        bitblock128_t high_bit = simd_constant_16((32768));
2043        return simd_xor(simd_max_16(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
2044}
2045//The total number of operations is 1.0
2046static inline bitblock128_t bitblock_load_aligned(const bitblock128_t* arg1)
2047{
2048        return _mm_load_si128((bitblock128_t*)(arg1));
2049}
2050//The total number of operations is 1.0
2051static inline void bitblock_store_unaligned(bitblock128_t arg1, bitblock128_t* arg2)
2052{
2053        _mm_storeu_si128((bitblock128_t*)(arg2), arg1);
2054}
2055//The total number of operations is 11.0
2056static inline bitblock128_t esimd_signextendl_32(bitblock128_t arg1)
2057{
2058        return esimd_mergel_64(simd_srai_64(arg1, 32), simd_srai_64(simd_slli_64(arg1, 32), 32));
2059}
2060//The total number of operations is 31.0
2061static inline bitblock128_t esimd_signextendl_1(bitblock128_t arg1)
2062{
2063        return esimd_mergel_2(simd_srai_2(arg1, 1), simd_srai_2(simd_slli_2(arg1, 1), 1));
2064}
2065//The total number of operations is 33.0
2066static inline bitblock128_t esimd_signextendl_2(bitblock128_t arg1)
2067{
2068        return esimd_mergel_4(simd_srai_4(arg1, 2), simd_srai_4(simd_slli_4(arg1, 2), 2));
2069}
2070//The total number of operations is 13.0
2071static inline bitblock128_t esimd_signextendl_4(bitblock128_t arg1)
2072{
2073        return esimd_mergel_8(simd_srai_8(arg1, 4), simd_srai_8(simd_slli_8(arg1, 4), 4));
2074}
2075//The total number of operations is 4.0
2076static inline bitblock128_t esimd_signextendl_8(bitblock128_t arg1)
2077{
2078        return esimd_mergel_16(simd_srai_16(arg1, 8), simd_srai_16(simd_slli_16(arg1, 8), 8));
2079}
2080//The total number of operations is 13.4166666667
2081static inline bitblock128_t esimd_signextendl_64(bitblock128_t arg1)
2082{
2083        return simd_srai_128(simd_slli_128(arg1, 64), 64);
2084}
2085//The total number of operations is 4.0
2086static inline bitblock128_t esimd_signextendl_16(bitblock128_t arg1)
2087{
2088        return esimd_mergel_32(simd_srai_32(arg1, 16), simd_srai_32(simd_slli_32(arg1, 16), 16));
2089}
2090//The total number of operations is 33.6666666667
2091static inline bitblock128_t hsimd_packus_32(bitblock128_t arg1, bitblock128_t arg2)
2092{
2093        bitblock128_t arg11 = simd_ifh_32(arg1, simd_constant_32(0), arg1);
2094        bitblock128_t arg12 = simd_and(simd_lomask_32(), arg11);
2095        bitblock128_t arg21 = simd_ifh_32(arg2, simd_constant_32(0), arg2);
2096        bitblock128_t arg22 = simd_and(simd_lomask_32(), arg21);
2097        return hsimd_packl_32(simd_ifh_1(simd_eq_32(arg12, arg11), arg12, simd_lomask_32()), simd_ifh_1(simd_eq_32(arg22, arg21), arg22, simd_lomask_32()));
2098}
2099//The total number of operations is 75.0
2100static inline bitblock128_t hsimd_packus_2(bitblock128_t arg1, bitblock128_t arg2)
2101{
2102        bitblock128_t arg11 = simd_ifh_2(arg1, simd_constant_2(0), arg1);
2103        bitblock128_t arg12 = simd_and(simd_lomask_2(), arg11);
2104        bitblock128_t arg21 = simd_ifh_2(arg2, simd_constant_2(0), arg2);
2105        bitblock128_t arg22 = simd_and(simd_lomask_2(), arg21);
2106        return hsimd_packl_2(simd_ifh_1(simd_eq_2(arg12, arg11), arg12, simd_lomask_2()), simd_ifh_1(simd_eq_2(arg22, arg21), arg22, simd_lomask_2()));
2107}
2108//The total number of operations is 74.3333333333
2109static inline bitblock128_t hsimd_packus_4(bitblock128_t arg1, bitblock128_t arg2)
2110{
2111        bitblock128_t hiPart = hsimd_packh_4(arg1, arg2);
2112        return simd_ifh_2(hiPart, simd_constant_2(0), simd_or(simd_gt_2(hiPart, simd_constant_2(0)), hsimd_packl_4(arg1, arg2)));
2113}
2114//The total number of operations is 31.6666666667
2115static inline bitblock128_t hsimd_packus_8(bitblock128_t arg1, bitblock128_t arg2)
2116{
2117        bitblock128_t arg11 = simd_ifh_8(arg1, simd_constant_8(0), arg1);
2118        bitblock128_t arg12 = simd_and(simd_lomask_8(), arg11);
2119        bitblock128_t arg21 = simd_ifh_8(arg2, simd_constant_8(0), arg2);
2120        bitblock128_t arg22 = simd_and(simd_lomask_8(), arg21);
2121        return hsimd_packl_8(simd_ifh_1(simd_eq_8(arg12, arg11), arg12, simd_lomask_8()), simd_ifh_1(simd_eq_8(arg22, arg21), arg22, simd_lomask_8()));
2122}
2123//The total number of operations is 18.0
2124static inline bitblock128_t hsimd_packus_64(bitblock128_t arg1, bitblock128_t arg2)
2125{
2126        bitblock128_t hiPart = hsimd_packh_64(arg1, arg2);
2127        return simd_ifh_32(hiPart, simd_constant_32(0), simd_or(simd_gt_32(hiPart, simd_constant_32(0)), hsimd_packl_64(arg1, arg2)));
2128}
2129//The total number of operations is 34.1666666667
2130static inline bitblock128_t hsimd_packus_128(bitblock128_t arg1, bitblock128_t arg2)
2131{
2132        bitblock128_t hiPart = hsimd_packh_128(arg1, arg2);
2133        return simd_ifh_64(hiPart, simd_constant_64(0), simd_or(simd_gt_64(hiPart, simd_constant_64(0)), hsimd_packl_128(arg1, arg2)));
2134}
2135//The total number of operations is 1.0
2136static inline bitblock128_t hsimd_packus_16(bitblock128_t arg1, bitblock128_t arg2)
2137{
2138        return _mm_packus_epi16(arg2, arg1);
2139}
2140//The total number of operations is 5.0
2141static inline bitblock128_t simd_abs_32(bitblock128_t arg1)
2142{
2143        bitblock128_t gtMask = simd_gt_32(arg1, simd_constant_32(0));
2144        return simd_ifh_1(gtMask, arg1, simd_sub_32(gtMask, arg1));
2145}
2146//The total number of operations is 7.33333333333
2147static inline bitblock128_t simd_abs_2(bitblock128_t arg1)
2148{
2149        return simd_ifh_1(simd_himask_2(), simd_and(arg1, simd_slli_128(simd_not(arg1), 1)), arg1);
2150}
2151//The total number of operations is 19.0
2152static inline bitblock128_t simd_abs_4(bitblock128_t arg1)
2153{
2154        bitblock128_t gtMask = simd_gt_4(arg1, simd_constant_4(0));
2155        return simd_ifh_1(gtMask, arg1, simd_sub_4(gtMask, arg1));
2156}
2157//The total number of operations is 5.0
2158static inline bitblock128_t simd_abs_8(bitblock128_t arg1)
2159{
2160        bitblock128_t gtMask = simd_gt_8(arg1, simd_constant_8(0));
2161        return simd_ifh_1(gtMask, arg1, simd_sub_8(gtMask, arg1));
2162}
2163//The total number of operations is 17.0
2164static inline bitblock128_t simd_abs_64(bitblock128_t arg1)
2165{
2166        bitblock128_t eqMask = simd_eq_64(simd_ifh_1(simd_himask_64(), simd_abs_32(arg1), arg1), arg1);
2167        return simd_ifh_1(eqMask, arg1, simd_sub_64(eqMask, arg1));
2168}
2169//The total number of operations is 44.0
2170static inline bitblock128_t simd_abs_128(bitblock128_t arg1)
2171{
2172        bitblock128_t eqMask = simd_eq_128(simd_ifh_1(simd_himask_128(), simd_abs_64(arg1), arg1), arg1);
2173        return simd_ifh_1(eqMask, arg1, simd_sub_128(eqMask, arg1));
2174}
2175//The total number of operations is 5.0
2176static inline bitblock128_t simd_abs_16(bitblock128_t arg1)
2177{
2178        bitblock128_t gtMask = simd_gt_16(arg1, simd_constant_16(0));
2179        return simd_ifh_1(gtMask, arg1, simd_sub_16(gtMask, arg1));
2180}
2181//The total number of operations is 3.0
2182static inline bitblock128_t simd_xor_hl_32(bitblock128_t arg1)
2183{
2184        return simd_xor(simd_srli_32(arg1, (16)), simd_and(arg1, simd_lomask_32()));
2185}
2186//The total number of operations is 4.0
2187static inline bitblock128_t simd_xor_hl_2(bitblock128_t arg1)
2188{
2189        return simd_xor(simd_srli_2(arg1, (1)), simd_and(arg1, simd_lomask_2()));
2190}
2191//The total number of operations is 4.0
2192static inline bitblock128_t simd_xor_hl_4(bitblock128_t arg1)
2193{
2194        return simd_xor(simd_srli_4(arg1, (2)), simd_and(arg1, simd_lomask_4()));
2195}
2196//The total number of operations is 4.0
2197static inline bitblock128_t simd_xor_hl_8(bitblock128_t arg1)
2198{
2199        return simd_xor(simd_srli_8(arg1, (4)), simd_and(arg1, simd_lomask_8()));
2200}
2201//The total number of operations is 3.0
2202static inline bitblock128_t simd_xor_hl_64(bitblock128_t arg1)
2203{
2204        return simd_xor(simd_srli_64(arg1, (32)), simd_and(arg1, simd_lomask_64()));
2205}
2206//The total number of operations is 4.33333333333
2207static inline bitblock128_t simd_xor_hl_128(bitblock128_t arg1)
2208{
2209        return simd_xor(simd_srli_128(arg1, (64)), simd_and(arg1, simd_lomask_128()));
2210}
2211//The total number of operations is 3.0
2212static inline bitblock128_t simd_xor_hl_16(bitblock128_t arg1)
2213{
2214        return simd_xor(simd_srli_16(arg1, (8)), simd_and(arg1, simd_lomask_16()));
2215}
2216//The total number of operations is 10.0
2217static inline bitblock128_t simd_srai_4(bitblock128_t arg1, uint64_t sh)
2218{
2219        bitblock128_t tmp = simd_srli_4(arg1, ((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)));
2220        return simd_or(tmp, simd_sub_4(simd_constant_4(0), simd_and(simd_constant_4((1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))), tmp)));
2221}
2222//The total number of operations is 5.0
2223static inline bitblock128_t simd_srai_8(bitblock128_t arg1, uint64_t sh)
2224{
2225        bitblock128_t tmp = simd_srli_8(arg1, ((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)));
2226        return simd_or(tmp, simd_sub_8(simd_constant_8(0), simd_and(simd_constant_8((1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))), tmp)));
2227}
2228//The total number of operations is 1.0
2229static inline bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2)
2230{
2231        return _mm_and_si128(arg1, arg2);
2232}
2233//The total number of operations is 15.0
2234static inline bitblock128_t mvmd_fill16_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2235{
2236        return simd_or(mvmd_fill16_2((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd_fill16_2((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
2237}
2238//The total number of operations is 7.0
2239static inline bitblock128_t mvmd_fill16_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2240{
2241        return simd_or(mvmd_fill16_4((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd_fill16_4((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
2242}
2243//The total number of operations is 3.0
2244static inline bitblock128_t mvmd_fill16_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2245{
2246        return simd_or(mvmd_fill16_8((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd_fill16_8((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
2247}
2248//The total number of operations is 1.0
2249static inline bitblock128_t mvmd_fill16_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2250{
2251        return _mm_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
2252}
2253//The total number of operations is 5.0
2254static inline bitblock128_t simd_lt_32(bitblock128_t arg1, bitblock128_t arg2)
2255{
2256        return simd_and(simd_not(simd_gt_32(arg1, arg2)), simd_not(simd_eq_32(arg1, arg2)));
2257}
2258//The total number of operations is 1.0
2259static inline bitblock128_t simd_lt_1(bitblock128_t arg1, bitblock128_t arg2)
2260{
2261        return simd_andc(arg1, arg2);
2262}
2263//The total number of operations is 14.6666666667
2264static inline bitblock128_t simd_lt_2(bitblock128_t arg1, bitblock128_t arg2)
2265{
2266        bitblock128_t tmp = simd_not(arg2);
2267        bitblock128_t tmpAns = simd_or(simd_and(arg1, tmp), simd_and(simd_slli_128(simd_and(simd_not(arg1), arg2), 1), simd_or(arg1, tmp)));
2268        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(tmpAns, 1));
2269}
2270//The total number of operations is 18.0
2271static inline bitblock128_t simd_lt_4(bitblock128_t arg1, bitblock128_t arg2)
2272{
2273        return simd_ifh_1(simd_himask_8(), simd_lt_8(arg1, simd_and(simd_himask_8(), arg2)), simd_lt_8(simd_slli_8(arg1, 4), simd_slli_8(arg2, 4)));
2274}
2275//The total number of operations is 5.0
2276static inline bitblock128_t simd_lt_8(bitblock128_t arg1, bitblock128_t arg2)
2277{
2278        return simd_and(simd_not(simd_gt_8(arg1, arg2)), simd_not(simd_eq_8(arg1, arg2)));
2279}
2280//The total number of operations is 19.5
2281static inline bitblock128_t simd_lt_64(bitblock128_t arg1, bitblock128_t arg2)
2282{
2283        bitblock128_t high_bit = simd_constant_64((9223372036854775808ULL));
2284        return simd_ult_64(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2285}
2286//The total number of operations is 60.75
2287static inline bitblock128_t simd_lt_128(bitblock128_t arg1, bitblock128_t arg2)
2288{
2289        bitblock128_t hiAns = simd_lt_64(arg1, arg2);
2290        bitblock128_t loAns = simd_ult_64(arg1, arg2);
2291        bitblock128_t mask = simd_and(loAns, simd_srli_128(simd_eq_64(arg1, arg2), (64)));
2292        mask = simd_or(mask, simd_slli_128(mask, (64)));
2293        return simd_or(simd_srai_128(hiAns, (64)), mask);
2294}
2295//The total number of operations is 5.0
2296static inline bitblock128_t simd_lt_16(bitblock128_t arg1, bitblock128_t arg2)
2297{
2298        return simd_and(simd_not(simd_gt_16(arg1, arg2)), simd_not(simd_eq_16(arg1, arg2)));
2299}
2300//The total number of operations is 1.0
2301static inline bitblock128_t simd_add_32(bitblock128_t arg1, bitblock128_t arg2)
2302{
2303        return _mm_add_epi32(arg1, arg2);
2304}
2305//The total number of operations is 1.0
2306static inline bitblock128_t simd_add_1(bitblock128_t arg1, bitblock128_t arg2)
2307{
2308        return simd_xor(arg1, arg2);
2309}
2310//The total number of operations is 8.33333333333
2311static inline bitblock128_t simd_add_2(bitblock128_t arg1, bitblock128_t arg2)
2312{
2313        bitblock128_t tmp = simd_xor(arg1, arg2);
2314        return simd_ifh_1(simd_himask_2(), simd_xor(tmp, simd_slli_128(simd_and(arg1, arg2), 1)), tmp);
2315}
2316//The total number of operations is 6.0
2317static inline bitblock128_t simd_add_4(bitblock128_t arg1, bitblock128_t arg2)
2318{
2319        return simd_ifh_1(simd_himask_8(), simd_add_8(arg1, simd_and(simd_himask_8(), arg2)), simd_add_8(arg1, arg2));
2320}
2321//The total number of operations is 1.0
2322static inline bitblock128_t simd_add_8(bitblock128_t arg1, bitblock128_t arg2)
2323{
2324        return _mm_add_epi8(arg1, arg2);
2325}
2326//The total number of operations is 1.0
2327static inline bitblock128_t simd_add_64(bitblock128_t arg1, bitblock128_t arg2)
2328{
2329        return _mm_add_epi64(arg1, arg2);
2330}
2331//The total number of operations is 9.33333333333
2332static inline bitblock128_t simd_add_128(bitblock128_t arg1, bitblock128_t arg2)
2333{
2334        bitblock128_t partial = simd_add_64(arg1, arg2);
2335        bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
2336        bitblock128_t carry = simd_slli_128(simd_srli_64(carryMask, (63)), (64));
2337        return simd_add_64(partial, carry);
2338}
2339//The total number of operations is 1.0
2340static inline bitblock128_t simd_add_16(bitblock128_t arg1, bitblock128_t arg2)
2341{
2342        return _mm_add_epi16(arg1, arg2);
2343}
2344//The total number of operations is 3.0
2345static inline bitblock128_t simd_ugt_32(bitblock128_t arg1, bitblock128_t arg2)
2346{
2347        bitblock128_t high_bit = simd_constant_32((2147483648ULL));
2348        return simd_gt_32(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2349}
2350//The total number of operations is 1.0
2351static inline bitblock128_t simd_ugt_1(bitblock128_t arg1, bitblock128_t arg2)
2352{
2353        return simd_andc(arg1, arg2);
2354}
2355//The total number of operations is 13.6666666667
2356static inline bitblock128_t simd_ugt_2(bitblock128_t arg1, bitblock128_t arg2)
2357{
2358        bitblock128_t tmp = simd_not(arg2);
2359        bitblock128_t tmpAns = simd_or(simd_and(arg1, tmp), simd_and(simd_slli_128(simd_and(arg1, tmp), 1), simd_or(arg1, tmp)));
2360        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(tmpAns, 1));
2361}
2362//The total number of operations is 12.0
2363static inline bitblock128_t simd_ugt_4(bitblock128_t arg1, bitblock128_t arg2)
2364{
2365        return simd_ifh_1(simd_himask_8(), simd_ugt_8(simd_and(simd_himask_8(), arg1), arg2), simd_ugt_8(simd_andc(arg1, simd_himask_8()), simd_andc(arg2, simd_himask_8())));
2366}
2367//The total number of operations is 3.0
2368static inline bitblock128_t simd_ugt_8(bitblock128_t arg1, bitblock128_t arg2)
2369{
2370        bitblock128_t high_bit = simd_constant_8((128));
2371        return simd_gt_8(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2372}
2373//The total number of operations is 13.5
2374static inline bitblock128_t simd_ugt_64(bitblock128_t arg1, bitblock128_t arg2)
2375{
2376        bitblock128_t tmpAns = simd_ugt_32(arg1, arg2);
2377        bitblock128_t mask = simd_and(tmpAns, simd_srli_64(simd_eq_32(arg1, arg2), (32)));
2378        mask = simd_or(mask, simd_slli_64(mask, (32)));
2379        return simd_or(simd_srai_64(tmpAns, (32)), mask);
2380}
2381//The total number of operations is 37.25
2382static inline bitblock128_t simd_ugt_128(bitblock128_t arg1, bitblock128_t arg2)
2383{
2384        bitblock128_t tmpAns = simd_ugt_64(arg1, arg2);
2385        bitblock128_t mask = simd_and(tmpAns, simd_srli_128(simd_eq_64(arg1, arg2), (64)));
2386        mask = simd_or(mask, simd_slli_128(mask, (64)));
2387        return simd_or(simd_srai_128(tmpAns, (64)), mask);
2388}
2389//The total number of operations is 3.0
2390static inline bitblock128_t simd_ugt_16(bitblock128_t arg1, bitblock128_t arg2)
2391{
2392        bitblock128_t high_bit = simd_constant_16((32768));
2393        return simd_gt_16(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2394}
2395#endif
Note: See TracBrowser for help on using the repository browser.