source: trunk/lib/idisa_c/idisa_ssse3_c.h @ 3125

Last change on this file since 3125 was 3125, checked in by linmengl, 6 years ago

add IDISA C library to the wild, 'idisa128_c.h' is the main header file to use

File size: 112.4 KB
Line 
1
2/* Copyright (c) 2011, Hua Huang and Robert D. Cameron.
3   Licensed under the Academic Free License 3.0.
4   This file is generated by the IDISA+ generator;
5   modifications should be made only by changing the
6   generator configuration and data files. */
7
8#ifndef _IDISA_SSSE3_C_H
9#define _IDISA_SSSE3_C_H
#include "tmmintrin.h"

#include <stdbool.h>
#include <stdint.h>
13typedef __m128i bitblock128_t;
14
// Packs four selectors into one immediate: s1 is placed at bits 7..6, s2 at
// bits 5..4, s3 at bits 3..2 and s4 at bits 1..0. Each selector is expected
// to fit in 2 bits (not checked).
// Every argument is parenthesized so that an expression argument such as
// `a|b` is shifted as a whole instead of being split by operator precedence.
#define shufflemask4(s1, s2, s3, s4) \
        (((s1) << 6) | ((s2) << 4) | ((s3) << 2) | (s4))
17
// Translates a 2-bit shuffle mask into the matching 8-bit mask made of four
// 2-bit selectors:
//   3            -> 238 (0xEE, selectors 3,2,3,2)
//   2            -> 228 (0xE4, selectors 3,2,1,0)
//   1            ->  78 (0x4E, selectors 1,0,3,2)
//   anything else->  68 (0x44, selectors 1,0,1,0)
// msk is parenthesized before each comparison so expression arguments
// (for example `a|b`) compare as a whole.
#define shufflemask4_from_shufflemask2(msk) \
        (((msk) == 3) ? 238 : (((msk) == 2) ? 228 : (((msk) == 1) ? 78 : 68)))
20
// Converts a mask of eight 3-bit selectors (selector k occupies bits
// 3k+2..3k) into a mask of eight 2-bit selectors (selector k at bits
// 2k+1..2k) by keeping only the low two bits of every selector.
// msk is parenthesized at every use so expression arguments are shifted and
// masked as a whole.
#define shufflemask8_to_shufflemask4(msk) \
        (((msk) & 3) \
         | ((((msk) >> 3) & 3) << 2) \
         | ((((msk) >> 6) & 3) << 4) \
         | ((((msk) >> 9) & 3) << 6) \
         | ((((msk) >> 12) & 3) << 8) \
         | ((((msk) >> 15) & 3) << 10) \
         | ((((msk) >> 18) & 3) << 12) \
         | ((((msk) >> 21) & 3) << 14))
23
24//Declaration Starts here
25static inline bitblock128_t esimd_mergel_32(bitblock128_t arg1, bitblock128_t arg2);
26static inline bitblock128_t esimd_mergel_1(bitblock128_t arg1, bitblock128_t arg2);
27static inline bitblock128_t esimd_mergel_2(bitblock128_t arg1, bitblock128_t arg2);
28static inline bitblock128_t esimd_mergel_4(bitblock128_t arg1, bitblock128_t arg2);
29static inline bitblock128_t esimd_mergel_8(bitblock128_t arg1, bitblock128_t arg2);
30static inline bitblock128_t esimd_mergel_64(bitblock128_t arg1, bitblock128_t arg2);
31static inline bitblock128_t esimd_mergel_16(bitblock128_t arg1, bitblock128_t arg2);
32static inline bitblock128_t esimd_signextendh_32(bitblock128_t arg1);
33static inline bitblock128_t esimd_signextendh_1(bitblock128_t arg1);
34static inline bitblock128_t esimd_signextendh_2(bitblock128_t arg1);
35static inline bitblock128_t esimd_signextendh_4(bitblock128_t arg1);
36static inline bitblock128_t esimd_signextendh_8(bitblock128_t arg1);
37static inline bitblock128_t esimd_signextendh_64(bitblock128_t arg1);
38static inline bitblock128_t esimd_signextendh_16(bitblock128_t arg1);
39static inline bitblock128_t simd_max_32(bitblock128_t arg1, bitblock128_t arg2);
40static inline bitblock128_t simd_max_1(bitblock128_t arg1, bitblock128_t arg2);
41static inline bitblock128_t simd_max_2(bitblock128_t arg1, bitblock128_t arg2);
42static inline bitblock128_t simd_max_4(bitblock128_t arg1, bitblock128_t arg2);
43static inline bitblock128_t simd_max_8(bitblock128_t arg1, bitblock128_t arg2);
44static inline bitblock128_t simd_max_64(bitblock128_t arg1, bitblock128_t arg2);
45static inline bitblock128_t simd_max_128(bitblock128_t arg1, bitblock128_t arg2);
46static inline bitblock128_t simd_max_16(bitblock128_t arg1, bitblock128_t arg2);
47static inline bitblock128_t esimd_mergeh_32(bitblock128_t arg1, bitblock128_t arg2);
48static inline bitblock128_t esimd_mergeh_1(bitblock128_t arg1, bitblock128_t arg2);
49static inline bitblock128_t esimd_mergeh_2(bitblock128_t arg1, bitblock128_t arg2);
50static inline bitblock128_t esimd_mergeh_4(bitblock128_t arg1, bitblock128_t arg2);
51static inline bitblock128_t esimd_mergeh_8(bitblock128_t arg1, bitblock128_t arg2);
52static inline bitblock128_t esimd_mergeh_64(bitblock128_t arg1, bitblock128_t arg2);
53static inline bitblock128_t esimd_mergeh_16(bitblock128_t arg1, bitblock128_t arg2);
54static inline bitblock128_t simd_mult_32(bitblock128_t arg1, bitblock128_t arg2);
55static inline bitblock128_t simd_mult_1(bitblock128_t arg1, bitblock128_t arg2);
56static inline bitblock128_t simd_mult_2(bitblock128_t arg1, bitblock128_t arg2);
57static inline bitblock128_t simd_mult_4(bitblock128_t arg1, bitblock128_t arg2);
58static inline bitblock128_t simd_mult_8(bitblock128_t arg1, bitblock128_t arg2);
59static inline bitblock128_t simd_mult_64(bitblock128_t arg1, bitblock128_t arg2);
60static inline bitblock128_t simd_mult_128(bitblock128_t arg1, bitblock128_t arg2);
61static inline bitblock128_t simd_mult_16(bitblock128_t arg1, bitblock128_t arg2);
62static inline bitblock128_t hsimd_umin_hl_32(bitblock128_t arg1, bitblock128_t arg2);
63static inline bitblock128_t hsimd_umin_hl_2(bitblock128_t arg1, bitblock128_t arg2);
64static inline bitblock128_t hsimd_umin_hl_4(bitblock128_t arg1, bitblock128_t arg2);
65static inline bitblock128_t hsimd_umin_hl_8(bitblock128_t arg1, bitblock128_t arg2);
66static inline bitblock128_t hsimd_umin_hl_64(bitblock128_t arg1, bitblock128_t arg2);
67static inline bitblock128_t hsimd_umin_hl_128(bitblock128_t arg1, bitblock128_t arg2);
68static inline bitblock128_t hsimd_umin_hl_16(bitblock128_t arg1, bitblock128_t arg2);
69static inline bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2);
70static inline bitblock128_t simd_gt_32(bitblock128_t arg1, bitblock128_t arg2);
71static inline bitblock128_t simd_gt_1(bitblock128_t arg1, bitblock128_t arg2);
72static inline bitblock128_t simd_gt_2(bitblock128_t arg1, bitblock128_t arg2);
73static inline bitblock128_t simd_gt_4(bitblock128_t arg1, bitblock128_t arg2);
74static inline bitblock128_t simd_gt_8(bitblock128_t arg1, bitblock128_t arg2);
75static inline bitblock128_t simd_gt_64(bitblock128_t arg1, bitblock128_t arg2);
76static inline bitblock128_t simd_gt_128(bitblock128_t arg1, bitblock128_t arg2);
77static inline bitblock128_t simd_gt_16(bitblock128_t arg1, bitblock128_t arg2);
78static inline bitblock128_t simd_not(bitblock128_t arg1);
79static inline bitblock128_t bitblock_sll(bitblock128_t arg1, bitblock128_t arg2);
80static inline bitblock128_t simd_umult_32(bitblock128_t arg1, bitblock128_t arg2);
81static inline bitblock128_t simd_umult_1(bitblock128_t arg1, bitblock128_t arg2);
82static inline bitblock128_t simd_umult_2(bitblock128_t arg1, bitblock128_t arg2);
83static inline bitblock128_t simd_umult_4(bitblock128_t arg1, bitblock128_t arg2);
84static inline bitblock128_t simd_umult_8(bitblock128_t arg1, bitblock128_t arg2);
85static inline bitblock128_t simd_umult_64(bitblock128_t arg1, bitblock128_t arg2);
86static inline bitblock128_t simd_umult_16(bitblock128_t arg1, bitblock128_t arg2);
87static inline bitblock128_t hsimd_add_hl_32(bitblock128_t arg1, bitblock128_t arg2);
88static inline bitblock128_t hsimd_add_hl_2(bitblock128_t arg1, bitblock128_t arg2);
89static inline bitblock128_t hsimd_add_hl_4(bitblock128_t arg1, bitblock128_t arg2);
90static inline bitblock128_t hsimd_add_hl_8(bitblock128_t arg1, bitblock128_t arg2);
91static inline bitblock128_t hsimd_add_hl_64(bitblock128_t arg1, bitblock128_t arg2);
92static inline bitblock128_t hsimd_add_hl_128(bitblock128_t arg1, bitblock128_t arg2);
93static inline bitblock128_t hsimd_add_hl_16(bitblock128_t arg1, bitblock128_t arg2);
94static inline bitblock128_t simd_ult_32(bitblock128_t arg1, bitblock128_t arg2);
95static inline bitblock128_t simd_ult_1(bitblock128_t arg1, bitblock128_t arg2);
96static inline bitblock128_t simd_ult_2(bitblock128_t arg1, bitblock128_t arg2);
97static inline bitblock128_t simd_ult_4(bitblock128_t arg1, bitblock128_t arg2);
98static inline bitblock128_t simd_ult_8(bitblock128_t arg1, bitblock128_t arg2);
99static inline bitblock128_t simd_ult_64(bitblock128_t arg1, bitblock128_t arg2);
100static inline bitblock128_t simd_ult_128(bitblock128_t arg1, bitblock128_t arg2);
101static inline bitblock128_t simd_ult_16(bitblock128_t arg1, bitblock128_t arg2);
// Shuffle by immediate: expands directly to _mm_shuffle_epi32 with msk cast
// to int32_t. msk must be a compile-time constant, as the intrinsic takes an
// immediate operand.
// Generator-reported cost: 1.0 operations.
#define mvmd_shufflei_32(arg1, msk) \
        (_mm_shuffle_epi32((arg1), (int32_t)(msk)))
105
// Shuffle by immediate for two 64-bit fields: the 2-bit mask is widened with
// shufflemask4_from_shufflemask2 and handed to mvmd_shufflei_32.
// msk is wrapped in parentheses before it is forwarded, so an expression
// argument is compared as a whole inside the conversion macro.
// Generator-reported cost: 1.0 operations.
#define mvmd_shufflei_64(arg1, msk) \
        mvmd_shufflei_32(arg1, shufflemask4_from_shufflemask2((msk)))
109
// Shuffle by immediate for eight 16-bit fields. msk holds eight 3-bit
// selectors (selector k at bits 3k+2..3k, as implied by the shifts 0..21).
//  - The low two bits of every selector are fed, via
//    shufflemask8_to_shufflemask4, to _mm_shufflehi_epi16 / _mm_shufflelo_epi16.
//  - Bit 2 of every selector builds the mvmd_fill8_16 mask that simd_ifh_1
//    uses to choose between the shuffle of arg1 itself and the shuffle of
//    arg1 with its 64-bit halves exchanged (simd_slli_128 / simd_srli_128 by 64).
// NOTE(review): 131071 (0x1FFFF) is wider than 16 bits; presumably
// mvmd_fill8_16 truncates it to an all-ones field - confirm against its body.
// msk is parenthesized at every use so expression arguments are handled as a
// whole. msk must be a compile-time constant (intrinsic immediates).
// Generator-reported cost: 13.6666666667 operations.
#define mvmd_shufflei_16(arg1, msk) \
        simd_ifh_1( \
            mvmd_fill8_16((((((msk) >> 21) & 4) == 0) ? 0 : (131071)), \
                          (((((msk) >> 18) & 4) == 0) ? 0 : (131071)), \
                          (((((msk) >> 15) & 4) == 0) ? 0 : (131071)), \
                          (((((msk) >> 12) & 4) == 0) ? 0 : (131071)), \
                          (((((msk) >> 9) & 4) == 0) ? (131071) : 0), \
                          (((((msk) >> 6) & 4) == 0) ? (131071) : 0), \
                          (((((msk) >> 3) & 4) == 0) ? (131071) : 0), \
                          ((((msk) & 4) == 0) ? (131071) : 0)), \
            _mm_shufflelo_epi16( \
                _mm_shufflehi_epi16(arg1, (int32_t)((shufflemask8_to_shufflemask4((msk)) >> 8))), \
                (int32_t)((shufflemask8_to_shufflemask4((msk)) & 255))), \
            simd_or( \
                _mm_shufflehi_epi16(simd_slli_128(arg1, 64), (int32_t)((shufflemask8_to_shufflemask4((msk)) >> 8))), \
                _mm_shufflelo_epi16(simd_srli_128(arg1, 64), (int32_t)((shufflemask8_to_shufflemask4((msk)) & 255)))))
113
// Right shift by immediate on 32-bit fields: expands directly to
// _mm_srli_epi32 with the count cast to int32_t.
// Generator-reported cost: 1.0 operations.
#define simd_srli_32(arg1, sh) \
        (_mm_srli_epi32((arg1), (int32_t)(sh)))
117
// Right shift by immediate on 2-bit fields: shifts with simd_srli_32 and then
// masks every 2-bit field with (3 >> sh) through simd_and, which removes the
// bits that were shifted in from the neighbouring field.
// sh is parenthesized so expression arguments shift the mask correctly.
// Generator-reported cost: 2.0 operations.
#define simd_srli_2(arg1, sh) \
        simd_and(simd_srli_32(arg1, (sh)), simd_constant_2(((3) >> (sh))))
121
// Right shift by immediate on 4-bit fields: shifts with simd_srli_32 and then
// masks every 4-bit field with (15 >> sh) through simd_and, which removes the
// bits that were shifted in from the neighbouring field.
// sh is parenthesized so expression arguments shift the mask correctly.
// Generator-reported cost: 2.0 operations.
#define simd_srli_4(arg1, sh) \
        simd_and(simd_srli_32(arg1, (sh)), simd_constant_4(((15) >> (sh))))
125
// Right shift by immediate on 8-bit fields: shifts with simd_srli_32 and then
// masks every byte with (255 >> sh) through simd_and, which removes the bits
// that were shifted in from the neighbouring byte.
// sh is parenthesized so expression arguments shift the mask correctly.
// Generator-reported cost: 2.0 operations.
#define simd_srli_8(arg1, sh) \
        simd_and(simd_srli_32(arg1, (sh)), simd_constant_8(((255) >> (sh))))
129
// Right shift by immediate on 64-bit fields: expands directly to
// _mm_srli_epi64 with the count cast to int32_t.
// Generator-reported cost: 1.0 operations.
#define simd_srli_64(arg1, sh) \
        (_mm_srli_epi64((arg1), (int32_t)(sh)))
133
// Right shift by immediate of the whole 128-bit block, by sh bits.
//  - sh a multiple of 8: a single byte shift (_mm_srli_si128 by sh/8).
//  - otherwise, sh >= 64: byte-shift by 8, then shift the 64-bit fields by
//    sh & 63.
//  - otherwise: shift both 64-bit fields by sh and OR in the bits that cross
//    from the upper field into the lower one.
// sh must be a compile-time constant (the intrinsics take immediates); every
// use of sh is parenthesized so expression arguments are evaluated as a whole.
// Generator-reported cost: 2.33333333333 operations.
#define simd_srli_128(arg1, sh) \
        ((((sh) % 8) == 0) \
            ? _mm_srli_si128(arg1, (int32_t)(((sh) / 8))) \
            : (((sh) >= 64) \
                ? simd_srli_64(_mm_srli_si128(arg1, (int32_t)(8)), ((sh) & 63)) \
                : simd_or(simd_srli_64(arg1, (sh)), \
                          _mm_srli_si128(simd_slli_64(arg1, ((128 - (sh)) & 63)), (int32_t)(8)))))
137
// Right shift by immediate on 16-bit fields: expands directly to
// _mm_srli_epi16 with the count cast to int32_t.
// Generator-reported cost: 1.0 operations.
#define simd_srli_16(arg1, sh) \
        (_mm_srli_epi16((arg1), (int32_t)(sh)))
141
142static inline bitblock128_t bitblock_load_unaligned(const bitblock128_t* arg1);
// Double-block right shift by sh 32-bit fields: ORs arg1 moved with
// mvmd_srli_32 by sh fields and arg2 moved with mvmd_slli_32 by (4 - sh)
// fields (4 = number of 32-bit fields in a block).
// sh is parenthesized so expression arguments subtract and scale as a whole.
// Generator-reported cost: 3.0 operations.
#define mvmd_dsrli_32(arg1, arg2, sh) \
        simd_or(mvmd_srli_32(arg1, (sh)), mvmd_slli_32(arg2, ((4) - (sh))))
146
// Double-block right shift by sh 2-bit fields: ORs arg1 moved with
// mvmd_srli_2 by sh fields and arg2 moved with mvmd_slli_2 by (64 - sh)
// fields (64 = number of 2-bit fields in a block).
// sh is parenthesized so expression arguments subtract and scale as a whole.
// Generator-reported cost: 5.66666666667 operations.
#define mvmd_dsrli_2(arg1, arg2, sh) \
        simd_or(mvmd_srli_2(arg1, (sh)), mvmd_slli_2(arg2, ((64) - (sh))))
150
// Double-block right shift by sh 4-bit fields: ORs arg1 moved with
// mvmd_srli_4 by sh fields and arg2 moved with mvmd_slli_4 by (32 - sh)
// fields (32 = number of 4-bit fields in a block).
// sh is parenthesized so expression arguments subtract and scale as a whole.
// Generator-reported cost: 5.66666666667 operations.
#define mvmd_dsrli_4(arg1, arg2, sh) \
        simd_or(mvmd_srli_4(arg1, (sh)), mvmd_slli_4(arg2, ((32) - (sh))))
154
// Double-block right shift by sh bytes: ORs arg1 moved with mvmd_srli_8 by
// sh bytes and arg2 moved with mvmd_slli_8 by (16 - sh) bytes
// (16 = number of bytes in a block).
// sh is parenthesized so expression arguments subtract as a whole.
// Generator-reported cost: 3.0 operations.
#define mvmd_dsrli_8(arg1, arg2, sh) \
        simd_or(mvmd_srli_8(arg1, (sh)), mvmd_slli_8(arg2, ((16) - (sh))))
158
// Double-block right shift by sh 64-bit fields: ORs arg1 moved with
// mvmd_srli_64 by sh fields and arg2 moved with mvmd_slli_64 by (2 - sh)
// fields (2 = number of 64-bit fields in a block).
// sh is parenthesized so expression arguments subtract and scale as a whole.
// Generator-reported cost: 3.0 operations.
#define mvmd_dsrli_64(arg1, arg2, sh) \
        simd_or(mvmd_srli_64(arg1, (sh)), mvmd_slli_64(arg2, ((2) - (sh))))
162
// Double-block right shift by sh 128-bit fields: ORs arg1 moved with
// mvmd_srli_128 by sh fields and arg2 moved with mvmd_slli_128 by (1 - sh)
// fields (a block holds exactly one 128-bit field).
// sh is parenthesized so expression arguments subtract and scale as a whole.
// Generator-reported cost: 3.0 operations.
#define mvmd_dsrli_128(arg1, arg2, sh) \
        simd_or(mvmd_srli_128(arg1, (sh)), mvmd_slli_128(arg2, ((1) - (sh))))
166
// Double-block right shift by sh 16-bit fields: ORs arg1 moved with
// mvmd_srli_16 by sh fields and arg2 moved with mvmd_slli_16 by (8 - sh)
// fields (8 = number of 16-bit fields in a block).
// sh is parenthesized so expression arguments subtract and scale as a whole.
// Generator-reported cost: 3.0 operations.
#define mvmd_dsrli_16(arg1, arg2, sh) \
        simd_or(mvmd_srli_16(arg1, (sh)), mvmd_slli_16(arg2, ((8) - (sh))))
170
// Whole-block right shift by immediate: an alias for simd_srli_128.
// sh is forwarded in parentheses so an expression argument reaches the
// arithmetic inside simd_srli_128 as a single operand.
// Generator-reported cost: 2.33333333333 operations.
#define bitblock_srli(arg1, sh) \
        simd_srli_128(arg1, (sh))
174
175static inline bitblock128_t simd_ctz_32(bitblock128_t arg1);
176static inline bitblock128_t simd_ctz_1(bitblock128_t arg1);
177static inline bitblock128_t simd_ctz_2(bitblock128_t arg1);
178static inline bitblock128_t simd_ctz_4(bitblock128_t arg1);
179static inline bitblock128_t simd_ctz_8(bitblock128_t arg1);
180static inline bitblock128_t simd_ctz_64(bitblock128_t arg1);
181static inline bitblock128_t simd_ctz_128(bitblock128_t arg1);
182static inline bitblock128_t simd_ctz_16(bitblock128_t arg1);
183static inline bitblock128_t simd_sll_64(bitblock128_t arg1, bitblock128_t shift_mask);
184static inline bitblock128_t simd_sll_128(bitblock128_t arg1, bitblock128_t shift_mask);
185static inline bitblock128_t mvmd_fill_32(uint64_t val1);
186static inline bitblock128_t mvmd_fill_1(uint64_t val1);
187static inline bitblock128_t mvmd_fill_2(uint64_t val1);
188static inline bitblock128_t mvmd_fill_4(uint64_t val1);
189static inline bitblock128_t mvmd_fill_8(uint64_t val1);
190static inline bitblock128_t mvmd_fill_64(uint64_t val1);
191static inline bitblock128_t mvmd_fill_128(uint64_t val1);
192static inline bitblock128_t mvmd_fill_16(uint64_t val1);
193static inline bitblock128_t mvmd_shuffle_32(bitblock128_t arg1, bitblock128_t arg2);
194static inline bitblock128_t mvmd_shuffle_8(bitblock128_t arg1, bitblock128_t arg2);
195static inline bitblock128_t mvmd_shuffle_64(bitblock128_t arg1, bitblock128_t arg2);
196static inline bitblock128_t mvmd_shuffle_16(bitblock128_t arg1, bitblock128_t arg2);
197static inline bitblock128_t hsimd_packss_32(bitblock128_t arg1, bitblock128_t arg2);
198static inline bitblock128_t hsimd_packss_2(bitblock128_t arg1, bitblock128_t arg2);
199static inline bitblock128_t hsimd_packss_4(bitblock128_t arg1, bitblock128_t arg2);
200static inline bitblock128_t hsimd_packss_8(bitblock128_t arg1, bitblock128_t arg2);
201static inline bitblock128_t hsimd_packss_64(bitblock128_t arg1, bitblock128_t arg2);
202static inline bitblock128_t hsimd_packss_128(bitblock128_t arg1, bitblock128_t arg2);
203static inline bitblock128_t hsimd_packss_16(bitblock128_t arg1, bitblock128_t arg2);
204static inline bitblock128_t bitblock_srl(bitblock128_t arg1, bitblock128_t arg2);
205static inline void bitblock_store_aligned(bitblock128_t arg1, bitblock128_t* arg2);
206static inline bitblock128_t simd_eq_32(bitblock128_t arg1, bitblock128_t arg2);
207static inline bitblock128_t simd_eq_1(bitblock128_t arg1, bitblock128_t arg2);
208static inline bitblock128_t simd_eq_2(bitblock128_t arg1, bitblock128_t arg2);
209static inline bitblock128_t simd_eq_4(bitblock128_t arg1, bitblock128_t arg2);
210static inline bitblock128_t simd_eq_8(bitblock128_t arg1, bitblock128_t arg2);
211static inline bitblock128_t simd_eq_64(bitblock128_t arg1, bitblock128_t arg2);
212static inline bitblock128_t simd_eq_128(bitblock128_t arg1, bitblock128_t arg2);
213static inline bitblock128_t simd_eq_16(bitblock128_t arg1, bitblock128_t arg2);
214static inline bitblock128_t simd_popcount_32(bitblock128_t arg1);
215static inline bitblock128_t simd_popcount_1(bitblock128_t arg1);
216static inline bitblock128_t simd_popcount_2(bitblock128_t arg1);
217static inline bitblock128_t simd_popcount_4(bitblock128_t arg1);
218static inline bitblock128_t simd_popcount_8(bitblock128_t arg1);
219static inline bitblock128_t simd_popcount_64(bitblock128_t arg1);
220static inline bitblock128_t simd_popcount_128(bitblock128_t arg1);
221static inline bitblock128_t simd_popcount_16(bitblock128_t arg1);
222static inline bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2);
// Extracts 32-bit field pos as an integer by combining two 16-bit extracts:
// field 2*pos+1 forms the upper 16 bits (widened to uint64_t before the
// shift) and field 2*pos the lower 16 bits.
// pos is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 2.0 operations.
#define mvmd_extract_32(arg1, pos) \
        ((((uint64_t)(mvmd_extract_16(arg1, ((2 * (pos)) + 1)))) << (16)) | mvmd_extract_16(arg1, (2 * (pos))))
226
// Extracts 1-bit field pos from the enclosing 2-bit field pos/2: an even pos
// takes the low bit (& 1), an odd pos takes the high bit (>> 1).
// pos is parenthesized so expression arguments divide and reduce as a whole.
// Generator-reported cost: 1.0 operations.
#define mvmd_extract_1(arg1, pos) \
        ((((pos) % 2) == 0) ? (mvmd_extract_2(arg1, ((pos) / 2)) & (1)) : (mvmd_extract_2(arg1, ((pos) / 2)) >> 1))
230
// Extracts 2-bit field pos from the enclosing 4-bit field pos/2: an even pos
// takes the low two bits (& 3), an odd pos takes the high two bits (>> 2).
// pos is parenthesized so expression arguments divide and reduce as a whole.
// Generator-reported cost: 1.0 operations.
#define mvmd_extract_2(arg1, pos) \
        ((((pos) % 2) == 0) ? (mvmd_extract_4(arg1, ((pos) / 2)) & (3)) : (mvmd_extract_4(arg1, ((pos) / 2)) >> 2))
234
// Extracts 4-bit field pos from the enclosing byte pos/2: an even pos takes
// the low nibble (& 15), an odd pos takes the high nibble (>> 4).
// pos is parenthesized so expression arguments divide and reduce as a whole.
// Generator-reported cost: 1.0 operations.
#define mvmd_extract_4(arg1, pos) \
        ((((pos) % 2) == 0) ? (mvmd_extract_8(arg1, ((pos) / 2)) & (15)) : (mvmd_extract_8(arg1, ((pos) / 2)) >> 4))
238
// Extracts byte pos from the enclosing 16-bit field pos/2: an even pos takes
// the low byte (& 255), an odd pos takes the high byte (>> 8).
// pos is parenthesized so expression arguments divide and reduce as a whole.
// Generator-reported cost: 1.0 operations.
#define mvmd_extract_8(arg1, pos) \
        ((((pos) % 2) == 0) ? (mvmd_extract_16(arg1, ((pos) / 2)) & (255)) : (mvmd_extract_16(arg1, ((pos) / 2)) >> 8))
242
// Extracts 64-bit field pos as an integer by combining two 32-bit extracts:
// field 2*pos+1 forms the upper 32 bits (widened to uint64_t before the
// shift) and field 2*pos the lower 32 bits.
// pos is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 4.0 operations.
#define mvmd_extract_64(arg1, pos) \
        ((((uint64_t)(mvmd_extract_32(arg1, ((2 * (pos)) + 1)))) << (32)) | mvmd_extract_32(arg1, (2 * (pos))))
246
// Extracts 16-bit field pos with _mm_extract_epi16 and masks the result to
// its low 16 bits. pos must be a compile-time constant (intrinsic immediate).
// Generator-reported cost: 1.0 operations.
#define mvmd_extract_16(arg1, pos) \
        (_mm_extract_epi16((arg1), (int32_t)(pos)) & 65535)
250
251static inline bitblock128_t simd_neg_32(bitblock128_t arg1);
252static inline bitblock128_t simd_neg_2(bitblock128_t arg1);
253static inline bitblock128_t simd_neg_4(bitblock128_t arg1);
254static inline bitblock128_t simd_neg_8(bitblock128_t arg1);
255static inline bitblock128_t simd_neg_64(bitblock128_t arg1);
256static inline bitblock128_t simd_neg_128(bitblock128_t arg1);
257static inline bitblock128_t simd_neg_16(bitblock128_t arg1);
// Replicates 32-bit field pos into all four 32-bit fields by shuffling with
// a mask whose four selectors are all pos.
// pos is parenthesized before it enters shufflemask4, so an expression
// argument is shifted as a whole when the mask is built.
// Generator-reported cost: 1.0 operations.
#define mvmd_splat_32(arg1, pos) \
        mvmd_shufflei_32(arg1, shufflemask4((pos), (pos), (pos), (pos)))
261
// Replicates bit pos across the block: shifts the block right by pos bits,
// isolates the lowest bit with simd_and, and subtracts that 0/1 value from
// zero with simd_sub_128.
// The constants are passed as UINT64_C literals because simd_constant_128
// shifts its argument right by 32, which is undefined for a plain int
// literal. pos is forwarded in parentheses so expression arguments stay whole.
// Generator-reported cost: 12.6666666667 operations.
#define mvmd_splat_1(arg1, pos) \
        simd_sub_128(simd_constant_128(UINT64_C(0)), simd_and(simd_constant_128(UINT64_C(1)), simd_srli_128(arg1, (pos))))
265
// Replicates 2-bit field pos: first duplicates the chosen 2-bit half inside
// every 4-bit field (even pos: low half kept and copied up with simd_slli_4;
// odd pos: high half kept and copied down with simd_srli_4), then defers to
// mvmd_splat_4 for field pos/2.
// pos is parenthesized so expression arguments divide and reduce as a whole.
// Generator-reported cost: 13.0 operations.
#define mvmd_splat_2(arg1, pos) \
        mvmd_splat_4(simd_or(((((pos) % 2) == 0) ? simd_slli_4(arg1, 2) : simd_srli_4(arg1, 2)), \
                             ((((pos) % 2) == 0) ? simd_and(simd_lomask_4(), arg1) : simd_and(simd_himask_4(), arg1))), \
                     ((pos) / 2))
269
// Replicates 4-bit field pos: first duplicates the chosen nibble inside every
// byte (even pos: low nibble kept and copied up with simd_slli_8; odd pos:
// high nibble kept and copied down with simd_srli_8), then defers to
// mvmd_splat_8 for field pos/2.
// pos is parenthesized so expression arguments divide and reduce as a whole.
// Generator-reported cost: 9.0 operations.
#define mvmd_splat_4(arg1, pos) \
        mvmd_splat_8(simd_or(((((pos) % 2) == 0) ? simd_slli_8(arg1, 4) : simd_srli_8(arg1, 4)), \
                             ((((pos) % 2) == 0) ? simd_and(simd_lomask_8(), arg1) : simd_and(simd_himask_8(), arg1))), \
                     ((pos) / 2))
273
// Replicates byte pos: first duplicates the chosen byte inside every 16-bit
// field (even pos: low byte kept and copied up with simd_slli_16; odd pos:
// high byte kept and copied down with simd_srli_16), then defers to
// mvmd_splat_16 for field pos/2.
// pos is parenthesized so expression arguments divide and reduce as a whole.
// Generator-reported cost: 5.0 operations.
#define mvmd_splat_8(arg1, pos) \
        mvmd_splat_16(simd_or(((((pos) % 2) == 0) ? simd_slli_16(arg1, 8) : simd_srli_16(arg1, 8)), \
                              ((((pos) % 2) == 0) ? simd_and(simd_lomask_16(), arg1) : simd_and(simd_himask_16(), arg1))), \
                      ((pos) / 2))
277
// Replicates 64-bit field pos: splats 32-bit fields 2*pos+1 and 2*pos
// separately and merges the two results with simd_ifh_1 under
// simd_himask_64().
// pos is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 5.0 operations.
#define mvmd_splat_64(arg1, pos) \
        simd_ifh_1(simd_himask_64(), mvmd_splat_32(arg1, ((2 * (pos)) + 1)), mvmd_splat_32(arg1, (2 * (pos))))
281
// Replicates 128-bit field pos: splats 64-bit fields 2*pos+1 and 2*pos
// separately and merges the two results with simd_ifh_1 under
// simd_himask_128().
// pos is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 13.0 operations.
#define mvmd_splat_128(arg1, pos) \
        simd_ifh_1(simd_himask_128(), mvmd_splat_64(arg1, ((2 * (pos)) + 1)), mvmd_splat_64(arg1, (2 * (pos))))
285
// Replicates 16-bit field pos: reads it with _mm_extract_epi16 and passes the
// value to mvmd_fill_16. pos must be a compile-time constant (intrinsic
// immediate).
// Generator-reported cost: 2.0 operations.
#define mvmd_splat_16(arg1, pos) \
        (mvmd_fill_16(_mm_extract_epi16((arg1), (int32_t)(pos))))
289
290static inline bitblock128_t hsimd_packh_32(bitblock128_t arg1, bitblock128_t arg2);
291static inline bitblock128_t hsimd_packh_2(bitblock128_t arg1, bitblock128_t arg2);
292static inline bitblock128_t hsimd_packh_4(bitblock128_t arg1, bitblock128_t arg2);
293static inline bitblock128_t hsimd_packh_8(bitblock128_t arg1, bitblock128_t arg2);
294static inline bitblock128_t hsimd_packh_64(bitblock128_t arg1, bitblock128_t arg2);
295static inline bitblock128_t hsimd_packh_128(bitblock128_t arg1, bitblock128_t arg2);
296static inline bitblock128_t hsimd_packh_16(bitblock128_t arg1, bitblock128_t arg2);
297static inline bitblock128_t simd_himask_32();
298static inline bitblock128_t simd_himask_2();
299static inline bitblock128_t simd_himask_4();
300static inline bitblock128_t simd_himask_8();
301static inline bitblock128_t simd_himask_64();
302static inline bitblock128_t simd_himask_128();
303static inline bitblock128_t simd_himask_16();
// Left shift by immediate on 32-bit fields: expands directly to
// _mm_slli_epi32 with the count cast to int32_t.
// Generator-reported cost: 1.0 operations.
#define simd_slli_32(arg1, sh) \
        (_mm_slli_epi32((arg1), (int32_t)(sh)))
307
// Left shift by immediate on 2-bit fields: shifts with simd_slli_32 and then
// masks every 2-bit field with ((3 << sh) & 3) through simd_and, which
// removes the bits that were shifted in from the neighbouring field.
// sh is parenthesized so expression arguments shift the mask correctly.
// Generator-reported cost: 2.0 operations.
#define simd_slli_2(arg1, sh) \
        simd_and(simd_slli_32(arg1, (sh)), simd_constant_2((((3) << (sh)) & (3))))
311
// Left shift by immediate on 4-bit fields: shifts with simd_slli_32 and then
// masks every 4-bit field with ((15 << sh) & 15) through simd_and, which
// removes the bits that were shifted in from the neighbouring field.
// sh is parenthesized so expression arguments shift the mask correctly.
// Generator-reported cost: 2.0 operations.
#define simd_slli_4(arg1, sh) \
        simd_and(simd_slli_32(arg1, (sh)), simd_constant_4((((15) << (sh)) & (15))))
315
// Left shift by immediate on 8-bit fields: shifts with simd_slli_32 and then
// masks every byte with ((255 << sh) & 255) through simd_and, which removes
// the bits that were shifted in from the neighbouring byte.
// sh is parenthesized so expression arguments shift the mask correctly.
// Generator-reported cost: 2.0 operations.
#define simd_slli_8(arg1, sh) \
        simd_and(simd_slli_32(arg1, (sh)), simd_constant_8((((255) << (sh)) & (255))))
319
// Left shift by immediate on 64-bit fields: expands directly to
// _mm_slli_epi64 with the count cast to int32_t.
// Generator-reported cost: 1.0 operations.
#define simd_slli_64(arg1, sh) \
        (_mm_slli_epi64((arg1), (int32_t)(sh)))
323
// Left shift by immediate of the whole 128-bit block, by sh bits.
//  - sh a multiple of 8: a single byte shift (_mm_slli_si128 by sh/8).
//  - otherwise, sh >= 64: byte-shift by 8, then shift the 64-bit fields by
//    sh & 63.
//  - otherwise: shift both 64-bit fields by sh and OR in the bits that cross
//    from the lower field into the upper one.
// sh must be a compile-time constant (the intrinsics take immediates); every
// use of sh is parenthesized so expression arguments are evaluated as a whole.
// Generator-reported cost: 2.33333333333 operations.
#define simd_slli_128(arg1, sh) \
        ((((sh) % 8) == 0) \
            ? _mm_slli_si128(arg1, (int32_t)(((sh) / 8))) \
            : (((sh) >= 64) \
                ? simd_slli_64(_mm_slli_si128(arg1, (int32_t)(8)), ((sh) & 63)) \
                : simd_or(simd_slli_64(arg1, (sh)), \
                          _mm_slli_si128(simd_srli_64(arg1, ((128 - (sh)) & 63)), (int32_t)(8)))))
327
// Left shift by immediate on 16-bit fields: expands directly to
// _mm_slli_epi16 with the count cast to int32_t.
// Generator-reported cost: 1.0 operations.
#define simd_slli_16(arg1, sh) \
        (_mm_slli_epi16((arg1), (int32_t)(sh)))
331
332static inline bool bitblock_all(bitblock128_t arg1);
333static inline bitblock128_t simd_ifh_32(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
334static inline bitblock128_t simd_ifh_1(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
335static inline bitblock128_t simd_ifh_2(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
336static inline bitblock128_t simd_ifh_4(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
337static inline bitblock128_t simd_ifh_8(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
338static inline bitblock128_t simd_ifh_64(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
339static inline bitblock128_t simd_ifh_128(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
340static inline bitblock128_t simd_ifh_16(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3);
341static inline bitblock128_t simd_sub_32(bitblock128_t arg1, bitblock128_t arg2);
342static inline bitblock128_t simd_sub_1(bitblock128_t arg1, bitblock128_t arg2);
343static inline bitblock128_t simd_sub_2(bitblock128_t arg1, bitblock128_t arg2);
344static inline bitblock128_t simd_sub_4(bitblock128_t arg1, bitblock128_t arg2);
345static inline bitblock128_t simd_sub_8(bitblock128_t arg1, bitblock128_t arg2);
346static inline bitblock128_t simd_sub_64(bitblock128_t arg1, bitblock128_t arg2);
347static inline bitblock128_t simd_sub_128(bitblock128_t arg1, bitblock128_t arg2);
348static inline bitblock128_t simd_sub_16(bitblock128_t arg1, bitblock128_t arg2);
349static inline bitblock128_t simd_add_hl_32(bitblock128_t arg1);
350static inline bitblock128_t simd_add_hl_2(bitblock128_t arg1);
351static inline bitblock128_t simd_add_hl_4(bitblock128_t arg1);
352static inline bitblock128_t simd_add_hl_8(bitblock128_t arg1);
353static inline bitblock128_t simd_add_hl_64(bitblock128_t arg1);
354static inline bitblock128_t simd_add_hl_128(bitblock128_t arg1);
355static inline bitblock128_t simd_add_hl_16(bitblock128_t arg1);
356static inline bitblock128_t simd_srl_64(bitblock128_t arg1, bitblock128_t shift_mask);
357static inline bitblock128_t simd_srl_128(bitblock128_t arg1, bitblock128_t shift_mask);
// Moves the block left by sh 32-bit fields; implemented as mvmd_slli_16 with
// the count doubled.
// sh is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 1.0 operations.
#define mvmd_slli_32(arg1, sh) \
        mvmd_slli_16(arg1, ((sh) * 2))
361
// Moves the block left by sh 2-bit fields; implemented as a whole-block bit
// shift (simd_slli_128) by 2*sh bits.
// sh is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 2.33333333333 operations.
#define mvmd_slli_2(arg1, sh) \
        simd_slli_128(arg1, ((sh) * 2))
365
// Moves the block left by sh 4-bit fields; implemented as mvmd_slli_2 with
// the count doubled.
// sh is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 2.33333333333 operations.
#define mvmd_slli_4(arg1, sh) \
        mvmd_slli_2(arg1, ((sh) * 2))
369
// Moves the block left by sh bytes: expands directly to _mm_slli_si128 with
// the count cast to int32_t. sh must be a compile-time constant.
// Generator-reported cost: 1.0 operations.
#define mvmd_slli_8(arg1, sh) \
        (_mm_slli_si128((arg1), (int32_t)(sh)))
373
// Moves the block left by sh 64-bit fields; implemented as mvmd_slli_32 with
// the count doubled.
// sh is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 1.0 operations.
#define mvmd_slli_64(arg1, sh) \
        mvmd_slli_32(arg1, ((sh) * 2))
377
// Moves the block left by sh 128-bit fields; implemented as mvmd_slli_64
// with the count doubled.
// sh is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 1.0 operations.
#define mvmd_slli_128(arg1, sh) \
        mvmd_slli_64(arg1, ((sh) * 2))
381
// Moves the block left by sh 16-bit fields; implemented as mvmd_slli_8 (a
// byte shift) with the count doubled.
// sh is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 1.0 operations.
#define mvmd_slli_16(arg1, sh) \
        mvmd_slli_8(arg1, ((sh) * 2))
385
386static inline bitblock128_t simd_lomask_32();
387static inline bitblock128_t simd_lomask_2();
388static inline bitblock128_t simd_lomask_4();
389static inline bitblock128_t simd_lomask_8();
390static inline bitblock128_t simd_lomask_64();
391static inline bitblock128_t simd_lomask_128();
392static inline bitblock128_t simd_lomask_16();
393static inline uint64_t hsimd_signmask_32(bitblock128_t arg1);
394static inline uint64_t hsimd_signmask_4(bitblock128_t arg1);
395static inline uint64_t hsimd_signmask_8(bitblock128_t arg1);
396static inline uint64_t hsimd_signmask_64(bitblock128_t arg1);
397static inline uint64_t hsimd_signmask_128(bitblock128_t arg1);
398static inline uint64_t hsimd_signmask_16(bitblock128_t arg1);
399static inline bitblock128_t esimd_zeroextendh_32(bitblock128_t arg1);
400static inline bitblock128_t esimd_zeroextendh_1(bitblock128_t arg1);
401static inline bitblock128_t esimd_zeroextendh_2(bitblock128_t arg1);
402static inline bitblock128_t esimd_zeroextendh_4(bitblock128_t arg1);
403static inline bitblock128_t esimd_zeroextendh_8(bitblock128_t arg1);
404static inline bitblock128_t esimd_zeroextendh_64(bitblock128_t arg1);
405static inline bitblock128_t esimd_zeroextendh_16(bitblock128_t arg1);
406static inline bitblock128_t esimd_zeroextendl_32(bitblock128_t arg1);
407static inline bitblock128_t esimd_zeroextendl_1(bitblock128_t arg1);
408static inline bitblock128_t esimd_zeroextendl_2(bitblock128_t arg1);
409static inline bitblock128_t esimd_zeroextendl_4(bitblock128_t arg1);
410static inline bitblock128_t esimd_zeroextendl_8(bitblock128_t arg1);
411static inline bitblock128_t esimd_zeroextendl_64(bitblock128_t arg1);
412static inline bitblock128_t esimd_zeroextendl_16(bitblock128_t arg1);
413static inline bitblock128_t mvmd_fill4_32(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
414static inline bitblock128_t mvmd_fill4_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
415static inline bitblock128_t mvmd_fill4_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
416static inline bitblock128_t mvmd_fill4_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
417static inline bitblock128_t mvmd_fill4_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
418static inline bitblock128_t mvmd_fill4_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4);
419static inline bitblock128_t simd_umin_32(bitblock128_t arg1, bitblock128_t arg2);
420static inline bitblock128_t simd_umin_1(bitblock128_t arg1, bitblock128_t arg2);
421static inline bitblock128_t simd_umin_2(bitblock128_t arg1, bitblock128_t arg2);
422static inline bitblock128_t simd_umin_4(bitblock128_t arg1, bitblock128_t arg2);
423static inline bitblock128_t simd_umin_8(bitblock128_t arg1, bitblock128_t arg2);
424static inline bitblock128_t simd_umin_64(bitblock128_t arg1, bitblock128_t arg2);
425static inline bitblock128_t simd_umin_128(bitblock128_t arg1, bitblock128_t arg2);
426static inline bitblock128_t simd_umin_16(bitblock128_t arg1, bitblock128_t arg2);
// Moves the block right by sh 32-bit fields; implemented as mvmd_srli_16
// with the count doubled.
// sh is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 1.0 operations.
#define mvmd_srli_32(arg1, sh) \
        mvmd_srli_16(arg1, ((sh) * 2))
430
// Moves the block right by sh 2-bit fields; implemented as a whole-block bit
// shift (simd_srli_128) by 2*sh bits.
// sh is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 2.33333333333 operations.
#define mvmd_srli_2(arg1, sh) \
        simd_srli_128(arg1, ((sh) * 2))
434
// Moves the block right by sh 4-bit fields; implemented as a whole-block bit
// shift (simd_srli_128) by 4*sh bits.
// sh is parenthesized so expression arguments are scaled as a whole.
// Generator-reported cost: 2.33333333333 operations.
#define mvmd_srli_4(arg1, sh) \
        simd_srli_128(arg1, ((sh) * 4))
438
// Moves the block right by sh bytes: expands directly to _mm_srli_si128 with
// the count cast to int32_t. sh must be a compile-time constant.
// Generator-reported cost: 1.0 operations.
#define mvmd_srli_8(arg1, sh) \
        (_mm_srli_si128((arg1), (int32_t)(sh)))
442
// Moves the block right by sh 64-bit fields; implemented as mvmd_srli_32
// with the count doubled.
// sh is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 1.0 operations.
#define mvmd_srli_64(arg1, sh) \
        mvmd_srli_32(arg1, ((sh) * 2))
446
// Moves the block right by sh 128-bit fields; implemented as mvmd_srli_64
// with the count doubled.
// sh is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 1.0 operations.
#define mvmd_srli_128(arg1, sh) \
        mvmd_srli_64(arg1, ((sh) * 2))
450
// Moves the block right by sh 16-bit fields; implemented as mvmd_srli_8 (a
// byte shift) with the count doubled.
// sh is parenthesized so expression arguments are doubled as a whole.
// Generator-reported cost: 1.0 operations.
#define mvmd_srli_16(arg1, sh) \
        mvmd_srli_8(arg1, ((sh) * 2))
454
// Block with val in every 32-bit field: expands directly to _mm_set1_epi32
// with val cast to int32_t.
// Generator-reported cost: 0 operations.
#define simd_constant_32(val) \
        (_mm_set1_epi32((int32_t)(val)))
458
// Block with val in every 1-bit field: passes -1 * val to simd_constant_32,
// so val == 1 yields -1 (all bits set) and val == 0 yields 0.
// val is parenthesized so an expression argument is negated as a whole.
// Generator-reported cost: 0 operations.
#define simd_constant_1(val) \
        simd_constant_32((-1 * (val)))
462
// Block with val in every 2-bit field: duplicates val into both halves of a
// 4-bit value and defers to simd_constant_4.
//  - val >= 0: (val << 2) | val.
//  - val <  0: (val * 4) | (val ^ -4); the XOR clears the sign-extension bits
//    above bit 1 so only the low two bits of val fill the lower half. The
//    multiplication replaces a left shift of a negative value, which is
//    undefined behaviour in C; the resulting bit pattern is the same.
// val is parenthesized at every use so expression arguments stay whole.
// Generator-reported cost: 0 operations.
#define simd_constant_2(val) \
        (((val) < 0) ? simd_constant_4((((val) * 4) | ((val) ^ (-4)))) : simd_constant_4((((val) << 2) | (val))))
466
// Block with val in every 4-bit field: duplicates val into both nibbles of a
// byte value and defers to simd_constant_8.
//  - val >= 0: (val << 4) | val.
//  - val <  0: (val * 16) | (val ^ -16); the XOR clears the sign-extension
//    bits above bit 3 so only the low nibble of val fills the lower half. The
//    multiplication replaces a left shift of a negative value, which is
//    undefined behaviour in C; the resulting bit pattern is the same.
// val is parenthesized at every use so expression arguments stay whole.
// Generator-reported cost: 0 operations.
#define simd_constant_4(val) \
        (((val) < 0) ? simd_constant_8((((val) * 16) | ((val) ^ (-16)))) : simd_constant_8((((val) << 4) | (val))))
470
// Block with val in every byte: expands directly to _mm_set1_epi8 with val
// cast to int32_t first.
// Generator-reported cost: 0 operations.
#define simd_constant_8(val) \
        (_mm_set1_epi8((int32_t)(val)))
474
// Block with val in both 64-bit fields, built with _mm_set_epi32 from the
// upper and lower 32-bit halves of val.
// val is widened to uint64_t before the shift by 32: shifting a 32-bit
// operand (for example the literal 1) by 32 is undefined behaviour in C.
// For 64-bit arguments the stored bit pattern is unchanged.
// Generator-reported cost: 0 operations.
#define simd_constant_64(val) \
        _mm_set_epi32((int32_t)(((uint64_t)(val)) >> 32), (int32_t)(val), \
                      (int32_t)(((uint64_t)(val)) >> 32), (int32_t)(val))
478
// Block holding val in its low 64 bits; the upper 64 bits are always zero
// (the first two _mm_set_epi32 arguments are 0).
// val is widened to uint64_t before the shift by 32: shifting a 32-bit
// operand (for example the literals 0 and 1 used by mvmd_splat_1) by 32 is
// undefined behaviour in C. For 64-bit arguments the stored bit pattern is
// unchanged.
// Generator-reported cost: 0 operations.
#define simd_constant_128(val) \
        _mm_set_epi32((int32_t)(0), (int32_t)(0), \
                      (int32_t)(((uint64_t)(val)) >> 32), (int32_t)(val))
482
// Block with val in every 16-bit field: expands directly to _mm_set1_epi16
// with val cast to int32_t first.
// Generator-reported cost: 0 operations.
#define simd_constant_16(val) \
        (_mm_set1_epi16((int32_t)(val)))
486
487static inline bitblock128_t simd_min_32(bitblock128_t arg1, bitblock128_t arg2);
488static inline bitblock128_t simd_min_1(bitblock128_t arg1, bitblock128_t arg2);
489static inline bitblock128_t simd_min_2(bitblock128_t arg1, bitblock128_t arg2);
490static inline bitblock128_t simd_min_4(bitblock128_t arg1, bitblock128_t arg2);
491static inline bitblock128_t simd_min_8(bitblock128_t arg1, bitblock128_t arg2);
492static inline bitblock128_t simd_min_64(bitblock128_t arg1, bitblock128_t arg2);
493static inline bitblock128_t simd_min_128(bitblock128_t arg1, bitblock128_t arg2);
494static inline bitblock128_t simd_min_16(bitblock128_t arg1, bitblock128_t arg2);
495static inline bitblock128_t mvmd_fill2_32(uint64_t val1, uint64_t val2);
496static inline bitblock128_t mvmd_fill2_1(uint64_t val1, uint64_t val2);
497static inline bitblock128_t mvmd_fill2_2(uint64_t val1, uint64_t val2);
498static inline bitblock128_t mvmd_fill2_4(uint64_t val1, uint64_t val2);
499static inline bitblock128_t mvmd_fill2_8(uint64_t val1, uint64_t val2);
500static inline bitblock128_t mvmd_fill2_64(uint64_t val1, uint64_t val2);
501static inline bitblock128_t mvmd_fill2_16(uint64_t val1, uint64_t val2);
502static inline bool bitblock_any(bitblock128_t arg1);
503static inline uint64_t bitblock_popcount(bitblock128_t arg1);
// Whole-block left shift by immediate: an alias for simd_slli_128.
// sh is forwarded in parentheses so an expression argument reaches the
// arithmetic inside simd_slli_128 as a single operand.
// Generator-reported cost: 2.33333333333 operations.
#define bitblock_slli(arg1, sh) \
        simd_slli_128(arg1, (sh))
507
508static inline bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2);
509static inline bitblock128_t hsimd_packl_32(bitblock128_t arg1, bitblock128_t arg2);
510static inline bitblock128_t hsimd_packl_2(bitblock128_t arg1, bitblock128_t arg2);
511static inline bitblock128_t hsimd_packl_4(bitblock128_t arg1, bitblock128_t arg2);
512static inline bitblock128_t hsimd_packl_8(bitblock128_t arg1, bitblock128_t arg2);
513static inline bitblock128_t hsimd_packl_64(bitblock128_t arg1, bitblock128_t arg2);
514static inline bitblock128_t hsimd_packl_128(bitblock128_t arg1, bitblock128_t arg2);
515static inline bitblock128_t hsimd_packl_16(bitblock128_t arg1, bitblock128_t arg2);
// Double-block left shift by sh 32-bit fields: ORs arg1 moved with
// mvmd_slli_32 by sh fields and arg2 moved with mvmd_srli_32 by (4 - sh)
// fields (4 = number of 32-bit fields in a block).
// sh is parenthesized so expression arguments subtract and scale as a whole.
// Generator-reported cost: 3.0 operations.
#define mvmd_dslli_32(arg1, arg2, sh) \
        simd_or(mvmd_slli_32(arg1, (sh)), mvmd_srli_32(arg2, ((4) - (sh))))
519
// Double-block left shift by sh 2-bit fields: ORs arg1 moved with
// mvmd_slli_2 by sh fields and arg2 moved with mvmd_srli_2 by (64 - sh)
// fields (64 = number of 2-bit fields in a block).
// sh is parenthesized so expression arguments subtract and scale as a whole.
// Generator-reported cost: 5.66666666667 operations.
#define mvmd_dslli_2(arg1, arg2, sh) \
        simd_or(mvmd_slli_2(arg1, (sh)), mvmd_srli_2(arg2, ((64) - (sh))))
523
// Double-block left shift by sh 4-bit fields: ORs arg1 moved with
// mvmd_slli_4 by sh fields and arg2 moved with mvmd_srli_4 by (32 - sh)
// fields (32 = number of 4-bit fields in a block).
// sh is parenthesized so expression arguments subtract and scale as a whole.
// Generator-reported cost: 5.66666666667 operations.
#define mvmd_dslli_4(arg1, arg2, sh) \
        simd_or(mvmd_slli_4(arg1, (sh)), mvmd_srli_4(arg2, ((32) - (sh))))
527
// Double-block left shift by sh bytes: ORs arg1 moved with mvmd_slli_8 by
// sh bytes and arg2 moved with mvmd_srli_8 by (16 - sh) bytes
// (16 = number of bytes in a block).
// sh is parenthesized so expression arguments subtract as a whole.
// Generator-reported cost: 3.0 operations.
#define mvmd_dslli_8(arg1, arg2, sh) \
        simd_or(mvmd_slli_8(arg1, (sh)), mvmd_srli_8(arg2, ((16) - (sh))))
531
// Double-block left shift by sh 64-bit fields: ORs arg1 moved with
// mvmd_slli_64 by sh fields and arg2 moved with mvmd_srli_64 by (2 - sh)
// fields (2 = number of 64-bit fields in a block).
// sh is parenthesized so expression arguments subtract and scale as a whole.
// Generator-reported cost: 3.0 operations.
#define mvmd_dslli_64(arg1, arg2, sh) \
        simd_or(mvmd_slli_64(arg1, (sh)), mvmd_srli_64(arg2, ((2) - (sh))))
535
// Double-block left shift by sh 128-bit fields: ORs arg1 moved with
// mvmd_slli_128 by sh fields and arg2 moved with mvmd_srli_128 by (1 - sh)
// fields (a block holds exactly one 128-bit field).
// sh is parenthesized so expression arguments subtract and scale as a whole.
// Generator-reported cost: 3.0 operations.
#define mvmd_dslli_128(arg1, arg2, sh) \
        simd_or(mvmd_slli_128(arg1, (sh)), mvmd_srli_128(arg2, ((1) - (sh))))
539
// Double-block left shift by sh 16-bit fields: ORs arg1 moved with
// mvmd_slli_16 by sh fields and arg2 moved with mvmd_srli_16 by (8 - sh)
// fields (8 = number of 16-bit fields in a block).
// sh is parenthesized so expression arguments subtract and scale as a whole.
// Generator-reported cost: 3.0 operations.
#define mvmd_dslli_16(arg1, arg2, sh) \
        simd_or(mvmd_slli_16(arg1, (sh)), mvmd_srli_16(arg2, ((8) - (sh))))
543
544static inline bitblock128_t mvmd_fill8_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
545static inline bitblock128_t mvmd_fill8_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
546static inline bitblock128_t mvmd_fill8_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
547static inline bitblock128_t mvmd_fill8_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
548static inline bitblock128_t mvmd_fill8_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8);
549static inline bitblock128_t hsimd_min_hl_32(bitblock128_t arg1, bitblock128_t arg2);
550static inline bitblock128_t hsimd_min_hl_2(bitblock128_t arg1, bitblock128_t arg2);
551static inline bitblock128_t hsimd_min_hl_4(bitblock128_t arg1, bitblock128_t arg2);
552static inline bitblock128_t hsimd_min_hl_8(bitblock128_t arg1, bitblock128_t arg2);
553static inline bitblock128_t hsimd_min_hl_64(bitblock128_t arg1, bitblock128_t arg2);
554static inline bitblock128_t hsimd_min_hl_128(bitblock128_t arg1, bitblock128_t arg2);
555static inline bitblock128_t hsimd_min_hl_16(bitblock128_t arg1, bitblock128_t arg2);
556static inline bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2);
557static inline bitblock128_t simd_umax_32(bitblock128_t arg1, bitblock128_t arg2);
558static inline bitblock128_t simd_umax_1(bitblock128_t arg1, bitblock128_t arg2);
559static inline bitblock128_t simd_umax_2(bitblock128_t arg1, bitblock128_t arg2);
560static inline bitblock128_t simd_umax_4(bitblock128_t arg1, bitblock128_t arg2);
561static inline bitblock128_t simd_umax_8(bitblock128_t arg1, bitblock128_t arg2);
562static inline bitblock128_t simd_umax_64(bitblock128_t arg1, bitblock128_t arg2);
563static inline bitblock128_t simd_umax_128(bitblock128_t arg1, bitblock128_t arg2);
564static inline bitblock128_t simd_umax_16(bitblock128_t arg1, bitblock128_t arg2);
565static inline bitblock128_t bitblock_load_aligned(const bitblock128_t* arg1);
566static inline void bitblock_store_unaligned(bitblock128_t arg1, bitblock128_t* arg2);
567static inline bitblock128_t esimd_signextendl_32(bitblock128_t arg1);
568static inline bitblock128_t esimd_signextendl_1(bitblock128_t arg1);
569static inline bitblock128_t esimd_signextendl_2(bitblock128_t arg1);
570static inline bitblock128_t esimd_signextendl_4(bitblock128_t arg1);
571static inline bitblock128_t esimd_signextendl_8(bitblock128_t arg1);
572static inline bitblock128_t esimd_signextendl_64(bitblock128_t arg1);
573static inline bitblock128_t esimd_signextendl_16(bitblock128_t arg1);
574static inline bitblock128_t hsimd_packus_32(bitblock128_t arg1, bitblock128_t arg2);
575static inline bitblock128_t hsimd_packus_2(bitblock128_t arg1, bitblock128_t arg2);
576static inline bitblock128_t hsimd_packus_4(bitblock128_t arg1, bitblock128_t arg2);
577static inline bitblock128_t hsimd_packus_8(bitblock128_t arg1, bitblock128_t arg2);
578static inline bitblock128_t hsimd_packus_64(bitblock128_t arg1, bitblock128_t arg2);
579static inline bitblock128_t hsimd_packus_128(bitblock128_t arg1, bitblock128_t arg2);
580static inline bitblock128_t hsimd_packus_16(bitblock128_t arg1, bitblock128_t arg2);
581static inline bitblock128_t simd_abs_32(bitblock128_t arg1);
582static inline bitblock128_t simd_abs_2(bitblock128_t arg1);
583static inline bitblock128_t simd_abs_4(bitblock128_t arg1);
584static inline bitblock128_t simd_abs_8(bitblock128_t arg1);
585static inline bitblock128_t simd_abs_64(bitblock128_t arg1);
586static inline bitblock128_t simd_abs_128(bitblock128_t arg1);
587static inline bitblock128_t simd_abs_16(bitblock128_t arg1);
588static inline bitblock128_t simd_xor_hl_32(bitblock128_t arg1);
589static inline bitblock128_t simd_xor_hl_2(bitblock128_t arg1);
590static inline bitblock128_t simd_xor_hl_4(bitblock128_t arg1);
591static inline bitblock128_t simd_xor_hl_8(bitblock128_t arg1);
592static inline bitblock128_t simd_xor_hl_64(bitblock128_t arg1);
593static inline bitblock128_t simd_xor_hl_128(bitblock128_t arg1);
594static inline bitblock128_t simd_xor_hl_16(bitblock128_t arg1);
//The total number of operations is 1.0
// Thin wrapper over _mm_srai_epi32: arithmetic right shift of each 32-bit
// element of arg1 by sh, with the count converted to int32_t.
#define simd_srai_32(arg1, sh) \
        _mm_srai_epi32((arg1), (int32_t)(sh))
598
//The total number of operations is 4.0
// Shift on 2-bit fields: a zero `sh` yields arg1 unchanged; any other `sh`
// yields (simd_himask_2() & arg1) | simd_srli_2(arg1, 1).
// `sh` and `arg1` are parenthesized so that expression arguments (e.g.
// `x & 1`, or a conditional expression) keep their intended grouping.
// Both parameters are expanded more than once.
#define simd_srai_2(arg1, sh) \
        (((sh) == 0) ? (arg1) : simd_or(simd_and(simd_himask_2(), (arg1)), simd_srli_2((arg1), 1)))
602
603static inline bitblock128_t simd_srai_4(bitblock128_t arg1, uint64_t sh);
604static inline bitblock128_t simd_srai_8(bitblock128_t arg1, uint64_t sh);
//The total number of operations is 4.5
// Arithmetic right shift on 64-bit fields, assembled from 32-bit pieces:
//  - upper part: simd_himask_64() & simd_srai_32(arg1, min(sh, 32));
//  - lower part: simd_srli_64(arg1, sh) when sh <= 32, otherwise the field is
//    moved down by 32 and simd_srai_32 is applied with the remaining sh - 32.
// Every use of `sh` is parenthesized so expression arguments (e.g. `32 | 8`)
// pick the correct branch and count. `arg1` and `sh` are expanded several times.
#define simd_srai_64(arg1, sh) \
        simd_or(simd_and(simd_himask_64(), simd_srai_32(arg1, (((sh) < (32)) ? (sh) : (32)))), (((sh) <= (32)) ? simd_srli_64(arg1, (sh)) : simd_srai_32(simd_srli_64(arg1, (32)), ((sh)-(32)))))
608
//The total number of operations is 11.0833333333
// Arithmetic right shift on the 128-bit field, assembled from 64-bit pieces:
//  - upper part: simd_himask_128() & simd_srai_64(arg1, min(sh, 64));
//  - lower part: simd_srli_128(arg1, sh) when sh <= 64, otherwise the block is
//    moved down by 64 and simd_srai_64 is applied with the remaining sh - 64.
// Every use of `sh` is parenthesized so expression arguments (e.g. `64 | 8`)
// pick the correct branch and count. `arg1` and `sh` are expanded several times.
#define simd_srai_128(arg1, sh) \
        simd_or(simd_and(simd_himask_128(), simd_srai_64(arg1, (((sh) < (64)) ? (sh) : (64)))), (((sh) <= (64)) ? simd_srli_128(arg1, (sh)) : simd_srai_64(simd_srli_128(arg1, (64)), ((sh)-(64)))))
612
//The total number of operations is 1.0
// Thin wrapper over _mm_srai_epi16: arithmetic right shift of each 16-bit
// element of arg1 by sh, with the count converted to int32_t.
#define simd_srai_16(arg1, sh) \
        _mm_srai_epi16((arg1), (int32_t)(sh))
616
617static inline bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2);
618static inline bitblock128_t mvmd_fill16_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
619static inline bitblock128_t mvmd_fill16_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
620static inline bitblock128_t mvmd_fill16_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
621static inline bitblock128_t mvmd_fill16_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16);
622static inline bitblock128_t simd_lt_32(bitblock128_t arg1, bitblock128_t arg2);
623static inline bitblock128_t simd_lt_1(bitblock128_t arg1, bitblock128_t arg2);
624static inline bitblock128_t simd_lt_2(bitblock128_t arg1, bitblock128_t arg2);
625static inline bitblock128_t simd_lt_4(bitblock128_t arg1, bitblock128_t arg2);
626static inline bitblock128_t simd_lt_8(bitblock128_t arg1, bitblock128_t arg2);
627static inline bitblock128_t simd_lt_64(bitblock128_t arg1, bitblock128_t arg2);
628static inline bitblock128_t simd_lt_128(bitblock128_t arg1, bitblock128_t arg2);
629static inline bitblock128_t simd_lt_16(bitblock128_t arg1, bitblock128_t arg2);
630static inline bitblock128_t simd_add_32(bitblock128_t arg1, bitblock128_t arg2);
631static inline bitblock128_t simd_add_1(bitblock128_t arg1, bitblock128_t arg2);
632static inline bitblock128_t simd_add_2(bitblock128_t arg1, bitblock128_t arg2);
633static inline bitblock128_t simd_add_4(bitblock128_t arg1, bitblock128_t arg2);
634static inline bitblock128_t simd_add_8(bitblock128_t arg1, bitblock128_t arg2);
635static inline bitblock128_t simd_add_64(bitblock128_t arg1, bitblock128_t arg2);
636static inline bitblock128_t simd_add_128(bitblock128_t arg1, bitblock128_t arg2);
637static inline bitblock128_t simd_add_16(bitblock128_t arg1, bitblock128_t arg2);
638static inline bitblock128_t simd_ugt_32(bitblock128_t arg1, bitblock128_t arg2);
639static inline bitblock128_t simd_ugt_1(bitblock128_t arg1, bitblock128_t arg2);
640static inline bitblock128_t simd_ugt_2(bitblock128_t arg1, bitblock128_t arg2);
641static inline bitblock128_t simd_ugt_4(bitblock128_t arg1, bitblock128_t arg2);
642static inline bitblock128_t simd_ugt_8(bitblock128_t arg1, bitblock128_t arg2);
643static inline bitblock128_t simd_ugt_64(bitblock128_t arg1, bitblock128_t arg2);
644static inline bitblock128_t simd_ugt_128(bitblock128_t arg1, bitblock128_t arg2);
645static inline bitblock128_t simd_ugt_16(bitblock128_t arg1, bitblock128_t arg2);
646
647//Implementation Starts here
648//The total number of operations is 1.0
649static inline bitblock128_t esimd_mergel_32(bitblock128_t arg1, bitblock128_t arg2)
650{
651        return _mm_unpacklo_epi32(arg2, arg1);
652}
653//The total number of operations is 31.0
654static inline bitblock128_t esimd_mergel_1(bitblock128_t arg1, bitblock128_t arg2)
655{
656        return esimd_mergel_2(simd_ifh_1(simd_himask_2(), arg1, simd_srli_2(arg2, 1)), simd_ifh_1(simd_himask_2(), simd_slli_2(arg1, 1), arg2));
657}
658//The total number of operations is 21.0
659static inline bitblock128_t esimd_mergel_2(bitblock128_t arg1, bitblock128_t arg2)
660{
661        return esimd_mergel_4(simd_ifh_1(simd_himask_4(), arg1, simd_srli_4(arg2, 2)), simd_ifh_1(simd_himask_4(), simd_slli_4(arg1, 2), arg2));
662}
663//The total number of operations is 11.0
664static inline bitblock128_t esimd_mergel_4(bitblock128_t arg1, bitblock128_t arg2)
665{
666        return esimd_mergel_8(simd_ifh_1(simd_himask_8(), arg1, simd_srli_8(arg2, 4)), simd_ifh_1(simd_himask_8(), simd_slli_8(arg1, 4), arg2));
667}
668//The total number of operations is 1.0
669static inline bitblock128_t esimd_mergel_8(bitblock128_t arg1, bitblock128_t arg2)
670{
671        return _mm_unpacklo_epi8(arg2, arg1);
672}
673//The total number of operations is 1.0
674static inline bitblock128_t esimd_mergel_64(bitblock128_t arg1, bitblock128_t arg2)
675{
676        return _mm_unpacklo_epi64(arg2, arg1);
677}
678//The total number of operations is 1.0
679static inline bitblock128_t esimd_mergel_16(bitblock128_t arg1, bitblock128_t arg2)
680{
681        return _mm_unpacklo_epi16(arg2, arg1);
682}
683//The total number of operations is 11.0
684static inline bitblock128_t esimd_signextendh_32(bitblock128_t arg1)
685{
686        return esimd_mergeh_64(simd_srai_64(arg1, 32), simd_srai_64(simd_slli_64(arg1, 32), 32));
687}
688//The total number of operations is 31.0
689static inline bitblock128_t esimd_signextendh_1(bitblock128_t arg1)
690{
691        return esimd_mergeh_2(simd_srai_2(arg1, 1), simd_srai_2(simd_slli_2(arg1, 1), 1));
692}
693//The total number of operations is 33.0
694static inline bitblock128_t esimd_signextendh_2(bitblock128_t arg1)
695{
696        return esimd_mergeh_4(simd_srai_4(arg1, 2), simd_srai_4(simd_slli_4(arg1, 2), 2));
697}
698//The total number of operations is 13.0
699static inline bitblock128_t esimd_signextendh_4(bitblock128_t arg1)
700{
701        return esimd_mergeh_8(simd_srai_8(arg1, 4), simd_srai_8(simd_slli_8(arg1, 4), 4));
702}
703//The total number of operations is 4.0
704static inline bitblock128_t esimd_signextendh_8(bitblock128_t arg1)
705{
706        return esimd_mergeh_16(simd_srai_16(arg1, 8), simd_srai_16(simd_slli_16(arg1, 8), 8));
707}
708//The total number of operations is 11.0833333333
709static inline bitblock128_t esimd_signextendh_64(bitblock128_t arg1)
710{
711        return simd_srai_128(arg1, 64);
712}
713//The total number of operations is 4.0
714static inline bitblock128_t esimd_signextendh_16(bitblock128_t arg1)
715{
716        return esimd_mergeh_32(simd_srai_32(arg1, 16), simd_srai_32(simd_slli_32(arg1, 16), 16));
717}
718//The total number of operations is 4.0
719static inline bitblock128_t simd_max_32(bitblock128_t arg1, bitblock128_t arg2)
720{
721        return simd_ifh_1(simd_gt_32(arg1, arg2), arg1, arg2);
722}
723//The total number of operations is 1.0
724static inline bitblock128_t simd_max_1(bitblock128_t arg1, bitblock128_t arg2)
725{
726        return simd_and(arg1, arg2);
727}
728//The total number of operations is 15.6666666667
729static inline bitblock128_t simd_max_2(bitblock128_t arg1, bitblock128_t arg2)
730{
731        return simd_ifh_1(simd_himask_2(), simd_and(arg1, arg2), simd_or(simd_and(arg2, simd_srli_128(simd_or(arg1, simd_not(arg2)), 1)), simd_and(arg1, simd_srli_128(simd_or(simd_not(arg1), arg2), 1))));
732}
733//The total number of operations is 9.0
734static inline bitblock128_t simd_max_4(bitblock128_t arg1, bitblock128_t arg2)
735{
736        bitblock128_t high_bit = simd_constant_4((8));
737        return simd_xor(simd_umax_4(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
738}
739//The total number of operations is 4.0
740static inline bitblock128_t simd_max_8(bitblock128_t arg1, bitblock128_t arg2)
741{
742        return simd_ifh_1(simd_gt_8(arg1, arg2), arg1, arg2);
743}
744//The total number of operations is 17.5
745static inline bitblock128_t simd_max_64(bitblock128_t arg1, bitblock128_t arg2)
746{
747        return simd_ifh_1(simd_gt_64(arg1, arg2), arg1, arg2);
748}
749//The total number of operations is 54.75
750static inline bitblock128_t simd_max_128(bitblock128_t arg1, bitblock128_t arg2)
751{
752        return simd_ifh_1(simd_gt_128(arg1, arg2), arg1, arg2);
753}
754//The total number of operations is 1.0
755static inline bitblock128_t simd_max_16(bitblock128_t arg1, bitblock128_t arg2)
756{
757        return _mm_max_epi16(arg1, arg2);
758}
759//The total number of operations is 1.0
760static inline bitblock128_t esimd_mergeh_32(bitblock128_t arg1, bitblock128_t arg2)
761{
762        return _mm_unpackhi_epi32(arg2, arg1);
763}
764//The total number of operations is 31.0
765static inline bitblock128_t esimd_mergeh_1(bitblock128_t arg1, bitblock128_t arg2)
766{
767        return esimd_mergeh_2(simd_ifh_1(simd_himask_2(), arg1, simd_srli_2(arg2, 1)), simd_ifh_1(simd_himask_2(), simd_slli_2(arg1, 1), arg2));
768}
769//The total number of operations is 21.0
770static inline bitblock128_t esimd_mergeh_2(bitblock128_t arg1, bitblock128_t arg2)
771{
772        return esimd_mergeh_4(simd_ifh_1(simd_himask_4(), arg1, simd_srli_4(arg2, 2)), simd_ifh_1(simd_himask_4(), simd_slli_4(arg1, 2), arg2));
773}
774//The total number of operations is 11.0
775static inline bitblock128_t esimd_mergeh_4(bitblock128_t arg1, bitblock128_t arg2)
776{
777        return esimd_mergeh_8(simd_ifh_1(simd_himask_8(), arg1, simd_srli_8(arg2, 4)), simd_ifh_1(simd_himask_8(), simd_slli_8(arg1, 4), arg2));
778}
779//The total number of operations is 1.0
780static inline bitblock128_t esimd_mergeh_8(bitblock128_t arg1, bitblock128_t arg2)
781{
782        return _mm_unpackhi_epi8(arg2, arg1);
783}
784//The total number of operations is 1.0
785static inline bitblock128_t esimd_mergeh_64(bitblock128_t arg1, bitblock128_t arg2)
786{
787        return _mm_unpackhi_epi64(arg2, arg1);
788}
789//The total number of operations is 1.0
790static inline bitblock128_t esimd_mergeh_16(bitblock128_t arg1, bitblock128_t arg2)
791{
792        return _mm_unpackhi_epi16(arg2, arg1);
793}
794//The total number of operations is 30.0
795static inline bitblock128_t simd_mult_32(bitblock128_t arg1, bitblock128_t arg2)
796{
797        bitblock128_t loMask = simd_lomask_64();
798        bitblock128_t tmpAns1 = simd_mult_64(simd_and(loMask, arg1), simd_and(loMask, arg2));
799        bitblock128_t tmpAns2 = simd_mult_64(simd_srli_64(arg1, 32), simd_srli_64(arg2, 32));
800        return simd_ifh_1(loMask, tmpAns1, simd_slli_64(tmpAns2, 32));
801}
802//The total number of operations is 1.0
803static inline bitblock128_t simd_mult_1(bitblock128_t arg1, bitblock128_t arg2)
804{
805        return simd_and(arg1, arg2);
806}
807//The total number of operations is 19.6666666667
808static inline bitblock128_t simd_mult_2(bitblock128_t arg1, bitblock128_t arg2)
809{
810        bitblock128_t tmp1 = simd_slli_128(arg1, 1);
811        bitblock128_t tmp2 = simd_slli_128(arg2, 1);
812        return simd_ifh_1(simd_himask_2(), simd_or(simd_and(tmp1, simd_and(arg2, simd_or(simd_not(arg1), simd_not(tmp2)))), simd_and(arg1, simd_and(tmp2, simd_or(simd_not(tmp1), simd_not(arg2))))), simd_and(arg1, arg2));
813}
814//The total number of operations is 31.0
815static inline bitblock128_t simd_mult_4(bitblock128_t arg1, bitblock128_t arg2)
816{
817        bitblock128_t loMask = simd_lomask_8();
818        bitblock128_t tmpAns1 = simd_mult_8(simd_and(loMask, arg1), simd_and(loMask, arg2));
819        bitblock128_t tmpAns2 = simd_mult_8(simd_srli_8(arg1, 4), simd_srli_8(arg2, 4));
820        return simd_ifh_1(loMask, tmpAns1, simd_slli_8(tmpAns2, 4));
821}
822//The total number of operations is 10.0
823static inline bitblock128_t simd_mult_8(bitblock128_t arg1, bitblock128_t arg2)
824{
825        bitblock128_t loMask = simd_lomask_16();
826        bitblock128_t tmpAns1 = simd_mult_16(simd_and(loMask, arg1), simd_and(loMask, arg2));
827        bitblock128_t tmpAns2 = simd_mult_16(simd_srli_16(arg1, 8), simd_srli_16(arg2, 8));
828        return simd_ifh_1(loMask, tmpAns1, simd_slli_16(tmpAns2, 8));
829}
830//The total number of operations is 11.0
831static inline bitblock128_t simd_mult_64(bitblock128_t arg1, bitblock128_t arg2)
832{
833        bitblock128_t loMask = simd_lomask_64();
834        bitblock128_t arg1_low = simd_and(arg1, loMask);
835        bitblock128_t arg1_high = simd_srli_64(arg1, (32));
836        bitblock128_t arg2_low = simd_and(arg2, loMask);
837        bitblock128_t arg2_high = simd_srli_64(arg2, (32));
838        bitblock128_t tmpAns1 = simd_umult_32(arg1_low, arg2_low);
839        bitblock128_t tmpAns2 = simd_slli_64(simd_umult_32(arg1_low, arg2_high), (32));
840        bitblock128_t tmpAns3 = simd_slli_64(simd_umult_32(arg1_high, arg2_low), (32));
841        return simd_add_64(tmpAns1, simd_add_64(tmpAns2, tmpAns3));
842}
843//The total number of operations is 165.0
844static inline bitblock128_t simd_mult_128(bitblock128_t arg1, bitblock128_t arg2)
845{
846        bitblock128_t loMask = simd_lomask_128();
847        bitblock128_t arg1_low = simd_and(arg1, loMask);
848        bitblock128_t arg1_high = simd_srli_128(arg1, (64));
849        bitblock128_t arg2_low = simd_and(arg2, loMask);
850        bitblock128_t arg2_high = simd_srli_128(arg2, (64));
851        bitblock128_t tmpAns1 = simd_umult_64(arg1_low, arg2_low);
852        bitblock128_t tmpAns2 = simd_slli_128(simd_umult_64(arg1_low, arg2_high), (64));
853        bitblock128_t tmpAns3 = simd_slli_128(simd_umult_64(arg1_high, arg2_low), (64));
854        return simd_add_128(tmpAns1, simd_add_128(tmpAns2, tmpAns3));
855}
856//The total number of operations is 1.0
857static inline bitblock128_t simd_mult_16(bitblock128_t arg1, bitblock128_t arg2)
858{
859        return _mm_mullo_epi16(arg1, arg2);
860}
861//The total number of operations is 10.0
862static inline bitblock128_t hsimd_umin_hl_32(bitblock128_t arg1, bitblock128_t arg2)
863{
864        return simd_umin_16(hsimd_packh_32(arg1, arg2), hsimd_packl_32(arg1, arg2));
865}
866//The total number of operations is 73.0
867static inline bitblock128_t hsimd_umin_hl_2(bitblock128_t arg1, bitblock128_t arg2)
868{
869        return simd_umin_1(hsimd_packh_2(arg1, arg2), hsimd_packl_2(arg1, arg2));
870}
871//The total number of operations is 66.6666666667
872static inline bitblock128_t hsimd_umin_hl_4(bitblock128_t arg1, bitblock128_t arg2)
873{
874        return simd_umin_2(hsimd_packh_4(arg1, arg2), hsimd_packl_4(arg1, arg2));
875}
876//The total number of operations is 35.3333333333
877static inline bitblock128_t hsimd_umin_hl_8(bitblock128_t arg1, bitblock128_t arg2)
878{
879        return simd_umin_4(hsimd_packh_8(arg1, arg2), hsimd_packl_8(arg1, arg2));
880}
881//The total number of operations is 13.0
882static inline bitblock128_t hsimd_umin_hl_64(bitblock128_t arg1, bitblock128_t arg2)
883{
884        return simd_umin_32(hsimd_packh_64(arg1, arg2), hsimd_packl_64(arg1, arg2));
885}
886//The total number of operations is 30.6666666667
887static inline bitblock128_t hsimd_umin_hl_128(bitblock128_t arg1, bitblock128_t arg2)
888{
889        return simd_umin_64(hsimd_packh_128(arg1, arg2), hsimd_packl_128(arg1, arg2));
890}
891//The total number of operations is 7.0
892static inline bitblock128_t hsimd_umin_hl_16(bitblock128_t arg1, bitblock128_t arg2)
893{
894        return simd_umin_8(hsimd_packh_16(arg1, arg2), hsimd_packl_16(arg1, arg2));
895}
896//The total number of operations is 2.0
897static inline bitblock128_t simd_nor(bitblock128_t arg1, bitblock128_t arg2)
898{
899        return simd_not(simd_or(arg1, arg2));
900}
901//The total number of operations is 1.0
902static inline bitblock128_t simd_gt_32(bitblock128_t arg1, bitblock128_t arg2)
903{
904        return _mm_cmpgt_epi32(arg1, arg2);
905}
906//The total number of operations is 1.0
907static inline bitblock128_t simd_gt_1(bitblock128_t arg1, bitblock128_t arg2)
908{
909        return simd_andc(arg2, arg1);
910}
911//The total number of operations is 14.6666666667
912static inline bitblock128_t simd_gt_2(bitblock128_t arg1, bitblock128_t arg2)
913{
914        bitblock128_t tmp = simd_not(arg1);
915        bitblock128_t tmpAns = simd_or(simd_and(tmp, arg2), simd_and(simd_slli_128(simd_and(arg1, simd_not(arg2)), 1), simd_or(tmp, arg2)));
916        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(tmpAns, 1));
917}
918//The total number of operations is 10.0
919static inline bitblock128_t simd_gt_4(bitblock128_t arg1, bitblock128_t arg2)
920{
921        return simd_ifh_1(simd_himask_8(), simd_gt_8(simd_and(simd_himask_8(), arg1), arg2), simd_gt_8(simd_slli_8(arg1, 4), simd_slli_8(arg2, 4)));
922}
923//The total number of operations is 1.0
924static inline bitblock128_t simd_gt_8(bitblock128_t arg1, bitblock128_t arg2)
925{
926        return _mm_cmpgt_epi8(arg1, arg2);
927}
928//The total number of operations is 14.5
929static inline bitblock128_t simd_gt_64(bitblock128_t arg1, bitblock128_t arg2)
930{
931        bitblock128_t hiAns = simd_gt_32(arg1, arg2);
932        bitblock128_t loAns = simd_ugt_32(arg1, arg2);
933        bitblock128_t mask = simd_and(loAns, simd_srli_64(simd_eq_32(arg1, arg2), (32)));
934        mask = simd_or(mask, simd_slli_64(mask, (32)));
935        return simd_or(simd_srai_64(hiAns, (32)), mask);
936}
937//The total number of operations is 51.75
938static inline bitblock128_t simd_gt_128(bitblock128_t arg1, bitblock128_t arg2)
939{
940        bitblock128_t hiAns = simd_gt_64(arg1, arg2);
941        bitblock128_t loAns = simd_ugt_64(arg1, arg2);
942        bitblock128_t mask = simd_and(loAns, simd_srli_128(simd_eq_64(arg1, arg2), (64)));
943        mask = simd_or(mask, simd_slli_128(mask, (64)));
944        return simd_or(simd_srai_128(hiAns, (64)), mask);
945}
946//The total number of operations is 1.0
947static inline bitblock128_t simd_gt_16(bitblock128_t arg1, bitblock128_t arg2)
948{
949        return _mm_cmpgt_epi16(arg1, arg2);
950}
951//The total number of operations is 1.0
952static inline bitblock128_t simd_not(bitblock128_t arg1)
953{
954        return simd_xor(arg1, simd_constant_32(-1));
955}
956//The total number of operations is 13.0
957static inline bitblock128_t bitblock_sll(bitblock128_t arg1, bitblock128_t arg2)
958{
959        return simd_sll_128(arg1, arg2);
960}
961//The total number of operations is 1.0
962static inline bitblock128_t simd_umult_32(bitblock128_t arg1, bitblock128_t arg2)
963{
964        return _mm_mul_epu32(arg1, arg2);
965}
966//The total number of operations is 289.0
967static inline bitblock128_t simd_umult_1(bitblock128_t arg1, bitblock128_t arg2)
968{
969        bitblock128_t loMask = simd_lomask_2();
970        bitblock128_t tmpAns1 = simd_umult_2(simd_and(loMask, arg1), simd_and(loMask, arg2));
971        bitblock128_t tmpAns2 = simd_umult_2(simd_and(loMask, simd_srli_4(arg1, (2))), simd_and(loMask, simd_srli_4(arg2, (2))));
972        return simd_or(tmpAns1, simd_slli_4(tmpAns2, (2)));
973}
974//The total number of operations is 139.0
975static inline bitblock128_t simd_umult_2(bitblock128_t arg1, bitblock128_t arg2)
976{
977        bitblock128_t loMask = simd_lomask_4();
978        bitblock128_t tmpAns1 = simd_umult_4(simd_and(loMask, arg1), simd_and(loMask, arg2));
979        bitblock128_t tmpAns2 = simd_umult_4(simd_and(loMask, simd_srli_8(arg1, (4))), simd_and(loMask, simd_srli_8(arg2, (4))));
980        return simd_or(tmpAns1, simd_slli_8(tmpAns2, (4)));
981}
982//The total number of operations is 64.0
983static inline bitblock128_t simd_umult_4(bitblock128_t arg1, bitblock128_t arg2)
984{
985        bitblock128_t loMask = simd_lomask_8();
986        bitblock128_t tmpAns1 = simd_umult_8(simd_and(loMask, arg1), simd_and(loMask, arg2));
987        bitblock128_t tmpAns2 = simd_umult_8(simd_and(loMask, simd_srli_16(arg1, (8))), simd_and(loMask, simd_srli_16(arg2, (8))));
988        return simd_or(tmpAns1, simd_slli_16(tmpAns2, (8)));
989}
990//The total number of operations is 28.0
991static inline bitblock128_t simd_umult_8(bitblock128_t arg1, bitblock128_t arg2)
992{
993        bitblock128_t loMask = simd_lomask_16();
994        bitblock128_t tmpAns1 = simd_umult_16(simd_and(loMask, arg1), simd_and(loMask, arg2));
995        bitblock128_t tmpAns2 = simd_umult_16(simd_and(loMask, simd_srli_32(arg1, (16))), simd_and(loMask, simd_srli_32(arg2, (16))));
996        return simd_or(tmpAns1, simd_slli_32(tmpAns2, (16)));
997}
998//The total number of operations is 45.0
999static inline bitblock128_t simd_umult_64(bitblock128_t arg1, bitblock128_t arg2)
1000{
1001        bitblock128_t loMask1 = simd_lomask_128();
1002        bitblock128_t arg11 = simd_and(arg1, loMask1);
1003        bitblock128_t arg22 = simd_and(arg2, loMask1);
1004        bitblock128_t loMask2 = simd_lomask_64();
1005        bitblock128_t arg1_low = simd_and(arg11, loMask2);
1006        bitblock128_t arg1_high = simd_srli_64(arg11, (32));
1007        bitblock128_t arg2_low = simd_and(arg22, loMask2);
1008        bitblock128_t arg2_high = simd_srli_64(arg22, (32));
1009        bitblock128_t tmpAns1 = simd_umult_32(arg1_low, arg2_low);
1010        bitblock128_t tmpAns2 = simd_slli_128(simd_umult_32(arg1_low, arg2_high), (32));
1011        bitblock128_t tmpAns3 = simd_slli_128(simd_umult_32(arg1_high, arg2_low), (32));
1012        bitblock128_t tmpAns4 = simd_slli_128(simd_umult_32(arg1_high, arg2_high), 64);
1013        return simd_add_128(tmpAns1, simd_add_128(tmpAns2, simd_add_128(tmpAns3, tmpAns4)));
1014}
1015//The total number of operations is 10.0
1016static inline bitblock128_t simd_umult_16(bitblock128_t arg1, bitblock128_t arg2)
1017{
1018        bitblock128_t loMask = simd_lomask_32();
1019        bitblock128_t tmpAns1 = simd_umult_32(simd_and(loMask, arg1), simd_and(loMask, arg2));
1020        bitblock128_t tmpAns2 = simd_umult_32(simd_and(loMask, simd_srli_64(arg1, (32))), simd_and(loMask, simd_srli_64(arg2, (32))));
1021        return simd_or(tmpAns1, simd_slli_64(tmpAns2, (32)));
1022}
1023//The total number of operations is 1.0
1024static inline bitblock128_t hsimd_add_hl_32(bitblock128_t arg1, bitblock128_t arg2)
1025{
1026        return _mm_hadd_epi16(arg2, arg1);
1027}
1028//The total number of operations is 73.0
1029static inline bitblock128_t hsimd_add_hl_2(bitblock128_t arg1, bitblock128_t arg2)
1030{
1031        return simd_add_1(hsimd_packh_2(arg1, arg2), hsimd_packl_2(arg1, arg2));
1032}
1033//The total number of operations is 59.0
1034static inline bitblock128_t hsimd_add_hl_4(bitblock128_t arg1, bitblock128_t arg2)
1035{
1036        return simd_add_2(hsimd_packh_4(arg1, arg2), hsimd_packl_4(arg1, arg2));
1037}
1038//The total number of operations is 35.3333333333
1039static inline bitblock128_t hsimd_add_hl_8(bitblock128_t arg1, bitblock128_t arg2)
1040{
1041        return simd_add_4(hsimd_packh_8(arg1, arg2), hsimd_packl_8(arg1, arg2));
1042}
1043//The total number of operations is 1.0
1044static inline bitblock128_t hsimd_add_hl_64(bitblock128_t arg1, bitblock128_t arg2)
1045{
1046        return _mm_hadd_epi32(arg2, arg1);
1047}
1048//The total number of operations is 11.6666666667
1049static inline bitblock128_t hsimd_add_hl_128(bitblock128_t arg1, bitblock128_t arg2)
1050{
1051        return simd_add_64(hsimd_packh_128(arg1, arg2), hsimd_packl_128(arg1, arg2));
1052}
1053//The total number of operations is 7.0
1054static inline bitblock128_t hsimd_add_hl_16(bitblock128_t arg1, bitblock128_t arg2)
1055{
1056        return simd_add_8(hsimd_packh_16(arg1, arg2), hsimd_packl_16(arg1, arg2));
1057}
1058//The total number of operations is 7.0
1059static inline bitblock128_t simd_ult_32(bitblock128_t arg1, bitblock128_t arg2)
1060{
1061        bitblock128_t high_bit = simd_constant_32((2147483648ULL));
1062        return simd_lt_32(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1063}
1064//The total number of operations is 1.0
1065static inline bitblock128_t simd_ult_1(bitblock128_t arg1, bitblock128_t arg2)
1066{
1067        return simd_andc(arg2, arg1);
1068}
1069//The total number of operations is 13.6666666667
1070static inline bitblock128_t simd_ult_2(bitblock128_t arg1, bitblock128_t arg2)
1071{
1072        bitblock128_t tmp = simd_not(arg1);
1073        bitblock128_t tmpAns = simd_or(simd_and(tmp, arg2), simd_and(simd_slli_128(simd_and(tmp, arg2), 1), simd_or(tmp, arg2)));
1074        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(tmpAns, 1));
1075}
1076//The total number of operations is 20.0
1077static inline bitblock128_t simd_ult_4(bitblock128_t arg1, bitblock128_t arg2)
1078{
1079        return simd_ifh_1(simd_himask_8(), simd_ult_8(arg1, simd_and(simd_himask_8(), arg2)), simd_ult_8(simd_andc(arg1, simd_himask_8()), simd_andc(arg2, simd_himask_8())));
1080}
1081//The total number of operations is 7.0
1082static inline bitblock128_t simd_ult_8(bitblock128_t arg1, bitblock128_t arg2)
1083{
1084        bitblock128_t high_bit = simd_constant_8((128));
1085        return simd_lt_8(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1086}
1087//The total number of operations is 17.5
1088static inline bitblock128_t simd_ult_64(bitblock128_t arg1, bitblock128_t arg2)
1089{
1090        bitblock128_t tmpAns = simd_ult_32(arg1, arg2);
1091        bitblock128_t mask = simd_and(tmpAns, simd_srli_64(simd_eq_32(arg1, arg2), (32)));
1092        mask = simd_or(mask, simd_slli_64(mask, (32)));
1093        return simd_or(simd_srai_64(tmpAns, (32)), mask);
1094}
1095//The total number of operations is 40.0833333333
1096static inline bitblock128_t simd_ult_128(bitblock128_t arg1, bitblock128_t arg2)
1097{
1098        return simd_and(simd_srai_128(simd_or(simd_and(simd_not(arg1), arg2), simd_and(simd_not(simd_xor(arg1, arg2)), simd_sub_128(arg1, arg2))), (127)), simd_not(simd_eq_128(arg1, arg2)));
1099}
1100//The total number of operations is 7.0
1101static inline bitblock128_t simd_ult_16(bitblock128_t arg1, bitblock128_t arg2)
1102{
1103        bitblock128_t high_bit = simd_constant_16((32768));
1104        return simd_lt_16(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
1105}
1106//The total number of operations is 1.0
1107static inline bitblock128_t bitblock_load_unaligned(const bitblock128_t* arg1)
1108{
1109        return _mm_loadu_si128((bitblock128_t*)(arg1));
1110}
1111//The total number of operations is 19.0
1112static inline bitblock128_t simd_ctz_32(bitblock128_t arg1)
1113{
1114        return simd_popcount_32(simd_andc(simd_sub_32(arg1, simd_constant_32(1)), arg1));
1115}
1116//The total number of operations is 1.0
1117static inline bitblock128_t simd_ctz_1(bitblock128_t arg1)
1118{
1119        return simd_not(arg1);
1120}
1121//The total number of operations is 10.6666666667
1122static inline bitblock128_t simd_ctz_2(bitblock128_t arg1)
1123{
1124        bitblock128_t tmp = simd_not(arg1);
1125        return simd_ifh_1(simd_himask_2(), simd_and(tmp, simd_slli_128(tmp, 1)), simd_and(simd_srli_128(arg1, 1), tmp));
1126}
1127//The total number of operations is 14.0
1128static inline bitblock128_t simd_ctz_4(bitblock128_t arg1)
1129{
1130        return simd_popcount_4(simd_andc(simd_sub_4(arg1, simd_constant_4(1)), arg1));
1131}
1132//The total number of operations is 13.0
1133static inline bitblock128_t simd_ctz_8(bitblock128_t arg1)
1134{
1135        return simd_popcount_8(simd_andc(simd_sub_8(arg1, simd_constant_8(1)), arg1));
1136}
1137//The total number of operations is 14.0
1138static inline bitblock128_t simd_ctz_64(bitblock128_t arg1)
1139{
1140        return simd_popcount_64(simd_andc(simd_sub_64(arg1, simd_constant_64(1)), arg1));
1141}
1142//The total number of operations is 26.6666666667
1143static inline bitblock128_t simd_ctz_128(bitblock128_t arg1)
1144{
1145        return simd_popcount_128(simd_andc(simd_sub_128(arg1, simd_constant_128(1)), arg1));
1146}
1147//The total number of operations is 16.0
1148static inline bitblock128_t simd_ctz_16(bitblock128_t arg1)
1149{
1150        return simd_popcount_16(simd_andc(simd_sub_16(arg1, simd_constant_16(1)), arg1));
1151}
1152//The total number of operations is 10.0
1153static inline bitblock128_t simd_sll_64(bitblock128_t arg1, bitblock128_t shift_mask)
1154{
1155        return simd_ifh_1(simd_himask_128(), _mm_sll_epi64(arg1, simd_and(_mm_srli_si128(shift_mask, (int32_t)(8)), _mm_cvtsi32_si128((int32_t)(63)))), _mm_sll_epi64(arg1, simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(63)))));
1156}
1157//The total number of operations is 13.0
1158static inline bitblock128_t simd_sll_128(bitblock128_t arg1, bitblock128_t shift_mask)
1159{
1160        bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(127)));
1161        return simd_or(_mm_sll_epi64(arg1, shift), simd_or(_mm_slli_si128(_mm_sll_epi64(arg1, simd_sub_32(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_slli_si128(_mm_srl_epi64(arg1, simd_sub_32(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
1162}
1163//The total number of operations is 1.0
1164static inline bitblock128_t mvmd_fill_32(uint64_t val1)
1165{
1166        return _mm_set1_epi32((int32_t)(val1));
1167}
1168//The total number of operations is 1.0
1169static inline bitblock128_t mvmd_fill_1(uint64_t val1)
1170{
1171        return mvmd_fill_32((-1*val1));
1172}
1173//The total number of operations is 1.0
1174static inline bitblock128_t mvmd_fill_2(uint64_t val1)
1175{
1176        return mvmd_fill_4(((val1<<2)|val1));
1177}
1178//The total number of operations is 1.0
1179static inline bitblock128_t mvmd_fill_4(uint64_t val1)
1180{
1181        return mvmd_fill_8(((val1<<4)|val1));
1182}
1183//The total number of operations is 1.0
1184static inline bitblock128_t mvmd_fill_8(uint64_t val1)
1185{
1186        return _mm_set1_epi8((int32_t)(val1));
1187}
1188//The total number of operations is 1.0
1189static inline bitblock128_t mvmd_fill_64(uint64_t val1)
1190{
1191        return _mm_set_epi32((int32_t)((val1>>32)), (int32_t)(val1), (int32_t)((val1>>32)), (int32_t)(val1));
1192}
1193//The total number of operations is 1.0
1194static inline bitblock128_t mvmd_fill_128(uint64_t val1)
1195{
1196        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)((val1>>32)), (int32_t)(val1));
1197}
1198//The total number of operations is 1.0
1199static inline bitblock128_t mvmd_fill_16(uint64_t val1)
1200{
1201        return _mm_set1_epi16((int32_t)(val1));
1202}
1203//The total number of operations is 19.0
1204static inline bitblock128_t mvmd_shuffle_32(bitblock128_t arg1, bitblock128_t arg2)
1205{
1206        bitblock128_t tmp1 = simd_and(simd_constant_32((3)), arg2);
1207        bitblock128_t msk1 = simd_add_32(tmp1, tmp1);
1208        bitblock128_t msk2 = simd_add_32(msk1, simd_constant_32(1));
1209        bitblock128_t msk = simd_or(msk1, simd_slli_32(msk2, (16)));
1210        return simd_ifh_32(arg2, simd_constant_32(0), mvmd_shuffle_16(arg1, msk));
1211}
1212//The total number of operations is 1.0
1213static inline bitblock128_t mvmd_shuffle_8(bitblock128_t arg1, bitblock128_t arg2)
1214{
1215        return _mm_shuffle_epi8(arg1, arg2);
1216}
1217//The total number of operations is 32.0
1218static inline bitblock128_t mvmd_shuffle_64(bitblock128_t arg1, bitblock128_t arg2)
1219{
1220        bitblock128_t tmp1 = simd_and(simd_constant_64((1)), arg2);
1221        bitblock128_t msk1 = simd_add_64(tmp1, tmp1);
1222        bitblock128_t msk2 = simd_add_64(msk1, simd_constant_64(1));
1223        bitblock128_t msk = simd_or(msk1, simd_slli_64(msk2, (32)));
1224        return simd_ifh_64(arg2, simd_constant_64(0), mvmd_shuffle_32(arg1, msk));
1225}
1226//The total number of operations is 10.0
1227static inline bitblock128_t mvmd_shuffle_16(bitblock128_t arg1, bitblock128_t arg2)
1228{
1229        bitblock128_t tmp1 = simd_and(simd_constant_16((7)), arg2);
1230        bitblock128_t msk1 = simd_add_16(tmp1, tmp1);
1231        bitblock128_t msk2 = simd_add_16(msk1, simd_constant_16(1));
1232        bitblock128_t msk = simd_or(msk1, simd_slli_16(msk2, (8)));
1233        return simd_ifh_16(arg2, simd_constant_16(0), mvmd_shuffle_8(arg1, msk));
1234}
1235//The total number of operations is 1.0
1236static inline bitblock128_t hsimd_packss_32(bitblock128_t arg1, bitblock128_t arg2)
1237{
1238        return _mm_packs_epi32(arg2, arg1);
1239}
1240//The total number of operations is 108.666666667
1241static inline bitblock128_t hsimd_packss_2(bitblock128_t arg1, bitblock128_t arg2)
1242{
1243        bitblock128_t hiBound = simd_srli_2(simd_lomask_2(), 1);
1244        bitblock128_t loBound = simd_not(hiBound);
1245        return hsimd_packl_2(simd_ifh_1(simd_gt_2(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_2(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_2(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_2(arg2, loBound), arg2, loBound)));
1246}
1247//The total number of operations is 79.3333333333
1248static inline bitblock128_t hsimd_packss_4(bitblock128_t arg1, bitblock128_t arg2)
1249{
1250        bitblock128_t hiBound = simd_srli_4(simd_lomask_4(), 1);
1251        bitblock128_t loBound = simd_not(hiBound);
1252        return hsimd_packl_4(simd_ifh_1(simd_gt_4(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_4(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_4(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_4(arg2, loBound), arg2, loBound)));
1253}
1254//The total number of operations is 32.6666666667
1255static inline bitblock128_t hsimd_packss_8(bitblock128_t arg1, bitblock128_t arg2)
1256{
1257        bitblock128_t hiBound = simd_srli_8(simd_lomask_8(), 1);
1258        bitblock128_t loBound = simd_not(hiBound);
1259        return hsimd_packl_8(simd_ifh_1(simd_gt_8(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_8(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_8(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_8(arg2, loBound), arg2, loBound)));
1260}
1261//The total number of operations is 75.0
1262static inline bitblock128_t hsimd_packss_64(bitblock128_t arg1, bitblock128_t arg2)
1263{
1264        bitblock128_t hiBound = simd_srli_64(simd_lomask_64(), 1);
1265        bitblock128_t loBound = simd_not(hiBound);
1266        return hsimd_packl_64(simd_ifh_1(simd_gt_64(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_64(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_64(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_64(arg2, loBound), arg2, loBound)));
1267}
1268//The total number of operations is 227.666666667
1269static inline bitblock128_t hsimd_packss_128(bitblock128_t arg1, bitblock128_t arg2)
1270{
1271        bitblock128_t hiBound = simd_srli_128(simd_lomask_128(), 1);
1272        bitblock128_t loBound = simd_not(hiBound);
1273        return hsimd_packl_128(simd_ifh_1(simd_gt_128(arg1, hiBound), hiBound, simd_ifh_1(simd_gt_128(arg1, loBound), arg1, loBound)), simd_ifh_1(simd_gt_128(arg2, hiBound), hiBound, simd_ifh_1(simd_gt_128(arg2, loBound), arg2, loBound)));
1274}
1275//The total number of operations is 1.0
1276static inline bitblock128_t hsimd_packss_16(bitblock128_t arg1, bitblock128_t arg2)
1277{
1278        return _mm_packs_epi16(arg2, arg1);
1279}
1280//The total number of operations is 13.0
1281static inline bitblock128_t bitblock_srl(bitblock128_t arg1, bitblock128_t arg2)
1282{
1283        return simd_srl_128(arg1, arg2);
1284}
1285//The total number of operations is 1.0
1286static inline void bitblock_store_aligned(bitblock128_t arg1, bitblock128_t* arg2)
1287{
1288        _mm_store_si128((bitblock128_t*)(arg2), arg1);
1289}
1290//The total number of operations is 1.0
1291static inline bitblock128_t simd_eq_32(bitblock128_t arg1, bitblock128_t arg2)
1292{
1293        return _mm_cmpeq_epi32(arg1, arg2);
1294}
1295//The total number of operations is 2.0
1296static inline bitblock128_t simd_eq_1(bitblock128_t arg1, bitblock128_t arg2)
1297{
1298        return simd_not(simd_xor(arg1, arg2));
1299}
1300//The total number of operations is 8.0
1301static inline bitblock128_t simd_eq_2(bitblock128_t arg1, bitblock128_t arg2)
1302{
1303        bitblock128_t tmpAns = simd_eq_1(arg1, arg2);
1304        bitblock128_t loMask = simd_and(tmpAns, simd_srli_2(tmpAns, (1)));
1305        bitblock128_t hiMask = simd_slli_2(loMask, (1));
1306        return simd_or(loMask, hiMask);
1307}
1308//The total number of operations is 9.0
1309static inline bitblock128_t simd_eq_4(bitblock128_t arg1, bitblock128_t arg2)
1310{
1311        return simd_or(simd_and(simd_himask_8(), simd_eq_8(simd_and(simd_himask_8(), arg1), simd_and(simd_himask_8(), arg2))), simd_and(simd_lomask_8(), simd_eq_8(simd_and(simd_lomask_8(), arg1), simd_and(simd_lomask_8(), arg2))));
1312}
1313//The total number of operations is 1.0
1314static inline bitblock128_t simd_eq_8(bitblock128_t arg1, bitblock128_t arg2)
1315{
1316        return _mm_cmpeq_epi8(arg1, arg2);
1317}
1318//The total number of operations is 5.0
1319static inline bitblock128_t simd_eq_64(bitblock128_t arg1, bitblock128_t arg2)
1320{
1321        bitblock128_t tmpAns = simd_eq_32(arg1, arg2);
1322        bitblock128_t loMask = simd_and(tmpAns, simd_srli_64(tmpAns, (32)));
1323        bitblock128_t hiMask = simd_slli_64(loMask, (32));
1324        return simd_or(loMask, hiMask);
1325}
1326//The total number of operations is 11.6666666667
1327static inline bitblock128_t simd_eq_128(bitblock128_t arg1, bitblock128_t arg2)
1328{
1329        bitblock128_t tmpAns = simd_eq_64(arg1, arg2);
1330        bitblock128_t loMask = simd_and(tmpAns, simd_srli_128(tmpAns, (64)));
1331        bitblock128_t hiMask = simd_slli_128(loMask, (64));
1332        return simd_or(loMask, hiMask);
1333}
1334//The total number of operations is 1.0
1335static inline bitblock128_t simd_eq_16(bitblock128_t arg1, bitblock128_t arg2)
1336{
1337        return _mm_cmpeq_epi16(arg1, arg2);
1338}
1339//The total number of operations is 17.0
1340static inline bitblock128_t simd_popcount_32(bitblock128_t arg1)
1341{
1342        return simd_add_hl_32(simd_popcount_16(arg1));
1343}
1344//The total number of operations is 0
1345static inline bitblock128_t simd_popcount_1(bitblock128_t arg1)
1346{
1347        return arg1;
1348}
1349//The total number of operations is 3.0
1350static inline bitblock128_t simd_popcount_2(bitblock128_t arg1)
1351{
1352        return simd_add_hl_2(simd_popcount_1(arg1));
1353}
1354//The total number of operations is 7.0
1355static inline bitblock128_t simd_popcount_4(bitblock128_t arg1)
1356{
1357        return simd_add_hl_4(simd_popcount_2(arg1));
1358}
1359//The total number of operations is 11.0
1360static inline bitblock128_t simd_popcount_8(bitblock128_t arg1)
1361{
1362        return simd_add_hl_8(simd_popcount_4(arg1));
1363}
1364//The total number of operations is 12.0
1365static inline bitblock128_t simd_popcount_64(bitblock128_t arg1)
1366{
1367        return _mm_sad_epu8(simd_popcount_8(arg1), simd_constant_8(0));
1368}
1369//The total number of operations is 16.3333333333
1370static inline bitblock128_t simd_popcount_128(bitblock128_t arg1)
1371{
1372        bitblock128_t tmpAns = simd_popcount_64(arg1);
1373        return simd_add_64(simd_and(tmpAns, simd_lomask_128()), simd_srli_128(tmpAns, (64)));
1374}
1375//The total number of operations is 14.0
1376static inline bitblock128_t simd_popcount_16(bitblock128_t arg1)
1377{
1378        return simd_add_hl_16(simd_popcount_8(arg1));
1379}
1380//The total number of operations is 1.0
1381static inline bitblock128_t simd_andc(bitblock128_t arg1, bitblock128_t arg2)
1382{
1383        return _mm_andnot_si128(arg2, arg1);
1384}
1385//The total number of operations is 1.0
1386static inline bitblock128_t simd_neg_32(bitblock128_t arg1)
1387{
1388        return _mm_sign_epi32(arg1, simd_constant_32(-1));
1389}
1390//The total number of operations is 6.33333333333
1391static inline bitblock128_t simd_neg_2(bitblock128_t arg1)
1392{
1393        return simd_ifh_1(simd_himask_2(), simd_xor(arg1, simd_slli_128(arg1, 1)), arg1);
1394}
1395//The total number of operations is 6.0
1396static inline bitblock128_t simd_neg_4(bitblock128_t arg1)
1397{
1398        return simd_sub_4(simd_constant_4(0), arg1);
1399}
1400//The total number of operations is 1.0
1401static inline bitblock128_t simd_neg_8(bitblock128_t arg1)
1402{
1403        return simd_sub_8(simd_constant_8(0), arg1);
1404}
1405//The total number of operations is 1.0
1406static inline bitblock128_t simd_neg_64(bitblock128_t arg1)
1407{
1408        return simd_sub_64(simd_constant_64(0), arg1);
1409}
1410//The total number of operations is 9.33333333333
1411static inline bitblock128_t simd_neg_128(bitblock128_t arg1)
1412{
1413        return simd_sub_128(simd_constant_128(0), arg1);
1414}
1415//The total number of operations is 1.0
1416static inline bitblock128_t simd_neg_16(bitblock128_t arg1)
1417{
1418        return simd_sub_16(simd_constant_16(0), arg1);
1419}
1420//The total number of operations is 3.0
1421static inline bitblock128_t hsimd_packh_32(bitblock128_t arg1, bitblock128_t arg2)
1422{
1423        return _mm_hsub_epi16(simd_srli_32(arg2, (16)), simd_srli_32(arg1, (16)));
1424}
1425//The total number of operations is 37.0
1426static inline bitblock128_t hsimd_packh_2(bitblock128_t arg1, bitblock128_t arg2)
1427{
1428        return hsimd_packl_2(simd_srli_64(arg1, (1)), simd_srli_64(arg2, (1)));
1429}
1430//The total number of operations is 26.3333333333
1431static inline bitblock128_t hsimd_packh_4(bitblock128_t arg1, bitblock128_t arg2)
1432{
1433        return hsimd_packl_4(simd_srli_64(arg1, (2)), simd_srli_64(arg2, (2)));
1434}
1435//The total number of operations is 15.6666666667
1436static inline bitblock128_t hsimd_packh_8(bitblock128_t arg1, bitblock128_t arg2)
1437{
1438        return hsimd_packl_8(simd_srli_64(arg1, (4)), simd_srli_64(arg2, (4)));
1439}
1440//The total number of operations is 3.0
1441static inline bitblock128_t hsimd_packh_64(bitblock128_t arg1, bitblock128_t arg2)
1442{
1443        return _mm_hsub_epi32(simd_srli_64(arg2, (32)), simd_srli_64(arg1, (32)));
1444}
1445//The total number of operations is 5.33333333333
1446static inline bitblock128_t hsimd_packh_128(bitblock128_t arg1, bitblock128_t arg2)
1447{
1448        return simd_ifh_1(simd_himask_128(), arg1, simd_srli_128(arg2, (64)));
1449}
1450//The total number of operations is 3.0
1451static inline bitblock128_t hsimd_packh_16(bitblock128_t arg1, bitblock128_t arg2)
1452{
1453        return hsimd_packus_16(simd_srli_16(arg1, (8)), simd_srli_16(arg2, (8)));
1454}
1455//The total number of operations is 0
1456static inline bitblock128_t simd_himask_32()
1457{
1458        return simd_constant_32(-65536);
1459}
1460//The total number of operations is 0
1461static inline bitblock128_t simd_himask_2()
1462{
1463        return simd_constant_2((2));
1464}
1465//The total number of operations is 0
1466static inline bitblock128_t simd_himask_4()
1467{
1468        return simd_constant_4((12));
1469}
1470//The total number of operations is 0
1471static inline bitblock128_t simd_himask_8()
1472{
1473        return simd_constant_8((240));
1474}
1475//The total number of operations is 0
1476static inline bitblock128_t simd_himask_64()
1477{
1478        return _mm_set_epi32((int32_t)(-1), (int32_t)(0), (int32_t)(-1), (int32_t)(0));
1479}
1480//The total number of operations is 0
1481static inline bitblock128_t simd_himask_128()
1482{
1483        return _mm_set_epi32((int32_t)(-1), (int32_t)(-1), (int32_t)(0), (int32_t)(0));
1484}
1485//The total number of operations is 0
1486static inline bitblock128_t simd_himask_16()
1487{
1488        return simd_constant_16((65280));
1489}
1490//The total number of operations is 2.0
1491static inline bool bitblock_all(bitblock128_t arg1)
1492{
1493        return hsimd_signmask_8(simd_eq_8(arg1, simd_constant_8(-1))) == 65535;
1494}
1495//The total number of operations is 4.0
1496static inline bitblock128_t simd_ifh_32(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1497{
1498        return simd_ifh_1(simd_gt_32(simd_constant_32(0), arg1), arg2, arg3);
1499}
1500//The total number of operations is 3.0
1501static inline bitblock128_t simd_ifh_1(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1502{
1503        return simd_or(simd_and(arg2, arg1), simd_andc(arg3, arg1));
1504}
1505//The total number of operations is 8.0
1506static inline bitblock128_t simd_ifh_2(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1507{
1508        return simd_ifh_1(simd_ifh_1(simd_himask_2(), arg1, simd_srli_2(arg1, (1))), arg2, arg3);
1509}
1510//The total number of operations is 13.0
1511static inline bitblock128_t simd_ifh_4(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1512{
1513        return simd_ifh_1(simd_gt_4(simd_constant_4(0), arg1), arg2, arg3);
1514}
1515//The total number of operations is 4.0
1516static inline bitblock128_t simd_ifh_8(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1517{
1518        return simd_ifh_1(simd_gt_8(simd_constant_8(0), arg1), arg2, arg3);
1519}
1520//The total number of operations is 8.0
1521static inline bitblock128_t simd_ifh_64(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1522{
1523        return simd_ifh_32(simd_ifh_1(simd_himask_64(), arg1, simd_srli_64(arg1, (32))), arg2, arg3);
1524}
1525//The total number of operations is 13.3333333333
1526static inline bitblock128_t simd_ifh_128(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1527{
1528        return simd_ifh_64(simd_ifh_1(simd_himask_128(), arg1, simd_srli_128(arg1, (64))), arg2, arg3);
1529}
1530//The total number of operations is 4.0
1531static inline bitblock128_t simd_ifh_16(bitblock128_t arg1, bitblock128_t arg2, bitblock128_t arg3)
1532{
1533        return simd_ifh_1(simd_gt_16(simd_constant_16(0), arg1), arg2, arg3);
1534}
1535//The total number of operations is 1.0
1536static inline bitblock128_t simd_sub_32(bitblock128_t arg1, bitblock128_t arg2)
1537{
1538        return _mm_sub_epi32(arg1, arg2);
1539}
1540//The total number of operations is 1.0
1541static inline bitblock128_t simd_sub_1(bitblock128_t arg1, bitblock128_t arg2)
1542{
1543        return simd_xor(arg1, arg2);
1544}
1545//The total number of operations is 9.33333333333
1546static inline bitblock128_t simd_sub_2(bitblock128_t arg1, bitblock128_t arg2)
1547{
1548        bitblock128_t tmp = simd_xor(arg1, arg2);
1549        return simd_ifh_1(simd_himask_2(), simd_xor(tmp, simd_slli_128(simd_and(simd_not(arg1), arg2), 1)), tmp);
1550}
1551//The total number of operations is 6.0
1552static inline bitblock128_t simd_sub_4(bitblock128_t arg1, bitblock128_t arg2)
1553{
1554        return simd_ifh_1(simd_himask_8(), simd_sub_8(arg1, simd_and(simd_himask_8(), arg2)), simd_sub_8(arg1, arg2));
1555}
1556//The total number of operations is 1.0
1557static inline bitblock128_t simd_sub_8(bitblock128_t arg1, bitblock128_t arg2)
1558{
1559        return _mm_sub_epi8(arg1, arg2);
1560}
1561//The total number of operations is 1.0
1562static inline bitblock128_t simd_sub_64(bitblock128_t arg1, bitblock128_t arg2)
1563{
1564        return _mm_sub_epi64(arg1, arg2);
1565}
1566//The total number of operations is 9.33333333333
1567static inline bitblock128_t simd_sub_128(bitblock128_t arg1, bitblock128_t arg2)
1568{
1569        bitblock128_t partial = simd_sub_64(arg1, arg2);
1570        bitblock128_t borrowMask = simd_or(simd_andc(arg2, arg1), simd_andc(partial, simd_xor(arg1, arg2)));
1571        bitblock128_t borrow = simd_slli_128(simd_srli_64(borrowMask, (63)), (64));
1572        return simd_sub_64(partial, borrow);
1573}
1574//The total number of operations is 1.0
1575static inline bitblock128_t simd_sub_16(bitblock128_t arg1, bitblock128_t arg2)
1576{
1577        return _mm_sub_epi16(arg1, arg2);
1578}
1579//The total number of operations is 3.0
1580static inline bitblock128_t simd_add_hl_32(bitblock128_t arg1)
1581{
1582        return simd_add_64(simd_srli_32(arg1, (16)), simd_and(arg1, simd_lomask_32()));
1583}
1584//The total number of operations is 3.0
1585static inline bitblock128_t simd_add_hl_2(bitblock128_t arg1)
1586{
1587        return simd_sub_16(arg1, simd_and(simd_lomask_2(), simd_srli_16(arg1, 1)));
1588}
1589//The total number of operations is 4.0
1590static inline bitblock128_t simd_add_hl_4(bitblock128_t arg1)
1591{
1592        return simd_add_8(simd_srli_4(arg1, (2)), simd_and(arg1, simd_lomask_4()));
1593}
1594//The total number of operations is 4.0
1595static inline bitblock128_t simd_add_hl_8(bitblock128_t arg1)
1596{
1597        return simd_add_16(simd_srli_8(arg1, (4)), simd_and(arg1, simd_lomask_8()));
1598}
1599//The total number of operations is 3.0
1600static inline bitblock128_t simd_add_hl_64(bitblock128_t arg1)
1601{
1602        return simd_add_64(simd_srli_64(arg1, (32)), simd_and(arg1, simd_lomask_64()));
1603}
1604//The total number of operations is 12.6666666667
1605static inline bitblock128_t simd_add_hl_128(bitblock128_t arg1)
1606{
1607        return simd_add_128(simd_srli_128(arg1, (64)), simd_and(arg1, simd_lomask_128()));
1608}
1609//The total number of operations is 3.0
1610static inline bitblock128_t simd_add_hl_16(bitblock128_t arg1)
1611{
1612        return simd_add_32(simd_srli_16(arg1, (8)), simd_and(arg1, simd_lomask_16()));
1613}
1614//The total number of operations is 10.0
1615static inline bitblock128_t simd_srl_64(bitblock128_t arg1, bitblock128_t shift_mask)
1616{
1617        return simd_ifh_1(simd_himask_128(), _mm_srl_epi64(arg1, simd_and(_mm_srli_si128(shift_mask, (int32_t)(8)), _mm_cvtsi32_si128((int32_t)(63)))), _mm_srl_epi64(arg1, simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(63)))));
1618}
1619//The total number of operations is 13.0
1620static inline bitblock128_t simd_srl_128(bitblock128_t arg1, bitblock128_t shift_mask)
1621{
1622        bitblock128_t shift = simd_and(shift_mask, _mm_cvtsi32_si128((int32_t)(127)));
1623        return simd_or(_mm_srl_epi64(arg1, shift), simd_or(_mm_srli_si128(_mm_srl_epi64(arg1, simd_sub_32(shift, _mm_cvtsi32_si128((int32_t)(64)))), (int32_t)(8)), _mm_srli_si128(_mm_sll_epi64(arg1, simd_sub_32(_mm_cvtsi32_si128((int32_t)(64)), shift)), (int32_t)(8))));
1624}
1625//The total number of operations is 0
1626static inline bitblock128_t simd_lomask_32()
1627{
1628        return simd_constant_32((65535));
1629}
1630//The total number of operations is 0
1631static inline bitblock128_t simd_lomask_2()
1632{
1633        return simd_constant_2((1));
1634}
1635//The total number of operations is 0
1636static inline bitblock128_t simd_lomask_4()
1637{
1638        return simd_constant_4((3));
1639}
1640//The total number of operations is 0
1641static inline bitblock128_t simd_lomask_8()
1642{
1643        return simd_constant_8((15));
1644}
1645//The total number of operations is 0
1646static inline bitblock128_t simd_lomask_64()
1647{
1648        return _mm_set_epi32((int32_t)(0), (int32_t)(-1), (int32_t)(0), (int32_t)(-1));
1649}
1650//The total number of operations is 0
1651static inline bitblock128_t simd_lomask_128()
1652{
1653        return _mm_set_epi32((int32_t)(0), (int32_t)(0), (int32_t)(-1), (int32_t)(-1));
1654}
1655//The total number of operations is 0
1656static inline bitblock128_t simd_lomask_16()
1657{
1658        return simd_constant_16((255));
1659}
1660//The total number of operations is 3.0
1661static inline uint64_t hsimd_signmask_32(bitblock128_t arg1)
1662{
1663        return hsimd_signmask_16(hsimd_packss_32(simd_constant_32(0), arg1));
1664}
1665//The total number of operations is 24.0
1666static inline uint64_t hsimd_signmask_4(bitblock128_t arg1)
1667{
1668        uint64_t tmpAns1 = hsimd_signmask_8(esimd_mergeh_4(arg1, simd_constant_4(0)));
1669        uint64_t tmpAns2 = hsimd_signmask_8(esimd_mergel_4(arg1, simd_constant_4(0)));
1670        return ((tmpAns1<<(16))+tmpAns2);
1671}
1672//The total number of operations is 1.0
1673static inline uint64_t hsimd_signmask_8(bitblock128_t arg1)
1674{
1675        return _mm_movemask_epi8(arg1);
1676}
1677//The total number of operations is 1.0
1678static inline uint64_t hsimd_signmask_64(bitblock128_t arg1)
1679{
1680        return _mm_movemask_pd(_mm_castsi128_pd(arg1));
1681}
1682//The total number of operations is 6.33333333333
1683static inline uint64_t hsimd_signmask_128(bitblock128_t arg1)
1684{
1685        return hsimd_signmask_64(hsimd_packh_128(simd_constant_128(0), arg1));
1686}
1687//The total number of operations is 2.0
1688static inline uint64_t hsimd_signmask_16(bitblock128_t arg1)
1689{
1690        return hsimd_signmask_8(hsimd_packss_16(simd_constant_16(0), arg1));
1691}
1692//The total number of operations is 3.0
1693static inline bitblock128_t esimd_zeroextendh_32(bitblock128_t arg1)
1694{
1695        return esimd_mergeh_64(simd_srli_64(arg1, 32), simd_and(simd_lomask_64(), arg1));
1696}
1697//The total number of operations is 24.0
1698static inline bitblock128_t esimd_zeroextendh_1(bitblock128_t arg1)
1699{
1700        return esimd_mergeh_2(simd_srli_2(arg1, 1), simd_and(simd_lomask_2(), arg1));
1701}
1702//The total number of operations is 14.0
1703static inline bitblock128_t esimd_zeroextendh_2(bitblock128_t arg1)
1704{
1705        return esimd_mergeh_4(simd_srli_4(arg1, 2), simd_and(simd_lomask_4(), arg1));
1706}
1707//The total number of operations is 4.0
1708static inline bitblock128_t esimd_zeroextendh_4(bitblock128_t arg1)
1709{
1710        return esimd_mergeh_8(simd_srli_8(arg1, 4), simd_and(simd_lomask_8(), arg1));
1711}
1712//The total number of operations is 3.0
1713static inline bitblock128_t esimd_zeroextendh_8(bitblock128_t arg1)
1714{
1715        return esimd_mergeh_16(simd_srli_16(arg1, 8), simd_and(simd_lomask_16(), arg1));
1716}
1717//The total number of operations is 2.33333333333
1718static inline bitblock128_t esimd_zeroextendh_64(bitblock128_t arg1)
1719{
1720        return simd_srli_128(arg1, 64);
1721}
1722//The total number of operations is 3.0
1723static inline bitblock128_t esimd_zeroextendh_16(bitblock128_t arg1)
1724{
1725        return esimd_mergeh_32(simd_srli_32(arg1, 16), simd_and(simd_lomask_32(), arg1));
1726}
1727//The total number of operations is 3.0
1728static inline bitblock128_t esimd_zeroextendl_32(bitblock128_t arg1)
1729{
1730        return esimd_mergel_64(simd_srli_64(arg1, 32), simd_and(simd_lomask_64(), arg1));
1731}
1732//The total number of operations is 24.0
1733static inline bitblock128_t esimd_zeroextendl_1(bitblock128_t arg1)
1734{
1735        return esimd_mergel_2(simd_srli_2(arg1, 1), simd_and(simd_lomask_2(), arg1));
1736}
1737//The total number of operations is 14.0
1738static inline bitblock128_t esimd_zeroextendl_2(bitblock128_t arg1)
1739{
1740        return esimd_mergel_4(simd_srli_4(arg1, 2), simd_and(simd_lomask_4(), arg1));
1741}
1742//The total number of operations is 4.0
1743static inline bitblock128_t esimd_zeroextendl_4(bitblock128_t arg1)
1744{
1745        return esimd_mergel_8(simd_srli_8(arg1, 4), simd_and(simd_lomask_8(), arg1));
1746}
1747//The total number of operations is 3.0
1748static inline bitblock128_t esimd_zeroextendl_8(bitblock128_t arg1)
1749{
1750        return esimd_mergel_16(simd_srli_16(arg1, 8), simd_and(simd_lomask_16(), arg1));
1751}
1752//The total number of operations is 1.0
1753static inline bitblock128_t esimd_zeroextendl_64(bitblock128_t arg1)
1754{
1755        return simd_and(simd_lomask_128(), arg1);
1756}
1757//The total number of operations is 3.0
1758static inline bitblock128_t esimd_zeroextendl_16(bitblock128_t arg1)
1759{
1760        return esimd_mergel_32(simd_srli_32(arg1, 16), simd_and(simd_lomask_32(), arg1));
1761}
1762//The total number of operations is 1.0
1763static inline bitblock128_t mvmd_fill4_32(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1764{
1765        return _mm_set_epi32((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4));
1766}
1767//The total number of operations is 5.0
1768static inline bitblock128_t mvmd_fill4_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1769{
1770        return simd_ifh_1(simd_himask_4(), mvmd_fill2_1(val1, val2), mvmd_fill2_1(val3, val4));
1771}
1772//The total number of operations is 5.0
1773static inline bitblock128_t mvmd_fill4_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1774{
1775        return simd_ifh_1(simd_himask_8(), mvmd_fill2_2(val1, val2), mvmd_fill2_2(val3, val4));
1776}
1777//The total number of operations is 5.0
1778static inline bitblock128_t mvmd_fill4_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1779{
1780        return simd_ifh_1(simd_himask_16(), mvmd_fill2_4(val1, val2), mvmd_fill2_4(val3, val4));
1781}
1782//The total number of operations is 5.0
1783static inline bitblock128_t mvmd_fill4_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1784{
1785        return simd_ifh_1(simd_himask_32(), mvmd_fill2_8(val1, val2), mvmd_fill2_8(val3, val4));
1786}
1787//The total number of operations is 3.0
1788static inline bitblock128_t mvmd_fill4_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4)
1789{
1790        return simd_or(mvmd_fill4_32((val1<<16), (val3<<16), (val1<<16), (val3<<16)), mvmd_fill4_32((val2&(65535)), (val4&(65535)), (val2&(65535)), (val4&(65535))));
1791}
1792//The total number of operations is 7.0
1793static inline bitblock128_t simd_umin_32(bitblock128_t arg1, bitblock128_t arg2)
1794{
1795        bitblock128_t high_bit = simd_constant_32((2147483648ULL));
1796        return simd_xor(simd_min_32(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
1797}
1798//The total number of operations is 1.0
1799static inline bitblock128_t simd_umin_1(bitblock128_t arg1, bitblock128_t arg2)
1800{
1801        return simd_and(arg1, arg2);
1802}
1803//The total number of operations is 16.0
1804static inline bitblock128_t simd_umin_2(bitblock128_t arg1, bitblock128_t arg2)
1805{
1806        return simd_or(simd_and(simd_himask_4(), simd_umin_4(arg1, arg2)), simd_umin_4(simd_and(simd_lomask_4(), arg1), simd_and(simd_lomask_4(), arg2)));
1807}
1808//The total number of operations is 6.0
1809static inline bitblock128_t simd_umin_4(bitblock128_t arg1, bitblock128_t arg2)
1810{
1811        return simd_or(simd_and(simd_himask_8(), simd_umin_8(arg1, arg2)), simd_umin_8(simd_and(simd_lomask_8(), arg1), simd_and(simd_lomask_8(), arg2)));
1812}
1813//The total number of operations is 1.0
1814static inline bitblock128_t simd_umin_8(bitblock128_t arg1, bitblock128_t arg2)
1815{
1816        return _mm_min_epu8(arg1, arg2);
1817}
1818//The total number of operations is 20.0
1819static inline bitblock128_t simd_umin_64(bitblock128_t arg1, bitblock128_t arg2)
1820{
1821        bitblock128_t tmpAns = simd_umin_32(arg1, arg2);
1822        bitblock128_t eqMask1 = simd_srli_64(simd_eq_32(tmpAns, arg1), (32));
1823        bitblock128_t eqMask2 = simd_srli_64(simd_eq_32(tmpAns, arg2), (32));
1824        return simd_ifh_1(simd_himask_64(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
1825}
1826//The total number of operations is 43.6666666667
1827static inline bitblock128_t simd_umin_128(bitblock128_t arg1, bitblock128_t arg2)
1828{
1829        bitblock128_t tmpAns = simd_umin_64(arg1, arg2);
1830        bitblock128_t eqMask1 = simd_srli_128(simd_eq_64(tmpAns, arg1), (64));
1831        bitblock128_t eqMask2 = simd_srli_128(simd_eq_64(tmpAns, arg2), (64));
1832        return simd_ifh_1(simd_himask_128(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
1833}
1834//The total number of operations is 4.0
1835static inline bitblock128_t simd_umin_16(bitblock128_t arg1, bitblock128_t arg2)
1836{
1837        bitblock128_t high_bit = simd_constant_16((32768));
1838        return simd_xor(simd_min_16(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
1839}
1840//The total number of operations is 4.0
1841static inline bitblock128_t simd_min_32(bitblock128_t arg1, bitblock128_t arg2)
1842{
1843        return simd_ifh_1(simd_gt_32(arg1, arg2), arg2, arg1);
1844}
1845//The total number of operations is 1.0
1846static inline bitblock128_t simd_min_1(bitblock128_t arg1, bitblock128_t arg2)
1847{
1848        return simd_or(arg1, arg2);
1849}
1850//The total number of operations is 16.6666666667
1851static inline bitblock128_t simd_min_2(bitblock128_t arg1, bitblock128_t arg2)
1852{
1853        bitblock128_t tmp1 = simd_srli_128(arg1, 1);
1854        bitblock128_t tmp2 = simd_srli_128(arg2, 1);
1855        return simd_ifh_1(simd_himask_2(), simd_or(arg1, arg2), simd_or(simd_and(arg1, simd_and(tmp1, simd_not(tmp2))), simd_and(arg2, simd_or(simd_and(simd_not(tmp1), tmp2), arg1))));
1856}
1857//The total number of operations is 9.0
1858static inline bitblock128_t simd_min_4(bitblock128_t arg1, bitblock128_t arg2)
1859{
1860        bitblock128_t high_bit = simd_constant_4((8));
1861        return simd_xor(simd_umin_4(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
1862}
1863//The total number of operations is 4.0
1864static inline bitblock128_t simd_min_8(bitblock128_t arg1, bitblock128_t arg2)
1865{
1866        return simd_ifh_1(simd_gt_8(arg1, arg2), arg2, arg1);
1867}
1868//The total number of operations is 17.5
1869static inline bitblock128_t simd_min_64(bitblock128_t arg1, bitblock128_t arg2)
1870{
1871        return simd_ifh_1(simd_gt_64(arg1, arg2), arg2, arg1);
1872}
1873//The total number of operations is 54.75
1874static inline bitblock128_t simd_min_128(bitblock128_t arg1, bitblock128_t arg2)
1875{
1876        return simd_ifh_1(simd_gt_128(arg1, arg2), arg2, arg1);
1877}
1878//The total number of operations is 1.0
1879static inline bitblock128_t simd_min_16(bitblock128_t arg1, bitblock128_t arg2)
1880{
1881        return _mm_min_epi16(arg1, arg2);
1882}
1883//The total number of operations is 5.0
1884static inline bitblock128_t mvmd_fill2_32(uint64_t val1, uint64_t val2)
1885{
1886        return simd_ifh_1(simd_himask_64(), mvmd_fill_32(val1), mvmd_fill_32(val2));
1887}
1888//The total number of operations is 1.0
1889static inline bitblock128_t mvmd_fill2_1(uint64_t val1, uint64_t val2)
1890{
1891        return mvmd_fill_2(((val1<<1)|(val2&(1))));
1892}
1893//The total number of operations is 1.0
1894static inline bitblock128_t mvmd_fill2_2(uint64_t val1, uint64_t val2)
1895{
1896        return mvmd_fill_4(((val1<<2)|(val2&(3))));
1897}
1898//The total number of operations is 1.0
1899static inline bitblock128_t mvmd_fill2_4(uint64_t val1, uint64_t val2)
1900{
1901        return mvmd_fill_8(((val1<<4)|(val2&(15))));
1902}
1903//The total number of operations is 1.0
1904static inline bitblock128_t mvmd_fill2_8(uint64_t val1, uint64_t val2)
1905{
1906        return mvmd_fill_16(((val1<<8)|(val2&(255))));
1907}
1908//The total number of operations is 5.0
1909static inline bitblock128_t mvmd_fill2_64(uint64_t val1, uint64_t val2)
1910{
1911        return simd_ifh_1(simd_himask_128(), mvmd_fill_64(val1), mvmd_fill_64(val2));
1912}
1913//The total number of operations is 1.0
1914static inline bitblock128_t mvmd_fill2_16(uint64_t val1, uint64_t val2)
1915{
1916        return mvmd_fill_32(((val1<<16)|(val2&(65535))));
1917}
1918//The total number of operations is 2.0
1919static inline bool bitblock_any(bitblock128_t arg1)
1920{
1921        return hsimd_signmask_8(simd_eq_8(arg1, simd_constant_8(0))) != 65535;
1922}
1923//The total number of operations is 20.3333333333
1924static inline uint64_t bitblock_popcount(bitblock128_t arg1)
1925{
1926        return mvmd_extract_64(simd_popcount_128(arg1), 0);
1927}
1928//The total number of operations is 1.0
1929static inline bitblock128_t simd_or(bitblock128_t arg1, bitblock128_t arg2)
1930{
1931        return _mm_or_si128(arg1, arg2);
1932}
1933//The total number of operations is 3.0
1934static inline bitblock128_t hsimd_packl_32(bitblock128_t arg1, bitblock128_t arg2)
1935{
1936        return _mm_hsub_epi16(simd_and(arg2, simd_lomask_32()), simd_and(arg1, simd_lomask_32()));
1937}
1938//The total number of operations is 35.0
1939static inline bitblock128_t hsimd_packl_2(bitblock128_t arg1, bitblock128_t arg2)
1940{
1941        return hsimd_packl_4(simd_ifh_1(simd_himask_2(), simd_srli_128(arg1, (1)), arg1), simd_ifh_1(simd_himask_2(), simd_srli_128(arg2, (1)), arg2));
1942}
1943//The total number of operations is 24.3333333333
1944static inline bitblock128_t hsimd_packl_4(bitblock128_t arg1, bitblock128_t arg2)
1945{
1946        return hsimd_packl_8(simd_ifh_1(simd_himask_4(), simd_srli_128(arg1, (2)), arg1), simd_ifh_1(simd_himask_4(), simd_srli_128(arg2, (2)), arg2));
1947}
1948//The total number of operations is 13.6666666667
1949static inline bitblock128_t hsimd_packl_8(bitblock128_t arg1, bitblock128_t arg2)
1950{
1951        return hsimd_packl_16(simd_ifh_1(simd_himask_8(), simd_srli_128(arg1, (4)), arg1), simd_ifh_1(simd_himask_8(), simd_srli_128(arg2, (4)), arg2));
1952}
1953//The total number of operations is 3.0
1954static inline bitblock128_t hsimd_packl_64(bitblock128_t arg1, bitblock128_t arg2)
1955{
1956        return _mm_hsub_epi32(simd_and(arg2, simd_lomask_64()), simd_and(arg1, simd_lomask_64()));
1957}
1958//The total number of operations is 5.33333333333
1959static inline bitblock128_t hsimd_packl_128(bitblock128_t arg1, bitblock128_t arg2)
1960{
1961        return simd_ifh_1(simd_himask_128(), simd_slli_128(arg1, (64)), arg2);
1962}
1963//The total number of operations is 3.0
1964static inline bitblock128_t hsimd_packl_16(bitblock128_t arg1, bitblock128_t arg2)
1965{
1966        return hsimd_packus_16(simd_and(arg1, simd_lomask_16()), simd_and(arg2, simd_lomask_16()));
1967}
1968//The total number of operations is 13.0
1969static inline bitblock128_t mvmd_fill8_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1970{
1971        return simd_ifh_1(simd_himask_8(), mvmd_fill4_1(val1, val2, val3, val4), mvmd_fill4_1(val5, val6, val7, val8));
1972}
1973//The total number of operations is 13.0
1974static inline bitblock128_t mvmd_fill8_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1975{
1976        return simd_ifh_1(simd_himask_16(), mvmd_fill4_2(val1, val2, val3, val4), mvmd_fill4_2(val5, val6, val7, val8));
1977}
1978//The total number of operations is 7.0
1979static inline bitblock128_t mvmd_fill8_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1980{
1981        return simd_or(mvmd_fill8_8((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4)), mvmd_fill8_8((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15))));
1982}
1983//The total number of operations is 3.0
1984static inline bitblock128_t mvmd_fill8_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1985{
1986        return simd_or(mvmd_fill8_16((val1<<8), (val3<<8), (val5<<8), (val7<<8), (val1<<8), (val3<<8), (val5<<8), (val7<<8)), mvmd_fill8_16((val2&(255)), (val4&(255)), (val6&(255)), (val8&(255)), (val2&(255)), (val4&(255)), (val6&(255)), (val8&(255))));
1987}
1988//The total number of operations is 1.0
1989static inline bitblock128_t mvmd_fill8_16(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8)
1990{
1991        return _mm_set_epi16((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8));
1992}
1993//The total number of operations is 7.0
1994static inline bitblock128_t hsimd_min_hl_32(bitblock128_t arg1, bitblock128_t arg2)
1995{
1996        return simd_min_16(hsimd_packh_32(arg1, arg2), hsimd_packl_32(arg1, arg2));
1997}
1998//The total number of operations is 73.0
1999static inline bitblock128_t hsimd_min_hl_2(bitblock128_t arg1, bitblock128_t arg2)
2000{
2001        return simd_min_1(hsimd_packh_2(arg1, arg2), hsimd_packl_2(arg1, arg2));
2002}
2003//The total number of operations is 67.3333333333
2004static inline bitblock128_t hsimd_min_hl_4(bitblock128_t arg1, bitblock128_t arg2)
2005{
2006        return simd_min_2(hsimd_packh_4(arg1, arg2), hsimd_packl_4(arg1, arg2));
2007}
2008//The total number of operations is 38.3333333333
2009static inline bitblock128_t hsimd_min_hl_8(bitblock128_t arg1, bitblock128_t arg2)
2010{
2011        return simd_min_4(hsimd_packh_8(arg1, arg2), hsimd_packl_8(arg1, arg2));
2012}
2013//The total number of operations is 10.0
2014static inline bitblock128_t hsimd_min_hl_64(bitblock128_t arg1, bitblock128_t arg2)
2015{
2016        return simd_min_32(hsimd_packh_64(arg1, arg2), hsimd_packl_64(arg1, arg2));
2017}
2018//The total number of operations is 28.1666666667
2019static inline bitblock128_t hsimd_min_hl_128(bitblock128_t arg1, bitblock128_t arg2)
2020{
2021        return simd_min_64(hsimd_packh_128(arg1, arg2), hsimd_packl_128(arg1, arg2));
2022}
2023//The total number of operations is 10.0
2024static inline bitblock128_t hsimd_min_hl_16(bitblock128_t arg1, bitblock128_t arg2)
2025{
2026        return simd_min_8(hsimd_packh_16(arg1, arg2), hsimd_packl_16(arg1, arg2));
2027}
2028//The total number of operations is 1.0
2029static inline bitblock128_t simd_xor(bitblock128_t arg1, bitblock128_t arg2)
2030{
2031        return _mm_xor_si128(arg1, arg2);
2032}
2033//The total number of operations is 7.0
2034static inline bitblock128_t simd_umax_32(bitblock128_t arg1, bitblock128_t arg2)
2035{
2036        bitblock128_t high_bit = simd_constant_32((2147483648ULL));
2037        return simd_xor(simd_max_32(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
2038}
2039//The total number of operations is 1.0
2040static inline bitblock128_t simd_umax_1(bitblock128_t arg1, bitblock128_t arg2)
2041{
2042        return simd_or(arg1, arg2);
2043}
2044//The total number of operations is 15.6666666667
2045static inline bitblock128_t simd_umax_2(bitblock128_t arg1, bitblock128_t arg2)
2046{
2047        return simd_ifh_1(simd_himask_2(), simd_or(arg1, arg2), simd_or(simd_and(arg2, simd_srli_128(simd_or(simd_not(arg1), arg2), 1)), simd_and(arg1, simd_srli_128(simd_or(arg1, simd_not(arg2)), 1))));
2048}
2049//The total number of operations is 6.0
2050static inline bitblock128_t simd_umax_4(bitblock128_t arg1, bitblock128_t arg2)
2051{
2052        return simd_or(simd_and(simd_himask_8(), simd_umax_8(arg1, arg2)), simd_umax_8(simd_and(simd_lomask_8(), arg1), simd_and(simd_lomask_8(), arg2)));
2053}
2054//The total number of operations is 1.0
2055static inline bitblock128_t simd_umax_8(bitblock128_t arg1, bitblock128_t arg2)
2056{
2057        return _mm_max_epu8(arg1, arg2);
2058}
2059//The total number of operations is 20.0
2060static inline bitblock128_t simd_umax_64(bitblock128_t arg1, bitblock128_t arg2)
2061{
2062        bitblock128_t tmpAns = simd_umax_32(arg1, arg2);
2063        bitblock128_t eqMask1 = simd_srli_64(simd_eq_32(tmpAns, arg1), (32));
2064        bitblock128_t eqMask2 = simd_srli_64(simd_eq_32(tmpAns, arg2), (32));
2065        return simd_ifh_1(simd_himask_64(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
2066}
2067//The total number of operations is 43.6666666667
2068static inline bitblock128_t simd_umax_128(bitblock128_t arg1, bitblock128_t arg2)
2069{
2070        bitblock128_t tmpAns = simd_umax_64(arg1, arg2);
2071        bitblock128_t eqMask1 = simd_srli_128(simd_eq_64(tmpAns, arg1), (64));
2072        bitblock128_t eqMask2 = simd_srli_128(simd_eq_64(tmpAns, arg2), (64));
2073        return simd_ifh_1(simd_himask_128(), tmpAns, simd_ifh_1(eqMask1, simd_ifh_1(eqMask2, tmpAns, arg1), arg2));
2074}
2075//The total number of operations is 4.0
2076static inline bitblock128_t simd_umax_16(bitblock128_t arg1, bitblock128_t arg2)
2077{
2078        bitblock128_t high_bit = simd_constant_16((32768));
2079        return simd_xor(simd_max_16(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit)), high_bit);
2080}
2081//The total number of operations is 1.0
2082static inline bitblock128_t bitblock_load_aligned(const bitblock128_t* arg1)
2083{
2084        return _mm_load_si128((bitblock128_t*)(arg1));
2085}
2086//The total number of operations is 1.0
2087static inline void bitblock_store_unaligned(bitblock128_t arg1, bitblock128_t* arg2)
2088{
2089        _mm_storeu_si128((bitblock128_t*)(arg2), arg1);
2090}
2091//The total number of operations is 11.0
2092static inline bitblock128_t esimd_signextendl_32(bitblock128_t arg1)
2093{
2094        return esimd_mergel_64(simd_srai_64(arg1, 32), simd_srai_64(simd_slli_64(arg1, 32), 32));
2095}
2096//The total number of operations is 31.0
2097static inline bitblock128_t esimd_signextendl_1(bitblock128_t arg1)
2098{
2099        return esimd_mergel_2(simd_srai_2(arg1, 1), simd_srai_2(simd_slli_2(arg1, 1), 1));
2100}
2101//The total number of operations is 33.0
2102static inline bitblock128_t esimd_signextendl_2(bitblock128_t arg1)
2103{
2104        return esimd_mergel_4(simd_srai_4(arg1, 2), simd_srai_4(simd_slli_4(arg1, 2), 2));
2105}
2106//The total number of operations is 13.0
2107static inline bitblock128_t esimd_signextendl_4(bitblock128_t arg1)
2108{
2109        return esimd_mergel_8(simd_srai_8(arg1, 4), simd_srai_8(simd_slli_8(arg1, 4), 4));
2110}
2111//The total number of operations is 4.0
2112static inline bitblock128_t esimd_signextendl_8(bitblock128_t arg1)
2113{
2114        return esimd_mergel_16(simd_srai_16(arg1, 8), simd_srai_16(simd_slli_16(arg1, 8), 8));
2115}
2116//The total number of operations is 13.4166666667
2117static inline bitblock128_t esimd_signextendl_64(bitblock128_t arg1)
2118{
2119        return simd_srai_128(simd_slli_128(arg1, 64), 64);
2120}
2121//The total number of operations is 4.0
2122static inline bitblock128_t esimd_signextendl_16(bitblock128_t arg1)
2123{
2124        return esimd_mergel_32(simd_srai_32(arg1, 16), simd_srai_32(simd_slli_32(arg1, 16), 16));
2125}
2126//The total number of operations is 12.0
2127static inline bitblock128_t hsimd_packus_32(bitblock128_t arg1, bitblock128_t arg2)
2128{
2129        bitblock128_t hiPart = hsimd_packh_32(arg1, arg2);
2130        return simd_ifh_16(hiPart, simd_constant_16(0), simd_or(simd_gt_16(hiPart, simd_constant_16(0)), hsimd_packl_32(arg1, arg2)));
2131}
2132//The total number of operations is 75.0
2133static inline bitblock128_t hsimd_packus_2(bitblock128_t arg1, bitblock128_t arg2)
2134{
2135        bitblock128_t arg11 = simd_ifh_2(arg1, simd_constant_2(0), arg1);
2136        bitblock128_t arg12 = simd_and(simd_lomask_2(), arg11);
2137        bitblock128_t arg21 = simd_ifh_2(arg2, simd_constant_2(0), arg2);
2138        bitblock128_t arg22 = simd_and(simd_lomask_2(), arg21);
2139        return hsimd_packl_2(simd_ifh_1(simd_eq_2(arg12, arg11), arg12, simd_lomask_2()), simd_ifh_1(simd_eq_2(arg22, arg21), arg22, simd_lomask_2()));
2140}
2141//The total number of operations is 74.3333333333
2142static inline bitblock128_t hsimd_packus_4(bitblock128_t arg1, bitblock128_t arg2)
2143{
2144        bitblock128_t hiPart = hsimd_packh_4(arg1, arg2);
2145        return simd_ifh_2(hiPart, simd_constant_2(0), simd_or(simd_gt_2(hiPart, simd_constant_2(0)), hsimd_packl_4(arg1, arg2)));
2146}
2147//The total number of operations is 31.6666666667
2148static inline bitblock128_t hsimd_packus_8(bitblock128_t arg1, bitblock128_t arg2)
2149{
2150        bitblock128_t arg11 = simd_ifh_8(arg1, simd_constant_8(0), arg1);
2151        bitblock128_t arg12 = simd_and(simd_lomask_8(), arg11);
2152        bitblock128_t arg21 = simd_ifh_8(arg2, simd_constant_8(0), arg2);
2153        bitblock128_t arg22 = simd_and(simd_lomask_8(), arg21);
2154        return hsimd_packl_8(simd_ifh_1(simd_eq_8(arg12, arg11), arg12, simd_lomask_8()), simd_ifh_1(simd_eq_8(arg22, arg21), arg22, simd_lomask_8()));
2155}
2156//The total number of operations is 12.0
2157static inline bitblock128_t hsimd_packus_64(bitblock128_t arg1, bitblock128_t arg2)
2158{
2159        bitblock128_t hiPart = hsimd_packh_64(arg1, arg2);
2160        return simd_ifh_32(hiPart, simd_constant_32(0), simd_or(simd_gt_32(hiPart, simd_constant_32(0)), hsimd_packl_64(arg1, arg2)));
2161}
2162//The total number of operations is 34.1666666667
2163static inline bitblock128_t hsimd_packus_128(bitblock128_t arg1, bitblock128_t arg2)
2164{
2165        bitblock128_t hiPart = hsimd_packh_128(arg1, arg2);
2166        return simd_ifh_64(hiPart, simd_constant_64(0), simd_or(simd_gt_64(hiPart, simd_constant_64(0)), hsimd_packl_128(arg1, arg2)));
2167}
2168//The total number of operations is 1.0
2169static inline bitblock128_t hsimd_packus_16(bitblock128_t arg1, bitblock128_t arg2)
2170{
2171        return _mm_packus_epi16(arg2, arg1);
2172}
2173//The total number of operations is 1.0
2174static inline bitblock128_t simd_abs_32(bitblock128_t arg1)
2175{
2176        return _mm_abs_epi32(arg1);
2177}
2178//The total number of operations is 7.33333333333
2179static inline bitblock128_t simd_abs_2(bitblock128_t arg1)
2180{
2181        return simd_ifh_1(simd_himask_2(), simd_and(arg1, simd_slli_128(simd_not(arg1), 1)), arg1);
2182}
2183//The total number of operations is 19.0
2184static inline bitblock128_t simd_abs_4(bitblock128_t arg1)
2185{
2186        bitblock128_t gtMask = simd_gt_4(arg1, simd_constant_4(0));
2187        return simd_ifh_1(gtMask, arg1, simd_sub_4(gtMask, arg1));
2188}
2189//The total number of operations is 1.0
2190static inline bitblock128_t simd_abs_8(bitblock128_t arg1)
2191{
2192        return _mm_abs_epi8(arg1);
2193}
2194//The total number of operations is 13.0
2195static inline bitblock128_t simd_abs_64(bitblock128_t arg1)
2196{
2197        bitblock128_t eqMask = simd_eq_64(simd_ifh_1(simd_himask_64(), simd_abs_32(arg1), arg1), arg1);
2198        return simd_ifh_1(eqMask, arg1, simd_sub_64(eqMask, arg1));
2199}
2200//The total number of operations is 40.0
2201static inline bitblock128_t simd_abs_128(bitblock128_t arg1)
2202{
2203        bitblock128_t eqMask = simd_eq_128(simd_ifh_1(simd_himask_128(), simd_abs_64(arg1), arg1), arg1);
2204        return simd_ifh_1(eqMask, arg1, simd_sub_128(eqMask, arg1));
2205}
2206//The total number of operations is 1.0
2207static inline bitblock128_t simd_abs_16(bitblock128_t arg1)
2208{
2209        return _mm_abs_epi16(arg1);
2210}
2211//The total number of operations is 3.0
2212static inline bitblock128_t simd_xor_hl_32(bitblock128_t arg1)
2213{
2214        return simd_xor(simd_srli_32(arg1, (16)), simd_and(arg1, simd_lomask_32()));
2215}
2216//The total number of operations is 4.0
2217static inline bitblock128_t simd_xor_hl_2(bitblock128_t arg1)
2218{
2219        return simd_xor(simd_srli_2(arg1, (1)), simd_and(arg1, simd_lomask_2()));
2220}
2221//The total number of operations is 4.0
2222static inline bitblock128_t simd_xor_hl_4(bitblock128_t arg1)
2223{
2224        return simd_xor(simd_srli_4(arg1, (2)), simd_and(arg1, simd_lomask_4()));
2225}
2226//The total number of operations is 4.0
2227static inline bitblock128_t simd_xor_hl_8(bitblock128_t arg1)
2228{
2229        return simd_xor(simd_srli_8(arg1, (4)), simd_and(arg1, simd_lomask_8()));
2230}
2231//The total number of operations is 3.0
2232static inline bitblock128_t simd_xor_hl_64(bitblock128_t arg1)
2233{
2234        return simd_xor(simd_srli_64(arg1, (32)), simd_and(arg1, simd_lomask_64()));
2235}
2236//The total number of operations is 4.33333333333
2237static inline bitblock128_t simd_xor_hl_128(bitblock128_t arg1)
2238{
2239        return simd_xor(simd_srli_128(arg1, (64)), simd_and(arg1, simd_lomask_128()));
2240}
2241//The total number of operations is 3.0
2242static inline bitblock128_t simd_xor_hl_16(bitblock128_t arg1)
2243{
2244        return simd_xor(simd_srli_16(arg1, (8)), simd_and(arg1, simd_lomask_16()));
2245}
2246//The total number of operations is 10.0
2247static inline bitblock128_t simd_srai_4(bitblock128_t arg1, uint64_t sh)
2248{
2249        bitblock128_t tmp = simd_srli_4(arg1, ((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)));
2250        return simd_or(tmp, simd_sub_4(simd_constant_4(0), simd_and(simd_constant_4((1<<((4-((sh >= 4) ? (3) : ((sh < 0) ? 0 : sh)))-1))), tmp)));
2251}
2252//The total number of operations is 5.0
2253static inline bitblock128_t simd_srai_8(bitblock128_t arg1, uint64_t sh)
2254{
2255        bitblock128_t tmp = simd_srli_8(arg1, ((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)));
2256        return simd_or(tmp, simd_sub_8(simd_constant_8(0), simd_and(simd_constant_8((1<<((8-((sh >= 8) ? (7) : ((sh < 0) ? 0 : sh)))-1))), tmp)));
2257}
2258//The total number of operations is 1.0
2259static inline bitblock128_t simd_and(bitblock128_t arg1, bitblock128_t arg2)
2260{
2261        return _mm_and_si128(arg1, arg2);
2262}
2263//The total number of operations is 15.0
2264static inline bitblock128_t mvmd_fill16_1(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2265{
2266        return simd_or(mvmd_fill16_2((val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1), (val1<<1), (val3<<1), (val5<<1), (val7<<1), (val9<<1), (val11<<1), (val13<<1), (val15<<1)), mvmd_fill16_2((val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1)), (val2&(1)), (val4&(1)), (val6&(1)), (val8&(1)), (val10&(1)), (val12&(1)), (val14&(1)), (val16&(1))));
2267}
2268//The total number of operations is 7.0
2269static inline bitblock128_t mvmd_fill16_2(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2270{
2271        return simd_or(mvmd_fill16_4((val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2), (val1<<2), (val3<<2), (val5<<2), (val7<<2), (val9<<2), (val11<<2), (val13<<2), (val15<<2)), mvmd_fill16_4((val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3)), (val2&(3)), (val4&(3)), (val6&(3)), (val8&(3)), (val10&(3)), (val12&(3)), (val14&(3)), (val16&(3))));
2272}
2273//The total number of operations is 3.0
2274static inline bitblock128_t mvmd_fill16_4(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2275{
2276        return simd_or(mvmd_fill16_8((val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4), (val1<<4), (val3<<4), (val5<<4), (val7<<4), (val9<<4), (val11<<4), (val13<<4), (val15<<4)), mvmd_fill16_8((val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15)), (val2&(15)), (val4&(15)), (val6&(15)), (val8&(15)), (val10&(15)), (val12&(15)), (val14&(15)), (val16&(15))));
2277}
2278//The total number of operations is 1.0
2279static inline bitblock128_t mvmd_fill16_8(uint64_t val1, uint64_t val2, uint64_t val3, uint64_t val4, uint64_t val5, uint64_t val6, uint64_t val7, uint64_t val8, uint64_t val9, uint64_t val10, uint64_t val11, uint64_t val12, uint64_t val13, uint64_t val14, uint64_t val15, uint64_t val16)
2280{
2281        return _mm_set_epi8((int32_t)(val1), (int32_t)(val2), (int32_t)(val3), (int32_t)(val4), (int32_t)(val5), (int32_t)(val6), (int32_t)(val7), (int32_t)(val8), (int32_t)(val9), (int32_t)(val10), (int32_t)(val11), (int32_t)(val12), (int32_t)(val13), (int32_t)(val14), (int32_t)(val15), (int32_t)(val16));
2282}
2283//The total number of operations is 5.0
2284static inline bitblock128_t simd_lt_32(bitblock128_t arg1, bitblock128_t arg2)
2285{
2286        return simd_and(simd_not(simd_gt_32(arg1, arg2)), simd_not(simd_eq_32(arg1, arg2)));
2287}
2288//The total number of operations is 1.0
2289static inline bitblock128_t simd_lt_1(bitblock128_t arg1, bitblock128_t arg2)
2290{
2291        return simd_andc(arg1, arg2);
2292}
2293//The total number of operations is 14.6666666667
2294static inline bitblock128_t simd_lt_2(bitblock128_t arg1, bitblock128_t arg2)
2295{
2296        bitblock128_t tmp = simd_not(arg2);
2297        bitblock128_t tmpAns = simd_or(simd_and(arg1, tmp), simd_and(simd_slli_128(simd_and(simd_not(arg1), arg2), 1), simd_or(arg1, tmp)));
2298        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(tmpAns, 1));
2299}
2300//The total number of operations is 18.0
2301static inline bitblock128_t simd_lt_4(bitblock128_t arg1, bitblock128_t arg2)
2302{
2303        return simd_ifh_1(simd_himask_8(), simd_lt_8(arg1, simd_and(simd_himask_8(), arg2)), simd_lt_8(simd_slli_8(arg1, 4), simd_slli_8(arg2, 4)));
2304}
2305//The total number of operations is 5.0
2306static inline bitblock128_t simd_lt_8(bitblock128_t arg1, bitblock128_t arg2)
2307{
2308        return simd_and(simd_not(simd_gt_8(arg1, arg2)), simd_not(simd_eq_8(arg1, arg2)));
2309}
2310//The total number of operations is 19.5
2311static inline bitblock128_t simd_lt_64(bitblock128_t arg1, bitblock128_t arg2)
2312{
2313        bitblock128_t high_bit = simd_constant_64((9223372036854775808ULL));
2314        return simd_ult_64(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2315}
2316//The total number of operations is 60.75
2317static inline bitblock128_t simd_lt_128(bitblock128_t arg1, bitblock128_t arg2)
2318{
2319        bitblock128_t hiAns = simd_lt_64(arg1, arg2);
2320        bitblock128_t loAns = simd_ult_64(arg1, arg2);
2321        bitblock128_t mask = simd_and(loAns, simd_srli_128(simd_eq_64(arg1, arg2), (64)));
2322        mask = simd_or(mask, simd_slli_128(mask, (64)));
2323        return simd_or(simd_srai_128(hiAns, (64)), mask);
2324}
2325//The total number of operations is 5.0
2326static inline bitblock128_t simd_lt_16(bitblock128_t arg1, bitblock128_t arg2)
2327{
2328        return simd_and(simd_not(simd_gt_16(arg1, arg2)), simd_not(simd_eq_16(arg1, arg2)));
2329}
2330//The total number of operations is 1.0
2331static inline bitblock128_t simd_add_32(bitblock128_t arg1, bitblock128_t arg2)
2332{
2333        return _mm_add_epi32(arg1, arg2);
2334}
2335//The total number of operations is 1.0
2336static inline bitblock128_t simd_add_1(bitblock128_t arg1, bitblock128_t arg2)
2337{
2338        return simd_xor(arg1, arg2);
2339}
2340//The total number of operations is 8.33333333333
2341static inline bitblock128_t simd_add_2(bitblock128_t arg1, bitblock128_t arg2)
2342{
2343        bitblock128_t tmp = simd_xor(arg1, arg2);
2344        return simd_ifh_1(simd_himask_2(), simd_xor(tmp, simd_slli_128(simd_and(arg1, arg2), 1)), tmp);
2345}
2346//The total number of operations is 6.0
2347static inline bitblock128_t simd_add_4(bitblock128_t arg1, bitblock128_t arg2)
2348{
2349        return simd_ifh_1(simd_himask_8(), simd_add_8(arg1, simd_and(simd_himask_8(), arg2)), simd_add_8(arg1, arg2));
2350}
2351//The total number of operations is 1.0
2352static inline bitblock128_t simd_add_8(bitblock128_t arg1, bitblock128_t arg2)
2353{
2354        return _mm_add_epi8(arg1, arg2);
2355}
2356//The total number of operations is 1.0
2357static inline bitblock128_t simd_add_64(bitblock128_t arg1, bitblock128_t arg2)
2358{
2359        return _mm_add_epi64(arg1, arg2);
2360}
2361//The total number of operations is 9.33333333333
2362static inline bitblock128_t simd_add_128(bitblock128_t arg1, bitblock128_t arg2)
2363{
2364        bitblock128_t partial = simd_add_64(arg1, arg2);
2365        bitblock128_t carryMask = simd_or(simd_and(arg1, arg2), simd_andc(simd_xor(arg1, arg2), partial));
2366        bitblock128_t carry = simd_slli_128(simd_srli_64(carryMask, (63)), (64));
2367        return simd_add_64(partial, carry);
2368}
2369//The total number of operations is 1.0
2370static inline bitblock128_t simd_add_16(bitblock128_t arg1, bitblock128_t arg2)
2371{
2372        return _mm_add_epi16(arg1, arg2);
2373}
2374//The total number of operations is 3.0
2375static inline bitblock128_t simd_ugt_32(bitblock128_t arg1, bitblock128_t arg2)
2376{
2377        bitblock128_t high_bit = simd_constant_32((2147483648ULL));
2378        return simd_gt_32(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2379}
2380//The total number of operations is 1.0
2381static inline bitblock128_t simd_ugt_1(bitblock128_t arg1, bitblock128_t arg2)
2382{
2383        return simd_andc(arg1, arg2);
2384}
2385//The total number of operations is 13.6666666667
2386static inline bitblock128_t simd_ugt_2(bitblock128_t arg1, bitblock128_t arg2)
2387{
2388        bitblock128_t tmp = simd_not(arg2);
2389        bitblock128_t tmpAns = simd_or(simd_and(arg1, tmp), simd_and(simd_slli_128(simd_and(arg1, tmp), 1), simd_or(arg1, tmp)));
2390        return simd_ifh_1(simd_himask_2(), tmpAns, simd_srli_128(tmpAns, 1));
2391}
2392//The total number of operations is 12.0
2393static inline bitblock128_t simd_ugt_4(bitblock128_t arg1, bitblock128_t arg2)
2394{
2395        return simd_ifh_1(simd_himask_8(), simd_ugt_8(simd_and(simd_himask_8(), arg1), arg2), simd_ugt_8(simd_andc(arg1, simd_himask_8()), simd_andc(arg2, simd_himask_8())));
2396}
2397//The total number of operations is 3.0
2398static inline bitblock128_t simd_ugt_8(bitblock128_t arg1, bitblock128_t arg2)
2399{
2400        bitblock128_t high_bit = simd_constant_8((128));
2401        return simd_gt_8(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2402}
2403//The total number of operations is 13.5
2404static inline bitblock128_t simd_ugt_64(bitblock128_t arg1, bitblock128_t arg2)
2405{
2406        bitblock128_t tmpAns = simd_ugt_32(arg1, arg2);
2407        bitblock128_t mask = simd_and(tmpAns, simd_srli_64(simd_eq_32(arg1, arg2), (32)));
2408        mask = simd_or(mask, simd_slli_64(mask, (32)));
2409        return simd_or(simd_srai_64(tmpAns, (32)), mask);
2410}
2411//The total number of operations is 37.25
2412static inline bitblock128_t simd_ugt_128(bitblock128_t arg1, bitblock128_t arg2)
2413{
2414        bitblock128_t tmpAns = simd_ugt_64(arg1, arg2);
2415        bitblock128_t mask = simd_and(tmpAns, simd_srli_128(simd_eq_64(arg1, arg2), (64)));
2416        mask = simd_or(mask, simd_slli_128(mask, (64)));
2417        return simd_or(simd_srai_128(tmpAns, (64)), mask);
2418}
2419//The total number of operations is 3.0
2420static inline bitblock128_t simd_ugt_16(bitblock128_t arg1, bitblock128_t arg2)
2421{
2422        bitblock128_t high_bit = simd_constant_16((32768));
2423        return simd_gt_16(simd_xor(arg1, high_bit), simd_xor(arg2, high_bit));
2424}
2425#endif
Note: See TracBrowser for help on using the repository browser.