source: trunk/lib/avx_simd.h @ 1570

Last change on this file since 1570 was 1233, checked in by cameron, 8 years ago

Add sisd_to_int

/*  Idealized SIMD Operations with SSE versions
    Copyright (C) 2006, 2007, 2008, Robert D. Cameron
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters Inc.
       under the Academic Free License version 3.0.
*/

#ifndef AVX_SIMD_H
#define AVX_SIMD_H

#include <stdio.h>

#warning compiling avx_simd.h

/*------------------------------------------------------------*/

#include <stdint.h>

#ifdef _MSC_VER
#define LITTLE_ENDIAN 1234
#define BIG_ENDIAN 4321
#define BYTE_ORDER LITTLE_ENDIAN
#endif

#include <immintrin.h>
typedef __m256 SIMD_type;
/*------------------------------------------------------------*/

/* Prints the SIMD register representation of a SIMD value. */
static void print_simd_register(const char * var_name, SIMD_type v);



#define simd_hi128(x) \
        ((__m128i)(_mm256_extractf128_ps(x, 1)))

        //(_mm256_permute2f128_ps(x, x, 128 + 3))

#define simd_lo128(x) \
        ((__m128i) _mm256_castps256_ps128(x))

#define simd_lotohi128(x) \
        _mm256_permute2f128_ps(x, x, 0 + 8)

#define simd_combine256(x, y) \
   (_mm256_insertf128_ps(_mm256_castps128_ps256((__m128) y), (__m128) x, 1))
   //(_mm256_permute2f128_ps(_mm256_castps128_ps256((__m128) x), _mm256_castps128_ps256((__m128) y), 2))

#define simd_op256(op128, x, y) \
  simd_combine256(op128(simd_hi128(x), simd_hi128(y)), op128(simd_lo128(x), simd_lo128(y)))

#define simd_unaryop256(op128, x, y) \
  simd_combine256(op128(simd_hi128(x), y), op128(simd_lo128(x), y))
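
/* AVX (prior to AVX2) has no 256-bit integer instructions; only the
   floating-point domain provides 256-bit bitwise operations.  Integer
   operations are therefore emulated by splitting a register into its two
   128-bit halves, applying the corresponding SSE intrinsic to each half,
   and recombining.  Illustrative expansion:
     simd_op256(_mm_add_epi8, x, y)
       ==> simd_combine256(_mm_add_epi8(simd_hi128(x), simd_hi128(y)),
                           _mm_add_epi8(simd_lo128(x), simd_lo128(y)))  */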

/* I. SIMD bitwise logical operations */

#define simd_or(b1, b2) _mm256_or_ps(b1, b2)
#define simd_and(b1, b2) _mm256_and_ps(b1, b2)
#define simd_xor(b1, b2) _mm256_xor_ps(b1, b2)
#define simd_andc(b1, b2) _mm256_andnot_ps(b2, b1)
#define simd_if(cond, then_val, else_val) \
  simd_or(simd_and(then_val, cond), simd_andc(else_val, cond))
#define simd_not(b) (simd_xor(b, ((SIMD_type)_mm256_set1_epi32(0xFFFFFFFF))))
#define simd_nor(a,b) (simd_not(simd_or(a,b)))
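
/* Note that simd_andc(b1, b2) computes b1 & ~b2: _mm256_andnot_ps complements
   its first operand, so the arguments are swapped in the definition above.
   simd_if is a bitwise select: each result bit is taken from then_val where
   cond has a 1 and from else_val where cond has a 0.  Illustrative use
   (hypothetical values):
     SIMD_type r = simd_if(mask, a, b);   // r = (a & mask) | (b & ~mask)  */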

/*  Specific constants. */
#define sisd_low_bit_mask ((SIMD_type) _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0x00000001))
#define sisd_high_bit_mask ((SIMD_type) _mm256_set_epi32(0x80000000, 0, 0, 0, 0, 0, 0, 0))
#define simd_himask_2 ((SIMD_type) _mm256_set1_epi32(0xAAAAAAAA))
#define simd_himask_4 ((SIMD_type) _mm256_set1_epi32(0xCCCCCCCC))
#define simd_himask_8 ((SIMD_type) _mm256_set1_epi32(0xF0F0F0F0))

/* Little-endian */
#define simd_himask_16 ((SIMD_type) _mm256_set1_epi32(0xFF00FF00))
//#define simd_himask_16_128 ((SIMD_type) _mm128_set1_epi32(0xFF00FF00))
#define simd_himask_32 ((SIMD_type) _mm256_set1_epi32(0xFFFF0000))

#define simd_lomask_64 ((SIMD_type) _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1))
#define simd_himask_64 ((SIMD_type) _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0))

#define simd_lomask_128 ((SIMD_type) _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1))
#define simd_himask_128 ((SIMD_type) _mm256_set_epi32(-1, -1, 0, 0, -1, -1, 0, 0))

#define simd_himask_256 ((SIMD_type) _mm256_set_epi32(-1, -1, -1, -1, 0, 0, 0, 0))
#define simd_lomask_256 ((SIMD_type) _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1))

/* Idealized operations with direct implementation by built-in
   operations for various target architectures. */

#define simd_add_8(a, b) simd_op256(_mm_add_epi8, a, b)
#define simd_add_16(a, b) simd_op256(_mm_add_epi16, a, b)
#define simd_add_32(a, b) simd_op256(_mm_add_epi32, a, b)
#define simd_add_64(a, b) simd_op256(_mm_add_epi64, a, b)

#define simd_sub_8(a, b) simd_op256(_mm_sub_epi8, a, b)
#define simd_sub_16(a, b) simd_op256(_mm_sub_epi16, a, b)
#define simd_sub_32(a, b) simd_op256(_mm_sub_epi32, a, b)
#define simd_sub_64(a, b) simd_op256(_mm_sub_epi64, a, b)

#define simd_mult_16(a, b) simd_op256(_mm_mullo_epi16, a, b)

#define simd_slli_16(r, shft) simd_unaryop256(_mm_slli_epi16, r, shft)
#define simd_srli_16(r, shft) simd_unaryop256(_mm_srli_epi16, r, shft)
#define simd_srai_16(r, shft) simd_unaryop256(_mm_srai_epi16, r, shft)

#define simd_slli_32(r, shft) simd_unaryop256(_mm_slli_epi32, r, shft)
#define simd_srli_32(r, shft) simd_unaryop256(_mm_srli_epi32, r, shft)
#define simd_srai_32(r, shft) simd_unaryop256(_mm_srai_epi32, r, shft)

#define simd_slli_64(r, shft) simd_unaryop256(_mm_slli_epi64, r, shft)
#define simd_srli_64(r, shft) simd_unaryop256(_mm_srli_epi64, r, shft)

#define simd_sll_64(r, shft_reg) simd_op256(_mm_sll_epi64, r, shft_reg)
#define simd_srl_64(r, shft_reg) simd_op256(_mm_srl_epi64, r, shft_reg)

#define packop256(packop128, x, y) \
  simd_combine256(packop128(simd_lo128(x), simd_hi128(x)), packop128(simd_lo128(y), simd_hi128(y)))

#define simd_packus_16(a, b) \
        packop256(_mm_packus_epi16, a, b)

#define simd_pack_16(a, b) \
        packop256(_mm_packus_epi16, \
                simd_andc(a, simd_himask_16), \
                simd_andc(b, simd_himask_16))
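
/* simd_pack_16 narrows 16-bit fields to 8-bit fields by truncation: the
   simd_andc calls clear the high byte of every field, so the saturating
   _mm_packus_epi16 cannot saturate and simply keeps each low byte.  The
   packed fields of b occupy the low 128 bits of the result and those of a
   the high 128 bits. */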

#define simd_mergeh_8(a, b) simd_op256(_mm_unpackhi_epi8, b, a)
#define simd_mergeh_16(a, b) simd_op256(_mm_unpackhi_epi16, b, a)
#define simd_mergeh_32(a, b) simd_op256(_mm_unpackhi_epi32, b, a)
#define simd_mergeh_64(a, b) simd_op256(_mm_unpackhi_epi64, b, a)
//_mm256_unpackhi_ps

#define simd_mergel_8(a, b) simd_op256(_mm_unpacklo_epi8, b, a)
#define simd_mergel_16(a, b) simd_op256(_mm_unpacklo_epi16, b, a)
#define simd_mergel_32(a, b) simd_op256(_mm_unpacklo_epi32, b, a)
#define simd_mergel_64(a, b) simd_op256(_mm_unpacklo_epi64, b, a)

#define simd_eq_8(a, b) simd_op256(_mm_cmpeq_epi8, a, b)
#define simd_eq_16(a, b) simd_op256(_mm_cmpeq_epi16, a, b)
#define simd_eq_32(a, b) simd_op256(_mm_cmpeq_epi32, a, b)

#define simd_max_8(a, b) simd_op256(_mm_max_epu8, a, b)

#define _avx_mm_slli_128(r, shft) ((SIMD_type)simd_unaryop256(_mm_slli_si128, (r), shft))
#define _avx_mm_srli_128(r, shft) ((SIMD_type)simd_unaryop256(_mm_srli_si128, (r), shft))

#define simd_slli_128(r, shft) \
  ((shft) % 8 == 0 ? _avx_mm_slli_128(r, (shft) / 8) : \
   (shft) >= 64 ? simd_slli_64(_avx_mm_slli_128(r, 8), (shft) - 64) : \
   simd_or(simd_slli_64(r, shft), _avx_mm_slli_128(simd_srli_64(r, 64 - (shft)), 8)))

#define simd_srli_128(r, shft) \
  ((shft) % 8 == 0 ? _avx_mm_srli_128(r, (shft) / 8) : \
   (shft) >= 64 ? simd_srli_64(_avx_mm_srli_128(r, 8), (shft) - 64) : \
   simd_or(simd_srli_64(r, shft), _avx_mm_srli_128(simd_slli_64(r, 64 - (shft)), 8)))
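
/* SSE provides no full 128-bit shift by an arbitrary bit count, so
   simd_slli_128 / simd_srli_128 compose one: a byte shift when the count is
   a multiple of 8, otherwise a 64-bit shift combined with the bits that
   cross the 64-bit boundary, moved over by an 8-byte shift.  Worked example:
     simd_slli_128(r, 3)
       ==> simd_or(simd_slli_64(r, 3), _avx_mm_slli_128(simd_srli_64(r, 61), 8))
   i.e. each 64-bit field is shifted left by 3, and the 3 bits falling out of
   the low field are recovered by the shift right by 61 and moved up 8 bytes
   into the high field. */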

#define simd_slli_256(r, shft) \
  simd_or(simd_slli_128(r, shft), simd_lotohi128(simd_srli_128(r, 128 - (shft))))

/* For the right shift, the high 128-bit lane is moved down to the low lane to
   supply the bits that cross the lane boundary. */
#define simd_srli_256(r, shft) \
  simd_or(simd_srli_128(r, shft), simd_slli_128((SIMD_type) _mm256_castsi128_si256(simd_hi128(r)), 128 - (shft)))

#define simd128_slli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft) / 8) : \
   (shft) >= 64 ? _mm_slli_epi64(_mm_slli_si128(r, 8), (shft) - 64) : \
   _mm_or_si128(_mm_slli_epi64(r, shft), _mm_slli_si128(_mm_srli_epi64(r, 64 - (shft)), 8)))

#define simd128_srli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft) / 8) : \
   (shft) >= 64 ? _mm_srli_epi64(_mm_srli_si128(r, 8), (shft) - 64) : \
   _mm_or_si128(_mm_srli_epi64(r, shft), _mm_srli_si128(_mm_slli_epi64(r, 64 - (shft)), 8)))

/*

static inline SIMD_type simd_slli_256(SIMD_type r, uint32_t shft)
{
        __m128i x = simd_hi128(r);
        __m128i y = simd_lo128(r);

        return
        simd_combine256(
        _mm_or_si128(
                simd128_slli_128(x, shft),
                simd128_srli_128(y, (128 - shft))),
        simd128_slli_128(y, shft));
}

static inline SIMD_type simd_srli_256(SIMD_type r, uint32_t shft)
{
        __m128i x = simd_lo128(r);
        __m128i y = simd_hi128(r);

        return
        simd_combine256(
        _mm_or_si128(
                simd128_srli_128(x, shft),
                simd128_slli_128(y, (128 - shft))),
        simd128_srli_128(y, shft));
}
*/

#define simd_sll_128(r, shft) \
   simd_or(simd_sll_64(r, shft), \
           simd_or(_mm_slli_si128(simd_sll_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_slli_si128(simd_srl_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define simd_srl_128(r, shft) \
   simd_or(simd_srl_64(r, shft), \
           simd_or(_mm_srli_si128(simd_srl_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_srli_si128(simd_sll_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))
/*
#define simd_sll_256(r, shft) \
   simd_or(simd_sll_128(r, shft), \
           simd_or(simd_slli_256(simd_sll_128(r, simd_sub_32(shft, sisd_from_int(128))), 16), \
                   simd_slli_256(simd_srl_128(r, simd_sub_32(sisd_from_int(128), shft)), 16)))

#define simd_srl_256(r, shft) \
   simd_or(simd_srl_128(r, shft), \
           simd_or(simd_srli_256(simd_srl_128(r, simd_sub_32(shft, sisd_from_int(128))), 16), \
                   simd_srli_256(simd_sll_128(r, simd_sub_32(sisd_from_int(128), shft)), 16)))
*/
/*
#define simd128_sll_128(r, shft) \
   simd_or(simd_sll_64(r, shft), \
           simd_or(_mm_slli_si128(simd_sll_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_slli_si128(simd_srl_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define simd128_srl_128(r, shft) \
   simd_or(simd_srl_64(r, shft), \
           simd_or(_mm_srli_si128(simd_srl_64(r, simd_sub_32(shft, _mm_srli_epi32(64))), 8), \
                   _mm_srli_si128(simd_sll_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define simd_sll_64(r, shft_reg) _mm_sll_epi64(r, shft_reg)
#define simd_srl_64(r, shft_reg) _mm_srl_epi64(r, shft_reg)
*/



static inline __m128i simd128_srl_128(__m128i r, __m128i shft)
{
  return
  _mm_or_si128(_mm_srl_epi64(r, shft),
        _mm_or_si128(_mm_srli_si128(_mm_srl_epi64(r, _mm_sub_epi32(shft, _mm_cvtsi32_si128(64))), 8),
                     _mm_srli_si128(_mm_sll_epi64(r, _mm_sub_epi32(_mm_cvtsi32_si128(64), shft)), 8)));
}

static inline __m128i simd128_sll_128(__m128i r, __m128i shft)
{
  return
  _mm_or_si128(_mm_sll_epi64(r, shft),
        _mm_or_si128(_mm_slli_si128(_mm_sll_epi64(r, _mm_sub_epi32(shft, _mm_cvtsi32_si128(64))), 8),
                     _mm_slli_si128(_mm_srl_epi64(r, _mm_sub_epi32(_mm_cvtsi32_si128(64), shft)), 8)));
}

static inline SIMD_type simd_sll_256(SIMD_type r, __m128i s)
{
        //__m128i s = simd_lo128(shft);
        __m128i x = simd_hi128(r);
        __m128i y = simd_lo128(r);

        return
        simd_combine256(
           _mm_or_si128(
                _mm_or_si128(simd128_sll_128(x, s), simd128_sll_128(y, _mm_sub_epi32(s, _mm_cvtsi32_si128(128)))),
                simd128_srl_128(y, _mm_sub_epi32(_mm_cvtsi32_si128(128), s))),
        simd128_sll_128(y, s));
}

static inline SIMD_type simd_srl_256(SIMD_type r, __m128i s)
{
        //__m128i s = simd_lo128(shft);
        __m128i x = simd_lo128(r);
        __m128i y = simd_hi128(r);

        return
        simd_combine256(
           simd128_srl_128(y, s),
           _mm_or_si128(
                _mm_or_si128(simd128_srl_128(x, s), simd128_srl_128(y, _mm_sub_epi32(s, _mm_cvtsi32_si128(128)))),
                simd128_sll_128(y, _mm_sub_epi32(_mm_cvtsi32_si128(128), s))));
}

#define sisd_sll(r, shft) simd_sll_256(r, simd_lo128(shft))
#define sisd_srl(r, shft) simd_srl_256(r, simd_lo128(shft))
#define sisd_slli(r, shft) simd_slli_256(r, shft)
#define sisd_srli(r, shft) simd_srli_256(r, shft)
#define sisd_add(a, b) simd_add_256(a, b)
#define sisd_sub(a, b) simd_sub_256(a, b)

#define sisd_store_aligned(r, addr) _mm256_store_ps(addr, r)
#define sisd_store_unaligned(r, addr) _mm256_storeu_ps(addr, r)
#define sisd_load_aligned(addr) ((SIMD_type)_mm256_load_si256((__m256i const *)addr))
#define sisd_load_unaligned(addr) ((SIMD_type)_mm256_loadu_si256((__m256i const *)addr))


#define simd_const_32(n) ((__m256)_mm256_set1_epi32(n))
#define simd_const_16(n) ((__m256)_mm256_set1_epi16(n))
#define simd_const_8(n) ((__m256)_mm256_set1_epi8(n))
#define simd_const_4(n) ((__m256)_mm256_set1_epi8((n) << 4 | (n)))
#define simd_const_2(n) simd_const_4((n) << 2 | (n))
#define simd_const_1(n) (n==0 ? (__m256) _mm256_setzero_ps() : simd_const_8(-1))
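
/* simd_const_n replicates an n-bit constant across every n-bit field.  For
   field widths below 8 the constant is first replicated within a byte:
   simd_const_4(n) sets every byte to (n << 4) | n, and simd_const_2 doubles
   the 2-bit value up into a 4-bit one.  simd_const_1 maps 0 to all zeroes and
   1 to all ones.  Example: simd_const_2(1) yields bytes of 0x55, i.e. the
   value 1 in every 2-bit field. */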

static inline
SIMD_type simd_add_2(SIMD_type a, SIMD_type b)
{
         SIMD_type c1 = simd_xor(a, b);
         SIMD_type borrow = simd_and(a, b);
         SIMD_type c2 = simd_xor(c1, (sisd_slli(borrow, 1)));
         return simd_if(simd_himask_2, c2, c1);
}
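
/* simd_add_2 adds independent 2-bit fields with half-adder logic: c1 = a ^ b
   is the sum without carries, borrow = a & b marks the carry positions, and
   c2 folds each carry into the next higher bit.  The final simd_if keeps c2
   in the high bit of each field and c1 in the low bit, so a carry shifted in
   from a neighbouring field is discarded.  Worked example for one field:
     a = 01, b = 01:  c1 = 00, borrow = 01, c2 = 00 ^ 10 = 10,
     result = 10 (i.e. 1 + 1 = 2). */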

#define simd_add_4(a, b)\
        simd_if(simd_himask_8, simd_add_8(simd_and(a,simd_himask_8),simd_and(b,simd_himask_8))\
        ,simd_add_8(simd_andc(a,simd_himask_8),simd_andc(b,simd_himask_8)))

#define simd_srli_2(r, sh)\
         simd_and(simd_srli_32(r,sh), simd_const_2(3>>sh))

#define simd_srli_4(r, sh)\
         simd_and(simd_srli_32(r,sh), simd_const_4(15>>sh))

#define simd_srli_8(r, sh)\
         simd_and(simd_srli_32(r,sh), simd_const_8(255>>sh))

#define simd_slli_2(r, sh)\
         simd_and(simd_slli_32(r,sh), simd_const_2((3<<sh)&3))

#define simd_slli_4(r, sh)\
         simd_and(simd_slli_32(r,sh), simd_const_4((15<<sh)&15))

#define simd_slli_8(r, sh)\
         simd_and(simd_slli_32(r,sh), simd_const_8((255<<sh)&255))

#define simd_mergeh_4(a,b)\
        simd_mergeh_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)),\
        simd_if(simd_himask_8,simd_slli_8(a,4),b))

#define simd_mergel_4(a,b)\
        simd_mergel_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)),\
        simd_if(simd_himask_8,simd_slli_8(a,4),b))

#define simd_mergeh_2(a,b)\
        simd_mergeh_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)),\
        simd_if(simd_himask_4,simd_slli_4(a,2),b))

#define simd_mergel_2(a,b)\
        simd_mergel_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)),\
        simd_if(simd_himask_4,simd_slli_4(a,2),b))

#define simd_mergeh_1(a,b)\
        simd_mergeh_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)),\
        simd_if(simd_himask_2,simd_slli_2(a,1),b))

#define simd_mergel_1(a,b)\
        simd_mergel_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)),\
        simd_if(simd_himask_2,simd_slli_2(a,1),b))

#define sisd_from_int(n) _mm256_castsi256_ps(_mm256_castsi128_si256(_mm_cvtsi32_si128(n)))

#define sisd_to_int(x)  _mm_extract_epi32(_mm256_castsi256_si128((__m256i) x), 0)
/*
#define sisd_to_int(x) _mm_cvtsi128_si32(x)

#define sisd_from_int(n) _mm_cvtsi32_si128(n)

static inline int simd_all_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) == 0xFFFF;
}

static inline int simd_any_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

static inline int simd_any_sign_bit_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

#define simd_movemask_8(v) _mm_movemask_epi8(v)

#define simd_all_eq_8(v1, v2) simd_all_true_8(_mm_cmpeq_epi8(v1, v2))


#define simd_all_le_8(v1, v2) \
  simd_all_eq_8(simd_max_8(v1, v2), v2)

#define simd_all_signed_gt_8(v1, v2) simd_all_true_8(_mm_cmpgt_epi8(v1, v2))

#define simd_cmpgt_8(v1,v2) _mm_cmpgt_epi8(v1, v2)

*/

static inline int simd_movemask_8(SIMD_type v)
{
        __m128i x1 = simd_lo128(v);
        __m128i y1 = simd_hi128(v);
        return (_mm_movemask_epi8(x1) | (_mm_movemask_epi8(y1) << 16));
}
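
/* simd_movemask_8 gathers the sign bit of each of the 32 bytes into a 32-bit
   integer: the SSE movemask of the low 128 bits supplies bits 0-15, and the
   movemask of the high 128 bits supplies bits 16-31. */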

static inline int simd_all_eq_8(SIMD_type v1, SIMD_type v2)
{
        __m128i x1 = simd_hi128(v1);
        __m128i y1 = simd_lo128(v1);

        __m128i x2 = simd_hi128(v2);
        __m128i y2 = simd_lo128(v2);

        return ((_mm_movemask_epi8(_mm_cmpeq_epi8(x1, x2)) & _mm_movemask_epi8(_mm_cmpeq_epi8(y1, y2))) == 0xFFFF);
}

static inline int bitblock_has_bit(SIMD_type v)
{
        return !_mm256_testz_si256((__m256i) v, (__m256i) v);
}


#define simd_pack_2(a,b)\
        simd_pack_4(simd_if(simd_himask_2,sisd_srli(a,1),a),\
        simd_if(simd_himask_2,sisd_srli(b,1),b))
#define simd_pack_4(a,b)\
        simd_pack_8(simd_if(simd_himask_4,sisd_srli(a,2),a),\
        simd_if(simd_himask_4,sisd_srli(b,2),b))
#define simd_pack_8(a,b)\
        simd_pack_16(simd_if(simd_himask_8,sisd_srli(a,4),a),\
        simd_if(simd_himask_8,sisd_srli(b,4),b))

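/* Half-operand modifier convention for the macros below: each suffix letter
   says how the corresponding operand's fields are used.  'x' takes the field
   as-is, 'l' takes only the low half of each field (high half masked off),
   and 'h' takes the high half of each field, shifted down.  For example,
   simd_add_2_xh(v1, v2) adds v1 to the high bit of each 2-bit field of v2.
   Where both operands are reduced to half width (the _ll, _lh, _hl and _hh
   forms of the 2-bit and 4-bit adds), a plain simd_add_8 suffices because no
   sum can carry across a field boundary. */
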
#ifndef simd_add_2_xx
#define simd_add_2_xx(v1, v2) simd_add_2(v1, v2)
#endif

#ifndef simd_add_2_xl
#define simd_add_2_xl(v1, v2) simd_add_2(v1, simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_xh
#define simd_add_2_xh(v1, v2) simd_add_2(v1, simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_lx
#define simd_add_2_lx(v1, v2) simd_add_2(simd_andc(v1, simd_himask_2), v2)
#endif

#ifndef simd_add_2_ll
#define simd_add_2_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_lh
#define simd_add_2_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_hx
#define simd_add_2_hx(v1, v2) simd_add_2(simd_srli_2(v1, 1), v2)
#endif

#ifndef simd_add_2_hl
#define simd_add_2_hl(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_hh
#define simd_add_2_hh(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_4_xx
#define simd_add_4_xx(v1, v2) simd_add_4(v1, v2)
#endif

#ifndef simd_add_4_xl
#define simd_add_4_xl(v1, v2) simd_add_4(v1, simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_xh
#define simd_add_4_xh(v1, v2) simd_add_4(v1, simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_lx
#define simd_add_4_lx(v1, v2) simd_add_4(simd_andc(v1, simd_himask_4), v2)
#endif

#ifndef simd_add_4_ll
#define simd_add_4_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_lh
#define simd_add_4_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_hx
#define simd_add_4_hx(v1, v2) simd_add_4(simd_srli_4(v1, 2), v2)
#endif

#ifndef simd_add_4_hl
#define simd_add_4_hl(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_hh
#define simd_add_4_hh(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_8_xx
#define simd_add_8_xx(v1, v2) simd_add_8(v1, v2)
#endif

#ifndef simd_add_8_xl
#define simd_add_8_xl(v1, v2) simd_add_8(v1, simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_xh
#define simd_add_8_xh(v1, v2) simd_add_8(v1, simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_lx
#define simd_add_8_lx(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), v2)
#endif

#ifndef simd_add_8_ll
#define simd_add_8_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_lh
#define simd_add_8_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_hx
#define simd_add_8_hx(v1, v2) simd_add_8(simd_srli_8(v1, 4), v2)
#endif

#ifndef simd_add_8_hl
#define simd_add_8_hl(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_hh
#define simd_add_8_hh(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_16_xx
#define simd_add_16_xx(v1, v2) simd_add_16(v1, v2)
#endif

#ifndef simd_add_16_xl
#define simd_add_16_xl(v1, v2) simd_add_16(v1, simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_xh
#define simd_add_16_xh(v1, v2) simd_add_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_lx
#define simd_add_16_lx(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), v2)
#endif

#ifndef simd_add_16_ll
#define simd_add_16_ll(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_lh
#define simd_add_16_lh(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_hx
#define simd_add_16_hx(v1, v2) simd_add_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_add_16_hl
#define simd_add_16_hl(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_hh
#define simd_add_16_hh(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_32_xx
#define simd_add_32_xx(v1, v2) simd_add_32(v1, v2)
#endif

#ifndef simd_add_32_xl
#define simd_add_32_xl(v1, v2) simd_add_32(v1, simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_xh
#define simd_add_32_xh(v1, v2) simd_add_32(v1, simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_lx
#define simd_add_32_lx(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), v2)
#endif

#ifndef simd_add_32_ll
#define simd_add_32_ll(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_lh
#define simd_add_32_lh(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_hx
#define simd_add_32_hx(v1, v2) simd_add_32(simd_srli_32(v1, 16), v2)
#endif

#ifndef simd_add_32_hl
#define simd_add_32_hl(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_hh
#define simd_add_32_hh(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_64_xx
#define simd_add_64_xx(v1, v2) simd_add_64(v1, v2)
#endif

#ifndef simd_add_64_xl
#define simd_add_64_xl(v1, v2) simd_add_64(v1, simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_xh
#define simd_add_64_xh(v1, v2) simd_add_64(v1, simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_lx
#define simd_add_64_lx(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), v2)
#endif

#ifndef simd_add_64_ll
#define simd_add_64_ll(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_lh
#define simd_add_64_lh(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_hx
#define simd_add_64_hx(v1, v2) simd_add_64(simd_srli_64(v1, 32), v2)
#endif

#ifndef simd_add_64_hl
#define simd_add_64_hl(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_hh
#define simd_add_64_hh(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_128_xx
#define simd_add_128_xx(v1, v2) simd_add_128(v1, v2)
#endif

#ifndef simd_add_128_xl
#define simd_add_128_xl(v1, v2) simd_add_128(v1, simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_xh
#define simd_add_128_xh(v1, v2) simd_add_128(v1, simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_lx
#define simd_add_128_lx(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), v2)
#endif

#ifndef simd_add_128_ll
#define simd_add_128_ll(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_lh
#define simd_add_128_lh(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_hx
#define simd_add_128_hx(v1, v2) simd_add_128(simd_srli_128(v1, 64), v2)
#endif

#ifndef simd_add_128_hl
#define simd_add_128_hl(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_hh
#define simd_add_128_hh(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_srli_128(v2, 64))
#endif

static inline SIMD_type simd_add_128(SIMD_type v1, SIMD_type v2) {
        SIMD_type temp = simd_add_64(v1, v2);
        SIMD_type carry_mask = simd_or(simd_and(v1, v2), simd_and(simd_xor(v1, v2), simd_not(temp)));
        SIMD_type carry = simd_slli_128(simd_and(carry_mask, simd_lomask_128), 1);
        return simd_if(simd_lomask_128, temp, simd_add_64(temp, carry));
}
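
/* simd_add_128 builds 128-bit addition from 64-bit adds: temp holds the
   qword-wise sums, and carry_mask marks qwords whose addition carried out of
   bit 63 (both top bits set, or exactly one set and the sum's top bit clear).
   Masking with simd_lomask_128 and shifting left by one bit moves the carry
   out of each low qword into bit 0 of its high qword, where it is added in;
   the low qwords of the result are taken unchanged from temp. */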

#ifndef simd_pack_2_xx
#define simd_pack_2_xx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xl
#define simd_pack_2_xl(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xh
#define simd_pack_2_xh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_lx
#define simd_pack_2_lx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_ll
#define simd_pack_2_ll(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_lh
#define simd_pack_2_lh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_hx
#define simd_pack_2_hx(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hl
#define simd_pack_2_hl(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hh
#define simd_pack_2_hh(v1, v2) simd_pack_2(simd_srli_16(v1, 1), simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_4_xx
#define simd_pack_4_xx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xl
#define simd_pack_4_xl(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xh
#define simd_pack_4_xh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_lx
#define simd_pack_4_lx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_ll
#define simd_pack_4_ll(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_lh
#define simd_pack_4_lh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_hx
#define simd_pack_4_hx(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hl
#define simd_pack_4_hl(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hh
#define simd_pack_4_hh(v1, v2) simd_pack_4(simd_srli_16(v1, 2), simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_8_xx
#define simd_pack_8_xx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xl
#define simd_pack_8_xl(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xh
#define simd_pack_8_xh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_lx
#define simd_pack_8_lx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_ll
#define simd_pack_8_ll(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_lh
#define simd_pack_8_lh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_hx
#define simd_pack_8_hx(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hl
#define simd_pack_8_hl(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hh
#define simd_pack_8_hh(v1, v2) simd_pack_8(simd_srli_16(v1, 4), simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_16_xx
#define simd_pack_16_xx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xl
#define simd_pack_16_xl(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xh
#define simd_pack_16_xh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_lx
#define simd_pack_16_lx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_ll
#define simd_pack_16_ll(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_lh
#define simd_pack_16_lh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_hx
#define simd_pack_16_hx(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hl
#define simd_pack_16_hl(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hh
//#define simd_pack_16_hh(v1, v2) simd_pack_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
//Masking performed by simd_pack_16 is unnecessary.
#define simd_pack_16_hh(v1, v2) simd_packus_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif


/* Population count of the 256-bit block, summing the builtin popcount of each
   unsigned long word. */
static inline long bitblock_bit_count(SIMD_type v) {
  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(unsigned long)];} x;
  x.vec = v;
  long b = 0;
  for (size_t i = 0; i < sizeof(SIMD_type)/sizeof(unsigned long); i++) {
    b += __builtin_popcountl(x.elems[i]);
  }
  return b;
}
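
/* Illustrative use (hypothetical 32-byte-aligned buffer):
     SIMD_type v = sisd_load_aligned(buffer);
     if (bitblock_has_bit(v))
       total += bitblock_bit_count(v);   // number of 1 bits in the block
*/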

#endif