source: trunk/lib/sse_simd.h @ 575

Last change on this file since 575 was 533, checked in by cameron, 9 years ago

PTEST option

File size: 19.2 KB
/*  Idealized SIMD Operations with SSE versions
    Copyright (C) 2006, 2007, 2008, Robert D. Cameron
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters Inc.
       under the Academic Free License version 3.0.
*/
#ifndef SSE_SIMD_H
#define SSE_SIMD_H

#include <stdio.h>

/*------------------------------------------------------------*/
#ifndef _MSC_VER
#include <stdint.h>
#endif
#ifdef _MSC_VER
#include "stdint.h"
#define LITTLE_ENDIAN 1234
#define BIG_ENDIAN 4321
#define BYTE_ORDER LITTLE_ENDIAN
#endif

#include <emmintrin.h>
#ifdef USE_LDDQU
#include <pmmintrin.h>
#endif
#ifdef USE_PTEST
#include <smmintrin.h>
#endif
typedef __m128i SIMD_type;
/*------------------------------------------------------------*/
/* I. SIMD bitwise logical operations */

#define simd_or(b1, b2) _mm_or_si128(b1, b2)
#define simd_and(b1, b2) _mm_and_si128(b1, b2)
#define simd_xor(b1, b2) _mm_xor_si128(b1, b2)
#define simd_andc(b1, b2) _mm_andnot_si128(b2, b1)
#define simd_if(cond, then_val, else_val) \
  simd_or(simd_and(then_val, cond), simd_andc(else_val, cond))
#define simd_not(b) (simd_xor(b, _mm_set1_epi32(0xFFFFFFFF)))
#define simd_nor(a,b) (simd_not(simd_or(a,b)))
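
/* Note on conventions: simd_andc(b1, b2) computes b1 & ~b2, even though the
   underlying intrinsic _mm_andnot_si128 complements its first operand, and
   simd_if(cond, then_val, else_val) is a bitwise select that takes each bit
   from then_val where cond is 1 and from else_val where cond is 0.
   A worked example on a single byte (illustrative only):
       cond     = 11110000
       then_val = 10101010
       else_val = 01010101
       result   = 10100101 */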


/*  Specific constants. */
#define simd_himask_2 _mm_set1_epi32(0xAAAAAAAA)
#define simd_himask_4 _mm_set1_epi32(0xCCCCCCCC)
#define simd_himask_8 _mm_set1_epi32(0xF0F0F0F0)
/* Little-endian */
#define simd_himask_16 _mm_set1_epi32(0xFF00FF00)
#define simd_himask_32 _mm_set1_epi32(0xFFFF0000)
#define simd_himask_64 _mm_set_epi32(-1,0,-1,0)
#define simd_himask_128 _mm_set_epi32(-1,-1,0,0)
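
/* Each simd_himask_N selects the high N/2 bits of every N-bit field:
   0xAA... marks the high bit of each 2-bit field, 0xCC... the high two bits
   of each 4-bit field, and so on up to simd_himask_128, which selects the
   high 64 bits of the register.  Combined with simd_andc and simd_if, these
   masks are what allow the sub-byte field widths below to be simulated. */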

/* Idealized operations with direct implementation by built-in
   operations for various target architectures. */

#define simd_add_8(a, b) _mm_add_epi8(a, b)
#define simd_add_16(a, b) _mm_add_epi16(a, b)
#define simd_add_32(a, b) _mm_add_epi32(a, b)
#define simd_add_64(a, b) _mm_add_epi64(a, b)
#define simd_sub_8(a, b) _mm_sub_epi8(a, b)
#define simd_sub_16(a, b) _mm_sub_epi16(a, b)
#define simd_sub_32(a, b) _mm_sub_epi32(a, b)
#define simd_sub_64(a, b) _mm_sub_epi64(a, b)
#define simd_mult_16(a, b) _mm_mullo_epi16(a, b)
#define simd_slli_16(r, shft) _mm_slli_epi16(r, shft)
#define simd_srli_16(r, shft) _mm_srli_epi16(r, shft)
#define simd_srai_16(r, shft) _mm_srai_epi16(r, shft)
#define simd_slli_32(r, shft) _mm_slli_epi32(r, shft)
#define simd_srli_32(r, shft) _mm_srli_epi32(r, shft)
#define simd_srai_32(r, shft) _mm_srai_epi32(r, shft)
#define simd_slli_64(r, shft) _mm_slli_epi64(r, shft)
#define simd_srli_64(r, shft) _mm_srli_epi64(r, shft)
#define simd_sll_64(r, shft_reg) _mm_sll_epi64(r, shft_reg)
#define simd_srl_64(r, shft_reg) _mm_srl_epi64(r, shft_reg)
#define simd_packus_16(a, b) _mm_packus_epi16(b, a)
#define simd_pack_16(a, b) \
  _mm_packus_epi16(simd_andc(b, simd_himask_16), simd_andc(a, simd_himask_16))
#define simd_mergeh_8(a, b) _mm_unpackhi_epi8(b, a)
#define simd_mergeh_16(a, b) _mm_unpackhi_epi16(b, a)
#define simd_mergeh_32(a, b) _mm_unpackhi_epi32(b, a)
#define simd_mergeh_64(a, b) _mm_unpackhi_epi64(b, a)
#define simd_mergel_8(a, b) _mm_unpacklo_epi8(b, a)
#define simd_mergel_16(a, b) _mm_unpacklo_epi16(b, a)
#define simd_mergel_32(a, b) _mm_unpacklo_epi32(b, a)
#define simd_mergel_64(a, b) _mm_unpacklo_epi64(b, a)
#define simd_eq_8(a, b) _mm_cmpeq_epi8(a, b)
#define simd_eq_16(a, b) _mm_cmpeq_epi16(a, b)
#define simd_eq_32(a, b) _mm_cmpeq_epi32(a, b)

#define simd_max_8(a, b) _mm_max_epu8(a, b)

#define simd_slli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_slli_64(r, shft), _mm_slli_si128(simd_srli_64(r, 64-(shft)), 8)))

#define simd_srli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_srli_64(r, shft), _mm_srli_si128(simd_slli_64(r, 64-(shft)), 8)))

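/* SSE2 has no full 128-bit bit shift, only byte shifts (_mm_slli_si128,
   _mm_srli_si128) and per-64-bit-lane bit shifts.  simd_slli_128 and
   simd_srli_128 therefore compose the result: a shift that is a whole number
   of bytes uses the byte shift directly; a shift of 64 or more becomes a
   byte shift of 8 plus a 64-bit shift of the remainder; otherwise the bits
   that cross the 64-bit lane boundary are recovered with the opposite 64-bit
   shift and a byte shift, then or'ed back in.  Because _mm_slli_si128 and
   _mm_srli_si128 require immediate shift counts, shft must be a compile-time
   constant, in which case the conditionals fold away to a short fixed
   instruction sequence. */
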
#define simd_sll_128(r, shft) \
   simd_or(simd_sll_64(r, shft), \
           simd_or(_mm_slli_si128(simd_sll_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_slli_si128(simd_srl_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define simd_srl_128(r, shft) \
   simd_or(simd_srl_64(r, shft), \
           simd_or(_mm_srli_si128(simd_srl_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_srli_si128(simd_sll_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define sisd_sll(r, shft) simd_sll_128(r, shft)
#define sisd_srl(r, shft) simd_srl_128(r, shft)
#define sisd_slli(r, shft) simd_slli_128(r, shft)
#define sisd_srli(r, shft) simd_srli_128(r, shft)
#define sisd_add(a, b) simd_add_128(a, b)
#define sisd_sub(a, b) simd_sub_128(a, b)

#define sisd_store_aligned(r, addr) _mm_store_si128(addr, r)
#define sisd_store_unaligned(r, addr) _mm_storeu_si128(addr, r)
#define sisd_load_aligned(addr) _mm_load_si128(addr)
#ifndef USE_LDDQU
#define sisd_load_unaligned(addr) _mm_loadu_si128(addr)
#endif
#ifdef USE_LDDQU
#define sisd_load_unaligned(addr) _mm_lddqu_si128(addr)
#endif



#define simd_const_32(n) _mm_set1_epi32(n)
#define simd_const_16(n) _mm_set1_epi16(n)
#define simd_const_8(n) _mm_set1_epi8(n)
#define simd_const_4(n) _mm_set1_epi8((n)<<4|(n))
#define simd_const_2(n) simd_const_4((n)<<2|(n))
#define simd_const_1(n) \
  (n==0 ? simd_const_8(0): simd_const_8(-1))
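
/* There is no intrinsic constant constructor for fields narrower than 8 bits,
   so the constant is replicated by doubling: simd_const_4(n) places n in both
   nibbles of every byte, and simd_const_2(n) first widens the 2-bit value to
   a nibble.  For example, simd_const_2(2) yields bytes of 0xAA, i.e. the
   value 2 (binary 10) in every 2-bit field. */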


static inline
SIMD_type simd_add_2(SIMD_type a, SIMD_type b)
{
  SIMD_type c1 = simd_xor(a,b);
  SIMD_type borrow = simd_and(a,b);
  SIMD_type c2 = simd_xor(c1,(sisd_slli(borrow,1)));
  return simd_if(simd_himask_2,c2,c1);
}
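
/* simd_add_2 is a bitwise half adder over 2-bit fields: c1 = a ^ b is the
   carry-free sum, borrow = a & b marks the positions that generate a carry,
   and xor'ing in borrow shifted left by one corrects the high bit of each
   field (c2).  simd_if then keeps the corrected high bit from c2 and the
   carry-free low bit from c1.  Worked example on one field:
   01 + 01 -> c1 = 00, borrow = 01, c2 = 00 ^ 10 = 10, result = 10 (i.e. 2). */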
#define simd_add_4(a, b)\
        simd_if(simd_himask_8, simd_add_8(simd_and(a,simd_himask_8),simd_and(b,simd_himask_8))\
        ,simd_add_8(simd_andc(a,simd_himask_8),simd_andc(b,simd_himask_8)))

#define simd_srli_2(r, sh)\
        simd_and(simd_srli_32(r,sh),simd_const_2(3>>sh))

#define simd_srli_4(r, sh)\
        simd_and(simd_srli_32(r,sh),simd_const_4(15>>sh))
#define simd_srli_8(r, sh)\
        simd_and(simd_srli_32(r,sh),simd_const_8(255>>sh))

#define simd_slli_2(r, sh)\
        simd_and(simd_slli_32(r,sh),simd_const_2((3<<sh)&3))

#define simd_slli_4(r, sh)\
        simd_and(simd_slli_32(r,sh),simd_const_4((15<<sh)&15))
#define simd_slli_8(r, sh)\
        simd_and(simd_slli_32(r,sh),simd_const_8((255<<sh) &255))
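
/* SSE2 provides no shifts for 8-bit or narrower fields, so the macros above
   shift the register in 32-bit units and then mask off the bits that leaked
   in from neighbouring fields.  For example, simd_srli_8(r, 3) shifts by 3
   and keeps only the low 5 bits of each byte (255>>3 == 0x1F), exactly the
   bit positions a genuine per-byte shift could produce. */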




#define simd_mergeh_4(a,b)\
        simd_mergeh_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)),\
        simd_if(simd_himask_8,simd_slli_8(a,4),b))
#define simd_mergel_4(a,b)\
        simd_mergel_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)),\
        simd_if(simd_himask_8,simd_slli_8(a,4),b))
#define simd_mergeh_2(a,b)\
        simd_mergeh_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)),\
        simd_if(simd_himask_4,simd_slli_4(a,2),b))
#define simd_mergel_2(a,b)\
        simd_mergel_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)),\
        simd_if(simd_himask_4,simd_slli_4(a,2),b))
#define simd_mergeh_1(a,b)\
        simd_mergeh_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)),\
        simd_if(simd_himask_2,simd_slli_2(a,1),b))
#define simd_mergel_1(a,b)\
        simd_mergel_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)),\
        simd_if(simd_himask_2,simd_slli_2(a,1),b))
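
/* Merging (interleaving) at 4-, 2- and 1-bit widths is built recursively on
   the byte-level unpack operations: each level uses simd_if and a small shift
   to place the matching half-fields of a and b side by side within a field of
   twice the width, and then applies the merge for that wider field.
   simd_mergeh_1/simd_mergel_1 thus interleave the individual bits of the two
   operands. */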

#define sisd_to_int(x) _mm_cvtsi128_si32(x)

#define sisd_from_int(n) _mm_cvtsi32_si128(n)

static inline int simd_all_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) == 0xFFFF;
}

static inline int simd_any_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

static inline int simd_any_sign_bit_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

#define simd_all_eq_8(v1, v2) simd_all_true_8(_mm_cmpeq_epi8(v1, v2))
#define simd_all_le_8(v1, v2) \
  simd_all_eq_8(simd_max_8(v1, v2), v2)

#define simd_all_signed_gt_8(v1, v2) simd_all_true_8(_mm_cmpgt_epi8(v1, v2))

#define simd_cmpgt_8(v1,v2) _mm_cmpgt_epi8(v1, v2)


static inline int bitblock_has_bit(SIMD_type v) {
#ifndef USE_PTEST
  return !simd_all_true_8(simd_eq_8(v, simd_const_8(0)));
#endif
#ifdef USE_PTEST
  return !_mm_testz_si128(v,v);
#endif
}
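
/* bitblock_has_bit tests whether any bit of the 128-bit block is set.  The
   portable version compares every byte against zero and checks that not all
   comparisons succeed; when compiled with -DUSE_PTEST, the SSE4.1 PTEST
   instruction (_mm_testz_si128) performs the same test directly.  This is
   the "PTEST option" referred to in the change log above. */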



#define bitblock_test_bit(blk, n) \
   sisd_to_int(sisd_srli(sisd_slli(blk, ((BLOCKSIZE-1)-(n))), BLOCKSIZE-1))

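/* bitblock_test_bit extracts bit n of a block by shifting it up to the most
   significant position and back down to bit 0.  BLOCKSIZE is not defined in
   this header; it is expected to be defined by the including code, presumably
   as 128, the bit width of SIMD_type. */
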
#define simd_pack_2(a,b)\
        simd_pack_4(simd_if(simd_himask_2,sisd_srli(a,1),a),\
        simd_if(simd_himask_2,sisd_srli(b,1),b))
#define simd_pack_4(a,b)\
        simd_pack_8(simd_if(simd_himask_4,sisd_srli(a,2),a),\
        simd_if(simd_himask_4,sisd_srli(b,2),b))
#define simd_pack_8(a,b)\
        simd_pack_16(simd_if(simd_himask_8,sisd_srli(a,4),a),\
        simd_if(simd_himask_8,sisd_srli(b,4),b))

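/* Packing below 16-bit fields is again recursive: simd_pack_2 keeps only the
   low bit of each 2-bit field of a and b, gathering the kept bits into the
   low half of each 4-bit field, and then hands the result to simd_pack_4,
   and so on down to the native simd_pack_16.

   The simd_add_N_xy and simd_pack_N_xy macros below are half-operand
   variants: the suffix letters state how each operand is used, with x
   meaning the full field, l the low half of each field, and h the high half
   (shifted down before use).  Some variants exploit the fact that half-width
   operands cannot carry across field boundaries and therefore use a wider
   add (e.g. simd_add_2_ll is defined with simd_add_8).  Each definition is
   guarded by #ifndef so that an alternative supplied before this header
   takes precedence. */
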
#ifndef simd_add_2_xx
#define simd_add_2_xx(v1, v2) simd_add_2(v1, v2)
#endif

#ifndef simd_add_2_xl
#define simd_add_2_xl(v1, v2) simd_add_2(v1, simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_xh
#define simd_add_2_xh(v1, v2) simd_add_2(v1, simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_lx
#define simd_add_2_lx(v1, v2) simd_add_2(simd_andc(v1, simd_himask_2), v2)
#endif

#ifndef simd_add_2_ll
#define simd_add_2_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_lh
#define simd_add_2_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_hx
#define simd_add_2_hx(v1, v2) simd_add_2(simd_srli_2(v1, 1), v2)
#endif

#ifndef simd_add_2_hl
#define simd_add_2_hl(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_hh
#define simd_add_2_hh(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_4_xx
#define simd_add_4_xx(v1, v2) simd_add_4(v1, v2)
#endif

#ifndef simd_add_4_xl
#define simd_add_4_xl(v1, v2) simd_add_4(v1, simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_xh
#define simd_add_4_xh(v1, v2) simd_add_4(v1, simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_lx
#define simd_add_4_lx(v1, v2) simd_add_4(simd_andc(v1, simd_himask_4), v2)
#endif

#ifndef simd_add_4_ll
#define simd_add_4_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_lh
#define simd_add_4_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_hx
#define simd_add_4_hx(v1, v2) simd_add_4(simd_srli_4(v1, 2), v2)
#endif

#ifndef simd_add_4_hl
#define simd_add_4_hl(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_hh
#define simd_add_4_hh(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_8_xx
#define simd_add_8_xx(v1, v2) simd_add_8(v1, v2)
#endif

#ifndef simd_add_8_xl
#define simd_add_8_xl(v1, v2) simd_add_8(v1, simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_xh
#define simd_add_8_xh(v1, v2) simd_add_8(v1, simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_lx
#define simd_add_8_lx(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), v2)
#endif

#ifndef simd_add_8_ll
#define simd_add_8_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_lh
#define simd_add_8_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_hx
#define simd_add_8_hx(v1, v2) simd_add_8(simd_srli_8(v1, 4), v2)
#endif

#ifndef simd_add_8_hl
#define simd_add_8_hl(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_hh
#define simd_add_8_hh(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_16_xx
#define simd_add_16_xx(v1, v2) simd_add_16(v1, v2)
#endif

#ifndef simd_add_16_xl
#define simd_add_16_xl(v1, v2) simd_add_16(v1, simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_xh
#define simd_add_16_xh(v1, v2) simd_add_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_lx
#define simd_add_16_lx(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), v2)
#endif

#ifndef simd_add_16_ll
#define simd_add_16_ll(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_lh
#define simd_add_16_lh(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_hx
#define simd_add_16_hx(v1, v2) simd_add_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_add_16_hl
#define simd_add_16_hl(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_hh
#define simd_add_16_hh(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_32_xx
#define simd_add_32_xx(v1, v2) simd_add_32(v1, v2)
#endif

#ifndef simd_add_32_xl
#define simd_add_32_xl(v1, v2) simd_add_32(v1, simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_xh
#define simd_add_32_xh(v1, v2) simd_add_32(v1, simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_lx
#define simd_add_32_lx(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), v2)
#endif

#ifndef simd_add_32_ll
#define simd_add_32_ll(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_lh
#define simd_add_32_lh(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_hx
#define simd_add_32_hx(v1, v2) simd_add_32(simd_srli_32(v1, 16), v2)
#endif

#ifndef simd_add_32_hl
#define simd_add_32_hl(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_hh
#define simd_add_32_hh(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_64_xx
#define simd_add_64_xx(v1, v2) simd_add_64(v1, v2)
#endif

#ifndef simd_add_64_xl
#define simd_add_64_xl(v1, v2) simd_add_64(v1, simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_xh
#define simd_add_64_xh(v1, v2) simd_add_64(v1, simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_lx
#define simd_add_64_lx(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), v2)
#endif

#ifndef simd_add_64_ll
#define simd_add_64_ll(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_lh
#define simd_add_64_lh(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_hx
#define simd_add_64_hx(v1, v2) simd_add_64(simd_srli_64(v1, 32), v2)
#endif

#ifndef simd_add_64_hl
#define simd_add_64_hl(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_hh
#define simd_add_64_hh(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_128_xx
#define simd_add_128_xx(v1, v2) simd_add_128(v1, v2)
#endif

#ifndef simd_add_128_xl
#define simd_add_128_xl(v1, v2) simd_add_128(v1, simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_xh
#define simd_add_128_xh(v1, v2) simd_add_128(v1, simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_lx
#define simd_add_128_lx(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), v2)
#endif

#ifndef simd_add_128_ll
#define simd_add_128_ll(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_lh
#define simd_add_128_lh(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_hx
#define simd_add_128_hx(v1, v2) simd_add_128(simd_srli_128(v1, 64), v2)
#endif

#ifndef simd_add_128_hl
#define simd_add_128_hl(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_hh
#define simd_add_128_hh(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_srli_128(v2, 64))
#endif

#ifndef simd_pack_2_xx
#define simd_pack_2_xx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xl
#define simd_pack_2_xl(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xh
#define simd_pack_2_xh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_lx
#define simd_pack_2_lx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_ll
#define simd_pack_2_ll(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_lh
#define simd_pack_2_lh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_hx
#define simd_pack_2_hx(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hl
#define simd_pack_2_hl(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hh
#define simd_pack_2_hh(v1, v2) simd_pack_2(simd_srli_16(v1, 1), simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_4_xx
#define simd_pack_4_xx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xl
#define simd_pack_4_xl(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xh
#define simd_pack_4_xh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_lx
#define simd_pack_4_lx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_ll
#define simd_pack_4_ll(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_lh
#define simd_pack_4_lh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_hx
#define simd_pack_4_hx(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hl
#define simd_pack_4_hl(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hh
#define simd_pack_4_hh(v1, v2) simd_pack_4(simd_srli_16(v1, 2), simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_8_xx
#define simd_pack_8_xx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xl
#define simd_pack_8_xl(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xh
#define simd_pack_8_xh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_lx
#define simd_pack_8_lx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_ll
#define simd_pack_8_ll(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_lh
#define simd_pack_8_lh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_hx
#define simd_pack_8_hx(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hl
#define simd_pack_8_hl(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hh
#define simd_pack_8_hh(v1, v2) simd_pack_8(simd_srli_16(v1, 4), simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_16_xx
#define simd_pack_16_xx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xl
#define simd_pack_16_xl(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xh
#define simd_pack_16_xh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_lx
#define simd_pack_16_lx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_ll
#define simd_pack_16_ll(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_lh
#define simd_pack_16_lh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_hx
#define simd_pack_16_hx(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hl
#define simd_pack_16_hl(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hh
//#define simd_pack_16_hh(v1, v2) simd_pack_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
// Masking performed by simd_pack_16 is unnecessary here: after simd_srli_16
// by 8, the high byte of each 16-bit field is already zero, so simd_packus_16
// can be used directly without risk of saturation.
#define simd_pack_16_hh(v1, v2) simd_packus_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif

// Splat the first 16-bit int into all positions.
static inline SIMD_type simd_splat_16(SIMD_type x) {
  SIMD_type t = _mm_shufflelo_epi16(x,0);
  return _mm_shuffle_epi32(t,0);
}

// Splat the first 32-bit int into all positions.
static inline SIMD_type simd_splat_32(SIMD_type x) {
  return _mm_shuffle_epi32(x,0);
}

static inline int bitblock_bit_count(SIMD_type v) {
  int bit_count = 0;
  SIMD_type cts_2 = simd_add_2_lh(v, v);
  SIMD_type cts_4 = simd_add_4_lh(cts_2, cts_2);
  SIMD_type cts_8 = simd_add_8_lh(cts_4, cts_4);
  SIMD_type cts_64 = _mm_sad_epu8(cts_8, simd_const_8(0));
  /* SIMD_type cts_128 = simd_add_128_lh(cts_64, cts_64) */;
  SIMD_type cts_128 = simd_add_64(cts_64, sisd_srli(cts_64,64));
  return (int) sisd_to_int(cts_128);
}
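
/* bitblock_bit_count computes the population count of a 128-bit block by
   successive field doubling: simd_add_2_lh(v, v) adds the low and high bit
   of every 2-bit field, giving per-2-bit counts; the next two steps widen
   these to per-4-bit and then per-8-bit counts.  _mm_sad_epu8 sums the byte
   counts within each 64-bit half, and a final 64-bit add of the two halves
   yields the total, which is read back through sisd_to_int.  (The unused
   bit_count variable and the commented-out simd_add_128_lh line appear to be
   remnants of an earlier version.) */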

#endif
