source: trunk/lib/sse_simd.h @ 88

Last change on this file since 88 was 88, checked in by cameron, 11 years ago

stdint.h for MSVC

File size: 19.2 KB
/*  Idealized SIMD Operations with SSE versions
    Copyright (C) 2006, 2007, 2008, Robert D. Cameron
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters Inc.
       under the Academic Free License version 3.0.
*/
#ifndef SSE_SIMD_H
#define SSE_SIMD_H

/*------------------------------------------------------------*/
#ifdef _MSC_VER
#include "stdint.h"   /* MSVC lacks <stdint.h>; use the bundled copy */
#else
#include <stdint.h>
#endif
#include <stdio.h>    /* printf, used by print_bit_block below */
#include <limits.h>
#ifndef LONG_BIT
#define LONG_BIT (8 * sizeof(unsigned long))
#endif
#include <emmintrin.h>
#ifdef USE_LDDQU
#include <pmmintrin.h>
#endif
typedef __m128i SIMD_type;
/*------------------------------------------------------------*/
/* I. SIMD bitwise logical operations */

#define simd_or(b1, b2) _mm_or_si128(b1, b2)
#define simd_and(b1, b2) _mm_and_si128(b1, b2)
#define simd_xor(b1, b2) _mm_xor_si128(b1, b2)
#define simd_andc(b1, b2) _mm_andnot_si128(b2, b1)
#define simd_if(cond, then_val, else_val) \
  simd_or(simd_and(then_val, cond), simd_andc(else_val, cond))
#define simd_not(b) (simd_xor(b, _mm_set1_epi32(0xFFFFFFFF)))
#define simd_nor(a,b) (simd_not(simd_or(a,b)))
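/* Note: simd_andc(b1, b2) computes b1 & ~b2; the intrinsic's operands are
   reversed because _mm_andnot_si128(x, y) is ~x & y.  simd_if is a bitwise
   multiplexer: each result bit comes from then_val where cond is 1 and from
   else_val where cond is 0. */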


/*  Specific constants. */
#define simd_himask_2 _mm_set1_epi8(0xAA)
#define simd_himask_4 _mm_set1_epi8(0xCC)
#define simd_himask_8 _mm_set1_epi8(0xF0)
/* Little-endian */
#define simd_himask_16 _mm_set1_epi16(0xFF00)
#define simd_himask_32 _mm_set1_epi32(0xFFFF0000)
#define simd_himask_64 _mm_set_epi32(-1,0,-1,0)
#define simd_himask_128 _mm_set_epi32(-1,-1,0,0)
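/* Each simd_himask_n selects the high half of every n-bit field:
   0xAA marks the high bit of each 2-bit field, 0xCC the high pair of
   each 4-bit field, 0xF0 the high nibble of each byte, and so on up
   to simd_himask_128, which selects the upper 64 bits of the register. */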

/* Idealized operations with direct implementation by built-in
   operations for various target architectures. */

#define simd_add_8(a, b) _mm_add_epi8(a, b)
#define simd_add_16(a, b) _mm_add_epi16(a, b)
#define simd_add_32(a, b) _mm_add_epi32(a, b)
#define simd_add_64(a, b) _mm_add_epi64(a, b)
#define simd_sub_8(a, b) _mm_sub_epi8(a, b)
#define simd_sub_16(a, b) _mm_sub_epi16(a, b)
#define simd_sub_32(a, b) _mm_sub_epi32(a, b)
#define simd_sub_64(a, b) _mm_sub_epi64(a, b)
#define simd_mult_16(a, b) _mm_mullo_epi16(a, b)
#define simd_slli_16(r, shft) _mm_slli_epi16(r, shft)
#define simd_srli_16(r, shft) _mm_srli_epi16(r, shft)
#define simd_srai_16(r, shft) _mm_srai_epi16(r, shft)
#define simd_slli_32(r, shft) _mm_slli_epi32(r, shft)
#define simd_srli_32(r, shft) _mm_srli_epi32(r, shft)
#define simd_srai_32(r, shft) _mm_srai_epi32(r, shft)
#define simd_slli_64(r, shft) _mm_slli_epi64(r, shft)
#define simd_srli_64(r, shft) _mm_srli_epi64(r, shft)
#define simd_sll_64(r, shft_reg) _mm_sll_epi64(r, shft_reg)
#define simd_srl_64(r, shft_reg) _mm_srl_epi64(r, shft_reg)
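/* simd_pack_16(a, b) keeps the low byte of each 16-bit field; the bytes
   taken from a fill the high half of the result and those from b the low
   half.  The simd_pack_8/4/2 macros further below narrow 8-, 4- and 2-bit
   fields in the same way. */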
#define simd_pack_16(a, b) \
  _mm_packus_epi16(simd_andc(b, simd_himask_16), simd_andc(a, simd_himask_16))
#define simd_mergeh_8(a, b) _mm_unpackhi_epi8(b, a)
#define simd_mergeh_16(a, b) _mm_unpackhi_epi16(b, a)
#define simd_mergeh_32(a, b) _mm_unpackhi_epi32(b, a)
#define simd_mergeh_64(a, b) _mm_unpackhi_epi64(b, a)
#define simd_mergel_8(a, b) _mm_unpacklo_epi8(b, a)
#define simd_mergel_16(a, b) _mm_unpacklo_epi16(b, a)
#define simd_mergel_32(a, b) _mm_unpacklo_epi32(b, a)
#define simd_mergel_64(a, b) _mm_unpacklo_epi64(b, a)
#define simd_eq_8(a, b) _mm_cmpeq_epi8(a, b)
#define simd_eq_16(a, b) _mm_cmpeq_epi16(a, b)
#define simd_eq_32(a, b) _mm_cmpeq_epi32(a, b)

#define simd_max_8(a, b) _mm_max_epu8(a, b)

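/* Full-register shifts.  SSE2 can only shift the whole register by byte
   amounts (_mm_slli_si128/_mm_srli_si128), so bit-level 128-bit shifts are
   composed from 64-bit lane shifts plus a term that carries bits across the
   middle of the register.  The _slli/_srli forms require a compile-time
   constant shift; the _sll/_srl forms take the amount in a register
   (see sisd_from_int). */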
#define simd_slli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_slli_64(r, shft), _mm_slli_si128(simd_srli_64(r, 64-(shft)), 8)))

#define simd_srli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_srli_64(r, shft), _mm_srli_si128(simd_slli_64(r, 64-(shft)), 8)))

#define simd_sll_128(r, shft) \
   simd_or(simd_sll_64(r, shft), \
           simd_or(_mm_slli_si128(simd_sll_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_slli_si128(simd_srl_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define simd_srl_128(r, shft) \
   simd_or(simd_srl_64(r, shft), \
           simd_or(_mm_srli_si128(simd_srl_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_srli_si128(simd_sll_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

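/* The sisd_ prefix names operations that treat the whole 128-bit register
   as a single value (shifts, add/sub, loads and stores).  Note that
   sisd_add and sisd_sub expand to simd_add_128 and simd_sub_128, which are
   not defined in this header; the expansion only matters where those
   macros are actually used. */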
#define sisd_sll(r, shft) simd_sll_128(r, shft)
#define sisd_srl(r, shft) simd_srl_128(r, shft)
#define sisd_slli(r, shft) simd_slli_128(r, shft)
#define sisd_srli(r, shft) simd_srli_128(r, shft)
#define sisd_add(a, b) simd_add_128(a, b)
#define sisd_sub(a, b) simd_sub_128(a, b)

#define sisd_store_aligned(r, addr) _mm_store_si128(addr, r)
#define sisd_store_unaligned(r, addr) _mm_storeu_si128(addr, r)
#define sisd_load_aligned(addr) _mm_load_si128(addr)
#ifndef USE_LDDQU
#define sisd_load_unaligned(addr) _mm_loadu_si128(addr)
#endif
#ifdef USE_LDDQU
#define sisd_load_unaligned(addr) _mm_lddqu_si128(addr)
#endif



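/* simd_const_n(n) replicates an n-bit constant into every n-bit field of
   the register.  For field widths below 8 the value is duplicated within
   each byte, so the argument must fit in n bits. */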
#define simd_const_32(n) _mm_set1_epi32(n)
#define simd_const_16(n) _mm_set1_epi16(n)
#define simd_const_8(n) _mm_set1_epi8(n)
#define simd_const_4(n) _mm_set1_epi8((n)<<4|(n))
#define simd_const_2(n) simd_const_4(n<<2|n)
#define simd_const_1(n) \
  (n==0 ? simd_const_8(0): simd_const_8(-1))

#define simd_pack_16_ll(a, b) simd_pack_16(a, b)
#define simd_pack_16_hh(a, b) \
  simd_pack_16(simd_srli_16(a, 8), simd_srli_16(b, 8))

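/* Addition on 2-bit fields, built from bitwise logic: c1 is the sum without
   carries, borrow holds the carries out of the low bit of each field, and
   c2 adds those carries into the high bit.  The final simd_if keeps the
   high bit from c2 and the low bit from c1, so no carry propagates across
   a field boundary. */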
static inline
SIMD_type simd_add_2(SIMD_type a, SIMD_type b)
{
         SIMD_type c1 = simd_xor(a,b);
         SIMD_type borrow = simd_and(a,b);
         SIMD_type c2 = simd_xor(c1,(sisd_slli(borrow,1)));
         return simd_if(simd_himask_2,c2,c1);
}
#define simd_add_4(a, b)\
        simd_if(simd_himask_8, simd_add_8(simd_and(a,simd_himask_8),simd_and(b,simd_himask_8))\
        ,simd_add_8(simd_andc(a,simd_himask_8),simd_andc(b,simd_himask_8)))

#define simd_srli_2(r, sh)\
         simd_and(simd_srli_32(r,sh),simd_const_2(3>>sh))

#define simd_srli_4(r, sh)\
         simd_and(simd_srli_32(r,sh),simd_const_4(15>>sh))
#define simd_srli_8(r, sh)\
         simd_and(simd_srli_32(r,sh),simd_const_8(255>>sh))

#define simd_slli_2(r, sh)\
         simd_and(simd_slli_32(r,sh),simd_const_2((3<<sh)&3))

#define simd_slli_4(r, sh)\
         simd_and(simd_slli_32(r,sh),simd_const_4((15<<sh)&15))
#define simd_slli_8(r, sh)\
         simd_and(simd_slli_32(r,sh),simd_const_8((255<<sh) &255))




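/* Merges (interleaves) for field widths below 8 bits are synthesized from
   the next wider merge: each operand is first rearranged so that the two
   narrow fields to be interleaved sit in the halves of one wider field,
   after which the wider merge produces the interleaved result. */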
#define simd_mergeh_4(a,b)\
        simd_mergeh_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)),\
        simd_if(simd_himask_8,simd_slli_8(a,4),b))
#define simd_mergel_4(a,b)\
        simd_mergel_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)),\
        simd_if(simd_himask_8,simd_slli_8(a,4),b))
#define simd_mergeh_2(a,b)\
        simd_mergeh_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)),\
        simd_if(simd_himask_4,simd_slli_4(a,2),b))
#define simd_mergel_2(a,b)\
        simd_mergel_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)),\
        simd_if(simd_himask_4,simd_slli_4(a,2),b))
#define simd_mergeh_1(a,b)\
        simd_mergeh_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)),\
        simd_if(simd_himask_2,simd_slli_2(a,1),b))
#define simd_mergel_1(a,b)\
        simd_mergel_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)),\
        simd_if(simd_himask_2,simd_slli_2(a,1),b))

#define sisd_to_int(x) _mm_cvtsi128_si32(x)

#define sisd_from_int(n) _mm_cvtsi32_si128(n)

static inline int simd_all_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) == 0xFFFF;
}

static inline int simd_any_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

static inline int simd_any_sign_bit_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

#define simd_all_eq_8(v1, v2) simd_all_true_8(_mm_cmpeq_epi8(v1, v2))
#define simd_all_le_8(v1, v2) \
  simd_all_eq_8(simd_max_8(v1, v2), v2)

#define simd_all_signed_gt_8(v1, v2) simd_all_true_8(_mm_cmpgt_epi8(v1, v2))

static inline int bitblock_has_bit(SIMD_type v) {
  return !simd_all_true_8(simd_eq_8(v, simd_const_8(0)));
}



/* bitblock_test_bit(blk, n) extracts bit n of the block as 0 or 1.
   BLOCKSIZE is not defined in this header; the macro assumes it is the
   bit width of a block (128 for this SSE implementation). */
#define bitblock_test_bit(blk, n) \
   sisd_to_int(sisd_srli(sisd_slli(blk, ((BLOCKSIZE-1)-(n))), BLOCKSIZE-1))

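/* Packing for sub-byte fields is built as a tower on simd_pack_16:
   simd_pack_2 narrows 2-bit fields to 1 bit via simd_pack_4, which in turn
   uses simd_pack_8 and finally simd_pack_16.  At each level the operands
   are first adjusted so that the bits to be kept from two adjacent narrow
   fields sit in the low half of the enclosing wider field. */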
#define simd_pack_2(a,b)\
        simd_pack_4(simd_if(simd_himask_2,sisd_srli(a,1),a),\
        simd_if(simd_himask_2,sisd_srli(b,1),b))
#define simd_pack_4(a,b)\
        simd_pack_8(simd_if(simd_himask_4,sisd_srli(a,2),a),\
        simd_if(simd_himask_4,sisd_srli(b,2),b))
#define simd_pack_8(a,b)\
        simd_pack_16(simd_if(simd_himask_8,sisd_srli(a,4),a),\
        simd_if(simd_himask_8,sisd_srli(b,4),b))

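/* The following macros use a systematic naming convention: the two-letter
   suffix states which half of each operand's fields is used.  'x' means
   the field is used as given (the caller guarantees the value already
   fits), 'l' means the low half (masked with the corresponding himask),
   and 'h' means the high half (shifted down).  For example,
   simd_add_8_hl(v1, v2) adds the high nibble of each byte of v1 to the
   low nibble of each byte of v2.  For the 2- and 4-bit field widths,
   variants whose operands are known to fit in half a field can use the
   byte add directly, since no carry can cross a field boundary. */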
#ifndef simd_add_2_xx
#define simd_add_2_xx(v1, v2) simd_add_2(v1, v2)
#endif

#ifndef simd_add_2_xl
#define simd_add_2_xl(v1, v2) simd_add_2(v1, simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_xh
#define simd_add_2_xh(v1, v2) simd_add_2(v1, simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_lx
#define simd_add_2_lx(v1, v2) simd_add_2(simd_andc(v1, simd_himask_2), v2)
#endif

#ifndef simd_add_2_ll
#define simd_add_2_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_lh
#define simd_add_2_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_hx
#define simd_add_2_hx(v1, v2) simd_add_2(simd_srli_2(v1, 1), v2)
#endif

#ifndef simd_add_2_hl
#define simd_add_2_hl(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_hh
#define simd_add_2_hh(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_4_xx
#define simd_add_4_xx(v1, v2) simd_add_4(v1, v2)
#endif

#ifndef simd_add_4_xl
#define simd_add_4_xl(v1, v2) simd_add_4(v1, simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_xh
#define simd_add_4_xh(v1, v2) simd_add_4(v1, simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_lx
#define simd_add_4_lx(v1, v2) simd_add_4(simd_andc(v1, simd_himask_4), v2)
#endif

#ifndef simd_add_4_ll
#define simd_add_4_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_lh
#define simd_add_4_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_hx
#define simd_add_4_hx(v1, v2) simd_add_4(simd_srli_4(v1, 2), v2)
#endif

#ifndef simd_add_4_hl
#define simd_add_4_hl(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_hh
#define simd_add_4_hh(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_8_xx
#define simd_add_8_xx(v1, v2) simd_add_8(v1, v2)
#endif

#ifndef simd_add_8_xl
#define simd_add_8_xl(v1, v2) simd_add_8(v1, simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_xh
#define simd_add_8_xh(v1, v2) simd_add_8(v1, simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_lx
#define simd_add_8_lx(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), v2)
#endif

#ifndef simd_add_8_ll
#define simd_add_8_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_lh
#define simd_add_8_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_hx
#define simd_add_8_hx(v1, v2) simd_add_8(simd_srli_8(v1, 4), v2)
#endif

#ifndef simd_add_8_hl
#define simd_add_8_hl(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_hh
#define simd_add_8_hh(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_16_xx
#define simd_add_16_xx(v1, v2) simd_add_16(v1, v2)
#endif

#ifndef simd_add_16_xl
#define simd_add_16_xl(v1, v2) simd_add_16(v1, simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_xh
#define simd_add_16_xh(v1, v2) simd_add_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_lx
#define simd_add_16_lx(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), v2)
#endif

#ifndef simd_add_16_ll
#define simd_add_16_ll(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_lh
#define simd_add_16_lh(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_hx
#define simd_add_16_hx(v1, v2) simd_add_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_add_16_hl
#define simd_add_16_hl(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_hh
#define simd_add_16_hh(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_32_xx
#define simd_add_32_xx(v1, v2) simd_add_32(v1, v2)
#endif

#ifndef simd_add_32_xl
#define simd_add_32_xl(v1, v2) simd_add_32(v1, simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_xh
#define simd_add_32_xh(v1, v2) simd_add_32(v1, simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_lx
#define simd_add_32_lx(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), v2)
#endif

#ifndef simd_add_32_ll
#define simd_add_32_ll(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_lh
#define simd_add_32_lh(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_hx
#define simd_add_32_hx(v1, v2) simd_add_32(simd_srli_32(v1, 16), v2)
#endif

#ifndef simd_add_32_hl
#define simd_add_32_hl(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_hh
#define simd_add_32_hh(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_64_xx
#define simd_add_64_xx(v1, v2) simd_add_64(v1, v2)
#endif

#ifndef simd_add_64_xl
#define simd_add_64_xl(v1, v2) simd_add_64(v1, simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_xh
#define simd_add_64_xh(v1, v2) simd_add_64(v1, simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_lx
#define simd_add_64_lx(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), v2)
#endif

#ifndef simd_add_64_ll
#define simd_add_64_ll(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_lh
#define simd_add_64_lh(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_hx
#define simd_add_64_hx(v1, v2) simd_add_64(simd_srli_64(v1, 32), v2)
#endif

#ifndef simd_add_64_hl
#define simd_add_64_hl(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_hh
#define simd_add_64_hh(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_128_xx
#define simd_add_128_xx(v1, v2) simd_add_128(v1, v2)
#endif

#ifndef simd_add_128_xl
#define simd_add_128_xl(v1, v2) simd_add_128(v1, simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_xh
#define simd_add_128_xh(v1, v2) simd_add_128(v1, simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_lx
#define simd_add_128_lx(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), v2)
#endif

#ifndef simd_add_128_ll
#define simd_add_128_ll(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_lh
#define simd_add_128_lh(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_hx
#define simd_add_128_hx(v1, v2) simd_add_128(simd_srli_128(v1, 64), v2)
#endif

#ifndef simd_add_128_hl
#define simd_add_128_hl(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_hh
#define simd_add_128_hh(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_srli_128(v2, 64))
#endif

#ifndef simd_pack_2_xx
#define simd_pack_2_xx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xl
#define simd_pack_2_xl(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xh
#define simd_pack_2_xh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_lx
#define simd_pack_2_lx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_ll
#define simd_pack_2_ll(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_lh
#define simd_pack_2_lh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_hx
#define simd_pack_2_hx(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hl
#define simd_pack_2_hl(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hh
#define simd_pack_2_hh(v1, v2) simd_pack_2(simd_srli_16(v1, 1), simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_4_xx
#define simd_pack_4_xx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xl
#define simd_pack_4_xl(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xh
#define simd_pack_4_xh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_lx
#define simd_pack_4_lx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_ll
#define simd_pack_4_ll(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_lh
#define simd_pack_4_lh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_hx
#define simd_pack_4_hx(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hl
#define simd_pack_4_hl(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hh
#define simd_pack_4_hh(v1, v2) simd_pack_4(simd_srli_16(v1, 2), simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_8_xx
#define simd_pack_8_xx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xl
#define simd_pack_8_xl(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xh
#define simd_pack_8_xh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_lx
#define simd_pack_8_lx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_ll
#define simd_pack_8_ll(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_lh
#define simd_pack_8_lh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_hx
#define simd_pack_8_hx(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hl
#define simd_pack_8_hl(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hh
#define simd_pack_8_hh(v1, v2) simd_pack_8(simd_srli_16(v1, 4), simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_16_xx
#define simd_pack_16_xx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xl
#define simd_pack_16_xl(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xh
#define simd_pack_16_xh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_lx
#define simd_pack_16_lx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_ll
#define simd_pack_16_ll(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_lh
#define simd_pack_16_lh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_hx
#define simd_pack_16_hx(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hl
#define simd_pack_16_hl(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hh
#define simd_pack_16_hh(v1, v2) simd_pack_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif


// Splat the first 16-bit int into all positions.
static inline SIMD_type simd_splat_16(SIMD_type x) {
  SIMD_type t = _mm_shufflelo_epi16(x,0);
  return _mm_shuffle_epi32(t,0);
}

// Splat the first 32-bit int into all positions.
static inline SIMD_type simd_splat_32(SIMD_type x) {
  return _mm_shuffle_epi32(x,0);
}

/* Print the 16 bytes of a block in hex, low byte first. */
static inline void print_bit_block(char * var_name, SIMD_type v) {
  union {SIMD_type vec; unsigned char elems[16];} x;
  x.vec = v;
  unsigned char c;
  int i;
  printf("%20s = ", var_name);
  for (i = 0; i < sizeof(SIMD_type); i++) {
    c = x.elems[i];
    printf("%02X ", c);
  }
  printf("\n");
}

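/* Population count of a 128-bit block.  Bit counts are summed first within
   2-bit, then 4-bit, then 8-bit fields using the half-operand add macros;
   _mm_sad_epu8 then sums the byte counts into the two 64-bit halves, and a
   final 64-bit add folds the halves together. */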
static inline int bitblock_bit_count(SIMD_type v) {
  SIMD_type cts_2 = simd_add_2_lh(v, v);
  SIMD_type cts_4 = simd_add_4_lh(cts_2, cts_2);
  SIMD_type cts_8 = simd_add_8_lh(cts_4, cts_4);
  SIMD_type cts_64 = _mm_sad_epu8(cts_8, simd_const_8(0));
  /* SIMD_type cts_128 = simd_add_128_lh(cts_64, cts_64); */
  SIMD_type cts_128 = simd_add_64(cts_64, sisd_srli(cts_64,64));
  return (int) sisd_to_int(cts_128);
}

#endif
