/*  Idealized SIMD Operations with SSE versions
    Copyright (C) 2006, 2007, Robert D. Cameron
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters Inc.
       under the Academic Free License version 3.0.
*/

/*------------------------------------------------------------*/
#ifndef SSE_SIMD_H
#define SSE_SIMD_H

#include <limits.h>
#include <stdio.h>     /* printf, used by print_bit_block */
#include <stdint.h>    /* intptr_t, used by the bitstream scanners */
#ifndef LONG_BIT
/* LONG_BIT must be a numeric literal so it can be tested with #if below;
   a sizeof expression cannot be evaluated by the preprocessor. */
#if ULONG_MAX == 0xFFFFFFFFUL
#define LONG_BIT 32
#else
#define LONG_BIT 64
#endif
#endif
#include <emmintrin.h>
#ifdef USE_LDDQU
#include <pmmintrin.h>
#endif
typedef __m128i SIMD_type;
/*------------------------------------------------------------*/
/* I. SIMD bitwise logical operations */

#define simd_or(b1, b2) _mm_or_si128(b1, b2)
#define simd_and(b1, b2) _mm_and_si128(b1, b2)
#define simd_xor(b1, b2) _mm_xor_si128(b1, b2)
#define simd_andc(b1, b2) _mm_andnot_si128(b2, b1)
#define simd_if(cond, then_val, else_val) \
  simd_or(simd_and(then_val, cond), simd_andc(else_val, cond))
/* Complement via xor with all ones; the ~ operator is not portably
   defined for __m128i. */
#define simd_not(b) simd_xor(b, _mm_set1_epi32(-1))
#define simd_nor(a, b) simd_not(simd_or(a, b))

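/* Example: simd_if is a bitwise multiplexer.  At each bit position the
   result takes the bit of then_val where cond is 1 and the bit of
   else_val where cond is 0.  For instance, with cond = simd_himask_8,
   simd_if(cond, a, b) yields the high nybble of each byte of a combined
   with the low nybble of the corresponding byte of b. */
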
/*  Specific constants. */
#define simd_himask_2 _mm_set1_epi8(0xAA)
#define simd_himask_4 _mm_set1_epi8(0xCC)
#define simd_himask_8 _mm_set1_epi8(0xF0)
/* Little-endian */
#define simd_himask_16 _mm_set1_epi16(0xFF00)
#define simd_himask_32 _mm_set1_epi32(0xFFFF0000)
#define simd_himask_64 _mm_set_epi32(-1, 0, -1, 0)
#define simd_himask_128 _mm_set_epi32(-1, -1, 0, 0)
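
/* simd_himask_n selects the high half of each n-bit field: himask_2 is
   10101010 in every byte, himask_8 is the high nybble of every byte,
   and so on up to himask_128, which covers the high 64 bits of the
   register. */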

/* Idealized operations with direct implementation by built-in
   operations for various target architectures. */

#define simd_add_8(a, b) _mm_add_epi8(a, b)
#define simd_add_16(a, b) _mm_add_epi16(a, b)
#define simd_add_32(a, b) _mm_add_epi32(a, b)
#define simd_add_64(a, b) _mm_add_epi64(a, b)
#define simd_sub_8(a, b) _mm_sub_epi8(a, b)
#define simd_sub_16(a, b) _mm_sub_epi16(a, b)
#define simd_sub_32(a, b) _mm_sub_epi32(a, b)
#define simd_sub_64(a, b) _mm_sub_epi64(a, b)
#define simd_mult_16(a, b) _mm_mullo_epi16(a, b)
#define simd_slli_16(r, shft) _mm_slli_epi16(r, shft)
#define simd_srli_16(r, shft) _mm_srli_epi16(r, shft)
#define simd_srai_16(r, shft) _mm_srai_epi16(r, shft)
#define simd_slli_32(r, shft) _mm_slli_epi32(r, shft)
#define simd_srli_32(r, shft) _mm_srli_epi32(r, shft)
#define simd_srai_32(r, shft) _mm_srai_epi32(r, shft)
#define simd_slli_64(r, shft) _mm_slli_epi64(r, shft)
#define simd_srli_64(r, shft) _mm_srli_epi64(r, shft)
#define simd_sll_64(r, shft_reg) _mm_sll_epi64(r, shft_reg)
#define simd_srl_64(r, shft_reg) _mm_srl_epi64(r, shft_reg)
#define simd_pack_16(a, b) \
  _mm_packus_epi16(simd_andc(b, simd_himask_16), simd_andc(a, simd_himask_16))
#define simd_mergeh_8(a, b) _mm_unpackhi_epi8(b, a)
#define simd_mergeh_16(a, b) _mm_unpackhi_epi16(b, a)
#define simd_mergeh_32(a, b) _mm_unpackhi_epi32(b, a)
#define simd_mergeh_64(a, b) _mm_unpackhi_epi64(b, a)
#define simd_mergel_8(a, b) _mm_unpacklo_epi8(b, a)
#define simd_mergel_16(a, b) _mm_unpacklo_epi16(b, a)
#define simd_mergel_32(a, b) _mm_unpacklo_epi32(b, a)
#define simd_mergel_64(a, b) _mm_unpacklo_epi64(b, a)
#define simd_eq_8(a, b) _mm_cmpeq_epi8(a, b)
#define simd_eq_16(a, b) _mm_cmpeq_epi16(a, b)
#define simd_eq_32(a, b) _mm_cmpeq_epi32(a, b)

#define simd_max_8(a, b) _mm_max_epu8(a, b)

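/* 128-bit shifts synthesized from 64-bit operations: a shift by a
   multiple of 8 is a single whole-register byte shift; a shift of 64
   or more is a byte shift of 8 followed by a residual 64-bit shift;
   otherwise each 64-bit half is shifted and the bits that cross the
   half boundary are ORed in from the other half. */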
#define simd_slli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_slli_64(r, shft), _mm_slli_si128(simd_srli_64(r, 64-(shft)), 8)))

#define simd_srli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_srli_64(r, shft), _mm_srli_si128(simd_slli_64(r, 64-(shft)), 8)))

#define simd_sll_128(r, shft) \
   simd_or(simd_sll_64(r, shft), \
           simd_or(_mm_slli_si128(simd_sll_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_slli_si128(simd_srl_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define simd_srl_128(r, shft) \
   simd_or(simd_srl_64(r, shft), \
           simd_or(_mm_srli_si128(simd_srl_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_srli_si128(simd_sll_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

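/* Full 128-bit addition and subtraction.  simd_add_128 and simd_sub_128
   are referenced by sisd_add, sisd_sub and the simd_add_128_* forms
   below but are not defined elsewhere in this file; what follows is a
   minimal sketch (an assumption, not part of the original source).
   The halves are added (or subtracted) as 64-bit lanes, the carry
   (or borrow) out of each lane is recovered bitwise at bit 63, and the
   low lane's carry is propagated into the high lane. */
static inline SIMD_type simd_add_128_impl(SIMD_type a, SIMD_type b) {
  SIMD_type sum = simd_add_64(a, b);
  /* carry-out of bit 63 of each qword: (a & b) | ((a | b) & ~sum) */
  SIMD_type carry = simd_or(simd_and(a, b), simd_andc(simd_or(a, b), sum));
  carry = simd_srli_64(carry, 63);       /* isolate the carry bit per qword */
  carry = _mm_slli_si128(carry, 8);      /* move the low qword's carry up */
  return simd_add_64(sum, carry);
}
#define simd_add_128(a, b) simd_add_128_impl(a, b)

static inline SIMD_type simd_sub_128_impl(SIMD_type a, SIMD_type b) {
  SIMD_type diff = simd_sub_64(a, b);
  /* borrow-out of bit 63 of each qword: (~a & b) | ((~a | b) & diff) */
  SIMD_type borrow = simd_or(simd_andc(b, a),
                             simd_and(simd_not(simd_andc(a, b)), diff));
  borrow = simd_srli_64(borrow, 63);     /* isolate the borrow bit per qword */
  borrow = _mm_slli_si128(borrow, 8);    /* move the low qword's borrow up */
  return simd_sub_64(diff, borrow);
}
#define simd_sub_128(a, b) simd_sub_128_impl(a, b)
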
#define sisd_sll(r, shft) simd_sll_128(r, shft)
#define sisd_srl(r, shft) simd_srl_128(r, shft)
#define sisd_slli(r, shft) simd_slli_128(r, shft)
#define sisd_srli(r, shft) simd_srli_128(r, shft)
#define sisd_add(a, b) simd_add_128(a, b)
#define sisd_sub(a, b) simd_sub_128(a, b)

#define sisd_store_aligned(r, addr) _mm_store_si128(addr, r)
#define sisd_store_unaligned(r, addr) _mm_storeu_si128(addr, r)
#define sisd_load_aligned(addr) _mm_load_si128(addr)
#ifdef USE_LDDQU
#define sisd_load_unaligned(addr) _mm_lddqu_si128(addr)
#else
#define sisd_load_unaligned(addr) _mm_loadu_si128(addr)
#endif


#define simd_const_32(n) _mm_set1_epi32(n)
#define simd_const_16(n) _mm_set1_epi16(n)
#define simd_const_8(n) _mm_set1_epi8(n)
#define simd_const_4(n) _mm_set1_epi8((n)<<4|(n))
#define simd_const_2(n) simd_const_4((n)<<2|(n))
#define simd_const_1(n) \
  ((n)==0 ? simd_const_8(0) : simd_const_8(-1))

#define simd_pack_16_ll(a, b) simd_pack_16(a, b)
#define simd_pack_16_hh(a, b) \
  simd_pack_16(simd_srli_16(a, 8), simd_srli_16(b, 8))

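/* simd_add_2 computes independent 2-bit additions with a half-adder:
   the xor of the operands gives the sum bits without carries, the and
   gives the carries, and shifting the carries left by one applies them
   to the high bit of each 2-bit field.  simd_if then keeps the
   carry-corrected high bit and the carry-free low bit; any carry that
   crosses into a neighbouring field lands in a bit position that
   simd_if discards. */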
static inline
SIMD_type simd_add_2(SIMD_type a, SIMD_type b)
{
  SIMD_type c1 = simd_xor(a, b);
  SIMD_type carries = simd_and(a, b);
  SIMD_type c2 = simd_xor(c1, sisd_slli(carries, 1));
  return simd_if(simd_himask_2, c2, c1);
}
#define simd_add_4(a, b) \
        simd_if(simd_himask_8, simd_add_8(simd_and(a,simd_himask_8),simd_and(b,simd_himask_8)), \
        simd_add_8(simd_andc(a,simd_himask_8),simd_andc(b,simd_himask_8)))

#define simd_srli_2(r, sh) \
        simd_and(simd_srli_32(r,sh),simd_const_2(3>>sh))

#define simd_srli_4(r, sh) \
        simd_and(simd_srli_32(r,sh),simd_const_4(15>>sh))
#define simd_srli_8(r, sh) \
        simd_and(simd_srli_32(r,sh),simd_const_8(255>>sh))

#define simd_slli_2(r, sh) \
        simd_and(simd_slli_32(r,sh),simd_const_2((3<<sh)&3))

#define simd_slli_4(r, sh) \
        simd_and(simd_slli_32(r,sh),simd_const_4((15<<sh)&15))
#define simd_slli_8(r, sh) \
        simd_and(simd_slli_32(r,sh),simd_const_8((255<<sh)&255))

#define simd_mergeh_4(a,b) \
        simd_mergeh_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)), \
        simd_if(simd_himask_8,simd_slli_8(a,4),b))
#define simd_mergel_4(a,b) \
        simd_mergel_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)), \
        simd_if(simd_himask_8,simd_slli_8(a,4),b))
#define simd_mergeh_2(a,b) \
        simd_mergeh_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)), \
        simd_if(simd_himask_4,simd_slli_4(a,2),b))
#define simd_mergel_2(a,b) \
        simd_mergel_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)), \
        simd_if(simd_himask_4,simd_slli_4(a,2),b))
#define simd_mergeh_1(a,b) \
        simd_mergeh_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)), \
        simd_if(simd_himask_2,simd_slli_2(a,1),b))
#define simd_mergel_1(a,b) \
        simd_mergel_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)), \
        simd_if(simd_himask_2,simd_slli_2(a,1),b))
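
/* The sub-byte merges above interleave n-bit fields by pairing each
   field of a with the corresponding field of b within a 2n-bit unit,
   then delegating to the 2n-bit merge; simd_mergeh_1 thus bottoms out
   in the byte-level _mm_unpack operations after three rounds of field
   pairing. */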

#define sisd_to_int(x) _mm_cvtsi128_si32(x)

#define sisd_from_int(n) _mm_cvtsi32_si128(n)

/* Classic bit-counting masks (alternating bits, bit pairs, nybbles);
   static, so that this header can be included in more than one
   translation unit. */
static char mask_x55 [16] __attribute__ ((aligned(16))) =
    {0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55};
static char mask_x33 [16] __attribute__ ((aligned(16))) =
    {0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33};
static char mask_x0F [16] __attribute__ ((aligned(16))) =
    {0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F};

/* _mm_movemask_epi8 gathers the sign bit of each byte, so the all_true
   and any_true tests assume boolean vectors whose bytes are 0x00 or
   0xFF, as produced by the comparison operations. */
static inline int simd_all_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) == 0xFFFF;
}

static inline int simd_any_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

static inline int simd_any_sign_bit_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

#define simd_all_eq_8(v1, v2) simd_all_true_8(_mm_cmpeq_epi8(v1, v2))
#define simd_all_le_8(v1, v2) \
  simd_all_eq_8(simd_max_8(v1, v2), v2)

#define simd_all_signed_gt_8(v1, v2) simd_all_true_8(_mm_cmpgt_epi8(v1, v2))

static inline int bitblock_has_bit(SIMD_type v) {
  return !simd_all_true_8(simd_eq_8(v, simd_const_8(0)));
}

/* BLOCKSIZE, the bit width of SIMD_type (128 here), is expected to be
   defined by the including code. */
#define bitblock_test_bit(blk, n) \
   sisd_to_int(sisd_srli(sisd_slli(blk, ((BLOCKSIZE-1)-(n))), BLOCKSIZE-1))

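/* The pack family narrows 2n-bit fields to n bits, keeping the low n
   bits of each field.  Each level prepares its operands so that the
   surviving bits sit in the low half of every field at the next wider
   size, then delegates to the next level; everything funnels into
   simd_pack_16, which is implemented directly with _mm_packus_epi16. */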
#define simd_pack_2(a,b) \
        simd_pack_4(simd_if(simd_himask_2,sisd_srli(a,1),a), \
        simd_if(simd_himask_2,sisd_srli(b,1),b))
#define simd_pack_4(a,b) \
        simd_pack_8(simd_if(simd_himask_4,sisd_srli(a,2),a), \
        simd_if(simd_himask_4,sisd_srli(b,2),b))
#define simd_pack_8(a,b) \
        simd_pack_16(simd_if(simd_himask_8,sisd_srli(a,4),a), \
        simd_if(simd_himask_8,sisd_srli(b,4),b))

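/* Half-operand modifier convention for the forms below: in
   simd_op_n_mm, each m selects how the corresponding operand's n-bit
   fields are presented to the operation: x = the full field as given,
   l = the low n/2 bits (high half masked off), h = the high n/2 bits
   (shifted down).  Where both inputs are guaranteed to fit in half a
   field, a cheaper adder at a smaller field width suffices, as in the
   _ll, _lh, _hl and _hh forms of simd_add_2 and simd_add_4, which fall
   back on simd_add_8. */
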
#ifndef simd_add_2_xx
#define simd_add_2_xx(v1, v2) simd_add_2(v1, v2)
#endif

#ifndef simd_add_2_xl
#define simd_add_2_xl(v1, v2) simd_add_2(v1, simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_xh
#define simd_add_2_xh(v1, v2) simd_add_2(v1, simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_lx
#define simd_add_2_lx(v1, v2) simd_add_2(simd_andc(v1, simd_himask_2), v2)
#endif

#ifndef simd_add_2_ll
#define simd_add_2_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_lh
#define simd_add_2_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_hx
#define simd_add_2_hx(v1, v2) simd_add_2(simd_srli_2(v1, 1), v2)
#endif

#ifndef simd_add_2_hl
#define simd_add_2_hl(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_hh
#define simd_add_2_hh(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_4_xx
#define simd_add_4_xx(v1, v2) simd_add_4(v1, v2)
#endif

#ifndef simd_add_4_xl
#define simd_add_4_xl(v1, v2) simd_add_4(v1, simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_xh
#define simd_add_4_xh(v1, v2) simd_add_4(v1, simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_lx
#define simd_add_4_lx(v1, v2) simd_add_4(simd_andc(v1, simd_himask_4), v2)
#endif

#ifndef simd_add_4_ll
#define simd_add_4_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_lh
#define simd_add_4_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_hx
#define simd_add_4_hx(v1, v2) simd_add_4(simd_srli_4(v1, 2), v2)
#endif

#ifndef simd_add_4_hl
#define simd_add_4_hl(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_hh
#define simd_add_4_hh(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_8_xx
#define simd_add_8_xx(v1, v2) simd_add_8(v1, v2)
#endif

#ifndef simd_add_8_xl
#define simd_add_8_xl(v1, v2) simd_add_8(v1, simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_xh
#define simd_add_8_xh(v1, v2) simd_add_8(v1, simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_lx
#define simd_add_8_lx(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), v2)
#endif

#ifndef simd_add_8_ll
#define simd_add_8_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_lh
#define simd_add_8_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_hx
#define simd_add_8_hx(v1, v2) simd_add_8(simd_srli_8(v1, 4), v2)
#endif

#ifndef simd_add_8_hl
#define simd_add_8_hl(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_hh
#define simd_add_8_hh(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_16_xx
#define simd_add_16_xx(v1, v2) simd_add_16(v1, v2)
#endif

#ifndef simd_add_16_xl
#define simd_add_16_xl(v1, v2) simd_add_16(v1, simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_xh
#define simd_add_16_xh(v1, v2) simd_add_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_lx
#define simd_add_16_lx(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), v2)
#endif

#ifndef simd_add_16_ll
#define simd_add_16_ll(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_lh
#define simd_add_16_lh(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_hx
#define simd_add_16_hx(v1, v2) simd_add_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_add_16_hl
#define simd_add_16_hl(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_hh
#define simd_add_16_hh(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_32_xx
#define simd_add_32_xx(v1, v2) simd_add_32(v1, v2)
#endif

#ifndef simd_add_32_xl
#define simd_add_32_xl(v1, v2) simd_add_32(v1, simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_xh
#define simd_add_32_xh(v1, v2) simd_add_32(v1, simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_lx
#define simd_add_32_lx(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), v2)
#endif

#ifndef simd_add_32_ll
#define simd_add_32_ll(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_lh
#define simd_add_32_lh(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_hx
#define simd_add_32_hx(v1, v2) simd_add_32(simd_srli_32(v1, 16), v2)
#endif

#ifndef simd_add_32_hl
#define simd_add_32_hl(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_hh
#define simd_add_32_hh(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_64_xx
#define simd_add_64_xx(v1, v2) simd_add_64(v1, v2)
#endif

#ifndef simd_add_64_xl
#define simd_add_64_xl(v1, v2) simd_add_64(v1, simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_xh
#define simd_add_64_xh(v1, v2) simd_add_64(v1, simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_lx
#define simd_add_64_lx(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), v2)
#endif

#ifndef simd_add_64_ll
#define simd_add_64_ll(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_lh
#define simd_add_64_lh(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_hx
#define simd_add_64_hx(v1, v2) simd_add_64(simd_srli_64(v1, 32), v2)
#endif

#ifndef simd_add_64_hl
#define simd_add_64_hl(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_hh
#define simd_add_64_hh(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_128_xx
#define simd_add_128_xx(v1, v2) simd_add_128(v1, v2)
#endif

#ifndef simd_add_128_xl
#define simd_add_128_xl(v1, v2) simd_add_128(v1, simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_xh
#define simd_add_128_xh(v1, v2) simd_add_128(v1, simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_lx
#define simd_add_128_lx(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), v2)
#endif

#ifndef simd_add_128_ll
#define simd_add_128_ll(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_lh
#define simd_add_128_lh(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_hx
#define simd_add_128_hx(v1, v2) simd_add_128(simd_srli_128(v1, 64), v2)
#endif

#ifndef simd_add_128_hl
#define simd_add_128_hl(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_hh
#define simd_add_128_hh(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_srli_128(v2, 64))
#endif

#ifndef simd_pack_2_xx
#define simd_pack_2_xx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xl
#define simd_pack_2_xl(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xh
#define simd_pack_2_xh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_lx
#define simd_pack_2_lx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_ll
#define simd_pack_2_ll(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_lh
#define simd_pack_2_lh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_hx
#define simd_pack_2_hx(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hl
#define simd_pack_2_hl(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hh
#define simd_pack_2_hh(v1, v2) simd_pack_2(simd_srli_16(v1, 1), simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_4_xx
#define simd_pack_4_xx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xl
#define simd_pack_4_xl(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xh
#define simd_pack_4_xh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_lx
#define simd_pack_4_lx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_ll
#define simd_pack_4_ll(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_lh
#define simd_pack_4_lh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_hx
#define simd_pack_4_hx(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hl
#define simd_pack_4_hl(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hh
#define simd_pack_4_hh(v1, v2) simd_pack_4(simd_srli_16(v1, 2), simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_8_xx
#define simd_pack_8_xx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xl
#define simd_pack_8_xl(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xh
#define simd_pack_8_xh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_lx
#define simd_pack_8_lx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_ll
#define simd_pack_8_ll(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_lh
#define simd_pack_8_lh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_hx
#define simd_pack_8_hx(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hl
#define simd_pack_8_hl(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hh
#define simd_pack_8_hh(v1, v2) simd_pack_8(simd_srli_16(v1, 4), simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_16_xx
#define simd_pack_16_xx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xl
#define simd_pack_16_xl(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xh
#define simd_pack_16_xh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_lx
#define simd_pack_16_lx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_ll
#define simd_pack_16_ll(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_lh
#define simd_pack_16_lh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_hx
#define simd_pack_16_hx(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hl
#define simd_pack_16_hl(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hh
#define simd_pack_16_hh(v1, v2) simd_pack_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif

static inline void print_bit_block(char * var_name, SIMD_type v) {
  union {SIMD_type vec; unsigned char elems[sizeof(SIMD_type)];} x;
  int i;
  x.vec = v;
  printf("%20s = ", var_name);
  for (i = 0; i < (int) sizeof(SIMD_type); i++) {
    printf("%02X ", x.elems[i]);
  }
  printf("\n");
}

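/* Counting the bits of a block: sum adjacent bits into 2-bit counts,
   2-bit counts into 4-bit counts, 4-bit counts into byte counts, then
   let _mm_sad_epu8 total the bytes of each quadword and add the two
   quadword totals.  Each simd_add_n_lh(v, v) adds the low half of every
   n-bit field of v to its high half, which is exactly the pairwise sum
   of adjacent (n/2)-bit counts. */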
static inline int bitblock_bit_count(SIMD_type v) {
  SIMD_type cts_2 = simd_add_2_lh(v, v);
  SIMD_type cts_4 = simd_add_4_lh(cts_2, cts_2);
  SIMD_type cts_8 = simd_add_8_lh(cts_4, cts_4);
  SIMD_type cts_64 = _mm_sad_epu8(cts_8, simd_const_8(0));
  SIMD_type cts_128 = simd_add_64(cts_64, sisd_srli(cts_64, 64));
  return (int) sisd_to_int(cts_128);
}


static inline int count_forward_zeroes(SIMD_type bits) {
  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(unsigned long)];} v;
  v.vec = bits;
  if (v.elems[0] != 0) return __builtin_ctzl(v.elems[0]);
  else if (v.elems[1] != 0) return LONG_BIT + __builtin_ctzl(v.elems[1]);
#if LONG_BIT < 64
  else if (v.elems[2] != 0) return 2*LONG_BIT + __builtin_ctzl(v.elems[2]);
  else if (v.elems[3] != 0) return 3*LONG_BIT + __builtin_ctzl(v.elems[3]);
#endif
  else return 8*sizeof(SIMD_type);
}


/* Scans for a 1 as long as it takes.  Use a sentinel to fence. */
static inline int bitstream_scan(SIMD_type * stream, int bit_posn) {
  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
  unsigned long bitstream_slice = *bitstream_ptr & (~0UL << bit_posn % 8);
  int base_posn;
  if (bitstream_slice == 0) {
    do {
      bitstream_ptr++;
      bitstream_slice = *bitstream_ptr;
    } while (bitstream_slice == 0);
  }
  base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
  return base_posn + __builtin_ctzl(bitstream_slice);
}

// static inline int bitstream_scan(SIMD_type * stream, int bit_posn) {
//   unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
//   unsigned long bitstream_slice = *bitstream_ptr >> bit_posn % 8;
//   int base_posn = bit_posn;
//   if (bitstream_slice == 0) {
//     do {
//       bitstream_ptr++;
//       bitstream_slice = *bitstream_ptr;
//     } while (bitstream_slice == 0);
//     base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
//   }
//   return base_posn + __builtin_ctzl(bitstream_slice);
// }

static inline int bitstream_scan0(SIMD_type * stream) {
  unsigned long * bitstream_ptr = (unsigned long *) stream;
  unsigned long bitstream_slice = *bitstream_ptr;
  int base_posn = 0;
  while (bitstream_slice == 0) {
    bitstream_ptr++;
    bitstream_slice = *bitstream_ptr;
#ifdef BITSTREAM_SCAN_ACCUM_POSN
    base_posn += 8 * sizeof(unsigned long);
#endif
  }
#ifndef BITSTREAM_SCAN_ACCUM_POSN
  base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
#endif
  return base_posn + __builtin_ctzl(bitstream_slice);
}
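
/* Usage sketch (illustrative only; the variable names here are not part
   of the library):

     SIMD_type v = simd_const_8(0x0F);
     int n = bitblock_bit_count(v);       // 64: four bits set per byte
     int z = count_forward_zeroes(v);     // 0: bit 0 of the first byte is set
     print_bit_block("v", v);             // prints "0F " sixteen times
*/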

#endif /* SSE_SIMD_H */