source: proto/Compiler/workspace/sse_simd.h @ 433

/*  Idealized SIMD Operations with SSE versions
    Copyright (C) 2006, 2007, 2008, Robert D. Cameron
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters Inc.
       under the Academic Free License version 3.0.
*/
#ifndef SSE_SIMD_H
#define SSE_SIMD_H

/*------------------------------------------------------------*/
#ifndef _MSC_VER
#include <stdint.h>
#endif
#ifdef _MSC_VER
#include "stdint.h"
#define LITTLE_ENDIAN 1234
#define BIG_ENDIAN 4321
#define BYTE_ORDER LITTLE_ENDIAN
#endif
#include <limits.h>
#ifndef LONG_BIT
#define LONG_BIT (8*__WORDSIZE)
#endif
#include <stdio.h>      /* printf, used by print_bit_block below */
#include <emmintrin.h>
#ifdef USE_LDDQU
#include <pmmintrin.h>
#endif
typedef __m128i SIMD_type;


#define double_int64_adc(x1, x2, y1, y2, rslt1, rslt2, carry) \
  __asm__  ("sahf\n\t" \
        "adc %[e1], %[z1]\n\t" \
        "adc %[e2], %[z2]\n\t" \
        "lahf\n\t" \
     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
         : "[z1]" (x1), "[z2]" (x2), \
           [e1] "r" (y1), [e2] "r" (y2), \
           "[carryflag]" (carry) \
         : "cc")

#define adc128(first, second, carry, sum) \
do\
{\
  union {__m128i bitblock;\
         uint64_t int64[2];} rslt;\
\
  union {__m128i bitblock;\
         uint64_t int64[2];} x;\
\
  union {__m128i bitblock;\
         uint64_t int64[2];} y;\
\
  x.bitblock = first;\
  y.bitblock = second;\
\
  double_int64_adc(x.int64[0], x.int64[1], y.int64[0], y.int64[1], rslt.int64[0], rslt.int64[1], carry);\
  sum = rslt.bitblock;\
}while(0)



#define double_int64_sbb(x1, x2, y1, y2, rslt1, rslt2, carry) \
  __asm__  ("sahf\n\t" \
        "sbb %[e1], %[z1]\n\t" \
        "sbb %[e2], %[z2]\n\t" \
        "lahf\n\t" \
     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
         : "[z1]" (x1), "[z2]" (x2), \
           [e1] "r" (y1), [e2] "r" (y2), \
           "[carryflag]" (carry) \
         : "cc")

#define sbb128(first, second, carry, sum) \
do\
{ union {__m128i bitblock;\
         uint64_t int64[2];} rslt;\
\
  union {__m128i bitblock;\
         uint64_t int64[2];} x;\
\
  union {__m128i bitblock;\
         uint64_t int64[2];} y;\
\
  x.bitblock = first;\
  y.bitblock = second;\
\
  double_int64_sbb(x.int64[0], x.int64[1], y.int64[0], y.int64[1], \
                   rslt.int64[0], rslt.int64[1], carry);\
  sum = rslt.bitblock;\
}while(0)
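
/* Illustrative sketch (added; not part of the original header): adc128/sbb128
   can be chained over an array of blocks to perform long-integer addition or
   subtraction, with the flags byte in `carry` threaded from block to block.
   The function and variable names below are hypothetical.

static inline void long_add_sketch(SIMD_type * x, SIMD_type * y, SIMD_type * sum, int nblocks) {
  int carry = 0;            // flags byte: CF clear before the first block
  int i;
  for (i = 0; i < nblocks; i++) {
    adc128(x[i], y[i], carry, sum[i]);
  }
}
*/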



#define adc128_simd(x, y, carry,  sum) \
do{ \
  SIMD_type gen = simd_and(x, y); \
  SIMD_type prop = simd_or(x, y); \
  SIMD_type partial = simd_add_64(simd_add_64(x, y), carry); \
  SIMD_type c1 = sisd_slli(simd_srli_64(simd_or(gen, simd_andc(prop, partial)), 63), 64); \
  sum = simd_add_64(c1, partial); \
  carry = sisd_srli(simd_or(gen, simd_andc(prop, sum)), 127); \
} while(0)


#define sbb128_simd(x, y, borrow, difference) \
do {\
  SIMD_type gen = simd_andc(y, x); \
  SIMD_type prop = simd_not(simd_xor(x, y)); \
  SIMD_type partial = simd_sub_64(simd_sub_64(x, y), borrow); \
  SIMD_type b1 = sisd_slli(simd_srli_64(simd_or(gen, simd_and(prop, partial)), 63), 64); \
  difference = simd_sub_64(partial, b1); \
  borrow = sisd_srli(simd_or(gen, simd_and(prop, difference)), 127); \
}while(0)
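
/* Illustrative note (added): adc128_simd/sbb128_simd avoid the x86 flags
   register entirely.  The carry out of the low 64-bit lane is recovered from
   the generate (gen) and propagate (prop) terms, shifted into the high lane,
   and added in; the block-level carry/borrow comes back as a 128-bit value
   whose bit 0 is the only significant bit.  A hypothetical usage sketch:

static inline void long_add_simd_sketch(SIMD_type * x, SIMD_type * y, SIMD_type * sum, int nblocks) {
  SIMD_type carry = _mm_setzero_si128();   // bit 0 carries between blocks
  int i;
  for (i = 0; i < nblocks; i++) {
    adc128_simd(x[i], y[i], carry, sum[i]);
  }
}
*/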


#define advance_with_carry(cursor, carry, rslt)\
do{\
  SIMD_type shift_out = simd_srli_64(cursor, 63);\
  SIMD_type low_bits = simd_mergel_64(shift_out, carry);\
  carry = sisd_srli(shift_out, 64);\
  rslt = simd_or(simd_add_64(cursor, cursor), low_bits);\
}while(0)
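
/* Illustrative note (added): advance_with_carry shifts a bit stream forward
   by one position.  cursor+cursor shifts each 64-bit lane left by one, the
   bit shifted out of the low lane is merged back into the high lane together
   with the incoming carry, and the bit shifted out of the high lane becomes
   the new carry.  A hypothetical sketch for streaming across blocks:

static inline void advance_stream_sketch(SIMD_type * in, SIMD_type * out, int nblocks) {
  SIMD_type carry = _mm_setzero_si128();
  int i;
  for (i = 0; i < nblocks; i++) {
    advance_with_carry(in[i], carry, out[i]);
  }
}
*/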




/*------------------------------------------------------------*/
/* I. SIMD bitwise logical operations */

#define simd_or(b1, b2) _mm_or_si128(b1, b2)
#define simd_and(b1, b2) _mm_and_si128(b1, b2)
#define simd_xor(b1, b2) _mm_xor_si128(b1, b2)
#define simd_andc(b1, b2) _mm_andnot_si128(b2, b1)
#define simd_if(cond, then_val, else_val) \
  simd_or(simd_and(then_val, cond), simd_andc(else_val, cond))
#define simd_not(b) (simd_xor(b, _mm_set1_epi32(0xFFFFFFFF)))
#define simd_nor(a,b) (simd_not(simd_or(a,b)))


/*  Specific constants. */
#define simd_himask_2 _mm_set1_epi8(0xAA)
#define simd_himask_4 _mm_set1_epi8(0xCC)
#define simd_himask_8 _mm_set1_epi8(0xF0)
/* Little-endian */
#define simd_himask_16 _mm_set1_epi16(0xFF00)
#define simd_himask_32 _mm_set1_epi32(0xFFFF0000)
#define simd_himask_64 _mm_set_epi32(-1,0,-1,0)
#define simd_himask_128 _mm_set_epi32(-1,-1,0,0)
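
/* Illustrative example (added): simd_if is a bitwise select - each result bit
   comes from then_val where cond has a 1 and from else_val where cond has a 0.
   This hypothetical helper blends the high nybble of each byte of a with the
   low nybble of the corresponding byte of b. */
static inline SIMD_type simd_if_nybble_blend_demo(SIMD_type a, SIMD_type b) {
  return simd_if(simd_himask_8, a, b);
}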

/* Idealized operations with direct implementation by built-in
   operations for various target architectures. */

#define simd_add_8(a, b) _mm_add_epi8(a, b)
#define simd_add_16(a, b) _mm_add_epi16(a, b)
#define simd_add_32(a, b) _mm_add_epi32(a, b)
#define simd_add_64(a, b) _mm_add_epi64(a, b)
#define simd_sub_8(a, b) _mm_sub_epi8(a, b)
#define simd_sub_16(a, b) _mm_sub_epi16(a, b)
#define simd_sub_32(a, b) _mm_sub_epi32(a, b)
#define simd_sub_64(a, b) _mm_sub_epi64(a, b)
#define simd_mult_16(a, b) _mm_mullo_epi16(a, b)
#define simd_slli_16(r, shft) _mm_slli_epi16(r, shft)
#define simd_srli_16(r, shft) _mm_srli_epi16(r, shft)
#define simd_srai_16(r, shft) _mm_srai_epi16(r, shft)
#define simd_slli_32(r, shft) _mm_slli_epi32(r, shft)
#define simd_srli_32(r, shft) _mm_srli_epi32(r, shft)
#define simd_srai_32(r, shft) _mm_srai_epi32(r, shft)
#define simd_slli_64(r, shft) _mm_slli_epi64(r, shft)
#define simd_srli_64(r, shft) _mm_srli_epi64(r, shft)
#define simd_sll_64(r, shft_reg) _mm_sll_epi64(r, shft_reg)
#define simd_srl_64(r, shft_reg) _mm_srl_epi64(r, shft_reg)
#define simd_pack_16(a, b) \
  _mm_packus_epi16(simd_andc(b, simd_himask_16), simd_andc(a, simd_himask_16))
#define simd_mergeh_8(a, b) _mm_unpackhi_epi8(b, a)
#define simd_mergeh_16(a, b) _mm_unpackhi_epi16(b, a)
#define simd_mergeh_32(a, b) _mm_unpackhi_epi32(b, a)
#define simd_mergeh_64(a, b) _mm_unpackhi_epi64(b, a)
#define simd_mergel_8(a, b) _mm_unpacklo_epi8(b, a)
#define simd_mergel_16(a, b) _mm_unpacklo_epi16(b, a)
#define simd_mergel_32(a, b) _mm_unpacklo_epi32(b, a)
#define simd_mergel_64(a, b) _mm_unpacklo_epi64(b, a)
#define simd_eq_8(a, b) _mm_cmpeq_epi8(a, b)
#define simd_eq_16(a, b) _mm_cmpeq_epi16(a, b)
#define simd_eq_32(a, b) _mm_cmpeq_epi32(a, b)

#define simd_max_8(a, b) _mm_max_epu8(a, b)

#define simd_slli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_slli_64(r, shft), _mm_slli_si128(simd_srli_64(r, 64-(shft)), 8)))

#define simd_srli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_srli_64(r, shft), _mm_srli_si128(simd_slli_64(r, 64-(shft)), 8)))

#define simd_sll_128(r, shft) \
   simd_or(simd_sll_64(r, shft), \
           simd_or(_mm_slli_si128(simd_sll_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_slli_si128(simd_srl_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define simd_srl_128(r, shft) \
   simd_or(simd_srl_64(r, shft), \
           simd_or(_mm_srli_si128(simd_srl_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_srli_si128(simd_sll_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define sisd_sll(r, shft) simd_sll_128(r, shft)
#define sisd_srl(r, shft) simd_srl_128(r, shft)
#define sisd_slli(r, shft) simd_slli_128(r, shft)
#define sisd_srli(r, shft) simd_srli_128(r, shft)
#define sisd_add(a, b) simd_add_128(a, b)
#define sisd_sub(a, b) simd_sub_128(a, b)

#define sisd_store_aligned(r, addr) _mm_store_si128(addr, r)
#define sisd_store_unaligned(r, addr) _mm_storeu_si128(addr, r)
#define sisd_load_aligned(addr) _mm_load_si128(addr)
#ifndef USE_LDDQU
#define sisd_load_unaligned(addr) _mm_loadu_si128(addr)
#endif
#ifdef USE_LDDQU
#define sisd_load_unaligned(addr) _mm_lddqu_si128(addr)
#endif



#define simd_const_32(n) _mm_set1_epi32(n)
#define simd_const_16(n) _mm_set1_epi16(n)
#define simd_const_8(n) _mm_set1_epi8(n)
#define simd_const_4(n) _mm_set1_epi8((n)<<4|(n))
#define simd_const_2(n) simd_const_4(n<<2|n)
#define simd_const_1(n) \
  (n==0 ? simd_const_8(0): simd_const_8(-1))
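
/* Illustrative example (added): a common use of the constant and comparison
   macros is byte classification.  This hypothetical helper returns a block in
   which each byte is 0xFF where the input byte equals the ASCII space
   character and 0x00 elsewhere. */
static inline SIMD_type match_space_demo(SIMD_type v) {
  return simd_eq_8(v, simd_const_8(0x20));
}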

#define simd_pack_16_ll(a, b) simd_pack_16(a, b)
#define simd_pack_16_hh(a, b) \
  simd_pack_16(simd_srli_16(a, 8), simd_srli_16(b, 8))


static inline
SIMD_type simd_add_2(SIMD_type a, SIMD_type b)
{
         SIMD_type c1 = simd_xor(a,b);
         SIMD_type borrow = simd_and(a,b);
         SIMD_type c2 = simd_xor(c1,(sisd_slli(borrow,1)));
         return simd_if(simd_himask_2,c2,c1);
}
#define simd_add_4(a, b)\
        simd_if(simd_himask_8, simd_add_8(simd_and(a,simd_himask_8),simd_and(b,simd_himask_8))\
        ,simd_add_8(simd_andc(a,simd_himask_8),simd_andc(b,simd_himask_8)))

#define simd_srli_2(r, sh)\
         simd_and(simd_srli_32(r,sh),simd_const_2(3>>sh))

#define simd_srli_4(r, sh)\
         simd_and(simd_srli_32(r,sh),simd_const_4(15>>sh))
#define simd_srli_8(r, sh)\
         simd_and(simd_srli_32(r,sh),simd_const_8(255>>sh))

#define simd_slli_2(r, sh)\
         simd_and(simd_slli_32(r,sh),simd_const_2((3<<sh)&3))

#define simd_slli_4(r, sh)\
         simd_and(simd_slli_32(r,sh),simd_const_4((15<<sh)&15))
#define simd_slli_8(r, sh)\
         simd_and(simd_slli_32(r,sh),simd_const_8((255<<sh) &255))
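
/* Illustrative example (added): the sub-byte shift and add operations support
   field widths below what SSE provides directly.  This hypothetical helper
   performs the first step of a bit count, summing each adjacent pair of bits
   of v into a 2-bit field (the same result as the simd_add_2_lh form defined
   further below). */
static inline SIMD_type bit_pair_counts_demo(SIMD_type v) {
  return simd_add_2(simd_andc(v, simd_himask_2), simd_srli_2(v, 1));
}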




#define simd_mergeh_4(a,b)\
        simd_mergeh_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)),\
        simd_if(simd_himask_8,simd_slli_8(a,4),b))
#define simd_mergel_4(a,b)\
        simd_mergel_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)),\
        simd_if(simd_himask_8,simd_slli_8(a,4),b))
#define simd_mergeh_2(a,b)\
        simd_mergeh_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)),\
        simd_if(simd_himask_4,simd_slli_4(a,2),b))
#define simd_mergel_2(a,b)\
        simd_mergel_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)),\
        simd_if(simd_himask_4,simd_slli_4(a,2),b))
#define simd_mergeh_1(a,b)\
        simd_mergeh_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)),\
        simd_if(simd_himask_2,simd_slli_2(a,1),b))
#define simd_mergel_1(a,b)\
        simd_mergel_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)),\
        simd_if(simd_himask_2,simd_slli_2(a,1),b))

#define sisd_to_int(x) _mm_cvtsi128_si32(x)

#define sisd_from_int(n) _mm_cvtsi32_si128(n)

static inline int simd_all_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) == 0xFFFF;
}

static inline int simd_any_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

static inline int simd_any_sign_bit_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

#define simd_all_eq_8(v1, v2) simd_all_true_8(_mm_cmpeq_epi8(v1, v2))
#define simd_all_le_8(v1, v2) \
  simd_all_eq_8(simd_max_8(v1, v2), v2)

#define simd_all_signed_gt_8(v1, v2) simd_all_true_8(_mm_cmpgt_epi8(v1, v2))

#define simd_cmpgt_8(v1,v2) _mm_cmpgt_epi8(v1, v2)

static inline int bitblock_has_bit(SIMD_type v) {
  return !simd_all_true_8(simd_eq_8(v, simd_const_8(0)));
}



#define bitblock_test_bit(blk, n) \
   sisd_to_int(sisd_srli(sisd_slli(blk, ((BLOCKSIZE-1)-(n))), BLOCKSIZE-1))
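
/* Note (added): BLOCKSIZE is not defined in this header; bitblock_test_bit
   assumes the including code defines it as the bit width of SIMD_type
   (presumably 128). */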

#define simd_pack_2(a,b)\
        simd_pack_4(simd_if(simd_himask_2,sisd_srli(a,1),a),\
        simd_if(simd_himask_2,sisd_srli(b,1),b))
#define simd_pack_4(a,b)\
        simd_pack_8(simd_if(simd_himask_4,sisd_srli(a,2),a),\
        simd_if(simd_himask_4,sisd_srli(b,2),b))
#define simd_pack_8(a,b)\
        simd_pack_16(simd_if(simd_himask_8,sisd_srli(a,4),a),\
        simd_if(simd_himask_8,sisd_srli(b,4),b))

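/* Note (added): the macros below follow a half-operand naming convention that
   can be read off from their definitions.  In simd_<op>_<w>_<m1><m2>, each
   modifier describes how the corresponding operand's fields are used:
   x - the field as is, l - only the low half of the field (high half masked
   off), h - the high half of the field shifted down to the low half.
   For example, simd_add_8_hl(a, b) adds the high nybble of each byte of a to
   the low nybble of the corresponding byte of b. */
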
#ifndef simd_add_2_xx
#define simd_add_2_xx(v1, v2) simd_add_2(v1, v2)
#endif

#ifndef simd_add_2_xl
#define simd_add_2_xl(v1, v2) simd_add_2(v1, simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_xh
#define simd_add_2_xh(v1, v2) simd_add_2(v1, simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_lx
#define simd_add_2_lx(v1, v2) simd_add_2(simd_andc(v1, simd_himask_2), v2)
#endif

#ifndef simd_add_2_ll
#define simd_add_2_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_lh
#define simd_add_2_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_hx
#define simd_add_2_hx(v1, v2) simd_add_2(simd_srli_2(v1, 1), v2)
#endif

#ifndef simd_add_2_hl
#define simd_add_2_hl(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_hh
#define simd_add_2_hh(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_4_xx
#define simd_add_4_xx(v1, v2) simd_add_4(v1, v2)
#endif

#ifndef simd_add_4_xl
#define simd_add_4_xl(v1, v2) simd_add_4(v1, simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_xh
#define simd_add_4_xh(v1, v2) simd_add_4(v1, simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_lx
#define simd_add_4_lx(v1, v2) simd_add_4(simd_andc(v1, simd_himask_4), v2)
#endif

#ifndef simd_add_4_ll
#define simd_add_4_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_lh
#define simd_add_4_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_hx
#define simd_add_4_hx(v1, v2) simd_add_4(simd_srli_4(v1, 2), v2)
#endif

#ifndef simd_add_4_hl
#define simd_add_4_hl(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_hh
#define simd_add_4_hh(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_8_xx
#define simd_add_8_xx(v1, v2) simd_add_8(v1, v2)
#endif

#ifndef simd_add_8_xl
#define simd_add_8_xl(v1, v2) simd_add_8(v1, simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_xh
#define simd_add_8_xh(v1, v2) simd_add_8(v1, simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_lx
#define simd_add_8_lx(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), v2)
#endif

#ifndef simd_add_8_ll
#define simd_add_8_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_lh
#define simd_add_8_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_hx
#define simd_add_8_hx(v1, v2) simd_add_8(simd_srli_8(v1, 4), v2)
#endif

#ifndef simd_add_8_hl
#define simd_add_8_hl(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_hh
#define simd_add_8_hh(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_16_xx
#define simd_add_16_xx(v1, v2) simd_add_16(v1, v2)
#endif

#ifndef simd_add_16_xl
#define simd_add_16_xl(v1, v2) simd_add_16(v1, simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_xh
#define simd_add_16_xh(v1, v2) simd_add_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_lx
#define simd_add_16_lx(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), v2)
#endif

#ifndef simd_add_16_ll
#define simd_add_16_ll(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_lh
#define simd_add_16_lh(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_hx
#define simd_add_16_hx(v1, v2) simd_add_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_add_16_hl
#define simd_add_16_hl(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_hh
#define simd_add_16_hh(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_32_xx
#define simd_add_32_xx(v1, v2) simd_add_32(v1, v2)
#endif

#ifndef simd_add_32_xl
#define simd_add_32_xl(v1, v2) simd_add_32(v1, simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_xh
#define simd_add_32_xh(v1, v2) simd_add_32(v1, simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_lx
#define simd_add_32_lx(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), v2)
#endif

#ifndef simd_add_32_ll
#define simd_add_32_ll(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_lh
#define simd_add_32_lh(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_hx
#define simd_add_32_hx(v1, v2) simd_add_32(simd_srli_32(v1, 16), v2)
#endif

#ifndef simd_add_32_hl
#define simd_add_32_hl(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_hh
#define simd_add_32_hh(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_64_xx
#define simd_add_64_xx(v1, v2) simd_add_64(v1, v2)
#endif

#ifndef simd_add_64_xl
#define simd_add_64_xl(v1, v2) simd_add_64(v1, simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_xh
#define simd_add_64_xh(v1, v2) simd_add_64(v1, simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_lx
#define simd_add_64_lx(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), v2)
#endif

#ifndef simd_add_64_ll
#define simd_add_64_ll(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_lh
#define simd_add_64_lh(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_hx
#define simd_add_64_hx(v1, v2) simd_add_64(simd_srli_64(v1, 32), v2)
#endif

#ifndef simd_add_64_hl
#define simd_add_64_hl(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_hh
#define simd_add_64_hh(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_128_xx
#define simd_add_128_xx(v1, v2) simd_add_128(v1, v2)
#endif

#ifndef simd_add_128_xl
#define simd_add_128_xl(v1, v2) simd_add_128(v1, simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_xh
#define simd_add_128_xh(v1, v2) simd_add_128(v1, simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_lx
#define simd_add_128_lx(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), v2)
#endif

#ifndef simd_add_128_ll
#define simd_add_128_ll(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_lh
#define simd_add_128_lh(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_hx
#define simd_add_128_hx(v1, v2) simd_add_128(simd_srli_128(v1, 64), v2)
#endif

#ifndef simd_add_128_hl
#define simd_add_128_hl(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_hh
#define simd_add_128_hh(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_srli_128(v2, 64))
#endif

#ifndef simd_pack_2_xx
#define simd_pack_2_xx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xl
#define simd_pack_2_xl(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xh
#define simd_pack_2_xh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_lx
#define simd_pack_2_lx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_ll
#define simd_pack_2_ll(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_lh
#define simd_pack_2_lh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_hx
#define simd_pack_2_hx(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hl
#define simd_pack_2_hl(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hh
#define simd_pack_2_hh(v1, v2) simd_pack_2(simd_srli_16(v1, 1), simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_4_xx
#define simd_pack_4_xx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xl
#define simd_pack_4_xl(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xh
#define simd_pack_4_xh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_lx
#define simd_pack_4_lx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_ll
#define simd_pack_4_ll(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_lh
#define simd_pack_4_lh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_hx
#define simd_pack_4_hx(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hl
#define simd_pack_4_hl(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hh
#define simd_pack_4_hh(v1, v2) simd_pack_4(simd_srli_16(v1, 2), simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_8_xx
#define simd_pack_8_xx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xl
#define simd_pack_8_xl(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xh
#define simd_pack_8_xh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_lx
#define simd_pack_8_lx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_ll
#define simd_pack_8_ll(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_lh
#define simd_pack_8_lh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_hx
#define simd_pack_8_hx(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hl
#define simd_pack_8_hl(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hh
#define simd_pack_8_hh(v1, v2) simd_pack_8(simd_srli_16(v1, 4), simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_16_xx
#define simd_pack_16_xx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xl
#define simd_pack_16_xl(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xh
#define simd_pack_16_xh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_lx
#define simd_pack_16_lx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_ll
#define simd_pack_16_ll(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_lh
#define simd_pack_16_lh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_hx
#define simd_pack_16_hx(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hl
#define simd_pack_16_hl(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hh
//#define simd_pack_16_hh(v1, v2) simd_pack_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
// The masking performed by simd_pack_16 is unnecessary here: after srli_16
// each 16-bit field is at most 0xFF.  (Note that simd_pack_16_hh is already
// defined unconditionally earlier in this header, so this guarded
// redefinition is normally skipped.)
#define simd_pack_16_hh(v1, v2) _mm_packus_epi16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif


// Splat the first 16-bit int into all positions.
static inline SIMD_type simd_splat_16(SIMD_type x) {
  SIMD_type t = _mm_shufflelo_epi16(x,0);
  return _mm_shuffle_epi32(t,0);
}

// Splat the first 32-bit int into all positions.
static inline SIMD_type simd_splat_32(SIMD_type x) {
  return _mm_shuffle_epi32(x,0);
}
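
// Illustrative example (added): broadcasting a scalar constant.  This
// hypothetical helper loads a 32-bit value into the low field and then
// splats it across the whole register.
static inline SIMD_type splat_32_from_int_demo(int n) {
  return simd_splat_32(sisd_from_int(n));
}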


static inline void print_bit_block(const char * var_name, SIMD_type v) {
  union {SIMD_type vec; unsigned char elems[16];} x;
  x.vec = v;
  unsigned char c;
  int i;
  printf("%20s = ", var_name);
  for (i = 0; i < sizeof(SIMD_type); i++) {
    c = x.elems[i];
    printf("%02X ", c);
  }
  printf("\n");
}

static inline int bitblock_bit_count(SIMD_type v) {
  SIMD_type cts_2 = simd_add_2_lh(v, v);          /* bit counts per 2-bit field */
  SIMD_type cts_4 = simd_add_4_lh(cts_2, cts_2);  /* counts per 4-bit field */
  SIMD_type cts_8 = simd_add_8_lh(cts_4, cts_4);  /* counts per byte */
  SIMD_type cts_64 = _mm_sad_epu8(cts_8, simd_const_8(0));  /* byte sums per 64-bit half */
  /* SIMD_type cts_128 = simd_add_128_lh(cts_64, cts_64) */;
  SIMD_type cts_128 = simd_add_64(cts_64, sisd_srli(cts_64,64));  /* fold the two halves */
  return (int) sisd_to_int(cts_128);
}

#define sb_op(x, n) ((x)>>(n))
#define sf_op(x, n) ((x)<<(n))
#ifdef __GNUC__
#define cfzl __builtin_ctzl
#endif
#ifdef _MSC_VER
#include <intrin.h>
#pragma intrinsic(_BitScanForward)
//  precondition: x > 0
static inline unsigned long cfzl(unsigned long x) {
        unsigned long zeroes;
        _BitScanForward(&zeroes, x);
        return zeroes;
}
#endif

static inline int count_forward_zeroes(SIMD_type bits) {
  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
  v.vec = bits;
  if (v.elems[0] != 0) return cfzl(v.elems[0]);
  else if (v.elems[1] != 0) return LONG_BIT + cfzl(v.elems[1]);
#ifdef _MSC_VER
  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
#endif
#ifndef _MSC_VER
#if (LONG_BIT < 64)
  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
#endif
#endif
  else return 8*sizeof(SIMD_type);
}
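
/* Illustrative example (added): a hypothetical helper combining the block
   predicates above - returns the bit position of the lowest 1 bit in a block,
   or -1 if the block is all zero. */
static inline int first_bit_index_demo(SIMD_type v) {
  if (!bitblock_has_bit(v)) return -1;
  return count_forward_zeroes(v);
}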



#endif