source: proto/Compiler/p-workspace/sse_simd.h @ 444

Last change on this file since 444 was 444, checked in by ksherdy, 9 years ago

Fix syntax error.

File size: 23.1 KB
/*  Idealized SIMD Operations with SSE versions
    Copyright (C) 2006, 2007, 2008, Robert D. Cameron
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters Inc.
       under the Academic Free License version 3.0.
*/
#ifndef SSE_SIMD_H
#define SSE_SIMD_H

/*------------------------------------------------------------*/
#ifndef _MSC_VER
#include <stdint.h>
#endif
#ifdef _MSC_VER
#include "stdint.h"
#define LITTLE_ENDIAN 1234
#define BIG_ENDIAN 4321
#define BYTE_ORDER LITTLE_ENDIAN
#endif
#include <limits.h>
#ifndef LONG_BIT
#define LONG_BIT (8 * sizeof(unsigned long))
#endif
#include <emmintrin.h>
#ifdef USE_LDDQU
#include <pmmintrin.h>
#endif
typedef __m128i SIMD_type;
/*------------------------------------------------------------*/
/* I. SIMD bitwise logical operations */

#define simd_or(b1, b2) _mm_or_si128(b1, b2)
#define simd_and(b1, b2) _mm_and_si128(b1, b2)
#define simd_xor(b1, b2) _mm_xor_si128(b1, b2)
#define simd_andc(b1, b2) _mm_andnot_si128(b2, b1)
#define simd_if(cond, then_val, else_val) \
  simd_or(simd_and(then_val, cond), simd_andc(else_val, cond))
#define simd_not(b) (simd_xor(b, _mm_set1_epi32(0xFFFFFFFF)))
#define simd_nor(a,b) (simd_not(simd_or(a,b)))
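/* Illustrative sketch (added note, not in the original header): simd_andc(b1, b2)
   computes b1 AND NOT b2; note the swapped operands relative to the underlying
   _mm_andnot_si128, which complements its *first* argument.  simd_if is a
   bitwise select: each result bit comes from then_val where cond is 1 and from
   else_val where cond is 0.  A hypothetical helper showing the pattern: */
static inline SIMD_type simd_if_example(SIMD_type cond, SIMD_type then_val, SIMD_type else_val) {
  /* (then_val AND cond) OR (else_val AND NOT cond) */
  return simd_or(simd_and(then_val, cond), simd_andc(else_val, cond));
}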


/*  Specific constants. */
#define simd_himask_2 _mm_set1_epi8(0xAA)
#define simd_himask_4 _mm_set1_epi8(0xCC)
#define simd_himask_8 _mm_set1_epi8(0xF0)
/* Little-endian */
#define simd_himask_16 _mm_set1_epi16(0xFF00)
#define simd_himask_32 _mm_set1_epi32(0xFFFF0000)
#define simd_himask_64 _mm_set_epi32(-1,0,-1,0)
#define simd_himask_128 _mm_set_epi32(-1,-1,0,0)

/* Idealized operations with direct implementation by built-in
   operations for various target architectures. */

#define simd_add_8(a, b) _mm_add_epi8(a, b)
#define simd_add_16(a, b) _mm_add_epi16(a, b)
#define simd_add_32(a, b) _mm_add_epi32(a, b)
#define simd_add_64(a, b) _mm_add_epi64(a, b)
#define simd_sub_8(a, b) _mm_sub_epi8(a, b)
#define simd_sub_16(a, b) _mm_sub_epi16(a, b)
#define simd_sub_32(a, b) _mm_sub_epi32(a, b)
#define simd_sub_64(a, b) _mm_sub_epi64(a, b)
#define simd_mult_16(a, b) _mm_mullo_epi16(a, b)
#define simd_slli_16(r, shft) _mm_slli_epi16(r, shft)
#define simd_srli_16(r, shft) _mm_srli_epi16(r, shft)
#define simd_srai_16(r, shft) _mm_srai_epi16(r, shft)
#define simd_slli_32(r, shft) _mm_slli_epi32(r, shft)
#define simd_srli_32(r, shft) _mm_srli_epi32(r, shft)
#define simd_srai_32(r, shft) _mm_srai_epi32(r, shft)
#define simd_slli_64(r, shft) _mm_slli_epi64(r, shft)
#define simd_srli_64(r, shft) _mm_srli_epi64(r, shft)
#define simd_sll_64(r, shft_reg) _mm_sll_epi64(r, shft_reg)
#define simd_srl_64(r, shft_reg) _mm_srl_epi64(r, shft_reg)
#define simd_pack_16(a, b) \
  _mm_packus_epi16(simd_andc(b, simd_himask_16), simd_andc(a, simd_himask_16))
#define simd_mergeh_8(a, b) _mm_unpackhi_epi8(b, a)
#define simd_mergeh_16(a, b) _mm_unpackhi_epi16(b, a)
#define simd_mergeh_32(a, b) _mm_unpackhi_epi32(b, a)
#define simd_mergeh_64(a, b) _mm_unpackhi_epi64(b, a)
#define simd_mergel_8(a, b) _mm_unpacklo_epi8(b, a)
#define simd_mergel_16(a, b) _mm_unpacklo_epi16(b, a)
#define simd_mergel_32(a, b) _mm_unpacklo_epi32(b, a)
#define simd_mergel_64(a, b) _mm_unpacklo_epi64(b, a)
#define simd_eq_8(a, b) _mm_cmpeq_epi8(a, b)
#define simd_eq_16(a, b) _mm_cmpeq_epi16(a, b)
#define simd_eq_32(a, b) _mm_cmpeq_epi32(a, b)

#define simd_max_8(a, b) _mm_max_epu8(a, b)

#define simd_slli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_slli_64(r, shft), _mm_slli_si128(simd_srli_64(r, 64-(shft)), 8)))

#define simd_srli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_srli_64(r, shft), _mm_srli_si128(simd_slli_64(r, 64-(shft)), 8)))

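/* Illustrative sketch (hypothetical helper, not in the original header): for a
   constant shift count that is neither a multiple of 8 nor at least 64,
   simd_slli_128 combines the per-64-bit-lane shift with the bits that cross
   the lane boundary (moved into place with a byte shift).  For example, a
   whole-block left shift by 3: */
static inline SIMD_type simd_slli_128_by3_example(SIMD_type r) {
  return simd_slli_128(r, 3);
}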
#define simd_sll_128(r, shft) \
   simd_or(simd_sll_64(r, shft), \
           simd_or(_mm_slli_si128(simd_sll_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_slli_si128(simd_srl_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define simd_srl_128(r, shft) \
   simd_or(simd_srl_64(r, shft), \
           simd_or(_mm_srli_si128(simd_srl_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_srli_si128(simd_sll_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define sisd_sll(r, shft) simd_sll_128(r, shft)
#define sisd_srl(r, shft) simd_srl_128(r, shft)
#define sisd_slli(r, shft) simd_slli_128(r, shft)
#define sisd_srli(r, shft) simd_srli_128(r, shft)
#define sisd_add(a, b) simd_add_128(a, b)
#define sisd_sub(a, b) simd_sub_128(a, b)

#define sisd_store_aligned(r, addr) _mm_store_si128(addr, r)
#define sisd_store_unaligned(r, addr) _mm_storeu_si128(addr, r)
#define sisd_load_aligned(addr) _mm_load_si128(addr)
#ifndef USE_LDDQU
#define sisd_load_unaligned(addr) _mm_loadu_si128(addr)
#endif
#ifdef USE_LDDQU
#define sisd_load_unaligned(addr) _mm_lddqu_si128(addr)
#endif



#define simd_const_32(n) _mm_set1_epi32(n)
#define simd_const_16(n) _mm_set1_epi16(n)
#define simd_const_8(n) _mm_set1_epi8(n)
#define simd_const_4(n) _mm_set1_epi8(((n)<<4)|(n))
#define simd_const_2(n) simd_const_4(((n)<<2)|(n))
#define simd_const_1(n) \
  ((n)==0 ? simd_const_8(0) : simd_const_8(-1))

#define simd_pack_16_ll(a, b) simd_pack_16(a, b)
#define simd_pack_16_hh(a, b) \
  simd_pack_16(simd_srli_16(a, 8), simd_srli_16(b, 8))


static inline
SIMD_type simd_add_2(SIMD_type a, SIMD_type b)
{
  SIMD_type c1 = simd_xor(a, b);
  SIMD_type borrow = simd_and(a, b);
  SIMD_type c2 = simd_xor(c1, sisd_slli(borrow, 1));
  return simd_if(simd_himask_2, c2, c1);
}
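/* Added note (not in the original header): simd_add_2 adds independently
   within each 2-bit field using only full-register logic.  c1 is the
   carry-free half add (xor); "borrow" is really the carry-generate term
   (a AND b), which, shifted left by one position, corrects the high bit of
   each field; simd_if then stitches the corrected high bits (c2) together
   with the untouched low bits (c1). */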
#define simd_add_4(a, b)\
        simd_if(simd_himask_8, simd_add_8(simd_and(a,simd_himask_8),simd_and(b,simd_himask_8))\
        ,simd_add_8(simd_andc(a,simd_himask_8),simd_andc(b,simd_himask_8)))

#define simd_srli_2(r, sh)\
        simd_and(simd_srli_32(r,sh),simd_const_2(3>>sh))

#define simd_srli_4(r, sh)\
        simd_and(simd_srli_32(r,sh),simd_const_4(15>>sh))
#define simd_srli_8(r, sh)\
        simd_and(simd_srli_32(r,sh),simd_const_8(255>>sh))

#define simd_slli_2(r, sh)\
        simd_and(simd_slli_32(r,sh),simd_const_2((3<<sh)&3))

#define simd_slli_4(r, sh)\
        simd_and(simd_slli_32(r,sh),simd_const_4((15<<sh)&15))
#define simd_slli_8(r, sh)\
        simd_and(simd_slli_32(r,sh),simd_const_8((255<<sh)&255))




#define simd_mergeh_4(a,b)\
        simd_mergeh_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)),\
        simd_if(simd_himask_8,simd_slli_8(a,4),b))
#define simd_mergel_4(a,b)\
        simd_mergel_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)),\
        simd_if(simd_himask_8,simd_slli_8(a,4),b))
#define simd_mergeh_2(a,b)\
        simd_mergeh_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)),\
        simd_if(simd_himask_4,simd_slli_4(a,2),b))
#define simd_mergel_2(a,b)\
        simd_mergel_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)),\
        simd_if(simd_himask_4,simd_slli_4(a,2),b))
#define simd_mergeh_1(a,b)\
        simd_mergeh_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)),\
        simd_if(simd_himask_2,simd_slli_2(a,1),b))
#define simd_mergel_1(a,b)\
        simd_mergel_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)),\
        simd_if(simd_himask_2,simd_slli_2(a,1),b))

#define sisd_to_int(x) _mm_cvtsi128_si32(x)

#define sisd_from_int(n) _mm_cvtsi32_si128(n)

static inline int simd_all_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) == 0xFFFF;
}

static inline int simd_any_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

static inline int simd_any_sign_bit_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

#define simd_all_eq_8(v1, v2) simd_all_true_8(_mm_cmpeq_epi8(v1, v2))
#define simd_all_le_8(v1, v2) \
  simd_all_eq_8(simd_max_8(v1, v2), v2)

#define simd_all_signed_gt_8(v1, v2) simd_all_true_8(_mm_cmpgt_epi8(v1, v2))

#define simd_cmpgt_8(v1,v2) _mm_cmpgt_epi8(v1, v2)

static inline int bitblock_has_bit(SIMD_type v) {
  return !simd_all_true_8(simd_eq_8(v, simd_const_8(0)));
}



#define bitblock_test_bit(blk, n) \
   sisd_to_int(sisd_srli(sisd_slli(blk, ((BLOCKSIZE-1)-(n))), BLOCKSIZE-1))
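/* Added note (not in the original header): BLOCKSIZE is not defined in this
   file; the including code is expected to define it as the bit width of
   SIMD_type (128 here) before using bitblock_test_bit, which isolates bit n
   of the block and returns it as 0 or 1. */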

#define simd_pack_2(a,b)\
        simd_pack_4(simd_if(simd_himask_2,sisd_srli(a,1),a),\
        simd_if(simd_himask_2,sisd_srli(b,1),b))
#define simd_pack_4(a,b)\
        simd_pack_8(simd_if(simd_himask_4,sisd_srli(a,2),a),\
        simd_if(simd_himask_4,sisd_srli(b,2),b))
#define simd_pack_8(a,b)\
        simd_pack_16(simd_if(simd_himask_8,sisd_srli(a,4),a),\
        simd_if(simd_himask_8,sisd_srli(b,4),b))

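/* Added note (not in the original header): the suffixed forms below follow a
   half-operand modifier convention.  For each operand, _x uses the operand as
   given, _l uses only the low half of each field (high half masked off), and
   _h uses the high half of each field shifted down into the low half; the two
   letters name the treatment of the first and second operand respectively. */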
#ifndef simd_add_2_xx
#define simd_add_2_xx(v1, v2) simd_add_2(v1, v2)
#endif

#ifndef simd_add_2_xl
#define simd_add_2_xl(v1, v2) simd_add_2(v1, simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_xh
#define simd_add_2_xh(v1, v2) simd_add_2(v1, simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_lx
#define simd_add_2_lx(v1, v2) simd_add_2(simd_andc(v1, simd_himask_2), v2)
#endif

#ifndef simd_add_2_ll
#define simd_add_2_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_lh
#define simd_add_2_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_hx
#define simd_add_2_hx(v1, v2) simd_add_2(simd_srli_2(v1, 1), v2)
#endif

#ifndef simd_add_2_hl
#define simd_add_2_hl(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_hh
#define simd_add_2_hh(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_4_xx
#define simd_add_4_xx(v1, v2) simd_add_4(v1, v2)
#endif

#ifndef simd_add_4_xl
#define simd_add_4_xl(v1, v2) simd_add_4(v1, simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_xh
#define simd_add_4_xh(v1, v2) simd_add_4(v1, simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_lx
#define simd_add_4_lx(v1, v2) simd_add_4(simd_andc(v1, simd_himask_4), v2)
#endif

#ifndef simd_add_4_ll
#define simd_add_4_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_lh
#define simd_add_4_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_hx
#define simd_add_4_hx(v1, v2) simd_add_4(simd_srli_4(v1, 2), v2)
#endif

#ifndef simd_add_4_hl
#define simd_add_4_hl(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_hh
#define simd_add_4_hh(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_8_xx
#define simd_add_8_xx(v1, v2) simd_add_8(v1, v2)
#endif

#ifndef simd_add_8_xl
#define simd_add_8_xl(v1, v2) simd_add_8(v1, simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_xh
#define simd_add_8_xh(v1, v2) simd_add_8(v1, simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_lx
#define simd_add_8_lx(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), v2)
#endif

#ifndef simd_add_8_ll
#define simd_add_8_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_lh
#define simd_add_8_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_hx
#define simd_add_8_hx(v1, v2) simd_add_8(simd_srli_8(v1, 4), v2)
#endif

#ifndef simd_add_8_hl
#define simd_add_8_hl(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_hh
#define simd_add_8_hh(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_16_xx
#define simd_add_16_xx(v1, v2) simd_add_16(v1, v2)
#endif

#ifndef simd_add_16_xl
#define simd_add_16_xl(v1, v2) simd_add_16(v1, simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_xh
#define simd_add_16_xh(v1, v2) simd_add_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_lx
#define simd_add_16_lx(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), v2)
#endif

#ifndef simd_add_16_ll
#define simd_add_16_ll(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_lh
#define simd_add_16_lh(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_hx
#define simd_add_16_hx(v1, v2) simd_add_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_add_16_hl
#define simd_add_16_hl(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_hh
#define simd_add_16_hh(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_32_xx
#define simd_add_32_xx(v1, v2) simd_add_32(v1, v2)
#endif

#ifndef simd_add_32_xl
#define simd_add_32_xl(v1, v2) simd_add_32(v1, simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_xh
#define simd_add_32_xh(v1, v2) simd_add_32(v1, simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_lx
#define simd_add_32_lx(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), v2)
#endif

#ifndef simd_add_32_ll
#define simd_add_32_ll(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_lh
#define simd_add_32_lh(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_hx
#define simd_add_32_hx(v1, v2) simd_add_32(simd_srli_32(v1, 16), v2)
#endif

#ifndef simd_add_32_hl
#define simd_add_32_hl(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_hh
#define simd_add_32_hh(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_64_xx
#define simd_add_64_xx(v1, v2) simd_add_64(v1, v2)
#endif

#ifndef simd_add_64_xl
#define simd_add_64_xl(v1, v2) simd_add_64(v1, simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_xh
#define simd_add_64_xh(v1, v2) simd_add_64(v1, simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_lx
#define simd_add_64_lx(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), v2)
#endif

#ifndef simd_add_64_ll
#define simd_add_64_ll(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_lh
#define simd_add_64_lh(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_hx
#define simd_add_64_hx(v1, v2) simd_add_64(simd_srli_64(v1, 32), v2)
#endif

#ifndef simd_add_64_hl
#define simd_add_64_hl(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_hh
#define simd_add_64_hh(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_128_xx
#define simd_add_128_xx(v1, v2) simd_add_128(v1, v2)
#endif

#ifndef simd_add_128_xl
#define simd_add_128_xl(v1, v2) simd_add_128(v1, simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_xh
#define simd_add_128_xh(v1, v2) simd_add_128(v1, simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_lx
#define simd_add_128_lx(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), v2)
#endif

#ifndef simd_add_128_ll
#define simd_add_128_ll(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_lh
#define simd_add_128_lh(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_hx
#define simd_add_128_hx(v1, v2) simd_add_128(simd_srli_128(v1, 64), v2)
#endif

#ifndef simd_add_128_hl
#define simd_add_128_hl(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_hh
#define simd_add_128_hh(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_srli_128(v2, 64))
#endif

#ifndef simd_pack_2_xx
#define simd_pack_2_xx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xl
#define simd_pack_2_xl(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xh
#define simd_pack_2_xh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_lx
#define simd_pack_2_lx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_ll
#define simd_pack_2_ll(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_lh
#define simd_pack_2_lh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_hx
#define simd_pack_2_hx(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hl
#define simd_pack_2_hl(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hh
#define simd_pack_2_hh(v1, v2) simd_pack_2(simd_srli_16(v1, 1), simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_4_xx
#define simd_pack_4_xx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xl
#define simd_pack_4_xl(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xh
#define simd_pack_4_xh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_lx
#define simd_pack_4_lx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_ll
#define simd_pack_4_ll(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_lh
#define simd_pack_4_lh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_hx
#define simd_pack_4_hx(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hl
#define simd_pack_4_hl(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hh
#define simd_pack_4_hh(v1, v2) simd_pack_4(simd_srli_16(v1, 2), simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_8_xx
#define simd_pack_8_xx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xl
#define simd_pack_8_xl(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xh
#define simd_pack_8_xh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_lx
#define simd_pack_8_lx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_ll
#define simd_pack_8_ll(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_lh
#define simd_pack_8_lh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_hx
#define simd_pack_8_hx(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hl
#define simd_pack_8_hl(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hh
#define simd_pack_8_hh(v1, v2) simd_pack_8(simd_srli_16(v1, 4), simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_16_xx
#define simd_pack_16_xx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xl
#define simd_pack_16_xl(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xh
#define simd_pack_16_xh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_lx
#define simd_pack_16_lx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_ll
#define simd_pack_16_ll(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_lh
#define simd_pack_16_lh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_hx
#define simd_pack_16_hx(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hl
#define simd_pack_16_hl(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hh
//#define simd_pack_16_hh(v1, v2) simd_pack_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
// Masking performed by simd_pack_16 is unnecessary here.
#define simd_pack_16_hh(v1, v2) _mm_packus_epi16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif


// Splat the first 16-bit int into all positions.
static inline SIMD_type simd_splat_16(SIMD_type x) {
  SIMD_type t = _mm_shufflelo_epi16(x,0);
  return _mm_shuffle_epi32(t,0);
}

// Splat the first 32-bit int into all positions.
static inline SIMD_type simd_splat_32(SIMD_type x) {
  return _mm_shuffle_epi32(x,0);
}

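// Illustrative usage (hypothetical helper, not in the original header):
// broadcast a scalar value into every 32-bit field by moving it into the
// low lane with sisd_from_int and then splatting it across the register.
static inline SIMD_type simd_fill_32_example(int n) {
  return simd_splat_32(sisd_from_int(n));
}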

#include <stdio.h>   /* printf, used by print_bit_block below */

static inline void print_bit_block(const char * var_name, SIMD_type v) {
  union {SIMD_type vec; unsigned char elems[sizeof(SIMD_type)];} x;
  x.vec = v;
  unsigned char c;
  int i;
  printf("%20s = ", var_name);
  for (i = 0; i < (int) sizeof(SIMD_type); i++) {
    c = x.elems[i];
    printf("%02X ", c);
  }
  printf("\n");
}

static inline int bitblock_bit_count(SIMD_type v) {
  SIMD_type cts_2 = simd_add_2_lh(v, v);
  SIMD_type cts_4 = simd_add_4_lh(cts_2, cts_2);
  SIMD_type cts_8 = simd_add_8_lh(cts_4, cts_4);
  SIMD_type cts_64 = _mm_sad_epu8(cts_8, simd_const_8(0));
  /* SIMD_type cts_128 = simd_add_128_lh(cts_64, cts_64); */
  SIMD_type cts_128 = simd_add_64(cts_64, sisd_srli(cts_64,64));
  return (int) sisd_to_int(cts_128);
}

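/* Added note (not in the original header): bitblock_bit_count totals the set
   bits of the block by progressive field doubling -- per-2-bit counts, then
   per-4-bit, then per-8-bit -- after which _mm_sad_epu8 sums the bytes of
   each 64-bit half and the two halves are added together. */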
#define sb_op(x, n) ((x)>>(n))
#define sf_op(x, n) ((x)<<(n))
#ifdef __GNUC__
#define cfzl __builtin_ctzl
#endif
#ifdef _MSC_VER
#include <intrin.h>
#pragma intrinsic(_BitScanForward)
//  precondition: x > 0
static inline unsigned long cfzl(unsigned long x) {
        unsigned long zeroes;
        _BitScanForward(&zeroes, x);
        return zeroes;
}
#endif

static inline int count_forward_zeroes(SIMD_type bits) {
  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
  v.vec = bits;
  if (v.elems[0] != 0) return cfzl(v.elems[0]);
  else if (v.elems[1] != 0) return LONG_BIT + cfzl(v.elems[1]);
#ifdef _MSC_VER
  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
#endif
#ifndef _MSC_VER
#if LONG_BIT < 64
  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
#endif
#endif
  else return 8*sizeof(SIMD_type);
}


#define double_int64_adc(x1, x2, y1, y2, rslt1, rslt2, carry) \
  __asm__  ("sahf\n\t" \
        "adc %[e1], %[z1]\n\t" \
        "adc %[e2], %[z2]\n\t" \
        "lahf\n\t" \
     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
         : "[z1]" (x1), "[z2]" (x2), \
           [e1] "r" (y1), [e2] "r" (y2), \
           "[carryflag]" (carry) \
         : "cc")

#define adc128(first, second, carry, sum) \
do\
{\
  union {__m128i bitblock;\
         uint64_t int64[2];} rslt;\
\
  union {__m128i bitblock;\
         uint64_t int64[2];} x;\
\
  union {__m128i bitblock;\
         uint64_t int64[2];} y;\
\
  x.bitblock = first;\
  y.bitblock = second;\
\
  double_int64_adc(x.int64[0], x.int64[1], y.int64[0], y.int64[1], rslt.int64[0], rslt.int64[1], carry);\
  sum = rslt.bitblock;\
}while(0)



#define double_int64_sbb(x1, x2, y1, y2, rslt1, rslt2, carry) \
  __asm__  ("sahf\n\t" \
        "sbb %[e1], %[z1]\n\t" \
        "sbb %[e2], %[z2]\n\t" \
        "lahf\n\t" \
     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
         : "[z1]" (x1), "[z2]" (x2), \
           [e1] "r" (y1), [e2] "r" (y2), \
           "[carryflag]" (carry) \
         : "cc")

#define sbb128(first, second, carry, sum) \
do\
{ union {__m128i bitblock;\
         uint64_t int64[2];} rslt;\
\
  union {__m128i bitblock;\
         uint64_t int64[2];} x;\
\
  union {__m128i bitblock;\
         uint64_t int64[2];} y;\
\
  x.bitblock = first;\
  y.bitblock = second;\
\
  double_int64_sbb(x.int64[0], x.int64[1], y.int64[0], y.int64[1], \
                   rslt.int64[0], rslt.int64[1], carry);\
  sum = rslt.bitblock;\
}while(0)



#define adc128_simd(x, y, carry, sum) \
do{ \
  SIMD_type gen = simd_and(x, y); \
  SIMD_type prop = simd_or(x, y); \
  SIMD_type partial = simd_add_64(simd_add_64(x, y), *carry); \
  SIMD_type c1 = sisd_slli(simd_srli_64(simd_or(gen, simd_andc(prop, partial)), 63), 64); \
  *sum = simd_add_64(c1, partial); \
  *carry = sisd_srli(simd_or(gen, simd_andc(prop, *sum)), 127); \
} while(0)


#define sbb128_simd(x, y, borrow, difference) \
do {\
  SIMD_type gen = simd_andc(y, x); \
  SIMD_type prop = simd_not(simd_xor(x, y)); \
  SIMD_type partial = simd_sub_64(simd_sub_64(x, y), *borrow); \
  SIMD_type b1 = sisd_slli(simd_srli_64(simd_or(gen, simd_and(prop, partial)), 63), 64); \
  *difference = simd_sub_64(partial, b1); \
  *borrow = sisd_srli(simd_or(gen, simd_and(prop, *difference)), 127); \
}while(0)
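/* Added note (not in the original header): adc128_simd and sbb128_simd carry
   out a full 128-bit add-with-carry / subtract-with-borrow using only 64-bit
   lane arithmetic.  gen and prop are the usual carry generate and propagate
   terms; c1 (or b1) re-injects the carry that crosses from the low 64-bit
   lane into the high lane, and the final shift right by 127 extracts the
   block's outgoing carry or borrow as a single bit. */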


#define advance_with_carry(cursor, carry, rslt)\
do{\
  SIMD_type shift_out = simd_srli_64(cursor, 63);\
  SIMD_type low_bits = simd_mergel_64(shift_out, carry);\
  carry = sisd_srli(shift_out, 64);\
  rslt = simd_or(simd_add_64(cursor, cursor), low_bits);\
}while(0)

#endif