source: trunk/lib/sse_simd.h @ 1584

Last change on this file since 1584 was 1228, checked in by vla24, 8 years ago

Integrated symbol table with xmlwf. There are various implementations for the symbol table; please read /proto/SymbolTable/README_SymbolTable for more information.

File size: 21.5 KB
/*  Idealized SIMD Operations with SSE versions
    Copyright (C) 2006, 2007, 2008, Robert D. Cameron
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters Inc.
       under the Academic Free License version 3.0.
*/
#ifndef SSE_SIMD_H
#define SSE_SIMD_H

#include <stdio.h>

/*------------------------------------------------------------*/

#include <stdint.h>

#ifdef _MSC_VER
#define LITTLE_ENDIAN 1234
#define BIG_ENDIAN 4321
#define BYTE_ORDER LITTLE_ENDIAN
#endif

#include <emmintrin.h>
#ifdef USE_LDDQU
#include <pmmintrin.h>
#endif
#ifdef USE_PTEST
#include <smmintrin.h>
#endif
typedef __m128i SIMD_type;
/*------------------------------------------------------------*/

/* Prints the SIMD register representation of a SIMD value. */
static void print_simd_register(const char * var_name, SIMD_type v);

/* I. SIMD bitwise logical operations */

#define simd_or(b1, b2) _mm_or_si128(b1, b2)
#define simd_and(b1, b2) _mm_and_si128(b1, b2)
#define simd_xor(b1, b2) _mm_xor_si128(b1, b2)
#define simd_andc(b1, b2) _mm_andnot_si128(b2, b1)
#define simd_if(cond, then_val, else_val) \
  simd_or(simd_and(then_val, cond), simd_andc(else_val, cond))
#define simd_not(b) (simd_xor(b, _mm_set1_epi32(0xFFFFFFFF)))
#define simd_nor(a,b) (simd_not(simd_or(a,b)))
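
/* Usage note (illustrative, added): simd_if(cond, t, e) is a bitwise select;
   result bits where cond is 1 come from t, bits where cond is 0 come from e.
   Note the operand order of simd_andc(b1, b2) = b1 & ~b2.  For example,

       SIMD_type mixed = simd_if(simd_himask_16, a, b);

   takes the high byte of each 16-bit field from a and the low byte from b. */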


/*  Specific constants. */
#define sisd_low_bit_mask  _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000001)
#define sisd_high_bit_mask _mm_set_epi32(0x80000000, 0x00000000, 0x00000000, 0x00000000)

#define simd_himask_2 _mm_set1_epi32(0xAAAAAAAA)
#define simd_himask_4 _mm_set1_epi32(0xCCCCCCCC)
#define simd_himask_8 _mm_set1_epi32(0xF0F0F0F0)
/* Little-endian */
#define simd_himask_16 _mm_set1_epi32(0xFF00FF00)
#define simd_himask_32 _mm_set1_epi32(0xFFFF0000)
#define simd_himask_64 _mm_set_epi32(-1,0,-1,0)
#define simd_himask_128 _mm_set_epi32(-1,-1,0,0)

#define simd_lomask_128 _mm_set_epi32(0,0,-1,-1)
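
/* Note (descriptive comment added): each simd_himask_n constant has the high
   n/2 bits of every n-bit field set; simd_lomask_128 is the 128-bit
   complement.  These masks are used below to split fields into half-size
   operands. */
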
/* Idealized operations with direct implementation by built-in
   operations for various target architectures. */

#define simd_add_8(a, b) _mm_add_epi8(a, b)
#define simd_add_16(a, b) _mm_add_epi16(a, b)
#define simd_add_32(a, b) _mm_add_epi32(a, b)
#define simd_add_64(a, b) _mm_add_epi64(a, b)
#define simd_sub_8(a, b) _mm_sub_epi8(a, b)
#define simd_sub_16(a, b) _mm_sub_epi16(a, b)
#define simd_sub_32(a, b) _mm_sub_epi32(a, b)
#define simd_sub_64(a, b) _mm_sub_epi64(a, b)
#define simd_mult_16(a, b) _mm_mullo_epi16(a, b)
#define simd_slli_16(r, shft) _mm_slli_epi16(r, shft)
#define simd_srli_16(r, shft) _mm_srli_epi16(r, shft)
#define simd_srai_16(r, shft) _mm_srai_epi16(r, shft)
#define simd_slli_32(r, shft) _mm_slli_epi32(r, shft)
#define simd_srli_32(r, shft) _mm_srli_epi32(r, shft)
#define simd_srai_32(r, shft) _mm_srai_epi32(r, shft)
#define simd_slli_64(r, shft) _mm_slli_epi64(r, shft)
#define simd_srli_64(r, shft) _mm_srli_epi64(r, shft)
#define simd_sll_64(r, shft_reg) _mm_sll_epi64(r, shft_reg)
#define simd_srl_64(r, shft_reg) _mm_srl_epi64(r, shft_reg)
#define simd_packus_16(a, b) _mm_packus_epi16(b, a)
#define simd_pack_16(a, b) \
  _mm_packus_epi16(simd_andc(b, simd_himask_16), simd_andc(a, simd_himask_16))
#define simd_mergeh_8(a, b) _mm_unpackhi_epi8(b, a)
#define simd_mergeh_16(a, b) _mm_unpackhi_epi16(b, a)
#define simd_mergeh_32(a, b) _mm_unpackhi_epi32(b, a)
#define simd_mergeh_64(a, b) _mm_unpackhi_epi64(b, a)
#define simd_mergel_8(a, b) _mm_unpacklo_epi8(b, a)
#define simd_mergel_16(a, b) _mm_unpacklo_epi16(b, a)
#define simd_mergel_32(a, b) _mm_unpacklo_epi32(b, a)
#define simd_mergel_64(a, b) _mm_unpacklo_epi64(b, a)
#define simd_eq_8(a, b) _mm_cmpeq_epi8(a, b)
#define simd_eq_16(a, b) _mm_cmpeq_epi16(a, b)
#define simd_eq_32(a, b) _mm_cmpeq_epi32(a, b)

#define simd_max_8(a, b) _mm_max_epu8(a, b)

#define simd_slli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_slli_64(r, shft), _mm_slli_si128(simd_srli_64(r, 64-(shft)), 8)))

#define simd_srli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_srli_64(r, shft), _mm_srli_si128(simd_slli_64(r, 64-(shft)), 8)))
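
/* Explanatory note (added): the 128-bit shifts above are composed from the
   64-bit lane shifts, since SSE2 has no full-register bit shift.  A shift
   amount that is a multiple of 8 needs only a byte shift (_mm_slli_si128 /
   _mm_srli_si128).  Otherwise the bits that cross the 64-bit lane boundary
   are recovered with the opposite shift and repositioned with a byte shift;
   e.g. simd_slli_128(r, 4) expands to

     simd_or(simd_slli_64(r, 4), _mm_slli_si128(simd_srli_64(r, 60), 8))
*/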

#define simd_sll_128(r, shft) \
   simd_or(simd_sll_64(r, shft), \
           simd_or(_mm_slli_si128(simd_sll_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_slli_si128(simd_srl_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define simd_srl_128(r, shft) \
   simd_or(simd_srl_64(r, shft), \
           simd_or(_mm_srli_si128(simd_srl_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                   _mm_srli_si128(simd_sll_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define sisd_sll(r, shft) simd_sll_128(r, shft)
#define sisd_srl(r, shft) simd_srl_128(r, shft)
#define sisd_slli(r, shft) simd_slli_128(r, shft)
#define sisd_srli(r, shft) simd_srli_128(r, shft)
#define sisd_add(a, b) simd_add_128(a, b)
#define sisd_sub(a, b) simd_sub_128(a, b)

#define sisd_store_aligned(r, addr) _mm_store_si128(addr, r)
#define sisd_store_unaligned(r, addr) _mm_storeu_si128(addr, r)
#define sisd_load_aligned(addr) _mm_load_si128(addr)
#ifndef USE_LDDQU
#define sisd_load_unaligned(addr) _mm_loadu_si128(addr)
#endif
#ifdef USE_LDDQU
#define sisd_load_unaligned(addr) _mm_lddqu_si128(addr)
#endif



#define simd_const_32(n) _mm_set1_epi32(n)
#define simd_const_16(n) _mm_set1_epi16(n)
#define simd_const_8(n) _mm_set1_epi8(n)
#define simd_const_4(n) _mm_set1_epi8((n)<<4|(n))
#define simd_const_2(n) simd_const_4((n)<<2|(n))
#define simd_const_1(n) \
  (n==0 ? simd_const_8(0): simd_const_8(-1))
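
/* Examples (added for clarity): simd_const_4(1) replicates the nibble 0x1
   into every nibble, i.e. _mm_set1_epi8(0x11); simd_const_2(1) expands to
   simd_const_4(5), i.e. _mm_set1_epi8(0x55); simd_const_1(n) yields all
   zeroes or all ones. */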


static inline
SIMD_type simd_add_2(SIMD_type a, SIMD_type b)
{
  SIMD_type c1 = simd_xor(a, b);
  SIMD_type borrow = simd_and(a, b);
  SIMD_type c2 = simd_xor(c1, sisd_slli(borrow, 1));
  return simd_if(simd_himask_2, c2, c1);
}
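
/* Worked example (added): within one 2-bit field, adding 1 + 1 gives
   c1 = a ^ b = 00, borrow = a & b = 01, c2 = c1 ^ (borrow << 1) = 10.
   simd_if(simd_himask_2, c2, c1) takes the high bit from c2 and the low bit
   from c1, giving 10 = 2.  Any borrow bit shifted across a field boundary
   lands in a low-bit position, which is taken from c1, so fields do not
   interfere. */
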
#define simd_add_4(a, b)\
        simd_if(simd_himask_8, simd_add_8(simd_and(a,simd_himask_8),simd_and(b,simd_himask_8))\
        ,simd_add_8(simd_andc(a,simd_himask_8),simd_andc(b,simd_himask_8)))

#define simd_srli_2(r, sh)\
         simd_and(simd_srli_32(r,sh),simd_const_2(3>>sh))

#define simd_srli_4(r, sh)\
         simd_and(simd_srli_32(r,sh),simd_const_4(15>>sh))
#define simd_srli_8(r, sh)\
         simd_and(simd_srli_32(r,sh),simd_const_8(255>>sh))

#define simd_slli_2(r, sh)\
         simd_and(simd_slli_32(r,sh),simd_const_2((3<<sh)&3))

#define simd_slli_4(r, sh)\
         simd_and(simd_slli_32(r,sh),simd_const_4((15<<sh)&15))
#define simd_slli_8(r, sh)\
         simd_and(simd_slli_32(r,sh),simd_const_8((255<<sh) &255))




#define simd_mergeh_4(a,b)\
        simd_mergeh_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)),\
        simd_if(simd_himask_8,simd_slli_8(a,4),b))
#define simd_mergel_4(a,b)\
        simd_mergel_8(simd_if(simd_himask_8,a,simd_srli_8(b,4)),\
        simd_if(simd_himask_8,simd_slli_8(a,4),b))
#define simd_mergeh_2(a,b)\
        simd_mergeh_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)),\
        simd_if(simd_himask_4,simd_slli_4(a,2),b))
#define simd_mergel_2(a,b)\
        simd_mergel_4(simd_if(simd_himask_4,a,simd_srli_4(b,2)),\
        simd_if(simd_himask_4,simd_slli_4(a,2),b))
#define simd_mergeh_1(a,b)\
        simd_mergeh_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)),\
        simd_if(simd_himask_2,simd_slli_2(a,1),b))
#define simd_mergel_1(a,b)\
        simd_mergel_2(simd_if(simd_himask_2,a,simd_srli_2(b,1)),\
        simd_if(simd_himask_2,simd_slli_2(a,1),b))
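
/* Note (added): the sub-byte merge operations above synthesize 4-, 2- and
   1-bit interleaving from the byte-level unpack: each operand's fields are
   first repositioned within their bytes (using simd_if with the half-field
   shifts) so that the merge at the next larger width produces the
   interleaved ordering. */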

#define sisd_to_int(x) _mm_cvtsi128_si32(x)

#define sisd_from_int(n) _mm_cvtsi32_si128(n)

static inline int simd_all_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) == 0xFFFF;
}

static inline int simd_any_true_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

static inline int simd_any_sign_bit_8(SIMD_type v) {
  return _mm_movemask_epi8(v) != 0;
}

#define simd_movemask_8(v) _mm_movemask_epi8(v)

#define simd_all_eq_8(v1, v2) simd_all_true_8(_mm_cmpeq_epi8(v1, v2))
#define simd_all_le_8(v1, v2) \
  simd_all_eq_8(simd_max_8(v1, v2), v2)

#define simd_all_signed_gt_8(v1, v2) simd_all_true_8(_mm_cmpgt_epi8(v1, v2))

#define simd_cmpgt_8(v1,v2) _mm_cmpgt_epi8(v1, v2)
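
/* Usage example (added): _mm_movemask_epi8 gathers the sign bit of each of
   the 16 bytes into a 16-bit integer, so a value in which every byte is 0xFF
   (e.g. the result of a byte compare) yields 0xFFFF.  For instance,

       if (simd_all_eq_8(v, simd_const_8(0))) { ... }   // v is entirely zero
*/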


//static inline int bitblock_has_bit(SIMD_type v) {
//#ifndef USE_PTEST
//  return !simd_all_true_8(simd_eq_8(v, simd_const_8(0)));
//#endif
//#ifdef USE_PTEST
//  return !_mm_testz_si128(v,v);
//#endif
//}



#define bitblock_test_bit(blk, n) \
   sisd_to_int(sisd_srli(sisd_slli(blk, ((BLOCKSIZE-1)-(n))), BLOCKSIZE-1))
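
/* Note (added): BLOCKSIZE is not defined in this header; it is assumed to be
   supplied by the including code (128 for these 128-bit SSE definitions).
   bitblock_test_bit isolates bit n of the block by shifting it to the top of
   the register and back down to bit 0. */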

#define simd_pack_2(a,b)\
        simd_pack_4(simd_if(simd_himask_2,sisd_srli(a,1),a),\
        simd_if(simd_himask_2,sisd_srli(b,1),b))
#define simd_pack_4(a,b)\
        simd_pack_8(simd_if(simd_himask_4,sisd_srli(a,2),a),\
        simd_if(simd_himask_4,sisd_srli(b,2),b))
#define simd_pack_8(a,b)\
        simd_pack_16(simd_if(simd_himask_8,sisd_srli(a,4),a),\
        simd_if(simd_himask_8,sisd_srli(b,4),b))

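/* Note on the half-operand modifier macros that follow (added for clarity):
   the two-letter suffix describes how each operand's fields are used before
   the operation: x = the full field as given, l = only the low half (the
   high half masked off), h = the high half shifted down into the low half.
   For example, simd_add_8_hl(v1, v2) adds the high nibble of each byte of v1
   (shifted down) to the low nibble of the corresponding byte of v2.  In the
   _ll, _lh, _hl and _hh forms at widths 2 and 4, the half-size operands
   cannot produce a carry out of the field, so byte addition (simd_add_8)
   suffices. */
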
#ifndef simd_add_2_xx
#define simd_add_2_xx(v1, v2) simd_add_2(v1, v2)
#endif

#ifndef simd_add_2_xl
#define simd_add_2_xl(v1, v2) simd_add_2(v1, simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_xh
#define simd_add_2_xh(v1, v2) simd_add_2(v1, simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_lx
#define simd_add_2_lx(v1, v2) simd_add_2(simd_andc(v1, simd_himask_2), v2)
#endif

#ifndef simd_add_2_ll
#define simd_add_2_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_lh
#define simd_add_2_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_2_hx
#define simd_add_2_hx(v1, v2) simd_add_2(simd_srli_2(v1, 1), v2)
#endif

#ifndef simd_add_2_hl
#define simd_add_2_hl(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_andc(v2, simd_himask_2))
#endif

#ifndef simd_add_2_hh
#define simd_add_2_hh(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_4_xx
#define simd_add_4_xx(v1, v2) simd_add_4(v1, v2)
#endif

#ifndef simd_add_4_xl
#define simd_add_4_xl(v1, v2) simd_add_4(v1, simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_xh
#define simd_add_4_xh(v1, v2) simd_add_4(v1, simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_lx
#define simd_add_4_lx(v1, v2) simd_add_4(simd_andc(v1, simd_himask_4), v2)
#endif

#ifndef simd_add_4_ll
#define simd_add_4_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_lh
#define simd_add_4_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_4_hx
#define simd_add_4_hx(v1, v2) simd_add_4(simd_srli_4(v1, 2), v2)
#endif

#ifndef simd_add_4_hl
#define simd_add_4_hl(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_andc(v2, simd_himask_4))
#endif

#ifndef simd_add_4_hh
#define simd_add_4_hh(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_8_xx
#define simd_add_8_xx(v1, v2) simd_add_8(v1, v2)
#endif

#ifndef simd_add_8_xl
#define simd_add_8_xl(v1, v2) simd_add_8(v1, simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_xh
#define simd_add_8_xh(v1, v2) simd_add_8(v1, simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_lx
#define simd_add_8_lx(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), v2)
#endif

#ifndef simd_add_8_ll
#define simd_add_8_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_lh
#define simd_add_8_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_8_hx
#define simd_add_8_hx(v1, v2) simd_add_8(simd_srli_8(v1, 4), v2)
#endif

#ifndef simd_add_8_hl
#define simd_add_8_hl(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_andc(v2, simd_himask_8))
#endif

#ifndef simd_add_8_hh
#define simd_add_8_hh(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_16_xx
#define simd_add_16_xx(v1, v2) simd_add_16(v1, v2)
#endif

#ifndef simd_add_16_xl
#define simd_add_16_xl(v1, v2) simd_add_16(v1, simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_xh
#define simd_add_16_xh(v1, v2) simd_add_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_lx
#define simd_add_16_lx(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), v2)
#endif

#ifndef simd_add_16_ll
#define simd_add_16_ll(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_lh
#define simd_add_16_lh(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_16_hx
#define simd_add_16_hx(v1, v2) simd_add_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_add_16_hl
#define simd_add_16_hl(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_andc(v2, simd_himask_16))
#endif

#ifndef simd_add_16_hh
#define simd_add_16_hh(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_32_xx
#define simd_add_32_xx(v1, v2) simd_add_32(v1, v2)
#endif

#ifndef simd_add_32_xl
#define simd_add_32_xl(v1, v2) simd_add_32(v1, simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_xh
#define simd_add_32_xh(v1, v2) simd_add_32(v1, simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_lx
#define simd_add_32_lx(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), v2)
#endif

#ifndef simd_add_32_ll
#define simd_add_32_ll(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_lh
#define simd_add_32_lh(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_32_hx
#define simd_add_32_hx(v1, v2) simd_add_32(simd_srli_32(v1, 16), v2)
#endif

#ifndef simd_add_32_hl
#define simd_add_32_hl(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_andc(v2, simd_himask_32))
#endif

#ifndef simd_add_32_hh
#define simd_add_32_hh(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_64_xx
#define simd_add_64_xx(v1, v2) simd_add_64(v1, v2)
#endif

#ifndef simd_add_64_xl
#define simd_add_64_xl(v1, v2) simd_add_64(v1, simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_xh
#define simd_add_64_xh(v1, v2) simd_add_64(v1, simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_lx
#define simd_add_64_lx(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), v2)
#endif

#ifndef simd_add_64_ll
#define simd_add_64_ll(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_lh
#define simd_add_64_lh(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_64_hx
#define simd_add_64_hx(v1, v2) simd_add_64(simd_srli_64(v1, 32), v2)
#endif

#ifndef simd_add_64_hl
#define simd_add_64_hl(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_andc(v2, simd_himask_64))
#endif

#ifndef simd_add_64_hh
#define simd_add_64_hh(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_128_xx
#define simd_add_128_xx(v1, v2) simd_add_128(v1, v2)
#endif

#ifndef simd_add_128_xl
#define simd_add_128_xl(v1, v2) simd_add_128(v1, simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_xh
#define simd_add_128_xh(v1, v2) simd_add_128(v1, simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_lx
#define simd_add_128_lx(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), v2)
#endif

#ifndef simd_add_128_ll
#define simd_add_128_ll(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_lh
#define simd_add_128_lh(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_srli_128(v2, 64))
#endif

#ifndef simd_add_128_hx
#define simd_add_128_hx(v1, v2) simd_add_128(simd_srli_128(v1, 64), v2)
#endif

#ifndef simd_add_128_hl
#define simd_add_128_hl(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_andc(v2, simd_himask_128))
#endif

#ifndef simd_add_128_hh
#define simd_add_128_hh(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_srli_128(v2, 64))
#endif

static inline SIMD_type simd_add_128(SIMD_type v1, SIMD_type v2) {
  SIMD_type temp = simd_add_64(v1, v2);
  SIMD_type carry_mask = simd_or(simd_and(v1, v2), simd_and(simd_xor(v1, v2), simd_not(temp)));
  SIMD_type carry = sisd_slli(simd_and(carry_mask, simd_lomask_128), 1);
  return simd_if(simd_lomask_128, temp, simd_add_64(temp, carry));
}
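
/* Note (added): simd_add_128 forms the two independent 64-bit sums, computes
   the per-bit carry-out mask (majority of v1, v2 and the complemented sum),
   and shifts the carry out of bit 63 of the low half into bit 0 of the high
   half, where it is added in.  The low 64 bits of the result are taken from
   the plain 64-bit sum. */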

#ifndef simd_pack_2_xx
#define simd_pack_2_xx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xl
#define simd_pack_2_xl(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_xh
#define simd_pack_2_xh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_lx
#define simd_pack_2_lx(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_ll
#define simd_pack_2_ll(v1, v2) simd_pack_2(v1, v2)
#endif

#ifndef simd_pack_2_lh
#define simd_pack_2_lh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_2_hx
#define simd_pack_2_hx(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hl
#define simd_pack_2_hl(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif

#ifndef simd_pack_2_hh
#define simd_pack_2_hh(v1, v2) simd_pack_2(simd_srli_16(v1, 1), simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_4_xx
#define simd_pack_4_xx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xl
#define simd_pack_4_xl(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_xh
#define simd_pack_4_xh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_lx
#define simd_pack_4_lx(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_ll
#define simd_pack_4_ll(v1, v2) simd_pack_4(v1, v2)
#endif

#ifndef simd_pack_4_lh
#define simd_pack_4_lh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_4_hx
#define simd_pack_4_hx(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hl
#define simd_pack_4_hl(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif

#ifndef simd_pack_4_hh
#define simd_pack_4_hh(v1, v2) simd_pack_4(simd_srli_16(v1, 2), simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_8_xx
#define simd_pack_8_xx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xl
#define simd_pack_8_xl(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_xh
#define simd_pack_8_xh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_lx
#define simd_pack_8_lx(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_ll
#define simd_pack_8_ll(v1, v2) simd_pack_8(v1, v2)
#endif

#ifndef simd_pack_8_lh
#define simd_pack_8_lh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_8_hx
#define simd_pack_8_hx(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hl
#define simd_pack_8_hl(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif

#ifndef simd_pack_8_hh
#define simd_pack_8_hh(v1, v2) simd_pack_8(simd_srli_16(v1, 4), simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_16_xx
#define simd_pack_16_xx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xl
#define simd_pack_16_xl(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_xh
#define simd_pack_16_xh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_lx
#define simd_pack_16_lx(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_ll
#define simd_pack_16_ll(v1, v2) simd_pack_16(v1, v2)
#endif

#ifndef simd_pack_16_lh
#define simd_pack_16_lh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif

#ifndef simd_pack_16_hx
#define simd_pack_16_hx(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hl
#define simd_pack_16_hl(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif

#ifndef simd_pack_16_hh
//#define simd_pack_16_hh(v1, v2) simd_pack_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
// Masking performed by simd_pack_16 is unnecessary here, since the shifted
// operands already have their high bytes cleared.
#define simd_pack_16_hh(v1, v2) simd_packus_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif

// Splat the first 16-bit int into all positions.
static inline SIMD_type simd_splat_16(SIMD_type x) {
  SIMD_type t = _mm_shufflelo_epi16(x,0);
  return _mm_shuffle_epi32(t,0);
}

// Splat the first 32-bit int into all positions.
static inline SIMD_type simd_splat_32(SIMD_type x) {
  return _mm_shuffle_epi32(x,0);
}

//static inline int bitblock_bit_count(SIMD_type v) {
//  int bit_count = 0;
//  SIMD_type cts_2 = simd_add_2_lh(v, v);
//  SIMD_type cts_4 = simd_add_4_lh(cts_2, cts_2);
//  SIMD_type cts_8 = simd_add_8_lh(cts_4, cts_4);
//  SIMD_type cts_64 = _mm_sad_epu8(cts_8, simd_const_8(0));
//  /* SIMD_type cts_128 = simd_add_128_lh(cts_64, cts_64) */;
//  SIMD_type cts_128 = simd_add_64(cts_64, sisd_srli(cts_64,64));
//  return (int) sisd_to_int(cts_128);
//}

/*
   Returns the parallel-prefix XOR (parity) mask computed from the arguments
   parity_1 and bitblock_parity_mask.

   parity_1   - parity of 1-bit fields (the block itself).
   parity_2   - parity of 2-bit fields.
   ...
   parity_128 - parity of 128-bit fields (register width).
   (parity_2 through parity_128 are intermediate values computed below.)

   bitblock_parity_mask - a mask of all 1's or all 0's.  Under the
   block-by-block processing model, all 1's means odd parity in the preceding
   block(s) and all 0's means even parity.
*/
static inline SIMD_type bitblock_parallel_prefix_parity(SIMD_type parity_1, SIMD_type bitblock_parity_mask) {

  SIMD_type parity_2 = simd_xor(parity_1, sisd_slli(parity_1,1));
  SIMD_type parity_4 = simd_xor(parity_2, sisd_slli(parity_2,2));
  SIMD_type parity_8 = simd_xor(parity_4, sisd_slli(parity_4,4));
  SIMD_type parity_16 = simd_xor(parity_8, sisd_slli(parity_8,8));
  SIMD_type parity_32 = simd_xor(parity_16, sisd_slli(parity_16,16));
  SIMD_type parity_64 = simd_xor(parity_32, sisd_slli(parity_32,32));
  SIMD_type parity_128 = simd_xor(parity_64, sisd_slli(parity_64,64));

  /*
  print_simd_register("parity_1",parity_1);
  print_simd_register("parity_2",parity_2);
  print_simd_register("parity_4",parity_4);
  print_simd_register("parity_8",parity_8);
  print_simd_register("parity_16",parity_16);
  print_simd_register("parity_32",parity_32);
  print_simd_register("parity_64",parity_64);
  print_simd_register("parity_128",parity_128);
  print_simd_register("bitblock_parity_mask",bitblock_parity_mask);
  */

  return simd_xor(parity_128, bitblock_parity_mask);
}
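
/* Usage sketch (added; the caller's convention for carrying parity between
   blocks is an assumption here, not something defined in this header):

     SIMD_type mask = simd_const_8(0);   // even parity before the first block
     for (int i = 0; i < nblocks; i++) {
       SIMD_type prefix =
         bitblock_parallel_prefix_parity(sisd_load_aligned(&data[i]), mask);
       // ... use prefix; bit k holds the parity of all bits up through k ...
       // The cumulative parity through this block is bit 127 of prefix;
       // broadcast it to form the mask for the next block.
       mask = simd_const_1(sisd_to_int(sisd_srli(prefix, 127)));
     }
*/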

#endif
