source: trunk/lib/altivec_simd.h @ 531

Last change on this file since 531 was 87, checked in by cameron, 11 years ago

Adaptations for MSVC

/*  Idealized SIMD Operations - Altivec versions
    Copyright (C) 2006, 2008, Robert D. Cameron
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters Inc.
       under the Academic Free License version 3.0.
*/

/*------------------------------------------------------------*/
#ifndef ALTIVEC_SIMD_H
#define ALTIVEC_SIMD_H
#include <limits.h>
#include <stdio.h>   /* needed for the printf calls in print_bit_block below */

typedef vector unsigned short vUInt16;
typedef vector unsigned int vUInt32;
typedef vector unsigned char SIMD_type;


#define vec_lvsl1(x) vec_lvsl(x, (unsigned char *) 0)
#define vec_lvsr1(x) vec_lvsr(x, (unsigned char *) 0)


/*------------------------------------------------------------*/
/* I. SIMD bitwise logical operations */

#define simd_or(b1, b2) vec_or(b1, b2)
#define simd_and(b1, b2) vec_and(b1, b2)
#define simd_xor(b1, b2) vec_xor(b1, b2)
#define simd_nor(b1, b2) vec_nor(b1, b2)
#define simd_andc(b1, b2) vec_andc(b1, b2)
#define simd_if(cond, then_val, else_val) vec_sel(else_val, then_val, cond)
#define simd_not(x) vec_nor(x, x)


/* Idealized operations with direct implementation by built-in
   operations for Altivec. */

#define simd_add_8(a, b) vec_vaddubm(a, b)
#define simd_add_16(a, b) (SIMD_type) vec_vadduhm((vUInt16) a, (vUInt16) b)
#define simd_add_32(a, b) (SIMD_type) vec_vadduwm((vUInt32) a, (vUInt32) b)
#define simd_sub_8(a, b) vec_vsububm(a, b)
#define simd_sub_16(a, b) (SIMD_type) vec_vsubuhm((vUInt16) a, (vUInt16) b)
#define simd_sub_32(a, b) (SIMD_type) vec_vsubuwm((vUInt32) a, (vUInt32) b)
#define simd_mult_16(a, b) (SIMD_type) vec_mladd((vUInt16) a, (vUInt16) b, (vUInt16) vec_splat_u8(0))
#define simd_mergeh_8(v1, v2) vec_vmrghb(v1, v2)
#define simd_mergeh_16(v1, v2) (SIMD_type) vec_vmrghh((vUInt16) v1, (vUInt16) v2)
#define simd_mergeh_32(v1, v2) (SIMD_type) vec_vmrghw((vUInt32) v1, (vUInt32) v2)
#define simd_mergel_8(v1, v2) vec_vmrglb(v1, v2)
#define simd_mergel_16(v1, v2) (SIMD_type) vec_vmrglh((vUInt16) v1, (vUInt16) v2)
#define simd_mergel_32(v1, v2) (SIMD_type) vec_vmrglw((vUInt32) v1, (vUInt32) v2)
#define simd_pack_16(v1, v2) vec_vpkuhum((vUInt16) v1, (vUInt16) v2)
#define simd_pack_32(v1, v2) (SIMD_type) vec_vpkuwum((vUInt32) v1, (vUInt32) v2)
#define simd_sll_8(r, shft) vec_vslb(r, shft)
#define simd_srl_8(r, shft) vec_vsrb(r, shft)
#define simd_sra_8(r, shft) vec_vsrab(r, shft)
#define simd_rotl_8(r, shft) vec_vrlb(r, shft)
#define simd_sll_16(r, shft) (SIMD_type) vec_vslh((vUInt16) r, (vUInt16) shft)
#define simd_srl_16(r, shft) (SIMD_type) vec_vsrh((vUInt16) r, (vUInt16) shft)
#define simd_sra_16(r, shft) (SIMD_type) vec_vsrah((vUInt16) r, (vUInt16) shft)
#define simd_rotl_16(r, shft) (SIMD_type) vec_vrlh((vUInt16) r, (vUInt16) shft)
#define simd_sll_32(r, shft) (SIMD_type) vec_vslw((vUInt32) r, (vUInt32) shft)
#define simd_srl_32(r, shft) (SIMD_type) vec_vsrw((vUInt32) r, (vUInt32) shft)
#define simd_sra_32(r, shft) (SIMD_type) vec_vsraw((vUInt32) r, (vUInt32) shft)
#define simd_rotl_32(r, shft) (SIMD_type) vec_vrlw((vUInt32) r, (vUInt32) shft)
#define simd_slli_8(r, shft) vec_vslb(r, vec_splat_u8(shft))
#define simd_srli_8(r, shft) vec_vsrb(r, vec_splat_u8(shft))
#define simd_srai_8(r, shft) vec_vsrab(r, vec_splat_u8(shft))
#define simd_rotli_8(r, shft) vec_vrlb(r, vec_splat_u8(shft))
/* For the 16-bit and 32-bit shift operations, the shift values could
   be loaded by vec_splat_u16 or vec_splat_u32.  However, using
   vec_splat_u8 works as well, as only the low 4 or 5 bits are used.
   vec_splat_u8 is used to increase the chance that the optimizer will
   find this value already in a register. */
#define simd_slli_16(r, shft) (SIMD_type) vec_vslh((vUInt16) r, (vUInt16) vec_splat_u8(shft))
#define simd_srli_16(r, shft) (SIMD_type) vec_vsrh((vUInt16) r, (vUInt16) vec_splat_u8(shft))
#define simd_srai_16(r, shft) (SIMD_type) vec_vsrah((vUInt16) r, (vUInt16) vec_splat_u8(shft))
#define simd_rotli_16(r, shft) (SIMD_type) vec_vrlh((vUInt16) r, (vUInt16) vec_splat_u8(shft))
/* Because only the least significant 5 bits are used in 32 bit
   shifts, shifts of 16 to 31 are equivalent to shifts of -16 to -1.
   Translating to the negative values allows the shift constant to be
   loaded with a single vec_splat_u8. */
#define splat_shft(shft) vec_splat_u8((shft) >= 16 ? (shft)-32 : (shft))
#define simd_slli_32(r, shft) (SIMD_type) vec_vslw((vUInt32) r, (vUInt32) splat_shft(shft))
#define simd_srli_32(r, shft) (SIMD_type) vec_vsrw((vUInt32) r, (vUInt32) splat_shft(shft))
#define simd_srai_32(r, shft) (SIMD_type) vec_vsraw((vUInt32) r, (vUInt32) splat_shft(shft))
#define simd_rotli_32(r, shft) (SIMD_type) vec_vrlw((vUInt32) r, (vUInt32) splat_shft(shft))
#define simd_eq_8(a, b) (SIMD_type) vec_vcmpequb(a, b)
#define simd_eq_16(a, b) (SIMD_type) vec_vcmpequh((vUInt16) a, (vUInt16) b)
#define simd_eq_32(a, b) (SIMD_type) vec_vcmpequw((vUInt32) a, (vUInt32) b)

#define simd_max_8(a, b) vec_vmaxub(a, b)

/* 64-bit and 128-bit add/sub */

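/* The two macros below depend on vec_0 and alt_words, which are not
   defined in this header and are assumed to be supplied by the
   including code (presumably an all-zero vector and a constant that
   confines carries and borrows to their own 64-bit element). */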
#define simd_add_64(a, b) \
  (SIMD_type) vec_add(vec_add((vUInt32) a, (vUInt32) b), \
                     vec_andc(vec_sld(vec_addc((vUInt32) a, (vUInt32) b), vec_0, 4), \
                              (vUInt32) alt_words))
#define simd_sub_64(a, b) \
  (SIMD_type) vec_sub(vec_sub((vUInt32) a, (vUInt32) b), \
                     vec_add(vec_sld(vec_sub((vUInt32) vec_0, vec_subc((vUInt32) a, (vUInt32) b)), vec_0, 4), \
                              (vUInt32) alt_words))

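/* Full 128-bit addition by carry propagation: vec_addc recovers the
   carry out of each 32-bit word, vec_sld moves those carries one word
   toward the more significant end, and they are added back in.  Three
   rounds of propagation suffice for four 32-bit words. */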
static inline SIMD_type simd_add_128(SIMD_type a, SIMD_type b) {
  vUInt32 sum1 = vec_add((vUInt32) a, (vUInt32) b);
  vUInt32 carry1 = vec_sld(vec_addc((vUInt32) a, (vUInt32) b), (vUInt32) vec_splat_u8(0), 4);
  vUInt32 sum2 = vec_add(sum1, carry1);
  vUInt32 carry2 = vec_sld(vec_addc(sum1, carry1), (vUInt32) vec_splat_u8(0), 4);
  vUInt32 sum3 = vec_add(sum2, carry2);
  vUInt32 carry3 = vec_sld(vec_addc(sum2, carry2), (vUInt32) vec_splat_u8(0), 4);
  return (SIMD_type) vec_add(sum3, carry3);
}
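
/* simd_sub_128 is referenced by sisd_sub below but is not defined in
   this header; presumably it is provided elsewhere in the library.  A
   minimal sketch (an assumption, not the original code) computes
   a - b as a + ~b + 1 using the 128-bit adder above. */
static inline SIMD_type simd_sub_128(SIMD_type a, SIMD_type b) {
  /* 128-bit constant 1: a single one bit in the least significant byte. */
  SIMD_type one = vec_sld(vec_splat_u8(0), vec_splat_u8(1), 1);
  return simd_add_128(simd_add_128(a, simd_not(b)), one);
}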


/* Altivec has separate full register shift instructions for bit
   shifts of less than 8 (vec_sll, vec_srl) and for shifts in
   multiples of 8 bits (vec_slo, vec_sro, vec_sld).  The bytealign
   macros handle the shift amount mod 8, while vec_slo/vec_sro or
   vec_sld complete the byte-multiple part of the shift. */
#define sl_bytealign(r, shft) \
  ((shft) % 8 == 0 ? r : vec_sll(r, vec_splat_u8(shft)))
#define sr_bytealign(r, shft) \
  ((shft) % 8 == 0 ? r : vec_srl(r, vec_splat_u8(shft)))
#define simd_slli_128(r, shft) \
  ((shft) < 8 ? sl_bytealign(r, shft):\
   (shft) < 16 ? vec_slo(sl_bytealign(r, shft), vec_splat_u8(shft)) :\
   (shft) >= 112 ? vec_slo(sl_bytealign(r, (shft)-128), vec_splat_u8((shft)-128)):\
   vec_sld(sl_bytealign(r, (shft) % 8), vec_splat_u8(0), (shft) >> 3))
#define simd_srli_128(r, shft) \
  ((shft) < 8 ? sr_bytealign(r, shft):\
   (shft) < 16 ? vec_sro(sr_bytealign(r, shft), vec_splat_u8(shft)) :\
   (shft) >= 112 ? vec_sro(sr_bytealign(r, (shft)-128), vec_splat_u8((shft)-128)):\
   vec_sld(vec_splat_u8(0), sr_bytealign(r, (shft) % 8), 16 - ((shft) >> 3)))

/* The vec_splat(r2, 15) ensures that the shift constant is duplicated
   in all bytes prior to vec_sll or vec_srl. */
#define simd_sll_128(r1, r2) vec_sll(vec_slo(r1, r2), vec_splat(r2, 15))
#define simd_srl_128(r1, r2) vec_srl(vec_sro(r1, r2), vec_splat(r2, 15))
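/* Example of a full-register shift by a run-time amount n (0 <= n < 128);
   the shift operand carries n in its least significant byte.  This is an
   illustration only, not part of the original interface:
     SIMD_type shifted = simd_sll_128(r, sisd_from_int(n));   */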



#define sisd_store_aligned(r, addr) *((SIMD_type *) (addr)) = r
#define sisd_load_aligned(addr) ((SIMD_type) *((SIMD_type *) (addr)))


#define simd_pack_16_ll(a, b) simd_pack_16(a, b)

#ifndef ALTIVEC_USE_EVEN_INDICES
#define simd_pack_16_hh(a, b) \
  simd_pack_16(simd_srli_16(a, 8), simd_srli_16(b, 8))
#endif


#ifdef ALTIVEC_USE_EVEN_INDICES
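/* vec_lvsl1(0) is the byte sequence {0, 1, ..., 15}; doubling it gives
   {0, 2, ..., 30}, the indices of the high byte of each 16-bit field
   within the concatenation of a and b. */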
#define even_byte_indices vec_add(vec_lvsl1(0), vec_lvsl1(0))
#define simd_pack_16_hh(a, b) vec_perm(a, b, even_byte_indices)
#endif


#define sisd_sll(r, shft) simd_sll_128(r, shft)
#define sisd_srl(r, shft) simd_srl_128(r, shft)
#define sisd_slli(r, shft) simd_slli_128(r, shft)
#define sisd_srli(r, shft) simd_srli_128(r, shft)
#define sisd_add(a, b) simd_add_128(a, b)
#define sisd_sub(a, b) simd_sub_128(a, b)





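/* Masks selecting the high half of each 2-, 4-, and 8-bit field
   (0xAA.., 0xCC.. and 0xF0.. respectively), built entirely from
   vec_splat_u8 constants. */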
#define simd_himask_2 vec_or(vec_splat_u8(10), vec_sl(vec_splat_u8(10), vec_splat_u8(4)))
#define simd_himask_4 vec_or(vec_splat_u8(12), vec_sl(vec_splat_u8(12), vec_splat_u8(4)))
#define simd_himask_8 vec_splat_u8(-16)

#define simd_const_8(n) \
  ((n) >= -16 && (n) < 15 ? vec_splat_u8(n):\
   vec_or(vec_sl(vec_splat_u8(((n)>>4)&15), vec_splat_u8(4)), vec_splat_u8((n)&15)))


#define simd_const_16(n) \
  ((SIMD_type) vec_splat_u16(n))

#define simd_const_32(n) \
  (SIMD_type) ((n) >= -16 && (n) < 15 ? vec_splat_u32(n):\
   vec_or(vec_sl(vec_splat_u32((n)>>4), vec_splat_u32(4)), vec_splat_u32((n)&15)))

#define simd_const_4(n) \
   vec_or(vec_sl(vec_splat_u8(n), vec_splat_u8(4)), vec_splat_u8(n))

#define simd_const_2(n) \
   vec_or(vec_sl(vec_splat_u8(5*(n)), vec_splat_u8(4)), vec_splat_u8(5*(n)))

#define simd_const_1(n) \
  (n==0 ? simd_const_8(0): simd_const_8(-1))

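/* sisd_const(n): a 128-bit "scalar" constant with the byte value n in
   the least significant byte and zero elsewhere.  The low-byte
   placement is inferred from the sisd_* usage; vec_sld shifts one byte
   of simd_const_8(n) in from the right. */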
#define sisd_const(n) vec_sld(vec_splat_u8(0), simd_const_8(n), 1)


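/* Transfers between the low 32 bits of a block and an int.  The block
   is stored big-endian, so the least significant word is element 3. */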
static inline int sisd_to_int(SIMD_type x) {
  union {vector signed int vec; signed int elems[4];} xunion;
  xunion.vec = (vector signed int) x;
  return xunion.elems[3];
}

static inline SIMD_type sisd_from_int(unsigned int x) {
  union {SIMD_type vec; unsigned int elems[4];} y;
  y.elems[0] = 0;
  y.elems[1] = 0;
  y.elems[2] = 0;
  y.elems[3] = x;
  return y.vec;
}

#define bitblock_has_bit(blk) vec_any_ne(blk, vec_splat_u8(0))

#define simd_all_le_8(v1, v2) vec_all_le(v1, v2)
static inline int simd_all_signed_gt_8(SIMD_type v1, SIMD_type v2) {
   return vec_all_gt((vector signed char) v1, (vector signed char) v2);
}


#define simd_any_sign_bit_8(v) \
  vec_any_lt((vector signed char) v, (vector signed char) vec_splat_u8(0))

static inline vector unsigned char bits_per_nybble_table() {
  vector unsigned char zeroes = vec_splat_u8(0);
  vector unsigned char ones = vec_splat_u8(1);
  return simd_add_8
           (simd_add_8(simd_pack_16(zeroes, ones),     // 0000000011111111
                       simd_mergeh_32(zeroes, ones)),  // 0000111100001111
            simd_add_8(simd_mergeh_16(zeroes, ones),   // 0011001100110011
                       simd_mergeh_8(zeroes, ones)));  // 0101010101010101
  //                                                      ----------------
  //                                                      0112122312232334
}
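/* Population count of a 128-bit block: vec_perm uses the high and low
   nibble of each byte to look up per-nibble bit counts in the table
   above; vec_sum4s and vec_sums then reduce the 16 byte counts to a
   single integer. */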
static inline int bitblock_bit_count(SIMD_type v) {
  union {vector signed int vec; signed int elems[4];} result_count;
  SIMD_type bit_count_tbl = bits_per_nybble_table();
/*   SIMD_type bit_count_tbl = u8u16_control_vector[bits_per_nybble_tbl]; */
  SIMD_type byte_counts;
  byte_counts = vec_add(vec_perm(bit_count_tbl, bit_count_tbl, vec_sr(v, vec_splat_u8(4))),
                        vec_perm(bit_count_tbl, bit_count_tbl, v));
  vector unsigned int acc = vec_sum4s(byte_counts, vec_splat_u32(0));
  result_count.vec = vec_sums((vector signed int) acc, vec_splat_s32(0));
  return result_count.elems[3];
}

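/* BLOCKSIZE (the width of SIMD_type in bits, i.e. 128) is assumed to
   be defined by the including code; it is not defined in this header. */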
#define bitblock_test_bit(blk, n) \
   sisd_to_int(sisd_srli(sisd_slli(blk, (n)), BLOCKSIZE-1))


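/* Number of leading zero bits in the block, scanning the four 32-bit
   words from most to least significant with the PowerPC cntlzw
   instruction (cntlzw of 0 yields 32). */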
static inline int count_fwd_zeroes(SIMD_type v) {
  int zeroes;
  union {SIMD_type vec; signed int elems[4];} vu;
  vu.vec = v;
  asm volatile("cntlzw %0, %1\n" : "=r" (zeroes) : "r" (vu.elems[0]));
  if (zeroes < 32) return zeroes;
  asm volatile("cntlzw %0, %1\n" : "=r" (zeroes) : "r" (vu.elems[1]));
  if (zeroes < 32) return zeroes+32;
  asm volatile("cntlzw %0, %1\n" : "=r" (zeroes) : "r" (vu.elems[2]));
  if (zeroes < 32) return zeroes+64;
  asm volatile("cntlzw %0, %1\n" : "=r" (zeroes) : "r" (vu.elems[3]));
  return zeroes+96;
}


static inline void print_bit_block(char * var_name, SIMD_type v) {
  union {SIMD_type vec; unsigned char elems[16];} x;
  x.vec = v;
  int i;
  printf("%20s = ", var_name);
  for (i = 0; i < 16; i++) {
    printf("%02X ", x.elems[i]);
  }
  printf("\n");
}


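/* Addition within 2-bit fields: the low bit of each field is a XOR b;
   the high bit additionally absorbs the carry (a AND b) shifted left
   by one; simd_if merges the two results using the high-bit mask. */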
static inline SIMD_type simd_add_2(SIMD_type a, SIMD_type b)
{
         SIMD_type c1 = simd_xor(a,b);
         SIMD_type borrow = simd_and(a,b);
         SIMD_type c2 = simd_xor(c1,(sisd_slli(borrow,1)));
         return simd_if(simd_himask_2,c2,c1);
}
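/* Addition within 4-bit fields via the 8-bit adder: high and low
   nibbles are added separately, and the simd_if selection discards any
   carry that escapes a nibble. */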
#define simd_add_4(a, b)\
        simd_if(simd_himask_8, simd_add_8(simd_and(a,simd_himask_8),simd_and(b,simd_himask_8))\
        ,simd_add_8(simd_andc(a,simd_himask_8),simd_andc(b,simd_himask_8)))

#define simd_srli_2(r, sh)\
         simd_and(sisd_srli(r,sh),simd_const_2(3>>sh))

#define simd_srli_4(r, sh)\
         simd_and(sisd_srli(r,sh),simd_const_4(15>>sh))


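/* Naming convention for the half-operand forms below: for each operand,
   x uses the field as is, l uses only the low half of the field (the
   high half is masked off), and h uses only the high half (shifted
   down into the low position). */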
#define simd_add_2_xx(a, b) simd_add_2(a, b)
#define simd_add_2_xl(a, b) simd_add_2(a, simd_andc(b, simd_himask_2))
#define simd_add_2_xh(a, b) simd_add_2(a, simd_srli_2(b, 1))
#define simd_add_2_lx(a, b) simd_add_2(simd_andc(a, simd_himask_2), b)
#define simd_add_2_ll(a, b) simd_add_2(simd_andc(a, simd_himask_2), simd_andc(b, simd_himask_2))
#define simd_add_2_lh(a, b) simd_add_2(simd_andc(a, simd_himask_2), simd_srli_2(b, 1))
#define simd_add_2_hx(a, b) simd_add_2(simd_srli_2(a, 1), b)
#define simd_add_2_hl(a, b) simd_add_2(simd_srli_2(a, 1), simd_andc(b, simd_himask_2))
#define simd_add_2_hh(a, b) simd_add_2(simd_srli_2(a, 1), simd_srli_2(b, 1))
#define simd_add_4_xx(a, b) simd_add_4(a, b)
#define simd_add_4_xl(a, b) simd_add_4(a, simd_andc(b, simd_himask_4))
#define simd_add_4_xh(a, b) simd_add_4(a, simd_srli_4(b, 2))
#define simd_add_4_lx(a, b) simd_add_4(simd_andc(a, simd_himask_4), b)
#define simd_add_4_ll(a, b) simd_add_4(simd_andc(a, simd_himask_4), simd_andc(b, simd_himask_4))
#define simd_add_4_lh(a, b) simd_add_4(simd_andc(a, simd_himask_4), simd_srli_4(b, 2))
#define simd_add_4_hx(a, b) simd_add_4(simd_srli_4(a, 2), b)
#define simd_add_4_hl(a, b) simd_add_4(simd_srli_4(a, 2), simd_andc(b, simd_himask_4))
#define simd_add_4_hh(a, b) simd_add_4(simd_srli_4(a, 2), simd_srli_4(b, 2))
#define simd_add_8_xx(a, b) simd_add_8(a, b)
#define simd_add_8_xl(a, b) simd_add_8(a, simd_andc(b, simd_himask_8))
#define simd_add_8_xh(a, b) simd_add_8(a, simd_srli_8(b, 4))
#define simd_add_8_lx(a, b) simd_add_8(simd_andc(a, simd_himask_8), b)
#define simd_add_8_ll(a, b) simd_add_8(simd_andc(a, simd_himask_8), simd_andc(b, simd_himask_8))
#define simd_add_8_lh(a, b) simd_add_8(simd_andc(a, simd_himask_8), simd_srli_8(b, 4))
#define simd_add_8_hx(a, b) simd_add_8(simd_srli_8(a, 4), b)
#define simd_add_8_hl(a, b) simd_add_8(simd_srli_8(a, 4), simd_andc(b, simd_himask_8))
#define simd_add_8_hh(a, b) simd_add_8(simd_srli_8(a, 4), simd_srli_8(b, 4))

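/* Field-narrowing packs: simd_pack_2N(a, b) keeps the low N bits of
   each 2N-bit field of a and b.  At each level, a right shift by half
   the field width moves the low half of each field alongside the low
   half of the field to its right; simd_if keeps exactly those bits,
   and the next wider pack (ultimately vec_vpkuhum) discards the rest. */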
#define simd_pack_2(a,b)\
        simd_pack_4(simd_if(simd_himask_2,sisd_srli(a,1),a),\
        simd_if(simd_himask_2,sisd_srli(b,1),b))
#define simd_pack_4(a,b)\
        simd_pack_8(simd_if(simd_himask_4,sisd_srli(a,2),a),\
        simd_if(simd_himask_4,sisd_srli(b,2),b))
#define simd_pack_8(a,b)\
        simd_pack_16(simd_if(simd_himask_8,sisd_srli(a,4),a),\
        simd_if(simd_himask_8,sisd_srli(b,4),b))
#define simd_pack_2_xx(a, b) simd_pack_2(a, b)
#define simd_pack_2_xl(a, b) simd_pack_2(a, b)
#define simd_pack_2_xh(a, b) simd_pack_2(a, simd_srli_2(b, 1))
#define simd_pack_2_lx(a, b) simd_pack_2(a, b)
#define simd_pack_2_ll(a, b) simd_pack_2(a, b)
#define simd_pack_2_lh(a, b) simd_pack_2(a, simd_srli_2(b, 1))
#define simd_pack_2_hx(a, b) simd_pack_2(simd_srli_2(a, 1), b)
#define simd_pack_2_hl(a, b) simd_pack_2(simd_srli_2(a, 1), b)
#define simd_pack_2_hh(a, b) simd_pack_2(simd_srli_2(a, 1), simd_srli_2(b, 1))
#define simd_pack_4_xx(a, b) simd_pack_4(a, b)
#define simd_pack_4_xl(a, b) simd_pack_4(a, b)
#define simd_pack_4_xh(a, b) simd_pack_4(a, simd_srli_4(b, 2))
#define simd_pack_4_lx(a, b) simd_pack_4(a, b)
#define simd_pack_4_ll(a, b) simd_pack_4(a, b)
#define simd_pack_4_lh(a, b) simd_pack_4(a, simd_srli_4(b, 2))
#define simd_pack_4_hx(a, b) simd_pack_4(simd_srli_4(a, 2), b)
#define simd_pack_4_hl(a, b) simd_pack_4(simd_srli_4(a, 2), b)
#define simd_pack_4_hh(a, b) simd_pack_4(simd_srli_4(a, 2), simd_srli_4(b, 2))
#define simd_pack_8_xx(a, b) simd_pack_8(a, b)
#define simd_pack_8_xl(a, b) simd_pack_8(a, b)
#define simd_pack_8_xh(a, b) simd_pack_8(a, simd_srli_8(b, 4))
#define simd_pack_8_lx(a, b) simd_pack_8(a, b)
#define simd_pack_8_ll(a, b) simd_pack_8(a, b)
#define simd_pack_8_lh(a, b) simd_pack_8(a, simd_srli_8(b, 4))
#define simd_pack_8_hx(a, b) simd_pack_8(simd_srli_8(a, 4), b)
#define simd_pack_8_hl(a, b) simd_pack_8(simd_srli_8(a, 4), b)
#define simd_pack_8_hh(a, b) simd_pack_8(simd_srli_8(a, 4), simd_srli_8(b, 4))

// Splat the first 16-bit int into all positions.
static inline SIMD_type simd_splat_16(SIMD_type x) {
  return (SIMD_type) vec_splat((vUInt16) x, 0);
}

// Splat the first 32-bit int into all positions.
static inline SIMD_type simd_splat_32(SIMD_type x) {
  return (SIMD_type) vec_splat((vUInt32) x, 0);
}

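/* Unaligned load using the classic lvsl/vec_perm idiom: the two
   aligned 16-byte blocks covering p are loaded and vec_perm selects
   the 16 bytes starting at p. */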
static inline SIMD_type sisd_load_unaligned(SIMD_type * p) {
  SIMD_type input_shiftl = vec_lvsl(0, (unsigned char *) p);
  return vec_perm(vec_ld(0, p), vec_ld(15, p), input_shiftl);
}
#endif