source: trunk/lib/lib_simd.h @ 1516

Last change on this file since 1516 was 1463, checked in by ksherdy, 8 years ago

Centralized typedefs.

File size: 9.3 KB
Line 
1/*  lib_simd_h:  SIMD Library including idealized SIMD operations
2    Copyright (C) 2008, Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters Inc.
5       under the Academic Free License version 3.0.
6
7    This file contains generic architecture-independent definitions,
8    importing architecture-specific implementations from appropriate
9    files.
10*/
11
12/*------------------------------------------------------------*/
13#ifndef LIB_SIMD_H
14#define LIB_SIMD_H
15#include <sys/types.h>
16#include <limits.h>
17
/* LONG_BIT is the width of long in bits.  Not every <limits.h> provides it,
   so derive it from ULONG_MAX when it is missing.  An unrecognised width is
   reported immediately instead of leaving LONG_BIT undefined, which would
   otherwise make later "#if LONG_BIT < 64" tests silently evaluate with 0. */
#ifndef LONG_BIT
#if ULONG_MAX == 0xFFFFFFFF
#define LONG_BIT 32
#elif ULONG_MAX == 0xFFFFFFFFFFFFFFFF
#define LONG_BIT 64
#else
#error "lib_simd.h: cannot derive LONG_BIT from ULONG_MAX (unsupported width of long)"
#endif
#endif
26
27#if (defined(__i386) || defined(__x86_64))
28#ifdef TEMPLATED_SIMD_LIB
29#include "sse_simd_t.h"
30#endif
31#ifndef TEMPLATED_SIMD_LIB
32#include "sse_simd.h"
33#endif
34#endif
35#ifdef _ARCH_PPC
36#include "altivec_simd.h"
37#endif
38
/* Useful typedefs and constants. */
40#include "types.h"
41
42/* Useful definitions from Linux kernel*/
43#ifdef __GNUC__
44/*
45#define likely(x) __builtin_expect((x),1)
46#define unlikely(x) __builtin_expect((x),0)
47*/
/* Branch-prediction hint: hands x to __builtin_expect with an expected
   value of 1 and returns the builtin's result, which is the value of x. */
static inline long likely(long x) {
        const long hinted = __builtin_expect(x, 1);
        return hinted;
}
/* Branch-prediction hint: hands x to __builtin_expect with an expected
   value of 0 and returns the builtin's result, which is the value of x. */
static inline long unlikely(long x) {
        const long hinted = __builtin_expect(x, 0);
        return hinted;
}
54
55#endif
#ifdef _MSC_VER
/* Spell the inline keyword the way this compiler expects it. */
#define inline __inline
/* NOTE(review): this path is "lib/sse_simd.h" while the x86 section earlier
   in this file includes "sse_simd.h" -- confirm both resolve to the same
   header with the project's include directories. */
#include "lib/sse_simd.h"
/* No branch-prediction hint on this compiler: both macros expand to their
   argument unchanged. */
#define likely(x) (x)
#define unlikely(x) (x)
#endif
62
63#ifdef TEMPLATED_SIMD_LIB
64static inline SIMD_type sisd_sll(SIMD_type blk, SIMD_type n) {
65        return simd<128>::sll(blk, n);
66}
67static inline SIMD_type sisd_srl(SIMD_type blk, SIMD_type n) {
68        return simd<128>::srl(blk, n);
69}
/* Immediate-shift forms.  n is substituted as a template argument of
   slli/srli, so it has to be a compile-time constant expression. */
#define sisd_slli(blk, n) simd<128>::slli<n>(blk)
#define sisd_srli(blk, n) simd<128>::srli<n>(blk)
72#endif
73
74
75/* Shift forward and back operations, based on endianness */
/* Big-endian mapping: "forward" is a right shift and "backward" is a left
   shift; the forward/backward zero counts map to the leading/trailing zero
   builtins respectively.
   The defined() checks are required: inside #if an undefined identifier
   evaluates to 0, so on a toolchain that defines neither BYTE_ORDER nor
   BIG_ENDIAN the bare comparison would be 0 == 0 and this set would be
   selected by accident. */
#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN
#define sisd_sfl(blk, n) sisd_srl(blk, n)
#define sisd_sbl(blk, n) sisd_sll(blk, n)
#define sisd_sfli(blk, n) sisd_srli(blk, n)
#define sisd_sbli(blk, n) sisd_slli(blk, n)
#define sb_op(x, n) ((x)<<(n))
#define sf_op(x, n) ((x)>>(n))
#define cfzl __builtin_clzl
#define cbzl __builtin_ctzl
#endif
86
87#if BYTE_ORDER == LITTLE_ENDIAN
/* Little-endian mapping: "forward" is a left shift and "backward" is a
   right shift, for the whole-block (sisd_*) and the scalar (s?_op) forms. */
#define sisd_sfl(blk, n) sisd_sll(blk, n)
#define sisd_sbl(blk, n) sisd_srl(blk, n)
#define sisd_sfli(blk, n) sisd_slli(blk, n)
#define sisd_sbli(blk, n) sisd_srli(blk, n)
#define sb_op(x, n) ((x)>>(n))
#define sf_op(x, n) ((x)<<(n))
#ifdef __GNUC__
/* Forward zero count is the trailing-zero builtin, backward zero count is
   the leading-zero builtin.  The callers in this file only invoke them on
   values they have already tested to be non-zero. */
#define cfzl __builtin_ctzl
#define cbzl __builtin_clzl
#endif
98#ifdef _MSC_VER
99#include <intrin.h>
100#pragma intrinsic(_BitScanForward)
/* Count forward zeroes of x: returns the index that _BitScanForward reports
   for the lowest set bit.
   Precondition: x > 0 (the intrinsic's found/not-found result is ignored). */
static inline unsigned long cfzl(unsigned long x) {
        unsigned long lowest_set_index;
        _BitScanForward(&lowest_set_index, x);
        return lowest_set_index;
}
#pragma intrinsic(_BitScanReverse)
/* Count backward zeroes of x: the number of zero bits above the highest
   set bit, matching what __builtin_clzl returns on the GCC path.
   _BitScanReverse reports the bit index of the highest set bit, so the
   count is (width of unsigned long - 1) minus that index.
   Precondition: x > 0 (the intrinsic's found/not-found result is ignored). */
static inline unsigned long cbzl(unsigned long x) {
        unsigned long highest_set_index;
        _BitScanReverse(&highest_set_index, x);
        return (unsigned long) (sizeof(unsigned long) * CHAR_BIT - 1) - highest_set_index;
}
112#endif
113#endif
114
115
116static inline int count_forward_zeroes(SIMD_type bits) {
117  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
118  v.vec = bits;
119  if (v.elems[0] != 0) return cfzl(v.elems[0]);
120  else if (v.elems[1] != 0) return LONG_BIT + cfzl(v.elems[1]);
121#ifdef _MSC_VER
122  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
123  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
124#endif
125#ifndef _MSC_VER
126#if LONG_BIT < 64
127  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
128  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
129#endif
130#endif
131  else return 8*sizeof(SIMD_type);
132}
133
134static inline int count_backward_zeroes(SIMD_type bits) {
135  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
136  v.vec = bits;
137#if LONG_BIT == 64
138  if (v.elems[1] != 0) return cbzl(v.elems[1]);
139  else if (v.elems[0] != 0) return LONG_BIT + cbzl(v.elems[0]);
140#endif
141#if LONG_BIT < 64
142  if (v.elems[3] != 0) return cbzl(v.elems[3]);
143  else if (v.elems[2] != 0) return LONG_BIT + cbzl(v.elems[2]);
144  else if (v.elems[1] != 0) return 2*LONG_BIT + cbzl(v.elems[1]);
145  else if (v.elems[0] != 0) return 3*LONG_BIT + cbzl(v.elems[0]);
146#endif
147  else return 8*sizeof(SIMD_type);
148}
149
150static inline unsigned long bitstream_segment_from(SIMD_type * stream, int bit_posn) {
151  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
152  return sb_op(*bitstream_ptr, bit_posn % 8);
153}
154
155/* Scans for a 1 as long as it takes.  Use a sentinel to fence.
156   Works for either endianness.  */
157static inline int bitstream_scan(SIMD_type * stream, int bit_posn) {
158  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
159  unsigned long bitstream_slice = sb_op(*bitstream_ptr, bit_posn % 8);
160  if (bitstream_slice != 0) return bit_posn + cfzl(bitstream_slice);
161  else {
162    do {
163      bitstream_ptr++;
164      bitstream_slice = *bitstream_ptr;
165    } while (bitstream_slice == 0);
166    int base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
167    return base_posn + cfzl(bitstream_slice);
168  }
169}
170
171static inline int bitstream_scan0(SIMD_type * stream) {
172  unsigned long * bitstream_ptr = (unsigned long *) stream;
173  unsigned long bitstream_slice = *bitstream_ptr;
174  int base_posn = 0;
175  while (bitstream_slice == 0) {
176    bitstream_ptr++;
177    bitstream_slice = *bitstream_ptr;
178  }
179  base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
180  return base_posn + cfzl(bitstream_slice);
181}
182
183
184/* Allocator for arrays of aligned SIMD data values.
185   Ideally the new operator could be used to allocate arrays
186   of vector data aligned on the required boundaries
187   (16-byte for SSE or Altivec).  But since this alignment
188   is not guaranteed except on Mac OS X, the following routine
189   is used. */
190
191static inline SIMD_type * simd_new(size_t SIMD_packs) {
192#ifdef __APPLE__
193        return new SIMD_type [SIMD_packs];
194#endif
195#ifdef _MSC_VER
196        SIMD_type * v = (SIMD_type*)_aligned_malloc(sizeof(SIMD_type) * SIMD_packs, sizeof(SIMD_type));
197        if (v != 0) return v;
198        else {
199                printf("Failed to allocated new array of %i SIMD packs.\n", SIMD_packs);
200                exit(-1);
201        }
202#endif
203#if !defined(__APPLE__) && !defined(_MSC_VER)
204        SIMD_type * v;
205        int rslt = posix_memalign((void **) &v,
206                                  sizeof(SIMD_type),
207                                  sizeof(SIMD_type) * SIMD_packs);
208        if (rslt == 0) return v;
209        else {
210                printf("Failed to allocated new array of %zu SIMD packs.\n", SIMD_packs);
211                exit(-1);
212        }
213#endif
214}
215
216static inline void simd_delete(SIMD_type * blk_ptr) {
217#ifdef __APPLE__
218  delete [] blk_ptr;
219#endif
220#ifndef __APPLE__
221  free((void *) blk_ptr);
222#endif
223}
224
225static void print_bit_block(char * var_name, SIMD_type v) {
226  union {SIMD_type vec; unsigned char elems[sizeof(SIMD_type)];} x;
227  x.vec = v;
228  unsigned char c;
229  int i;
230  printf("%20s = ", var_name);
231  for (i = 0; i < sizeof(SIMD_type); i++) {
232    c = x.elems[i];
233     printf("%02X ", c); 
234  }
235  printf("\n");
236}
237
/* Prints the register representation of a 32 bit value: var_name
   right-aligned in a 30-character column, then the bytes of v from the
   highest memory address down to the lowest as two-digit hex values. */
static void print_general_register_32(const char * var_name, uint32_t v) {
        const unsigned char * bytes = (const unsigned char *) &v;
        int idx = (int) sizeof(uint32_t) - 1;
        printf("%30s = ", var_name);
        while (idx >= 0) {
                printf("%02X ", bytes[idx]);
                idx--;
        }
        printf("\n");
}
248
/* Prints the register representation of a 64 bit value: var_name
   right-aligned in a 30-character column, then the bytes of v from the
   highest memory address down to the lowest as two-digit hex values. */
static void print_general_register_64(const char * var_name, uint64_t v) {
        const unsigned char * bytes = (const unsigned char *) &v;
        int idx = (int) sizeof(uint64_t) - 1;
        printf("%30s = ", var_name);
        while (idx >= 0) {
                printf("%02X ", bytes[idx]);
                idx--;
        }
        printf("\n");
}
259
260/* Prints the SIMD register representation of a SIMD value. */
261static void print_simd_register(const char * var_name, SIMD_type v) {
262  union {SIMD_type vec; unsigned char elems[sizeof(SIMD_type)];} x;
263  x.vec = v;
264  unsigned char c;
265  printf("%30s = ", var_name);
266  for(int i=sizeof(SIMD_type)-1; i>=0; i--) {
267    c = x.elems[i];
268    printf("%02X ", c); 
269  }
270  printf("\n");
271}
272
/* Prints the array values little endian, right to left: var_name
   right-aligned in a 30-character column, then buf[size-1] down to buf[0]
   as characters. */
static void print_array_le(const char * var_name, char * buf, int size) {
  int idx = size - 1;
  printf("%30s = ", var_name);
  while (idx >= 0) {
    const unsigned char ch = buf[idx];
    /* A zero byte (UTF code point 0) is shown as the digit 0. */
    if (ch != 0) {
      printf("%c", ch);
    } else {
      printf("0");
    }
    /* Single space separator after each index that is a multiple of 8. */
    if ((idx % 8) == 0) {
      printf(" ");
    }
    idx--;
  }
  printf("\n");
}
290
291static inline int bitblock_has_bit(SIMD_type v) {
292#ifdef TEMPLATED_SIMD_LIB
293    return !simd_all_true<8>(simd<8>::eq(v, simd<8>::constant<0>()));
294#else
295
296#ifndef USE_PTEST
297  return !simd_all_true_8(simd_eq_8(v, simd_const_8(0)));
298#endif
299#ifdef USE_PTEST
300  return !_mm_testz_si128(v,v);
301#endif
302
303#endif //TEMPLATED_SIMD_LIB
304}
305
306static inline int bitblock_bit_count(SIMD_type v) {
307#ifdef TEMPLATED_SIMD_LIB
308
309    int bit_count = 0;
310    SIMD_type cts_2 = simd<2>::add<l,h>(v, v);
311    SIMD_type cts_4 = simd<4>::add<l,h>(cts_2, cts_2);
312    SIMD_type cts_8 = simd<8>::add<l,h>(cts_4, cts_4);
313    SIMD_type cts_64 = _mm_sad_epu8(cts_8, simd<8>::constant<0>());
314    /* SIMD_type cts_128 = simd<a28>::add<l,h>(cts_64, cts_64) */;
315    SIMD_type cts_128 = simd<64>::add(cts_64, simd<128>::srli<64>(cts_64));
316    return (int) sisd_to_int(cts_128);
317
318#else
319
320    int bit_count = 0;
321    SIMD_type cts_2 = simd_add_2_lh(v, v);
322    SIMD_type cts_4 = simd_add_4_lh(cts_2, cts_2);
323    SIMD_type cts_8 = simd_add_8_lh(cts_4, cts_4);
324    SIMD_type cts_64 = _mm_sad_epu8(cts_8, simd_const_8(0));
325    /* SIMD_type cts_128 = simd_add_128_lh(cts_64, cts_64) */;
326    SIMD_type cts_128 = simd_add_64(cts_64, sisd_srli(cts_64,64));
327    return (int) sisd_to_int(cts_128);
328
329#endif //TEMPLATED_SIMD_LIB
330}
331
332#endif
333
Note: See TracBrowser for help on using the repository browser.