source: trunk/lib/lib_simd.h @ 531

Last change on this file since 531 was 531, checked in by cameron, 9 years ago

More cbzl fixes.

File size: 7.5 KB
Line 
1/*  lib_simd_h:  SIMD Library including idealized SIMD operations
2    Copyright (C) 2008, Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters Inc.
5       under the Academic Free License version 3.0.
6
7    This file contains generic architecture-independent definitions,
8    importing architecture-specific implementations from appropriate
9    files.
10*/
11
12/*------------------------------------------------------------*/
13#ifndef LIB_SIMD_H
14#define LIB_SIMD_H
#include <limits.h>
#include <string.h>
#include <sys/types.h>
17
18#ifndef LONG_BIT
19#if ULONG_MAX == 0xFFFFFFFF
20#define LONG_BIT 32
21#endif
22#if ULONG_MAX == 0xFFFFFFFFFFFFFFFF
23#define LONG_BIT 64
24#endif
25#endif
26
27#if (defined(__i386) || defined(__x86_64))
28#ifdef TEMPLATED_SIMD_LIB
29#include "sse_simd_t.h"
30#endif
31#ifndef TEMPLATED_SIMD_LIB
32#include "sse_simd.h"
33#endif
34#endif
35#ifdef _ARCH_PPC
36#include "altivec_simd.h"
37#endif
38
/* Branch-prediction hints, after the likely()/unlikely() helpers of the
   Linux kernel.  Both yield the value of their argument, so they can wrap
   the condition of an if or a loop.

   With GCC-compatible compilers they are inline functions built on
   __builtin_expect; the argument is converted to long, so they are meant
   for integer and boolean conditions.  With every other compiler they are
   macros that expand to the bare expression, so code using the hints
   still compiles there. */
#ifdef __GNUC__
static inline long likely(long x) {
        return __builtin_expect(x, 1);
}
static inline long unlikely(long x) {
        return __builtin_expect(x, 0);
}
#else
#define likely(x) (x)
#define unlikely(x) (x)
#endif
57
58/* Shift forward and back operations, based on endianness */
59#if BYTE_ORDER == BIG_ENDIAN
60#define sisd_sfl(blk, n) sisd_srl(blk, n)
61#define sisd_sbl(blk, n) sisd_sll(blk, n)
62#define sisd_sfli(blk, n) sisd_srli(blk, n)
63#define sisd_sbli(blk, n) sisd_slli(blk, n)
64#define sb_op(x, n) ((x)<<(n))
65#define sf_op(x, n) ((x)>>(n))
66#define cfzl __builtin_clzl
67#define cbzl __builtin_ctzl
68#endif
69#if BYTE_ORDER == LITTLE_ENDIAN
70#ifdef TEMPLATED_SIMD_LIB
71static inline SIMD_type sisd_sfl(SIMD_type blk, SIMD_type n) {
72        return simd<128>::sll(blk, n);
73}
74
75static inline SIMD_type sisd_sbl(SIMD_type blk, SIMD_type n) {
76        return simd<128>::srl(blk, n);
77}
78#define sisd_sfli(blk, n) simd<128>::slli<n>(blk)
79#define sisd_sbli(blk, n) simd<128>::srli<n>(blk)
80#endif
81#ifndef TEMPLATED_SIMD_LIB
82static inline SIMD_type sisd_sfl(SIMD_type blk, SIMD_type n) {
83        return sisd_sll(blk, n);
84}
85static inline SIMD_type sisd_sbl(SIMD_type blk, SIMD_type n) {
86        return sisd_srl(blk, n);
87}
88#define sisd_sfli(blk, n) sisd_slli(blk, n)
89#define sisd_sbli(blk, n) sisd_srli(blk, n)
90#endif
91#define sb_op(x, n) ((x)>>(n))
92#define sf_op(x, n) ((x)<<(n))
93#ifdef __GNUC__
94#define cfzl __builtin_ctzl
95#define cbzl __builtin_clzl
96#endif
97#ifdef _MSC_VER
98#include <intrin.h>
99#pragma intrinsic(_BitScanForward)
100//  precondition: x > 0
101static inline unsigned long cfzl(unsigned long x) {
102        unsigned long zeroes;
103        _BitScanForward(&zeroes, x);
104        return zeroes;
105}
106#endif
107#endif
108
109
110static inline int count_forward_zeroes(SIMD_type bits) {
111  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
112  v.vec = bits;
113  if (v.elems[0] != 0) return cfzl(v.elems[0]);
114  else if (v.elems[1] != 0) return LONG_BIT + cfzl(v.elems[1]);
115#ifdef _MSC_VER
116  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
117  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
118#endif
119#ifndef _MSC_VER
120#if LONG_BIT < 64
121  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
122  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
123#endif
124#endif
125  else return 8*sizeof(SIMD_type);
126}
127
128static inline int count_backward_zeroes(SIMD_type bits) {
129  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
130  v.vec = bits;
131#if LONG_BIT == 64
132  if (v.elems[1] != 0) return cbzl(v.elems[1]);
133  else if (v.elems[0] != 0) return LONG_BIT + cbzl(v.elems[0]);
134#endif
135#if LONG_BIT < 64
136  if (v.elems[3] != 0) return cbzl(v.elems[3]);
137  else if (v.elems[2] != 0) return LONG_BIT + cbzl(v.elems[2]);
138  else if (v.elems[1] != 0) return 2*LONG_BIT + cbzl(v.elems[1]);
139  else if (v.elems[0] != 0) return 3*LONG_BIT + cbzl(v.elems[0]);
140#endif
141  else return 8*sizeof(SIMD_type);
142}
143
144static inline unsigned long bitstream_segment_from(SIMD_type * stream, int bit_posn) {
145  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
146  return sb_op(*bitstream_ptr, bit_posn % 8);
147}
148
149/* Scans for a 1 as long as it takes.  Use a sentinel to fence.
150   Works for either endianness.  */
151static inline int bitstream_scan(SIMD_type * stream, int bit_posn) {
152  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
153  unsigned long bitstream_slice = sb_op(*bitstream_ptr, bit_posn % 8);
154  if (bitstream_slice != 0) return bit_posn + cfzl(bitstream_slice);
155  else {
156    do {
157      bitstream_ptr++;
158      bitstream_slice = *bitstream_ptr;
159    } while (bitstream_slice == 0);
160    int base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
161    return base_posn + cfzl(bitstream_slice);
162  }
163}
164
165static inline int bitstream_scan0(SIMD_type * stream) {
166  unsigned long * bitstream_ptr = (unsigned long *) stream;
167  unsigned long bitstream_slice = *bitstream_ptr;
168  int base_posn = 0;
169  while (bitstream_slice == 0) {
170    bitstream_ptr++;
171    bitstream_slice = *bitstream_ptr;
172  }
173  base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
174  return base_posn + cfzl(bitstream_slice);
175}
176
177
178/* Allocator for arrays of aligned SIMD data values.
179   Ideally the new operator could be used to allocate arrays
180   of vector data aligned on the required boundaries
181   (16-byte for SSE or Altivec).  But since this alignment
182   is not guaranteed except on Mac OS X, the following routine
183   is used. */
184
185static inline SIMD_type * simd_new(size_t SIMD_packs) {
186#ifdef __APPLE__
187        return new SIMD_type [SIMD_packs];
188#endif
189#ifdef _MSC_VER
190        SIMD_type * v = (SIMD_type*)_aligned_malloc(sizeof(SIMD_type) * SIMD_packs, sizeof(SIMD_type));
191        if (v != 0) return v;
192        else {
193                printf("Failed to allocated new array of %i SIMD packs.\n", SIMD_packs);
194                exit(-1);
195        }
196#endif
197#if !defined(__APPLE__) && !defined(_MSC_VER)
198        SIMD_type * v;
199        int rslt = posix_memalign((void **) &v,
200                                  sizeof(SIMD_type),
201                                  sizeof(SIMD_type) * SIMD_packs);
202        if (rslt == 0) return v;
203        else {
204                printf("Failed to allocated new array of %zu SIMD packs.\n", SIMD_packs);
205                exit(-1);
206        }
207#endif
208}
209
210static inline void simd_delete(SIMD_type * blk_ptr) {
211#ifdef __APPLE__
212  delete [] blk_ptr;
213#endif
214#ifndef __APPLE__
215  free((void *) blk_ptr);
216#endif
217}
218
219static void print_bit_block(char * var_name, SIMD_type v) {
220  union {SIMD_type vec; unsigned char elems[8];} x;
221  x.vec = v;
222  unsigned char c, bit_reversed;
223  int i;
224  printf("%20s = ", var_name);
225  for (i = 0; i < sizeof(SIMD_type); i++) {
226    c = x.elems[i];
227     printf("%02X ", c); 
228  }
229  printf("\n");
230}
231
/* Prints the register representation of a 32 bit value.
   Writes var_name right-aligned in a 30-column field followed by " = ",
   then the four bytes of v from the highest address down to the lowest,
   each as two uppercase hex digits and a space, then a newline. */
static void print_general_register_32(const char * var_name, uint32_t v) {
        const unsigned char * bytes = (const unsigned char *) &v;
        printf("%30s = ", var_name);
        for (int i = (int) sizeof(v) - 1; i >= 0; i--) {
                printf("%02X ", bytes[i]);
        }
        printf("\n");
}
243
/* Prints the register representation of a 64 bit value.
   Writes var_name right-aligned in a 30-column field followed by " = ",
   then the eight bytes of v from the highest address down to the lowest,
   each as two uppercase hex digits and a space, then a newline. */
static void print_general_register_64(const char * var_name, uint64_t v) {
        const unsigned char * bytes = (const unsigned char *) &v;
        printf("%30s = ", var_name);
        for (int i = (int) sizeof(v) - 1; i >= 0; i--) {
                printf("%02X ", bytes[i]);
        }
        printf("\n");
}
255
256/* Prints the SIMD register representation of a SIMD value. */
257static void print_simd_register(const char * var_name, SIMD_type v) {
258  union {SIMD_type vec; unsigned char elems[8];} x;
259  x.vec = v;
260  unsigned char c, bit_reversed;
261  int i;
262  printf("%30s = ", var_name);
263  for(int i=sizeof(SIMD_type)-1; i>=0; i--) {
264    c = x.elems[i];
265    printf("%02X ", c); 
266  }
267  printf("\n");
268}
269
270#endif
271
Note: See TracBrowser for help on using the repository browser.