source: trunk/lib/lib_simd.h @ 414

Last change on this file since 414 was 414, checked in by cameron, 9 years ago

Library fixes

File size: 6.9 KB
Line 
1/*  lib_simd_h:  SIMD Library including idealized SIMD operations
2    Copyright (C) 2008, Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters Inc.
5       under the Academic Free License version 3.0.
6
7    This file contains generic architecture-independent definitions,
8    importing architecture-specific implementations from appropriate
9    files.
10*/
11
12/*------------------------------------------------------------*/
13#ifndef LIB_SIMD_H
14#define LIB_SIMD_H
#include <sys/types.h>
#include <limits.h>
#include <string.h>
17
18#ifndef LONG_BIT
19#if ULONG_MAX == 0xFFFFFFFF
20#define LONG_BIT 32
21#endif
22#if ULONG_MAX == 0xFFFFFFFFFFFFFFFF
23#define LONG_BIT 64
24#endif
25#endif
26
27#if (defined(__i386) || defined(__x86_64))
28#ifdef TEMPLATED_SIMD_LIB
29#include "sse_simd_t.h"
30#endif
31#ifndef TEMPLATED_SIMD_LIB
32#include "sse_simd.h"
33#endif
34#endif
35#ifdef _ARCH_PPC
36#include "altivec_simd.h"
37#endif
38
39/* Useful definitions from Linux kernel*/
40#ifdef __GNUC__
41/*
42#define likely(x) __builtin_expect((x),1)
43#define unlikely(x) __builtin_expect((x),0)
44*/
/* Branch-prediction hint: yields x unchanged while telling the compiler
   that x is expected to be 1. */
static inline long likely(long x) {
        const long hinted = __builtin_expect(x, 1);
        return hinted;
}
/* Branch-prediction hint: yields x unchanged while telling the compiler
   that x is expected to be 0. */
static inline long unlikely(long x) {
        const long hinted = __builtin_expect(x, 0);
        return hinted;
}
51
52#endif
53#ifdef _MSC_VER
54#define likely(x) (x)
55#define unlikely(x) (x)
56#endif
57
58/* Shift forward and back operations, based on endianness */
59#if BYTE_ORDER == BIG_ENDIAN
60#define sisd_sfl(blk, n) sisd_srl(blk, n)
61#define sisd_sbl(blk, n) sisd_sll(blk, n)
62#define sisd_sfli(blk, n) sisd_srli(blk, n)
63#define sisd_sbli(blk, n) sisd_slli(blk, n)
64#define sb_op(x, n) ((x)<<(n))
65#define sf_op(x, n) ((x)>>(n))
66#define cfzl __builtin_clzl
67#endif
68#if BYTE_ORDER == LITTLE_ENDIAN
69#ifdef TEMPLATED_SIMD_LIB
70static inline SIMD_type sisd_sfl(SIMD_type blk, SIMD_type n) {
71        return simd<128>::sll(blk, n);
72}
73
74static inline SIMD_type sisd_sbl(SIMD_type blk, SIMD_type n) {
75        return simd<128>::srl(blk, n);
76}
77#define sisd_sfli(blk, n) simd<128>::slli<n>(blk)
78#define sisd_sbli(blk, n) simd<128>::srli<n>(blk)
79#endif
80#ifndef TEMPLATED_SIMD_LIB
81static inline SIMD_type sisd_sfl(SIMD_type blk, SIMD_type n) {
82        return sisd_sll(blk, n);
83}
84static inline SIMD_type sisd_sbl(SIMD_type blk, SIMD_type n) {
85        return sisd_srl(blk, n);
86}
87#define sisd_sfli(blk, n) sisd_slli(blk, n)
88#define sisd_sbli(blk, n) sisd_srli(blk, n)
89#endif
90#define sb_op(x, n) ((x)>>(n))
91#define sf_op(x, n) ((x)<<(n))
92#ifdef __GNUC__
93#define cfzl __builtin_ctzl
94#endif
95#ifdef _MSC_VER
96#include <intrin.h>
97#pragma intrinsic(_BitScanForward)
/* MSVC replacement for the GCC count-trailing-zeroes builtin.
   Precondition: x > 0.  The index written by _BitScanForward is
   returned as-is; the intrinsic's own return value is not checked,
   which is why x must be nonzero. */
static inline unsigned long cfzl(unsigned long x) {
        unsigned long first_set_index;
        _BitScanForward(&first_set_index, x);
        return first_set_index;
}
104#endif
105#endif
106
107
108static inline int count_forward_zeroes(SIMD_type bits) {
109  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
110  v.vec = bits;
111  if (v.elems[0] != 0) return cfzl(v.elems[0]);
112  else if (v.elems[1] != 0) return LONG_BIT + cfzl(v.elems[1]);
113#ifdef _MSC_VER
114  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
115  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
116#endif
117#ifndef _MSC_VER
118#if LONG_BIT < 64
119  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
120  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
121#endif
122#endif
123  else return 8*sizeof(SIMD_type);
124}
125
126static inline unsigned long bitstream_segment_from(SIMD_type * stream, int bit_posn) {
127  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
128  return sb_op(*bitstream_ptr, bit_posn % 8);
129}
130
131/* Scans for a 1 as long as it takes.  Use a sentinel to fence.
132   Works for either endianness.  */
133static inline int bitstream_scan(SIMD_type * stream, int bit_posn) {
134  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
135  unsigned long bitstream_slice = sb_op(*bitstream_ptr, bit_posn % 8);
136  if (bitstream_slice != 0) return bit_posn + cfzl(bitstream_slice);
137  else {
138    do {
139      bitstream_ptr++;
140      bitstream_slice = *bitstream_ptr;
141    } while (bitstream_slice == 0);
142    int base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
143    return base_posn + cfzl(bitstream_slice);
144  }
145}
146
147static inline int bitstream_scan0(SIMD_type * stream) {
148  unsigned long * bitstream_ptr = (unsigned long *) stream;
149  unsigned long bitstream_slice = *bitstream_ptr;
150  int base_posn = 0;
151  while (bitstream_slice == 0) {
152    bitstream_ptr++;
153    bitstream_slice = *bitstream_ptr;
154  }
155  base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
156  return base_posn + cfzl(bitstream_slice);
157}
158
159
160/* Allocator for arrays of aligned SIMD data values.
161   Ideally the new operator could be used to allocate arrays
162   of vector data aligned on the required boundaries
163   (16-byte for SSE or Altivec).  But since this alignment
164   is not guaranteed except on Mac OS X, the following routine
165   is used. */
166
167static inline SIMD_type * simd_new(size_t SIMD_packs) {
168#ifdef __APPLE__
169        return new SIMD_type [SIMD_packs];
170#endif
171#ifdef _MSC_VER
172        SIMD_type * v = (SIMD_type*)_aligned_malloc(sizeof(SIMD_type) * SIMD_packs, sizeof(SIMD_type));
173        if (v != 0) return v;
174        else {
175                printf("Failed to allocated new array of %i SIMD packs.\n", SIMD_packs);
176                exit(-1);
177        }
178#endif
179#if !defined(__APPLE__) && !defined(_MSC_VER)
180        SIMD_type * v;
181        int rslt = posix_memalign((void **) &v,
182                                  sizeof(SIMD_type),
183                                  sizeof(SIMD_type) * SIMD_packs);
184        if (rslt == 0) return v;
185        else {
186                printf("Failed to allocated new array of %zu SIMD packs.\n", SIMD_packs);
187                exit(-1);
188        }
189#endif
190}
191
192static inline void simd_delete(SIMD_type * blk_ptr) {
193#ifdef __APPLE__
194  delete [] blk_ptr;
195#endif
196#ifndef __APPLE__
197  free((void *) blk_ptr);
198#endif
199}
200
201static void print_bit_block(char * var_name, SIMD_type v) {
202  union {SIMD_type vec; unsigned char elems[8];} x;
203  x.vec = v;
204  unsigned char c, bit_reversed;
205  int i;
206  printf("%20s = ", var_name);
207  for (i = 0; i < sizeof(SIMD_type); i++) {
208    c = x.elems[i];
209     printf("%02X ", c); 
210  }
211  printf("\n");
212}
213
/* Prints the register representation of a 32 bit value.

   Output is var_name right-justified in a 30-character field, " = ",
   then the four bytes of v as two-digit hex, taken from the highest
   memory address down to the lowest, and a newline.  On a
   little-endian machine this puts the most significant byte first. */
static void print_general_register_32(const char * var_name, uint32_t v) {
        const unsigned char * bytes = (const unsigned char *) &v;
        printf("%30s = ", var_name);
        for (int i = (int) sizeof(uint32_t) - 1; i >= 0; i--) {
                printf("%02X ", bytes[i]);
        }
        printf("\n");
}
225
/* Prints the register representation of a 64 bit value.

   Output is var_name right-justified in a 30-character field, " = ",
   then the eight bytes of v as two-digit hex, taken from the highest
   memory address down to the lowest, and a newline.  On a
   little-endian machine this puts the most significant byte first.

   The parameter is a full uint64_t; it was previously declared
   uint32_t, which dropped the upper half of the value. */
static void print_general_register_64(const char * var_name, uint64_t v) {
        const unsigned char * bytes = (const unsigned char *) &v;
        printf("%30s = ", var_name);
        for (int i = (int) sizeof(uint64_t) - 1; i >= 0; i--) {
                printf("%02X ", bytes[i]);
        }
        printf("\n");
}
237
238/* Prints the SIMD register representation of a SIMD value. */
239static void print_simd_register(const char * var_name, SIMD_type v) {
240  union {SIMD_type vec; unsigned char elems[8];} x;
241  x.vec = v;
242  unsigned char c, bit_reversed;
243  int i;
244  printf("%30s = ", var_name);
245  for(int i=sizeof(SIMD_type)-1; i>=0; i--) {
246    c = x.elems[i];
247    printf("%02X ", c); 
248  }
249  printf("\n");
250}
251
252#endif
253
Note: See TracBrowser for help on using the repository browser.