source: trunk/lib/lib_simd_avx.h @ 1234

Last change on this file since 1234 was 960, checked in by cameron, 8 years ago

Rename AVX library versions

File size: 6.3 KB
Line 
1/*  lib_simd_h:  SIMD Library including idealized SIMD operations
2    Copyright (C) 2011, Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters Inc.
5       under the Academic Free License version 3.0.
6
7    This file contains generic architecture-independent definitions,
8    importing architecture-specific implementations from appropriate
9    files.
10*/
11
12/*------------------------------------------------------------*/
13#ifndef LIB_SIMD_H
14#define LIB_SIMD_H
15#include <sys/types.h>
16#include <limits.h>
17
/* Number of bits in a long, as assumed by the word-wise scans below.
   <limits.h> already provides LONG_BIT on XSI-conforming systems; the
   fallback of 64 is only installed when it does not, so the platform's
   own value is never overridden by a conflicting redefinition. */
#ifndef LONG_BIT
#define LONG_BIT 64
#endif
19
20#include "../lib/avx_simd.h"
21
22#if BYTE_ORDER == LITTLE_ENDIAN
/* Direction-named shifts for the little-endian configuration selected
   by the enclosing #if.  "sf" (shift forward) forwards to the sll
   operations / the << operator; "sb" (shift back) forwards to the srl
   operations / the >> operator.  The trailing "i" variants forward to
   the slli/srli forms (presumably the immediate-count versions --
   confirm against avx_simd.h). */

/* Whole-register forms. */
#define sisd_sfl(blk, n)  sisd_sll(blk, n)
#define sisd_sbl(blk, n)  sisd_srl(blk, n)
#define sisd_sfli(blk, n) sisd_slli(blk, n)
#define sisd_sbli(blk, n) sisd_srli(blk, n)

/* Scalar forms for ordinary integer operands. */
#define sb_op(x, n) ((x)>>(n))
#define sf_op(x, n) ((x)<<(n))
29
#ifdef __GNUC__
/* cfzl / cbzl: count of zero bits below the lowest / above the highest
   set bit of an unsigned long, via the GCC builtins.  Both builtins are
   undefined for an argument of 0. */
#define cfzl __builtin_ctzl
#define cbzl __builtin_clzl
/* Branch-prediction hints.  The condition is normalised with !! so that
   any non-zero value (not just 1) matches the expected value, and so
   that pointer-valued conditions are accepted. */
#define likely(x) __builtin_expect(!!(x),1)
#define unlikely(x) __builtin_expect(!!(x),0)
#endif
36#ifdef _MSC_VER
37#include <intrin.h>
38#pragma intrinsic(_BitScanForward)
//  Count forward zeroes (MSVC version): hands x to _BitScanForward and
//  returns the bit index that the intrinsic stores.
//  precondition: x > 0
static inline unsigned long cfzl(unsigned long x) {
        unsigned long first_set_index;
        _BitScanForward(&first_set_index, x);
        return first_set_index;
}
#pragma intrinsic(_BitScanReverse)
//  Count backward zeroes (MSVC version): the number of zero bits above
//  the highest set bit of x.  _BitScanReverse stores the bit *index* of
//  the highest set bit, so it is subtracted from the top bit index to
//  give a leading-zero count, matching the __builtin_clzl definition
//  used for cbzl in the __GNUC__ branch.
//  precondition: x > 0
static inline unsigned long cbzl(unsigned long x) {
        unsigned long highest_set_index;
        _BitScanReverse(&highest_set_index, x);
        return (unsigned long) (CHAR_BIT * sizeof(unsigned long) - 1) - highest_set_index;
}
50#endif
51#endif
52
53
54static inline int count_forward_zeroes(SIMD_type bits) {
55  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
56  v.vec = bits;
57  if (v.elems[0] != 0) return cfzl(v.elems[0]);
58  else if (v.elems[1] != 0) return LONG_BIT + cfzl(v.elems[1]);
59  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
60  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
61  else return 8*sizeof(SIMD_type);
62}
63
64static inline int count_backward_zeroes(SIMD_type bits) {
65  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
66  v.vec = bits;
67  if (v.elems[3] != 0) return cbzl(v.elems[3]);
68  else if (v.elems[2] != 0) return LONG_BIT + cbzl(v.elems[2]);
69  else if (v.elems[1] != 0) return 2*LONG_BIT + cbzl(v.elems[1]);
70  else if (v.elems[0] != 0) return 3*LONG_BIT + cbzl(v.elems[0]);
71  else return 8*sizeof(SIMD_type);
72}
73
74static inline unsigned long bitstream_segment_from(SIMD_type * stream, int bit_posn) {
75  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
76  return sb_op(*bitstream_ptr, bit_posn % 8);
77}
78
79/* Scans for a 1 as long as it takes.  Use a sentinel to fence.
80   Works for either endianness.  */
81static inline int bitstream_scan(SIMD_type * stream, int bit_posn) {
82  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
83  unsigned long bitstream_slice = sb_op(*bitstream_ptr, bit_posn % 8);
84  if (bitstream_slice != 0) return bit_posn + cfzl(bitstream_slice);
85  else {
86    do {
87      bitstream_ptr++;
88      bitstream_slice = *bitstream_ptr;
89    } while (bitstream_slice == 0);
90    int base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
91    return base_posn + cfzl(bitstream_slice);
92  }
93}
94
95static inline int bitstream_scan0(SIMD_type * stream) {
96  unsigned long * bitstream_ptr = (unsigned long *) stream;
97  unsigned long bitstream_slice = *bitstream_ptr;
98  int base_posn = 0;
99  while (bitstream_slice == 0) {
100    bitstream_ptr++;
101    bitstream_slice = *bitstream_ptr;
102  }
103  base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
104  return base_posn + cfzl(bitstream_slice);
105}
106
107
108/* Allocator for arrays of aligned SIMD data values.
109   Ideally the new operator could be used to allocate arrays
110   of vector data aligned on the required boundaries
111   (16-byte for SSE or Altivec).  But since this alignment
112   is not guaranteed except on Mac OS X, the following routine
113   is used. */
114
115static inline SIMD_type * simd_new(size_t SIMD_packs) {
116#ifdef __APPLE__
117        return new SIMD_type [SIMD_packs];
118#endif
119#ifdef _MSC_VER
120        SIMD_type * v = (SIMD_type*)_aligned_malloc(sizeof(SIMD_type) * SIMD_packs, sizeof(SIMD_type));
121        if (v != 0) return v;
122        else {
123                printf("Failed to allocated new array of %i SIMD packs.\n", SIMD_packs);
124                exit(-1);
125        }
126#endif
127#if !defined(__APPLE__) && !defined(_MSC_VER)
128        SIMD_type * v;
129        int rslt = posix_memalign((void **) &v,
130                                  sizeof(SIMD_type),
131                                  sizeof(SIMD_type) * SIMD_packs);
132        if (rslt == 0) return v;
133        else {
134                printf("Failed to allocated new array of %zu SIMD packs.\n", SIMD_packs);
135                exit(-1);
136        }
137#endif
138}
139
140static inline void simd_delete(SIMD_type * blk_ptr) {
141#ifdef __APPLE__
142  delete [] blk_ptr;
143#endif
144#ifndef __APPLE__
145  free((void *) blk_ptr);
146#endif
147}
148
149static void print_bit_block(char * var_name, SIMD_type v) {
150  union {SIMD_type vec; unsigned char elems[sizeof(SIMD_type)];} x;
151  x.vec = v;
152  unsigned char c;
153  int i;
154  printf("%20s = ", var_name);
155  for (i = 0; i < sizeof(SIMD_type); i++) {
156    c = x.elems[i];
157     printf("%02X ", c); 
158  }
159  printf("\n");
160}
161
/* Prints the register representation of a 32 bit value: var_name
   right-justified in a 30-character field, then the bytes of v in hex
   from the highest-addressed byte down to the lowest, then a newline. */
static void print_general_register_32(const char * var_name, uint32_t v) {
        const unsigned char * bytes = (const unsigned char *) &v;
        size_t remaining = sizeof(uint32_t);
        printf("%30s = ", var_name);
        while (remaining > 0) {
                remaining--;
                printf("%02X ", bytes[remaining]);
        }
        printf("\n");
}
172
/* Prints the register representation of a 64 bit value: var_name
   right-justified in a 30-character field, then the bytes of v in hex
   from the highest-addressed byte down to the lowest, then a newline. */
static void print_general_register_64(const char * var_name, uint64_t v) {
        const unsigned char * bytes = (const unsigned char *) &v;
        size_t remaining = sizeof(uint64_t);
        printf("%30s = ", var_name);
        while (remaining > 0) {
                remaining--;
                printf("%02X ", bytes[remaining]);
        }
        printf("\n");
}
183
184/* Prints the SIMD register representation of a SIMD value. */
185static void print_simd_register(const char * var_name, SIMD_type v) {
186  union {SIMD_type vec; unsigned char elems[sizeof(SIMD_type)];} x;
187  x.vec = v;
188  unsigned char c;
189  printf("%30s = ", var_name);
190  for(int i=sizeof(SIMD_type)-1; i>=0; i--) {
191    c = x.elems[i];
192    printf("%02X ", c); 
193  }
194  printf("\n");
195}
196
/* Prints the array values little endian, right to left: var_name
   right-justified in a 30-character field, then the `size` characters
   of buf from the last index down to index 0, then a newline.
   A zero byte is shown as the digit 0, and a space is emitted after
   every character whose index is a multiple of 8. */
static void print_array_le(const char * var_name, char * buf, int size) {
  printf("%30s = ", var_name);
  for (int left = size; left > 0; left--) {
    const int idx = left - 1;
    const unsigned char ch = buf[idx];
    if (ch != 0) {
      printf("%c", ch);
    } else {
      printf("0");      // print zero for UTF code point 0
    }
    if (idx%8 == 0) {
      printf(" ");      // print a single space separator
    }
  }
  printf("\n");
}
214
215#endif
216
Note: See TracBrowser for help on using the repository browser.