source: trunk/lib/lib_simd_avx.h @ 1498

Last change on this file since 1498 was 1498, checked in by cameron, 8 years ago

AVX library update

File size: 6.3 KB
Line 
1/*  lib_simd_h:  SIMD Library including idealized SIMD operations
2    Copyright (C) 2011, Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters Inc.
5       under the Academic Free License version 3.0.
6
7    This file contains generic architecture-independent definitions,
8    importing architecture-specific implementations from appropriate
9    files.
10*/
11
12/*------------------------------------------------------------*/
13#ifndef LIB_SIMD_H
14#define LIB_SIMD_H
15#include <sys/types.h>
16#include <limits.h>
17
18#define LONG_BIT 64
19
20#include "../lib/avx_simd.h"
21/* Useful typedefs and constants. */
22#include "types.h"
23
24
25#if BYTE_ORDER == LITTLE_ENDIAN
26#define sisd_sfl(blk, n) sisd_sll(blk, n)
27#define sisd_sbl(blk, n) sisd_srl(blk, n)
28#define sisd_sfli(blk, n) sisd_slli(blk, n)
29#define sisd_sbli(blk, n) sisd_srli(blk, n)
30#define sb_op(x, n) ((x)>>(n))
31#define sf_op(x, n) ((x)<<(n))
32
33#ifdef __GNUC__
34#define cfzl __builtin_ctzl
35#define cbzl __builtin_clzl
36#define likely(x) __builtin_expect((x),1)
37#define unlikely(x) __builtin_expect((x),0)
38#endif
39#ifdef _MSC_VER
40#include <intrin.h>
41#pragma intrinsic(_BitScanForward)
42//  precondition: x > 0
43static inline unsigned long cfzl(unsigned long x) {
44        unsigned long zeroes;
45        _BitScanForward(&zeroes, x);
46        return zeroes;
47}
48static inline unsigned long cbzl(unsigned long x) {
49        unsigned long zeroes;
50        _BitScanReverse(&zeroes, x);
51        return zeroes;
52}
53#endif
54#endif
55
56
57static inline int count_forward_zeroes(SIMD_type bits) {
58  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
59  v.vec = bits;
60  if (v.elems[0] != 0) return cfzl(v.elems[0]);
61  else if (v.elems[1] != 0) return LONG_BIT + cfzl(v.elems[1]);
62  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
63  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
64  else return 8*sizeof(SIMD_type);
65}
66
67static inline int count_backward_zeroes(SIMD_type bits) {
68  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
69  v.vec = bits;
70  if (v.elems[3] != 0) return cbzl(v.elems[3]);
71  else if (v.elems[2] != 0) return LONG_BIT + cbzl(v.elems[2]);
72  else if (v.elems[1] != 0) return 2*LONG_BIT + cbzl(v.elems[1]);
73  else if (v.elems[0] != 0) return 3*LONG_BIT + cbzl(v.elems[0]);
74  else return 8*sizeof(SIMD_type);
75}
76
77static inline unsigned long bitstream_segment_from(SIMD_type * stream, int bit_posn) {
78  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
79  return sb_op(*bitstream_ptr, bit_posn % 8);
80}
81
82/* Scans for a 1 as long as it takes.  Use a sentinel to fence.
83   Works for either endianness.  */
84static inline int bitstream_scan(SIMD_type * stream, int bit_posn) {
85  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
86  unsigned long bitstream_slice = sb_op(*bitstream_ptr, bit_posn % 8);
87  if (bitstream_slice != 0) return bit_posn + cfzl(bitstream_slice);
88  else {
89    do {
90      bitstream_ptr++;
91      bitstream_slice = *bitstream_ptr;
92    } while (bitstream_slice == 0);
93    int base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
94    return base_posn + cfzl(bitstream_slice);
95  }
96}
97
98static inline int bitstream_scan0(SIMD_type * stream) {
99  unsigned long * bitstream_ptr = (unsigned long *) stream;
100  unsigned long bitstream_slice = *bitstream_ptr;
101  int base_posn = 0;
102  while (bitstream_slice == 0) {
103    bitstream_ptr++;
104    bitstream_slice = *bitstream_ptr;
105  }
106  base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
107  return base_posn + cfzl(bitstream_slice);
108}
109
110
111/* Allocator for arrays of aligned SIMD data values.
112   Ideally the new operator could be used to allocate arrays
113   of vector data aligned on the required boundaries
114   (16-byte for SSE or Altivec).  But since this alignment
115   is not guaranteed except on Mac OS X, the following routine
116   is used. */
117
118static inline SIMD_type * simd_new(size_t SIMD_packs) {
119#ifdef __APPLE__
120        return new SIMD_type [SIMD_packs];
121#endif
122#ifdef _MSC_VER
123        SIMD_type * v = (SIMD_type*)_aligned_malloc(sizeof(SIMD_type) * SIMD_packs, sizeof(SIMD_type));
124        if (v != 0) return v;
125        else {
126                printf("Failed to allocated new array of %i SIMD packs.\n", SIMD_packs);
127                exit(-1);
128        }
129#endif
130#if !defined(__APPLE__) && !defined(_MSC_VER)
131        SIMD_type * v;
132        int rslt = posix_memalign((void **) &v,
133                                  sizeof(SIMD_type),
134                                  sizeof(SIMD_type) * SIMD_packs);
135        if (rslt == 0) return v;
136        else {
137                printf("Failed to allocated new array of %zu SIMD packs.\n", SIMD_packs);
138                exit(-1);
139        }
140#endif
141}
142
143static inline void simd_delete(SIMD_type * blk_ptr) {
144#ifdef __APPLE__
145  delete [] blk_ptr;
146#endif
147#ifndef __APPLE__
148  free((void *) blk_ptr);
149#endif
150}
151
152static void print_bit_block(char * var_name, SIMD_type v) {
153  union {SIMD_type vec; unsigned char elems[sizeof(SIMD_type)];} x;
154  x.vec = v;
155  unsigned char c;
156  int i;
157  printf("%20s = ", var_name);
158  for (i = 0; i < sizeof(SIMD_type); i++) {
159    c = x.elems[i];
160     printf("%02X ", c); 
161  }
162  printf("\n");
163}
164
/* Prints the register representation of a 32 bit value.
   Output: var_name right-justified in a 30-character field, " = ", then the
   bytes of v as two uppercase hex digits each, walking from the last byte in
   memory down to the first, then a newline. */
static void print_general_register_32(const char * var_name, uint32_t v) {
        const unsigned char * bytes = (const unsigned char *) &v;
        size_t remaining = sizeof(uint32_t);
        printf("%30s = ", var_name);
        while (remaining > 0) {
                remaining--;
                printf("%02X ", bytes[remaining]);
        }
        printf("\n");
}
175
/* Prints the register representation of a 64 bit value.
   Output: var_name right-justified in a 30-character field, " = ", then the
   bytes of v as two uppercase hex digits each, walking from the last byte in
   memory down to the first, then a newline. */
static void print_general_register_64(const char * var_name, uint64_t v) {
        const unsigned char * bytes = (const unsigned char *) &v;
        size_t remaining = sizeof(uint64_t);
        printf("%30s = ", var_name);
        while (remaining > 0) {
                remaining--;
                printf("%02X ", bytes[remaining]);
        }
        printf("\n");
}
186
187/* Prints the SIMD register representation of a SIMD value. */
188static void print_simd_register(const char * var_name, SIMD_type v) {
189  union {SIMD_type vec; unsigned char elems[sizeof(SIMD_type)];} x;
190  x.vec = v;
191  unsigned char c;
192  printf("%30s = ", var_name);
193  for(int i=sizeof(SIMD_type)-1; i>=0; i--) {
194    c = x.elems[i];
195    printf("%02X ", c); 
196  }
197  printf("\n");
198}
199
/* Prints the array values little endian, right to left.
   Output: var_name right-justified in a 30-character field, " = ", then the
   size characters of buf from index size-1 down to index 0, then a newline.
   A zero byte is shown as the character 0, and a single space is emitted
   after every character whose index is a multiple of 8. */
static void print_array_le(const char * var_name, char * buf, int size) {
  int idx = size;
  printf("%30s = ", var_name);
  while (idx > 0) {
    unsigned char ch;
    idx--;
    ch = buf[idx];
    if (ch != 0) {
      printf("%c", ch);
    } else {
      printf("0");      // print zero for UTF code point 0
    }
    if (idx % 8 == 0) {
      printf(" ");      // print a single space separator
    }
  }
  printf("\n");
}
217
218#endif
219
Note: See TracBrowser for help on using the repository browser.