source: trunk/lib/lib_simd.h @ 179

Last change on this file since 179 was 179, checked in by lindanl, 11 years ago

Templated SIMD Library - initial version

File size: 4.7 KB
Line 
1/*  lib_simd_h:  SIMD Library including idealized SIMD operations
2    Copyright (C) 2008, Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters Inc.
5       under the Academic Free License version 3.0.
6
7    This file contains generic architecture-independent definitions,
8    importing architecture-specific implementations from appropriate
9    files.
10*/
11
12/*------------------------------------------------------------*/
13#ifndef LIB_SIMD_H
14#define LIB_SIMD_H
15#include <sys/types.h>
16#include <limits.h>
17
18#if (defined(__i386) || defined(__x86_64))
19#ifdef TEMPLATED_SIMD_LIB
20#include "sse_simd_t.h"
21#endif
22#ifndef TEMPLATED_SIMD_LIB
23#include "sse_simd.h"
24#endif
25#endif
26#ifdef _ARCH_PPC
27#include "altivec_simd.h"
28#endif
29
30/* Useful definitions from Linux kernel*/
/* Branch-prediction hints.
   likely(x)   - x is expected to be true most of the time.
   unlikely(x) - x is expected to be false most of the time.
   GCC-compatible compilers get __builtin_expect; every other compiler
   (including MSVC) gets a plain pass-through so the macros are always
   defined and are never defined twice with different bodies. */
#if defined(__GNUC__)
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
#else
#define likely(x) (x)
#define unlikely(x) (x)
#endif
39
40/* Shift forward and back operations, based on endianness */
/* Big-endian targets: a forward shift is a right shift and a back shift
   is a left shift.
   The test requires the BSD-style macros to actually be defined: in a
   preprocessor expression an undefined identifier evaluates to 0, so a
   bare "BYTE_ORDER == BIG_ENDIAN" is true whenever neither macro exists.
   When BYTE_ORDER is unavailable, the compiler-provided __BYTE_ORDER__
   (GCC and compatible compilers) is consulted instead. */
#if (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
    (!defined(BYTE_ORDER) && defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
     (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))
/* Register-wide shifts: variable-count (sfl/sbl) and immediate (sfli/sbli). */
#define sisd_sfl(blk, n) sisd_srl(blk, n)
#define sisd_sbl(blk, n) sisd_sll(blk, n)
#define sisd_sfli(blk, n) sisd_srli(blk, n)
#define sisd_sbli(blk, n) sisd_slli(blk, n)
/* Scalar shift-back / shift-forward on ordinary integer values. */
#define sb_op(x, n) ((x)<<(n))
#define sf_op(x, n) ((x)>>(n))
/* Forward zero count on an unsigned long: leading zeroes on this byte order. */
#define cfzl __builtin_clzl
#endif
50#if BYTE_ORDER == LITTLE_ENDIAN
51static inline SIMD_type sisd_sfl(SIMD_type blk, SIMD_type n) {
52        return sisd_sll(blk, n);
53}
54//#define sisd_sbl(blk, n) sisd_srl(blk, n)
55#define sisd_sfli(blk, n) sisd_slli(blk, n)
56#define sisd_sbli(blk, n) sisd_srli(blk, n)
57#define sb_op(x, n) ((x)>>(n))
58#define sf_op(x, n) ((x)<<(n))
59#ifdef __GNUC__
60#define cfzl __builtin_ctzl
61#endif
62#ifdef _MSC_VER
63#include <intrin.h>
64#pragma intrinsic(_BitScanForward)
65//  precondition: x > 0
66static inline unsigned long cfzl(unsigned long x) {
67        unsigned long zeroes;
68        _BitScanForward(&zeroes, x);
69        return zeroes;
70}
71#endif
72#endif
73
74
75static inline int count_forward_zeroes(SIMD_type bits) {
76  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
77  v.vec = bits;
78  if (v.elems[0] != 0) return cfzl(v.elems[0]);
79  else if (v.elems[1] != 0) return LONG_BIT + cfzl(v.elems[1]);
80#ifdef _MSC_VER
81  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
82  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
83#endif
84#ifndef _MSC_VER
85#if LONG_BIT < 64
86  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
87  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
88#endif
89#endif
90  else return 8*sizeof(SIMD_type);
91}
92
93
94/* Scans for a 1 as long as it takes.  Use a sentinel to fence.
95   Works for either endianness.  */
96static inline int bitstream_scan(SIMD_type * stream, int bit_posn) {
97  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
98  unsigned long bitstream_slice = sb_op(*bitstream_ptr, bit_posn % 8);
99  if (bitstream_slice != 0) return bit_posn + cfzl(bitstream_slice);
100  else {
101    do {
102      bitstream_ptr++;
103      bitstream_slice = *bitstream_ptr;
104    } while (bitstream_slice == 0);
105    int base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
106    return base_posn + cfzl(bitstream_slice);
107  }
108}
109
110static inline int bitstream_scan0(SIMD_type * stream) {
111  unsigned long * bitstream_ptr = (unsigned long *) stream;
112  unsigned long bitstream_slice = *bitstream_ptr;
113  int base_posn = 0;
114  while (bitstream_slice == 0) {
115    bitstream_ptr++;
116    bitstream_slice = *bitstream_ptr;
117  }
118  base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
119  return base_posn + cfzl(bitstream_slice);
120}
121
122
123/* Allocator for arrays of aligned SIMD data values.
124   Ideally the new operator could be used to allocate arrays
125   of vector data aligned on the required boundaries
126   (16-byte for SSE or Altivec).  But since this alignment
127   is not guaranteed except on Mac OS X, the following routine
128   is used. */
129
130static inline SIMD_type * simd_new(size_t SIMD_packs) {
131#ifdef __APPLE__
132        return new SIMD_type [SIMD_packs];
133#endif
134#ifdef _MSC_VER
135        SIMD_type * v = (SIMD_type*)_aligned_malloc(sizeof(SIMD_type) * SIMD_packs, sizeof(SIMD_type));
136        if (v != 0) return v;
137        else {
138                printf("Failed to allocated new array of %i SIMD packs.\n", SIMD_packs);
139                exit(-1);
140        }
141#endif
142#if !defined(__APPLE__) && !defined(_MSC_VER)
143        SIMD_type * v;
144        int rslt = posix_memalign((void **) &v,
145                                  sizeof(SIMD_type),
146                                  sizeof(SIMD_type) * SIMD_packs);
147        if (rslt == 0) return v;
148        else {
149                printf("Failed to allocated new array of %i SIMD packs.\n", SIMD_packs);
150                exit(-1);
151        }
152#endif
153}
154
155static inline void simd_delete(SIMD_type * blk_ptr) {
156#ifdef __APPLE__
157  delete [] blk_ptr;
158#endif
159#ifndef __APPLE__
160  free((void *) blk_ptr);
161#endif
162}
163
164#endif
165
Note: See TracBrowser for help on using the repository browser.