Changeset 959


Ignore:
Timestamp:
Mar 21, 2011, 5:14:08 PM (8 years ago)
Author:
cameron
Message:

SIMD library for AVX; initial check-in

Location:
trunk/lib
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/block_carry.h

    r759 r959  
    3232
    3333/*------------------------------------------------------------*/
    34 #include "sse_simd.h"
     34#include "avx_simd.h"
    3535
    3636#define SIMD_CARRY_STRATEGY 1
     
    3939
    4040#ifdef ADC64
    41 #ifdef SAHFLAHF
    42 #define CARRY_STRATEGY ADC64_SAHF_STRATEGY
    43 #else
    4441#define CARRY_STRATEGY ADC64_STRATEGY
    45 #endif
    46 #else
    47 #ifdef SAHFLAHF
    48 #define CARRY_STRATEGY ADC64_SAHF_STRATEGY
    4942#else
    5043#define CARRY_STRATEGY SIMD_CARRY_STRATEGY
    5144#endif
    52 #endif
    5345
    5446#if (CARRY_STRATEGY == ADC64_STRATEGY)
    5547typedef uint64_t CarryType;
     48typedef union {SIMD_type bitblock; uint64_t int64[4];} SIMD256_int64;
    5649
    5750#define Carry0 0
    5851
    59 #define test_carry(x) ((x) > 0)
     52#define test_carry(x) ((x) != 0)
    6053
    6154#define carry_or(carry1, carry2) (carry1 | carry2)
    6255
    63 #define clc() \
    64   __asm__ __volatile__ ("clc\n\t":::)
    65 
    66 #define adc(x,y,carry,sum) \
    67   __asm__ __volatile__ ("add %[carryflag], %[e]\n\t" \
    68         "adc %[e], %[z]\n\t" \
    69         "mov $0, %1\n\t" \
    70         "adc %[carryflag], %1\n\t" \
    71         : [z] "=r" (sum), [carryflag] "=a" (carry) \
    72         : "[z]" (x), [e] "r" (y), "[carryflag]" (carry) \
    73         : "cc")
    74 
    75 #define double_int64_adc(x1, x2, y1, y2, rslt1, rslt2, carry) \
    76    __asm__ __volatile__ ("neg %[carryflag]\n\t" \
    77          "adc %[e1], %[z1]\n\t" \
    78          "adc %[e2], %[z2]\n\t" \
    79          "mov $0, %[carryflag]\n\t" \
    80          "adc $0, %[carryflag]\n\t" \
    81      : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=r" (carry) \
    82          : "[z1]" (x1), "[z2]" (x2), \
    83            [e1] "r" (y1), [e2] "r" (y2), \
    84            "[carryflag]" (carry) \
    85          : "cc")
    86 
    87 
    88 #define adc128(first, second, carry, sum) \
    89 do {\
    90   BitBlock_int64 rslt, x, y;\
    91   x.bitblock = first;\
    92   y.bitblock = second;\
    93   double_int64_adc(x.int64[0], x.int64[1], y.int64[0], y.int64[1], rslt.int64[0], rslt.int64[1], carry);\
    94   sum = rslt.bitblock;\
    95 } while(0)
    96 
    97 
    98 
    99 #define double_int64_advance(x1, x2, rslt1, rslt2, carry) \
    100   __asm__  (\
    101         "add %[z1], %[z1]\n\t" \
    102         "adc %[z2], %[z2]\n\t" \
    103         "lea 0(%[carryflag], %[z1]), %[z1]\n\t" \
    104         "setc %%al\n\t" \
    105          : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
    106          : "[z1]" (x1), "[z2]" (x2), \
    107            "[carryflag]" (carry) \
    108          : "cc")
    109 
    110 /*  Slow
    111 #define double_int64_advance(x1, x2, rslt1, rslt2, carry) \
    112   __asm__  (\
    113         "shld $1, %[z1], %[z2]\n\t" \
    114         "lea 0(%[carryflag], %[z1], 2), %[z1]\n\t" \
    115         "setc %%al\n\t" \
    116          : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
    117          : "[z1]" (x1), "[z2]" (x2), \
    118            "[carryflag]" (carry) \
    119          : "cc")
    120 */
    121 
    122 #define advance_with_carry(cursor, carry, rslt)\
    123 do {\
    124   BitBlock_int64 x, z;\
    125   x.bitblock = cursor;\
    126   double_int64_advance(x.int64[0], x.int64[1], z.int64[0], z.int64[1], carry);\
    127   rslt = z.bitblock;\
    128 } while(0)
    129 
    130 #define double_int64_sbb(x1, x2, y1, y2, rslt1, rslt2, brw) \
    131   __asm__  ("neg %[borrowflag]\n\t" \
    132         "sbb %[e1], %[z1]\n\t" \
    133         "sbb %[e2], %[z2]\n\t" \
    134          "mov $0, %[borrowflag]\n\t" \
    135          "sbb $0, %[borrowflag]\n\t" \
    136      : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [borrowflag] "=a" (brw) \
    137          : "[z1]" (x1), "[z2]" (x2), \
    138            [e1] "r" (y1), [e2] "r" (y2), \
    139            "[borrowflag]" (brw) \
    140          : "cc")
    141 
    142 #define sbb128(first, second, borrow, diff) \
    143 do {\
    144   BitBlock_int64 rslt, x, y;\
    145   x.bitblock = first;\
    146   y.bitblock = second;\
    147   double_int64_sbb(x.int64[0], x.int64[1], y.int64[0], y.int64[1], \
    148                    rslt.int64[0], rslt.int64[1], borrow);\
    149   diff = rslt.bitblock;\
    150 } while(0)
    151 
     56
     57static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) __attribute__((always_inline));
     58static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) __attribute__((always_inline));
     59static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) __attribute__((always_inline));
     60
     61
     62static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) {
     63  SIMD256_int64 a, b, rslt;
     64//printf("carryin = %lu\n",carry);
     65//print_simd_register("x", x);
     66//print_simd_register("y", y);
     67  a.bitblock = x;
     68  b.bitblock = y;
     69  asm volatile("negq %[carryflag]\n\t"
     70       "movq 0(%[xaddr]), %[r0]\n\t"
     71       "adcq 0(%[yaddr]), %[r0]\n\t"
     72       "movq 8(%[xaddr]), %[r1]\n\t"
     73       "adcq 8(%[yaddr]), %[r1]\n\t"
     74       "movq 16(%[xaddr]), %[r2]\n\t"
     75       "adcq 16(%[yaddr]), %[r2]\n\t"
     76       "movq 24(%[xaddr]), %[r3]\n\t"
     77       "adcq 24(%[yaddr]), %[r3]\n\t"
     78       "movq $0, %[carryflag]\n\t"
     79       "adcq $0, %[carryflag]\n\t"
     80        : [carryflag] "=&r" (carry),
     81          [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3])
     82        : "[carryflag]" (carry), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock)
     83        : "cc");
     84  sum = rslt.bitblock;
     85//printf("carryout = %lu\n",carry);
     86//print_simd_register("sum", sum);
     87}
     88 
     89static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) {
     90  SIMD256_int64 a, b, rslt;
     91//printf("borrowin = %lu\n",borrow);
     92//print_simd_register("x", x);
     93//print_simd_register("y", y);
     94  a.bitblock = x;
     95  b.bitblock = y;
     96  asm volatile("negq %[carryflag]\n\t"
     97       "movq 0(%[xaddr]), %[r0]\n\t"
     98       "sbbq 0(%[yaddr]), %[r0]\n\t"
     99       "movq 8(%[xaddr]), %[r1]\n\t"
     100       "sbbq 8(%[yaddr]), %[r1]\n\t"
     101       "movq 16(%[xaddr]), %[r2]\n\t"
     102       "sbbq 16(%[yaddr]), %[r2]\n\t"
     103       "movq 24(%[xaddr]), %[r3]\n\t"
     104       "sbbq 24(%[yaddr]), %[r3]\n\t"
     105       "movq $0, %[carryflag]\n\t"
     106       "adcq $0, %[carryflag]\n\t"
     107        : [carryflag] "=&r" (borrow),
     108          [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3])
     109        : "[carryflag]" (borrow), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock)
     110        : "cc");
     111  diff = rslt.bitblock;
     112//printf("borrowout = %lu\n",borrow);
     113//print_simd_register("diff", diff);
     114}
     115
     116static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) {
     117  SIMD256_int64 r;
     118  SIMD_type a = x;
     119//printf("shift in = %lu\n",carry);
     120//print_simd_register("x", x);
     121  asm volatile("negq %[carryflag]\n\t"
     122       "movq 0(%[xaddr]), %[r0]\n\t"
     123       "adcq %[r0], %[r0]\n\t"
     124       "movq 8(%[xaddr]), %[r1]\n\t"
     125       "adcq %[r1], %[r1]\n\t"
     126       "movq 16(%[xaddr]), %[r2]\n\t"
     127       "adcq %[r2], %[r2]\n\t"
     128       "movq 24(%[xaddr]), %[r3]\n\t"
     129       "adcq %[r3], %[r3]\n\t"
     130       "movq $0, %[carryflag]\n\t"
     131       "adcq $0, %[carryflag]\n\t"
     132        : [carryflag] "=&r" (carry),
     133          [r0] "=&r" (r.int64[0]), [r1] "=&r" (r.int64[1]), [r2] "=&r" (r.int64[2]), [r3] "=&r" (r.int64[3])
     134        : "[carryflag]" (carry), [xaddr] "r" (&a)
     135        : "cc");
     136  rslt = r.bitblock;
     137//printf("shift out = %lu\n",carry);
     138//print_simd_register("rslt", rslt);
     139}
    152140
    153141#endif
     
    233221#if (CARRY_STRATEGY == SIMD_CARRY_STRATEGY)
    234222
     223typedef __m128i sse_type;
     224
     225
     226
     227#define sse_or(b1, b2) _mm_or_si128(b1, b2)
     228#define sse_and(b1, b2) _mm_and_si128(b1, b2)
     229#define sse_xor(b1, b2) _mm_xor_si128(b1, b2)
     230#define sse_andc(b1, b2) _mm_andnot_si128(b2, b1)
     231#define sse_if(cond, then_val, else_val) \
     232  sse_or(sse_and(then_val, cond), sse_andc(else_val, cond))
     233#define sse_not(b) (sse_xor(b, _mm_set1_epi32(0xFFFFFFFF)))
     234#define sse_nor(a,b) (sse_not(sse_or(a,b)))
     235
     236#define sse_slli_64(r, shft) _mm_slli_epi64(r, shft)
     237#define sse_srli_64(r, shft) _mm_srli_epi64(r, shft)
     238#define sse_mergel_64(a, b) _mm_unpacklo_epi64(b, a)
     239#define sse_sub_64(a, b) _mm_sub_epi64(a, b)
     240#define sse_add_64(a, b) _mm_add_epi64(a, b)
     241
     242#define sse_slli_128(r, shft) \
     243  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
     244   (shft) >= 64 ? sse_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \
     245   sse_or(sse_slli_64(r, shft), _mm_slli_si128(sse_srli_64(r, 64-(shft)), 8)))
     246
     247#define sse_srli_128(r, shft) \
     248  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
     249   (shft) >= 64 ? sse_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \
     250   sse_or(sse_srli_64(r, shft), _mm_srli_si128(sse_slli_64(r, 64-(shft)), 8)))
     251
     252#define sse_to_int(x) _mm_cvtsi128_si32(x)
     253
     254#define sse_from_int(n) _mm_cvtsi32_si128(n)
     255
     256
     257/*
    235258typedef SIMD_type CarryType;
    236259
     
    240263
    241264#define carry_or(carry1, carry2) simd_or(carry1, carry2)
     265*/
     266
     267#define sse_CARRYTYPE
     268#ifdef uint32_CARRYTYPE
     269typedef uint32_t CarryType;
     270
     271#define Carry0 0
     272
     273#define test_carry(x) ((x) != 0)
     274
     275#define carry_or(carry1, carry2) (carry1 | carry2)
     276
     277#define sse_from_CarryType(c) sse_from_int(c)
     278
     279#define sse_to_CarryType(c) sse_to_int(c)
     280#endif
     281
     282#ifdef sse_CARRYTYPE
     283#define CarryType sse_type
     284
     285#define Carry0 (_mm_set1_epi32(0))
     286
     287#define test_carry(x) (!_mm_testz_si128(x, x))
     288
     289#define carry_or(carry1, carry2) sse_or(carry1, carry2)
     290
     291#define sse_from_CarryType(c) c
     292
     293#define sse_to_CarryType(c) c
     294#endif
     295
     296
     297
     298
    242299
    243300#define adc128(x, y, carry,  sum) \
    244301do{ \
    245   SIMD_type gen = simd_and(x, y); \
    246   SIMD_type prop = simd_or(x, y); \
    247   SIMD_type partial = simd_add_64(simd_add_64(x, y), carry); \
    248   SIMD_type c1 = sisd_slli(simd_srli_64(simd_or(gen, simd_andc(prop, partial)), 63), 64); \
    249   sum = simd_add_64(c1, partial); \
    250   carry = sisd_srli(simd_or(gen, simd_andc(prop, sum)), 127); \
     302  sse_type gen = sse_and(x, y); \
     303  sse_type prop = sse_or(x, y); \
     304  sse_type partial = sse_add_64(sse_add_64(x, y), carry); \
     305  sse_type c1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_andc(prop, partial)), 63), 64); \
     306  sum = sse_add_64(c1, partial); \
     307  carry = sse_srli_128(sse_or(gen, sse_andc(prop, sum)), 127); \
    251308} while(0)
    252309
     
    254311#define sbb128(x, y, borrow, difference) \
    255312do {\
    256   SIMD_type gen = simd_andc(y, x); \
    257   SIMD_type prop = simd_not(simd_xor(x, y)); \
    258   SIMD_type partial = simd_sub_64(simd_sub_64(x, y), borrow); \
    259   SIMD_type b1 = sisd_slli(simd_srli_64(simd_or(gen, simd_and(prop, partial)), 63), 64); \
    260   difference = simd_sub_64(partial, b1); \
    261   borrow = sisd_srli(simd_or(gen, simd_and(prop, difference)), 127); \
     313  sse_type gen = sse_andc(y, x); \
     314  sse_type prop = sse_not(sse_xor(x, y)); \
     315  sse_type partial = sse_sub_64(sse_sub_64(x, y), borrow); \
     316  sse_type b1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_and(prop, partial)), 63), 64); \
     317  difference = sse_sub_64(partial, b1); \
     318  borrow = sse_srli_128(sse_or(gen, sse_and(prop, difference)), 127); \
    262319}while(0)
    263320
    264 
    265321#define advance_with_carry(cursor, carry, rslt)\
    266322do {\
    267   SIMD_type shift_out = simd_srli_64(cursor, 63);\
    268   SIMD_type low_bits = simd_mergel_64(shift_out, carry);\
    269   carry = sisd_srli(shift_out, 64);\
    270   rslt = simd_or(simd_add_64(cursor, cursor), low_bits);\
    271 } while(0)
    272 
    273 #endif
    274 #endif
    275 
    276 
     323  sse_type shift_out = sse_srli_64(cursor, 63);\
     324  sse_type low_bits = sse_mergel_64(shift_out, carry);\
     325  carry = sse_srli_128(shift_out, 64);\
     326  rslt = sse_or(sse_add_64(cursor, cursor), low_bits);\
     327} while(0)
     328
     329#define adc256(x, y, carry,  sum) \
     330do {\
     331        __m128i x0 = simd_lo128(x);\
     332        __m128i x1 = simd_hi128(x);\
     333        __m128i y0 = simd_lo128(y);\
     334        __m128i y1 = simd_hi128(y);\
     335        __m128i cry = sse_from_CarryType(carry);\
     336        __m128i s0, s1;\
     337        adc128(x0, y0, cry, s0);\
     338        adc128(x1, y1, cry, s1);\
     339        sum = simd_combine256(s1, s0);\
     340        carry = sse_to_CarryType(cry);\
     341} while(0)
     342
     343#define sbb256(x, y, borrow, diff) \
     344do {\
     345        __m128i x0 = simd_lo128(x);\
     346        __m128i x1 = simd_hi128(x);\
     347        __m128i y0 = simd_lo128(y);\
     348        __m128i y1 = simd_hi128(y);\
     349        __m128i brw = sse_from_CarryType(borrow);\
     350        __m128i d0, d1;\
     351        sbb128(x0, y0, brw, d0);\
     352        sbb128(x1, y1, brw, d1);\
     353        diff = simd_combine256(d1, d0);\
     354        borrow = sse_to_CarryType(brw);\
     355} while(0)
     356
     357#define advance_with_carry256(cursor, carry, rslt)\
     358do {\
     359        __m128i cursor0 = simd_lo128(cursor);\
     360        __m128i cursor1 = simd_hi128(cursor);\
     361        __m128i cry = sse_from_CarryType(carry);\
     362        __m128i rslt0, rslt1;\
     363        advance_with_carry(cursor0, cry, rslt0);\
     364        advance_with_carry(cursor1, cry, rslt1);\
     365        rslt = simd_combine256(rslt1, rslt0);\
     366        carry = sse_to_CarryType(cry);\
     367} while(0)
     368
     369
     370#endif
     371
     372
     373#endif
  • trunk/lib/carryQ.h

    r947 r959  
    133133static inline BitBlock BitBlock_advance_ci_co(BitBlock strm, CarryQtype cq, const int carryno) {
    134134        BitBlock rslt;
    135         advance_with_carry(strm, cq[carryno], rslt);
     135        advance_with_carry256(strm, cq[carryno], rslt);
    136136        return rslt;
    137137}
     
    140140        BitBlock rslt;
    141141        cq[carryno] = Carry0;
    142         advance_with_carry(strm, cq[carryno], rslt);
     142        advance_with_carry256(strm, cq[carryno], rslt);
    143143        return rslt;
    144144}
     
    147147        BitBlock rslt;
    148148        CarryType c = cq[carryno];
    149         advance_with_carry(strm, c, rslt);
     149        advance_with_carry256(strm, c, rslt);
    150150        return rslt;
    151151}
     
    157157static inline BitBlock BitBlock_add_ci_co(BitBlock strm1, BitBlock strm2, CarryQtype cq, const int carryno) {
    158158        BitBlock sum;
    159         adc128(strm1, strm2, cq[carryno], sum);
     159        adc256(strm1, strm2, cq[carryno], sum);
    160160        return sum;
    161161}
     
    164164        BitBlock sum;
    165165        cq[carryno] = Carry0;
    166         adc128(strm1, strm2, cq[carryno], sum);
     166        adc256(strm1, strm2, cq[carryno], sum);
    167167        return sum;
    168168}
     
    171171        BitBlock sum;
    172172        CarryType c = cq[carryno];
    173         adc128(strm1, strm2, c, sum);
     173        adc256(strm1, strm2, c, sum);
    174174        return sum;
    175175}
     
    178178        BitBlock sum;
    179179        CarryType c = Carry0;
    180         adc128(strm1, strm2, c, sum);
     180        adc256(strm1, strm2, c, sum);
    181181        return sum;
    182182}
     
    184184static inline BitBlock BitBlock_sub_ci_co(BitBlock strm1, BitBlock strm2, CarryQtype cq, const int carryno) {
    185185        BitBlock diff;
    186         sbb128(strm1, strm2, cq[carryno], diff);
     186        sbb256(strm1, strm2, cq[carryno], diff);
    187187        return diff;
    188188}
     
    191191        BitBlock diff;
    192192        cq[carryno] = Carry0;
    193         sbb128(strm1, strm2, cq[carryno], diff);
     193        sbb256(strm1, strm2, cq[carryno], diff);
    194194        return diff;
    195195}
     
    198198        BitBlock diff;
    199199        CarryType c = cq[carryno];
    200         sbb128(strm1, strm2, c, diff);
     200        sbb256(strm1, strm2, c, diff);
    201201        return diff;
    202202}
     
    205205        BitBlock diff;
    206206        CarryType c = Carry0;
    207         sbb128(strm1, strm2, c, diff);
     207        sbb256(strm1, strm2, c, diff);
    208208        return diff;
    209209}
     
    211211static inline BitBlock BitBlock_scanthru_ci_co(BitBlock markers0, BitBlock charclass, CarryQtype cq, const int carryno) {
    212212        BitBlock markers1;
    213         adc128(markers0, charclass, cq[carryno], markers1);
     213        adc256(markers0, charclass, cq[carryno], markers1);
    214214        return simd_andc(markers1, charclass);
    215215}
     
    218218        BitBlock markers1;
    219219        cq[carryno] = Carry0;
    220         adc128(markers0, charclass, cq[carryno], markers1);
     220        adc256(markers0, charclass, cq[carryno], markers1);
    221221        return simd_andc(markers1, charclass);
    222222}
     
    225225        BitBlock markers1;
    226226        CarryType c = cq[carryno];
    227         adc128(markers0, charclass, c, markers1);
     227        adc256(markers0, charclass, c, markers1);
    228228        return simd_andc(markers1, charclass);
    229229}
     
    232232        BitBlock markers1;
    233233        CarryType c = Carry0;
    234         adc128(markers0, charclass, c, markers1);
     234        adc256(markers0, charclass, c, markers1);
    235235        return simd_andc(markers1, charclass);
    236236}
     
    238238static inline BitBlock BitBlock_scanto_ci_co(BitBlock markers0, BitBlock charclass, CarryQtype cq, const int carryno) {
    239239        BitBlock markers1;
    240         adc128(markers0, simd_not(charclass), cq[carryno], markers1);
     240        adc256(markers0, simd_not(charclass), cq[carryno], markers1);
    241241        return simd_and(markers1, charclass);
    242242}
     
    245245        BitBlock markers1;
    246246        cq[carryno] = Carry0;
    247         adc128(markers0, simd_not(charclass), cq[carryno], markers1);
     247        adc256(markers0, simd_not(charclass), cq[carryno], markers1);
    248248        return simd_and(markers1, charclass);
    249249}
     
    253253        CarryType c = cq[carryno];
    254254        BitBlock scanclass = simd_andc(EOF_mask, charclass);
    255         adc128(markers0, scanclass, c, markers1);
     255        adc256(markers0, scanclass, c, markers1);
    256256        return simd_andc(markers1, scanclass);
    257257}
     
    261261        CarryType c = Carry0;
    262262        BitBlock scanclass = simd_andc(EOF_mask, charclass);
    263         adc128(markers0, scanclass, c, markers1);
     263        adc256(markers0, scanclass, c, markers1);
    264264        return simd_andc(markers1, scanclass);
    265265}
  • trunk/lib/lib_simd.h

    r948 r959  
    11/*  lib_simd_h:  SIMD Library including idealized SIMD operations
    2     Copyright (C) 2008, Robert D. Cameron
     2    Copyright (C) 2011, Robert D. Cameron
    33    Licensed to the public under the Open Software License 3.0.
    44    Licensed to International Characters Inc.
     
    1616#include <limits.h>
    1717
    18 #ifndef LONG_BIT
    19 #if ULONG_MAX == 0xFFFFFFFF
    20 #define LONG_BIT 32
    21 #endif
    22 #if ULONG_MAX == 0xFFFFFFFFFFFFFFFF
    2318#define LONG_BIT 64
    24 #endif
    25 #endif
    26 
    27 #if (defined(__i386) || defined(__x86_64))
    28 #ifdef TEMPLATED_SIMD_LIB
    29 #include "sse_simd_t.h"
    30 #endif
    31 #ifndef TEMPLATED_SIMD_LIB
    32 #include "sse_simd.h"
    33 #endif
    34 #endif
    35 #ifdef _ARCH_PPC
    36 #include "altivec_simd.h"
    37 #endif
    38 
    39 /* Useful definitions from Linux kernel*/
    40 #ifdef __GNUC__
    41 /*
    42 #define likely(x) __builtin_expect((x),1)
    43 #define unlikely(x) __builtin_expect((x),0)
    44 */
    45 static inline long likely(long x) {
    46         return __builtin_expect(x, 1);
    47 }
    48 static inline long unlikely(long x) {
    49         return __builtin_expect(x, 0);
    50 }
    51 
    52 #endif
    53 #ifdef _MSC_VER
    54 #define inline __inline
    55 #include "lib/sse_simd.h"
    56 #define likely(x) (x)
    57 #define unlikely(x) (x)
    58 #endif
    59 
    60 #ifdef TEMPLATED_SIMD_LIB
    61 static inline SIMD_type sisd_sll(SIMD_type blk, SIMD_type n) {
    62         return simd<128>::sll(blk, n);
    63 }
    64 static inline SIMD_type sisd_srl(SIMD_type blk, SIMD_type n) {
    65         return simd<128>::srl(blk, n);
    66 }
    67 #define sisd_slli(blk, n) simd<128>::slli<n>(blk)
    68 #define sisd_srli(blk, n) simd<128>::srli<n>(blk)
    69 #endif
    70 
    71 
    72 /* Shift forward and back operations, based on endianness */
    73 #if BYTE_ORDER == BIG_ENDIAN
    74 #define sisd_sfl(blk, n) sisd_srl(blk, n)
    75 #define sisd_sbl(blk, n) sisd_sll(blk, n)
    76 #define sisd_sfli(blk, n) sisd_srli(blk, n)
    77 #define sisd_sbli(blk, n) sisd_slli(blk, n)
    78 #define sb_op(x, n) ((x)<<(n))
    79 #define sf_op(x, n) ((x)>>(n))
    80 #define cfzl __builtin_clzl
    81 #define cbzl __builtin_ctzl
    82 #endif
     19
     20#include "../lib/avx_simd.h"
    8321
    8422#if BYTE_ORDER == LITTLE_ENDIAN
     
    8927#define sb_op(x, n) ((x)>>(n))
    9028#define sf_op(x, n) ((x)<<(n))
     29
    9130#ifdef __GNUC__
    9231#define cfzl __builtin_ctzl
    9332#define cbzl __builtin_clzl
     33#define likely(x) __builtin_expect((x),1)
     34#define unlikely(x) __builtin_expect((x),0)
    9435#endif
    9536#ifdef _MSC_VER
     
    11657  if (v.elems[0] != 0) return cfzl(v.elems[0]);
    11758  else if (v.elems[1] != 0) return LONG_BIT + cfzl(v.elems[1]);
    118 #ifdef _MSC_VER
    11959  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
    12060  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
    121 #endif
    122 #ifndef _MSC_VER
    123 #if LONG_BIT < 64
    124   else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
    125   else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
    126 #endif
    127 #endif
    12861  else return 8*sizeof(SIMD_type);
    12962}
     
    13265  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
    13366  v.vec = bits;
    134 #if LONG_BIT == 64
    135   if (v.elems[1] != 0) return cbzl(v.elems[1]);
    136   else if (v.elems[0] != 0) return LONG_BIT + cbzl(v.elems[0]);
    137 #endif
    138 #if LONG_BIT < 64
    13967  if (v.elems[3] != 0) return cbzl(v.elems[3]);
    14068  else if (v.elems[2] != 0) return LONG_BIT + cbzl(v.elems[2]);
    14169  else if (v.elems[1] != 0) return 2*LONG_BIT + cbzl(v.elems[1]);
    14270  else if (v.elems[0] != 0) return 3*LONG_BIT + cbzl(v.elems[0]);
    143 #endif
    14471  else return 8*sizeof(SIMD_type);
    14572}
  • trunk/lib/s2p.h

    r712 r959  
    9696*/
    9797
     98
     99#ifndef USE_S2P_AVX
    98100#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
    99101  do {\
     
    104106        p1 = simd_if(hi_mask, simd_slli_16(t0, shift), t1);\
    105107  } while(0)
     108#endif
     109
     110
     111/* For AVX, we use a modified s2p_step function to avoid a number
     112   of conversions from 128-bit mode to 256-bit mode just to
     113   immediately convert back. */
     114#ifdef USE_S2P_AVX
     115#define sse_andc(b1, b2) _mm_andnot_si128(b2, b1)
     116#define sse_himask_16 _mm_set1_epi32(0xFF00FF00)
     117#define sse_slli_16(r, shft) _mm_slli_epi16(r, shft)
     118#define sse_srli_16(r, shft) _mm_srli_epi16(r, shft)
     119#define sse_packus_16(a, b) _mm_packus_epi16(b, a)
     120#define sse_pack_16(a, b) \
     121  _mm_packus_epi16(sse_andc(b, sse_himask_16), sse_andc(a, sse_himask_16))
     122#define sse_pack_16_ll(v1, v2) sse_pack_16(v1, v2)
     123#define sse_pack_16_hh(v1, v2) sse_packus_16(sse_srli_16(v1, 8), sse_srli_16(v2, 8))
     124
     125#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
     126  do {\
     127        __m128i s00, s01, s10, s11, t00, t01, t10, t11;\
     128        __m128i t10shift, t11shift, t00shift, t01shift;\
     129        s00 = simd_hi128(s0);\
     130        s01 = simd_lo128(s0);\
     131        s10 = simd_hi128(s1);\
     132        s11 = simd_lo128(s1);\
     133        t00 = sse_pack_16_hh(s00, s01);\
     134        t10 = sse_pack_16_ll(s00, s01);\
     135        t01 = sse_pack_16_hh(s10, s11);\
     136        t11 = sse_pack_16_ll(s10, s11);\
     137        t10shift = sse_srli_16(t10, shift);\
     138        t11shift = sse_srli_16(t11, shift);\
     139        t00shift = sse_slli_16(t00, shift);\
     140        t01shift = sse_slli_16(t01, shift);\
     141        p0 = simd_if(hi_mask, simd_combine256(t00, t01), simd_combine256(t10shift, t11shift));\
     142        p1 = simd_if(hi_mask, simd_combine256(t00shift, t01shift), simd_combine256(t10, t11));\
     143  } while(0)
     144#endif
     145
     146
    106147
    107148#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
     
    125166  } while(0)
    126167
     168
     169
     170
     171
    127172/* For sizeof(SIMD_type) = 16 */
    128173typedef uint16_t BitPack;
Note: See TracChangeset for help on using the changeset viewer.