Ignore:
Timestamp:
Mar 21, 2011, 5:14:08 PM (8 years ago)
Author:
cameron
Message:

SIMD library for AVX; initial check-in

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/block_carry.h

    r759 r959  
    3232
    3333/*------------------------------------------------------------*/
    34 #include "sse_simd.h"
     34#include "avx_simd.h"
    3535
    3636#define SIMD_CARRY_STRATEGY 1
     
    3939
    // Carry strategy selection. In the lines this changeset keeps, CARRY_STRATEGY
    // is ADC64_STRATEGY when ADC64 is defined and SIMD_CARRY_STRATEGY otherwise.
    // NOTE(review): the SAHFLAHF / ADC64_SAHF_STRATEGY lines below carry only an
    // old line number, i.e. they appear to be the lines removed in r959.
    4040#ifdef ADC64
    41 #ifdef SAHFLAHF
    42 #define CARRY_STRATEGY ADC64_SAHF_STRATEGY
    43 #else
    4441#define CARRY_STRATEGY ADC64_STRATEGY
    45 #endif
    46 #else
    47 #ifdef SAHFLAHF
    48 #define CARRY_STRATEGY ADC64_SAHF_STRATEGY
    4942#else
    5043#define CARRY_STRATEGY SIMD_CARRY_STRATEGY
    5144#endif
    52 #endif
    5345
    5446#if (CARRY_STRATEGY == ADC64_STRATEGY)
    // ADC64 strategy: a carry/borrow is held in a plain 64-bit integer.
    5547typedef uint64_t CarryType;
     // Union overlaying one SIMD_type with four 64-bit words so the words can be
     // addressed individually. NOTE(review): assumes SIMD_type is 256 bits wide
     // (4 x uint64_t) -- confirm against avx_simd.h.
     48typedef union {SIMD_type bitblock; uint64_t int64[4];} SIMD256_int64;
    5649
    // Carry0: the "no carry" value.
    5750#define Carry0 0
    5851
    // test_carry(x): true when the carry value x is nonzero. The "> 0" form is
    // the old line; the "!= 0" form is the one added by this changeset.
    59 #define test_carry(x) ((x) > 0)
     52#define test_carry(x) ((x) != 0)
    6053
    // carry_or: bitwise OR of two carry values.
    6154#define carry_or(carry1, carry2) (carry1 | carry2)
    6255
    63 #define clc() \
    64   __asm__ __volatile__ ("clc\n\t":::)
    65 
    66 #define adc(x,y,carry,sum) \
    67   __asm__ __volatile__ ("add %[carryflag], %[e]\n\t" \
    68         "adc %[e], %[z]\n\t" \
    69         "mov $0, %1\n\t" \
    70         "adc %[carryflag], %1\n\t" \
    71         : [z] "=r" (sum), [carryflag] "=a" (carry) \
    72         : "[z]" (x), [e] "r" (y), "[carryflag]" (carry) \
    73         : "cc")
    74 
    75 #define double_int64_adc(x1, x2, y1, y2, rslt1, rslt2, carry) \
    76    __asm__ __volatile__ ("neg %[carryflag]\n\t" \
    77          "adc %[e1], %[z1]\n\t" \
    78          "adc %[e2], %[z2]\n\t" \
    79          "mov $0, %[carryflag]\n\t" \
    80          "adc $0, %[carryflag]\n\t" \
    81      : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=r" (carry) \
    82          : "[z1]" (x1), "[z2]" (x2), \
    83            [e1] "r" (y1), [e2] "r" (y2), \
    84            "[carryflag]" (carry) \
    85          : "cc")
    86 
    87 
    88 #define adc128(first, second, carry, sum) \
    89 do {\
    90   BitBlock_int64 rslt, x, y;\
    91   x.bitblock = first;\
    92   y.bitblock = second;\
    93   double_int64_adc(x.int64[0], x.int64[1], y.int64[0], y.int64[1], rslt.int64[0], rslt.int64[1], carry);\
    94   sum = rslt.bitblock;\
    95 } while(0)
    96 
    97 
    98 
    99 #define double_int64_advance(x1, x2, rslt1, rslt2, carry) \
    100   __asm__  (\
    101         "add %[z1], %[z1]\n\t" \
    102         "adc %[z2], %[z2]\n\t" \
    103         "lea 0(%[carryflag], %[z1]), %[z1]\n\t" \
    104         "setc %%al\n\t" \
    105          : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
    106          : "[z1]" (x1), "[z2]" (x2), \
    107            "[carryflag]" (carry) \
    108          : "cc")
    109 
    110 /*  Slow
    111 #define double_int64_advance(x1, x2, rslt1, rslt2, carry) \
    112   __asm__  (\
    113         "shld $1, %[z1], %[z2]\n\t" \
    114         "lea 0(%[carryflag], %[z1], 2), %[z1]\n\t" \
    115         "setc %%al\n\t" \
    116          : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
    117          : "[z1]" (x1), "[z2]" (x2), \
    118            "[carryflag]" (carry) \
    119          : "cc")
    120 */
    121 
    122 #define advance_with_carry(cursor, carry, rslt)\
    123 do {\
    124   BitBlock_int64 x, z;\
    125   x.bitblock = cursor;\
    126   double_int64_advance(x.int64[0], x.int64[1], z.int64[0], z.int64[1], carry);\
    127   rslt = z.bitblock;\
    128 } while(0)
    129 
    130 #define double_int64_sbb(x1, x2, y1, y2, rslt1, rslt2, brw) \
    131   __asm__  ("neg %[borrowflag]\n\t" \
    132         "sbb %[e1], %[z1]\n\t" \
    133         "sbb %[e2], %[z2]\n\t" \
    134          "mov $0, %[borrowflag]\n\t" \
    135          "sbb $0, %[borrowflag]\n\t" \
    136      : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [borrowflag] "=a" (brw) \
    137          : "[z1]" (x1), "[z2]" (x2), \
    138            [e1] "r" (y1), [e2] "r" (y2), \
    139            "[borrowflag]" (brw) \
    140          : "cc")
    141 
    142 #define sbb128(first, second, borrow, diff) \
    143 do {\
    144   BitBlock_int64 rslt, x, y;\
    145   x.bitblock = first;\
    146   y.bitblock = second;\
    147   double_int64_sbb(x.int64[0], x.int64[1], y.int64[0], y.int64[1], \
    148                    rslt.int64[0], rslt.int64[1], borrow);\
    149   diff = rslt.bitblock;\
    150 } while(0)
    151 
     56
     // Forward declarations of the 256-bit carry helpers. They exist so the
     // always_inline attribute can be attached to each function; the
     // definitions follow below.
     57static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) __attribute__((always_inline));
     58static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) __attribute__((always_inline));
     59static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) __attribute__((always_inline));
     60
     61
     62static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) {
     63  SIMD256_int64 a, b, rslt;
     64//printf("carryin = %lu\n",carry);
     65//print_simd_register("x", x);
     66//print_simd_register("y", y);
     67  a.bitblock = x;
     68  b.bitblock = y;
     69  asm volatile("negq %[carryflag]\n\t"
     70       "movq 0(%[xaddr]), %[r0]\n\t"
     71       "adcq 0(%[yaddr]), %[r0]\n\t"
     72       "movq 8(%[xaddr]), %[r1]\n\t"
     73       "adcq 8(%[yaddr]), %[r1]\n\t"
     74       "movq 16(%[xaddr]), %[r2]\n\t"
     75       "adcq 16(%[yaddr]), %[r2]\n\t"
     76       "movq 24(%[xaddr]), %[r3]\n\t"
     77       "adcq 24(%[yaddr]), %[r3]\n\t"
     78       "movq $0, %[carryflag]\n\t"
     79       "adcq $0, %[carryflag]\n\t"
     80        : [carryflag] "=&r" (carry),
     81          [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3])
     82        : "[carryflag]" (carry), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock)
     83        : "cc");
     84  sum = rslt.bitblock;
     85//printf("carryout = %lu\n",carry);
     86//print_simd_register("sum", sum);
     87}
     88 
     89static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) {
     90  SIMD256_int64 a, b, rslt;
     91//printf("borrowin = %lu\n",borrow);
     92//print_simd_register("x", x);
     93//print_simd_register("y", y);
     94  a.bitblock = x;
     95  b.bitblock = y;
     96  asm volatile("negq %[carryflag]\n\t"
     97       "movq 0(%[xaddr]), %[r0]\n\t"
     98       "sbbq 0(%[yaddr]), %[r0]\n\t"
     99       "movq 8(%[xaddr]), %[r1]\n\t"
     100       "sbbq 8(%[yaddr]), %[r1]\n\t"
     101       "movq 16(%[xaddr]), %[r2]\n\t"
     102       "sbbq 16(%[yaddr]), %[r2]\n\t"
     103       "movq 24(%[xaddr]), %[r3]\n\t"
     104       "sbbq 24(%[yaddr]), %[r3]\n\t"
     105       "movq $0, %[carryflag]\n\t"
     106       "adcq $0, %[carryflag]\n\t"
     107        : [carryflag] "=&r" (borrow),
     108          [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3])
     109        : "[carryflag]" (borrow), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock)
     110        : "cc");
     111  diff = rslt.bitblock;
     112//printf("borrowout = %lu\n",borrow);
     113//print_simd_register("diff", diff);
     114}
     115
     116static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) {
     117  SIMD256_int64 r;
     118  SIMD_type a = x;
     119//printf("shift in = %lu\n",carry);
     120//print_simd_register("x", x);
     121  asm volatile("negq %[carryflag]\n\t"
     122       "movq 0(%[xaddr]), %[r0]\n\t"
     123       "adcq %[r0], %[r0]\n\t"
     124       "movq 8(%[xaddr]), %[r1]\n\t"
     125       "adcq %[r1], %[r1]\n\t"
     126       "movq 16(%[xaddr]), %[r2]\n\t"
     127       "adcq %[r2], %[r2]\n\t"
     128       "movq 24(%[xaddr]), %[r3]\n\t"
     129       "adcq %[r3], %[r3]\n\t"
     130       "movq $0, %[carryflag]\n\t"
     131       "adcq $0, %[carryflag]\n\t"
     132        : [carryflag] "=&r" (carry),
     133          [r0] "=&r" (r.int64[0]), [r1] "=&r" (r.int64[1]), [r2] "=&r" (r.int64[2]), [r3] "=&r" (r.int64[3])
     134        : "[carryflag]" (carry), [xaddr] "r" (&a)
     135        : "cc");
     136  rslt = r.bitblock;
     137//printf("shift out = %lu\n",carry);
     138//print_simd_register("rslt", rslt);
     139}
    152140
    153141#endif
     
    233221#if (CARRY_STRATEGY == SIMD_CARRY_STRATEGY)
    234222
     // 128-bit helper layer used by the SIMD carry strategy: thin wrappers
     // around SSE intrinsics on __m128i values.
     223typedef __m128i sse_type;
     224
     225
     226
     // Bitwise logic. Note the argument swap in sse_andc: _mm_andnot_si128(a, b)
     // computes (~a) & b, so sse_andc(b1, b2) yields b1 & ~b2.
     227#define sse_or(b1, b2) _mm_or_si128(b1, b2)
     228#define sse_and(b1, b2) _mm_and_si128(b1, b2)
     229#define sse_xor(b1, b2) _mm_xor_si128(b1, b2)
     230#define sse_andc(b1, b2) _mm_andnot_si128(b2, b1)
     // sse_if: bitwise select -- then_val where cond bits are set, else_val
     // where they are clear.
     231#define sse_if(cond, then_val, else_val) \
     232  sse_or(sse_and(then_val, cond), sse_andc(else_val, cond))
     // sse_not: complement, implemented as XOR with an all-ones register.
     233#define sse_not(b) (sse_xor(b, _mm_set1_epi32(0xFFFFFFFF)))
     234#define sse_nor(a,b) (sse_not(sse_or(a,b)))
     235
     // Operations applied independently to each of the two 64-bit lanes.
     // sse_mergel_64(a, b) swaps its arguments for _mm_unpacklo_epi64, so the
     // result holds the low lane of b in its low lane and the low lane of a in
     // its high lane.
     236#define sse_slli_64(r, shft) _mm_slli_epi64(r, shft)
     237#define sse_srli_64(r, shft) _mm_srli_epi64(r, shft)
     238#define sse_mergel_64(a, b) _mm_unpacklo_epi64(b, a)
     239#define sse_sub_64(a, b) _mm_sub_epi64(a, b)
     240#define sse_add_64(a, b) _mm_add_epi64(a, b)
     241
     // sse_slli_128 / sse_srli_128: shift the whole 128-bit register by shft
     // bits. Three cases: a multiple of 8 uses the byte shift directly; a shift
     // of 64 or more moves one lane across with an 8-byte shift and then shifts
     // by the remainder; otherwise both lanes are shifted and the bits crossing
     // the lane boundary are ORed in.
     // NOTE(review): all three arms are expanded at every use, and the byte
     // shift intrinsics take an immediate count, so shft presumably has to be a
     // compile-time constant -- confirm.
     242#define sse_slli_128(r, shft) \
     243  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
     244   (shft) >= 64 ? sse_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \
     245   sse_or(sse_slli_64(r, shft), _mm_slli_si128(sse_srli_64(r, 64-(shft)), 8)))
     246
     247#define sse_srli_128(r, shft) \
     248  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
     249   (shft) >= 64 ? sse_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \
     250   sse_or(sse_srli_64(r, shft), _mm_srli_si128(sse_slli_64(r, 64-(shft)), 8)))
     251
     // Conversions between the low 32 bits of a register and an int.
     252#define sse_to_int(x) _mm_cvtsi128_si32(x)
     253
     254#define sse_from_int(n) _mm_cvtsi32_si128(n)
     255
     256
     257/*
    235258typedef SIMD_type CarryType;
    236259
     
    240263
    241264#define carry_or(carry1, carry2) simd_or(carry1, carry2)
     265*/
     266
     // Carry representation for the SIMD strategy. sse_CARRYTYPE is defined
     // unconditionally here, so the sse_type variant below is always selected;
     // the uint32_t variant only takes effect if uint32_CARRYTYPE is defined
     // somewhere outside this view.
     // NOTE(review): the two #ifdef sections are independent, so defining
     // uint32_CARRYTYPE as well would enable both sets of definitions.
     267#define sse_CARRYTYPE
     268#ifdef uint32_CARRYTYPE
     // Variant 1: carry kept in a 32-bit integer and converted to/from an SSE
     // register around each 128-bit operation.
     269typedef uint32_t CarryType;
     270
     271#define Carry0 0
     272
     273#define test_carry(x) ((x) != 0)
     274
     275#define carry_or(carry1, carry2) (carry1 | carry2)
     276
     277#define sse_from_CarryType(c) sse_from_int(c)
     278
     279#define sse_to_CarryType(c) sse_to_int(c)
     280#endif
     281
     282#ifdef sse_CARRYTYPE
     // Variant 2: carry kept directly in an SSE register, so the conversion
     // macros are identities. CarryType is a macro alias here, not a typedef.
     283#define CarryType sse_type
     284
     285#define Carry0 (_mm_set1_epi32(0))
     286
     // _mm_testz_si128(x, x) returns 1 when x is all zero, so this is true when
     // any bit of the carry register is set.
     287#define test_carry(x) (!_mm_testz_si128(x, x))
     288
     289#define carry_or(carry1, carry2) sse_or(carry1, carry2)
     290
     291#define sse_from_CarryType(c) c
     292
     293#define sse_to_CarryType(c) c
     294#endif
     295
     296
     297
     298
    242299
    // adc128(x, y, carry, sum): 128-bit add with carry built from 64-bit lane
    // adds. gen/prop are the per-bit generate (x & y) and propagate (x | y)
    // terms; partial is the lane-wise x + y + carry. A lane carried out when
    // bit 63 of gen | (prop & ~result) is set: c1 moves the low lane's
    // carry-out to bit 64 and adds it into the high lane, and the final carry
    // is the high lane's carry-out moved down to bit 0.
    // NOTE(review): presumably carry holds 0 or 1 in bit 0 on entry -- confirm.
    // The simd_*/sisd_* lines (old line numbers only) are the previous version
    // of the body; the sse_* lines are the ones added by this changeset.
    243300#define adc128(x, y, carry,  sum) \
    244301do{ \
    245   SIMD_type gen = simd_and(x, y); \
    246   SIMD_type prop = simd_or(x, y); \
    247   SIMD_type partial = simd_add_64(simd_add_64(x, y), carry); \
    248   SIMD_type c1 = sisd_slli(simd_srli_64(simd_or(gen, simd_andc(prop, partial)), 63), 64); \
    249   sum = simd_add_64(c1, partial); \
    250   carry = sisd_srli(simd_or(gen, simd_andc(prop, sum)), 127); \
     302  sse_type gen = sse_and(x, y); \
     303  sse_type prop = sse_or(x, y); \
     304  sse_type partial = sse_add_64(sse_add_64(x, y), carry); \
     305  sse_type c1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_andc(prop, partial)), 63), 64); \
     306  sum = sse_add_64(c1, partial); \
     307  carry = sse_srli_128(sse_or(gen, sse_andc(prop, sum)), 127); \
    251308} while(0)
    252309
     
    // sbb128(x, y, borrow, difference): 128-bit subtract with borrow built
    // from 64-bit lane subtracts. gen is the per-bit borrow-generate term
    // (y & ~x), prop the borrow-propagate term (~(x ^ y)); partial is the
    // lane-wise x - y - borrow. A lane borrowed when bit 63 of
    // gen | (prop & result) is set: b1 moves the low lane's borrow to bit 64
    // and subtracts it from the high lane, and the final borrow is the high
    // lane's borrow moved down to bit 0.
    // NOTE(review): presumably borrow holds 0 or 1 in bit 0 on entry -- confirm.
    // The simd_*/sisd_* lines (old line numbers only) are the previous version
    // of the body; the sse_* lines are the ones added by this changeset.
    254311#define sbb128(x, y, borrow, difference) \
    255312do {\
    256   SIMD_type gen = simd_andc(y, x); \
    257   SIMD_type prop = simd_not(simd_xor(x, y)); \
    258   SIMD_type partial = simd_sub_64(simd_sub_64(x, y), borrow); \
    259   SIMD_type b1 = sisd_slli(simd_srli_64(simd_or(gen, simd_and(prop, partial)), 63), 64); \
    260   difference = simd_sub_64(partial, b1); \
    261   borrow = sisd_srli(simd_or(gen, simd_and(prop, difference)), 127); \
     313  sse_type gen = sse_andc(y, x); \
     314  sse_type prop = sse_not(sse_xor(x, y)); \
     315  sse_type partial = sse_sub_64(sse_sub_64(x, y), borrow); \
     316  sse_type b1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_and(prop, partial)), 63), 64); \
     317  difference = sse_sub_64(partial, b1); \
     318  borrow = sse_srli_128(sse_or(gen, sse_and(prop, difference)), 127); \
    262319}while(0)
    263320
    264 
    // advance_with_carry(cursor, carry, rslt): shift a 128-bit value one bit
    // towards the high end with carry in and out. shift_out holds, in bit 0 of
    // each lane, the bit that lane loses (its old bit 63). low_bits places the
    // incoming carry's low lane under the low lane and the low lane's lost bit
    // under the high lane. The new carry is the high lane's lost bit moved to
    // bit 0, and cursor + cursor performs the per-lane shift by one.
    // NOTE(review): presumably carry holds 0 or 1 in bit 0 on entry -- confirm.
    // The simd_*/sisd_* lines and the two "#endif" lines carrying only an old
    // line number are the previous version; the sse_* lines are the ones added
    // by this changeset.
    265321#define advance_with_carry(cursor, carry, rslt)\
    266322do {\
    267   SIMD_type shift_out = simd_srli_64(cursor, 63);\
    268   SIMD_type low_bits = simd_mergel_64(shift_out, carry);\
    269   carry = sisd_srli(shift_out, 64);\
    270   rslt = simd_or(simd_add_64(cursor, cursor), low_bits);\
    271 } while(0)
    272 
    273 #endif
    274 #endif
    275 
    276 
     323  sse_type shift_out = sse_srli_64(cursor, 63);\
     324  sse_type low_bits = sse_mergel_64(shift_out, carry);\
     325  carry = sse_srli_128(shift_out, 64);\
     326  rslt = sse_or(sse_add_64(cursor, cursor), low_bits);\
     327} while(0)
     328
     // adc256(x, y, carry, sum): 256-bit add with carry for the SIMD strategy.
     // Splits x and y into 128-bit halves, runs adc128 on the low halves and
     // then on the high halves with the same cry variable so the carry chains
     // from low to high, and recombines the two partial sums (high half first)
     // into sum. carry is updated with the carry out of the high half.
     // simd_lo128 / simd_hi128 / simd_combine256 are defined outside this view.
     // NOTE(review): the macro declares locals x0, x1, y0, y1, cry, s0, s1;
     // arguments spelled with any of those names would be shadowed.
     329#define adc256(x, y, carry,  sum) \
     330do {\
     331        __m128i x0 = simd_lo128(x);\
     332        __m128i x1 = simd_hi128(x);\
     333        __m128i y0 = simd_lo128(y);\
     334        __m128i y1 = simd_hi128(y);\
     335        __m128i cry = sse_from_CarryType(carry);\
     336        __m128i s0, s1;\
     337        adc128(x0, y0, cry, s0);\
     338        adc128(x1, y1, cry, s1);\
     339        sum = simd_combine256(s1, s0);\
     340        carry = sse_to_CarryType(cry);\
     341} while(0)
     342
     // sbb256(x, y, borrow, diff): 256-bit subtract with borrow for the SIMD
     // strategy. Splits x and y into 128-bit halves, runs sbb128 on the low
     // halves and then on the high halves with the same brw variable so the
     // borrow chains from low to high, and recombines the two partial
     // differences (high half first) into diff. borrow is updated with the
     // borrow out of the high half.
     // simd_lo128 / simd_hi128 / simd_combine256 are defined outside this view.
     // NOTE(review): the macro declares locals x0, x1, y0, y1, brw, d0, d1;
     // arguments spelled with any of those names would be shadowed.
     343#define sbb256(x, y, borrow, diff) \
     344do {\
     345        __m128i x0 = simd_lo128(x);\
     346        __m128i x1 = simd_hi128(x);\
     347        __m128i y0 = simd_lo128(y);\
     348        __m128i y1 = simd_hi128(y);\
     349        __m128i brw = sse_from_CarryType(borrow);\
     350        __m128i d0, d1;\
     351        sbb128(x0, y0, brw, d0);\
     352        sbb128(x1, y1, brw, d1);\
     353        diff = simd_combine256(d1, d0);\
     354        borrow = sse_to_CarryType(brw);\
     355} while(0)
     356
     // advance_with_carry256(cursor, carry, rslt): 256-bit one-bit advance for
     // the SIMD strategy. Applies advance_with_carry to the low 128-bit half
     // and then to the high half with the same cry variable, so the bit shifted
     // out of the low half is shifted into the high half, and recombines the
     // halves (high half first) into rslt. carry is updated with the bit
     // shifted out of the high half.
     // simd_lo128 / simd_hi128 / simd_combine256 are defined outside this view.
     // NOTE(review): the macro declares locals cursor0, cursor1, cry, rslt0,
     // rslt1; arguments spelled with any of those names would be shadowed.
     357#define advance_with_carry256(cursor, carry, rslt)\
     358do {\
     359        __m128i cursor0 = simd_lo128(cursor);\
     360        __m128i cursor1 = simd_hi128(cursor);\
     361        __m128i cry = sse_from_CarryType(carry);\
     362        __m128i rslt0, rslt1;\
     363        advance_with_carry(cursor0, cry, rslt0);\
     364        advance_with_carry(cursor1, cry, rslt1);\
     365        rslt = simd_combine256(rslt1, rslt0);\
     366        carry = sse_to_CarryType(cry);\
     367} while(0)
     368
     369
     370#endif
     371
     372
     373#endif
Note: See TracChangeset for help on using the changeset viewer.