Changeset 959 for trunk/lib/block_carry.h
 Timestamp:
 Mar 21, 2011, 5:14:08 PM (8 years ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

trunk/lib/block_carry.h
r759 r959 32 32 33 33 /**/ 34 #include "sse_simd.h"34 #include "avx_simd.h" 35 35 36 36 #define SIMD_CARRY_STRATEGY 1 … … 39 39 40 40 #ifdef ADC64 41 #ifdef SAHFLAHF42 #define CARRY_STRATEGY ADC64_SAHF_STRATEGY43 #else44 41 #define CARRY_STRATEGY ADC64_STRATEGY 45 #endif46 #else47 #ifdef SAHFLAHF48 #define CARRY_STRATEGY ADC64_SAHF_STRATEGY49 42 #else 50 43 #define CARRY_STRATEGY SIMD_CARRY_STRATEGY 51 44 #endif 52 #endif53 45 54 46 #if (CARRY_STRATEGY == ADC64_STRATEGY) 55 47 typedef uint64_t CarryType; 48 typedef union {SIMD_type bitblock; uint64_t int64[4];} SIMD256_int64; 56 49 57 50 #define Carry0 0 58 51 59 #define test_carry(x) ((x) >0)52 #define test_carry(x) ((x) != 0) 60 53 61 54 #define carry_or(carry1, carry2) (carry1 | carry2) 62 55 63 #define clc() \ 64 __asm__ __volatile__ ("clc\n\t":::) 65 66 #define adc(x,y,carry,sum) \ 67 __asm__ __volatile__ ("add %[carryflag], %[e]\n\t" \ 68 "adc %[e], %[z]\n\t" \ 69 "mov $0, %1\n\t" \ 70 "adc %[carryflag], %1\n\t" \ 71 : [z] "=r" (sum), [carryflag] "=a" (carry) \ 72 : "[z]" (x), [e] "r" (y), "[carryflag]" (carry) \ 73 : "cc") 74 75 #define double_int64_adc(x1, x2, y1, y2, rslt1, rslt2, carry) \ 76 __asm__ __volatile__ ("neg %[carryflag]\n\t" \ 77 "adc %[e1], %[z1]\n\t" \ 78 "adc %[e2], %[z2]\n\t" \ 79 "mov $0, %[carryflag]\n\t" \ 80 "adc $0, %[carryflag]\n\t" \ 81 : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=r" (carry) \ 82 : "[z1]" (x1), "[z2]" (x2), \ 83 [e1] "r" (y1), [e2] "r" (y2), \ 84 "[carryflag]" (carry) \ 85 : "cc") 86 87 88 #define adc128(first, second, carry, sum) \ 89 do {\ 90 BitBlock_int64 rslt, x, y;\ 91 x.bitblock = first;\ 92 y.bitblock = second;\ 93 double_int64_adc(x.int64[0], x.int64[1], y.int64[0], y.int64[1], rslt.int64[0], rslt.int64[1], carry);\ 94 sum = rslt.bitblock;\ 95 } while(0) 96 97 98 99 #define double_int64_advance(x1, x2, rslt1, rslt2, carry) \ 100 __asm__ (\ 101 "add %[z1], %[z1]\n\t" \ 102 "adc %[z2], %[z2]\n\t" \ 103 "lea 0(%[carryflag], %[z1]), %[z1]\n\t" \ 104 
"setc %%al\n\t" \ 105 : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \ 106 : "[z1]" (x1), "[z2]" (x2), \ 107 "[carryflag]" (carry) \ 108 : "cc") 109 110 /* Slow 111 #define double_int64_advance(x1, x2, rslt1, rslt2, carry) \ 112 __asm__ (\ 113 "shld $1, %[z1], %[z2]\n\t" \ 114 "lea 0(%[carryflag], %[z1], 2), %[z1]\n\t" \ 115 "setc %%al\n\t" \ 116 : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \ 117 : "[z1]" (x1), "[z2]" (x2), \ 118 "[carryflag]" (carry) \ 119 : "cc") 120 */ 121 122 #define advance_with_carry(cursor, carry, rslt)\ 123 do {\ 124 BitBlock_int64 x, z;\ 125 x.bitblock = cursor;\ 126 double_int64_advance(x.int64[0], x.int64[1], z.int64[0], z.int64[1], carry);\ 127 rslt = z.bitblock;\ 128 } while(0) 129 130 #define double_int64_sbb(x1, x2, y1, y2, rslt1, rslt2, brw) \ 131 __asm__ ("neg %[borrowflag]\n\t" \ 132 "sbb %[e1], %[z1]\n\t" \ 133 "sbb %[e2], %[z2]\n\t" \ 134 "mov $0, %[borrowflag]\n\t" \ 135 "sbb $0, %[borrowflag]\n\t" \ 136 : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [borrowflag] "=a" (brw) \ 137 : "[z1]" (x1), "[z2]" (x2), \ 138 [e1] "r" (y1), [e2] "r" (y2), \ 139 "[borrowflag]" (brw) \ 140 : "cc") 141 142 #define sbb128(first, second, borrow, diff) \ 143 do {\ 144 BitBlock_int64 rslt, x, y;\ 145 x.bitblock = first;\ 146 y.bitblock = second;\ 147 double_int64_sbb(x.int64[0], x.int64[1], y.int64[0], y.int64[1], \ 148 rslt.int64[0], rslt.int64[1], borrow);\ 149 diff = rslt.bitblock;\ 150 } while(0) 151 56 57 static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) __attribute__((always_inline)); 58 static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) __attribute__((always_inline)); 59 static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) __attribute__((always_inline)); 60 61 62 static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) { 63 SIMD256_int64 a, b, rslt; 64 
//printf("carryin = %lu\n",carry); 65 //print_simd_register("x", x); 66 //print_simd_register("y", y); 67 a.bitblock = x; 68 b.bitblock = y; 69 asm volatile("negq %[carryflag]\n\t" 70 "movq 0(%[xaddr]), %[r0]\n\t" 71 "adcq 0(%[yaddr]), %[r0]\n\t" 72 "movq 8(%[xaddr]), %[r1]\n\t" 73 "adcq 8(%[yaddr]), %[r1]\n\t" 74 "movq 16(%[xaddr]), %[r2]\n\t" 75 "adcq 16(%[yaddr]), %[r2]\n\t" 76 "movq 24(%[xaddr]), %[r3]\n\t" 77 "adcq 24(%[yaddr]), %[r3]\n\t" 78 "movq $0, %[carryflag]\n\t" 79 "adcq $0, %[carryflag]\n\t" 80 : [carryflag] "=&r" (carry), 81 [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3]) 82 : "[carryflag]" (carry), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock) 83 : "cc"); 84 sum = rslt.bitblock; 85 //printf("carryout = %lu\n",carry); 86 //print_simd_register("sum", sum); 87 } 88 89 static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) { 90 SIMD256_int64 a, b, rslt; 91 //printf("borrowin = %lu\n",borrow); 92 //print_simd_register("x", x); 93 //print_simd_register("y", y); 94 a.bitblock = x; 95 b.bitblock = y; 96 asm volatile("negq %[carryflag]\n\t" 97 "movq 0(%[xaddr]), %[r0]\n\t" 98 "sbbq 0(%[yaddr]), %[r0]\n\t" 99 "movq 8(%[xaddr]), %[r1]\n\t" 100 "sbbq 8(%[yaddr]), %[r1]\n\t" 101 "movq 16(%[xaddr]), %[r2]\n\t" 102 "sbbq 16(%[yaddr]), %[r2]\n\t" 103 "movq 24(%[xaddr]), %[r3]\n\t" 104 "sbbq 24(%[yaddr]), %[r3]\n\t" 105 "movq $0, %[carryflag]\n\t" 106 "adcq $0, %[carryflag]\n\t" 107 : [carryflag] "=&r" (borrow), 108 [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3]) 109 : "[carryflag]" (borrow), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock) 110 : "cc"); 111 diff = rslt.bitblock; 112 //printf("borrowout = %lu\n",borrow); 113 //print_simd_register("diff", diff); 114 } 115 116 static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) { 117 SIMD256_int64 r; 118 
SIMD_type a = x; 119 //printf("shift in = %lu\n",carry); 120 //print_simd_register("x", x); 121 asm volatile("negq %[carryflag]\n\t" 122 "movq 0(%[xaddr]), %[r0]\n\t" 123 "adcq %[r0], %[r0]\n\t" 124 "movq 8(%[xaddr]), %[r1]\n\t" 125 "adcq %[r1], %[r1]\n\t" 126 "movq 16(%[xaddr]), %[r2]\n\t" 127 "adcq %[r2], %[r2]\n\t" 128 "movq 24(%[xaddr]), %[r3]\n\t" 129 "adcq %[r3], %[r3]\n\t" 130 "movq $0, %[carryflag]\n\t" 131 "adcq $0, %[carryflag]\n\t" 132 : [carryflag] "=&r" (carry), 133 [r0] "=&r" (r.int64[0]), [r1] "=&r" (r.int64[1]), [r2] "=&r" (r.int64[2]), [r3] "=&r" (r.int64[3]) 134 : "[carryflag]" (carry), [xaddr] "r" (&a) 135 : "cc"); 136 rslt = r.bitblock; 137 //printf("shift out = %lu\n",carry); 138 //print_simd_register("rslt", rslt); 139 } 152 140 153 141 #endif … … 233 221 #if (CARRY_STRATEGY == SIMD_CARRY_STRATEGY) 234 222 223 typedef __m128i sse_type; 224 225 226 227 #define sse_or(b1, b2) _mm_or_si128(b1, b2) 228 #define sse_and(b1, b2) _mm_and_si128(b1, b2) 229 #define sse_xor(b1, b2) _mm_xor_si128(b1, b2) 230 #define sse_andc(b1, b2) _mm_andnot_si128(b2, b1) 231 #define sse_if(cond, then_val, else_val) \ 232 sse_or(sse_and(then_val, cond), sse_andc(else_val, cond)) 233 #define sse_not(b) (sse_xor(b, _mm_set1_epi32(0xFFFFFFFF))) 234 #define sse_nor(a,b) (sse_not(sse_or(a,b))) 235 236 #define sse_slli_64(r, shft) _mm_slli_epi64(r, shft) 237 #define sse_srli_64(r, shft) _mm_srli_epi64(r, shft) 238 #define sse_mergel_64(a, b) _mm_unpacklo_epi64(b, a) 239 #define sse_sub_64(a, b) _mm_sub_epi64(a, b) 240 #define sse_add_64(a, b) _mm_add_epi64(a, b) 241 242 #define sse_slli_128(r, shft) \ 243 ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \ 244 (shft) >= 64 ? sse_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \ 245 sse_or(sse_slli_64(r, shft), _mm_slli_si128(sse_srli_64(r, 64-(shft)), 8))) 246 247 #define sse_srli_128(r, shft) \ 248 ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \ 249 (shft) >= 64 ? 
sse_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \ 250 sse_or(sse_srli_64(r, shft), _mm_srli_si128(sse_slli_64(r, 64-(shft)), 8))) 251 252 #define sse_to_int(x) _mm_cvtsi128_si32(x) 253 254 #define sse_from_int(n) _mm_cvtsi32_si128(n) 255 256 257 /* 235 258 typedef SIMD_type CarryType; 236 259 … … 240 263 241 264 #define carry_or(carry1, carry2) simd_or(carry1, carry2) 265 */ 266 267 #define sse_CARRYTYPE 268 #ifdef uint32_CARRYTYPE 269 typedef uint32_t CarryType; 270 271 #define Carry0 0 272 273 #define test_carry(x) ((x) != 0) 274 275 #define carry_or(carry1, carry2) (carry1 | carry2) 276 277 #define sse_from_CarryType(c) sse_from_int(c) 278 279 #define sse_to_CarryType(c) sse_to_int(c) 280 #endif 281 282 #ifdef sse_CARRYTYPE 283 #define CarryType sse_type 284 285 #define Carry0 (_mm_set1_epi32(0)) 286 287 #define test_carry(x) (!_mm_testz_si128(x, x)) 288 289 #define carry_or(carry1, carry2) sse_or(carry1, carry2) 290 291 #define sse_from_CarryType(c) c 292 293 #define sse_to_CarryType(c) c 294 #endif 295 296 297 298 242 299 243 300 #define adc128(x, y, carry, sum) \ 244 301 do{ \ 245 SIMD_type gen = simd_and(x, y); \246 SIMD_type prop = simd_or(x, y); \247 SIMD_type partial = simd_add_64(simd_add_64(x, y), carry); \248 SIMD_type c1 = sisd_slli(simd_srli_64(simd_or(gen, simd_andc(prop, partial)), 63), 64); \249 sum = simd_add_64(c1, partial); \250 carry = sisd_srli(simd_or(gen, simd_andc(prop, sum)), 127); \302 sse_type gen = sse_and(x, y); \ 303 sse_type prop = sse_or(x, y); \ 304 sse_type partial = sse_add_64(sse_add_64(x, y), carry); \ 305 sse_type c1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_andc(prop, partial)), 63), 64); \ 306 sum = sse_add_64(c1, partial); \ 307 carry = sse_srli_128(sse_or(gen, sse_andc(prop, sum)), 127); \ 251 308 } while(0) 252 309 … … 254 311 #define sbb128(x, y, borrow, difference) \ 255 312 do {\ 256 SIMD_type gen = simd_andc(y, x); \257 SIMD_type prop = simd_not(simd_xor(x, y)); \258 SIMD_type partial = simd_sub_64(simd_sub_64(x, 
y), borrow); \259 SIMD_type b1 = sisd_slli(simd_srli_64(simd_or(gen, simd_and(prop, partial)), 63), 64); \260 difference = simd_sub_64(partial, b1); \261 borrow = sisd_srli(simd_or(gen, simd_and(prop, difference)), 127); \313 sse_type gen = sse_andc(y, x); \ 314 sse_type prop = sse_not(sse_xor(x, y)); \ 315 sse_type partial = sse_sub_64(sse_sub_64(x, y), borrow); \ 316 sse_type b1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_and(prop, partial)), 63), 64); \ 317 difference = sse_sub_64(partial, b1); \ 318 borrow = sse_srli_128(sse_or(gen, sse_and(prop, difference)), 127); \ 262 319 }while(0) 263 320 264 265 321 #define advance_with_carry(cursor, carry, rslt)\ 266 322 do {\ 267 SIMD_type shift_out = simd_srli_64(cursor, 63);\ 268 SIMD_type low_bits = simd_mergel_64(shift_out, carry);\ 269 carry = sisd_srli(shift_out, 64);\ 270 rslt = simd_or(simd_add_64(cursor, cursor), low_bits);\ 271 } while(0) 272 273 #endif 274 #endif 275 276 323 sse_type shift_out = sse_srli_64(cursor, 63);\ 324 sse_type low_bits = sse_mergel_64(shift_out, carry);\ 325 carry = sse_srli_128(shift_out, 64);\ 326 rslt = sse_or(sse_add_64(cursor, cursor), low_bits);\ 327 } while(0) 328 329 #define adc256(x, y, carry, sum) \ 330 do {\ 331 __m128i x0 = simd_lo128(x);\ 332 __m128i x1 = simd_hi128(x);\ 333 __m128i y0 = simd_lo128(y);\ 334 __m128i y1 = simd_hi128(y);\ 335 __m128i cry = sse_from_CarryType(carry);\ 336 __m128i s0, s1;\ 337 adc128(x0, y0, cry, s0);\ 338 adc128(x1, y1, cry, s1);\ 339 sum = simd_combine256(s1, s0);\ 340 carry = sse_to_CarryType(cry);\ 341 } while(0) 342 343 #define sbb256(x, y, borrow, diff) \ 344 do {\ 345 __m128i x0 = simd_lo128(x);\ 346 __m128i x1 = simd_hi128(x);\ 347 __m128i y0 = simd_lo128(y);\ 348 __m128i y1 = simd_hi128(y);\ 349 __m128i brw = sse_from_CarryType(borrow);\ 350 __m128i d0, d1;\ 351 sbb128(x0, y0, brw, d0);\ 352 sbb128(x1, y1, brw, d1);\ 353 diff = simd_combine256(d1, d0);\ 354 borrow = sse_to_CarryType(brw);\ 355 } while(0) 356 357 #define 
advance_with_carry256(cursor, carry, rslt)\ 358 do {\ 359 __m128i cursor0 = simd_lo128(cursor);\ 360 __m128i cursor1 = simd_hi128(cursor);\ 361 __m128i cry = sse_from_CarryType(carry);\ 362 __m128i rslt0, rslt1;\ 363 advance_with_carry(cursor0, cry, rslt0);\ 364 advance_with_carry(cursor1, cry, rslt1);\ 365 rslt = simd_combine256(rslt1, rslt0);\ 366 carry = sse_to_CarryType(cry);\ 367 } while(0) 368 369 370 #endif 371 372 373 #endif
Note: See TracChangeset
for help on using the changeset viewer.