 Timestamp:
 Mar 21, 2011, 8:25:00 PM (8 years ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

trunk/lib/block_carry_avx.h
r960 r965 253 253 254 254 #define sse_from_int(n) _mm_cvtsi32_si128(n) 255 #define sse_eq_64(a, b) _mm_cmpeq_epi64(a, b) 256 #define sse_mergeh_32(a, b) _mm_unpackhi_epi32(b, a) 257 #define sse_mergel_32(a, b) _mm_unpacklo_epi32(b, a) 258 #define sse_const_8(n) _mm_set1_epi8(n) 259 #define sse_const_1(n) \ 260 (n==0 ? _mm_setzero_si128(): sse_const_8(1)) 261 255 262 256 263 … … 265 272 */ 266 273 267 #define sse_CARRYTYPE274 #define uint32_CARRYTYPE 268 275 #ifdef uint32_CARRYTYPE 269 276 typedef uint32_t CarryType; … … 327 334 } while(0) 328 335 336 /* 329 337 #define adc256(x, y, carry, sum) \ 330 338 do {\ … … 340 348 carry = sse_to_CarryType(cry);\ 341 349 } while(0) 350 */ 351 352 static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) __attribute__((always_inline)); 353 354 #ifndef ADCMAGIC 355 static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) { 356 __m128i x0 = simd_lo128(x); 357 __m128i x1 = simd_hi128(x); 358 __m128i y0 = simd_lo128(y); 359 __m128i y1 = simd_hi128(y); 360 __m128i cry = sse_from_CarryType(carry); 361 __m128i s0, s1; 362 adc128(x0, y0, cry, s0); 363 adc128(x1, y1, cry, s1); 364 sum = simd_combine256(s1, s0); 365 carry = sse_to_CarryType(cry); 366 } 367 #endif 368 #ifdef ADCMAGIC 369 static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) { 370 371 BitBlock gen = simd_and(x, y); 372 BitBlock prop = simd_xor(x, y); 373 __m128i x0 = simd_lo128(x); 374 __m128i x1 = simd_hi128(x); 375 __m128i y0 = simd_lo128(y); 376 __m128i y1 = simd_hi128(y); 377 __m128i sum0 = sse_add_64(x0, y0); 378 __m128i sum1 = sse_add_64(x1, y1); 379 BitBlock icarry = simd_or(gen, simd_andc(prop, simd_combine256(sum1, sum0))); 380 __m128i max0 = sse_eq_64(sum0, sse_const_1(1)); 381 __m128i max1 = sse_eq_64(sum1, sse_const_1(1)); 382 BitBlock max = simd_combine256(max1, max0); 383 uint64_t carry_mask = _mm256_movemask_pd((__m256d) icarry) * 2 + carry; 384 uint64_t max_mask = _mm256_movemask_pd((__m256d) max); 385 uint64_t increments = max_mask + carry_mask; 386 carry = increments >> 4; 387 uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001; 388 __m128i inc_32 = _mm_cvtepu16_epi32(_mm_cvtsi64_si128(spread)); 389 __m128i inc_64_0 = sse_mergel_32(sse_const_1(0), inc_32); 390 __m128i inc_64_1 = sse_mergeh_32(sse_const_1(0), inc_32); 391 sum = simd_combine256(sse_add_64(sum1, inc_64_1), sse_add_64(sum0, inc_64_0)); 392 } 393 #endif 394 342 395 343 396 #define sbb256(x, y, borrow, diff) \
Note: See TracChangeset
for help on using the changeset viewer.