Changeset 965


Ignore:
Timestamp:
Mar 21, 2011, 8:25:00 PM (8 years ago)
Author:
cameron
Message:

ADCMAGIC carry implementation

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/block_carry_avx.h

    r960 r965  
    253253
    /*
     * Short aliases for the SSE intrinsics used by the carry arithmetic in
     * this header.
     */
    #define sse_from_int(n) _mm_cvtsi32_si128(n)
    #define sse_eq_64(a, b) _mm_cmpeq_epi64(a, b)
    /*
     * The merge aliases pass their operands to the unpack intrinsics in
     * swapped order (b first, then a).
     */
    #define sse_mergeh_32(a, b) _mm_unpackhi_epi32(b, a)
    #define sse_mergel_32(a, b) _mm_unpacklo_epi32(b, a)
    #define sse_const_8(n) _mm_set1_epi8(n)
    /*
     * 128-bit constant with every bit equal: all zeros when n == 0, all ones
     * for any other n.  The argument is parenthesized so that an expression
     * such as `a & b` is compared against 0 as a whole rather than being
     * re-associated by operator precedence.
     */
    #define sse_const_1(n) \
      ((n) == 0 ? _mm_setzero_si128() : sse_const_8(-1))
     261
    255262
    256263
     
    265272*/
    266273
    267 #define sse_CARRYTYPE
     274#define uint32_CARRYTYPE
    268275#ifdef uint32_CARRYTYPE
    269276typedef uint32_t CarryType;
     
    327334} while(0)
    328335
     336/*
    329337#define adc256(x, y, carry,  sum) \
    330338do {\
     
    340348        carry = sse_to_CarryType(cry);\
    341349} while(0)
     350*/
     351
     352static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) __attribute__((always_inline));
     353
     354#ifndef ADCMAGIC
     355static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) {
     356        __m128i x0 = simd_lo128(x);
     357        __m128i x1 = simd_hi128(x);
     358        __m128i y0 = simd_lo128(y);
     359        __m128i y1 = simd_hi128(y);
     360        __m128i cry = sse_from_CarryType(carry);
     361        __m128i s0, s1;
     362        adc128(x0, y0, cry, s0);
     363        adc128(x1, y1, cry, s1);
     364        sum = simd_combine256(s1, s0);
     365        carry = sse_to_CarryType(cry);
     366}
     367#endif
     368#ifdef ADCMAGIC
     369static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) {
     370
     371        BitBlock gen = simd_and(x, y);
     372        BitBlock prop = simd_xor(x, y);
     373        __m128i x0 = simd_lo128(x);
     374        __m128i x1 = simd_hi128(x);
     375        __m128i y0 = simd_lo128(y);
     376        __m128i y1 = simd_hi128(y);
     377        __m128i sum0 = sse_add_64(x0, y0);
     378        __m128i sum1 = sse_add_64(x1, y1);
     379        BitBlock icarry = simd_or(gen, simd_andc(prop, simd_combine256(sum1, sum0)));
     380        __m128i max0 = sse_eq_64(sum0, sse_const_1(1));
     381        __m128i max1 = sse_eq_64(sum1, sse_const_1(1));
     382        BitBlock max = simd_combine256(max1, max0);
     383        uint64_t carry_mask = _mm256_movemask_pd((__m256d) icarry) * 2 + carry;
     384        uint64_t max_mask = _mm256_movemask_pd((__m256d) max);
     385        uint64_t increments = max_mask + carry_mask;
     386        carry = increments >> 4;
     387        uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001;
     388        __m128i inc_32 = _mm_cvtepu16_epi32(_mm_cvtsi64_si128(spread));
     389        __m128i inc_64_0 = sse_mergel_32(sse_const_1(0), inc_32);
     390        __m128i inc_64_1 = sse_mergeh_32(sse_const_1(0), inc_32);
     391        sum = simd_combine256(sse_add_64(sum1, inc_64_1), sse_add_64(sum0, inc_64_0));
     392}
     393#endif
     394
    342395
    343396#define sbb256(x, y, borrow, diff) \
Note: See TracChangeset for help on using the changeset viewer.