Changeset 2196 for trunk


Ignore:
Timestamp:
May 30, 2012, 4:48:27 PM (7 years ago)
Author:
cameron
Message:

Speed up adc, sbb, advance_with_carry for 256 bits

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/bitblock256.hpp

    r2018 r2196  
    4646static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry) {  return carry;}
    4747
     48
     49
     50#define avx_select_lo128(x) \
     51        ((__m128i) _mm256_castps256_ps128(x))
     52
     53#define avx_select_hi128(x) \
     54        ((__m128i)(_mm256_extractf128_ps(x, 1)))
     55
     56#define avx_general_combine256(x, y) \
     57   (_mm256_insertf128_ps(_mm256_castps128_ps256((__m128) y), (__m128) x, 1))
     58
     59IDISA_ALWAYS_INLINE void adc128(bitblock128_t x, bitblock128_t y, bitblock128_t carry_in, bitblock128_t & carry_out, bitblock128_t & sum)
     60{
     61        bitblock128_t gen = simd_and(x, y);
     62        bitblock128_t prop = simd_or(x, y);
     63        bitblock128_t partial = simd128<64>::add(simd128<64>::add(x, y), carry_in);
     64        bitblock128_t c1 = simd128<128>::slli<64>(simd128<64>::srli<63>(simd_or(gen, simd_andc(prop, partial))));
     65        sum = simd128<64>::add(c1, partial);
     66        carry_out = simd128<128>::srli<127>(simd_or(gen, simd_andc(prop, sum)));
     67}
     68
    4869IDISA_ALWAYS_INLINE void adc(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum)
    4970{
    50         bitblock256_t gen = simd_and(x, y);
    51         bitblock256_t prop = simd_or(x, y);
    52         sum = simd256<256>::add(simd256<256>::add(x, y), carry2bitblock(carry_in));
    53         carry_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_andc(prop, sum))));
     71//  Really Slow!
     72//        bitblock256_t gen = simd_and(x, y);
     73//        bitblock256_t prop = simd_or(x, y);
     74//        sum = simd256<256>::add(simd256<256>::add(x, y), carry2bitblock(carry_in));
     75//        carry_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_andc(prop, sum))));
     76
     77  bitblock128_t x0 = avx_select_lo128(x);
     78  bitblock128_t x1 = avx_select_hi128(x);
     79  bitblock128_t y0 = avx_select_lo128(y);
     80  bitblock128_t y1 = avx_select_hi128(y);
     81  bitblock128_t c0 = avx_select_lo128(carry2bitblock(carry_in));
     82  bitblock128_t s0, s1, c1, c2;
     83  adc128(x0, y0, c0, c1, s0);
     84  adc128(x1, y1, c1, c2, s1);
     85  sum = avx_general_combine256(s1, s0);
     86  carry_out = _mm256_castps128_ps256((__m128) c2);
    5487}
     88
     89IDISA_ALWAYS_INLINE void sbb128(bitblock128_t x, bitblock128_t y, bitblock128_t borrow_in, bitblock128_t & borrow_out, bitblock128_t & difference)
     90{
     91        bitblock128_t gen = simd_andc(y, x);
     92        bitblock128_t prop = simd_not(simd_xor(x, y));
     93        bitblock128_t partial = simd128<64>::sub(simd128<64>::sub(x, y), borrow_in);
     94        bitblock128_t b1 = simd128<128>::slli<64>(simd128<64>::srli<63>(simd_or(gen, simd_and(prop, partial))));
     95        difference = simd128<64>::sub(partial, b1);
     96        borrow_out = simd128<128>::srli<127>(simd_or(gen, simd_and(prop, difference)));
     97}
     98
    5599
    56100IDISA_ALWAYS_INLINE void sbb(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference)
    57101{
    58         bitblock256_t gen = simd_andc(y, x);
    59         bitblock256_t prop = simd_not(simd_xor(x, y));
    60         difference = simd256<256>::sub(simd256<256>::sub(x, y), carry2bitblock(borrow_in));
    61         borrow_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_and(prop, difference))));
     102//        bitblock256_t gen = simd_andc(y, x);
     103//        bitblock256_t prop = simd_not(simd_xor(x, y));
     104//        difference = simd256<256>::sub(simd256<256>::sub(x, y), carry2bitblock(borrow_in));
     105//        borrow_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_and(prop, difference))));
     106  bitblock128_t x0 = avx_select_lo128(x);
     107  bitblock128_t x1 = avx_select_hi128(x);
     108  bitblock128_t y0 = avx_select_lo128(y);
     109  bitblock128_t y1 = avx_select_hi128(y);
     110  bitblock128_t b0 = avx_select_lo128(carry2bitblock(borrow_in));
     111  bitblock128_t d0, d1, b1, b2;
     112  sbb128(x0, y0, b0, b1, d0);
     113  sbb128(x1, y1, b1, b2, d1);
     114  difference = avx_general_combine256(d1, d0);
     115  borrow_out = _mm256_castps128_ps256((__m128) b2);
     116}
     117
     118IDISA_ALWAYS_INLINE void advance_with_carry128(bitblock128_t cursor, bitblock128_t carry_in, bitblock128_t & carry_out, bitblock128_t & rslt)
     119{
     120bitblock128_t shift_out = simd128<64>::srli<63>(cursor);
     121bitblock128_t low_bits = esimd128<64>::mergel(shift_out, carry_in);
     122carry_out = simd128<128>::srli<64>(shift_out);
     123rslt = simd_or(simd128<64>::add(cursor, cursor), low_bits);
    62124}
    63125
    64126IDISA_ALWAYS_INLINE void advance_with_carry(bitblock256_t cursor, carry_t carry_in, carry_t & carry_out, bitblock256_t & rslt)
    65127{
    66 carry_out = simd256<256>::srli<255>(cursor);
    67 rslt = simd_or(simd256<256>::add(cursor, cursor), carry_in);
     128  bitblock128_t cursor0 = avx_select_lo128(cursor);
     129  bitblock128_t cursor1 = avx_select_hi128(cursor);
     130  bitblock128_t  carry0 = avx_select_lo128(carry_in);
     131  bitblock128_t  carry1, carry2, rslt0, rslt1;
     132  advance_with_carry128(cursor0, carry0, carry1, rslt0);
     133  advance_with_carry128(cursor1, carry1, carry2, rslt1);
     134  rslt = avx_general_combine256(rslt1, rslt0);
     135  carry_out = _mm256_castps128_ps256((__m128)carry2);
    68136}
    69137
Note: See TracChangeset for help on using the changeset viewer.