Changeset 2196
 Timestamp:
 May 30, 2012, 4:48:27 PM (7 years ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

trunk/lib/bitblock256.hpp
r2018 r2196 46 46 static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry) { return carry;} 47 47 48 49 50 #define avx_select_lo128(x) \ 51 ((__m128i) _mm256_castps256_ps128(x)) 52 53 #define avx_select_hi128(x) \ 54 ((__m128i)(_mm256_extractf128_ps(x, 1))) 55 56 #define avx_general_combine256(x, y) \ 57 (_mm256_insertf128_ps(_mm256_castps128_ps256((__m128) y), (__m128) x, 1)) 58 59 IDISA_ALWAYS_INLINE void adc128(bitblock128_t x, bitblock128_t y, bitblock128_t carry_in, bitblock128_t & carry_out, bitblock128_t & sum) 60 { 61 bitblock128_t gen = simd_and(x, y); 62 bitblock128_t prop = simd_or(x, y); 63 bitblock128_t partial = simd128<64>::add(simd128<64>::add(x, y), carry_in); 64 bitblock128_t c1 = simd128<128>::slli<64>(simd128<64>::srli<63>(simd_or(gen, simd_andc(prop, partial)))); 65 sum = simd128<64>::add(c1, partial); 66 carry_out = simd128<128>::srli<127>(simd_or(gen, simd_andc(prop, sum))); 67 } 68 48 69 IDISA_ALWAYS_INLINE void adc(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum) 49 70 { 50 bitblock256_t gen = simd_and(x, y); 51 bitblock256_t prop = simd_or(x, y); 52 sum = simd256<256>::add(simd256<256>::add(x, y), carry2bitblock(carry_in)); 53 carry_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_andc(prop, sum)))); 71 // Really Slow! 72 // bitblock256_t gen = simd_and(x, y); 73 // bitblock256_t prop = simd_or(x, y); 74 // sum = simd256<256>::add(simd256<256>::add(x, y), carry2bitblock(carry_in)); 75 // carry_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_andc(prop, sum)))); 76 77 bitblock128_t x0 = avx_select_lo128(x); 78 bitblock128_t x1 = avx_select_hi128(x); 79 bitblock128_t y0 = avx_select_lo128(y); 80 bitblock128_t y1 = avx_select_hi128(y); 81 bitblock128_t c0 = avx_select_lo128(carry2bitblock(carry_in)); 82 bitblock128_t s0, s1, c1, c2; 83 adc128(x0, y0, c0, c1, s0); 84 adc128(x1, y1, c1, c2, s1); 85 sum = avx_general_combine256(s1, s0); 86 carry_out = _mm256_castps128_ps256((__m128) c2); 54 87 } 88 89 IDISA_ALWAYS_INLINE void sbb128(bitblock128_t x, bitblock128_t y, bitblock128_t borrow_in, bitblock128_t & borrow_out, bitblock128_t & difference) 90 { 91 bitblock128_t gen = simd_andc(y, x); 92 bitblock128_t prop = simd_not(simd_xor(x, y)); 93 bitblock128_t partial = simd128<64>::sub(simd128<64>::sub(x, y), borrow_in); 94 bitblock128_t b1 = simd128<128>::slli<64>(simd128<64>::srli<63>(simd_or(gen, simd_and(prop, partial)))); 95 difference = simd128<64>::sub(partial, b1); 96 borrow_out = simd128<128>::srli<127>(simd_or(gen, simd_and(prop, difference))); 97 } 98 55 99 56 100 IDISA_ALWAYS_INLINE void sbb(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference) 57 101 { 58 bitblock256_t gen = simd_andc(y, x); 59 bitblock256_t prop = simd_not(simd_xor(x, y)); 60 difference = simd256<256>::sub(simd256<256>::sub(x, y), carry2bitblock(borrow_in)); 61 borrow_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_and(prop, difference)))); 102 // bitblock256_t gen = simd_andc(y, x); 103 // bitblock256_t prop = simd_not(simd_xor(x, y)); 104 // difference = simd256<256>::sub(simd256<256>::sub(x, y), carry2bitblock(borrow_in)); 105 // borrow_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_and(prop, difference)))); 106 bitblock128_t x0 = avx_select_lo128(x); 107 bitblock128_t x1 = avx_select_hi128(x); 108 bitblock128_t y0 = avx_select_lo128(y); 109 bitblock128_t y1 = avx_select_hi128(y); 110 bitblock128_t b0 = avx_select_lo128(carry2bitblock(borrow_in)); 111 bitblock128_t d0, d1, b1, b2; 112 sbb128(x0, y0, b0, b1, d0); 113 sbb128(x1, y1, b1, b2, d1); 114 difference = avx_general_combine256(d1, d0); 115 borrow_out = _mm256_castps128_ps256((__m128) b2); 116 } 117 118 IDISA_ALWAYS_INLINE void advance_with_carry128(bitblock128_t cursor, bitblock128_t carry_in, bitblock128_t & carry_out, bitblock128_t & rslt) 119 { 120 bitblock128_t shift_out = simd128<64>::srli<63>(cursor); 121 bitblock128_t low_bits = esimd128<64>::mergel(shift_out, carry_in); 122 carry_out = simd128<128>::srli<64>(shift_out); 123 rslt = simd_or(simd128<64>::add(cursor, cursor), low_bits); 62 124 } 63 125 64 126 IDISA_ALWAYS_INLINE void advance_with_carry(bitblock256_t cursor, carry_t carry_in, carry_t & carry_out, bitblock256_t & rslt) 65 127 { 66 carry_out = simd256<256>::srli<255>(cursor); 67 rslt = simd_or(simd256<256>::add(cursor, cursor), carry_in); 128 bitblock128_t cursor0 = avx_select_lo128(cursor); 129 bitblock128_t cursor1 = avx_select_hi128(cursor); 130 bitblock128_t carry0 = avx_select_lo128(carry_in); 131 bitblock128_t carry1, carry2, rslt0, rslt1; 132 advance_with_carry128(cursor0, carry0, carry1, rslt0); 133 advance_with_carry128(cursor1, carry1, carry2, rslt1); 134 rslt = avx_general_combine256(rslt1, rslt0); 135 carry_out = _mm256_castps128_ps256((__m128)carry2); 68 136 } 69 137
Note: See TracChangeset
for help on using the changeset viewer.