source: trunk/lib/bitblock256.hpp @ 2196

Last change on this file since 2196 was 2196, checked in by cameron, 7 years ago

Speed up adc, sbb, advance_with_carry for 256 bits

File size: 6.4 KB
Line 
1#ifndef BITBLOCK256_HPP_
2#define BITBLOCK256_HPP_
3
4/*=============================================================================
5    bitblock256 - Specific 256 bit IDISA implementations.
6
7    Idealized SIMD Operations with SSE versions
8    Copyright (C) 2011, Robert D. Cameron, Kenneth S. Herdy, Hua Huang and Nigel Medforth.
9    Licensed to the public under the Open Software License 3.0.
10    Licensed to International Characters Inc.
11       under the Academic Free License version 3.0.
12
13=============================================================================*/
14
15#include "idisa128.hpp"
16#include "idisa256.hpp"
17#include "builtins.hpp"
18
19union ubitblock {
20        bitblock256_t _256;
21        bitblock256_t _128[sizeof(bitblock256_t)/sizeof(bitblock256_t)];
22        uint64_t _64[sizeof(bitblock256_t)/sizeof(uint64_t)];
23        uint32_t _32[sizeof(bitblock256_t)/sizeof(uint32_t)];
24        uint16_t _16[sizeof(bitblock256_t)/sizeof(uint16_t)];
25        uint8_t _8[sizeof(bitblock256_t)/sizeof(uint8_t)];
26};
27
28/* The type used to store a carry bit. */
29typedef bitblock256_t carry_t;
30
31static IDISA_ALWAYS_INLINE bitblock256_t carry2bitblock(carry_t carry);
32static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry);
33
34static IDISA_ALWAYS_INLINE void adc(bitblock256_t x, bitblock256_t y, carry_t & carry, bitblock256_t & sum);
35static IDISA_ALWAYS_INLINE void sbb(bitblock256_t x, bitblock256_t y, carry_t & borrow, bitblock256_t & difference);
36static IDISA_ALWAYS_INLINE void advance_with_carry(bitblock256_t cursor, carry_t & carry, bitblock256_t & rslt);
37
38static IDISA_ALWAYS_INLINE void adc(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum);
39static IDISA_ALWAYS_INLINE void sbb(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference);
40static IDISA_ALWAYS_INLINE void advance_with_carry(bitblock256_t cursor, carry_t carry_in, carry_t & carry_out, bitblock256_t & rslt);
41
42static IDISA_ALWAYS_INLINE bitblock256_t convert (uint64_t s);
43static IDISA_ALWAYS_INLINE uint64_t convert (bitblock256_t v);
44
45static IDISA_ALWAYS_INLINE bitblock256_t carry2bitblock(carry_t carry) {  return carry;}
46static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry) {  return carry;}
47
48
49
/*
 * Helpers for moving between a 256-bit block and its 128-bit halves.
 *
 * Every macro argument is parenthesized so that a cast written in front of
 * it applies to the whole argument expression rather than only to its first
 * operand.
 */

/* Low 128 bits of x, as __m128i. */
#define avx_select_lo128(x) \
        ((__m128i) _mm256_castps256_ps128((x)))

/* High 128 bits of x, as __m128i. */
#define avx_select_hi128(x) \
        ((__m128i)(_mm256_extractf128_ps((x), 1)))

/* 256-bit value whose high half is x and whose low half is y. */
#define avx_general_combine256(x, y) \
   (_mm256_insertf128_ps(_mm256_castps128_ps256((__m128) (y)), (__m128) (x), 1))
58
59IDISA_ALWAYS_INLINE void adc128(bitblock128_t x, bitblock128_t y, bitblock128_t carry_in, bitblock128_t & carry_out, bitblock128_t & sum)
60{
61        bitblock128_t gen = simd_and(x, y);
62        bitblock128_t prop = simd_or(x, y);
63        bitblock128_t partial = simd128<64>::add(simd128<64>::add(x, y), carry_in);
64        bitblock128_t c1 = simd128<128>::slli<64>(simd128<64>::srli<63>(simd_or(gen, simd_andc(prop, partial))));
65        sum = simd128<64>::add(c1, partial);
66        carry_out = simd128<128>::srli<127>(simd_or(gen, simd_andc(prop, sum)));
67}
68
69IDISA_ALWAYS_INLINE void adc(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum)
70{
71//  Really Slow!
72//        bitblock256_t gen = simd_and(x, y);
73//        bitblock256_t prop = simd_or(x, y);
74//        sum = simd256<256>::add(simd256<256>::add(x, y), carry2bitblock(carry_in));
75//        carry_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_andc(prop, sum))));
76
77  bitblock128_t x0 = avx_select_lo128(x);
78  bitblock128_t x1 = avx_select_hi128(x);
79  bitblock128_t y0 = avx_select_lo128(y);
80  bitblock128_t y1 = avx_select_hi128(y);
81  bitblock128_t c0 = avx_select_lo128(carry2bitblock(carry_in));
82  bitblock128_t s0, s1, c1, c2;
83  adc128(x0, y0, c0, c1, s0);
84  adc128(x1, y1, c1, c2, s1);
85  sum = avx_general_combine256(s1, s0);
86  carry_out = _mm256_castps128_ps256((__m128) c2);
87}
88
89IDISA_ALWAYS_INLINE void sbb128(bitblock128_t x, bitblock128_t y, bitblock128_t borrow_in, bitblock128_t & borrow_out, bitblock128_t & difference)
90{
91        bitblock128_t gen = simd_andc(y, x);
92        bitblock128_t prop = simd_not(simd_xor(x, y));
93        bitblock128_t partial = simd128<64>::sub(simd128<64>::sub(x, y), borrow_in);
94        bitblock128_t b1 = simd128<128>::slli<64>(simd128<64>::srli<63>(simd_or(gen, simd_and(prop, partial))));
95        difference = simd128<64>::sub(partial, b1);
96        borrow_out = simd128<128>::srli<127>(simd_or(gen, simd_and(prop, difference)));
97}
98
99
100IDISA_ALWAYS_INLINE void sbb(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference)
101{
102//        bitblock256_t gen = simd_andc(y, x);
103//        bitblock256_t prop = simd_not(simd_xor(x, y));
104//        difference = simd256<256>::sub(simd256<256>::sub(x, y), carry2bitblock(borrow_in));
105//        borrow_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_and(prop, difference))));
106  bitblock128_t x0 = avx_select_lo128(x);
107  bitblock128_t x1 = avx_select_hi128(x);
108  bitblock128_t y0 = avx_select_lo128(y);
109  bitblock128_t y1 = avx_select_hi128(y);
110  bitblock128_t b0 = avx_select_lo128(carry2bitblock(borrow_in));
111  bitblock128_t d0, d1, b1, b2;
112  sbb128(x0, y0, b0, b1, d0);
113  sbb128(x1, y1, b1, b2, d1);
114  difference = avx_general_combine256(d1, d0);
115  borrow_out = _mm256_castps128_ps256((__m128) b2);
116}
117
118IDISA_ALWAYS_INLINE void advance_with_carry128(bitblock128_t cursor, bitblock128_t carry_in, bitblock128_t & carry_out, bitblock128_t & rslt)
119{
120bitblock128_t shift_out = simd128<64>::srli<63>(cursor);
121bitblock128_t low_bits = esimd128<64>::mergel(shift_out, carry_in);
122carry_out = simd128<128>::srli<64>(shift_out);
123rslt = simd_or(simd128<64>::add(cursor, cursor), low_bits);
124}
125
126IDISA_ALWAYS_INLINE void advance_with_carry(bitblock256_t cursor, carry_t carry_in, carry_t & carry_out, bitblock256_t & rslt)
127{
128  bitblock128_t cursor0 = avx_select_lo128(cursor);
129  bitblock128_t cursor1 = avx_select_hi128(cursor);
130  bitblock128_t  carry0 = avx_select_lo128(carry_in);
131  bitblock128_t  carry1, carry2, rslt0, rslt1;
132  advance_with_carry128(cursor0, carry0, carry1, rslt0);
133  advance_with_carry128(cursor1, carry1, carry2, rslt1);
134  rslt = avx_general_combine256(rslt1, rslt0);
135  carry_out = _mm256_castps128_ps256((__m128)carry2);
136}
137
138IDISA_ALWAYS_INLINE bitblock256_t convert(uint64_t s)
139{
140        ubitblock b = {b._256 = simd256<128>::constant<0>()}; // = {0};
141        b._64[0] = s;
142        return b._256;
143}
144
145IDISA_ALWAYS_INLINE uint64_t convert (bitblock256_t v)
146{
147        return (uint64_t) mvmd256<64>::extract<0>(v);
148}
149
150
151#endif // BITBLOCK256_HPP_
Note: See TracBrowser for help on using the repository browser.