#ifndef BITBLOCK256_HPP_
#define BITBLOCK256_HPP_

/*=============================================================================
    bitblock256 - Specific 256 bit implementations.

    Copyright (C) 2011, Robert D. Cameron, Kenneth S. Herdy, Hua Huang and Nigel Medforth.
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters Inc.
    under the Academic Free License version 3.0.

=============================================================================*/

#include "idisa128.hpp"
#include "idisa256.hpp"
#include "builtins.hpp"

union ubitblock {
    bitblock256_t _bitblock;
    bitblock256_t _256;
    bitblock128_t _128[sizeof(bitblock256_t)/sizeof(bitblock128_t)];
    uint64_t _64[sizeof(bitblock256_t)/sizeof(uint64_t)];
    uint32_t _32[sizeof(bitblock256_t)/sizeof(uint32_t)];
    uint16_t _16[sizeof(bitblock256_t)/sizeof(uint16_t)];
    uint8_t _8[sizeof(bitblock256_t)/sizeof(uint8_t)];
};
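
// Illustrative sketch (not from the original source): ubitblock provides a way to
// view and update a 256-bit block through narrower fields, e.g.
//   ubitblock b;
//   b._256 = simd256<128>::constant<0>();
//   b._64[0] = 0x1ULL;            // set bit 0 of the block
//   bitblock256_t v = b._256;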

/* The type used to store a carry bit: a full SIMD register by default,
   or a 64-bit scalar when CARRY64 is defined. */
#ifndef CARRY64
typedef bitblock256_t carry_t;
#endif
#ifdef CARRY64
typedef uint64_t carry_t;
#endif

static IDISA_ALWAYS_INLINE void add_ci_co(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum);
static IDISA_ALWAYS_INLINE void sub_bi_bo(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference);
static IDISA_ALWAYS_INLINE void adv_ci_co(bitblock256_t cursor, carry_t carry_in, carry_t & carry_out, bitblock256_t & rslt);

static IDISA_ALWAYS_INLINE bitblock256_t carry2bitblock(carry_t carry);
static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry);
static IDISA_ALWAYS_INLINE carry_t carryout2carry(carry_t carryout);

static IDISA_ALWAYS_INLINE void adc(bitblock256_t x, bitblock256_t y, carry_t & carry, bitblock256_t & sum);
static IDISA_ALWAYS_INLINE void sbb(bitblock256_t x, bitblock256_t y, carry_t & borrow, bitblock256_t & difference);
static IDISA_ALWAYS_INLINE void advance_with_carry(bitblock256_t cursor, carry_t & carry, bitblock256_t & rslt);

static IDISA_ALWAYS_INLINE void adc(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum);
static IDISA_ALWAYS_INLINE void sbb(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference);
static IDISA_ALWAYS_INLINE void advance_with_carry(bitblock256_t cursor, carry_t carry_in, carry_t & carry_out, bitblock256_t & rslt);
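
// Illustrative sketch (not part of this header): the carry-in/carry-out forms are
// intended for block-at-a-time processing of long bit streams, threading the carry
// from one 256-bit block to the next. Assuming hypothetical arrays a, b and sum of
// nblocks bitblock256_t values:
//   carry_t carry = uint2carry(0);
//   for (unsigned i = 0; i < nblocks; i++) {
//       carry_t carry_out;
//       add_ci_co(a[i], b[i], carry, carry_out, sum[i]);
//       carry = carry_out;
//   }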

static IDISA_ALWAYS_INLINE bitblock256_t convert(uint64_t s);
static IDISA_ALWAYS_INLINE bitblock128_t convert_128(uint64_t s);
static IDISA_ALWAYS_INLINE uint64_t convert(bitblock256_t v);

#ifndef CARRY64
static IDISA_ALWAYS_INLINE bitblock256_t carry2bitblock(carry_t carry) { return carry; }
static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry) { return carry; }
static IDISA_ALWAYS_INLINE uint64_t carry2uint64(carry_t carry) { return convert(carry); }
static IDISA_ALWAYS_INLINE carry_t uint2carry(uint64_t carry) { return convert(carry); }
#endif
#ifdef CARRY64
static IDISA_ALWAYS_INLINE bitblock256_t carry2bitblock(carry_t carry) { return convert(carry); }
static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry) { return convert(carry); }
static IDISA_ALWAYS_INLINE uint64_t carry2uint64(carry_t carry) { return carry; }
static IDISA_ALWAYS_INLINE carry_t uint2carry(uint64_t carry) { return carry; }
#endif

static IDISA_ALWAYS_INLINE carry_t carryout2carry(carry_t carryout) {
    return carryout;
}

static IDISA_ALWAYS_INLINE void add_ci_co(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum) {
    bitblock256_t all_ones = simd256<1>::constant<1>();
    bitblock256_t gen = simd_and(x, y);                                  // bit positions that generate a carry
    bitblock256_t prop = simd_xor(x, y);                                 // bit positions that propagate a carry
    bitblock256_t partial_sum = simd256<64>::add(x, y);
    bitblock256_t carry = simd_or(gen, simd_andc(prop, partial_sum));    // MSB of each 64-bit field = that field's carry-out
    bitblock256_t bubble = simd256<64>::eq(partial_sum, all_ones);       // all-ones fields pass an incoming carry straight through
    uint64_t carry_mask = hsimd256<64>::signmask(carry) * 2 + carry2uint64(carry_in);   // bit i+1 = carry out of field i, bit 0 = carry_in
    uint64_t bubble_mask = hsimd256<64>::signmask(bubble);
    uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) & ~bubble_mask;
    uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask);
    carry_out = uint2carry(increments >> 4);
    // Spread the low four increment bits to 16-bit intervals, widen them to 64-bit fields, and add them in.
    uint64_t spread = (0x0000200040008001 * increments) & 0x0001000100010001;
    sum = simd256<64>::add(partial_sum, _mm256_cvtepu16_epi64(avx_select_lo128(convert(spread))));
}
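// Worked example of the carry-scan logic above (illustrative): suppose
// carry_in = 1, field 0 of partial_sum is all ones (a "bubble") and no field
// generates a carry of its own. Then
//   carry_mask  = 0b00001                                    (just the incoming carry)
//   bubble_mask = 0b00001                                    (field 0 is all ones)
//   carry_scan_thru_bubbles = (0b00001 + 0b00001) & ~0b00001 = 0b00010
//   increments  = 0b00010 | (0b00010 - 0b00001)              = 0b00011
// so fields 0 and 1 are each incremented (field 0 wraps from all ones to zero,
// field 1 absorbs the carry) and carry_out = increments >> 4 = 0.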

static IDISA_ALWAYS_INLINE void sub_bi_bo(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference) {
    bitblock256_t gen = simd_andc(y, x);                                 // bit positions that generate a borrow
    bitblock256_t prop = simd_not(simd_xor(x, y));                       // bit positions that propagate a borrow
    bitblock256_t partial_diff = simd256<64>::sub(x, y);
    bitblock256_t borrow = simd_or(gen, simd_and(prop, partial_diff));   // MSB of each 64-bit field = that field's borrow-out
    bitblock256_t bubble = simd256<64>::eq(partial_diff, simd<1>::constant<0>());   // all-zero fields pass an incoming borrow straight through
    uint64_t borrow_mask = hsimd256<64>::signmask(borrow) * 2 + carry2uint64(borrow_in);   // bit i+1 = borrow out of field i, bit 0 = borrow_in
    uint64_t bubble_mask = hsimd256<64>::signmask(bubble);
    uint64_t borrow_scan_thru_bubbles = (borrow_mask + bubble_mask) & ~bubble_mask;
    uint64_t decrements = borrow_scan_thru_bubbles | (borrow_scan_thru_bubbles - borrow_mask);
    borrow_out = uint2carry(decrements >> 4);
    // Spread the low four decrement bits to 16-bit intervals, widen them to 64-bit fields, and subtract them.
    uint64_t spread = (0x0000200040008001 * decrements) & 0x0001000100010001;
    difference = simd256<64>::sub(partial_diff, _mm256_cvtepu16_epi64(avx_select_lo128(convert(spread))));
}

static IDISA_ALWAYS_INLINE void adv_ci_co(bitblock256_t cursor, carry_t carry_in, carry_t & carry_out, bitblock256_t & rslt) {
    bitblock256_t shift_out = simd256<64>::srli<63>(cursor);                                       // each field's top bit, moved to bit 0
    bitblock256_t low_bits = simd_or(mvmd256<64>::slli<1>(shift_out), carry2bitblock(carry_in));   // incoming low bit for each field
    carry_out = bitblock2carry(mvmd256<64>::srli<3>(shift_out));                                   // the block's outgoing top bit
    rslt = simd_or(simd256<64>::add(cursor, cursor), low_bits);                                    // per-field shift left by one, plus the incoming bits
}
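// Net effect (for reference): rslt is the whole 256-bit block shifted left by
// one bit, with carry_in supplying bit 0 and carry_out receiving the old bit
// 255. For example, a lone cursor bit at position 63 lands at position 64,
// crossing the 64-bit field boundary via low_bits.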

IDISA_ALWAYS_INLINE bitblock256_t convert(uint64_t s)
{
    return _mm256_castsi128_si256(_mm_cvtsi64_si128(s));
    // ubitblock b = {b._256 = simd256<128>::constant<0>()}; // = {0};
    // b._64[0] = s;
    // return b._256;
}

IDISA_ALWAYS_INLINE bitblock128_t convert_128(uint64_t s)
{
    ubitblock b;
    b._256 = simd256<128>::constant<0>();
    b._64[0] = s;
    return b._128[0];
}

IDISA_ALWAYS_INLINE uint64_t convert(bitblock256_t v)
{
    return (uint64_t) _mm_cvtsi128_si64(avx_select_lo128(v));
    // return (uint64_t) mvmd256<64>::extract<0>(v);
}

// The code below is not used.

#ifdef AVX
// Select the low or high 128-bit half of a 256-bit register.
#define avx_select_lo128(x) \
    ((__m128i) _mm256_castps256_ps128(x))

#define avx_select_hi128(x) \
    ((__m128i)(_mm256_extractf128_ps(x, 1)))

// Combine two 128-bit halves into a 256-bit value; x supplies the high half, y the low half.
#define avx_general_combine256(x, y) \
    (_mm256_insertf128_ps(_mm256_castps128_ps256((__m128) y), (__m128) x, 1))

IDISA_ALWAYS_INLINE void adc128(bitblock128_t x, bitblock128_t y, bitblock128_t carry_in, bitblock128_t & carry_out, bitblock128_t & sum)
{
    bitblock128_t gen = simd_and(x, y);
    bitblock128_t prop = simd_or(x, y);
    bitblock128_t partial = simd128<64>::add(simd128<64>::add(x, y), carry_in);
    // Carry out of the low 64-bit field, repositioned as an increment of the high field.
    bitblock128_t c1 = simd128<128>::slli<64>(simd128<64>::srli<63>(simd_or(gen, simd_andc(prop, partial))));
    sum = simd128<64>::add(c1, partial);
    carry_out = simd128<128>::srli<127>(simd_or(gen, simd_andc(prop, sum)));
}

#ifndef ADCMAGIC
IDISA_ALWAYS_INLINE void adc(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum)
{
    // Really Slow!
    // bitblock256_t gen = simd_and(x, y);
    // bitblock256_t prop = simd_or(x, y);
    // sum = simd256<256>::add(simd256<256>::add(x, y), carry2bitblock(carry_in));
    // carry_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_andc(prop, sum))));

    // Split into 128-bit halves and chain adc128 across them.
    bitblock128_t x0 = avx_select_lo128(x);
    bitblock128_t x1 = avx_select_hi128(x);
    bitblock128_t y0 = avx_select_lo128(y);
    bitblock128_t y1 = avx_select_hi128(y);
    bitblock128_t c0 = avx_select_lo128(carry2bitblock(carry_in));
    bitblock128_t s0, s1, c1, c2;
    adc128(x0, y0, c0, c1, s0);
    adc128(x1, y1, c1, c2, s1);
    sum = avx_general_combine256(s1, s0);
    carry_out = _mm256_castps128_ps256((__m128) c2);
}
#endif

#ifdef ADCMAGIC

#ifdef AVX2
static inline void adc(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum) {
    bitblock256_t all_ones = simd256<1>::constant<1>();
    bitblock256_t gen = simd_and(x, y);
    bitblock256_t prop = simd_xor(x, y);
    bitblock256_t partial_sum = simd256<64>::add(x, y);
    bitblock256_t carry = simd_or(gen, simd_andc(prop, partial_sum));
    bitblock256_t bubble = simd256<64>::eq(partial_sum, all_ones);
    uint64_t carry_mask = hsimd256<64>::signmask(carry) * 2 + convert(carry_in);
    uint64_t bubble_mask = hsimd256<64>::signmask(bubble);
    uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) & ~bubble_mask;
    uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask);
    carry_out = convert(increments >> 4);
    uint64_t spread = (0x0000200040008001 * increments) & 0x0001000100010001;
    // The increment bits are spread at 16-bit intervals, so widen 16-bit fields
    // (not bytes) to 64-bit fields before adding them in.
    sum = simd256<64>::add(partial_sum, _mm256_cvtepu16_epi64(convert_128(spread)));
}
#else
static inline void adc(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum) {
    bitblock128_t all_ones = simd128<1>::constant<1>();
    //bitblock256_t gen = simd_and(x, y);
    //bitblock256_t prop = simd_xor(x, y);
    bitblock128_t x0 = avx_select_lo128(x);
    bitblock128_t x1 = avx_select_hi128(x);
    bitblock128_t y0 = avx_select_lo128(y);
    bitblock128_t y1 = avx_select_hi128(y);
    bitblock128_t sum0 = simd128<64>::add(x0, y0);
    bitblock128_t sum1 = simd128<64>::add(x1, y1);
    //bitblock256_t icarry = simd_or(gen, simd_andc(prop, avx_general_combine256(sum1, sum0)));
    bitblock128_t icarry0 = simd_or(simd_and(x0, y0), simd_andc(simd_or(x0, y0), sum0));
    bitblock128_t icarry1 = simd_or(simd_and(x1, y1), simd_andc(simd_or(x1, y1), sum1));
    // A carry may bubble through a field if it is all ones.
    bitblock128_t bubble0 = simd128<64>::eq(sum0, all_ones);
    bitblock128_t bubble1 = simd128<64>::eq(sum1, all_ones);
    //bitblock128_t bubble = hsimd128<64>::packss(bubble1, bubble0);
    bitblock256_t bubble = avx_general_combine256(bubble1, bubble0);
    //uint64_t carry_mask = _mm256_movemask_pd((__m256d) icarry) * 2 + convert(carry_in);
    uint64_t carry_mask = hsimd128<64>::signmask(icarry1) * 8 + hsimd128<64>::signmask(icarry0) * 2 + convert(carry_in);
    uint64_t bubble_mask = _mm256_movemask_pd((__m256d) bubble);
    //uint64_t bubble_mask = hsimd128<64>::signmask(bubble1) * 4 + hsimd128<64>::signmask(bubble0);
    //uint64_t bubble_mask = hsimd128<32>::signmask(bubble);
    uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) & ~bubble_mask;
    uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask);
    carry_out = convert(increments >> 4);
    uint64_t spread = (0x0000200040008001 * increments) & 0x0001000100010001;
    bitblock128_t inc_32 = _mm_cvtepu16_epi32(_mm_cvtsi64_si128(spread));
    bitblock128_t inc_64_0 = esimd128<32>::mergel(simd128<1>::constant<0>(), inc_32);
    bitblock128_t inc_64_1 = esimd128<32>::mergeh(simd128<1>::constant<0>(), inc_32);
    sum = avx_general_combine256(simd128<64>::add(sum1, inc_64_1), simd128<64>::add(sum0, inc_64_0));
}
#endif

#endif

IDISA_ALWAYS_INLINE void sbb128(bitblock128_t x, bitblock128_t y, bitblock128_t borrow_in, bitblock128_t & borrow_out, bitblock128_t & difference)
{
    bitblock128_t gen = simd_andc(y, x);
    bitblock128_t prop = simd_not(simd_xor(x, y));
    bitblock128_t partial = simd128<64>::sub(simd128<64>::sub(x, y), borrow_in);
    // Borrow out of the low 64-bit field, repositioned as a decrement of the high field.
    bitblock128_t b1 = simd128<128>::slli<64>(simd128<64>::srli<63>(simd_or(gen, simd_and(prop, partial))));
    difference = simd128<64>::sub(partial, b1);
    borrow_out = simd128<128>::srli<127>(simd_or(gen, simd_and(prop, difference)));
}

IDISA_ALWAYS_INLINE void sbb(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference)
{
    // bitblock256_t gen = simd_andc(y, x);
    // bitblock256_t prop = simd_not(simd_xor(x, y));
    // difference = simd256<256>::sub(simd256<256>::sub(x, y), carry2bitblock(borrow_in));
    // borrow_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_and(prop, difference))));

    // Split into 128-bit halves and chain sbb128 across them.
    bitblock128_t x0 = avx_select_lo128(x);
    bitblock128_t x1 = avx_select_hi128(x);
    bitblock128_t y0 = avx_select_lo128(y);
    bitblock128_t y1 = avx_select_hi128(y);
    bitblock128_t b0 = avx_select_lo128(carry2bitblock(borrow_in));
    bitblock128_t d0, d1, b1, b2;
    sbb128(x0, y0, b0, b1, d0);
    sbb128(x1, y1, b1, b2, d1);
    difference = avx_general_combine256(d1, d0);
    borrow_out = _mm256_castps128_ps256((__m128) b2);
}

IDISA_ALWAYS_INLINE void advance_with_carry128(bitblock128_t cursor, bitblock128_t carry_in, bitblock128_t & carry_out, bitblock128_t & rslt)
{
    // Shift the 128-bit block left by one bit: carry_in supplies bit 0, carry_out receives the old bit 127.
    bitblock128_t shift_out = simd128<64>::srli<63>(cursor);
    bitblock128_t low_bits = esimd128<64>::mergel(shift_out, carry_in);
    carry_out = simd128<128>::srli<64>(shift_out);
    rslt = simd_or(simd128<64>::add(cursor, cursor), low_bits);
}

IDISA_ALWAYS_INLINE void advance_with_carry(bitblock256_t cursor, carry_t carry_in, carry_t & carry_out, bitblock256_t & rslt)
{
    // Split into 128-bit halves and chain advance_with_carry128 across them.
    bitblock128_t cursor0 = avx_select_lo128(cursor);
    bitblock128_t cursor1 = avx_select_hi128(cursor);
    bitblock128_t carry0 = avx_select_lo128(carry_in);
    bitblock128_t carry1, carry2, rslt0, rslt1;
    advance_with_carry128(cursor0, carry0, carry1, rslt0);
    advance_with_carry128(cursor1, carry1, carry2, rslt1);
    rslt = avx_general_combine256(rslt1, rslt0);
    carry_out = _mm256_castps128_ps256((__m128) carry2);
}
#endif

#endif // BITBLOCK256_HPP_