source: icGREP/icgrep-devel/icgrep/include/simd-lib/bitblock256.hpp @ 4904

Last change on this file since 4904 was 4904, checked in by cameron, 3 years ago

Refactoring progress towards layered kernels

File size: 13.6 KB
Line 
1#ifndef BITBLOCK256_HPP_
2#define BITBLOCK256_HPP_
3
4/*=============================================================================
5    bitblock256 - Specific 256 bit implementations.
6
7    Copyright (C) 2011, Robert D. Cameron, Kenneth S. Herdy, Hua Huang and Nigel Medforth.
8    Licensed to the public under the Open Software License 3.0.
9    Licensed to International Characters Inc.
10       under the Academic Free License version 3.0.
11
12=============================================================================*/
13
14#include "idisa128.hpp"
15#include "idisa256.hpp"
16#include "builtins.hpp"
17
18union ubitblock {
19        bitblock256_t _bitblock;
20        bitblock256_t _256;
21        bitblock128_t _128[sizeof(bitblock256_t)/sizeof(bitblock256_t)];
22        uint64_t _64[sizeof(bitblock256_t)/sizeof(uint64_t)];
23        uint32_t _32[sizeof(bitblock256_t)/sizeof(uint32_t)];
24        uint16_t _16[sizeof(bitblock256_t)/sizeof(uint16_t)];
25        uint8_t _8[sizeof(bitblock256_t)/sizeof(uint8_t)];
26};
27
/* The type used to store a carry bit.  By default a carry is kept in a
   full 256-bit register; defining CARRY64 keeps it in a uint64_t instead.
   The carry2bitblock / bitblock2carry / carry2uint64 / uint2carry helpers
   below convert between representations in either configuration. */
#ifndef CARRY64
typedef bitblock256_t carry_t;
#endif
#ifdef CARRY64
typedef uint64_t carry_t;
#endif
35
// Forward declarations.

// Full-block add/subtract/advance with explicit carry (borrow) in and out.
static IDISA_ALWAYS_INLINE void add_ci_co(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum);
static IDISA_ALWAYS_INLINE void sub_bi_bo(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference);
static IDISA_ALWAYS_INLINE void adv_ci_co(bitblock256_t cursor, carry_t carry_in, carry_t & carry_out, bitblock256_t & rslt);

// Conversions between the carry_t representation and bitblock form.
static IDISA_ALWAYS_INLINE bitblock256_t carry2bitblock(carry_t carry);
static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry);
static IDISA_ALWAYS_INLINE carry_t carryout2carry(bitblock256_t carryout);

// In-place variants: the carry (borrow) reference is both input and output.
static IDISA_ALWAYS_INLINE void adc(bitblock256_t x, bitblock256_t y, carry_t & carry, bitblock256_t & sum);
static IDISA_ALWAYS_INLINE void sbb(bitblock256_t x, bitblock256_t y, carry_t & borrow, bitblock256_t & difference);
static IDISA_ALWAYS_INLINE void advance_with_carry(bitblock256_t cursor, carry_t & carry, bitblock256_t & rslt);

// Split variants with separate carry (borrow) in and out parameters.
static IDISA_ALWAYS_INLINE void adc(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum);
static IDISA_ALWAYS_INLINE void sbb(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference);
static IDISA_ALWAYS_INLINE void advance_with_carry(bitblock256_t cursor, carry_t carry_in, carry_t & carry_out, bitblock256_t & rslt);

// Scalar <-> block conversions (operate on the low 64 bits).
static IDISA_ALWAYS_INLINE bitblock256_t convert (uint64_t s);
static IDISA_ALWAYS_INLINE bitblock128_t convert_128 (uint64_t s);
static IDISA_ALWAYS_INLINE uint64_t convert (bitblock256_t v);
59
#ifndef CARRY64
// carry_t is bitblock256_t: block conversions are identity; the scalar
// conversions go through convert() on the low 64 bits.
static IDISA_ALWAYS_INLINE bitblock256_t carry2bitblock(carry_t carry) {  return carry;}
static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry) {  return carry;}
static IDISA_ALWAYS_INLINE uint64_t carry2uint64(carry_t carry) {  return convert(carry);}
static IDISA_ALWAYS_INLINE carry_t uint2carry(uint64_t carry) {  return convert(carry);}
#endif
#ifdef CARRY64
// carry_t is uint64_t: scalar conversions are identity; the block
// conversions go through convert().
static IDISA_ALWAYS_INLINE bitblock256_t carry2bitblock(carry_t carry) {  return convert(carry);}
static IDISA_ALWAYS_INLINE carry_t bitblock2carry(bitblock256_t carry) {  return convert(carry);}
static IDISA_ALWAYS_INLINE uint64_t carry2uint64(carry_t carry) {  return carry;}
static IDISA_ALWAYS_INLINE carry_t uint2carry(uint64_t carry) {  return carry;}
#endif
72
// In this representation a carry-out value already is a carry_t, so the
// conversion is the identity.
static IDISA_ALWAYS_INLINE carry_t carryout2carry(carry_t carryout) {
  return carryout;
}
76
#ifndef AVX2
/* Pre-AVX2 fallback: _mm256_cvtepu16_epi64 does not exist, so it is
   replaced by a cast that merely places the 128-bit value in the low lane
   of a 256-bit register WITHOUT zero-extending each 16-bit word.
   NOTE(review): this is not semantically equivalent to the AVX2 intrinsic;
   presumably the 256-bit add paths that use it are never taken on
   non-AVX2 builds — verify before relying on them. */
#define _mm256_cvtepu16_epi64(x) _mm256_castps128_ps256((__m128) (x))
#endif
80
/* Add x + y + carry_in across the full 256 bits, producing sum and
   carry_out, via "long-stream addition": add the four 64-bit fields
   independently, then resolve inter-field carries with scalar bit tricks.
   A field whose partial sum is all ones ("bubble") passes an incoming
   carry straight through to the next field. */
static IDISA_ALWAYS_INLINE void add_ci_co(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum) {
  bitblock256_t all_ones = simd256<1>::constant<1>();
  bitblock256_t gen = simd_and(x, y);    // positions that generate a carry
  bitblock256_t prop = simd_xor(x, y);   // positions that propagate a carry
  bitblock256_t partial_sum = simd256<64>::add(x, y);
  // MSB of each 64-bit field = the carry out of that field.
  bitblock256_t carry = simd_or(gen, simd_andc(prop, partial_sum));
  bitblock256_t bubble = simd256<64>::eq(partial_sum, all_ones);
  // One bit per field; carry bits shifted up one so bit 0 holds carry_in.
  uint64_t carry_mask = hsimd256<64>::signmask(carry) * 2 + carry2uint64(carry_in);
  uint64_t bubble_mask = hsimd256<64>::signmask(bubble);
  // Adding the bubble mask ripples each carry across runs of saturated fields.
  uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) &~ bubble_mask;
  uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask);
  carry_out = uint2carry(increments >> 4);  // bit 4 = carry out of field 3
  // Spread increment bit i into 16-bit word i of a scalar, then widen each
  // word to a 64-bit field and add to complete the carry propagation.
  uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001;
  sum = simd256<64>::add(partial_sum, _mm256_cvtepu16_epi64(avx_select_lo128(convert(spread))));
}
96
/* Subtract x - y - borrow_in across the full 256 bits, producing the
   difference and borrow_out.  Mirrors add_ci_co: subtract 64-bit fields
   independently, then resolve inter-field borrows with scalar bit tricks.
   Here a "bubble" is a field whose partial difference is zero, which
   passes an incoming borrow straight through. */
static IDISA_ALWAYS_INLINE void sub_bi_bo(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference){
  bitblock256_t gen = simd_andc(y, x);            // positions that generate a borrow
  bitblock256_t prop = simd_not(simd_xor(x, y));  // positions that propagate a borrow
  bitblock256_t partial_diff = simd256<64>::sub(x, y);
  // MSB of each 64-bit field = the borrow out of that field.
  bitblock256_t borrow = simd_or(gen, simd_and(prop, partial_diff));
  bitblock256_t bubble = simd256<64>::eq(partial_diff, simd<1>::constant<0>());
  // One bit per field; borrow bits shifted up one so bit 0 holds borrow_in.
  uint64_t borrow_mask = hsimd256<64>::signmask(borrow) * 2 + carry2uint64(borrow_in);
  uint64_t bubble_mask = hsimd256<64>::signmask(bubble);
  // Adding the bubble mask ripples each borrow across runs of zero fields.
  uint64_t borrow_scan_thru_bubbles = (borrow_mask + bubble_mask) &~ bubble_mask;
  uint64_t decrements = borrow_scan_thru_bubbles | (borrow_scan_thru_bubbles - borrow_mask);
  borrow_out = uint2carry(decrements >> 4);  // bit 4 = borrow out of field 3
  // Spread decrement bit i into 16-bit word i, widen to 64-bit fields,
  // and subtract to complete the borrow propagation.
  uint64_t spread = 0x0000200040008001 * decrements & 0x0001000100010001;
  difference = simd256<64>::sub(partial_diff, _mm256_cvtepu16_epi64(avx_select_lo128(convert(spread))));
}
111
/* Advance: shift the 256-bit cursor left by one, with carry_in entering
   at bit 0 and the bit shifted out of bit 255 delivered in carry_out. */
static IDISA_ALWAYS_INLINE void adv_ci_co(bitblock256_t cursor, carry_t carry_in, carry_t & carry_out, bitblock256_t & rslt){
        bitblock256_t shift_out = simd256<64>::srli<63>(cursor);  // MSB of each field, moved to its bit 0
        // Route each field's outgoing bit into the next field up; carry_in fills field 0.
        bitblock256_t low_bits = simd_or(mvmd256<64>::slli<1>(shift_out), carry2bitblock(carry_in));
        carry_out = bitblock2carry(mvmd256<64>::srli<3>(shift_out));  // outgoing bit of the top field
        rslt = simd_or(simd256<64>::add(cursor, cursor), low_bits);   // add(x,x) == shift left by 1 within fields
}
118
119IDISA_ALWAYS_INLINE bitblock256_t convert(uint64_t s)
120{ 
121  return _mm256_castsi128_si256(_mm_cvtsi64_si128(s));
122  // ubitblock b = {b._256 = simd256<128>::constant<0>()}; // = {0};
123  // b._64[0] = s;
124  // return b._256;
125}
126
127IDISA_ALWAYS_INLINE bitblock128_t convert_128(uint64_t s)
128{
129  ubitblock b = {b._256 = simd256<128>::constant<0>()}; // = {0};
130  b._64[0] = s;
131  return b._128[0];
132}
133
134IDISA_ALWAYS_INLINE uint64_t convert(bitblock256_t v)
135{
136  return (uint64_t) _mm_cvtsi128_si64(avx_select_lo128(v));
137  // return (uint64_t) mvmd256<64>::extract<0>(v);
138}
139
140// The code below is not used.
141
142#ifdef AVX
/* Lane selection/combination helpers for AVX float-typed 256-bit blocks.
   avx_select_lo128 / avx_select_hi128 extract the low / high 128-bit lane
   as an integer vector; avx_general_combine256(x, y) builds a 256-bit
   value with x in the high lane and y in the low lane.
   Fixed: macro parameters are now parenthesized (CERT PRE01-C) so that
   expression arguments cannot change meaning under the casts. */
#define avx_select_lo128(x) \
        ((__m128i) _mm256_castps256_ps128((x)))

#define avx_select_hi128(x) \
        ((__m128i)(_mm256_extractf128_ps((x), 1)))

#define avx_general_combine256(x, y) \
   (_mm256_insertf128_ps(_mm256_castps128_ps256((__m128) (y)), (__m128) (x), 1))
151
/* 128-bit add-with-carry: sum = x + y + carry_in, where carry_in holds the
   incoming carry in its bit 0.  carry_out receives the carry out of bit 127,
   delivered in bit 0.  Works on two 64-bit fields, rippling the low field's
   carry into the high field before recomputing the final carry. */
IDISA_ALWAYS_INLINE void adc128(bitblock128_t x, bitblock128_t y, bitblock128_t carry_in, bitblock128_t & carry_out, bitblock128_t & sum)
{
        bitblock128_t gen = simd_and(x, y);   // carry-generate positions
        bitblock128_t prop = simd_or(x, y);   // carry-propagate positions
        bitblock128_t partial = simd128<64>::add(simd128<64>::add(x, y), carry_in);
        // Carry out of the low 64-bit field, repositioned to bit 64 so it
        // increments the high field.
        bitblock128_t c1 = simd128<128>::slli<64>(simd128<64>::srli<63>(simd_or(gen, simd_andc(prop, partial))));
        sum = simd128<64>::add(c1, partial);
        carry_out = simd128<128>::srli<127>(simd_or(gen, simd_andc(prop, sum)));
}
161
162#ifndef ADCMAGIC
/* 256-bit add-with-carry built from two chained 128-bit adc128 calls:
   low lanes first, with the low carry feeding the high lanes.
   carry_out receives the final carry in the low lane of a 256-bit value. */
IDISA_ALWAYS_INLINE void adc(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum)
{
//  Really Slow!
//        bitblock256_t gen = simd_and(x, y);
//        bitblock256_t prop = simd_or(x, y);
//        sum = simd256<256>::add(simd256<256>::add(x, y), carry2bitblock(carry_in));
//        carry_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_andc(prop, sum))));

  bitblock128_t x0 = avx_select_lo128(x);
  bitblock128_t x1 = avx_select_hi128(x);
  bitblock128_t y0 = avx_select_lo128(y);
  bitblock128_t y1 = avx_select_hi128(y);
  bitblock128_t c0 = avx_select_lo128(carry2bitblock(carry_in));
  bitblock128_t s0, s1, c1, c2;
  adc128(x0, y0, c0, c1, s0);   // low halves; c1 = carry into the high half
  adc128(x1, y1, c1, c2, s1);   // high halves; c2 = overall carry out
  sum = avx_general_combine256(s1, s0);
  carry_out = _mm256_castps128_ps256((__m128) c2);  // high lane left undefined; only the low bit is consumed
}
182#endif
183
184#ifdef ADCMAGIC
185
186#ifdef AVX2
187static inline void adc(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum) {
188        bitblock256_t all_ones = simd256<1>::constant<1>();
189        bitblock256_t gen = simd_and(x, y);
190        bitblock256_t prop = simd_xor(x, y);
191  bitblock256_t partial_sum = simd256<64>::add(x, y);
192        bitblock256_t carry = simd_or(gen, simd_andc(prop, partial_sum));
193  bitblock256_t bubble = simd256<64>::eq(partial_sum, all_ones);
194  uint64_t carry_mask = hsimd256<64>::signmask(carry) * 2 + convert(carry_in);
195  uint64_t bubble_mask = hsimd256<64>::signmask(bubble);
196        uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) &~ bubble_mask;
197  uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask);
198  carry_out = convert(increments >> 4);
199  uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001;
200  sum = simd256<64>::add(partial_sum, _mm256_cvtepu8_epi64(convert_128(spread)));
201}
202#else
/* Pre-AVX2 ADCMAGIC add-with-carry: performs the 64-bit field additions
   and carry/bubble detection in two 128-bit halves, merges the per-field
   masks into scalars, resolves carries with the same scan-through-bubbles
   trick as add_ci_co, then spreads the increments back into the lanes. */
static inline void adc(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum) {
  bitblock128_t all_ones = simd128<1>::constant<1>();
  //bitblock256_t gen = simd_and(x, y);
  //bitblock256_t prop = simd_xor(x, y);
  bitblock128_t x0 = avx_select_lo128(x);
  bitblock128_t x1 = avx_select_hi128(x);
  bitblock128_t y0 = avx_select_lo128(y);
  bitblock128_t y1 = avx_select_hi128(y);
        bitblock128_t sum0 = simd128<64>::add(x0, y0);
        bitblock128_t sum1 = simd128<64>::add(x1, y1);
  //bitblock256_t icarry = simd_or(gen, simd_andc(prop, avx_general_combine256(sum1, sum0)));
        // Per-half carry-out of each 64-bit field (gen | (prop & ~sum) form).
        bitblock128_t icarry0 = simd_or(simd_and(x0, y0), simd_andc(simd_or(x0, y0), sum0));
        bitblock128_t icarry1 = simd_or(simd_and(x1, y1), simd_andc(simd_or(x1, y1), sum1));
        // A carry may bubble through a field if it is all ones.
        bitblock128_t bubble0 = simd128<64>::eq(sum0, all_ones);
        bitblock128_t bubble1 = simd128<64>::eq(sum1, all_ones);
        //bitblock128_t bubble = hsimd128<64>::packss(bubble1, bubble0);
        bitblock256_t bubble = avx_general_combine256(bubble1, bubble0);
        //uint64_t carry_mask = _mm256_movemask_pd((__m256d) icarry) * 2 + convert(carry_in);
        // Pack the four field carries into bits 1..4 (bit 0 = carry_in).
        uint64_t carry_mask = hsimd128<64>::signmask(icarry1) * 8 + hsimd128<64>::signmask(icarry0) * 2 + convert(carry_in);
        uint64_t bubble_mask = _mm256_movemask_pd((__m256d) bubble);
        //uint64_t bubble_mask = hsimd128<64>::signmask(bubble1) * 4 + hsimd128<64>::signmask(bubble0);
  //uint64_t bubble_mask = hsimd128<32>::signmask(bubble);
  // Ripple carries across runs of saturated fields in scalar arithmetic.
  uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) &~ bubble_mask;
        uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask);
        carry_out = convert(increments >> 4);  // bit 4 = carry out of the top field
        // Spread increment bit i into 16-bit word i, then widen through
        // 32-bit lanes into the two 64-bit-field halves and add.
        uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001;
        bitblock128_t inc_32 = _mm_cvtepu16_epi32(_mm_cvtsi64_si128(spread));
        bitblock128_t inc_64_0 = esimd128<32>::mergel(simd128<1>::constant<0>(), inc_32);
        bitblock128_t inc_64_1 = esimd128<32>::mergeh(simd128<1>::constant<0>(), inc_32);
        sum = avx_general_combine256(simd128<64>::add(sum1, inc_64_1), simd128<64>::add(sum0, inc_64_0));
}
235#endif
236
237#endif
238
/* 128-bit subtract-with-borrow: difference = x - y - borrow_in, where
   borrow_in holds the incoming borrow in bit 0.  borrow_out receives the
   borrow out of bit 127, in bit 0.  Mirrors adc128 with borrow logic
   (gen = ~x & y, prop = ~(x ^ y)). */
IDISA_ALWAYS_INLINE void sbb128(bitblock128_t x, bitblock128_t y, bitblock128_t borrow_in, bitblock128_t & borrow_out, bitblock128_t & difference)
{
        bitblock128_t gen = simd_andc(y, x);            // borrow-generate positions
        bitblock128_t prop = simd_not(simd_xor(x, y));  // borrow-propagate positions
        bitblock128_t partial = simd128<64>::sub(simd128<64>::sub(x, y), borrow_in);
        // Borrow out of the low 64-bit field, repositioned to bit 64 so it
        // decrements the high field.
        bitblock128_t b1 = simd128<128>::slli<64>(simd128<64>::srli<63>(simd_or(gen, simd_and(prop, partial))));
        difference = simd128<64>::sub(partial, b1);
        borrow_out = simd128<128>::srli<127>(simd_or(gen, simd_and(prop, difference)));
}
248
249
/* 256-bit subtract-with-borrow built from two chained 128-bit sbb128
   calls: low lanes first, with the low borrow feeding the high lanes.
   borrow_out receives the final borrow in the low lane. */
IDISA_ALWAYS_INLINE void sbb(bitblock256_t x, bitblock256_t y, carry_t borrow_in, carry_t & borrow_out, bitblock256_t & difference)
{
//        bitblock256_t gen = simd_andc(y, x);
//        bitblock256_t prop = simd_not(simd_xor(x, y));
//        difference = simd256<256>::sub(simd256<256>::sub(x, y), carry2bitblock(borrow_in));
//        borrow_out = bitblock2carry(simd256<256>::srli<255>(simd_or(gen, simd_and(prop, difference))));
  bitblock128_t x0 = avx_select_lo128(x);
  bitblock128_t x1 = avx_select_hi128(x);
  bitblock128_t y0 = avx_select_lo128(y);
  bitblock128_t y1 = avx_select_hi128(y);
  bitblock128_t b0 = avx_select_lo128(carry2bitblock(borrow_in));
  bitblock128_t d0, d1, b1, b2;
  sbb128(x0, y0, b0, b1, d0);   // low halves; b1 = borrow into the high half
  sbb128(x1, y1, b1, b2, d1);   // high halves; b2 = overall borrow out
  difference = avx_general_combine256(d1, d0);
  borrow_out = _mm256_castps128_ps256((__m128) b2);  // high lane left undefined; only the low bit is consumed
}
267
268IDISA_ALWAYS_INLINE void advance_with_carry128(bitblock128_t cursor, bitblock128_t carry_in, bitblock128_t & carry_out, bitblock128_t & rslt)
269{
270bitblock128_t shift_out = simd128<64>::srli<63>(cursor);
271bitblock128_t low_bits = esimd128<64>::mergel(shift_out, carry_in);
272carry_out = simd128<128>::srli<64>(shift_out);
273rslt = simd_or(simd128<64>::add(cursor, cursor), low_bits);
274}
275
/* 256-bit advance (shift left by one) built from two chained 128-bit
   advance_with_carry128 calls: the low half's outgoing bit feeds the high
   half, and the high half's outgoing bit becomes carry_out. */
IDISA_ALWAYS_INLINE void advance_with_carry(bitblock256_t cursor, carry_t carry_in, carry_t & carry_out, bitblock256_t & rslt)
{
  bitblock128_t cursor0 = avx_select_lo128(cursor);
  bitblock128_t cursor1 = avx_select_hi128(cursor);
  bitblock128_t  carry0 = avx_select_lo128(carry_in);
  bitblock128_t  carry1, carry2, rslt0, rslt1;
  advance_with_carry128(cursor0, carry0, carry1, rslt0);  // low half; carry1 feeds the high half
  advance_with_carry128(cursor1, carry1, carry2, rslt1);  // high half; carry2 = overall carry out
  rslt = avx_general_combine256(rslt1, rslt0);
  carry_out = _mm256_castps128_ps256((__m128)carry2);  // high lane left undefined; only the low bit is consumed
}
287#endif
288
289#endif // BITBLOCK256_HPP_
Note: See TracBrowser for help on using the repository browser.