source: trunk/lib/block_carry_avx.h @ 1073

Last change on this file since 1073 was 973, checked in by cameron, 8 years ago

Fix ADCMAGIC version

File size: 13.3 KB
Line 
1/*  Block Addition, Subtraction and Shifts with Carry
2    Copyright (C) 2010, Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters Inc.
5       under the Academic Free License version 3.0.
6
7This file defines addition, subtract and shift operations on
8128-bit blocks.   Different versions of the operations are
9selectable with the CARRY_STRATEGY preprocessor constant.
10
11Each implementation defines the following "abstract data type"
12for block operations with carry.
13
14Typename:   CarryType
15Constant:   Carry0  represents a value of 0 for the carry bit.
16Predicate:  test_carry(x) returns nonzero if a carry bit is 1, 0 otherwise.
17Function:   carry_or(carry1, carry2) forms the logical or of two carries.
18Function:   adc128(x, y, carry, sum) computes (carry, sum) = x + y + carry,
19Function:   advance_with_carry(cursor, carry, rslt)
20                 computes (carry, rslt) = cursor + cursor + carry
21Function:   sbb128(x, y, borrow, diff)
22                 computes (borrow, diff) = y - x - borrow
23
24*/
25#ifndef BLOCK_CARRY_H
26#define BLOCK_CARRY_H
27
28
29typedef union {SIMD_type bitblock; uint64_t int64[2];} BitBlock_int64;
30
31
32
33/*------------------------------------------------------------*/
34#include "avx_simd.h"
35
36#define SIMD_CARRY_STRATEGY 1
37#define ADC64_STRATEGY 2
38#define ADC64_SAHF_STRATEGY 3
39
40#ifdef ADC64
41#define CARRY_STRATEGY ADC64_STRATEGY
42#else
43#define CARRY_STRATEGY SIMD_CARRY_STRATEGY
44#endif
45
46#if (CARRY_STRATEGY == ADC64_STRATEGY)
47typedef uint64_t CarryType;
48typedef union {SIMD_type bitblock; uint64_t int64[4];} SIMD256_int64;
49
50#define Carry0 0
51
52#define test_carry(x) ((x) != 0)
53
54#define carry_or(carry1, carry2) (carry1 | carry2)
55
56
57static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) __attribute__((always_inline));
58static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) __attribute__((always_inline));
59static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) __attribute__((always_inline));
60
61
62static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) {
63  SIMD256_int64 a, b, rslt;
64//printf("carryin = %lu\n",carry);
65//print_simd_register("x", x);
66//print_simd_register("y", y);
67  a.bitblock = x;
68  b.bitblock = y;
69  asm volatile("negq %[carryflag]\n\t"
70       "movq 0(%[xaddr]), %[r0]\n\t"
71       "adcq 0(%[yaddr]), %[r0]\n\t"
72       "movq 8(%[xaddr]), %[r1]\n\t"
73       "adcq 8(%[yaddr]), %[r1]\n\t"
74       "movq 16(%[xaddr]), %[r2]\n\t"
75       "adcq 16(%[yaddr]), %[r2]\n\t"
76       "movq 24(%[xaddr]), %[r3]\n\t"
77       "adcq 24(%[yaddr]), %[r3]\n\t"
78       "movq $0, %[carryflag]\n\t"
79       "adcq $0, %[carryflag]\n\t"
80        : [carryflag] "=&r" (carry), 
81          [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3])
82        : "[carryflag]" (carry), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock)
83        : "cc");
84  sum = rslt.bitblock;
85//printf("carryout = %lu\n",carry);
86//print_simd_register("sum", sum);
87}
88 
89static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) {
90  SIMD256_int64 a, b, rslt;
91//printf("borrowin = %lu\n",borrow);
92//print_simd_register("x", x);
93//print_simd_register("y", y);
94  a.bitblock = x;
95  b.bitblock = y;
96  asm volatile("negq %[carryflag]\n\t"
97       "movq 0(%[xaddr]), %[r0]\n\t"
98       "sbbq 0(%[yaddr]), %[r0]\n\t"
99       "movq 8(%[xaddr]), %[r1]\n\t"
100       "sbbq 8(%[yaddr]), %[r1]\n\t"
101       "movq 16(%[xaddr]), %[r2]\n\t"
102       "sbbq 16(%[yaddr]), %[r2]\n\t"
103       "movq 24(%[xaddr]), %[r3]\n\t"
104       "sbbq 24(%[yaddr]), %[r3]\n\t"
105       "movq $0, %[carryflag]\n\t"
106       "adcq $0, %[carryflag]\n\t"
107        : [carryflag] "=&r" (borrow),
108          [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3])
109        : "[carryflag]" (borrow), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock)
110        : "cc");
111  diff = rslt.bitblock;
112//printf("borrowout = %lu\n",borrow);
113//print_simd_register("diff", diff);
114}
115
116static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) {
117  SIMD256_int64 r;
118  SIMD_type a = x;
119//printf("shift in = %lu\n",carry);
120//print_simd_register("x", x);
121  asm volatile("negq %[carryflag]\n\t"
122       "movq 0(%[xaddr]), %[r0]\n\t"
123       "adcq %[r0], %[r0]\n\t"
124       "movq 8(%[xaddr]), %[r1]\n\t"
125       "adcq %[r1], %[r1]\n\t"
126       "movq 16(%[xaddr]), %[r2]\n\t"
127       "adcq %[r2], %[r2]\n\t"
128       "movq 24(%[xaddr]), %[r3]\n\t"
129       "adcq %[r3], %[r3]\n\t"
130       "movq $0, %[carryflag]\n\t"
131       "adcq $0, %[carryflag]\n\t"
132        : [carryflag] "=&r" (carry),
133          [r0] "=&r" (r.int64[0]), [r1] "=&r" (r.int64[1]), [r2] "=&r" (r.int64[2]), [r3] "=&r" (r.int64[3])
134        : "[carryflag]" (carry), [xaddr] "r" (&a)
135        : "cc");
136  rslt = r.bitblock;
137//printf("shift out = %lu\n",carry);
138//print_simd_register("rslt", rslt);
139}
140
141#endif
142
143#if (CARRY_STRATEGY == ADC64_SAHF_STRATEGY)
144typedef uint64_t CarryType;
145
146#define Carry0 0
147
148#define test_carry(x) (((x)&256) > 0)
149
150#define carry_or(carry1, carry2) (carry1 | carry2)
151
152#define double_int64_adc(x1, x2, y1, y2, rslt1, rslt2, carry) \
153  __asm__  ("sahf\n\t" \
154        "adc %[e1], %[z1]\n\t" \
155        "adc %[e2], %[z2]\n\t" \
156        "lahf\n\t" \
157     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
158         : "[z1]" (x1), "[z2]" (x2), \
159           [e1] "r" (y1), [e2] "r" (y2), \
160           "[carryflag]" (carry) \
161         : "cc")
162
163#define adc128(first, second, carry, sum) \
164do {\
165  BitBlock_int64 rslt, x, y;\
166  x.bitblock = first;\
167  y.bitblock = second;\
168  double_int64_adc(x.int64[0], x.int64[1], y.int64[0], y.int64[1], rslt.int64[0], rslt.int64[1], carry);\
169  sum = rslt.bitblock;\
170}while(0)
171
172
173
174#define double_int64_advance(x1, x2, rslt1, rslt2, carry) \
175  __asm__  ("sahf\n\t" \
176        "adc %[z1], %[z1]\n\t" \
177        "adc %[z2], %[z2]\n\t" \
178        "lahf\n\t" \
179     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
180         : "[z1]" (x1), "[z2]" (x2), \
181           "[carryflag]" (carry) \
182         : "cc")
183
184
185#define advance_with_carry(cursor, carry, rslt)\
186do {\
187  BitBlock_int64 x, z;\
188  x.bitblock = cursor;\
189  double_int64_advance(x.int64[0], x.int64[1], z.int64[0], z.int64[1], carry);\
190  rslt = z.bitblock;\
191} while(0)
192
193
194
195
196#define double_int64_sbb(x1, x2, y1, y2, rslt1, rslt2, carry) \
197  __asm__  ("sahf\n\t" \
198        "sbb %[e1], %[z1]\n\t" \
199        "sbb %[e2], %[z2]\n\t" \
200        "lahf\n\t" \
201     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
202         : "[z1]" (x1), "[z2]" (x2), \
203           [e1] "r" (y1), [e2] "r" (y2), \
204           "[carryflag]" (carry) \
205         : "cc")
206
207#define sbb128(first, second, borrow, diff) \
208do {\
209  BitBlock_int64 rslt, x, y;\
210  x.bitblock = first;\
211  y.bitblock = second;\
212  double_int64_sbb(x.int64[0], x.int64[1], y.int64[0], y.int64[1], \
213                   rslt.int64[0], rslt.int64[1], borrow);\
214  diff = rslt.bitblock;\
215}while(0)
216
217#endif
218
219
220
221#if (CARRY_STRATEGY == SIMD_CARRY_STRATEGY)
222
223typedef __m128i sse_type;
224
225
226
227#define sse_or(b1, b2) _mm_or_si128(b1, b2)
228#define sse_and(b1, b2) _mm_and_si128(b1, b2)
229#define sse_xor(b1, b2) _mm_xor_si128(b1, b2)
230#define sse_andc(b1, b2) _mm_andnot_si128(b2, b1)
231#define sse_if(cond, then_val, else_val) \
232  sse_or(sse_and(then_val, cond), sse_andc(else_val, cond))
233#define sse_not(b) (sse_xor(b, _mm_set1_epi32(0xFFFFFFFF)))
234#define sse_nor(a,b) (sse_not(sse_or(a,b)))
235
236#define sse_slli_64(r, shft) _mm_slli_epi64(r, shft)
237#define sse_srli_64(r, shft) _mm_srli_epi64(r, shft)
238#define sse_mergel_64(a, b) _mm_unpacklo_epi64(b, a)
239#define sse_sub_64(a, b) _mm_sub_epi64(a, b)
240#define sse_add_64(a, b) _mm_add_epi64(a, b)
241
242#define sse_slli_128(r, shft) \
243  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
244   (shft) >= 64 ? sse_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \
245   sse_or(sse_slli_64(r, shft), _mm_slli_si128(sse_srli_64(r, 64-(shft)), 8)))
246
247#define sse_srli_128(r, shft) \
248  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
249   (shft) >= 64 ? sse_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \
250   sse_or(sse_srli_64(r, shft), _mm_srli_si128(sse_slli_64(r, 64-(shft)), 8)))
251
252#define sse_to_int(x) _mm_cvtsi128_si32(x)
253
254#define sse_from_int(n) _mm_cvtsi32_si128(n)
255#define sse_eq_64(a, b) _mm_cmpeq_epi64(a, b)
256#define sse_mergeh_32(a, b) _mm_unpackhi_epi32(b, a)
257#define sse_mergel_32(a, b) _mm_unpacklo_epi32(b, a)
258#define sse_const_8(n) _mm_set1_epi8(n)
259#define sse_const_1(n) \
260  (n==0 ? _mm_setzero_si128(): sse_const_8(-1))
261
262
263
264/*
265typedef SIMD_type CarryType;
266
267#define Carry0 simd_const_1(0)
268
269#define test_carry(x) bitblock_has_bit(x)
270
271#define carry_or(carry1, carry2) simd_or(carry1, carry2)
272*/
273
274#define uint32_CARRYTYPE
275#ifdef uint32_CARRYTYPE
276typedef uint32_t CarryType;
277
278#define Carry0 0
279
280#define test_carry(x) ((x) != 0)
281
282#define carry_or(carry1, carry2) (carry1 | carry2)
283
284#define sse_from_CarryType(c) sse_from_int(c)
285
286#define sse_to_CarryType(c) sse_to_int(c)
287#endif
288
289#ifdef sse_CARRYTYPE
290#define CarryType sse_type
291
292#define Carry0 (_mm_set1_epi32(0))
293
294#define test_carry(x) (!_mm_testz_si128(x, x))
295
296#define carry_or(carry1, carry2) sse_or(carry1, carry2)
297
298#define sse_from_CarryType(c) c
299
300#define sse_to_CarryType(c) c
301#endif
302
303
304
305
306
307#define adc128(x, y, carry,  sum) \
308do{ \
309  sse_type gen = sse_and(x, y); \
310  sse_type prop = sse_or(x, y); \
311  sse_type partial = sse_add_64(sse_add_64(x, y), carry); \
312  sse_type c1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_andc(prop, partial)), 63), 64); \
313  sum = sse_add_64(c1, partial); \
314  carry = sse_srli_128(sse_or(gen, sse_andc(prop, sum)), 127); \
315} while(0)
316
317
318#define sbb128(x, y, borrow, difference) \
319do {\
320  sse_type gen = sse_andc(y, x); \
321  sse_type prop = sse_not(sse_xor(x, y)); \
322  sse_type partial = sse_sub_64(sse_sub_64(x, y), borrow); \
323  sse_type b1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_and(prop, partial)), 63), 64); \
324  difference = sse_sub_64(partial, b1); \
325  borrow = sse_srli_128(sse_or(gen, sse_and(prop, difference)), 127); \
326}while(0)
327
328#define advance_with_carry(cursor, carry, rslt)\
329do {\
330  sse_type shift_out = sse_srli_64(cursor, 63);\
331  sse_type low_bits = sse_mergel_64(shift_out, carry);\
332  carry = sse_srli_128(shift_out, 64);\
333  rslt = sse_or(sse_add_64(cursor, cursor), low_bits);\
334} while(0)
335
336/*
337#define adc256(x, y, carry,  sum) \
338do {\
339        __m128i x0 = simd_lo128(x);\
340        __m128i x1 = simd_hi128(x);\
341        __m128i y0 = simd_lo128(y);\
342        __m128i y1 = simd_hi128(y);\
343        __m128i cry = sse_from_CarryType(carry);\
344        __m128i s0, s1;\
345        adc128(x0, y0, cry, s0);\
346        adc128(x1, y1, cry, s1);\
347        sum = simd_combine256(s1, s0);\
348        carry = sse_to_CarryType(cry);\
349} while(0)
350*/
351
352static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) __attribute__((always_inline));
353
354#ifndef ADCMAGIC
355static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) {
356        __m128i x0 = simd_lo128(x);
357        __m128i x1 = simd_hi128(x);
358        __m128i y0 = simd_lo128(y);
359        __m128i y1 = simd_hi128(y);
360        __m128i cry = sse_from_CarryType(carry);
361        __m128i s0, s1;
362        adc128(x0, y0, cry, s0);
363        adc128(x1, y1, cry, s1);
364        sum = simd_combine256(s1, s0);
365        carry = sse_to_CarryType(cry);
366}
367#endif
368#ifdef ADCMAGIC
369static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) {
370
371        BitBlock gen = simd_and(x, y);
372        BitBlock prop = simd_xor(x, y);
373        __m128i x0 = simd_lo128(x);
374        __m128i x1 = simd_hi128(x);
375        __m128i y0 = simd_lo128(y);
376        __m128i y1 = simd_hi128(y);
377        __m128i sum0 = sse_add_64(x0, y0);
378        __m128i sum1 = sse_add_64(x1, y1);
379        BitBlock icarry = simd_or(gen, simd_andc(prop, simd_combine256(sum1, sum0)));
380        // A carry may bubble through a field if it is all ones.
381        __m128i bubble0 = sse_eq_64(sum0, sse_const_1(1));
382        __m128i bubble1 = sse_eq_64(sum1, sse_const_1(1));
383        BitBlock bubble = simd_combine256(bubble1, bubble0);
384        uint64_t carry_mask = _mm256_movemask_pd((__m256d) icarry) * 2 + carry;
385        uint64_t bubble_mask = _mm256_movemask_pd((__m256d) bubble);
386        uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) &~ bubble_mask;
387        uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask);
388        carry = increments >> 4;
389        uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001;
390        __m128i inc_32 = _mm_cvtepu16_epi32(_mm_cvtsi64_si128(spread));
391        __m128i inc_64_0 = sse_mergel_32(sse_const_1(0), inc_32);
392        __m128i inc_64_1 = sse_mergeh_32(sse_const_1(0), inc_32);
393        sum = simd_combine256(sse_add_64(sum1, inc_64_1), sse_add_64(sum0, inc_64_0));
394}
395#endif
396
397
398#define sbb256(x, y, borrow, diff) \
399do {\
400        __m128i x0 = simd_lo128(x);\
401        __m128i x1 = simd_hi128(x);\
402        __m128i y0 = simd_lo128(y);\
403        __m128i y1 = simd_hi128(y);\
404        __m128i brw = sse_from_CarryType(borrow);\
405        __m128i d0, d1;\
406        sbb128(x0, y0, brw, d0);\
407        sbb128(x1, y1, brw, d1);\
408        diff = simd_combine256(d1, d0);\
409        borrow = sse_to_CarryType(brw);\
410} while(0)
411
412#define advance_with_carry256(cursor, carry, rslt)\
413do {\
414        __m128i cursor0 = simd_lo128(cursor);\
415        __m128i cursor1 = simd_hi128(cursor);\
416        __m128i cry = sse_from_CarryType(carry);\
417        __m128i rslt0, rslt1;\
418        advance_with_carry(cursor0, cry, rslt0);\
419        advance_with_carry(cursor1, cry, rslt1);\
420        rslt = simd_combine256(rslt1, rslt0);\
421        carry = sse_to_CarryType(cry);\
422} while(0)
423
424
425#endif
426
427
428#endif
Note: See TracBrowser for help on using the repository browser.