source: trunk/lib/block_carry_avx.h @ 1580

/*  Block Addition, Subtraction and Shifts with Carry
    Copyright (C) 2010, Robert D. Cameron
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters Inc.
       under the Academic Free License version 3.0.

This file defines addition, subtraction and shift operations on
128-bit blocks, together with 256-bit (AVX) variants adc256, sbb256
and advance_with_carry256.   Different versions of the operations are
selectable with the CARRY_STRATEGY preprocessor constant.

Each implementation defines the following "abstract data type"
for block operations with carry.

Typename:   CarryType
Constant:   Carry0  represents a value of 0 for the carry bit.
Predicate:  test_carry(x) returns nonzero if a carry bit is 1, 0 otherwise.
Function:   carry_or(carry1, carry2) forms the logical or of two carries.
Function:   adc128(x, y, carry, sum) computes (carry, sum) = x + y + carry.
Function:   advance_with_carry(cursor, carry, rslt)
                 computes (carry, rslt) = cursor + cursor + carry
Function:   sbb128(x, y, borrow, diff)
                 computes (borrow, diff) = x - y - borrow

*/
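
/* Illustrative usage sketch (added for exposition; the array names and
   block count below are hypothetical, not part of the library):

     CarryType carry = Carry0;
     for (int i = 0; i < nblocks; i++) {
       adc128(a[i], b[i], carry, sum[i]);   // (carry, sum[i]) = a[i] + b[i] + carry
     }
     if (test_carry(carry)) {
       // a carry propagated out of the final block
     }
*/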
#ifndef BLOCK_CARRY_H
#define BLOCK_CARRY_H


typedef union {SIMD_type bitblock; uint64_t int64[2];} BitBlock_int64;



/*------------------------------------------------------------*/
#include "avx_simd.h"

#define SIMD_CARRY_STRATEGY 1
#define ADC64_STRATEGY 2
#define ADC64_SAHF_STRATEGY 3

#ifdef ADC64
#define CARRY_STRATEGY ADC64_STRATEGY
#else
#define CARRY_STRATEGY SIMD_CARRY_STRATEGY
#endif
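
/* Note (added): the strategy is normally chosen at compile time; building
   with -DADC64 on the compiler command line selects the 64-bit adc/sbb
   inline-assembly strategy, otherwise the SIMD carry strategy is used.
   ADC64_SAHF_STRATEGY is defined above but is not selected by this logic. */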

#if (CARRY_STRATEGY == ADC64_STRATEGY)
typedef uint64_t CarryType;
typedef union {SIMD_type bitblock; uint64_t int64[4];} SIMD256_int64;

#define Carry0 0

#define carry_flip(c) ((c) ^ 1)

#define test_carry(x) ((x) != 0)

#define carry_or(carry1, carry2) (carry1 | carry2)


static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) __attribute__((always_inline));
static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) __attribute__((always_inline));
static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) __attribute__((always_inline));

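/* Note (added): the three routines below share one inline-assembly idiom.
   "negq" of the 0/1 carry value sets the processor carry flag when the
   incoming carry is 1 and clears it when it is 0; a chain of "adcq" (or
   "sbbq") instructions then ripples the flag through the four 64-bit
   lanes of the 256-bit block; finally "movq $0" followed by "adcq $0"
   captures the resulting carry flag back into the 0/1 carry variable. */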
static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) {
  SIMD256_int64 a, b, rslt;
//printf("carryin = %lu\n",carry);
//print_simd_register("x", x);
//print_simd_register("y", y);
  a.bitblock = x;
  b.bitblock = y;
  asm volatile("negq %[carryflag]\n\t"
       "movq 0(%[xaddr]), %[r0]\n\t"
       "adcq 0(%[yaddr]), %[r0]\n\t"
       "movq 8(%[xaddr]), %[r1]\n\t"
       "adcq 8(%[yaddr]), %[r1]\n\t"
       "movq 16(%[xaddr]), %[r2]\n\t"
       "adcq 16(%[yaddr]), %[r2]\n\t"
       "movq 24(%[xaddr]), %[r3]\n\t"
       "adcq 24(%[yaddr]), %[r3]\n\t"
       "movq $0, %[carryflag]\n\t"
       "adcq $0, %[carryflag]\n\t"
        : [carryflag] "=&r" (carry),
          [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3])
        : "[carryflag]" (carry), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock)
        : "cc");
  sum = rslt.bitblock;
//printf("carryout = %lu\n",carry);
//print_simd_register("sum", sum);
}

static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) {
  SIMD256_int64 a, b, rslt;
//printf("borrowin = %lu\n",borrow);
//print_simd_register("x", x);
//print_simd_register("y", y);
  a.bitblock = x;
  b.bitblock = y;
  asm volatile("negq %[carryflag]\n\t"
       "movq 0(%[xaddr]), %[r0]\n\t"
       "sbbq 0(%[yaddr]), %[r0]\n\t"
       "movq 8(%[xaddr]), %[r1]\n\t"
       "sbbq 8(%[yaddr]), %[r1]\n\t"
       "movq 16(%[xaddr]), %[r2]\n\t"
       "sbbq 16(%[yaddr]), %[r2]\n\t"
       "movq 24(%[xaddr]), %[r3]\n\t"
       "sbbq 24(%[yaddr]), %[r3]\n\t"
       "movq $0, %[carryflag]\n\t"
       "adcq $0, %[carryflag]\n\t"
        : [carryflag] "=&r" (borrow),
          [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3])
        : "[carryflag]" (borrow), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock)
        : "cc");
  diff = rslt.bitblock;
//printf("borrowout = %lu\n",borrow);
//print_simd_register("diff", diff);
}

static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) {
  SIMD256_int64 r;
  SIMD_type a = x;
//printf("shift in = %lu\n",carry);
//print_simd_register("x", x);
  asm volatile("negq %[carryflag]\n\t"
       "movq 0(%[xaddr]), %[r0]\n\t"
       "adcq %[r0], %[r0]\n\t"
       "movq 8(%[xaddr]), %[r1]\n\t"
       "adcq %[r1], %[r1]\n\t"
       "movq 16(%[xaddr]), %[r2]\n\t"
       "adcq %[r2], %[r2]\n\t"
       "movq 24(%[xaddr]), %[r3]\n\t"
       "adcq %[r3], %[r3]\n\t"
       "movq $0, %[carryflag]\n\t"
       "adcq $0, %[carryflag]\n\t"
        : [carryflag] "=&r" (carry),
          [r0] "=&r" (r.int64[0]), [r1] "=&r" (r.int64[1]), [r2] "=&r" (r.int64[2]), [r3] "=&r" (r.int64[3])
        : "[carryflag]" (carry), [xaddr] "r" (&a)
        : "cc");
  rslt = r.bitblock;
//printf("shift out = %lu\n",carry);
//print_simd_register("rslt", rslt);
}

#endif

#if (CARRY_STRATEGY == ADC64_SAHF_STRATEGY)
typedef uint64_t CarryType;

#define Carry0 0

#define test_carry(x) (((x)&256) > 0)

#define carry_flip(c) ((c)^256)

#define carry_or(carry1, carry2) (carry1 | carry2)
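
/* Note (added): in this strategy the carry variable holds a LAHF image of
   the x86 flags rather than a 0/1 value.  LAHF places the carry flag in
   bit 0 of AH, i.e. bit 8 (value 256) of the register bound to the carry
   operand, which is why test_carry masks with 256 and carry_flip toggles
   that bit; SAHF restores the flag before each adc/sbb chain. */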

#define double_int64_adc(x1, x2, y1, y2, rslt1, rslt2, carry) \
  __asm__  ("sahf\n\t" \
        "adc %[e1], %[z1]\n\t" \
        "adc %[e2], %[z2]\n\t" \
        "lahf\n\t" \
     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
         : "[z1]" (x1), "[z2]" (x2), \
           [e1] "r" (y1), [e2] "r" (y2), \
           "[carryflag]" (carry) \
         : "cc")

#define adc128(first, second, carry, sum) \
do {\
  BitBlock_int64 rslt, x, y;\
  x.bitblock = first;\
  y.bitblock = second;\
  double_int64_adc(x.int64[0], x.int64[1], y.int64[0], y.int64[1], rslt.int64[0], rslt.int64[1], carry);\
  sum = rslt.bitblock;\
}while(0)



#define double_int64_advance(x1, x2, rslt1, rslt2, carry) \
  __asm__  ("sahf\n\t" \
        "adc %[z1], %[z1]\n\t" \
        "adc %[z2], %[z2]\n\t" \
        "lahf\n\t" \
     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
         : "[z1]" (x1), "[z2]" (x2), \
           "[carryflag]" (carry) \
         : "cc")


#define advance_with_carry(cursor, carry, rslt)\
do {\
  BitBlock_int64 x, z;\
  x.bitblock = cursor;\
  double_int64_advance(x.int64[0], x.int64[1], z.int64[0], z.int64[1], carry);\
  rslt = z.bitblock;\
} while(0)




#define double_int64_sbb(x1, x2, y1, y2, rslt1, rslt2, carry) \
  __asm__  ("sahf\n\t" \
        "sbb %[e1], %[z1]\n\t" \
        "sbb %[e2], %[z2]\n\t" \
        "lahf\n\t" \
     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
         : "[z1]" (x1), "[z2]" (x2), \
           [e1] "r" (y1), [e2] "r" (y2), \
           "[carryflag]" (carry) \
         : "cc")

#define sbb128(first, second, borrow, diff) \
do {\
  BitBlock_int64 rslt, x, y;\
  x.bitblock = first;\
  y.bitblock = second;\
  double_int64_sbb(x.int64[0], x.int64[1], y.int64[0], y.int64[1], \
                   rslt.int64[0], rslt.int64[1], borrow);\
  diff = rslt.bitblock;\
}while(0)

#endif



#if (CARRY_STRATEGY == SIMD_CARRY_STRATEGY)

typedef __m128i sse_type;



#define sse_or(b1, b2) _mm_or_si128(b1, b2)
#define sse_and(b1, b2) _mm_and_si128(b1, b2)
#define sse_xor(b1, b2) _mm_xor_si128(b1, b2)
#define sse_andc(b1, b2) _mm_andnot_si128(b2, b1)
#define sse_if(cond, then_val, else_val) \
  sse_or(sse_and(then_val, cond), sse_andc(else_val, cond))
#define sse_not(b) (sse_xor(b, _mm_set1_epi32(0xFFFFFFFF)))
#define sse_nor(a,b) (sse_not(sse_or(a,b)))

#define sse_slli_64(r, shft) _mm_slli_epi64(r, shft)
#define sse_srli_64(r, shft) _mm_srli_epi64(r, shft)
#define sse_mergel_64(a, b) _mm_unpacklo_epi64(b, a)
#define sse_sub_64(a, b) _mm_sub_epi64(a, b)
#define sse_add_64(a, b) _mm_add_epi64(a, b)

#define sse_slli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
   (shft) >= 64 ? sse_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \
   sse_or(sse_slli_64(r, shft), _mm_slli_si128(sse_srli_64(r, 64-(shft)), 8)))

#define sse_srli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
   (shft) >= 64 ? sse_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \
   sse_or(sse_srli_64(r, shft), _mm_srli_si128(sse_slli_64(r, 64-(shft)), 8)))

#define sse_to_int(x) _mm_cvtsi128_si32(x)

#define sse_from_int(n) _mm_cvtsi32_si128(n)
#define sse_eq_64(a, b) _mm_cmpeq_epi64(a, b)
#define sse_mergeh_32(a, b) _mm_unpackhi_epi32(b, a)
#define sse_mergel_32(a, b) _mm_unpacklo_epi32(b, a)
#define sse_const_8(n) _mm_set1_epi8(n)
#define sse_const_1(n) \
  (n==0 ? _mm_setzero_si128(): sse_const_8(-1))



/*
typedef SIMD_type CarryType;

#define Carry0 simd_const_1(0)

#define test_carry(x) bitblock_has_bit(x)

#define carry_flip(c) simd_xor(c, sisd_from_int(1))

#define carry_or(carry1, carry2) simd_or(carry1, carry2)
*/

#define uint32_CARRYTYPE
#ifdef uint32_CARRYTYPE
typedef uint32_t CarryType;

#define Carry0 0

#define test_carry(x) ((x) != 0)

#define carry_flip(c) ((c)^1)

#define carry_or(carry1, carry2) (carry1 | carry2)

#define sse_from_CarryType(c) sse_from_int(c)

#define sse_to_CarryType(c) sse_to_int(c)
#endif

#ifdef sse_CARRYTYPE
#define CarryType sse_type

#define Carry0 (_mm_set1_epi32(0))

#define test_carry(x) (!_mm_testz_si128(x, x))

#define carry_or(carry1, carry2) sse_or(carry1, carry2)

#define sse_from_CarryType(c) c

#define sse_to_CarryType(c) c
#endif



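/* Note (added): adc128 and sbb128 below simulate 128-bit add/subtract
   with carry using two 64-bit lane operations plus carry-generate and
   carry-propagate logic.  For addition, gen = x & y marks bit positions
   that generate a carry and prop = x | y those that propagate one; the
   carry out of the low 64-bit lane (recovered from gen, prop and the
   partial sum) is shifted up to bit 64 and added in, and the carry out
   of bit 127 becomes the new block carry.  Subtraction is analogous with
   borrow-generate y & ~x and borrow-propagate ~(x ^ y). */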
#define adc128(x, y, carry,  sum) \
do{ \
  sse_type gen = sse_and(x, y); \
  sse_type prop = sse_or(x, y); \
  sse_type partial = sse_add_64(sse_add_64(x, y), carry); \
  sse_type c1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_andc(prop, partial)), 63), 64); \
  sum = sse_add_64(c1, partial); \
  carry = sse_srli_128(sse_or(gen, sse_andc(prop, sum)), 127); \
} while(0)


#define sbb128(x, y, borrow, difference) \
do {\
  sse_type gen = sse_andc(y, x); \
  sse_type prop = sse_not(sse_xor(x, y)); \
  sse_type partial = sse_sub_64(sse_sub_64(x, y), borrow); \
  sse_type b1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_and(prop, partial)), 63), 64); \
  difference = sse_sub_64(partial, b1); \
  borrow = sse_srli_128(sse_or(gen, sse_and(prop, difference)), 127); \
}while(0)

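/* Note (added): advance_with_carry shifts the 128-bit block left by one
   bit.  shift_out holds bit 63 of each 64-bit field; low_bits places the
   incoming carry at bit 0 and the low field's shifted-out bit at bit 64;
   the high field's shifted-out bit becomes the new carry; the doubled
   cursor ORed with low_bits is the shifted result. */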
#define advance_with_carry(cursor, carry, rslt)\
do {\
  sse_type shift_out = sse_srli_64(cursor, 63);\
  sse_type low_bits = sse_mergel_64(shift_out, carry);\
  carry = sse_srli_128(shift_out, 64);\
  rslt = sse_or(sse_add_64(cursor, cursor), low_bits);\
} while(0)

/*
#define adc256(x, y, carry,  sum) \
do {\
        __m128i x0 = simd_lo128(x);\
        __m128i x1 = simd_hi128(x);\
        __m128i y0 = simd_lo128(y);\
        __m128i y1 = simd_hi128(y);\
        __m128i cry = sse_from_CarryType(carry);\
        __m128i s0, s1;\
        adc128(x0, y0, cry, s0);\
        adc128(x1, y1, cry, s1);\
        sum = simd_combine256(s1, s0);\
        carry = sse_to_CarryType(cry);\
} while(0)
*/

static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) __attribute__((always_inline));

#ifndef ADCMAGIC
static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) {
        __m128i x0 = simd_lo128(x);
        __m128i x1 = simd_hi128(x);
        __m128i y0 = simd_lo128(y);
        __m128i y1 = simd_hi128(y);
        __m128i cry = sse_from_CarryType(carry);
        __m128i s0, s1;
        adc128(x0, y0, cry, s0);
        adc128(x1, y1, cry, s1);
        sum = simd_combine256(s1, s0);
        carry = sse_to_CarryType(cry);
}
#endif
#ifdef ADCMAGIC
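/* Note (added): this "ADCMAGIC" variant adds all four 64-bit lanes at
   once and then patches inter-lane carries with scalar bit tricks.
   icarry marks lanes whose partial sum carried out; bubble marks lanes
   whose partial sum is all ones, through which an incoming carry would
   ripple.  The lane carry-out bits (extracted with _mm256_movemask_pd),
   shifted up one position and combined with the incoming carry, are
   propagated across runs of bubble lanes by
   (carry_mask + bubble_mask) &~ bubble_mask; "increments" then marks
   every lane that must be incremented by one, its bit 4 is the carry out
   of the whole block, and the multiply-and-mask "spread" step expands
   the four increment bits into 16-bit fields so they can be widened and
   added to the lane sums. */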
static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) {

        BitBlock gen = simd_and(x, y);
        BitBlock prop = simd_xor(x, y);
        __m128i x0 = simd_lo128(x);
        __m128i x1 = simd_hi128(x);
        __m128i y0 = simd_lo128(y);
        __m128i y1 = simd_hi128(y);
        __m128i sum0 = sse_add_64(x0, y0);
        __m128i sum1 = sse_add_64(x1, y1);
        BitBlock icarry = simd_or(gen, simd_andc(prop, simd_combine256(sum1, sum0)));
        // A carry may bubble through a field if it is all ones.
        __m128i bubble0 = sse_eq_64(sum0, sse_const_1(1));
        __m128i bubble1 = sse_eq_64(sum1, sse_const_1(1));
        BitBlock bubble = simd_combine256(bubble1, bubble0);
        uint64_t carry_mask = _mm256_movemask_pd((__m256d) icarry) * 2 + carry;
        uint64_t bubble_mask = _mm256_movemask_pd((__m256d) bubble);
        uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) &~ bubble_mask;
        uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask);
        carry = increments >> 4;
        uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001;
        __m128i inc_32 = _mm_cvtepu16_epi32(_mm_cvtsi64_si128(spread));
        __m128i inc_64_0 = sse_mergel_32(sse_const_1(0), inc_32);
        __m128i inc_64_1 = sse_mergeh_32(sse_const_1(0), inc_32);
        sum = simd_combine256(sse_add_64(sum1, inc_64_1), sse_add_64(sum0, inc_64_0));
}
#endif


#define sbb256(x, y, borrow, diff) \
do {\
        __m128i x0 = simd_lo128(x);\
        __m128i x1 = simd_hi128(x);\
        __m128i y0 = simd_lo128(y);\
        __m128i y1 = simd_hi128(y);\
        __m128i brw = sse_from_CarryType(borrow);\
        __m128i d0, d1;\
        sbb128(x0, y0, brw, d0);\
        sbb128(x1, y1, brw, d1);\
        diff = simd_combine256(d1, d0);\
        borrow = sse_to_CarryType(brw);\
} while(0)

#define advance_with_carry256(cursor, carry, rslt)\
do {\
        __m128i cursor0 = simd_lo128(cursor);\
        __m128i cursor1 = simd_hi128(cursor);\
        __m128i cry = sse_from_CarryType(carry);\
        __m128i rslt0, rslt1;\
        advance_with_carry(cursor0, cry, rslt0);\
        advance_with_carry(cursor1, cry, rslt1);\
        rslt = simd_combine256(rslt1, rslt0);\
        carry = sse_to_CarryType(cry);\
} while(0)


#endif


#endif