source: trunk/lib/block_carry_avx.h @ 960

Last change on this file since 960 was 960, checked in by cameron, 9 years ago

Rename AVX library versions

File size: 11.1 KB
RevLine 
[462]1/*  Block Addition, Subtraction and Shifts with Carry
2    Copyright (C) 2010, Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters Inc.
5       under the Academic Free License version 3.0.
6
7This file defines addition, subtract and shift operations on
8128-bit blocks.   Different versions of the operations are
9selectable with the CARRY_STRATEGY preprocessor constant.
10
11Each implementation defines the following "abstract data type"
12for block operations with carry.
13
14Typename:   CarryType
15Constant:   Carry0  represents a value of 0 for the carry bit.
16Predicate:  test_carry(x) returns nonzero if a carry bit is 1, 0 otherwise.
17Function:   carry_or(carry1, carry2) forms the logical or of two carries.
18Function:   adc128(x, y, carry, sum) computes (carry, sum) = x + y + carry,
19Function:   advance_with_carry(cursor, carry, rslt)
20                 computes (carry, rslt) = cursor + cursor + carry
21Function:   sbb128(x, y, borrow, diff)
22                 computes (borrow, diff) = y - x - borrow
23
24*/
25#ifndef BLOCK_CARRY_H
26#define BLOCK_CARRY_H
27
28
[723]29typedef union {SIMD_type bitblock; uint64_t int64[2];} BitBlock_int64;
[462]30
31
32
33/*------------------------------------------------------------*/
[959]34#include "avx_simd.h"
[462]35
[463]36#define SIMD_CARRY_STRATEGY 1
37#define ADC64_STRATEGY 2
38#define ADC64_SAHF_STRATEGY 3
[462]39
[463]40#ifdef ADC64
41#define CARRY_STRATEGY ADC64_STRATEGY
42#else
43#define CARRY_STRATEGY SIMD_CARRY_STRATEGY
44#endif
[462]45
[463]46#if (CARRY_STRATEGY == ADC64_STRATEGY)
[759]47typedef uint64_t CarryType;
[959]48typedef union {SIMD_type bitblock; uint64_t int64[4];} SIMD256_int64;
[462]49
50#define Carry0 0
51
[959]52#define test_carry(x) ((x) != 0)
[462]53
54#define carry_or(carry1, carry2) (carry1 | carry2)
55
[534]56
[959]57static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) __attribute__((always_inline));
58static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) __attribute__((always_inline));
59static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) __attribute__((always_inline));
[534]60
[462]61
[959]62static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) {
63  SIMD256_int64 a, b, rslt;
64//printf("carryin = %lu\n",carry);
65//print_simd_register("x", x);
66//print_simd_register("y", y);
67  a.bitblock = x;
68  b.bitblock = y;
69  asm volatile("negq %[carryflag]\n\t"
70       "movq 0(%[xaddr]), %[r0]\n\t"
71       "adcq 0(%[yaddr]), %[r0]\n\t"
72       "movq 8(%[xaddr]), %[r1]\n\t"
73       "adcq 8(%[yaddr]), %[r1]\n\t"
74       "movq 16(%[xaddr]), %[r2]\n\t"
75       "adcq 16(%[yaddr]), %[r2]\n\t"
76       "movq 24(%[xaddr]), %[r3]\n\t"
77       "adcq 24(%[yaddr]), %[r3]\n\t"
78       "movq $0, %[carryflag]\n\t"
79       "adcq $0, %[carryflag]\n\t"
80        : [carryflag] "=&r" (carry), 
81          [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3])
82        : "[carryflag]" (carry), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock)
83        : "cc");
84  sum = rslt.bitblock;
85//printf("carryout = %lu\n",carry);
86//print_simd_register("sum", sum);
87}
88 
89static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) {
90  SIMD256_int64 a, b, rslt;
91//printf("borrowin = %lu\n",borrow);
92//print_simd_register("x", x);
93//print_simd_register("y", y);
94  a.bitblock = x;
95  b.bitblock = y;
96  asm volatile("negq %[carryflag]\n\t"
97       "movq 0(%[xaddr]), %[r0]\n\t"
98       "sbbq 0(%[yaddr]), %[r0]\n\t"
99       "movq 8(%[xaddr]), %[r1]\n\t"
100       "sbbq 8(%[yaddr]), %[r1]\n\t"
101       "movq 16(%[xaddr]), %[r2]\n\t"
102       "sbbq 16(%[yaddr]), %[r2]\n\t"
103       "movq 24(%[xaddr]), %[r3]\n\t"
104       "sbbq 24(%[yaddr]), %[r3]\n\t"
105       "movq $0, %[carryflag]\n\t"
106       "adcq $0, %[carryflag]\n\t"
107        : [carryflag] "=&r" (borrow),
108          [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3])
109        : "[carryflag]" (borrow), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock)
110        : "cc");
111  diff = rslt.bitblock;
112//printf("borrowout = %lu\n",borrow);
113//print_simd_register("diff", diff);
114}
[729]115
[959]116static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) {
117  SIMD256_int64 r;
118  SIMD_type a = x;
119//printf("shift in = %lu\n",carry);
120//print_simd_register("x", x);
121  asm volatile("negq %[carryflag]\n\t"
122       "movq 0(%[xaddr]), %[r0]\n\t"
123       "adcq %[r0], %[r0]\n\t"
124       "movq 8(%[xaddr]), %[r1]\n\t"
125       "adcq %[r1], %[r1]\n\t"
126       "movq 16(%[xaddr]), %[r2]\n\t"
127       "adcq %[r2], %[r2]\n\t"
128       "movq 24(%[xaddr]), %[r3]\n\t"
129       "adcq %[r3], %[r3]\n\t"
130       "movq $0, %[carryflag]\n\t"
131       "adcq $0, %[carryflag]\n\t"
132        : [carryflag] "=&r" (carry),
133          [r0] "=&r" (r.int64[0]), [r1] "=&r" (r.int64[1]), [r2] "=&r" (r.int64[2]), [r3] "=&r" (r.int64[3])
134        : "[carryflag]" (carry), [xaddr] "r" (&a)
135        : "cc");
136  rslt = r.bitblock;
137//printf("shift out = %lu\n",carry);
138//print_simd_register("rslt", rslt);
139}
[462]140
141#endif
142
[463]143#if (CARRY_STRATEGY == ADC64_SAHF_STRATEGY)
[462]144typedef uint64_t CarryType;
145
146#define Carry0 0
147
148#define test_carry(x) (((x)&256) > 0)
149
150#define carry_or(carry1, carry2) (carry1 | carry2)
151
152#define double_int64_adc(x1, x2, y1, y2, rslt1, rslt2, carry) \
153  __asm__  ("sahf\n\t" \
154        "adc %[e1], %[z1]\n\t" \
155        "adc %[e2], %[z2]\n\t" \
156        "lahf\n\t" \
157     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
158         : "[z1]" (x1), "[z2]" (x2), \
159           [e1] "r" (y1), [e2] "r" (y2), \
160           "[carryflag]" (carry) \
161         : "cc")
162
163#define adc128(first, second, carry, sum) \
[723]164do {\
165  BitBlock_int64 rslt, x, y;\
[462]166  x.bitblock = first;\
167  y.bitblock = second;\
168  double_int64_adc(x.int64[0], x.int64[1], y.int64[0], y.int64[1], rslt.int64[0], rslt.int64[1], carry);\
169  sum = rslt.bitblock;\
170}while(0)
171
172
173
174#define double_int64_advance(x1, x2, rslt1, rslt2, carry) \
175  __asm__  ("sahf\n\t" \
176        "adc %[z1], %[z1]\n\t" \
177        "adc %[z2], %[z2]\n\t" \
178        "lahf\n\t" \
179     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
180         : "[z1]" (x1), "[z2]" (x2), \
181           "[carryflag]" (carry) \
182         : "cc")
183
184
185#define advance_with_carry(cursor, carry, rslt)\
[723]186do {\
187  BitBlock_int64 x, z;\
[462]188  x.bitblock = cursor;\
189  double_int64_advance(x.int64[0], x.int64[1], z.int64[0], z.int64[1], carry);\
190  rslt = z.bitblock;\
[723]191} while(0)
[462]192
193
194
195
196#define double_int64_sbb(x1, x2, y1, y2, rslt1, rslt2, carry) \
197  __asm__  ("sahf\n\t" \
198        "sbb %[e1], %[z1]\n\t" \
199        "sbb %[e2], %[z2]\n\t" \
200        "lahf\n\t" \
201     : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \
202         : "[z1]" (x1), "[z2]" (x2), \
203           [e1] "r" (y1), [e2] "r" (y2), \
204           "[carryflag]" (carry) \
205         : "cc")
206
207#define sbb128(first, second, borrow, diff) \
[723]208do {\
209  BitBlock_int64 rslt, x, y;\
[462]210  x.bitblock = first;\
211  y.bitblock = second;\
212  double_int64_sbb(x.int64[0], x.int64[1], y.int64[0], y.int64[1], \
213                   rslt.int64[0], rslt.int64[1], borrow);\
214  diff = rslt.bitblock;\
215}while(0)
216
217#endif
218
219
220
[463]221#if (CARRY_STRATEGY == SIMD_CARRY_STRATEGY)
[462]222
[959]223typedef __m128i sse_type;
224
225
226
227#define sse_or(b1, b2) _mm_or_si128(b1, b2)
228#define sse_and(b1, b2) _mm_and_si128(b1, b2)
229#define sse_xor(b1, b2) _mm_xor_si128(b1, b2)
230#define sse_andc(b1, b2) _mm_andnot_si128(b2, b1)
231#define sse_if(cond, then_val, else_val) \
232  sse_or(sse_and(then_val, cond), sse_andc(else_val, cond))
233#define sse_not(b) (sse_xor(b, _mm_set1_epi32(0xFFFFFFFF)))
234#define sse_nor(a,b) (sse_not(sse_or(a,b)))
235
236#define sse_slli_64(r, shft) _mm_slli_epi64(r, shft)
237#define sse_srli_64(r, shft) _mm_srli_epi64(r, shft)
238#define sse_mergel_64(a, b) _mm_unpacklo_epi64(b, a)
239#define sse_sub_64(a, b) _mm_sub_epi64(a, b)
240#define sse_add_64(a, b) _mm_add_epi64(a, b)
241
242#define sse_slli_128(r, shft) \
243  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
244   (shft) >= 64 ? sse_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \
245   sse_or(sse_slli_64(r, shft), _mm_slli_si128(sse_srli_64(r, 64-(shft)), 8)))
246
247#define sse_srli_128(r, shft) \
248  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
249   (shft) >= 64 ? sse_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \
250   sse_or(sse_srli_64(r, shft), _mm_srli_si128(sse_slli_64(r, 64-(shft)), 8)))
251
252#define sse_to_int(x) _mm_cvtsi128_si32(x)
253
254#define sse_from_int(n) _mm_cvtsi32_si128(n)
255
256
257/*
[462]258typedef SIMD_type CarryType;
259
260#define Carry0 simd_const_1(0)
261
262#define test_carry(x) bitblock_has_bit(x)
263
264#define carry_or(carry1, carry2) simd_or(carry1, carry2)
[959]265*/
[462]266
[959]267#define sse_CARRYTYPE
268#ifdef uint32_CARRYTYPE
269typedef uint32_t CarryType;
270
271#define Carry0 0
272
273#define test_carry(x) ((x) != 0)
274
275#define carry_or(carry1, carry2) (carry1 | carry2)
276
277#define sse_from_CarryType(c) sse_from_int(c)
278
279#define sse_to_CarryType(c) sse_to_int(c)
280#endif
281
282#ifdef sse_CARRYTYPE
283#define CarryType sse_type
284
285#define Carry0 (_mm_set1_epi32(0))
286
287#define test_carry(x) (!_mm_testz_si128(x, x))
288
289#define carry_or(carry1, carry2) sse_or(carry1, carry2)
290
291#define sse_from_CarryType(c) c
292
293#define sse_to_CarryType(c) c
294#endif
295
296
297
298
299
[462]300#define adc128(x, y, carry,  sum) \
301do{ \
[959]302  sse_type gen = sse_and(x, y); \
303  sse_type prop = sse_or(x, y); \
304  sse_type partial = sse_add_64(sse_add_64(x, y), carry); \
305  sse_type c1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_andc(prop, partial)), 63), 64); \
306  sum = sse_add_64(c1, partial); \
307  carry = sse_srli_128(sse_or(gen, sse_andc(prop, sum)), 127); \
[462]308} while(0)
309
310
311#define sbb128(x, y, borrow, difference) \
312do {\
[959]313  sse_type gen = sse_andc(y, x); \
314  sse_type prop = sse_not(sse_xor(x, y)); \
315  sse_type partial = sse_sub_64(sse_sub_64(x, y), borrow); \
316  sse_type b1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_and(prop, partial)), 63), 64); \
317  difference = sse_sub_64(partial, b1); \
318  borrow = sse_srli_128(sse_or(gen, sse_and(prop, difference)), 127); \
[462]319}while(0)
320
321#define advance_with_carry(cursor, carry, rslt)\
[723]322do {\
[959]323  sse_type shift_out = sse_srli_64(cursor, 63);\
324  sse_type low_bits = sse_mergel_64(shift_out, carry);\
325  carry = sse_srli_128(shift_out, 64);\
326  rslt = sse_or(sse_add_64(cursor, cursor), low_bits);\
[723]327} while(0)
[462]328
[959]329#define adc256(x, y, carry,  sum) \
330do {\
331        __m128i x0 = simd_lo128(x);\
332        __m128i x1 = simd_hi128(x);\
333        __m128i y0 = simd_lo128(y);\
334        __m128i y1 = simd_hi128(y);\
335        __m128i cry = sse_from_CarryType(carry);\
336        __m128i s0, s1;\
337        adc128(x0, y0, cry, s0);\
338        adc128(x1, y1, cry, s1);\
339        sum = simd_combine256(s1, s0);\
340        carry = sse_to_CarryType(cry);\
341} while(0)
342
343#define sbb256(x, y, borrow, diff) \
344do {\
345        __m128i x0 = simd_lo128(x);\
346        __m128i x1 = simd_hi128(x);\
347        __m128i y0 = simd_lo128(y);\
348        __m128i y1 = simd_hi128(y);\
349        __m128i brw = sse_from_CarryType(borrow);\
350        __m128i d0, d1;\
351        sbb128(x0, y0, brw, d0);\
352        sbb128(x1, y1, brw, d1);\
353        diff = simd_combine256(d1, d0);\
354        borrow = sse_to_CarryType(brw);\
355} while(0)
356
357#define advance_with_carry256(cursor, carry, rslt)\
358do {\
359        __m128i cursor0 = simd_lo128(cursor);\
360        __m128i cursor1 = simd_hi128(cursor);\
361        __m128i cry = sse_from_CarryType(carry);\
362        __m128i rslt0, rslt1;\
363        advance_with_carry(cursor0, cry, rslt0);\
364        advance_with_carry(cursor1, cry, rslt1);\
365        rslt = simd_combine256(rslt1, rslt0);\
366        carry = sse_to_CarryType(cry);\
367} while(0)
368
369
[462]370#endif
371
372
[959]373#endif
Note: See TracBrowser for help on using the repository browser.