[462] | 1 | /* Block Addition, Subtraction and Shifts with Carry |
---|
| 2 | Copyright (C) 2010, Robert D. Cameron |
---|
| 3 | Licensed to the public under the Open Software License 3.0. |
---|
| 4 | Licensed to International Characters Inc. |
---|
| 5 | under the Academic Free License version 3.0. |
---|
| 6 | |
---|
| 7 | This file defines addition, subtract and shift operations on |
---|
| 8 | 128-bit blocks. Different versions of the operations are |
---|
| 9 | selectable with the CARRY_STRATEGY preprocessor constant. |
---|
| 10 | |
---|
| 11 | Each implementation defines the following "abstract data type" |
---|
| 12 | for block operations with carry. |
---|
| 13 | |
---|
| 14 | Typename: CarryType |
---|
| 15 | Constant: Carry0 represents a value of 0 for the carry bit. |
---|
| 16 | Predicate: test_carry(x) returns nonzero if a carry bit is 1, 0 otherwise. |
---|
| 17 | Function: carry_or(carry1, carry2) forms the logical or of two carries. |
---|
| 18 | Function: adc128(x, y, carry, sum) computes (carry, sum) = x + y + carry, |
---|
| 19 | Function: advance_with_carry(cursor, carry, rslt) |
---|
| 20 | computes (carry, rslt) = cursor + cursor + carry |
---|
| 21 | Function: sbb128(x, y, borrow, diff) |
---|
| 22 | computes (borrow, diff) = y - x - borrow |
---|
| 23 | |
---|
| 24 | */ |
---|
| 25 | #ifndef BLOCK_CARRY_H |
---|
| 26 | #define BLOCK_CARRY_H |
---|
| 27 | |
---|
| 28 | |
---|
[723] | 29 | typedef union {SIMD_type bitblock; uint64_t int64[2];} BitBlock_int64; |
---|
[462] | 30 | |
---|
| 31 | |
---|
| 32 | |
---|
| 33 | /*------------------------------------------------------------*/ |
---|
[959] | 34 | #include "avx_simd.h" |
---|
[462] | 35 | |
---|
[463] | 36 | #define SIMD_CARRY_STRATEGY 1 |
---|
| 37 | #define ADC64_STRATEGY 2 |
---|
| 38 | #define ADC64_SAHF_STRATEGY 3 |
---|
[462] | 39 | |
---|
[463] | 40 | #ifdef ADC64 |
---|
| 41 | #define CARRY_STRATEGY ADC64_STRATEGY |
---|
| 42 | #else |
---|
| 43 | #define CARRY_STRATEGY SIMD_CARRY_STRATEGY |
---|
| 44 | #endif |
---|
[462] | 45 | |
---|
[463] | 46 | #if (CARRY_STRATEGY == ADC64_STRATEGY) |
---|
[759] | 47 | typedef uint64_t CarryType; |
---|
[959] | 48 | typedef union {SIMD_type bitblock; uint64_t int64[4];} SIMD256_int64; |
---|
[462] | 49 | |
---|
| 50 | #define Carry0 0 |
---|
| 51 | |
---|
[959] | 52 | #define test_carry(x) ((x) != 0) |
---|
[462] | 53 | |
---|
| 54 | #define carry_or(carry1, carry2) (carry1 | carry2) |
---|
| 55 | |
---|
[534] | 56 | |
---|
[959] | 57 | static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) __attribute__((always_inline)); |
---|
| 58 | static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) __attribute__((always_inline)); |
---|
| 59 | static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) __attribute__((always_inline)); |
---|
[534] | 60 | |
---|
[462] | 61 | |
---|
[959] | 62 | static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) { |
---|
| 63 | SIMD256_int64 a, b, rslt; |
---|
| 64 | //printf("carryin = %lu\n",carry); |
---|
| 65 | //print_simd_register("x", x); |
---|
| 66 | //print_simd_register("y", y); |
---|
| 67 | a.bitblock = x; |
---|
| 68 | b.bitblock = y; |
---|
| 69 | asm volatile("negq %[carryflag]\n\t" |
---|
| 70 | "movq 0(%[xaddr]), %[r0]\n\t" |
---|
| 71 | "adcq 0(%[yaddr]), %[r0]\n\t" |
---|
| 72 | "movq 8(%[xaddr]), %[r1]\n\t" |
---|
| 73 | "adcq 8(%[yaddr]), %[r1]\n\t" |
---|
| 74 | "movq 16(%[xaddr]), %[r2]\n\t" |
---|
| 75 | "adcq 16(%[yaddr]), %[r2]\n\t" |
---|
| 76 | "movq 24(%[xaddr]), %[r3]\n\t" |
---|
| 77 | "adcq 24(%[yaddr]), %[r3]\n\t" |
---|
| 78 | "movq $0, %[carryflag]\n\t" |
---|
| 79 | "adcq $0, %[carryflag]\n\t" |
---|
| 80 | : [carryflag] "=&r" (carry), |
---|
| 81 | [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3]) |
---|
| 82 | : "[carryflag]" (carry), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock) |
---|
| 83 | : "cc"); |
---|
| 84 | sum = rslt.bitblock; |
---|
| 85 | //printf("carryout = %lu\n",carry); |
---|
| 86 | //print_simd_register("sum", sum); |
---|
| 87 | } |
---|
| 88 | |
---|
| 89 | static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) { |
---|
| 90 | SIMD256_int64 a, b, rslt; |
---|
| 91 | //printf("borrowin = %lu\n",borrow); |
---|
| 92 | //print_simd_register("x", x); |
---|
| 93 | //print_simd_register("y", y); |
---|
| 94 | a.bitblock = x; |
---|
| 95 | b.bitblock = y; |
---|
| 96 | asm volatile("negq %[carryflag]\n\t" |
---|
| 97 | "movq 0(%[xaddr]), %[r0]\n\t" |
---|
| 98 | "sbbq 0(%[yaddr]), %[r0]\n\t" |
---|
| 99 | "movq 8(%[xaddr]), %[r1]\n\t" |
---|
| 100 | "sbbq 8(%[yaddr]), %[r1]\n\t" |
---|
| 101 | "movq 16(%[xaddr]), %[r2]\n\t" |
---|
| 102 | "sbbq 16(%[yaddr]), %[r2]\n\t" |
---|
| 103 | "movq 24(%[xaddr]), %[r3]\n\t" |
---|
| 104 | "sbbq 24(%[yaddr]), %[r3]\n\t" |
---|
| 105 | "movq $0, %[carryflag]\n\t" |
---|
| 106 | "adcq $0, %[carryflag]\n\t" |
---|
| 107 | : [carryflag] "=&r" (borrow), |
---|
| 108 | [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3]) |
---|
| 109 | : "[carryflag]" (borrow), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock) |
---|
| 110 | : "cc"); |
---|
| 111 | diff = rslt.bitblock; |
---|
| 112 | //printf("borrowout = %lu\n",borrow); |
---|
| 113 | //print_simd_register("diff", diff); |
---|
| 114 | } |
---|
[729] | 115 | |
---|
[959] | 116 | static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) { |
---|
| 117 | SIMD256_int64 r; |
---|
| 118 | SIMD_type a = x; |
---|
| 119 | //printf("shift in = %lu\n",carry); |
---|
| 120 | //print_simd_register("x", x); |
---|
| 121 | asm volatile("negq %[carryflag]\n\t" |
---|
| 122 | "movq 0(%[xaddr]), %[r0]\n\t" |
---|
| 123 | "adcq %[r0], %[r0]\n\t" |
---|
| 124 | "movq 8(%[xaddr]), %[r1]\n\t" |
---|
| 125 | "adcq %[r1], %[r1]\n\t" |
---|
| 126 | "movq 16(%[xaddr]), %[r2]\n\t" |
---|
| 127 | "adcq %[r2], %[r2]\n\t" |
---|
| 128 | "movq 24(%[xaddr]), %[r3]\n\t" |
---|
| 129 | "adcq %[r3], %[r3]\n\t" |
---|
| 130 | "movq $0, %[carryflag]\n\t" |
---|
| 131 | "adcq $0, %[carryflag]\n\t" |
---|
| 132 | : [carryflag] "=&r" (carry), |
---|
| 133 | [r0] "=&r" (r.int64[0]), [r1] "=&r" (r.int64[1]), [r2] "=&r" (r.int64[2]), [r3] "=&r" (r.int64[3]) |
---|
| 134 | : "[carryflag]" (carry), [xaddr] "r" (&a) |
---|
| 135 | : "cc"); |
---|
| 136 | rslt = r.bitblock; |
---|
| 137 | //printf("shift out = %lu\n",carry); |
---|
| 138 | //print_simd_register("rslt", rslt); |
---|
| 139 | } |
---|
[462] | 140 | |
---|
| 141 | #endif |
---|
| 142 | |
---|
[463] | 143 | #if (CARRY_STRATEGY == ADC64_SAHF_STRATEGY) |
---|
[462] | 144 | typedef uint64_t CarryType; |
---|
| 145 | |
---|
| 146 | #define Carry0 0 |
---|
| 147 | |
---|
| 148 | #define test_carry(x) (((x)&256) > 0) |
---|
| 149 | |
---|
| 150 | #define carry_or(carry1, carry2) (carry1 | carry2) |
---|
| 151 | |
---|
| 152 | #define double_int64_adc(x1, x2, y1, y2, rslt1, rslt2, carry) \ |
---|
| 153 | __asm__ ("sahf\n\t" \ |
---|
| 154 | "adc %[e1], %[z1]\n\t" \ |
---|
| 155 | "adc %[e2], %[z2]\n\t" \ |
---|
| 156 | "lahf\n\t" \ |
---|
| 157 | : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \ |
---|
| 158 | : "[z1]" (x1), "[z2]" (x2), \ |
---|
| 159 | [e1] "r" (y1), [e2] "r" (y2), \ |
---|
| 160 | "[carryflag]" (carry) \ |
---|
| 161 | : "cc") |
---|
| 162 | |
---|
| 163 | #define adc128(first, second, carry, sum) \ |
---|
[723] | 164 | do {\ |
---|
| 165 | BitBlock_int64 rslt, x, y;\ |
---|
[462] | 166 | x.bitblock = first;\ |
---|
| 167 | y.bitblock = second;\ |
---|
| 168 | double_int64_adc(x.int64[0], x.int64[1], y.int64[0], y.int64[1], rslt.int64[0], rslt.int64[1], carry);\ |
---|
| 169 | sum = rslt.bitblock;\ |
---|
| 170 | }while(0) |
---|
| 171 | |
---|
| 172 | |
---|
| 173 | |
---|
| 174 | #define double_int64_advance(x1, x2, rslt1, rslt2, carry) \ |
---|
| 175 | __asm__ ("sahf\n\t" \ |
---|
| 176 | "adc %[z1], %[z1]\n\t" \ |
---|
| 177 | "adc %[z2], %[z2]\n\t" \ |
---|
| 178 | "lahf\n\t" \ |
---|
| 179 | : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \ |
---|
| 180 | : "[z1]" (x1), "[z2]" (x2), \ |
---|
| 181 | "[carryflag]" (carry) \ |
---|
| 182 | : "cc") |
---|
| 183 | |
---|
| 184 | |
---|
| 185 | #define advance_with_carry(cursor, carry, rslt)\ |
---|
[723] | 186 | do {\ |
---|
| 187 | BitBlock_int64 x, z;\ |
---|
[462] | 188 | x.bitblock = cursor;\ |
---|
| 189 | double_int64_advance(x.int64[0], x.int64[1], z.int64[0], z.int64[1], carry);\ |
---|
| 190 | rslt = z.bitblock;\ |
---|
[723] | 191 | } while(0) |
---|
[462] | 192 | |
---|
| 193 | |
---|
| 194 | |
---|
| 195 | |
---|
| 196 | #define double_int64_sbb(x1, x2, y1, y2, rslt1, rslt2, carry) \ |
---|
| 197 | __asm__ ("sahf\n\t" \ |
---|
| 198 | "sbb %[e1], %[z1]\n\t" \ |
---|
| 199 | "sbb %[e2], %[z2]\n\t" \ |
---|
| 200 | "lahf\n\t" \ |
---|
| 201 | : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \ |
---|
| 202 | : "[z1]" (x1), "[z2]" (x2), \ |
---|
| 203 | [e1] "r" (y1), [e2] "r" (y2), \ |
---|
| 204 | "[carryflag]" (carry) \ |
---|
| 205 | : "cc") |
---|
| 206 | |
---|
| 207 | #define sbb128(first, second, borrow, diff) \ |
---|
[723] | 208 | do {\ |
---|
| 209 | BitBlock_int64 rslt, x, y;\ |
---|
[462] | 210 | x.bitblock = first;\ |
---|
| 211 | y.bitblock = second;\ |
---|
| 212 | double_int64_sbb(x.int64[0], x.int64[1], y.int64[0], y.int64[1], \ |
---|
| 213 | rslt.int64[0], rslt.int64[1], borrow);\ |
---|
| 214 | diff = rslt.bitblock;\ |
---|
| 215 | }while(0) |
---|
| 216 | |
---|
| 217 | #endif |
---|
| 218 | |
---|
| 219 | |
---|
| 220 | |
---|
[463] | 221 | #if (CARRY_STRATEGY == SIMD_CARRY_STRATEGY) |
---|
[462] | 222 | |
---|
[959] | 223 | typedef __m128i sse_type; |
---|
| 224 | |
---|
| 225 | |
---|
| 226 | |
---|
| 227 | #define sse_or(b1, b2) _mm_or_si128(b1, b2) |
---|
| 228 | #define sse_and(b1, b2) _mm_and_si128(b1, b2) |
---|
| 229 | #define sse_xor(b1, b2) _mm_xor_si128(b1, b2) |
---|
| 230 | #define sse_andc(b1, b2) _mm_andnot_si128(b2, b1) |
---|
| 231 | #define sse_if(cond, then_val, else_val) \ |
---|
| 232 | sse_or(sse_and(then_val, cond), sse_andc(else_val, cond)) |
---|
| 233 | #define sse_not(b) (sse_xor(b, _mm_set1_epi32(0xFFFFFFFF))) |
---|
| 234 | #define sse_nor(a,b) (sse_not(sse_or(a,b))) |
---|
| 235 | |
---|
| 236 | #define sse_slli_64(r, shft) _mm_slli_epi64(r, shft) |
---|
| 237 | #define sse_srli_64(r, shft) _mm_srli_epi64(r, shft) |
---|
| 238 | #define sse_mergel_64(a, b) _mm_unpacklo_epi64(b, a) |
---|
| 239 | #define sse_sub_64(a, b) _mm_sub_epi64(a, b) |
---|
| 240 | #define sse_add_64(a, b) _mm_add_epi64(a, b) |
---|
| 241 | |
---|
| 242 | #define sse_slli_128(r, shft) \ |
---|
| 243 | ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \ |
---|
| 244 | (shft) >= 64 ? sse_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \ |
---|
| 245 | sse_or(sse_slli_64(r, shft), _mm_slli_si128(sse_srli_64(r, 64-(shft)), 8))) |
---|
| 246 | |
---|
| 247 | #define sse_srli_128(r, shft) \ |
---|
| 248 | ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \ |
---|
| 249 | (shft) >= 64 ? sse_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \ |
---|
| 250 | sse_or(sse_srli_64(r, shft), _mm_srli_si128(sse_slli_64(r, 64-(shft)), 8))) |
---|
| 251 | |
---|
| 252 | #define sse_to_int(x) _mm_cvtsi128_si32(x) |
---|
| 253 | |
---|
| 254 | #define sse_from_int(n) _mm_cvtsi32_si128(n) |
---|
| 255 | |
---|
| 256 | |
---|
| 257 | /* |
---|
[462] | 258 | typedef SIMD_type CarryType; |
---|
| 259 | |
---|
| 260 | #define Carry0 simd_const_1(0) |
---|
| 261 | |
---|
| 262 | #define test_carry(x) bitblock_has_bit(x) |
---|
| 263 | |
---|
| 264 | #define carry_or(carry1, carry2) simd_or(carry1, carry2) |
---|
[959] | 265 | */ |
---|
[462] | 266 | |
---|
[959] | 267 | #define sse_CARRYTYPE |
---|
| 268 | #ifdef uint32_CARRYTYPE |
---|
| 269 | typedef uint32_t CarryType; |
---|
| 270 | |
---|
| 271 | #define Carry0 0 |
---|
| 272 | |
---|
| 273 | #define test_carry(x) ((x) != 0) |
---|
| 274 | |
---|
| 275 | #define carry_or(carry1, carry2) (carry1 | carry2) |
---|
| 276 | |
---|
| 277 | #define sse_from_CarryType(c) sse_from_int(c) |
---|
| 278 | |
---|
| 279 | #define sse_to_CarryType(c) sse_to_int(c) |
---|
| 280 | #endif |
---|
| 281 | |
---|
| 282 | #ifdef sse_CARRYTYPE |
---|
| 283 | #define CarryType sse_type |
---|
| 284 | |
---|
| 285 | #define Carry0 (_mm_set1_epi32(0)) |
---|
| 286 | |
---|
| 287 | #define test_carry(x) (!_mm_testz_si128(x, x)) |
---|
| 288 | |
---|
| 289 | #define carry_or(carry1, carry2) sse_or(carry1, carry2) |
---|
| 290 | |
---|
| 291 | #define sse_from_CarryType(c) c |
---|
| 292 | |
---|
| 293 | #define sse_to_CarryType(c) c |
---|
| 294 | #endif |
---|
| 295 | |
---|
| 296 | |
---|
| 297 | |
---|
| 298 | |
---|
| 299 | |
---|
[462] | 300 | #define adc128(x, y, carry, sum) \ |
---|
| 301 | do{ \ |
---|
[959] | 302 | sse_type gen = sse_and(x, y); \ |
---|
| 303 | sse_type prop = sse_or(x, y); \ |
---|
| 304 | sse_type partial = sse_add_64(sse_add_64(x, y), carry); \ |
---|
| 305 | sse_type c1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_andc(prop, partial)), 63), 64); \ |
---|
| 306 | sum = sse_add_64(c1, partial); \ |
---|
| 307 | carry = sse_srli_128(sse_or(gen, sse_andc(prop, sum)), 127); \ |
---|
[462] | 308 | } while(0) |
---|
| 309 | |
---|
| 310 | |
---|
| 311 | #define sbb128(x, y, borrow, difference) \ |
---|
| 312 | do {\ |
---|
[959] | 313 | sse_type gen = sse_andc(y, x); \ |
---|
| 314 | sse_type prop = sse_not(sse_xor(x, y)); \ |
---|
| 315 | sse_type partial = sse_sub_64(sse_sub_64(x, y), borrow); \ |
---|
| 316 | sse_type b1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_and(prop, partial)), 63), 64); \ |
---|
| 317 | difference = sse_sub_64(partial, b1); \ |
---|
| 318 | borrow = sse_srli_128(sse_or(gen, sse_and(prop, difference)), 127); \ |
---|
[462] | 319 | }while(0) |
---|
| 320 | |
---|
| 321 | #define advance_with_carry(cursor, carry, rslt)\ |
---|
[723] | 322 | do {\ |
---|
[959] | 323 | sse_type shift_out = sse_srli_64(cursor, 63);\ |
---|
| 324 | sse_type low_bits = sse_mergel_64(shift_out, carry);\ |
---|
| 325 | carry = sse_srli_128(shift_out, 64);\ |
---|
| 326 | rslt = sse_or(sse_add_64(cursor, cursor), low_bits);\ |
---|
[723] | 327 | } while(0) |
---|
[462] | 328 | |
---|
[959] | 329 | #define adc256(x, y, carry, sum) \ |
---|
| 330 | do {\ |
---|
| 331 | __m128i x0 = simd_lo128(x);\ |
---|
| 332 | __m128i x1 = simd_hi128(x);\ |
---|
| 333 | __m128i y0 = simd_lo128(y);\ |
---|
| 334 | __m128i y1 = simd_hi128(y);\ |
---|
| 335 | __m128i cry = sse_from_CarryType(carry);\ |
---|
| 336 | __m128i s0, s1;\ |
---|
| 337 | adc128(x0, y0, cry, s0);\ |
---|
| 338 | adc128(x1, y1, cry, s1);\ |
---|
| 339 | sum = simd_combine256(s1, s0);\ |
---|
| 340 | carry = sse_to_CarryType(cry);\ |
---|
| 341 | } while(0) |
---|
| 342 | |
---|
| 343 | #define sbb256(x, y, borrow, diff) \ |
---|
| 344 | do {\ |
---|
| 345 | __m128i x0 = simd_lo128(x);\ |
---|
| 346 | __m128i x1 = simd_hi128(x);\ |
---|
| 347 | __m128i y0 = simd_lo128(y);\ |
---|
| 348 | __m128i y1 = simd_hi128(y);\ |
---|
| 349 | __m128i brw = sse_from_CarryType(borrow);\ |
---|
| 350 | __m128i d0, d1;\ |
---|
| 351 | sbb128(x0, y0, brw, d0);\ |
---|
| 352 | sbb128(x1, y1, brw, d1);\ |
---|
| 353 | diff = simd_combine256(d1, d0);\ |
---|
| 354 | borrow = sse_to_CarryType(brw);\ |
---|
| 355 | } while(0) |
---|
| 356 | |
---|
| 357 | #define advance_with_carry256(cursor, carry, rslt)\ |
---|
| 358 | do {\ |
---|
| 359 | __m128i cursor0 = simd_lo128(cursor);\ |
---|
| 360 | __m128i cursor1 = simd_hi128(cursor);\ |
---|
| 361 | __m128i cry = sse_from_CarryType(carry);\ |
---|
| 362 | __m128i rslt0, rslt1;\ |
---|
| 363 | advance_with_carry(cursor0, cry, rslt0);\ |
---|
| 364 | advance_with_carry(cursor1, cry, rslt1);\ |
---|
| 365 | rslt = simd_combine256(rslt1, rslt0);\ |
---|
| 366 | carry = sse_to_CarryType(cry);\ |
---|
| 367 | } while(0) |
---|
| 368 | |
---|
| 369 | |
---|
[462] | 370 | #endif |
---|
| 371 | |
---|
| 372 | |
---|
[959] | 373 | #endif |
---|