 Timestamp:
 Mar 21, 2011, 5:14:08 PM (8 years ago)
 Location:
 trunk/lib
 Files:

 4 edited
Legend:
 Unmodified
 Added
 Removed

trunk/lib/block_carry.h
r759 r959 32 32 33 33 /**/ 34 #include " sse_simd.h"34 #include "avx_simd.h" 35 35 36 36 #define SIMD_CARRY_STRATEGY 1 … … 39 39 40 40 #ifdef ADC64 41 #ifdef SAHFLAHF42 #define CARRY_STRATEGY ADC64_SAHF_STRATEGY43 #else44 41 #define CARRY_STRATEGY ADC64_STRATEGY 45 #endif46 #else47 #ifdef SAHFLAHF48 #define CARRY_STRATEGY ADC64_SAHF_STRATEGY49 42 #else 50 43 #define CARRY_STRATEGY SIMD_CARRY_STRATEGY 51 44 #endif 52 #endif53 45 54 46 #if (CARRY_STRATEGY == ADC64_STRATEGY) 55 47 typedef uint64_t CarryType; 48 typedef union {SIMD_type bitblock; uint64_t int64[4];} SIMD256_int64; 56 49 57 50 #define Carry0 0 58 51 59 #define test_carry(x) ((x) >0)52 #define test_carry(x) ((x) != 0) 60 53 61 54 #define carry_or(carry1, carry2) (carry1  carry2) 62 55 63 #define clc() \ 64 __asm__ __volatile__ ("clc\n\t":::) 65 66 #define adc(x,y,carry,sum) \ 67 __asm__ __volatile__ ("add %[carryflag], %[e]\n\t" \ 68 "adc %[e], %[z]\n\t" \ 69 "mov $0, %1\n\t" \ 70 "adc %[carryflag], %1\n\t" \ 71 : [z] "=r" (sum), [carryflag] "=a" (carry) \ 72 : "[z]" (x), [e] "r" (y), "[carryflag]" (carry) \ 73 : "cc") 74 75 #define double_int64_adc(x1, x2, y1, y2, rslt1, rslt2, carry) \ 76 __asm__ __volatile__ ("neg %[carryflag]\n\t" \ 77 "adc %[e1], %[z1]\n\t" \ 78 "adc %[e2], %[z2]\n\t" \ 79 "mov $0, %[carryflag]\n\t" \ 80 "adc $0, %[carryflag]\n\t" \ 81 : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=r" (carry) \ 82 : "[z1]" (x1), "[z2]" (x2), \ 83 [e1] "r" (y1), [e2] "r" (y2), \ 84 "[carryflag]" (carry) \ 85 : "cc") 86 87 88 #define adc128(first, second, carry, sum) \ 89 do {\ 90 BitBlock_int64 rslt, x, y;\ 91 x.bitblock = first;\ 92 y.bitblock = second;\ 93 double_int64_adc(x.int64[0], x.int64[1], y.int64[0], y.int64[1], rslt.int64[0], rslt.int64[1], carry);\ 94 sum = rslt.bitblock;\ 95 } while(0) 96 97 98 99 #define double_int64_advance(x1, x2, rslt1, rslt2, carry) \ 100 __asm__ (\ 101 "add %[z1], %[z1]\n\t" \ 102 "adc %[z2], %[z2]\n\t" \ 103 "lea 0(%[carryflag], %[z1]), %[z1]\n\t" \ 104 "setc %%al\n\t" \ 105 : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \ 106 : "[z1]" (x1), "[z2]" (x2), \ 107 "[carryflag]" (carry) \ 108 : "cc") 109 110 /* Slow 111 #define double_int64_advance(x1, x2, rslt1, rslt2, carry) \ 112 __asm__ (\ 113 "shld $1, %[z1], %[z2]\n\t" \ 114 "lea 0(%[carryflag], %[z1], 2), %[z1]\n\t" \ 115 "setc %%al\n\t" \ 116 : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [carryflag] "=a" (carry) \ 117 : "[z1]" (x1), "[z2]" (x2), \ 118 "[carryflag]" (carry) \ 119 : "cc") 120 */ 121 122 #define advance_with_carry(cursor, carry, rslt)\ 123 do {\ 124 BitBlock_int64 x, z;\ 125 x.bitblock = cursor;\ 126 double_int64_advance(x.int64[0], x.int64[1], z.int64[0], z.int64[1], carry);\ 127 rslt = z.bitblock;\ 128 } while(0) 129 130 #define double_int64_sbb(x1, x2, y1, y2, rslt1, rslt2, brw) \ 131 __asm__ ("neg %[borrowflag]\n\t" \ 132 "sbb %[e1], %[z1]\n\t" \ 133 "sbb %[e2], %[z2]\n\t" \ 134 "mov $0, %[borrowflag]\n\t" \ 135 "sbb $0, %[borrowflag]\n\t" \ 136 : [z1] "=r" (rslt1), [z2] "=r" (rslt2), [borrowflag] "=a" (brw) \ 137 : "[z1]" (x1), "[z2]" (x2), \ 138 [e1] "r" (y1), [e2] "r" (y2), \ 139 "[borrowflag]" (brw) \ 140 : "cc") 141 142 #define sbb128(first, second, borrow, diff) \ 143 do {\ 144 BitBlock_int64 rslt, x, y;\ 145 x.bitblock = first;\ 146 y.bitblock = second;\ 147 double_int64_sbb(x.int64[0], x.int64[1], y.int64[0], y.int64[1], \ 148 rslt.int64[0], rslt.int64[1], borrow);\ 149 diff = rslt.bitblock;\ 150 } while(0) 151 56 57 static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) __attribute__((always_inline)); 58 static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) __attribute__((always_inline)); 59 static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) __attribute__((always_inline)); 60 61 62 static inline void adc256(SIMD_type x, SIMD_type y, CarryType & carry, SIMD_type & sum) { 63 SIMD256_int64 a, b, rslt; 64 //printf("carryin = %lu\n",carry); 65 //print_simd_register("x", x); 66 //print_simd_register("y", y); 67 a.bitblock = x; 68 b.bitblock = y; 69 asm volatile("negq %[carryflag]\n\t" 70 "movq 0(%[xaddr]), %[r0]\n\t" 71 "adcq 0(%[yaddr]), %[r0]\n\t" 72 "movq 8(%[xaddr]), %[r1]\n\t" 73 "adcq 8(%[yaddr]), %[r1]\n\t" 74 "movq 16(%[xaddr]), %[r2]\n\t" 75 "adcq 16(%[yaddr]), %[r2]\n\t" 76 "movq 24(%[xaddr]), %[r3]\n\t" 77 "adcq 24(%[yaddr]), %[r3]\n\t" 78 "movq $0, %[carryflag]\n\t" 79 "adcq $0, %[carryflag]\n\t" 80 : [carryflag] "=&r" (carry), 81 [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3]) 82 : "[carryflag]" (carry), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock) 83 : "cc"); 84 sum = rslt.bitblock; 85 //printf("carryout = %lu\n",carry); 86 //print_simd_register("sum", sum); 87 } 88 89 static inline void sbb256(SIMD_type x, SIMD_type y, CarryType & borrow, SIMD_type & diff) { 90 SIMD256_int64 a, b, rslt; 91 //printf("borrowin = %lu\n",borrow); 92 //print_simd_register("x", x); 93 //print_simd_register("y", y); 94 a.bitblock = x; 95 b.bitblock = y; 96 asm volatile("negq %[carryflag]\n\t" 97 "movq 0(%[xaddr]), %[r0]\n\t" 98 "sbbq 0(%[yaddr]), %[r0]\n\t" 99 "movq 8(%[xaddr]), %[r1]\n\t" 100 "sbbq 8(%[yaddr]), %[r1]\n\t" 101 "movq 16(%[xaddr]), %[r2]\n\t" 102 "sbbq 16(%[yaddr]), %[r2]\n\t" 103 "movq 24(%[xaddr]), %[r3]\n\t" 104 "sbbq 24(%[yaddr]), %[r3]\n\t" 105 "movq $0, %[carryflag]\n\t" 106 "adcq $0, %[carryflag]\n\t" 107 : [carryflag] "=&r" (borrow), 108 [r0] "=&r" (rslt.int64[0]), [r1] "=&r" (rslt.int64[1]), [r2] "=&r" (rslt.int64[2]), [r3] "=&r" (rslt.int64[3]) 109 : "[carryflag]" (borrow), [xaddr] "r" (&a.bitblock), [yaddr] "r" (&b.bitblock) 110 : "cc"); 111 diff = rslt.bitblock; 112 //printf("borrowout = %lu\n",borrow); 113 //print_simd_register("diff", diff); 114 } 115 116 static inline void advance_with_carry256(SIMD_type x, CarryType & carry, SIMD_type & rslt) { 117 SIMD256_int64 r; 118 SIMD_type a = x; 119 //printf("shift in = %lu\n",carry); 120 //print_simd_register("x", x); 121 asm volatile("negq %[carryflag]\n\t" 122 "movq 0(%[xaddr]), %[r0]\n\t" 123 "adcq %[r0], %[r0]\n\t" 124 "movq 8(%[xaddr]), %[r1]\n\t" 125 "adcq %[r1], %[r1]\n\t" 126 "movq 16(%[xaddr]), %[r2]\n\t" 127 "adcq %[r2], %[r2]\n\t" 128 "movq 24(%[xaddr]), %[r3]\n\t" 129 "adcq %[r3], %[r3]\n\t" 130 "movq $0, %[carryflag]\n\t" 131 "adcq $0, %[carryflag]\n\t" 132 : [carryflag] "=&r" (carry), 133 [r0] "=&r" (r.int64[0]), [r1] "=&r" (r.int64[1]), [r2] "=&r" (r.int64[2]), [r3] "=&r" (r.int64[3]) 134 : "[carryflag]" (carry), [xaddr] "r" (&a) 135 : "cc"); 136 rslt = r.bitblock; 137 //printf("shift out = %lu\n",carry); 138 //print_simd_register("rslt", rslt); 139 } 152 140 153 141 #endif … … 233 221 #if (CARRY_STRATEGY == SIMD_CARRY_STRATEGY) 234 222 223 typedef __m128i sse_type; 224 225 226 227 #define sse_or(b1, b2) _mm_or_si128(b1, b2) 228 #define sse_and(b1, b2) _mm_and_si128(b1, b2) 229 #define sse_xor(b1, b2) _mm_xor_si128(b1, b2) 230 #define sse_andc(b1, b2) _mm_andnot_si128(b2, b1) 231 #define sse_if(cond, then_val, else_val) \ 232 sse_or(sse_and(then_val, cond), sse_andc(else_val, cond)) 233 #define sse_not(b) (sse_xor(b, _mm_set1_epi32(0xFFFFFFFF))) 234 #define sse_nor(a,b) (sse_not(sse_or(a,b))) 235 236 #define sse_slli_64(r, shft) _mm_slli_epi64(r, shft) 237 #define sse_srli_64(r, shft) _mm_srli_epi64(r, shft) 238 #define sse_mergel_64(a, b) _mm_unpacklo_epi64(b, a) 239 #define sse_sub_64(a, b) _mm_sub_epi64(a, b) 240 #define sse_add_64(a, b) _mm_add_epi64(a, b) 241 242 #define sse_slli_128(r, shft) \ 243 ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \ 244 (shft) >= 64 ? sse_slli_64(_mm_slli_si128(r, 8), (shft)  64) : \ 245 sse_or(sse_slli_64(r, shft), _mm_slli_si128(sse_srli_64(r, 64(shft)), 8))) 246 247 #define sse_srli_128(r, shft) \ 248 ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \ 249 (shft) >= 64 ? sse_srli_64(_mm_srli_si128(r, 8), (shft)  64) : \ 250 sse_or(sse_srli_64(r, shft), _mm_srli_si128(sse_slli_64(r, 64(shft)), 8))) 251 252 #define sse_to_int(x) _mm_cvtsi128_si32(x) 253 254 #define sse_from_int(n) _mm_cvtsi32_si128(n) 255 256 257 /* 235 258 typedef SIMD_type CarryType; 236 259 … … 240 263 241 264 #define carry_or(carry1, carry2) simd_or(carry1, carry2) 265 */ 266 267 #define sse_CARRYTYPE 268 #ifdef uint32_CARRYTYPE 269 typedef uint32_t CarryType; 270 271 #define Carry0 0 272 273 #define test_carry(x) ((x) != 0) 274 275 #define carry_or(carry1, carry2) (carry1  carry2) 276 277 #define sse_from_CarryType(c) sse_from_int(c) 278 279 #define sse_to_CarryType(c) sse_to_int(c) 280 #endif 281 282 #ifdef sse_CARRYTYPE 283 #define CarryType sse_type 284 285 #define Carry0 (_mm_set1_epi32(0)) 286 287 #define test_carry(x) (!_mm_testz_si128(x, x)) 288 289 #define carry_or(carry1, carry2) sse_or(carry1, carry2) 290 291 #define sse_from_CarryType(c) c 292 293 #define sse_to_CarryType(c) c 294 #endif 295 296 297 298 242 299 243 300 #define adc128(x, y, carry, sum) \ 244 301 do{ \ 245 SIMD_type gen = simd_and(x, y); \246 SIMD_type prop = simd_or(x, y); \247 SIMD_type partial = simd_add_64(simd_add_64(x, y), carry); \248 SIMD_type c1 = sisd_slli(simd_srli_64(simd_or(gen, simd_andc(prop, partial)), 63), 64); \249 sum = s imd_add_64(c1, partial); \250 carry = s isd_srli(simd_or(gen, simd_andc(prop, sum)), 127); \302 sse_type gen = sse_and(x, y); \ 303 sse_type prop = sse_or(x, y); \ 304 sse_type partial = sse_add_64(sse_add_64(x, y), carry); \ 305 sse_type c1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_andc(prop, partial)), 63), 64); \ 306 sum = sse_add_64(c1, partial); \ 307 carry = sse_srli_128(sse_or(gen, sse_andc(prop, sum)), 127); \ 251 308 } while(0) 252 309 … … 254 311 #define sbb128(x, y, borrow, difference) \ 255 312 do {\ 256 SIMD_type gen = simd_andc(y, x); \257 SIMD_type prop = simd_not(simd_xor(x, y)); \258 SIMD_type partial = simd_sub_64(simd_sub_64(x, y), borrow); \259 SIMD_type b1 = sisd_slli(simd_srli_64(simd_or(gen, simd_and(prop, partial)), 63), 64); \260 difference = s imd_sub_64(partial, b1); \261 borrow = s isd_srli(simd_or(gen, simd_and(prop, difference)), 127); \313 sse_type gen = sse_andc(y, x); \ 314 sse_type prop = sse_not(sse_xor(x, y)); \ 315 sse_type partial = sse_sub_64(sse_sub_64(x, y), borrow); \ 316 sse_type b1 = sse_slli_128(sse_srli_64(sse_or(gen, sse_and(prop, partial)), 63), 64); \ 317 difference = sse_sub_64(partial, b1); \ 318 borrow = sse_srli_128(sse_or(gen, sse_and(prop, difference)), 127); \ 262 319 }while(0) 263 320 264 265 321 #define advance_with_carry(cursor, carry, rslt)\ 266 322 do {\ 267 SIMD_type shift_out = simd_srli_64(cursor, 63);\ 268 SIMD_type low_bits = simd_mergel_64(shift_out, carry);\ 269 carry = sisd_srli(shift_out, 64);\ 270 rslt = simd_or(simd_add_64(cursor, cursor), low_bits);\ 271 } while(0) 272 273 #endif 274 #endif 275 276 323 sse_type shift_out = sse_srli_64(cursor, 63);\ 324 sse_type low_bits = sse_mergel_64(shift_out, carry);\ 325 carry = sse_srli_128(shift_out, 64);\ 326 rslt = sse_or(sse_add_64(cursor, cursor), low_bits);\ 327 } while(0) 328 329 #define adc256(x, y, carry, sum) \ 330 do {\ 331 __m128i x0 = simd_lo128(x);\ 332 __m128i x1 = simd_hi128(x);\ 333 __m128i y0 = simd_lo128(y);\ 334 __m128i y1 = simd_hi128(y);\ 335 __m128i cry = sse_from_CarryType(carry);\ 336 __m128i s0, s1;\ 337 adc128(x0, y0, cry, s0);\ 338 adc128(x1, y1, cry, s1);\ 339 sum = simd_combine256(s1, s0);\ 340 carry = sse_to_CarryType(cry);\ 341 } while(0) 342 343 #define sbb256(x, y, borrow, diff) \ 344 do {\ 345 __m128i x0 = simd_lo128(x);\ 346 __m128i x1 = simd_hi128(x);\ 347 __m128i y0 = simd_lo128(y);\ 348 __m128i y1 = simd_hi128(y);\ 349 __m128i brw = sse_from_CarryType(borrow);\ 350 __m128i d0, d1;\ 351 sbb128(x0, y0, brw, d0);\ 352 sbb128(x1, y1, brw, d1);\ 353 diff = simd_combine256(d1, d0);\ 354 borrow = sse_to_CarryType(brw);\ 355 } while(0) 356 357 #define advance_with_carry256(cursor, carry, rslt)\ 358 do {\ 359 __m128i cursor0 = simd_lo128(cursor);\ 360 __m128i cursor1 = simd_hi128(cursor);\ 361 __m128i cry = sse_from_CarryType(carry);\ 362 __m128i rslt0, rslt1;\ 363 advance_with_carry(cursor0, cry, rslt0);\ 364 advance_with_carry(cursor1, cry, rslt1);\ 365 rslt = simd_combine256(rslt1, rslt0);\ 366 carry = sse_to_CarryType(cry);\ 367 } while(0) 368 369 370 #endif 371 372 373 #endif 
trunk/lib/carryQ.h
r947 r959 133 133 static inline BitBlock BitBlock_advance_ci_co(BitBlock strm, CarryQtype cq, const int carryno) { 134 134 BitBlock rslt; 135 advance_with_carry (strm, cq[carryno], rslt);135 advance_with_carry256(strm, cq[carryno], rslt); 136 136 return rslt; 137 137 } … … 140 140 BitBlock rslt; 141 141 cq[carryno] = Carry0; 142 advance_with_carry (strm, cq[carryno], rslt);142 advance_with_carry256(strm, cq[carryno], rslt); 143 143 return rslt; 144 144 } … … 147 147 BitBlock rslt; 148 148 CarryType c = cq[carryno]; 149 advance_with_carry (strm, c, rslt);149 advance_with_carry256(strm, c, rslt); 150 150 return rslt; 151 151 } … … 157 157 static inline BitBlock BitBlock_add_ci_co(BitBlock strm1, BitBlock strm2, CarryQtype cq, const int carryno) { 158 158 BitBlock sum; 159 adc 128(strm1, strm2, cq[carryno], sum);159 adc256(strm1, strm2, cq[carryno], sum); 160 160 return sum; 161 161 } … … 164 164 BitBlock sum; 165 165 cq[carryno] = Carry0; 166 adc 128(strm1, strm2, cq[carryno], sum);166 adc256(strm1, strm2, cq[carryno], sum); 167 167 return sum; 168 168 } … … 171 171 BitBlock sum; 172 172 CarryType c = cq[carryno]; 173 adc 128(strm1, strm2, c, sum);173 adc256(strm1, strm2, c, sum); 174 174 return sum; 175 175 } … … 178 178 BitBlock sum; 179 179 CarryType c = Carry0; 180 adc 128(strm1, strm2, c, sum);180 adc256(strm1, strm2, c, sum); 181 181 return sum; 182 182 } … … 184 184 static inline BitBlock BitBlock_sub_ci_co(BitBlock strm1, BitBlock strm2, CarryQtype cq, const int carryno) { 185 185 BitBlock diff; 186 sbb 128(strm1, strm2, cq[carryno], diff);186 sbb256(strm1, strm2, cq[carryno], diff); 187 187 return diff; 188 188 } … … 191 191 BitBlock diff; 192 192 cq[carryno] = Carry0; 193 sbb 128(strm1, strm2, cq[carryno], diff);193 sbb256(strm1, strm2, cq[carryno], diff); 194 194 return diff; 195 195 } … … 198 198 BitBlock diff; 199 199 CarryType c = cq[carryno]; 200 sbb 128(strm1, strm2, c, diff);200 sbb256(strm1, strm2, c, diff); 201 201 return diff; 202 202 } … … 205 205 BitBlock diff; 206 206 CarryType c = Carry0; 207 sbb 128(strm1, strm2, c, diff);207 sbb256(strm1, strm2, c, diff); 208 208 return diff; 209 209 } … … 211 211 static inline BitBlock BitBlock_scanthru_ci_co(BitBlock markers0, BitBlock charclass, CarryQtype cq, const int carryno) { 212 212 BitBlock markers1; 213 adc 128(markers0, charclass, cq[carryno], markers1);213 adc256(markers0, charclass, cq[carryno], markers1); 214 214 return simd_andc(markers1, charclass); 215 215 } … … 218 218 BitBlock markers1; 219 219 cq[carryno] = Carry0; 220 adc 128(markers0, charclass, cq[carryno], markers1);220 adc256(markers0, charclass, cq[carryno], markers1); 221 221 return simd_andc(markers1, charclass); 222 222 } … … 225 225 BitBlock markers1; 226 226 CarryType c = cq[carryno]; 227 adc 128(markers0, charclass, c, markers1);227 adc256(markers0, charclass, c, markers1); 228 228 return simd_andc(markers1, charclass); 229 229 } … … 232 232 BitBlock markers1; 233 233 CarryType c = Carry0; 234 adc 128(markers0, charclass, c, markers1);234 adc256(markers0, charclass, c, markers1); 235 235 return simd_andc(markers1, charclass); 236 236 } … … 238 238 static inline BitBlock BitBlock_scanto_ci_co(BitBlock markers0, BitBlock charclass, CarryQtype cq, const int carryno) { 239 239 BitBlock markers1; 240 adc 128(markers0, simd_not(charclass), cq[carryno], markers1);240 adc256(markers0, simd_not(charclass), cq[carryno], markers1); 241 241 return simd_and(markers1, charclass); 242 242 } … … 245 245 BitBlock markers1; 246 246 cq[carryno] = Carry0; 247 adc 128(markers0, simd_not(charclass), cq[carryno], markers1);247 adc256(markers0, simd_not(charclass), cq[carryno], markers1); 248 248 return simd_and(markers1, charclass); 249 249 } … … 253 253 CarryType c = cq[carryno]; 254 254 BitBlock scanclass = simd_andc(EOF_mask, charclass); 255 adc 128(markers0, scanclass, c, markers1);255 adc256(markers0, scanclass, c, markers1); 256 256 return simd_andc(markers1, scanclass); 257 257 } … … 261 261 CarryType c = Carry0; 262 262 BitBlock scanclass = simd_andc(EOF_mask, charclass); 263 adc 128(markers0, scanclass, c, markers1);263 adc256(markers0, scanclass, c, markers1); 264 264 return simd_andc(markers1, scanclass); 265 265 } 
trunk/lib/lib_simd.h
r948 r959 1 1 /* lib_simd_h: SIMD Library including idealized SIMD operations 2 Copyright (C) 20 08, Robert D. Cameron2 Copyright (C) 2011, Robert D. Cameron 3 3 Licensed to the public under the Open Software License 3.0. 4 4 Licensed to International Characters Inc. … … 16 16 #include <limits.h> 17 17 18 #ifndef LONG_BIT19 #if ULONG_MAX == 0xFFFFFFFF20 #define LONG_BIT 3221 #endif22 #if ULONG_MAX == 0xFFFFFFFFFFFFFFFF23 18 #define LONG_BIT 64 24 #endif 25 #endif 26 27 #if (defined(__i386)  defined(__x86_64)) 28 #ifdef TEMPLATED_SIMD_LIB 29 #include "sse_simd_t.h" 30 #endif 31 #ifndef TEMPLATED_SIMD_LIB 32 #include "sse_simd.h" 33 #endif 34 #endif 35 #ifdef _ARCH_PPC 36 #include "altivec_simd.h" 37 #endif 38 39 /* Useful definitions from Linux kernel*/ 40 #ifdef __GNUC__ 41 /* 42 #define likely(x) __builtin_expect((x),1) 43 #define unlikely(x) __builtin_expect((x),0) 44 */ 45 static inline long likely(long x) { 46 return __builtin_expect(x, 1); 47 } 48 static inline long unlikely(long x) { 49 return __builtin_expect(x, 0); 50 } 51 52 #endif 53 #ifdef _MSC_VER 54 #define inline __inline 55 #include "lib/sse_simd.h" 56 #define likely(x) (x) 57 #define unlikely(x) (x) 58 #endif 59 60 #ifdef TEMPLATED_SIMD_LIB 61 static inline SIMD_type sisd_sll(SIMD_type blk, SIMD_type n) { 62 return simd<128>::sll(blk, n); 63 } 64 static inline SIMD_type sisd_srl(SIMD_type blk, SIMD_type n) { 65 return simd<128>::srl(blk, n); 66 } 67 #define sisd_slli(blk, n) simd<128>::slli<n>(blk) 68 #define sisd_srli(blk, n) simd<128>::srli<n>(blk) 69 #endif 70 71 72 /* Shift forward and back operations, based on endianness */ 73 #if BYTE_ORDER == BIG_ENDIAN 74 #define sisd_sfl(blk, n) sisd_srl(blk, n) 75 #define sisd_sbl(blk, n) sisd_sll(blk, n) 76 #define sisd_sfli(blk, n) sisd_srli(blk, n) 77 #define sisd_sbli(blk, n) sisd_slli(blk, n) 78 #define sb_op(x, n) ((x)<<(n)) 79 #define sf_op(x, n) ((x)>>(n)) 80 #define cfzl __builtin_clzl 81 #define cbzl __builtin_ctzl 82 #endif 19 20 #include "../lib/avx_simd.h" 83 21 84 22 #if BYTE_ORDER == LITTLE_ENDIAN … … 89 27 #define sb_op(x, n) ((x)>>(n)) 90 28 #define sf_op(x, n) ((x)<<(n)) 29 91 30 #ifdef __GNUC__ 92 31 #define cfzl __builtin_ctzl 93 32 #define cbzl __builtin_clzl 33 #define likely(x) __builtin_expect((x),1) 34 #define unlikely(x) __builtin_expect((x),0) 94 35 #endif 95 36 #ifdef _MSC_VER … … 116 57 if (v.elems[0] != 0) return cfzl(v.elems[0]); 117 58 else if (v.elems[1] != 0) return LONG_BIT + cfzl(v.elems[1]); 118 #ifdef _MSC_VER119 59 else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]); 120 60 else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]); 121 #endif122 #ifndef _MSC_VER123 #if LONG_BIT < 64124 else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);125 else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);126 #endif127 #endif128 61 else return 8*sizeof(SIMD_type); 129 62 } … … 132 65 union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v; 133 66 v.vec = bits; 134 #if LONG_BIT == 64135 if (v.elems[1] != 0) return cbzl(v.elems[1]);136 else if (v.elems[0] != 0) return LONG_BIT + cbzl(v.elems[0]);137 #endif138 #if LONG_BIT < 64139 67 if (v.elems[3] != 0) return cbzl(v.elems[3]); 140 68 else if (v.elems[2] != 0) return LONG_BIT + cbzl(v.elems[2]); 141 69 else if (v.elems[1] != 0) return 2*LONG_BIT + cbzl(v.elems[1]); 142 70 else if (v.elems[0] != 0) return 3*LONG_BIT + cbzl(v.elems[0]); 143 #endif144 71 else return 8*sizeof(SIMD_type); 145 72 } 
trunk/lib/s2p.h
r712 r959 96 96 */ 97 97 98 99 #ifndef USE_S2P_AVX 98 100 #define s2p_step(s0, s1, hi_mask, shift, p0, p1) \ 99 101 do {\ … … 104 106 p1 = simd_if(hi_mask, simd_slli_16(t0, shift), t1);\ 105 107 } while(0) 108 #endif 109 110 111 /* For AVX, we use a modified s2p_step function to avoid a number 112 of conversions from 128bit mode to 256bit mode just to 113 immediately convert back. */ 114 #ifdef USE_S2P_AVX 115 #define sse_andc(b1, b2) _mm_andnot_si128(b2, b1) 116 #define sse_himask_16 _mm_set1_epi32(0xFF00FF00) 117 #define sse_slli_16(r, shft) _mm_slli_epi16(r, shft) 118 #define sse_srli_16(r, shft) _mm_srli_epi16(r, shft) 119 #define sse_packus_16(a, b) _mm_packus_epi16(b, a) 120 #define sse_pack_16(a, b) \ 121 _mm_packus_epi16(sse_andc(b, sse_himask_16), sse_andc(a, sse_himask_16)) 122 #define sse_pack_16_ll(v1, v2) sse_pack_16(v1, v2) 123 #define sse_pack_16_hh(v1, v2) sse_packus_16(sse_srli_16(v1, 8), sse_srli_16(v2, 8)) 124 125 #define s2p_step(s0, s1, hi_mask, shift, p0, p1) \ 126 do {\ 127 __m128i s00, s01, s10, s11, t00, t01, t10, t11;\ 128 __m128i t10shift, t11shift, t00shift, t01shift;\ 129 s00 = simd_hi128(s0);\ 130 s01 = simd_lo128(s0);\ 131 s10 = simd_hi128(s1);\ 132 s11 = simd_lo128(s1);\ 133 t00 = sse_pack_16_hh(s00, s01);\ 134 t10 = sse_pack_16_ll(s00, s01);\ 135 t01 = sse_pack_16_hh(s10, s11);\ 136 t11 = sse_pack_16_ll(s10, s11);\ 137 t10shift = sse_srli_16(t10, shift);\ 138 t11shift = sse_srli_16(t11, shift);\ 139 t00shift = sse_slli_16(t00, shift);\ 140 t01shift = sse_slli_16(t01, shift);\ 141 p0 = simd_if(hi_mask, simd_combine256(t00, t01), simd_combine256(t10shift, t11shift));\ 142 p1 = simd_if(hi_mask, simd_combine256(t00shift, t01shift), simd_combine256(t10, t11));\ 143 } while(0) 144 #endif 145 146 106 147 107 148 #define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \ … … 125 166 } while(0) 126 167 168 169 170 171 127 172 /* For sizeof(SIMD_type) = 16 */ 128 173 typedef uint16_t BitPack;
Note: See TracChangeset
for help on using the changeset viewer.