Changeset 183 for trunk/lib


Timestamp:
Jul 10, 2008, 6:17:58 PM
Author:
cameron
Message:

SIMD templated library - restructuring.

Location:
trunk/lib
Files:
2 edited

  • trunk/lib/lib_simd.h

    r179 r183  
    4949#endif
    5050#if BYTE_ORDER == LITTLE_ENDIAN
     51#ifdef TEMPLATED_SIMD_LIB
     52static inline SIMD_type sisd_sfl(SIMD_type blk, SIMD_type n) {
     53        return simd<128>::sll(blk, n);
     54}
     55
     56static inline SIMD_type sisd_sbl(SIMD_type blk, SIMD_type n) {
     57        return simd<128>::srl(blk, n);
     58}
     59#define sisd_sfli(blk, n) simd<128>::slli<n>(blk)
     60#define sisd_sbli(blk, n) simd<128>::srli<n>(blk)
     61#endif
     62#ifndef TEMPLATED_SIMD_LIB
    5163static inline SIMD_type sisd_sfl(SIMD_type blk, SIMD_type n) {
    5264        return sisd_sll(blk, n);
    5365}
    54 //#define sisd_sbl(blk, n) sisd_srl(blk, n)
     66static inline SIMD_type sisd_sbl(SIMD_type blk, SIMD_type n) {
     67        return sisd_srl(blk, n);
     68}
    5569#define sisd_sfli(blk, n) sisd_slli(blk, n)
    5670#define sisd_sbli(blk, n) sisd_srli(blk, n)
     71#endif
    5772#define sb_op(x, n) ((x)>>(n))
    5873#define sf_op(x, n) ((x)<<(n))
  • trunk/lib/sse_simd_t.h

    r182 r183  
    11/*  Idealized SIMD Operations with SSE versions
    2     Copyright (C) 2006, 2007, 2008, Robert D. Cameron
     2    Copyright (C) 2006, 2007, 2008, Robert D. Cameron and Dan Lin
    33    Licensed to the public under the Open Software License 3.0.
    44    Licensed to International Characters Inc.
     
    5252}
    5353
    54 /* Idealized operations with direct implementation by built-in
    55    operations for various target architectures. */
    56 
    57 #define simd_mult_16(a, b) _mm_mullo_epi16(a, b)
    58 #define simd_sll_64(r, shft_reg) _mm_sll_epi64(r, shft_reg)
    59 #define simd_srl_64(r, shft_reg) _mm_srl_epi64(r, shft_reg)
    60 
    61 #define simd_max_8(a, b) _mm_max_epu8(a, b)
    62 
    63 //#define sisd_add(a, b) simd_add_128(a, b)
    64 //#define sisd_sub(a, b) simd_sub_128(a, b)
     54
     55/*------------------------------------------------------------*/
     56/* II. Declarations of field-width based operations. */
     57
     58/*  Half-operand modifier specifications use "x", "h" or "l",
     59 *  "x" - no modification of the corresponding operand value
     60 *  "h" - each n-bit field is modified by taking the high n/2 bits.
     61 *  "l" - each n-bit field is modified by taking the low n/2 bits. */
     62 
     63enum HOM_t {x,h,l};
     64
     65/* simd<fw> is a template struct providing all the simd operations
     66 * for a given field width.  */
     67template <int fw>
     68struct simd {
     69        /* The himask selector in which each field is fw/2 1 bits,
     70         * followed by fw/2 0 bits. */
     71        static inline SIMD_type himask();
     72       
     73        /* Splat constant generator with compile-time constant. */
     74        template <int v> static inline SIMD_type constant();
     75        /* Splat generator using the first field of a register. */
     76        static inline SIMD_type splat(SIMD_type r);
     77       
     78        /* Shift immediate with the shift constant as a template parameter. */
     79        template <int shft> static inline SIMD_type srli(SIMD_type r);
     80        template <int shft> static inline SIMD_type slli(SIMD_type r);
     81        template <int shft> static inline SIMD_type srai(SIMD_type r);
     82       
     83        /* Shift operations with register-specified shift values. */
     84        static inline SIMD_type srl(SIMD_type r, SIMD_type shft);
     85        static inline SIMD_type sll(SIMD_type r, SIMD_type shft);
     86       
     87        /* Binary operations. */
     88        static inline SIMD_type add(SIMD_type r1, SIMD_type r2);
     89        static inline SIMD_type sub(SIMD_type r1, SIMD_type r2);
     90        static inline SIMD_type mult(SIMD_type r1, SIMD_type r2);
     91        static inline SIMD_type max(SIMD_type r1, SIMD_type r2);
     92        static inline SIMD_type eq(SIMD_type r1, SIMD_type r2);
     93        static inline SIMD_type gt(SIMD_type r1, SIMD_type r2);
     94        static inline SIMD_type pack(SIMD_type r1, SIMD_type r2);
     95        static inline SIMD_type mergeh(SIMD_type r1, SIMD_type r2);
     96        static inline SIMD_type mergel(SIMD_type r1, SIMD_type r2);
     97
     98//      /* Functions for half-operand modification. */
     99//     
     100//      template <HOM_t m> static inline SIMD_type hom(SIMD_type r);
     101//      template <HOM_t m> static inline SIMD_type hx(SIMD_type r);
     102       
     103        /* Binary operations with half-operand modifiers */
     104       
     105        template <HOM_t m1, HOM_t m2> static inline SIMD_type add(SIMD_type r1, SIMD_type r2);
     106        template <HOM_t m1, HOM_t m2> static inline SIMD_type sub(SIMD_type r1, SIMD_type r2);
     107        template <HOM_t m1, HOM_t m2> static inline SIMD_type mult(SIMD_type r1, SIMD_type r2);
     108        template <HOM_t m1, HOM_t m2> static inline SIMD_type pack(SIMD_type r1, SIMD_type r2);
     109        template <HOM_t m1, HOM_t m2> static inline SIMD_type mergeh(SIMD_type r1, SIMD_type r2);
     110        template <HOM_t m1, HOM_t m2> static inline SIMD_type mergel(SIMD_type r1, SIMD_type r2);
     111};
     112
     113#define sisd_to_int(x) _mm_cvtsi128_si32(x)
     114
     115#define sisd_from_int(n) _mm_cvtsi32_si128(n)
     116
     117
     118
     119
     120/* III.  Implementations of simd<fw> operations. */
     121
     122/* Constant generator functions for various field widths. */
     123
     124template<> inline SIMD_type simd<2>::himask() {return _mm_set1_epi8(0xAA);}
     125
     126template<> inline SIMD_type simd<4>::himask() {return _mm_set1_epi8(0xCC);}
     127
     128template<> inline SIMD_type simd<8>::himask() {return _mm_set1_epi8(0xF0);}
     129
     130template<> inline SIMD_type simd<16>::himask() {return _mm_set1_epi16(0xFF00);}
     131
     132template<> inline SIMD_type simd<32>::himask() {return _mm_set1_epi32(0xFFFF0000);}
     133
     134template<> inline SIMD_type simd<64>::himask() {return _mm_set_epi32(-1,0,-1,0);}
     135
     136template<> inline SIMD_type simd<128>::himask() {return _mm_set_epi32(-1,-1,0,0);}
     137
     138template<> template <int n> inline SIMD_type simd<4>::constant() {return _mm_set1_epi8((n)<<4|(n));}
     139
     140template<> template <int n> inline SIMD_type simd<8>::constant() {return _mm_set1_epi8(n);}
     141
     142template<> template <int n> inline SIMD_type simd<16>::constant() {return _mm_set1_epi16(n);}
     143
     144template<> template <int n> inline SIMD_type simd<32>::constant() {return _mm_set1_epi32(n);}
     145
     146template<> template <> inline SIMD_type simd<1>::constant<0>() {return simd<8>::constant<0>();}
     147template<> template <> inline SIMD_type simd<1>::constant<1>() {return simd<8>::constant<-1>();}
     148
     149template<> template <int n> inline SIMD_type simd<2>::constant() {return simd<4>::constant<(n<<2|n)>();}
     150
     151// Splat the first 16-bit int into all positions.
     152template <> inline SIMD_type simd<16>::splat(SIMD_type x) {
     153  SIMD_type t = _mm_shufflelo_epi16(x,0);
     154  return _mm_shuffle_epi32(t,0);
     155}
     156
     157// Splat the first 32-bit int into all positions.
     158template <> inline SIMD_type simd<32>::splat(SIMD_type x) {
     159  return _mm_shuffle_epi32(x,0);
     160}
     161
     162/* Shift immediate operations with direct implementation by built-ins. */
     163
     164template<> template<int sh> inline SIMD_type simd<16>::slli(SIMD_type r) {return _mm_slli_epi16(r, sh);}
     165
     166template<> template<int sh> inline SIMD_type simd<32>::slli(SIMD_type r) {return _mm_slli_epi32(r, sh);}
     167
     168template<> template<int sh> inline SIMD_type simd<64>::slli(SIMD_type r) {return _mm_slli_epi64(r, sh);}
     169
     170template<> template<int sh> inline SIMD_type simd<16>::srli(SIMD_type r) {return _mm_srli_epi16(r, sh);}
     171
     172template<> template<int sh> inline SIMD_type simd<32>::srli(SIMD_type r) {return _mm_srli_epi32(r, sh);}
     173
     174template<> template<int sh> inline SIMD_type simd<64>::srli(SIMD_type r) {return _mm_srli_epi64(r, sh);}
     175
     176/* simd_srai
     177 * fw: 16,32*/
     178template<> template<int sh> inline SIMD_type simd<16>::srai(SIMD_type r) {return _mm_srai_epi16(r, sh);}
     179
     180template<> template<int sh> inline SIMD_type simd<32>::srai(SIMD_type r) {return _mm_srai_epi32(r, sh);}
     181                 
     182
     183
     184/* General rules for slli/srli for field widths 2, 4, 8 in terms of 32-bit shifts. */
     185
     186
     187// Doesn't work:
     188//template<int fw> template<int sh>
     189//inline SIMD_type simd<fw>::slli(SIMD_type r) {
     190//      return simd_and(simd<32>::slli<sh>(r), simd<fw>::constant<6>());
     191//}
     192//
     193
     194
     195template<> template<int sh>
     196inline SIMD_type simd<2>::slli(SIMD_type r) {
     197        return simd_and(simd<32>::slli<sh>(r),simd<2>::constant<((3<<sh)&3)>());
     198}
     199
     200template<> template<int sh>
     201inline SIMD_type simd<4>::slli(SIMD_type r) {
     202        return simd_and(simd<32>::slli<sh>(r),simd<4>::constant<((15<<sh)&15)>());
     203}
     204
     205template<> template<int sh>
     206inline SIMD_type simd<8>::slli(SIMD_type r) {
     207        return simd_and(simd<32>::slli<sh>(r),simd<8>::constant<((255<<sh)&255)>());
     208}
     209
     210
     211//template<int fw> template<int sh>
     212//inline SIMD_type simd<fw>::srli(SIMD_type r) {
     213//      return simd_and(simd<32>::srli<sh>(r),simd<fw>::constant<((1<<(fw-sh))-1)>());
     214//}
     215//
     216
     217
     218template<> template<int sh>
     219inline SIMD_type simd<2>::srli(SIMD_type r) {
     220        return simd_and(simd<32>::srli<sh>(r),simd<2>::constant<(3>>sh)>());
     221}
     222
     223template<> template<int sh>
     224inline SIMD_type simd<4>::srli(SIMD_type r) {
     225        return simd_and(simd<32>::srli<sh>(r),simd<4>::constant<(15>>sh)>());
     226}
     227
     228template<> template<int sh>
     229inline SIMD_type simd<8>::srli(SIMD_type r) {
     230        return simd_and(simd<32>::srli<sh>(r),simd<8>::constant<(255>>sh)>());
     231}
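
/* Editorial usage sketch, not part of the library: the narrow-field shift rule above
   performs the shift at 32-bit width and masks off the bits that cross in from the
   neighbouring field.  For sh = 2 on 4-bit fields the mask constant is 15 >> 2 = 3. */
static inline SIMD_type nibble_srli2_sketch(SIMD_type r) {
        return simd<4>::srli<2>(r);  /* same value as simd_and(simd<32>::srli<2>(r), simd<4>::constant<3>()) */
}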
     232
     233
     234
     235
     236/* Shift immediate for 128-bit fields */
     237
     238template<> template<int shft>
     239inline SIMD_type simd<128>::slli(SIMD_type r) {
     240        return (shft % 8 == 0 ? _mm_slli_si128(r, shft/8) :
     241                shft >= 64 ? simd<64>::slli<shft-64>(_mm_slli_si128(r, 8)) :
     242                simd_or(simd<64>::slli<shft>(r), _mm_slli_si128(simd<64>::srli<64-shft>(r), 8)));
     243}
     244
     245template<> template<int shft>
     246inline SIMD_type simd<128>::srli(SIMD_type r) {
     247        return (shft % 8 == 0 ? _mm_srli_si128(r, shft/8) :
     248                shft >= 64 ? simd<64>::srli<shft-64>(_mm_srli_si128(r, 8)) :
     249                simd_or(simd<64>::srli<shft>(r), _mm_srli_si128(simd<64>::slli<64-shft>(r), 8)));
     250}
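
/* Editorial usage sketch, not part of the library: a one-bit shift of the whole block
   takes the third branch above, shifting each 64-bit half and carrying the bit that
   crosses the 64-bit boundary explicitly. */
static inline SIMD_type block_slli1_sketch(SIMD_type r) {
        return simd<128>::slli<1>(r);
}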
     251
     252
     253/* Shifts with shift values specified in an operand register. */
     254
     255template<>
     256inline SIMD_type simd<128>::srl(SIMD_type r, SIMD_type shft) {
     257        return simd_or(_mm_srl_epi64(r, shft),
     258                       simd_or(_mm_srli_si128(_mm_srl_epi64(r, _mm_sub_epi32(shft, sisd_from_int(64))), 8),
     259                               _mm_srli_si128(_mm_sll_epi64(r, _mm_sub_epi32(sisd_from_int(64), shft)), 8)));
     260}
     261
     262template<>
     263inline SIMD_type simd<128>::sll(SIMD_type r, SIMD_type shft) {
     264        return simd_or(_mm_sll_epi64(r, shft),
     265                       simd_or(_mm_slli_si128(_mm_sll_epi64(r, _mm_sub_epi32(shft, sisd_from_int(64))), 8),
     266                               _mm_slli_si128(_mm_srl_epi64(r, _mm_sub_epi32(sisd_from_int(64), shft)), 8)));
     267}
     268
     269template<>
     270inline SIMD_type simd<64>::srl(SIMD_type r, SIMD_type shft) {
     271        return simd_if(simd<128>::himask(),
     272                       _mm_srl_epi64(r, _mm_srli_si128(shft, 8)),
     273                       _mm_srl_epi64(r, simd_andc(shft, simd<128>::himask())));
     274}
     275
     276template<>
     277inline SIMD_type simd<64>::sll(SIMD_type r, SIMD_type shft) {
     278        return simd_if(simd<128>::himask(),
     279                       _mm_sll_epi64(r, _mm_srli_si128(shft, 8)),
     280                       _mm_sll_epi64(r, simd_andc(shft, simd<128>::himask())));
     281}
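
/* Editorial usage sketch, not part of the library: a run-time shift amount can be
   supplied through a register using the sisd_from_int conversion defined above. */
static inline SIMD_type block_sll_dynamic_sketch(SIMD_type blk, int n) {
        return simd<128>::sll(blk, sisd_from_int(n));  /* shift the full block left by n bits, 0 <= n < 128 */
}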
     282
     283
     284/* simd_add
     285 * fw: 2,4,8,16,32,64
     286
     287   Use built-ins for 8, 16, 32, 64, simulations for 2, 4. */
     288
     289template<> inline SIMD_type simd<8>::add(SIMD_type r1, SIMD_type r2) {return _mm_add_epi8(r1, r2);}
     290
     291template<> inline SIMD_type simd<16>::add(SIMD_type r1, SIMD_type r2) {return _mm_add_epi16(r1, r2);}
     292
     293template<> inline SIMD_type simd<32>::add(SIMD_type r1, SIMD_type r2) {return _mm_add_epi32(r1, r2);}
     294
     295template<> inline SIMD_type simd<64>::add(SIMD_type r1, SIMD_type r2) {return _mm_add_epi64(r1, r2);}
     296
     297template<>
     298inline SIMD_type simd<2>::add(SIMD_type r1, SIMD_type r2) {
     299         SIMD_type c1 = simd_xor(r1,r2);
     300         SIMD_type borrow = simd_and(r1,r2);
     301         SIMD_type c2 = simd_xor(c1,(simd<128>::slli<1>(borrow)));
     302         return simd_if(simd<2>::himask(),c2,c1);
     303}
     304
     305template<>
      306inline SIMD_type simd<4>::add(SIMD_type r1, SIMD_type r2) {
     307        return simd_if(simd<8>::himask(),
     308                       simd<8>::add(r1,simd_and(r2,simd<8>::himask())),
     309                       simd<8>::add(r1, r2));
     310}
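
/* Editorial illustration, not library code: the 2-bit addition above is a half-adder.
   The same trick on 2-bit fields packed into an ordinary 32-bit word: */
static inline unsigned int add_2bit_fields_sketch(unsigned int r1, unsigned int r2) {
        unsigned int himask2 = 0xAAAAAAAAu;        /* high bit of every 2-bit field */
        unsigned int c1 = r1 ^ r2;                 /* per-bit sums, ignoring carries */
        unsigned int c2 = c1 ^ ((r1 & r2) << 1);   /* fold the carry out of each low bit into the high bit */
        return (himask2 & c2) | (~himask2 & c1);   /* simd_if: high bit from c2, low bit from c1 */
}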
     311
     312/* simd_sub
     313 * fw: 2,4,8,16,32,64
     314
     315   Use built-ins for 8, 16, 32, 64, simulations for 2, 4. */
     316
     317template<> inline SIMD_type simd<8>::sub(SIMD_type r1, SIMD_type r2) {return _mm_sub_epi8(r1, r2);}
     318
     319template<> inline SIMD_type simd<16>::sub(SIMD_type r1, SIMD_type r2) {return _mm_sub_epi16(r1, r2);}
     320
     321template<> inline SIMD_type simd<32>::sub(SIMD_type r1, SIMD_type r2) {return _mm_sub_epi32(r1, r2);}
     322
     323template<> inline SIMD_type simd<64>::sub(SIMD_type r1, SIMD_type r2) {return _mm_sub_epi64(r1, r2);}
     324
     325
     326template<>
     327inline SIMD_type simd<2>::sub(SIMD_type r1, SIMD_type r2)
     328{
     329         SIMD_type c1 = simd_xor(r1,r2);
     330         SIMD_type borrow = simd_andc(r2,r1);
     331         SIMD_type c2 = simd_xor(c1,(simd<128>::slli<1>(borrow)));
     332         return simd_if(simd<2>::himask(),c2,c1);
     333}
     334
     335template<>
     336inline SIMD_type simd<4>::sub(SIMD_type r1, SIMD_type r2){
     337        return simd_if(simd<8>::himask(),
     338                       simd<8>::sub(r1, simd_and(r2,simd<8>::himask())),
     339                       simd<8>::sub(r1, r2));
     340}
     341
     342/* simd_mult for 16 bits only. */
     343
     344template<> inline SIMD_type simd<16>::mult(SIMD_type r1, SIMD_type r2) {return _mm_mullo_epi16(r1, r2);}
     345
     346/* simd_max for 8 bits only. */
     347
     348template<> inline SIMD_type simd<8>::max(SIMD_type r1, SIMD_type r2) {return _mm_max_epu8(r1, r2);}
     349
     350
     351/* simd_eq
     352 * fw: 8,16,32*/
     353
     354template<> inline SIMD_type simd<8>::eq(SIMD_type r1, SIMD_type r2) {return _mm_cmpeq_epi8(r1, r2);}
     355
     356template<> inline SIMD_type simd<16>::eq(SIMD_type r1, SIMD_type r2) {return _mm_cmpeq_epi16(r1, r2);}
     357
     358template<> inline SIMD_type simd<32>::eq(SIMD_type r1, SIMD_type r2) {return _mm_cmpeq_epi32(r1, r2);}
     359
     360
     361
     362/*simd_pack
     363 * fw: 2,4,8,16*/
     364
     365/* Built-in operation for fw = 16. */
     366template<>
     367inline SIMD_type simd<16>::pack(SIMD_type r1, SIMD_type r2) {
     368        return _mm_packus_epi16(simd_andc(r2, simd<16>::himask()), simd_andc(r1, simd<16>::himask()));
     369}
     370
     371/* fw: 2, 4, 8 */
     372template<int fw>
      373inline SIMD_type simd<fw>::pack(SIMD_type r1, SIMD_type r2){
     374        return simd<fw*2>::pack(simd_if(simd<fw>::himask(),simd<128>::srli<fw/2>(r1),r1),
     375                                simd_if(simd<fw>::himask(),simd<128>::srli<fw/2>(r2),r2));
     376}
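
/* Editorial usage sketch, not part of the library: the built-in case packs two registers
   of 16-bit fields into one register of 8-bit fields by truncation (r2's fields land in
   the low half of the result); narrower widths reduce to it by the halving rule above. */
static inline SIMD_type pack_16_to_8_sketch(SIMD_type r1, SIMD_type r2) {
        return simd<16>::pack(r1, r2);
}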
     377
     378/* simd_mergeh
     379 * fw: 1,2,4,8,16,32,64*/
     380template<int fw>
     381inline SIMD_type simd<fw>::mergeh(SIMD_type r1, SIMD_type r2){
     382        /*fw: 1,2,4*/
     383        return simd<fw*2>::mergeh(simd_if(simd<fw*2>::himask(),r1,simd<fw*2>::srli<fw>(r2)),
     384                                  simd_if(simd<fw*2>::himask(),simd<fw*2>::slli<fw>(r1),r2));
     385}
     386
     387template<> inline SIMD_type simd<8>::mergeh(SIMD_type r1, SIMD_type r2) {return _mm_unpackhi_epi8(r2, r1);}
     388template<> inline SIMD_type simd<16>::mergeh(SIMD_type r1, SIMD_type r2) {return _mm_unpackhi_epi16(r2, r1);}
     389template<> inline SIMD_type simd<32>::mergeh(SIMD_type r1, SIMD_type r2) {return _mm_unpackhi_epi32(r2, r1);}
     390template<> inline SIMD_type simd<64>::mergeh(SIMD_type r1, SIMD_type r2) {return _mm_unpackhi_epi64(r2, r1);}
     391
     392
     393/* simd_mergel
     394 * fw: 1,2,4,8,16,32,64*/
     395template<int fw>
     396inline SIMD_type simd<fw>::mergel(SIMD_type r1, SIMD_type r2){
     397        /*fw: 1,2,4*/
     398        return simd<fw*2>::mergel(simd_if(simd<fw*2>::himask(),r1,simd<fw*2>::srli<fw>(r2)),
     399                                  simd_if(simd<fw*2>::himask(),simd<fw*2>::slli<fw>(r1),r2));
     400}
     401
     402template<> inline SIMD_type simd<8>::mergel(SIMD_type r1, SIMD_type r2) {return _mm_unpacklo_epi8(r2, r1);}
     403template<> inline SIMD_type simd<16>::mergel(SIMD_type r1, SIMD_type r2) {return _mm_unpacklo_epi16(r2, r1);}
     404template<> inline SIMD_type simd<32>::mergel(SIMD_type r1, SIMD_type r2) {return _mm_unpacklo_epi32(r2, r1);}
     405template<> inline SIMD_type simd<64>::mergel(SIMD_type r1, SIMD_type r2) {return _mm_unpacklo_epi64(r2, r1);}
     406
     407
     408
     409
     410
     411#define simd_all_eq_8(v1, v2) simd_all_true_8(_mm_cmpeq_epi8(v1, v2))
     412#define simd_all_le_8(v1, v2) simd_all_eq_8(simd_max_8(v1, v2), v2)
     413
     414#define simd_all_signed_gt_8(v1, v2) simd_all_true_8(_mm_cmpgt_epi8(v1, v2))
     415
     416#define simd_cmpgt_8(v1,v2) _mm_cmpgt_epi8(v1, v2)
     417
     418
     419
     420/* simd_all_true
     421 * fw: 8*/
     422template<int fw>
     423static inline int simd_all_true(SIMD_type r);
     424template<>
     425static inline int simd_all_true<8>(SIMD_type r) {
     426        return _mm_movemask_epi8(r) == 0xFFFF;
     427}
     428
     429/* simd_any_true
     430 * fw: 8*/
     431template<int fw>
     432static inline int simd_any_true(SIMD_type r);
     433template<>
     434static inline int simd_any_true<8>(SIMD_type r) {
     435        return _mm_movemask_epi8(r) != 0;
     436}
     437
     438/* simd_any_sign_bit
     439 * fw: 8*/
     440template<int fw>
     441static inline int simd_any_sign_bit(SIMD_type r);
     442template<>
     443static inline int simd_any_sign_bit<8>(SIMD_type r) {
     444        return _mm_movemask_epi8(r) != 0;
     445}
     446
     447
     448
     449/* IV.  Half operand modifiers - implementations. */
     450/* Half operand modifier functions.*/
     451
     452/* Half operand modifier*/
     453/* Half operand modifier*/
     454template <int fw, HOM_t m>
     455struct SIMD {
     456        static inline SIMD_type hom(SIMD_type r) {}
     457};
     458
     459template <int fw>
     460struct SIMD<fw, x> {
     461        static inline SIMD_type hom(SIMD_type r) {return r;}
     462        static inline SIMD_type l2x(SIMD_type r) {return r;}
     463};
     464
     465template <int fw>
     466struct SIMD<fw, l> {
     467        static inline SIMD_type hom(SIMD_type r) {return simd_andc(r, simd<fw>::himask());}
     468        static inline SIMD_type l2x(SIMD_type r) {return r;}
     469};
     470
     471//template <int fw>
     472//struct SIMD<fw, h> {
     473//      static inline SIMD_type hom(SIMD_type r) {return simd<fw>::srli<fw/2>(r);}
     474//      static inline SIMD_type l2x(SIMD_type r) {return simd<fw>::srli<fw/2>(r);}
     475//};
     476//
     477template <>
     478struct SIMD<2, h> {
     479        static inline SIMD_type hom(SIMD_type r) {return simd<2>::srli<1>(r);}
     480        static inline SIMD_type l2x(SIMD_type r) {return simd<2>::srli<1>(r);}
     481};
     482
     483template <>
     484struct SIMD<4, h> {
     485        static inline SIMD_type hom(SIMD_type r) {return simd<4>::srli<2>(r);}
     486        static inline SIMD_type l2x(SIMD_type r) {return simd<4>::srli<2>(r);}
     487};
     488
     489template <>
     490struct SIMD<8, h> {
     491        static inline SIMD_type hom(SIMD_type r) {return simd<8>::srli<4>(r);}
     492        static inline SIMD_type l2x(SIMD_type r) {return simd<8>::srli<4>(r);}
     493};
     494
     495template <>
     496struct SIMD<16, h> {
     497        static inline SIMD_type hom(SIMD_type r) {return simd<16>::srli<8>(r);}
     498        static inline SIMD_type l2x(SIMD_type r) {return simd<16>::srli<8>(r);}
     499};
     500
     501template <>
     502struct SIMD<32, h> {
     503        static inline SIMD_type hom(SIMD_type r) {return simd<32>::srli<16>(r);}
     504        static inline SIMD_type l2x(SIMD_type r) {return simd<32>::srli<16>(r);}
     505};
     506
     507
     508/* SIMD operations extended with HOM*/
     509template<int fw> template <HOM_t m1, HOM_t m2>
     510inline SIMD_type simd<fw>::add(SIMD_type r1, SIMD_type r2){
      511        return simd<fw>::add(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
     512}
     513
     514template<int fw> template <HOM_t m1, HOM_t m2>
     515inline SIMD_type simd<fw>::sub(SIMD_type r1, SIMD_type r2){
      516        return simd<fw>::sub(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
     517}
     518
     519template<int fw> template <HOM_t m1, HOM_t m2>
     520inline SIMD_type simd<fw>::pack(SIMD_type r1, SIMD_type r2){
      521        return simd<fw>::pack(SIMD<fw,m1>::l2x(r1),SIMD<fw,m2>::l2x(r2));
     522}
     523
     524template<int fw> template <HOM_t m1, HOM_t m2>
     525inline SIMD_type simd<fw>::mergeh(SIMD_type r1, SIMD_type r2){
      526        return simd<fw>::mergeh(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
     527}
     528
     529template<int fw> template <HOM_t m1, HOM_t m2>
     530inline SIMD_type simd<fw>::mergel(SIMD_type r1, SIMD_type r2){
      531        return simd<fw>::mergel(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
     532}
     533
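
/* Editorial usage sketch of the simd<fw> interface and the half-operand modifiers
   declared in section II (illustration only; r1 and r2 are arbitrary 128-bit values). */
static inline SIMD_type simd_interface_sketch(SIMD_type r1, SIMD_type r2) {
        SIMD_type sums  = simd<8>::add(r1, r2);        /* sixteen independent 8-bit additions */
        SIMD_type highs = simd<16>::srli<8>(r1);       /* high byte of each 16-bit field */
        SIMD_type hl    = simd<16>::add<h,l>(r1, r2);  /* high halves of r1 fields plus low halves of r2 fields */
        return simd<8>::mergeh(sums, simd<8>::mergeh(highs, hl));  /* interleave the high bytes */
}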
     534
     535//
     536//template <HOM_t m>
     537//struct HOM {
     538//template<int fw> SIMD_type hom(SIMD_type r) {return r;}
     539//template<int fw> SIMD_type l2x(SIMD_type r) {return r;}
     540//};
     541//
     542//template <>
     543//template <int fw>
     544//SIMD_type HOM<l>::hom(SIMD_type r) {return simd_andc(r, simd<fw>::himask());}
     545//
     546//template <>
     547//template <int fw>
     548//SIMD_type HOM<h>::hom(SIMD_type r) {return simd<fw>::srli<fw/2>(r);}
     549//
     550//template <>
     551//template <int fw>
     552//SIMD_type HOM<h>::l2x(SIMD_type r) {return simd<fw>::srli<fw/2>(r);}
     553//
     554//
     555///* SIMD operations extended with Half-Operand Modifiers */
     556//
     557//template<int fw> template <HOM_t m1, HOM_t m2>
     558//inline SIMD_type simd<fw>::add(SIMD_type r1, SIMD_type r2){
     559//      return simd<fw>::add(HOM<m1>::hom<fw>, HOM<m2>::hom<fw>(r2));
     560//}
     561//
     562//template<int fw> template <HOM_t m1, HOM_t m2>
     563//inline SIMD_type simd<fw>::sub(SIMD_type r1, SIMD_type r2){
     564//      return simd<fw>::sub(HOM<m1>::hom<fw>, HOM<m2>::hom<fw>(r2));
     565//}
     566//
     567//template<int fw> template <HOM_t m1, HOM_t m2>
     568//inline SIMD_type simd<fw>::mult(SIMD_type r1, SIMD_type r2){
     569//      return simd<fw>::mult(HOM<m1>::hom<fw>, HOM<m2>::hom<fw>(r2));
     570//}
     571//
     572//template<int fw> template <HOM_t m1, HOM_t m2>
     573//inline SIMD_type simd<fw>::pack(SIMD_type r1, SIMD_type r2){
     574//      return simd<fw>::pack(HOM<m1>::l2x<fw>, HOM<m2>::hom<fw>::hom(r2));
     575//}
     576//
     577//template<int fw> template <HOM_t m1, HOM_t m2>
     578//inline SIMD_type simd<fw>::mergeh(SIMD_type r1, SIMD_type r2){
     579//      return simd<fw>::mergeh(HOM<m1>::hom<fw>, HOM<m2>::hom<fw>::hom(r2));
     580//}
     581//
     582//template<int fw> template <HOM_t m1, HOM_t m2>
     583//inline SIMD_type simd<fw>::mergel(SIMD_type r1, SIMD_type r2){
     584//      return simd<fw>::mergel(HOM<m1>::hom<fw>, HOM<m2>::hom<fw>::hom(r2));
     585//}
     586 
     587/* V.  sisd operations on full 128-bit register width. */
     588
     589//struct sisd {
     590//      template <int shft> inline SIMD_type slli(SIMD_type r) {return simd<128>::slli<shft>(r);}
     591//      template <int shft> inline SIMD_type srli(SIMD_type r) {return simd<128>::srli<shft>(r);}
     592//      inline SIMD_type sll(SIMD_type r, SIMD_type shft) {return simd<128>::sll<shft>(r, shft);}
     593//      inline SIMD_type srl(SIMD_type r, SIMD_type shft) {return simd<128>::srl<shft>(r, shft);}
     594//};
     595
    65596
    66597#define sisd_store_aligned(r, addr) _mm_store_si128(addr, r)
     
    74605#endif
    75606
    76 #define sisd_to_int(x) _mm_cvtsi128_si32(x)
    77 
    78 #define sisd_from_int(n) _mm_cvtsi32_si128(n)
    79 
    80 
    81 #define simd_all_eq_8(v1, v2) simd_all_true_8(_mm_cmpeq_epi8(v1, v2))
    82 #define simd_all_le_8(v1, v2) simd_all_eq_8(simd_max_8(v1, v2), v2)
    83 
    84 #define simd_all_signed_gt_8(v1, v2) simd_all_true_8(_mm_cmpgt_epi8(v1, v2))
    85 
    86 #define simd_cmpgt_8(v1,v2) _mm_cmpgt_epi8(v1, v2)
    87607
    88608#define bitblock_test_bit(blk, n) \
    89609   sisd_to_int(sisd_srli(sisd_slli(blk, ((BLOCKSIZE-1)-(n))), BLOCKSIZE-1))
    90 
    91 // Splat the first 16-bit int into all positions.
    92 static inline SIMD_type simd_splat_16(SIMD_type x) {
    93   SIMD_type t = _mm_shufflelo_epi16(x,0);
    94   return _mm_shuffle_epi32(t,0);
    95 }
    96 
    97 // Splat the first 32-bit int into all positions.
    98 static inline SIMD_type simd_splat_32(SIMD_type x) {
    99   return _mm_shuffle_epi32(x,0);
    100 }
    101610
    102611
     
    115624
    116625
    117 // Linda says that this is way too complex!
    118 //
    119 //#define SIMD_DEFINE(op, intrinsic, fw) \
    120 //template <int w> \
    121 //SIMD_type simd_ ## op (SIMD_type r1, SIMD_type r2); \
    122 //template <> \
    123 //SIMD_type simd_ ## op < fw > (SIMD_type r1, SIMD_type r2) {\
    124 //      return intrinsic ## fw(r1, r2);\
    125 //}
    126 //
    127 //#define SIMD_DEFINE_8_16_32(op, intrinsic)\
    128 //SIMD_DEFINE(op, intrinsic, 8) \
    129 //SIMD_DEFINE(op, intrinsic, 16)\
    130 //SIMD_DEFINE(op, intrinsic, 32)
    131 //
    132 //SIMD_DEFINE_8_16_32(sub, _mm_sub_epi)
    133 //SIMD_DEFINE_8_16_32(eq, _mm_cmpeq_epi)
    134 
    135 /* simd_himask
    136  * fw: 2,4,8,16,32,64,128*/
    137 template<int fw>
    138 static inline SIMD_type simd_himask();
    139 
    140 template<>
    141 static inline SIMD_type simd_himask<2>() {
    142         return _mm_set1_epi8(0xAA);
    143 }
    144 template<>
    145 static inline SIMD_type simd_himask<4>() {
    146         return _mm_set1_epi8(0xCC);
    147 }
    148 template<>
    149 static inline SIMD_type simd_himask<8>() {
    150         return _mm_set1_epi8(0xF0);
    151 }
    152 template<>
    153 static inline SIMD_type simd_himask<16>() {
    154         return _mm_set1_epi16(0xFF00);
    155 }
    156 template<>
    157 static inline SIMD_type simd_himask<32>() {
    158         return _mm_set1_epi32(0xFFFF0000);
    159 }
    160 template<>
    161 static inline SIMD_type simd_himask<64>() {
    162         return _mm_set_epi32(-1,0,-1,0);
    163 }
    164 template<>
    165 static inline SIMD_type simd_himask<128>() {
    166         return _mm_set_epi32(-1,-1,0,0);
    167 }
    168 
    169 /* simd_const
    170  * fw: 2,4,8,16,32*/
    171 template<int fw>
    172 static inline SIMD_type simd_const(int n);
    173 template<>
    174 static inline SIMD_type simd_const<4>(int n) {
    175         return _mm_set1_epi8((n)<<4|(n));
    176 }
    177 template<>
    178 static inline SIMD_type simd_const<8>(int n) {
    179         return _mm_set1_epi8(n);
    180 }
    181 template<>
    182 static inline SIMD_type simd_const<16>(int n) {
    183         return _mm_set1_epi16(n);
    184 }
    185 template<>
    186 static inline SIMD_type simd_const<32>(int n) {
    187         return _mm_set1_epi32(n);
    188 }
    189 template<>
    190 static inline SIMD_type simd_const<1>(int n) {
    191         if(n==0) return simd_const<8>(0);
    192         else return simd_const<8>(-1);
    193 }
    194 template<>
    195 static inline SIMD_type simd_const<2>(int n) {
    196         return simd_const<4>(n<<2|n);
    197 }
    198 
    199 template<int fw, int val>
    200 static inline SIMD_type simd_const();
    201 
    202 template<>
    203 static inline SIMD_type simd_const<1, 0>() {
    204         return simd_const<8>(0);
    205 }
    206 
    207 template<>
    208 static inline SIMD_type simd_const<1, 1>() {
    209         return simd_const<8>(-1);
    210 }
    211 
    212 
    213 /* simd_srli
    214  * fw: 2,4,8,16,32,64*/
    215 template<int fw>
    216 static inline SIMD_type simd_srli(SIMD_type r, int sh);
    217 template<>
    218 static inline SIMD_type simd_srli<16>(SIMD_type r, int sh) {
    219         return _mm_srli_epi16(r, sh);
    220 }
    221 template<>
    222 static inline SIMD_type simd_srli<32>(SIMD_type r, int sh) {
    223         return _mm_srli_epi32(r, sh);
    224 }
    225 template<>
    226 static inline SIMD_type simd_srli<64>(SIMD_type r, int sh) {
    227         return _mm_srli_epi64(r, sh);
    228 }
    229 template<>
    230 static inline SIMD_type simd_srli<2>(SIMD_type r, int sh) {
    231         return simd_and(simd_srli<32>(r,sh),simd_const<2>(3>>sh));
    232 }
    233 template<>
    234 static inline SIMD_type simd_srli<4>(SIMD_type r, int sh) {
    235         return simd_and(simd_srli<32>(r,sh),simd_const<4>(15>>sh));
    236 }
    237 template<>
    238 static inline SIMD_type simd_srli<8>(SIMD_type r, int sh) {
    239         return simd_and(simd_srli<32>(r,sh),simd_const<8>(255>>sh));
    240 }
    241 
    242 /* simd_slli
    243  * fw: 2,4,8,16,32,64*/
    244 template<int fw>
    245 static inline SIMD_type simd_slli(SIMD_type r, int sh);
    246 template<>
    247 static inline SIMD_type simd_slli<16>(SIMD_type r, int sh) {
    248         return _mm_slli_epi16(r, sh);
    249 }
    250 template<>
    251 static inline SIMD_type simd_slli<32>(SIMD_type r, int sh) {
    252         return _mm_slli_epi32(r, sh);
    253 }
    254 template<>
    255 static inline SIMD_type simd_slli<64>(SIMD_type r, int sh) {
    256         return _mm_slli_epi64(r, sh);
    257 }
    258 template<>
    259 static inline SIMD_type simd_slli<2>(SIMD_type r, int sh) {
    260         return simd_and(simd_slli<32>(r,sh),simd_const<2>((3<<sh)&3));
    261 }
    262 template<>
    263 static inline SIMD_type simd_slli<4>(SIMD_type r, int sh) {
    264         return simd_and(simd_slli<32>(r,sh),simd_const<4>((15<<sh)&15));
    265 }
    266 template<>
    267 static inline SIMD_type simd_slli<8>(SIMD_type r, int sh) {
    268         return simd_and(simd_slli<32>(r,sh),simd_const<8>((255<<sh) &255));
    269 }
    270 
    271 #define simd_slli_128(r, shft) \
    272   ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
    273    (shft) >= 64 ? simd_slli<64>(_mm_slli_si128(r, 8), (shft) - 64) : \
    274    simd_or(simd_slli<64>(r, shft), _mm_slli_si128(simd_srli<64>(r, 64-(shft)), 8)))
    275 
    276 #define simd_srli_128(r, shft) \
    277   ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
    278    (shft) >= 64 ? simd_srli<64>(_mm_srli_si128(r, 8), (shft) - 64) : \
    279    simd_or(simd_srli<64>(r, shft), _mm_srli_si128(simd_slli<64>(r, 64-(shft)), 8)))
    280 
    281 #define simd_sll_128(r, shft) \
    282    simd_or(simd_sll_64(r, shft), \
    283            simd_or(_mm_slli_si128(simd_sll_64(r, simd_sub<32>(shft, sisd_from_int(64))), 8), \
    284                    _mm_slli_si128(simd_srl_64(r, simd_sub<32>(sisd_from_int(64), shft)), 8)))
    285 
    286 #define simd_srl_128(r, shft) \
    287    simd_or(simd_srl_64(r, shft), \
    288            simd_or(_mm_srli_si128(simd_srl_64(r, simd_sub<32>(shft, sisd_from_int(64))), 8), \
    289                    _mm_srli_si128(simd_sll_64(r, simd_sub<32>(sisd_from_int(64), shft)), 8)))
    290 
    291 #define sisd_sll(r, shft) simd_sll_128(r, shft)
    292 #define sisd_srl(r, shft) simd_srl_128(r, shft)
    293 #define sisd_slli(r, shft) simd_slli_128(r, shft)
    294 #define sisd_srli(r, shft) simd_srli_128(r, shft)
    295 
    296 
    297 /* simd_srai
    298  * fw: 16,32*/
    299 template<int fw>
    300 static inline SIMD_type simd_srai(SIMD_type r, int sh);
    301 template<>
    302 static inline SIMD_type simd_srai<16>(SIMD_type r, int sh) {
    303         return _mm_srai_epi16(r, sh);
    304 }
    305 template<>
    306 static inline SIMD_type simd_srai<32>(SIMD_type r, int sh) {
    307         return _mm_srai_epi32(r, sh);
    308 }
    309                  
    310 /* simd_add
    311  * fw: 2,4,8,16,32,64*/
    312 template<int fw>
    313 static inline SIMD_type simd_add(SIMD_type r1, SIMD_type r2);
    314 template<>
    315 static inline SIMD_type simd_add<2>(SIMD_type r1, SIMD_type r2) {
    316          SIMD_type c1 = simd_xor(r1,r2);
    317          SIMD_type borrow = simd_and(r1,r2);
    318          SIMD_type c2 = simd_xor(c1,(sisd_slli(borrow,1)));
    319          return simd_if(simd_himask<2>(),c2,c1);
    320 }
    321 template<>
    322 static inline SIMD_type simd_add<8>(SIMD_type r1, SIMD_type r2) {
    323         return _mm_add_epi8(r1, r2);
    324 }
    325 template<>
    326 static inline SIMD_type simd_add<4>(SIMD_type r1, SIMD_type r2) {
    327         return simd_if(simd_himask<8>(), simd_add<8>(simd_and(r1,simd_himask<8>()),simd_and(r2,simd_himask<8>()))
    328         ,simd_add<8>(simd_andc(r1,simd_himask<8>()),simd_andc(r2,simd_himask<8>())));
    329 }
    330 template<>
    331 static inline SIMD_type simd_add<16>(SIMD_type r1, SIMD_type r2) {
    332         return _mm_add_epi16(r1, r2);
    333 }
    334 template<>
    335 static inline SIMD_type simd_add<32>(SIMD_type r1, SIMD_type r2) {
    336         return _mm_add_epi32(r1, r2);
    337 }
    338 template<>
    339 static inline SIMD_type simd_add<64>(SIMD_type r1, SIMD_type r2) {
    340         return _mm_add_epi64(r1, r2);
    341 }
    342 /* simd_sub
    343  * fw: 2,4,8,16,32,64*/
    344 template<int fw>
    345 static inline SIMD_type simd_sub(SIMD_type r1, SIMD_type r2);
    346 
    347 template<>
    348 static inline SIMD_type simd_sub<2>(SIMD_type r1, SIMD_type r2)
    349 {
    350          SIMD_type c1 = simd_xor(r1,r2);
    351          SIMD_type borrow = simd_andc(r2,r1);
    352          SIMD_type c2 = simd_xor(c1,(sisd_slli(borrow,1)));
    353          return simd_if(simd_himask<2>(),c2,c1);
    354 }
    355 template<>
    356 static inline SIMD_type simd_sub<8>(SIMD_type r1, SIMD_type r2) {
    357         return _mm_sub_epi8(r1, r2);
    358 }
    359 template<>
    360 static inline SIMD_type simd_sub<4>(SIMD_type r1,SIMD_type r2){
    361         return simd_if(simd_himask<8>(), simd_sub<8>(simd_and(r1,simd_himask<8>()),simd_and(r2,simd_himask<8>()))
    362         ,simd_sub<8>(simd_andc(r1,simd_himask<8>()),simd_andc(r2,simd_himask<8>())));
    363 }
    364 template<>
    365 static inline SIMD_type simd_sub<16>(SIMD_type r1, SIMD_type r2) {
    366         return _mm_sub_epi16(r1, r2);
    367 }
    368 template<>
    369 static inline SIMD_type simd_sub<32>(SIMD_type r1, SIMD_type r2) {
    370         return _mm_sub_epi32(r1, r2);
    371 }
    372 template<>
    373 static inline SIMD_type simd_sub<64>(SIMD_type r1, SIMD_type r2) {
    374         return _mm_sub_epi64(r1, r2);
    375 }
    376 /* simd_eq
    377  * fw: 8,16,32*/
    378 template<int fw>
    379 static inline SIMD_type simd_eq(SIMD_type r1, SIMD_type r2);
    380 
    381 template<>
    382 static inline SIMD_type simd_eq<8>(SIMD_type r1, SIMD_type r2) {
    383         return _mm_cmpeq_epi8(r1, r2);
    384 }
    385 template<>
    386 static inline SIMD_type simd_eq<16>(SIMD_type r1, SIMD_type r2) {
    387         return _mm_cmpeq_epi16(r1, r2);
    388 }
    389 template<>
    390 static inline SIMD_type simd_eq<32>(SIMD_type r1, SIMD_type r2) {
    391         return _mm_cmpeq_epi32(r1, r2);
    392 }
    393 
    394 enum HOM {x,h,l};
    395 
    396 
    397 /*simd_pack
    398  * fw: 2,4,8,16*/
    399 template<int fw>
    400 static inline SIMD_type simd_pack(SIMD_type r1, SIMD_type r2){
    401         /*fw:2,4,8*/
    402         return simd_pack<fw*2>(simd_if(simd_himask<fw>(),sisd_srli(r1,fw/2),r1),simd_if(simd_himask<fw>(),sisd_srli(r2,fw/2),r2));
    403 }
    404 template<>
    405 static inline SIMD_type simd_pack<16>(SIMD_type r1, SIMD_type r2) {
    406         return _mm_packus_epi16(simd_andc(r2, simd_himask<16>()), simd_andc(r1, simd_himask<16>()));
    407 }
    408 
    409 /* simd_mergeh
    410  * fw: 1,2,4,8,16,32,64*/
    411 template<int fw>
    412 static inline SIMD_type simd_mergeh(SIMD_type r1, SIMD_type r2){
    413         /*fw: 1,2,4*/
    414         return simd_mergeh<fw*2>(simd_if(simd_himask<fw*2>(),r1,simd_srli<fw*2>(r2,fw)),
    415         simd_if(simd_himask<fw*2>(),simd_slli<fw*2>(r1,fw),r2));
    416 }
    417 
    418 template<>
    419 static inline SIMD_type simd_mergeh<8>(SIMD_type r1, SIMD_type r2) {
    420         return _mm_unpackhi_epi8(r2, r1);
    421 }
    422 template<>
    423 static inline SIMD_type simd_mergeh<16>(SIMD_type r1, SIMD_type r2) {
    424         return _mm_unpackhi_epi16(r2, r1);
    425 }
    426 template<>
    427 static inline SIMD_type simd_mergeh<32>(SIMD_type r1, SIMD_type r2) {
    428         return _mm_unpackhi_epi32(r2, r1);
    429 }
    430 template<>
    431 static inline SIMD_type simd_mergeh<64>(SIMD_type r1, SIMD_type r2) {
    432         return _mm_unpackhi_epi64(r2, r1);
    433 }
    434 
    435 /* simd_mergel
    436  * fw: 1,2,4,8,16,32,64*/
    437 template<int fw>
    438 static inline SIMD_type simd_mergel(SIMD_type r1, SIMD_type r2){
    439         /*fw: 1,2,4*/
    440         return simd_mergel<fw*2>(simd_if(simd_himask<fw*2>(),r1,simd_srli<fw*2>(r2,fw)),
    441         simd_if(simd_himask<fw*2>(),simd_slli<fw*2>(r1,fw),r2));
    442 }
    443 template<>
    444 static inline SIMD_type simd_mergel<8>(SIMD_type r1, SIMD_type r2) {
    445         return _mm_unpacklo_epi8(r2, r1);
    446 }
    447 template<>
    448 static inline SIMD_type simd_mergel<16>(SIMD_type r1, SIMD_type r2) {
    449         return _mm_unpacklo_epi16(r2, r1);
    450 }
    451 template<>
    452 static inline SIMD_type simd_mergel<32>(SIMD_type r1, SIMD_type r2) {
    453         return _mm_unpacklo_epi32(r2, r1);
    454 }
    455 template<>
    456 static inline SIMD_type simd_mergel<64>(SIMD_type r1, SIMD_type r2) {
    457         return _mm_unpacklo_epi64(r2, r1);
    458 }
    459 
    460 
    461 /* simd_all_true
    462  * fw: 8*/
    463 template<int fw>
    464 static inline int simd_all_true(SIMD_type r);
    465 template<>
    466 static inline int simd_all_true<8>(SIMD_type r) {
    467         return _mm_movemask_epi8(r) == 0xFFFF;
    468 }
    469 
    470 /* simd_any_true
    471  * fw: 8*/
    472 template<int fw>
    473 static inline int simd_any_true(SIMD_type r);
    474 template<>
    475 static inline int simd_any_true<8>(SIMD_type r) {
    476         return _mm_movemask_epi8(r) != 0;
    477 }
    478 
    479 /* simd_any_sign_bit
    480  * fw: 8*/
    481 template<int fw>
    482 static inline int simd_any_sign_bit(SIMD_type r);
    483 template<>
    484 static inline int simd_any_sign_bit<8>(SIMD_type r) {
    485         return _mm_movemask_epi8(r) != 0;
    486 }
    487 
    488 /* Half operand modifier*/
    489 template <int fw, HOM m>
    490 struct SIMD {
    491         static inline SIMD_type hom(SIMD_type r) {}
    492 };
    493 
    494 template <int fw>
    495 struct SIMD<fw, x> {
    496         static inline SIMD_type hom(SIMD_type r) {return r;}
    497         static inline SIMD_type hx(SIMD_type r) {return r;}
    498 };
    499 
    500 template <int fw>
    501 struct SIMD<fw, l> {
    502         static inline SIMD_type hom(SIMD_type r) {return simd_andc(r, simd_himask<fw>());}
    503         static inline SIMD_type hx(SIMD_type r) {return r;}
    504 };
    505 
    506 template <int fw>
    507 struct SIMD<fw, h> {
    508         static inline SIMD_type hom(SIMD_type r) {return simd_srli<fw>(r, fw/2);}
    509         static inline SIMD_type hx(SIMD_type r) {return simd_srli<fw>(r, fw/2);}
    510 };
    511 
    512 
    513 /* SIMD operations extended with HOM*/
    514 template<int fw, HOM m1, HOM m2>
    515 static inline SIMD_type simd_add(SIMD_type r1, SIMD_type r2){
    516         return simd_add<fw>(SIMD<fw,m1>::hom(r1),SIMD<fw,m1>::hom(r2));
    517 }
    518 
    519 template<int fw, HOM m1, HOM m2>
    520 static inline SIMD_type simd_sub(SIMD_type r1, SIMD_type r2){
    521         return simd_sub<fw>(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
    522 }
    523 
    524 template<int fw, HOM m1, HOM m2>
    525 static inline SIMD_type simd_eq(SIMD_type r1, SIMD_type r2){
    526         return simd_eq<fw>(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
    527 }
    528 
    529 template<int fw, HOM m1, HOM m2>
    530 static inline SIMD_type simd_pack(SIMD_type r1, SIMD_type r2){
    531         return simd_pack<fw>(SIMD<fw,m1>::hx(r1),SIMD<fw,m2>::hx(r2));
    532 }
    533 
    534 template<int fw, HOM m1, HOM m2>
    535 static inline SIMD_type simd_mergeh(SIMD_type r1, SIMD_type r2){
    536         return simd_mergeh<fw>(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
    537 }
    538 
    539 template<int fw, HOM m1, HOM m2>
    540 static inline SIMD_type simd_mergel(SIMD_type r1, SIMD_type r2){
    541         return simd_mergel<fw>(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
    542 }
    543                  
     626
     627
    544628static inline int bitblock_has_bit(SIMD_type v) {
    545   return !simd_all_true<8>(simd_eq<8>(v, simd_const<8>(0)));
     629  return !simd_all_true<8>(simd<8>::eq(v, simd<8>::constant<0>()));
    546630}
    547631
    548632static inline int bitblock_bit_count(SIMD_type v) {
    549633  int bit_count = 0;
    550   SIMD_type cts_2 = simd_add<2,l,h>(v, v);
    551   SIMD_type cts_4 = simd_add<4,l,h>(cts_2, cts_2);
    552   SIMD_type cts_8 = simd_add<8,l,h>(cts_4, cts_4);
    553   SIMD_type cts_64 = _mm_sad_epu8(cts_8, simd_const<8>(0));
    554   /* SIMD_type cts_128 = simd_add_128_lh(cts_64, cts_64) */;
    555   SIMD_type cts_128 = simd_add<64>(cts_64, sisd_srli(cts_64,64));
     634  SIMD_type cts_2 = simd<2>::add<l,h>(v, v);
     635  SIMD_type cts_4 = simd<4>::add<l,h>(cts_2, cts_2);
     636  SIMD_type cts_8 = simd<8>::add<l,h>(cts_4, cts_4);
     637  SIMD_type cts_64 = _mm_sad_epu8(cts_8, simd<8>::constant<0>());
      638  /* SIMD_type cts_128 = simd<128>::add<l,h>(cts_64, cts_64) */;
     639  SIMD_type cts_128 = simd<64>::add(cts_64, simd<128>::srli<64>(cts_64));
    556640  return (int) sisd_to_int(cts_128);
    557641}
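
/* Editorial illustration, not library code: the counting scheme above sums adjacent
   fields of doubling width; the same idea applied to an ordinary 32-bit word: */
static inline int popcount32_sketch(unsigned int v) {
        unsigned int c2 = (v & 0x55555555u) + ((v >> 1) & 0x55555555u);    /* 2-bit field counts */
        unsigned int c4 = (c2 & 0x33333333u) + ((c2 >> 2) & 0x33333333u);  /* 4-bit field counts */
        unsigned int c8 = (c4 & 0x0F0F0F0Fu) + ((c4 >> 4) & 0x0F0F0F0Fu);  /* 8-bit field counts */
        return (int)((c8 * 0x01010101u) >> 24);                            /* sum of the four byte counts */
}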