Changeset 182


Ignore:
Timestamp:
Jul 9, 2008, 2:08:37 PM (11 years ago)
Author:
lindanl
Message:

Templated SIMD Library modifications

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/sse_simd_t.h

    r179 r182  
    3030/* I. SIMD bitwise logical operations */
    3131
    32 #define simd_or(b1, b2) _mm_or_si128(b1, b2)
    33 #define simd_and(b1, b2) _mm_and_si128(b1, b2)
    34 #define simd_xor(b1, b2) _mm_xor_si128(b1, b2)
    35 
     32static inline SIMD_type simd_and(SIMD_type b1, SIMD_type b2) {
     33        return _mm_and_si128(b1, b2);
     34}
    3635static inline SIMD_type simd_andc(SIMD_type b1, SIMD_type b2) {
    3736        return _mm_andnot_si128(b2, b1);
    3837}
    39 #define simd_if(cond, then_val, else_val) \
    40   simd_or(simd_and(then_val, cond), simd_andc(else_val, cond))
    41 #define simd_not(b) (simd_xor(b, _mm_set1_epi32(0xFFFFFFFF)))
    42 #define simd_nor(a,b) (simd_not(simd_or(a,b)))
     38static inline SIMD_type simd_or(SIMD_type b1, SIMD_type b2) {
     39        return  _mm_or_si128(b1, b2);
     40}
     41static inline SIMD_type simd_xor(SIMD_type b1, SIMD_type b2) {
     42        return  _mm_xor_si128(b1, b2);
     43}
     44static inline SIMD_type simd_not(SIMD_type b) {
     45        return  simd_xor(b, _mm_set1_epi32(0xFFFFFFFF));
     46}
     47static inline SIMD_type simd_nor(SIMD_type b1, SIMD_type b2) {
     48        return  simd_not(simd_or(b1,b2));
     49}
     50static inline SIMD_type simd_if(SIMD_type cond, SIMD_type then_val, SIMD_type else_val) {       
     51        return  simd_or(simd_and(then_val, cond), simd_andc(else_val, cond));
     52}
    4353
    4454/* Idealized operations with direct implementation by built-in
     
    7585
    7686#define simd_cmpgt_8(v1,v2) _mm_cmpgt_epi8(v1, v2)
    77 
    78 #define simd_himask_16 _mm_set1_epi16(0xFF00)
    79 #define simd_slli_16(r, shft) _mm_slli_epi16(r, shft)
    80 #define simd_srli_16(r, shft) _mm_srli_epi16(r, shft)
    81 #define simd_pack_16(a, b) \
    82   _mm_packus_epi16(simd_andc(b, simd_himask_16), simd_andc(a, simd_himask_16))
    83 #define simd_pack_16_ll(a, b) simd_pack_16(a, b)
    84 #define simd_pack_16_hh(a, b) \
    85   simd_pack_16(simd_srli_16(a, 8), simd_srli_16(b, 8))
    86 
    8787
    8888#define bitblock_test_bit(blk, n) \
     
    136136 * fw: 2,4,8,16,32,64,128*/
    137137template<int fw>
    138 inline SIMD_type simd_himask();
    139 
    140 template<>
    141 inline SIMD_type simd_himask<2>() {
     138static inline SIMD_type simd_himask();
     139
     140template<>
     141static inline SIMD_type simd_himask<2>() {
    142142        return _mm_set1_epi8(0xAA);
    143143}
    144144template<>
    145 inline SIMD_type simd_himask<4>() {
     145static inline SIMD_type simd_himask<4>() {
    146146        return _mm_set1_epi8(0xCC);
    147147}
    148148template<>
    149 inline SIMD_type simd_himask<8>() {
     149static inline SIMD_type simd_himask<8>() {
    150150        return _mm_set1_epi8(0xF0);
    151151}
    152152template<>
    153 inline SIMD_type simd_himask<16>() {
     153static inline SIMD_type simd_himask<16>() {
    154154        return _mm_set1_epi16(0xFF00);
    155155}
    156156template<>
    157 inline SIMD_type simd_himask<32>() {
     157static inline SIMD_type simd_himask<32>() {
    158158        return _mm_set1_epi32(0xFFFF0000);
    159159}
    160160template<>
    161 inline SIMD_type simd_himask<64>() {
     161static inline SIMD_type simd_himask<64>() {
    162162        return _mm_set_epi32(-1,0,-1,0);
    163163}
    164164template<>
    165 inline SIMD_type simd_himask<128>() {
     165static inline SIMD_type simd_himask<128>() {
    166166        return _mm_set_epi32(-1,-1,0,0);
    167167}
     
    170170 * fw: 1,2,4,8,16,32*/
    171171template<int fw>
    172 inline SIMD_type simd_const(int n);
    173 template<>
    174 inline SIMD_type simd_const<4>(int n) {
     172static inline SIMD_type simd_const(int n);
     173template<>
     174static inline SIMD_type simd_const<4>(int n) {
    175175        return _mm_set1_epi8((n)<<4|(n));
    176176}
    177177template<>
    178 inline SIMD_type simd_const<8>(int n) {
     178static inline SIMD_type simd_const<8>(int n) {
    179179        return _mm_set1_epi8(n);
    180180}
    181181template<>
    182 inline SIMD_type simd_const<16>(int n) {
     182static inline SIMD_type simd_const<16>(int n) {
    183183        return _mm_set1_epi16(n);
    184184}
    185185template<>
    186 inline SIMD_type simd_const<32>(int n) {
     186static inline SIMD_type simd_const<32>(int n) {
    187187        return _mm_set1_epi32(n);
    188188}
    189189template<>
    190 inline SIMD_type simd_const<1>(int n) {
     190static inline SIMD_type simd_const<1>(int n) {
    191191        if(n==0) return simd_const<8>(0);
    192192        else return simd_const<8>(-1);
    193193}
    194194template<>
    195 inline SIMD_type simd_const<2>(int n) {
     195static inline SIMD_type simd_const<2>(int n) {
    196196        return simd_const<4>(n<<2|n);
    197197}
    198198
    199199template<int fw, int val>
    200 inline SIMD_type simd_const();
    201 
    202 template<>
    203 inline SIMD_type simd_const<1, 0>() {
     200static inline SIMD_type simd_const();
     201
     202template<>
     203static inline SIMD_type simd_const<1, 0>() {
    204204        return simd_const<8>(0);
    205205}
    206206
    207207template<>
    208 inline SIMD_type simd_const<1, 1>() {
     208static inline SIMD_type simd_const<1, 1>() {
    209209        return simd_const<8>(-1);
    210210}
     
    214214 * fw: 2,4,8,16,32,64*/
    215215template<int fw>
    216 inline SIMD_type simd_srli(SIMD_type r, int sh);
    217 template<>
    218 inline SIMD_type simd_srli<16>(SIMD_type r, int sh) {
     216static inline SIMD_type simd_srli(SIMD_type r, int sh);
     217template<>
     218static inline SIMD_type simd_srli<16>(SIMD_type r, int sh) {
    219219        return _mm_srli_epi16(r, sh);
    220220}
    221221template<>
    222 inline SIMD_type simd_srli<32>(SIMD_type r, int sh) {
     222static inline SIMD_type simd_srli<32>(SIMD_type r, int sh) {
    223223        return _mm_srli_epi32(r, sh);
    224224}
    225225template<>
    226 inline SIMD_type simd_srli<64>(SIMD_type r, int sh) {
     226static inline SIMD_type simd_srli<64>(SIMD_type r, int sh) {
    227227        return _mm_srli_epi64(r, sh);
    228228}
    229229template<>
    230 inline SIMD_type simd_srli<2>(SIMD_type r, int sh) {
     230static inline SIMD_type simd_srli<2>(SIMD_type r, int sh) {
    231231        return simd_and(simd_srli<32>(r,sh),simd_const<2>(3>>sh));
    232232}
    233233template<>
    234 inline SIMD_type simd_srli<4>(SIMD_type r, int sh) {
     234static inline SIMD_type simd_srli<4>(SIMD_type r, int sh) {
    235235        return simd_and(simd_srli<32>(r,sh),simd_const<4>(15>>sh));
    236236}
    237237template<>
    238 inline SIMD_type simd_srli<8>(SIMD_type r, int sh) {
     238static inline SIMD_type simd_srli<8>(SIMD_type r, int sh) {
    239239        return simd_and(simd_srli<32>(r,sh),simd_const<8>(255>>sh));
    240240}
     
    243243 * fw: 2,4,8,16,32,64*/
    244244template<int fw>
    245 inline SIMD_type simd_slli(SIMD_type r, int sh);
    246 template<>
    247 inline SIMD_type simd_slli<16>(SIMD_type r, int sh) {
     245static inline SIMD_type simd_slli(SIMD_type r, int sh);
     246template<>
     247static inline SIMD_type simd_slli<16>(SIMD_type r, int sh) {
    248248        return _mm_slli_epi16(r, sh);
    249249}
    250250template<>
    251 inline SIMD_type simd_slli<32>(SIMD_type r, int sh) {
     251static inline SIMD_type simd_slli<32>(SIMD_type r, int sh) {
    252252        return _mm_slli_epi32(r, sh);
    253253}
    254254template<>
    255 inline SIMD_type simd_slli<64>(SIMD_type r, int sh) {
     255static inline SIMD_type simd_slli<64>(SIMD_type r, int sh) {
    256256        return _mm_slli_epi64(r, sh);
    257257}
    258258template<>
    259 inline SIMD_type simd_slli<2>(SIMD_type r, int sh) {
     259static inline SIMD_type simd_slli<2>(SIMD_type r, int sh) {
    260260        return simd_and(simd_slli<32>(r,sh),simd_const<2>((3<<sh)&3));
    261261}
    262262template<>
    263 inline SIMD_type simd_slli<4>(SIMD_type r, int sh) {
     263static inline SIMD_type simd_slli<4>(SIMD_type r, int sh) {
    264264        return simd_and(simd_slli<32>(r,sh),simd_const<4>((15<<sh)&15));
    265265}
    266266template<>
    267 inline SIMD_type simd_slli<8>(SIMD_type r, int sh) {
     267static inline SIMD_type simd_slli<8>(SIMD_type r, int sh) {
    268268        return simd_and(simd_slli<32>(r,sh),simd_const<8>((255<<sh) &255));
    269269}
     
    298298 * fw: 16,32*/
    299299template<int fw>
    300 inline SIMD_type simd_srai(SIMD_type r, int sh);
    301 template<>
    302 inline SIMD_type simd_srai<16>(SIMD_type r, int sh) {
     300static inline SIMD_type simd_srai(SIMD_type r, int sh);
     301template<>
     302static inline SIMD_type simd_srai<16>(SIMD_type r, int sh) {
    303303        return _mm_srai_epi16(r, sh);
    304304}
    305305template<>
    306 inline SIMD_type simd_srai<32>(SIMD_type r, int sh) {
     306static inline SIMD_type simd_srai<32>(SIMD_type r, int sh) {
    307307        return _mm_srai_epi32(r, sh);
    308308}
     
    311311 * fw: 2,4,8,16,32,64*/
    312312template<int fw>
    313 inline SIMD_type simd_add(SIMD_type r1, SIMD_type r2);
    314 template<>
    315 inline SIMD_type simd_add<2>(SIMD_type r1, SIMD_type r2) {
     313static inline SIMD_type simd_add(SIMD_type r1, SIMD_type r2);
     314template<>
     315static inline SIMD_type simd_add<2>(SIMD_type r1, SIMD_type r2) {
    316316         SIMD_type c1 = simd_xor(r1,r2);
    317317         SIMD_type borrow = simd_and(r1,r2);
     
    320320}
    321321template<>
    322 inline SIMD_type simd_add<8>(SIMD_type r1, SIMD_type r2) {
     322static inline SIMD_type simd_add<8>(SIMD_type r1, SIMD_type r2) {
    323323        return _mm_add_epi8(r1, r2);
    324324}
    325325template<>
    326 inline SIMD_type simd_add<4>(SIMD_type r1, SIMD_type r2) {
     326static inline SIMD_type simd_add<4>(SIMD_type r1, SIMD_type r2) {
    327327        return simd_if(simd_himask<8>(), simd_add<8>(simd_and(r1,simd_himask<8>()),simd_and(r2,simd_himask<8>()))
    328328        ,simd_add<8>(simd_andc(r1,simd_himask<8>()),simd_andc(r2,simd_himask<8>())));
    329329}
    330330template<>
    331 inline SIMD_type simd_add<16>(SIMD_type r1, SIMD_type r2) {
     331static inline SIMD_type simd_add<16>(SIMD_type r1, SIMD_type r2) {
    332332        return _mm_add_epi16(r1, r2);
    333333}
    334334template<>
    335 inline SIMD_type simd_add<32>(SIMD_type r1, SIMD_type r2) {
     335static inline SIMD_type simd_add<32>(SIMD_type r1, SIMD_type r2) {
    336336        return _mm_add_epi32(r1, r2);
    337337}
    338338template<>
    339 inline SIMD_type simd_add<64>(SIMD_type r1, SIMD_type r2) {
     339static inline SIMD_type simd_add<64>(SIMD_type r1, SIMD_type r2) {
    340340        return _mm_add_epi64(r1, r2);
    341341}
    342342/* simd_sub
    343  * fw: 8,16,32,64*/
    344 template<int fw>
    345 inline SIMD_type simd_sub(SIMD_type r1, SIMD_type r2);
    346 
    347 template<>
    348 inline SIMD_type simd_sub<8>(SIMD_type r1, SIMD_type r2) {
     343 * fw: 2,4,8,16,32,64*/
     344template<int fw>
     345static inline SIMD_type simd_sub(SIMD_type r1, SIMD_type r2);
     346
     347template<>
     348static inline SIMD_type simd_sub<2>(SIMD_type r1, SIMD_type r2)
     349{
     350         SIMD_type c1 = simd_xor(r1,r2);
     351         SIMD_type borrow = simd_andc(r2,r1);
     352         SIMD_type c2 = simd_xor(c1,(sisd_slli(borrow,1)));
     353         return simd_if(simd_himask<2>(),c2,c1);
     354}
     355template<>
     356static inline SIMD_type simd_sub<8>(SIMD_type r1, SIMD_type r2) {
    349357        return _mm_sub_epi8(r1, r2);
    350358}
    351359template<>
    352 inline SIMD_type simd_sub<16>(SIMD_type r1, SIMD_type r2) {
     360static inline SIMD_type simd_sub<4>(SIMD_type r1,SIMD_type r2){
     361        return simd_if(simd_himask<8>(), simd_sub<8>(simd_and(r1,simd_himask<8>()),simd_and(r2,simd_himask<8>()))
     362        ,simd_sub<8>(simd_andc(r1,simd_himask<8>()),simd_andc(r2,simd_himask<8>())));
     363}
     364template<>
     365static inline SIMD_type simd_sub<16>(SIMD_type r1, SIMD_type r2) {
    353366        return _mm_sub_epi16(r1, r2);
    354367}
    355368template<>
    356 inline SIMD_type simd_sub<32>(SIMD_type r1, SIMD_type r2) {
     369static inline SIMD_type simd_sub<32>(SIMD_type r1, SIMD_type r2) {
    357370        return _mm_sub_epi32(r1, r2);
    358371}
    359372template<>
    360 inline SIMD_type simd_sub<64>(SIMD_type r1, SIMD_type r2) {
     373static inline SIMD_type simd_sub<64>(SIMD_type r1, SIMD_type r2) {
    361374        return _mm_sub_epi64(r1, r2);
    362375}
     
    364377 * fw: 8,16,32*/
    365378template<int fw>
    366 inline SIMD_type simd_eq(SIMD_type r1, SIMD_type r2);
    367 
    368 template<>
    369 inline SIMD_type simd_eq<8>(SIMD_type r1, SIMD_type r2) {
     379static inline SIMD_type simd_eq(SIMD_type r1, SIMD_type r2);
     380
     381template<>
     382static inline SIMD_type simd_eq<8>(SIMD_type r1, SIMD_type r2) {
    370383        return _mm_cmpeq_epi8(r1, r2);
    371384}
    372385template<>
    373 inline SIMD_type simd_eq<16>(SIMD_type r1, SIMD_type r2) {
     386static inline SIMD_type simd_eq<16>(SIMD_type r1, SIMD_type r2) {
    374387        return _mm_cmpeq_epi16(r1, r2);
    375388}
    376389template<>
    377 inline SIMD_type simd_eq<32>(SIMD_type r1, SIMD_type r2) {
     390static inline SIMD_type simd_eq<32>(SIMD_type r1, SIMD_type r2) {
    378391        return _mm_cmpeq_epi32(r1, r2);
    379392}
     
    397410 * fw: 1,2,4,8,16,32,64*/
    398411template<int fw>
    399 inline SIMD_type simd_mergeh(SIMD_type r1, SIMD_type r2){
     412static inline SIMD_type simd_mergeh(SIMD_type r1, SIMD_type r2){
    400413        /*fw: 1,2,4*/
    401414        return simd_mergeh<fw*2>(simd_if(simd_himask<fw*2>(),r1,simd_srli<fw*2>(r2,fw)),
     
    404417
    405418template<>
    406 inline SIMD_type simd_mergeh<8>(SIMD_type r1, SIMD_type r2) {
     419static inline SIMD_type simd_mergeh<8>(SIMD_type r1, SIMD_type r2) {
    407420        return _mm_unpackhi_epi8(r2, r1);
    408421}
    409422template<>
    410 inline SIMD_type simd_mergeh<16>(SIMD_type r1, SIMD_type r2) {
     423static inline SIMD_type simd_mergeh<16>(SIMD_type r1, SIMD_type r2) {
    411424        return _mm_unpackhi_epi16(r2, r1);
    412425}
    413426template<>
    414 inline SIMD_type simd_mergeh<32>(SIMD_type r1, SIMD_type r2) {
     427static inline SIMD_type simd_mergeh<32>(SIMD_type r1, SIMD_type r2) {
    415428        return _mm_unpackhi_epi32(r2, r1);
    416429}
    417430template<>
    418 inline SIMD_type simd_mergeh<64>(SIMD_type r1, SIMD_type r2) {
     431static inline SIMD_type simd_mergeh<64>(SIMD_type r1, SIMD_type r2) {
    419432        return _mm_unpackhi_epi64(r2, r1);
    420433}
     
    423436 * fw: 1,2,4,8,16,32,64*/
    424437template<int fw>
    425 inline SIMD_type simd_mergel(SIMD_type r1, SIMD_type r2){
     438static inline SIMD_type simd_mergel(SIMD_type r1, SIMD_type r2){
    426439        /*fw: 1,2,4*/
    427440        return simd_mergel<fw*2>(simd_if(simd_himask<fw*2>(),r1,simd_srli<fw*2>(r2,fw)),
     
    429442}
    430443template<>
    431 inline SIMD_type simd_mergel<8>(SIMD_type r1, SIMD_type r2) {
     444static inline SIMD_type simd_mergel<8>(SIMD_type r1, SIMD_type r2) {
    432445        return _mm_unpacklo_epi8(r2, r1);
    433446}
    434447template<>
    435 inline SIMD_type simd_mergel<16>(SIMD_type r1, SIMD_type r2) {
     448static inline SIMD_type simd_mergel<16>(SIMD_type r1, SIMD_type r2) {
    436449        return _mm_unpacklo_epi16(r2, r1);
    437450}
    438451template<>
    439 inline SIMD_type simd_mergel<32>(SIMD_type r1, SIMD_type r2) {
     452static inline SIMD_type simd_mergel<32>(SIMD_type r1, SIMD_type r2) {
    440453        return _mm_unpacklo_epi32(r2, r1);
    441454}
    442455template<>
    443 inline SIMD_type simd_mergel<64>(SIMD_type r1, SIMD_type r2) {
     456static inline SIMD_type simd_mergel<64>(SIMD_type r1, SIMD_type r2) {
    444457        return _mm_unpacklo_epi64(r2, r1);
    445458}
     
    449462 * fw: 8*/
    450463template<int fw>
    451 inline int simd_all_true(SIMD_type r);
    452 template<>
    453 inline int simd_all_true<8>(SIMD_type r) {
     464static inline int simd_all_true(SIMD_type r);
     465template<>
     466static inline int simd_all_true<8>(SIMD_type r) {
    454467        return _mm_movemask_epi8(r) == 0xFFFF;
    455468}
     
    458471 * fw: 8*/
    459472template<int fw>
    460 inline int simd_any_true(SIMD_type r);
    461 template<>
    462 inline int simd_any_true<8>(SIMD_type r) {
     473static inline int simd_any_true(SIMD_type r);
     474template<>
     475static inline int simd_any_true<8>(SIMD_type r) {
    463476        return _mm_movemask_epi8(r) != 0;
    464477}
     
    467480 * fw: 8*/
    468481template<int fw>
    469 inline int simd_any_sign_bit(SIMD_type r);
    470 template<>
    471 inline int simd_any_sign_bit<8>(SIMD_type r) {
     482static inline int simd_any_sign_bit(SIMD_type r);
     483template<>
     484static inline int simd_any_sign_bit<8>(SIMD_type r) {
    472485        return _mm_movemask_epi8(r) != 0;
    473486}
     
    500513/* SIMD operations extended with HOM*/
    501514template<int fw, HOM m1, HOM m2>
    502 inline SIMD_type simd_add(SIMD_type r1, SIMD_type r2){
     515static inline SIMD_type simd_add(SIMD_type r1, SIMD_type r2){
              /* Pass each operand through SIMD<fw,m>::hom with its own mode
               * parameter (m1 for r1, m2 for r2) before the plain fw-wide add.
               * r2 previously used m1, leaving m2 unused, unlike the sibling
               * simd_sub/simd_eq/simd_mergeh/simd_mergel overloads. */
    503516        return simd_add<fw>(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
    504517}
    505518
    506519template<int fw, HOM m1, HOM m2>
    507 inline SIMD_type simd_sub(SIMD_type r1, SIMD_type r2){
     520static inline SIMD_type simd_sub(SIMD_type r1, SIMD_type r2){
    508521        return simd_sub<fw>(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
    509522}
    510523
    511524template<int fw, HOM m1, HOM m2>
    512 inline SIMD_type simd_eq(SIMD_type r1, SIMD_type r2){
     525static inline SIMD_type simd_eq(SIMD_type r1, SIMD_type r2){
    513526        return simd_eq<fw>(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
    514527}
     
    520533
    521534template<int fw, HOM m1, HOM m2>
    522 inline SIMD_type simd_mergeh(SIMD_type r1, SIMD_type r2){
     535static inline SIMD_type simd_mergeh(SIMD_type r1, SIMD_type r2){
    523536        return simd_mergeh<fw>(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
    524537}
    525538
    526539template<int fw, HOM m1, HOM m2>
    527 inline SIMD_type simd_mergel(SIMD_type r1, SIMD_type r2){
     540static inline SIMD_type simd_mergel(SIMD_type r1, SIMD_type r2){
    528541        return simd_mergel<fw>(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
    529542}
Note: See TracChangeset for help on using the changeset viewer.