Changeset 2200 for trunk/lib


Ignore:
Timestamp:
May 30, 2012, 8:24:52 PM (7 years ago)
Author:
cameron
Message:

Custom USE_S2P_AVX transposition; ADCMAGIC variations

Location:
trunk/lib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib/bitblock256.hpp

    r2199 r2200  
    106106        bitblock128_t bubble0 = simd128<64>::eq(sum0, all_ones);
    107107        bitblock128_t bubble1 = simd128<64>::eq(sum1, all_ones);
    108         //bitblock256_t bubble = avx_general_combine256(bubble1, bubble0);
     108        //bitblock128_t bubble = hsimd128<64>::packss(bubble1, bubble0);
     109        bitblock256_t bubble = avx_general_combine256(bubble1, bubble0);
    109110        //uint64_t carry_mask = _mm256_movemask_pd((__m256d) icarry) * 2 + convert(carry_in);
    110111        uint64_t carry_mask = hsimd128<64>::signmask(icarry1) * 8 + hsimd128<64>::signmask(icarry0) * 2 + convert(carry_in);
    111         //uint64_t bubble_mask = _mm256_movemask_pd((__m256d) bubble);
    112         uint64_t bubble_mask = hsimd128<64>::signmask(bubble1) * 4 + hsimd128<64>::signmask(bubble0);
     112        uint64_t bubble_mask = _mm256_movemask_pd((__m256d) bubble);
     113        //uint64_t bubble_mask = hsimd128<64>::signmask(bubble1) * 4 + hsimd128<64>::signmask(bubble0);
     114        //uint64_t bubble_mask = hsimd128<32>::signmask(bubble);
    113115        uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) &~ bubble_mask;
    114116        uint64_t increments = carry_scan_thru_bubbles | (carry_scan_thru_bubbles - carry_mask);
  • trunk/lib/s2p.hpp

    r1760 r2200  
    1 /*  s2p128 - Serial to Parallel Bit Stream Transposition
     1/*  s2p - Serial to Parallel Bit Stream Transposition
    22    Copyright (c) 2007, 2008, 2010, 2011  Robert D. Cameron.
    33    Licensed to the public under the Open Software License 3.0.
     
    99#define S2P_HPP
    1010
    11 #include "idisa128.hpp"
     11#include "idisa.hpp"
    1212
    1313#define BytePack BitBlock
     
    107107   immediately convert back. */
    108108#ifdef USE_S2P_AVX
    109 #define sse_andc(b1, b2) _mm_andnot_si128(b2, b1)
    110 #define sse_himask_16 _mm_set1_epi32(0xFF00FF00)
    111 #define sse_slli_16(r, shft) _mm_slli_epi16(r, shft)
    112 #define sse_srli_16(r, shft) _mm_srli_epi16(r, shft)
    113 #define sse_packus_16(a, b) _mm_packus_epi16(b, a)
    114 #define sse_pack_16(a, b) \
    115   _mm_packus_epi16(sse_andc(b, sse_himask_16), sse_andc(a, sse_himask_16))
    116 #define sse_pack_16_ll(v1, v2) sse_pack_16(v1, v2)
    117 #define sse_pack_16_hh(v1, v2) sse_packus_16(sse_srli_16(v1, 8), sse_srli_16(v2, 8))
     109#include "idisa_cpp/idisa_sse2.cpp"
     110#define avx_select_lo128(x) \
     111        ((__m128i) _mm256_castps256_ps128(x))
     112
     113#define avx_select_hi128(x) \
     114        ((__m128i)(_mm256_extractf128_ps(x, 1)))
     115
     116#define avx_general_combine256(x, y) \
     117   (_mm256_insertf128_ps(_mm256_castps128_ps256((__m128) y), (__m128) x, 1))
    118118
    119119#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
    120120  do {\
    121         __m128i s00, s01, s10, s11, t00, t01, t10, t11;\
    122         __m128i t10shift, t11shift, t00shift, t01shift;\
    123         s00 = simd_hi128(s0);\
    124         s01 = simd_lo128(s0);\
    125         s10 = simd_hi128(s1);\
    126         s11 = simd_lo128(s1);\
    127         t00 = sse_pack_16_hh(s00, s01);\
    128         t10 = sse_pack_16_ll(s00, s01);\
    129         t01 = sse_pack_16_hh(s10, s11);\
    130         t11 = sse_pack_16_ll(s10, s11);\
    131         t10shift = sse_srli_16(t10, shift);\
    132         t11shift = sse_srli_16(t11, shift);\
    133         t00shift = sse_slli_16(t00, shift);\
    134         t01shift = sse_slli_16(t01, shift);\
    135         p0 = simd<1>::ifh(hi_mask, simd_combine256(t00, t01), simd_combine256(t10shift, t11shift));\
    136         p1 = simd<1>::ifh(hi_mask, simd_combine256(t00shift, t01shift), simd_combine256(t10, t11));\
     121        bitblock128_t s00, s01, s10, s11, t00, t01, t10, t11;\
     122        bitblock128_t t10shift, t11shift, t00shift, t01shift;\
     123        s00 = avx_select_hi128(s0);\
     124        s01 = avx_select_lo128(s0);\
     125        s10 = avx_select_hi128(s1);\
     126        s11 = avx_select_lo128(s1);\
     127        t00 = hsimd128<16>::packh(s00, s01);\
     128        t10 = hsimd128<16>::packl(s00, s01);\
     129        t01 = hsimd128<16>::packh(s10, s11);\
     130        t11 = hsimd128<16>::packl(s10, s11);\
     131        t10shift = simd128<16>::srli<shift>(t10);\
     132        t11shift = simd128<16>::srli<shift>(t11);\
     133        t00shift = simd128<16>::slli<shift>(t00);\
     134        t01shift = simd128<16>::slli<shift>(t01);\
     135        p0 = simd<1>::ifh(hi_mask, avx_general_combine256(t00, t01), avx_general_combine256(t10shift, t11shift));\
     136        p1 = simd<1>::ifh(hi_mask, avx_general_combine256(t00shift, t01shift), avx_general_combine256(t10, t11));\
    137137  } while(0)
    138138#endif
Note: See TracChangeset for help on using the changeset viewer.