source: trunk/lib/s2p.hpp @ 3455

Last change on this file since 3455 was 3455, checked in by linmengl, 6 years ago

minor fix on s2p.hpp to enable USE_S2P_AVX

File size: 7.8 KB
Line 
1/*  s2p - Serial to Parallel Bit Stream Transposition
2    Copyright (c) 2007, 2008, 2010, 2011  Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters Inc.
5       under the Academic Free License version 3.0.
6*/
7
8#ifndef S2P_HPP
9#define S2P_HPP
10
11#include "idisa.hpp"
12
13#define BytePack BitBlock
14
/* Given a block of bytes in 8 consecutive registers s0, s1, ..., s7,
   s2p transposes the block into 8 parallel bitstream blocks p0, p1, ..., p7.

   The following header shows the intent, although a macro is used for
   speed.
static inline void s2p(BytePack s0, BytePack s1, BytePack s2, BytePack s3,
                       BytePack s4, BytePack s5, BytePack s6, BytePack s7,
                       BitBlock& p0, BitBlock& p1, BitBlock& p2, BitBlock& p3,
                       BitBlock& p4, BitBlock& p5, BitBlock& p6, BitBlock& p7);
*/
25
/*  1.  ALGORITHM Selection.
        Choice of 3 algorithms: s2p_ideal, s2p_movemask, s2p_bytepack
        Default is s2p_bytepack.
        Compiling with -DUSE_S2P_IDEAL or -DUSE_S2P_MOVEMASK to override.

        If both flags are given, USE_S2P_MOVEMASK takes precedence.  A
        S2P_ALGORITHM supplied by the build is kept only when neither
        flag is present.  The selection is made in a single #if chain so
        that S2P_ALGORITHM is never redefined with a different body.
*/

#if defined(USE_S2P_MOVEMASK)
#undef S2P_ALGORITHM
#define S2P_ALGORITHM s2p_movemask
#elif defined(USE_S2P_IDEAL)
#undef S2P_ALGORITHM
#define S2P_ALGORITHM s2p_ideal
#elif !defined(S2P_ALGORITHM)
#define S2P_ALGORITHM s2p_bytepack
#endif
43
/*  s2p(in0..in7, out0..out7): public entry point.
    Forwards to the selected S2P_ALGORITHM with the eight input
    arguments passed in reverse order (in7 first, in0 last); the eight
    output arguments are forwarded in the order given.
*/
#define s2p(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5, out6, out7)\
  S2P_ALGORITHM(in7, in6, in5, in4, in3, in2, in1, in0, out0, out1, out2, out3, out4, out5, out6, out7)
46
/*  s2p_ideal: serial to parallel transposition for an architecture
    with native hsimd<8>, hsimd<4> and hsimd<2> packh/packl support.

    Three stages of eight pack operations each (24 in total):
      stage 1  hsimd<8> packs the eight inputs pairwise into four
               "high half" and four "low half" blocks;
      stage 2  hsimd<4> packs those pairwise into two blocks for each
               of the bit pairs 01, 23, 45, 67;
      stage 3  hsimd<2> packs each pair into the final p0..p7.

    Arguments are substituted textually; each input is evaluated twice.
*/

#define s2p_ideal(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do {\
        /* stage 1: field width 8 */\
        BitBlock hi4_a = hsimd<8>::packh(s0, s1);\
        BitBlock hi4_b = hsimd<8>::packh(s2, s3);\
        BitBlock hi4_c = hsimd<8>::packh(s4, s5);\
        BitBlock hi4_d = hsimd<8>::packh(s6, s7);\
        BitBlock lo4_a = hsimd<8>::packl(s0, s1);\
        BitBlock lo4_b = hsimd<8>::packl(s2, s3);\
        BitBlock lo4_c = hsimd<8>::packl(s4, s5);\
        BitBlock lo4_d = hsimd<8>::packl(s6, s7);\
        /* stage 2: field width 4 */\
        BitBlock pair01_a = hsimd<4>::packh(hi4_a, hi4_b);\
        BitBlock pair01_b = hsimd<4>::packh(hi4_c, hi4_d);\
        BitBlock pair23_a = hsimd<4>::packl(hi4_a, hi4_b);\
        BitBlock pair23_b = hsimd<4>::packl(hi4_c, hi4_d);\
        BitBlock pair45_a = hsimd<4>::packh(lo4_a, lo4_b);\
        BitBlock pair45_b = hsimd<4>::packh(lo4_c, lo4_d);\
        BitBlock pair67_a = hsimd<4>::packl(lo4_a, lo4_b);\
        BitBlock pair67_b = hsimd<4>::packl(lo4_c, lo4_d);\
        /* stage 3: field width 2 */\
        p0 = hsimd<2>::packh(pair01_a, pair01_b);\
        p1 = hsimd<2>::packl(pair01_a, pair01_b);\
        p2 = hsimd<2>::packh(pair23_a, pair23_b);\
        p3 = hsimd<2>::packl(pair23_a, pair23_b);\
        p4 = hsimd<2>::packh(pair45_a, pair45_b);\
        p5 = hsimd<2>::packl(pair45_a, pair45_b);\
        p6 = hsimd<2>::packh(pair67_a, pair67_b);\
        p7 = hsimd<2>::packl(pair67_a, pair67_b);\
  } while(0)
84
85
86/*  s2p_bytepack is a fast serial to parallel transposition
87    algorithm given an architecture with simd_pack_16 operations,
88    but not at small field widths.
89    MMX, SSE, Altivec ...
90*/
91
92
#ifndef USE_S2P_AVX
/*  s2p_step(s0, s1, hi_mask, shift, p0, p1): one stage of the bytepack
    transposition (generic, non-AVX version).

    s0, s1   input blocks
    hi_mask  selector passed as the first argument of simd<1>::ifh
    shift    compile-time shift amount (used as a template argument)
    p0, p1   output lvalues

    The inputs are packed with hsimd<16>::packh / packl.  p0 selects
    between the packh result and the packl result shifted right by
    `shift`; p1 selects between the packh result shifted left by
    `shift` and the packl result.

    Both packs are computed before either output is written, so p0/p1
    may name the same objects as s0/s1.  The temporaries carry
    macro-specific names so that caller arguments with common names
    such as t0 or t1 are not captured by the macro's locals.
*/
#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
  do {\
        BitBlock s2p_step_hi_, s2p_step_lo_;\
        s2p_step_hi_ = hsimd<16>::packh(s0, s1);\
        s2p_step_lo_ = hsimd<16>::packl(s0, s1);\
        p0 = simd<1>::ifh(hi_mask, s2p_step_hi_, simd<16>::srli<shift>(s2p_step_lo_));\
        p1 = simd<1>::ifh(hi_mask, simd<16>::slli<shift>(s2p_step_hi_), s2p_step_lo_);\
  } while(0)
#endif
103
104
/* For AVX, we use a modified s2p_step function to avoid a number
   of conversions from 128-bit mode to 256-bit mode just to
   immediately convert back.

   s2p_step(s0, s1, hi_mask, shift, p0, p1): same interface as the
   generic version.  Each input is split with avx_select_hi128 /
   avx_select_lo128, packed and shifted as 128-bit values, and the
   128-bit results are joined with avx_general_combine256 before the
   final simd<1>::ifh selection.
   NOTE(review): the argument order expected by avx_general_combine256
   is not visible in this file; it is kept exactly as it was.

   All temporaries are assigned before either output is written, so
   p0/p1 may name the same objects as s0/s1.  The temporaries carry
   macro-specific names so that caller arguments with names such as
   s00 or t10 are not captured by the macro's locals. */
#ifdef USE_S2P_AVX
#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
  do {\
        bitblock128_t s2p_in0_hi_, s2p_in0_lo_, s2p_in1_hi_, s2p_in1_lo_;\
        bitblock128_t s2p_ph0_, s2p_ph1_, s2p_pl0_, s2p_pl1_;\
        bitblock128_t s2p_pl0_sr_, s2p_pl1_sr_, s2p_ph0_sl_, s2p_ph1_sl_;\
        s2p_in0_hi_ = avx_select_hi128(s0);\
        s2p_in0_lo_ = avx_select_lo128(s0);\
        s2p_in1_hi_ = avx_select_hi128(s1);\
        s2p_in1_lo_ = avx_select_lo128(s1);\
        s2p_ph0_ = hsimd128<16>::packh(s2p_in0_hi_, s2p_in0_lo_);\
        s2p_pl0_ = hsimd128<16>::packl(s2p_in0_hi_, s2p_in0_lo_);\
        s2p_ph1_ = hsimd128<16>::packh(s2p_in1_hi_, s2p_in1_lo_);\
        s2p_pl1_ = hsimd128<16>::packl(s2p_in1_hi_, s2p_in1_lo_);\
        s2p_pl0_sr_ = simd128<16>::srli<shift>(s2p_pl0_);\
        s2p_pl1_sr_ = simd128<16>::srli<shift>(s2p_pl1_);\
        s2p_ph0_sl_ = simd128<16>::slli<shift>(s2p_ph0_);\
        s2p_ph1_sl_ = simd128<16>::slli<shift>(s2p_ph1_);\
        p0 = simd<1>::ifh(hi_mask, avx_general_combine256(s2p_ph0_, s2p_ph1_), avx_general_combine256(s2p_pl0_sr_, s2p_pl1_sr_));\
        p1 = simd<1>::ifh(hi_mask, avx_general_combine256(s2p_ph0_sl_, s2p_ph1_sl_), avx_general_combine256(s2p_pl0_, s2p_pl1_));\
  } while(0)
#endif
129
/*  s2p_bytepack(s0..s7, p0..p7): transposition built from twelve
    s2p_step invocations arranged in three stages.

      stage 1  (simd<2>::himask(), shift 1): the four input pairs give
               four "0246" blocks and four "1357" blocks;
      stage 2  (simd<4>::himask(), shift 2): pairs of those give two
               blocks each for "04", "26", "15" and "37";
      stage 3  (simd<8>::himask(), shift 4): each remaining pair gives
               two outputs: (p0,p4), (p1,p5), (p2,p6), (p3,p7).

    The digit groups in the temporary names follow the bit-index
    naming used by the original intermediates (bit00224466_*, ...).
*/
#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do {\
        BitBlock b0246_a, b0246_b, b0246_c, b0246_d;\
        BitBlock b1357_a, b1357_b, b1357_c, b1357_d;\
        BitBlock b04_a, b04_b, b26_a, b26_b;\
        BitBlock b15_a, b15_b, b37_a, b37_b;\
        /* stage 1 */\
        s2p_step(s0, s1, simd<2>::himask(), 1, b0246_a, b1357_a);\
        s2p_step(s2, s3, simd<2>::himask(), 1, b0246_b, b1357_b);\
        s2p_step(s4, s5, simd<2>::himask(), 1, b0246_c, b1357_c);\
        s2p_step(s6, s7, simd<2>::himask(), 1, b0246_d, b1357_d);\
        /* stage 2 */\
        s2p_step(b0246_a, b0246_b, simd<4>::himask(), 2, b04_a, b26_a);\
        s2p_step(b1357_a, b1357_b, simd<4>::himask(), 2, b15_a, b37_a);\
        s2p_step(b0246_c, b0246_d, simd<4>::himask(), 2, b04_b, b26_b);\
        s2p_step(b1357_c, b1357_d, simd<4>::himask(), 2, b15_b, b37_b);\
        /* stage 3 */\
        s2p_step(b04_a, b04_b, simd<8>::himask(), 4, p0, p4);\
        s2p_step(b15_a, b15_b, simd<8>::himask(), 4, p1, p5);\
        s2p_step(b26_a, b26_b, simd<8>::himask(), 4, p2, p6);\
        s2p_step(b37_a, b37_b, simd<8>::himask(), 4, p3, p7);\
  } while(0)
149
/* For sizeof(BitBlock) = 16 */
typedef uint16_t BitPack;

/*  movemask_step(s7, ..., s0, p): builds one output block from eight
    inputs.

    hsimd<8>::signmask of each input is stored as one BitPack element
    of a union that overlays eight BitPacks with a BitBlock; the
    BitBlock view is then assigned to p.  Note that the parameter list
    is written s7..s0, so the LAST argument lands in bit_pack[0] and
    the FIRST in bit_pack[7].

    NOTE(review): the overlay assumes sizeof(BitBlock) == 16 (eight
    16-bit packs), as stated beside the BitPack typedef; wider
    BitBlocks would need a wider BitPack.

    The union carries a macro-specific name so that an output argument
    with a short name such as `b` is not captured by the macro's local.
*/
#define movemask_step(s7, s6, s5, s4, s3, s2, s1, s0, p) \
  do { \
        union { BitPack bit_pack[8];\
                BitBlock bit_block;\
              } movemask_step_u_;\
        movemask_step_u_.bit_pack[0] = hsimd<8>::signmask(s0);\
        movemask_step_u_.bit_pack[1] = hsimd<8>::signmask(s1);\
        movemask_step_u_.bit_pack[2] = hsimd<8>::signmask(s2);\
        movemask_step_u_.bit_pack[3] = hsimd<8>::signmask(s3);\
        movemask_step_u_.bit_pack[4] = hsimd<8>::signmask(s4);\
        movemask_step_u_.bit_pack[5] = hsimd<8>::signmask(s5);\
        movemask_step_u_.bit_pack[6] = hsimd<8>::signmask(s6);\
        movemask_step_u_.bit_pack[7] = hsimd<8>::signmask(s7);\
        p = movemask_step_u_.bit_block;\
   } while (0)
168
/*  bitshift_step(in0..in7, out0..out7): for each k, assigns
    simd<8>::add(in_k, in_k) to out_k.  Adding a value to itself
    doubles it; presumably simd<8>::add works on 8-bit fields, making
    this a one-bit left shift within every byte.

    Each out_k depends only on in_k, so passing the same objects as
    inputs and outputs (as s2p_movemask does) is safe.
*/
#define bitshift_step(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5, out6, out7) \
  do { \
        out0 = simd<8>::add(in0, in0);\
        out1 = simd<8>::add(in1, in1);\
        out2 = simd<8>::add(in2, in2);\
        out3 = simd<8>::add(in3, in3);\
        out4 = simd<8>::add(in4, in4);\
        out5 = simd<8>::add(in5, in5);\
        out6 = simd<8>::add(in6, in6);\
        out7 = simd<8>::add(in7, in7);\
  } while (0)
180
181
/*  s2p_movemask(s0..s7, p0..p7): transposition by repeated sign-mask
    extraction.

    p0 is produced by movemask_step on the inputs as given.  The
    inputs are then doubled into working copies by bitshift_step, and
    for each further output the copies are sampled with movemask_step
    and doubled again, yielding p1..p7 in turn.

    The inputs are never modified, but each one is evaluated more than
    once, so arguments should be plain variables.  The working copies
    carry macro-specific names so that caller arguments with common
    names such as t0..t7 are not captured by the macro's locals.
*/
#define s2p_movemask(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do { \
        BitBlock s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_;\
        movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p0);\
        bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_);\
        movemask_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, p1);\
        bitshift_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_);\
        movemask_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, p2);\
        bitshift_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_);\
        movemask_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, p3);\
        bitshift_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_);\
        movemask_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, p4);\
        bitshift_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_);\
        movemask_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, p5);\
        bitshift_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_);\
        movemask_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, p6);\
        bitshift_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_);\
        movemask_step(s2p_t0_, s2p_t1_, s2p_t2_, s2p_t3_, s2p_t4_, s2p_t5_, s2p_t6_, s2p_t7_, p7);\
  } while (0)
201
202
203#endif // S2P_HPP
204
Note: See TracBrowser for help on using the repository browser.