source: trunk/lib/s2p.hpp @ 4063

Last change on this file since 4063 was 3487, checked in by cameron, 6 years ago

USE_S2P_AVX2 mode

File size: 9.9 KB
Line 
1/*  s2p - Serial to Parallel Bit Stream Transposition
2    Copyright (c) 2007, 2008, 2010, 2011  Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters Inc.
5       under the Academic Free License version 3.0.
6*/
7
8#ifndef S2P_HPP
9#define S2P_HPP
10
11#include "idisa.hpp"
12
13#define BytePack BitBlock
14
/* Given a block of bytes in 8 consecutive registers s0, s1, ..., s7,
   s2p transposes the block into 8 parallel bitstream blocks p0, p1, ..., p7.

   The following header shows the intent, although a macro is used for
   speed.
static inline void s2p(BytePack s0, BytePack s1, BytePack s2, BytePack s3,
                       BytePack s4, BytePack s5, BytePack s6, BytePack s7,
                       BitBlock& p0, BitBlock& p1, BitBlock& p2, BitBlock& p3,
                       BitBlock& p4, BitBlock& p5, BitBlock& p6, BitBlock& p7);
*/
25
/*  1.  ALGORITHM Selection.
        Choice of 3 algorithms: s2p_ideal, s2p_movemask, s2p_bytepack.
        Default is s2p_bytepack.
        Compile with -DUSE_S2P_IDEAL or -DUSE_S2P_MOVEMASK to override.

        The choice is made in a single #if/#elif chain so that
        S2P_ALGORITHM is defined exactly once.  If both flags are given,
        USE_S2P_MOVEMASK takes precedence.  A value of S2P_ALGORITHM
        supplied by the build (e.g. -DS2P_ALGORITHM=...) is kept when
        neither flag is set and replaced when one is.
*/

#if defined(USE_S2P_MOVEMASK)
#undef S2P_ALGORITHM
#define S2P_ALGORITHM s2p_movemask
#elif defined(USE_S2P_IDEAL)
#undef S2P_ALGORITHM
#define S2P_ALGORITHM s2p_ideal
#elif !defined(S2P_ALGORITHM)
#define S2P_ALGORITHM s2p_bytepack
#endif
43
/* s2p(s0..s7, p0..p7): public entry point.
   Expands to the selected S2P_ALGORITHM, handing it the eight byte packs
   in reverse order (s7 first, s0 last); the eight output bit blocks
   p0..p7 are passed through in their original order. */
#define s2p(s0, s1, s2, s3, s4, s5, s6, s7, \
            p0, p1, p2, p3, p4, p5, p6, p7) \
  S2P_ALGORITHM(s7, s6, s5, s4, s3, s2, s1, s0, \
                p0, p1, p2, p3, p4, p5, p6, p7)
46
/*  s2p_ideal: serial to parallel transposition for an architecture with
    native support for simd_pack_{8,4,2}_{hh,ll} operations.  Three rounds
    of eight pack operations (24 in total) turn the 8 serial bytepacks
    s0..s7 into the 8 parallel bitblocks p0..p7:
      round 1 (hsimd<8>): each input pair  -> a bits-0123 and a bits-4567 block
      round 2 (hsimd<4>): each such pair   -> bit-pair blocks 01, 23, 45, 67
      round 3 (hsimd<2>): each bit pair    -> two single-bit outputs
    Every input is read in round 1, before any output is assigned.
*/

#define s2p_ideal(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do {\
        /* Round 1: field width 8. */\
        BitBlock nyb0123_0 = hsimd<8>::packh(s0, s1);\
        BitBlock nyb0123_1 = hsimd<8>::packh(s2, s3);\
        BitBlock nyb0123_2 = hsimd<8>::packh(s4, s5);\
        BitBlock nyb0123_3 = hsimd<8>::packh(s6, s7);\
        BitBlock nyb4567_0 = hsimd<8>::packl(s0, s1);\
        BitBlock nyb4567_1 = hsimd<8>::packl(s2, s3);\
        BitBlock nyb4567_2 = hsimd<8>::packl(s4, s5);\
        BitBlock nyb4567_3 = hsimd<8>::packl(s6, s7);\
        /* Round 2: field width 4. */\
        BitBlock duo01_0 = hsimd<4>::packh(nyb0123_0, nyb0123_1);\
        BitBlock duo01_1 = hsimd<4>::packh(nyb0123_2, nyb0123_3);\
        BitBlock duo23_0 = hsimd<4>::packl(nyb0123_0, nyb0123_1);\
        BitBlock duo23_1 = hsimd<4>::packl(nyb0123_2, nyb0123_3);\
        BitBlock duo45_0 = hsimd<4>::packh(nyb4567_0, nyb4567_1);\
        BitBlock duo45_1 = hsimd<4>::packh(nyb4567_2, nyb4567_3);\
        BitBlock duo67_0 = hsimd<4>::packl(nyb4567_0, nyb4567_1);\
        BitBlock duo67_1 = hsimd<4>::packl(nyb4567_2, nyb4567_3);\
        /* Round 3: field width 2, written straight to the outputs. */\
        p0 = hsimd<2>::packh(duo01_0, duo01_1);\
        p1 = hsimd<2>::packl(duo01_0, duo01_1);\
        p2 = hsimd<2>::packh(duo23_0, duo23_1);\
        p3 = hsimd<2>::packl(duo23_0, duo23_1);\
        p4 = hsimd<2>::packh(duo45_0, duo45_1);\
        p5 = hsimd<2>::packl(duo45_0, duo45_1);\
        p6 = hsimd<2>::packh(duo67_0, duo67_1);\
        p7 = hsimd<2>::packl(duo67_0, duo67_1);\
  } while(0)
84
85
/*  s2p_bytepack is a fast serial to parallel transposition
    algorithm for architectures that offer simd_pack_16 operations
    but no pack operations at small field widths
    (MMX, SSE, Altivec ...).
*/

/*  s2p_step(s0, s1, hi_mask, shift, p0, p1): one step of s2p_bytepack
    (generic version, used unless USE_S2P_AVX is defined).
      s0, s1   input blocks, read once each by the two 16-bit packs
      hi_mask  first operand of both simd<1>::ifh calls
      shift    shift count; must be a compile-time constant because it
               is used as a template argument
      p0, p1   output lvalues, assigned after both inputs have been read
    p0 combines the packh result with the packl result shifted right,
    p1 combines the packh result shifted left with the packl result
    (presumably simd<1>::ifh(m, a, b) selects a where m is set and b
    elsewhere -- confirm in idisa.hpp).

    The temporaries carry an s2p_step_ prefix: this is a macro, so a
    short local name such as t0 would capture a caller argument with
    the same name and the result would be assigned to the local instead.
*/
#ifndef USE_S2P_AVX
#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
  do {\
        BitBlock s2p_step_packh, s2p_step_packl;\
        s2p_step_packh = hsimd<16>::packh(s0, s1);\
        s2p_step_packl = hsimd<16>::packl(s0, s1);\
        p0 = simd<1>::ifh(hi_mask, s2p_step_packh, simd<16>::srli<shift>(s2p_step_packl));\
        p1 = simd<1>::ifh(hi_mask, simd<16>::slli<shift>(s2p_step_packh), s2p_step_packl);\
  } while(0)
#endif
103
104
/* For AVX, we use a modified s2p_step function to avoid a number
   of conversions from 128-bit mode to 256-bit mode just to
   immediately convert back.  Both inputs are split into 128-bit halves
   once, the packs and shifts are done on those halves, and the results
   are recombined to 256 bits only for the two final simd<1>::ifh calls.
   Parameters and outputs are the same as for the generic s2p_step. */
#ifdef USE_S2P_AVX
#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
  do {\
        /* 128-bit halves of the two inputs. */\
        bitblock128_t in0_hi = avx_select_hi128(s0);\
        bitblock128_t in0_lo = avx_select_lo128(s0);\
        bitblock128_t in1_hi = avx_select_hi128(s1);\
        bitblock128_t in1_lo = avx_select_lo128(s1);\
        /* 16-bit packs within each input. */\
        bitblock128_t packh_0 = hsimd128<16>::packh(in0_hi, in0_lo);\
        bitblock128_t packl_0 = hsimd128<16>::packl(in0_hi, in0_lo);\
        bitblock128_t packh_1 = hsimd128<16>::packh(in1_hi, in1_lo);\
        bitblock128_t packl_1 = hsimd128<16>::packl(in1_hi, in1_lo);\
        /* Shifted copies, still in 128-bit mode. */\
        bitblock128_t packl_0_sr = simd128<16>::srli<shift>(packl_0);\
        bitblock128_t packl_1_sr = simd128<16>::srli<shift>(packl_1);\
        bitblock128_t packh_0_sl = simd128<16>::slli<shift>(packh_0);\
        bitblock128_t packh_1_sl = simd128<16>::slli<shift>(packh_1);\
        p0 = simd<1>::ifh(hi_mask, avx_general_combine256(packh_0, packh_1), avx_general_combine256(packl_0_sr, packl_1_sr));\
        p1 = simd<1>::ifh(hi_mask, avx_general_combine256(packh_0_sl, packh_1_sl), avx_general_combine256(packl_0, packl_1));\
  } while(0)
#endif
129
#ifndef USE_S2P_AVX2
/*  s2p_bytepack(s0..s7, p0..p7): transposition built from twelve
    s2p_step calls in three stages of four (used unless USE_S2P_AVX2 is
    defined).
      stage 1 (simd<2>::himask, shift 1): separates the even-numbered
              bits (0,2,4,6) from the odd-numbered bits (1,3,5,7)
      stage 2 (simd<4>::himask, shift 2): separates bits 0,4 from 2,6
              and bits 1,5 from 3,7
      stage 3 (simd<8>::himask, shift 4): delivers the eight single-bit
              outputs, in the pairs (p0,p4) (p1,p5) (p2,p6) (p3,p7)
    All inputs are consumed in stage 1; outputs are written in stage 3.
*/
#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do {\
        BitBlock even_0, even_1, even_2, even_3;\
        BitBlock odd_0, odd_1, odd_2, odd_3;\
        BitBlock b04_0, b04_1, b26_0, b26_1;\
        BitBlock b15_0, b15_1, b37_0, b37_1;\
        /* Stage 1. */\
        s2p_step(s0, s1, simd<2>::himask(), 1, even_0, odd_0);\
        s2p_step(s2, s3, simd<2>::himask(), 1, even_1, odd_1);\
        s2p_step(s4, s5, simd<2>::himask(), 1, even_2, odd_2);\
        s2p_step(s6, s7, simd<2>::himask(), 1, even_3, odd_3);\
        /* Stage 2. */\
        s2p_step(even_0, even_1, simd<4>::himask(), 2, b04_0, b26_0);\
        s2p_step(even_2, even_3, simd<4>::himask(), 2, b04_1, b26_1);\
        s2p_step(odd_0, odd_1, simd<4>::himask(), 2, b15_0, b37_0);\
        s2p_step(odd_2, odd_3, simd<4>::himask(), 2, b15_1, b37_1);\
        /* Stage 3. */\
        s2p_step(b04_0, b04_1, simd<8>::himask(), 4, p0, p4);\
        s2p_step(b15_0, b15_1, simd<8>::himask(), 4, p1, p5);\
        s2p_step(b26_0, b26_1, simd<8>::himask(), 4, p2, p6);\
        s2p_step(b37_0, b37_1, simd<8>::himask(), 4, p3, p7);\
  } while(0)
#endif
151
#ifdef USE_S2P_AVX2
/*  s2p_step_shuf(shuf, s0, s1, hi_mask, shift, p0, p1): AVX2 form of
    s2p_step, written directly with 256-bit intrinsics (BitBlock is
    passed to and assigned from the _mm256_* intrinsics here).
      shuf     byte-shuffle control for _mm256_shuffle_epi8
      s0, s1   input blocks
      hi_mask  first operand of both simd<1>::ifh calls
      shift    compile-time shift count (template argument)
      p0, p1   output lvalues, assigned after both inputs have been read
    Steps:
      - the byte shuffle rearranges each 128-bit lane according to shuf;
      - _mm256_permute4x64_epi64(.., 0xD8) reorders the 64-bit quarters
        as 0,2,1,3, i.e. swaps the two middle quarters;
      - _mm256_permute2x128_si256(x1, x0, 0x31) joins the high 128-bit
        halves (x1's in the low half of the result, x0's in the high
        half); control 0x20 does the same with the low halves.
    The two joined blocks then take the place of the packh and packl
    results of the generic s2p_step.

    The temporaries carry an s2p_shuf_ prefix: this is a macro, so a
    short local name such as t0 or x0 would capture a caller argument
    with the same name.
*/
#define s2p_step_shuf(shuf, s0, s1, hi_mask, shift, p0, p1)  \
  do {\
        BitBlock s2p_shuf_x0, s2p_shuf_x1, s2p_shuf_hi, s2p_shuf_lo;\
        s2p_shuf_x0 = _mm256_permute4x64_epi64(_mm256_shuffle_epi8(s0, shuf), 0xD8);\
        s2p_shuf_x1 = _mm256_permute4x64_epi64(_mm256_shuffle_epi8(s1, shuf), 0xD8);\
        s2p_shuf_hi = _mm256_permute2x128_si256(s2p_shuf_x1, s2p_shuf_x0, 0x31);\
        s2p_shuf_lo = _mm256_permute2x128_si256(s2p_shuf_x1, s2p_shuf_x0, 0x20);\
        p0 = simd<1>::ifh(hi_mask, s2p_shuf_hi, simd<16>::srli<shift>(s2p_shuf_lo));\
        p1 = simd<1>::ifh(hi_mask, simd<16>::slli<shift>(s2p_shuf_hi), s2p_shuf_lo);\
  } while(0)

/*  s2p_bytepack(s0..s7, p0..p7), AVX2 version: the same three stages of
    four steps as the generic version, using s2p_step_shuf.
    The shuffle control moves, within each 128-bit lane, source bytes
    0,2,4,...,14 to result bytes 0..7 and source bytes 1,3,5,...,15 to
    result bytes 8..15; it is the same for both lanes and is built once
    per expansion.  All inputs are consumed in the first stage; outputs
    are written in the last.
*/
#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do {\
        BitBlock s2p_bp_shuf = _mm256_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200, 0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200);\
        BitBlock bit00224466_0,bit00224466_1,bit00224466_2,bit00224466_3;\
        BitBlock bit11335577_0,bit11335577_1,bit11335577_2,bit11335577_3;\
        BitBlock bit00004444_0,bit22226666_0,bit00004444_1,bit22226666_1;\
        BitBlock bit11115555_0,bit33337777_0,bit11115555_1,bit33337777_1;\
        s2p_step_shuf(s2p_bp_shuf, s0,s1,simd<2>::himask(),1,bit00224466_0,bit11335577_0);\
        s2p_step_shuf(s2p_bp_shuf, s2,s3,simd<2>::himask(),1,bit00224466_1,bit11335577_1);\
        s2p_step_shuf(s2p_bp_shuf, s4,s5,simd<2>::himask(),1,bit00224466_2,bit11335577_2);\
        s2p_step_shuf(s2p_bp_shuf, s6,s7,simd<2>::himask(),1,bit00224466_3,bit11335577_3);\
        s2p_step_shuf(s2p_bp_shuf, bit00224466_0,bit00224466_1,simd<4>::himask(),2,bit00004444_0,bit22226666_0);\
        s2p_step_shuf(s2p_bp_shuf, bit00224466_2,bit00224466_3,simd<4>::himask(),2,bit00004444_1,bit22226666_1);\
        s2p_step_shuf(s2p_bp_shuf, bit11335577_0,bit11335577_1,simd<4>::himask(),2,bit11115555_0,bit33337777_0);\
        s2p_step_shuf(s2p_bp_shuf, bit11335577_2,bit11335577_3,simd<4>::himask(),2,bit11115555_1,bit33337777_1);\
        s2p_step_shuf(s2p_bp_shuf, bit00004444_0,bit00004444_1,simd<8>::himask(),4,p0,p4);\
        s2p_step_shuf(s2p_bp_shuf, bit11115555_0,bit11115555_1,simd<8>::himask(),4,p1,p5);\
        s2p_step_shuf(s2p_bp_shuf, bit22226666_0,bit22226666_1,simd<8>::himask(),4,p2,p6);\
        s2p_step_shuf(s2p_bp_shuf, bit33337777_0,bit33337777_1,simd<8>::himask(),4,p3,p7);\
  } while(0)

#endif
186
187
188
189
/* BitPack: unsigned integer type chosen so that an array of 8 BitPacks
   has the same size as one BitBlock (it is overlaid on a BitBlock in
   movemask_step): 16 bits for 128-bit blocks, 32 bits for 256-bit
   blocks.  For any other BLOCK_SIZE the type is left undefined. */
#if BLOCK_SIZE == 128
typedef uint16_t BitPack;
#elif BLOCK_SIZE == 256
typedef uint32_t BitPack;
#endif
197
/*  movemask_step(s7, s6, ..., s0, p): builds one output block from the
    hsimd<8>::signmask values of eight blocks.
    Note the parameter order: the LAST block argument (s0) supplies
    bit_pack[0] and the FIRST (s7) supplies bit_pack[7].
      p   output lvalue, assigned once after all eight masks are stored
    The eight BitPack values are overlaid on a BitBlock through a union.
    NOTE(review): reading bit_block after writing bit_pack relies on
    union type punning, which GCC documents as supported but ISO C++
    does not guarantee.

    The union object is named s2p_movemask_u rather than a single
    letter: this is a macro, so a local called b would capture a caller
    argument named b.
*/
#define movemask_step(s7, s6, s5, s4, s3, s2, s1, s0, p) \
  do { \
        union { BitPack bit_pack[8];\
                BitBlock bit_block;\
              } s2p_movemask_u;\
        s2p_movemask_u.bit_pack[0] = hsimd<8>::signmask(s0);\
        s2p_movemask_u.bit_pack[1] = hsimd<8>::signmask(s1);\
        s2p_movemask_u.bit_pack[2] = hsimd<8>::signmask(s2);\
        s2p_movemask_u.bit_pack[3] = hsimd<8>::signmask(s3);\
        s2p_movemask_u.bit_pack[4] = hsimd<8>::signmask(s4);\
        s2p_movemask_u.bit_pack[5] = hsimd<8>::signmask(s5);\
        s2p_movemask_u.bit_pack[6] = hsimd<8>::signmask(s6);\
        s2p_movemask_u.bit_pack[7] = hsimd<8>::signmask(s7);\
        p = s2p_movemask_u.bit_block;\
   } while (0)
213
/*  bitshift_step(s0..s7, t0..t7): for each i, assigns t_i the 8-bit
    field-wise sum s_i + s_i, i.e. every byte of s_i doubled (shifted
    left by one bit).  Each t_i depends only on the s_i with the same
    index, so the macro may be used in place with t_i and s_i naming
    the same variable.
*/
#define bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7) \
  do { \
        t0 = simd<8>::add(s0, s0);  t1 = simd<8>::add(s1, s1);\
        t2 = simd<8>::add(s2, s2);  t3 = simd<8>::add(s3, s3);\
        t4 = simd<8>::add(s4, s4);  t5 = simd<8>::add(s5, s5);\
        t6 = simd<8>::add(s6, s6);  t7 = simd<8>::add(s7, s7);\
  } while (0)
225
226
/*  s2p_movemask(s0..s7, p0..p7): transposition by repeated sign-mask
    extraction.  p0 is built by movemask_step from the inputs as given;
    before each later output the working copies are doubled byte-wise
    by bitshift_step, so p1..p7 are built from the inputs shifted left
    by 1..7 bits within each byte.  That is eight movemask_step and
    seven bitshift_step expansions in total.

    The inputs themselves are never modified; the shifted values live in
    local temporaries.  Outputs must not name the same variables as
    inputs: p0 is assigned before the inputs are read a second time.

    The temporaries carry an s2p_mm_ prefix: this is a macro, so locals
    called t0..t7 would capture caller arguments with those names (an
    input would be read uninitialised, an output would never be
    written).
*/
#define s2p_movemask(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do { \
        BitBlock s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7;\
        movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p0);\
        bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7);\
        movemask_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, p1);\
        bitshift_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7);\
        movemask_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, p2);\
        bitshift_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7);\
        movemask_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, p3);\
        bitshift_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7);\
        movemask_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, p4);\
        bitshift_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7);\
        movemask_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, p5);\
        bitshift_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7);\
        movemask_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, p6);\
        bitshift_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7);\
        movemask_step(s2p_mm_t0, s2p_mm_t1, s2p_mm_t2, s2p_mm_t3, s2p_mm_t4, s2p_mm_t5, s2p_mm_t6, s2p_mm_t7, p7);\
  } while (0)
246
247
248#endif // S2P_HPP
249
Note: See TracBrowser for help on using the repository browser.