/* s2p - Serial to Parallel Bit Stream Transposition
   Copyright (c) 2007, 2008, 2010, 2011 Robert D. Cameron.
   Licensed to the public under the Open Software License 3.0.
   Licensed to International Characters Inc.
   under the Academic Free License version 3.0.
*/
---|
7 | |
---|
#ifndef S2P_H
#define S2P_H

#include "idisa128_c.h"

/* A BytePack is one SIMD register's worth of serial byte data; it is
   the same underlying register type as a BitBlock.  The alias exists
   only to document intent at use sites. */
#define BytePack BitBlock
---|
14 | |
---|
/* Given a block of bytes in 8 consecutive registers s0, s1, ..., s7,
   s2p transposes the block into 8 parallel bitstream blocks p0, p1, ..., p7.

   The following header shows the intent, although a macro is used for
   speed.
   static inline void s2p(BytePack s0, BytePack s1, BytePack s2, BytePack s3,
                          BytePack s4, BytePack s5, BytePack s6, BytePack s7,
                          BitBlock& p0, BitBlock& p1, BitBlock& p2, BitBlock& p3,
                          BitBlock& p4, BitBlock& p5, BitBlock& p6, BitBlock& p7);
*/
---|
25 | |
---|
/* 1. ALGORITHM Selection.
   Choice of 3 algorithms: s2p_ideal, s2p_movemask, s2p_bytepack.
   Default is s2p_bytepack.
   Compile with -DUSE_S2P_IDEAL or -DUSE_S2P_MOVEMASK to override.
*/

#ifdef USE_S2P_IDEAL
#define S2P_ALGORITHM s2p_ideal
#endif

#ifdef USE_S2P_MOVEMASK
#define S2P_ALGORITHM s2p_movemask
#endif

/* Default when no override macro is defined. */
#ifndef S2P_ALGORITHM
#define S2P_ALGORITHM s2p_bytepack
#endif

/* Public entry point: transpose serial byte packs s0..s7 into parallel
   bit stream blocks p0..p7 using the selected algorithm.
   NOTE(review): the serial inputs are handed to the algorithm in
   reversed order (s7..s0); this looks like deliberate compensation for
   byte ordering within SIMD registers (the movemask_step parameter
   naming below is reversed the same way) -- confirm against the
   idisa128_c.h conventions before changing. */
#define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)\
  S2P_ALGORITHM(s7, s6, s5, s4, s3, s2, s1, s0, p0, p1, p2, p3, p4, p5, p6, p7)
---|
46 | |
---|
/* s2p_ideal is an ideal serial to parallel transposition
   algorithm given an architecture with native support for
   simd_pack_{8,4,2}_{hh,ll} operations, achieving transposition
   of 8 serial bytepacks into 8 parallel bitblocks in only 24 pack
   operations.

   Three divide-and-conquer stages: stage 1 (width-8 packs) splits
   each byte into its two 4-bit halves, stage 2 (width-4 packs)
   splits those into bit pairs, and stage 3 (width-2 packs) yields
   the eight single-bit streams p0..p7. */

#define s2p_ideal(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
do {\
  BitBlock bit0123_0, bit0123_1, bit0123_2, bit0123_3,\
           bit4567_0, bit4567_1, bit4567_2, bit4567_3;\
  BitBlock bit01_0, bit01_1, bit23_0, bit23_1, bit45_0, bit45_1, bit67_0, bit67_1;\
  /* Stage 1: separate the high half (bits 0123) from the low half
     (bits 4567) of every byte. */\
  bit0123_0 = hsimd_packh_8(s0, s1);\
  bit0123_1 = hsimd_packh_8(s2, s3);\
  bit0123_2 = hsimd_packh_8(s4, s5);\
  bit0123_3 = hsimd_packh_8(s6, s7);\
  bit4567_0 = hsimd_packl_8(s0, s1);\
  bit4567_1 = hsimd_packl_8(s2, s3);\
  bit4567_2 = hsimd_packl_8(s4, s5);\
  bit4567_3 = hsimd_packl_8(s6, s7);\
  /* Stage 2: separate 4-bit groups into 2-bit groups. */\
  bit01_0 = hsimd_packh_4(bit0123_0, bit0123_1);\
  bit01_1 = hsimd_packh_4(bit0123_2, bit0123_3);\
  bit23_0 = hsimd_packl_4(bit0123_0, bit0123_1);\
  bit23_1 = hsimd_packl_4(bit0123_2, bit0123_3);\
  bit45_0 = hsimd_packh_4(bit4567_0, bit4567_1);\
  bit45_1 = hsimd_packh_4(bit4567_2, bit4567_3);\
  bit67_0 = hsimd_packl_4(bit4567_0, bit4567_1);\
  bit67_1 = hsimd_packl_4(bit4567_2, bit4567_3);\
  /* Stage 3: separate 2-bit groups into the final bit streams. */\
  p0 = hsimd_packh_2(bit01_0, bit01_1);\
  p1 = hsimd_packl_2(bit01_0, bit01_1);\
  p2 = hsimd_packh_2(bit23_0, bit23_1);\
  p3 = hsimd_packl_2(bit23_0, bit23_1);\
  p4 = hsimd_packh_2(bit45_0, bit45_1);\
  p5 = hsimd_packl_2(bit45_0, bit45_1);\
  p6 = hsimd_packh_2(bit67_0, bit67_1);\
  p7 = hsimd_packl_2(bit67_0, bit67_1);\
} while(0)
---|
84 | |
---|
85 | |
---|
/* s2p_bytepack is a fast serial to parallel transposition
   algorithm given an architecture with simd_pack_16 operations,
   but not at small field widths.
   MMX, SSE, Altivec ...
*/

#ifndef USE_S2P_AVX
/* One divide-and-conquer step of s2p_bytepack: packs s0/s1 at field
   width 16 and then uses hi_mask together with left/right shifts of
   `shift` bits to redistribute bits between the two results p0, p1.
   NOTE(review): the idisa shift helpers are called with the shift
   amount as the FIRST argument here -- this matches usage throughout
   this file; confirm against idisa128_c.h. */
#define s2p_step(s0, s1, hi_mask, shift, p0, p1) \
do {\
  BitBlock t0,t1;\
  t0 = hsimd_packh_16(s0, s1);\
  t1 = hsimd_packl_16(s0, s1);\
  p0 = simd_ifh_1(hi_mask, t0, simd_srli_16(shift, t1));\
  p1 = simd_ifh_1(hi_mask, simd_slli_16(shift, t0), t1);\
} while(0)
#endif
---|
103 | |
---|
104 | |
---|
/* For AVX, we use a modified s2p_step function to avoid a number
   of conversions from 128-bit mode to 256-bit mode just to
   immediately convert back. */
#ifdef USE_S2P_AVX
/* NOTE(review): this includes a .cpp implementation file directly to
   obtain the 128-bit idisa operations -- confirm this is the intended
   build arrangement for the AVX configuration. */
#include "idisa_cpp/idisa_sse2.cpp"

/* Low 128-bit lane of a 256-bit register, reinterpreted as __m128i. */
#define avx_select_lo128(x) \
  ((__m128i) _mm256_castps256_ps128(x))

/* High 128-bit lane of a 256-bit register, reinterpreted as __m128i. */
#define avx_select_hi128(x) \
  ((__m128i)(_mm256_extractf128_ps(x, 1)))

/* Combine two 128-bit values into one 256-bit register:
   x becomes the high lane, y the low lane. */
#define avx_general_combine256(x, y) \
   (_mm256_insertf128_ps(_mm256_castps128_ps256((__m128) y), (__m128) x, 1))

/* AVX variant of s2p_step: performs the same pack/mask/shift
   redistribution as the 128-bit version, but operates on the two
   128-bit lanes of each 256-bit input separately and recombines the
   results only once at the end. */
#define s2p_step(s0, s1, hi_mask, shift, p0, p1) \
do {\
  bitblock128_t s00, s01, s10, s11, t00, t01, t10, t11;\
  bitblock128_t t10shift, t11shift, t00shift, t01shift;\
  /* Split both 256-bit inputs into 128-bit lanes. */\
  s00 = avx_select_hi128(s0);\
  s01 = avx_select_lo128(s0);\
  s10 = avx_select_hi128(s1);\
  s11 = avx_select_lo128(s1);\
  t00 = hsimd_packh_16(s00, s01);\
  t10 = hsimd_packl_16(s00, s01);\
  t01 = hsimd_packh_16(s10, s11);\
  t11 = hsimd_packl_16(s10, s11);\
  t10shift = simd_srli_16(shift, t10);\
  t11shift = simd_srli_16(shift, t11);\
  t00shift = simd_slli_16(shift, t00);\
  t01shift = simd_slli_16(shift, t01);\
  /* Recombine lanes and select between packed and shifted values. */\
  p0 = simd_ifh_1(hi_mask, avx_general_combine256(t00, t01), avx_general_combine256(t10shift, t11shift));\
  p1 = simd_ifh_1(hi_mask, avx_general_combine256(t00shift, t01shift), avx_general_combine256(t10, t11));\
} while(0)
#endif
---|
139 | |
---|
/* s2p_bytepack: transpose 8 serial byte packs s0..s7 into 8 parallel
   bit streams p0..p7 using three rounds of s2p_step (12 steps total).
   Round 1 (himask_2, shift 1) separates even-numbered bits from
   odd-numbered bits; round 2 (himask_4, shift 2) separates bit pairs;
   round 3 (himask_8, shift 4) delivers the individual bit streams.
   The intermediate names record which bit positions each block holds
   (e.g. bit00224466_* holds bits 0,2,4,6 of the original bytes). */
#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
do {\
  BitBlock bit00224466_0,bit00224466_1,bit00224466_2,bit00224466_3;\
  BitBlock bit11335577_0,bit11335577_1,bit11335577_2,bit11335577_3;\
  BitBlock bit00004444_0,bit22226666_0,bit00004444_1,bit22226666_1;\
  BitBlock bit11115555_0,bit33337777_0,bit11115555_1,bit33337777_1;\
  s2p_step(s0,s1,simd_himask_2(),1,bit00224466_0,bit11335577_0);\
  s2p_step(s2,s3,simd_himask_2(),1,bit00224466_1,bit11335577_1);\
  s2p_step(s4,s5,simd_himask_2(),1,bit00224466_2,bit11335577_2);\
  s2p_step(s6,s7,simd_himask_2(),1,bit00224466_3,bit11335577_3);\
  s2p_step(bit00224466_0,bit00224466_1,simd_himask_4(),2,bit00004444_0,bit22226666_0);\
  s2p_step(bit00224466_2,bit00224466_3,simd_himask_4(),2,bit00004444_1,bit22226666_1);\
  s2p_step(bit11335577_0,bit11335577_1,simd_himask_4(),2,bit11115555_0,bit33337777_0);\
  s2p_step(bit11335577_2,bit11335577_3,simd_himask_4(),2,bit11115555_1,bit33337777_1);\
  s2p_step(bit00004444_0,bit00004444_1,simd_himask_8(),4,p0,p4);\
  s2p_step(bit11115555_0,bit11115555_1,simd_himask_8(),4,p1,p5);\
  s2p_step(bit22226666_0,bit22226666_1,simd_himask_8(),4,p2,p6);\
  s2p_step(bit33337777_0,bit33337777_1,simd_himask_8(),4,p3,p7);\
} while(0)
---|
159 | |
---|
/* For sizeof(BitBlock) = 16: each hsimd_signmask_8 extraction below
   yields one bit per byte of a 128-bit register, i.e. 16 bits. */
typedef uint16_t BitPack;

/* Gather the sign (high) bit of every byte of the eight inputs into a
   single BitBlock p, 16 bits per input, via a union (the union is the
   well-defined C idiom for this type punning).
   NOTE(review): the parameter list is written in reversed order
   (s7..s0) while bit_pack[0] is filled from s0 -- mirroring the
   reversal in the top-level s2p dispatch macro; confirm the intended
   byte ordering before reordering anything here. */
#define movemask_step(s7, s6, s5, s4, s3, s2, s1, s0, p) \
do { \
  union { BitPack bit_pack[8];\
          BitBlock bit_block;\
  } b;\
  b.bit_pack[0] = hsimd_signmask_8(s0);\
  b.bit_pack[1] = hsimd_signmask_8(s1);\
  b.bit_pack[2] = hsimd_signmask_8(s2);\
  b.bit_pack[3] = hsimd_signmask_8(s3);\
  b.bit_pack[4] = hsimd_signmask_8(s4);\
  b.bit_pack[5] = hsimd_signmask_8(s5);\
  b.bit_pack[6] = hsimd_signmask_8(s6);\
  b.bit_pack[7] = hsimd_signmask_8(s7);\
  p = b.bit_block;\
} while (0)
---|
178 | |
---|
/* Shift every byte of each input one bit position left, so that the
   next lower bit moves into the sign position for the following
   movemask_step.  simd_add_8(x, x) doubles each 8-bit field, which is
   a 1-bit left shift within that field.  Safe to use in place
   (tN may alias sN): each output depends only on its own input. */
#define bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7) \
do { \
  t7 = simd_add_8(s7, s7);\
  t6 = simd_add_8(s6, s6);\
  t5 = simd_add_8(s5, s5);\
  t4 = simd_add_8(s4, s4);\
  t3 = simd_add_8(s3, s3);\
  t2 = simd_add_8(s2, s2);\
  t1 = simd_add_8(s1, s1);\
  t0 = simd_add_8(s0, s0);\
} while (0)
---|
190 | |
---|
191 | |
---|
/* s2p_movemask: transpose by extracting one bit stream per pass.
   Each movemask_step collects the current sign (high) bit of every
   byte into one output block; each bitshift_step then shifts every
   byte left by one so the next bit becomes the sign bit.  Eight
   passes produce p0..p7.  From the second pass on, bitshift_step is
   applied to t0..t7 in place. */
#define s2p_movemask(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
do { \
  BitBlock t0, t1, t2, t3, t4, t5, t6, t7;\
  movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p0);\
  bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p1);\
  bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p2);\
  bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p3);\
  bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p4);\
  bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p5);\
  bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p6);\
  bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p7);\
} while (0)
---|
211 | |
---|
212 | |
---|
213 | #endif // S2P_H |
---|
214 | |
---|
215 | |
---|